Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 13 Apr 2015 20:36:45 +0000 (13:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 13 Apr 2015 20:36:45 +0000 (13:36 -0700)
Pull x86 vdso changes from Ingo Molnar:
 "Misc vDSO updates"

* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/vdso: Remove x32 intermediates during 'make clean'
  x86/vdso: Teach 'make clean' to remove generated vdso-image-*.c files
  x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
  x86/vdso: Fix the x86 vdso2c tool includes

688 files changed:
Documentation/acpi/apei/einj.txt
Documentation/devicetree/bindings/net/dsa/dsa.txt
Documentation/devicetree/bindings/thermal/rcar-thermal.txt
Documentation/input/alps.txt
Documentation/input/event-codes.txt
Documentation/input/multi-touch-protocol.txt
Documentation/kernel-parameters.txt
Documentation/rtc.txt
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/s390_flic.txt
Documentation/x86/boot.txt
MAINTAINERS
Makefile
arch/alpha/kernel/rtc.c
arch/arm/common/bL_switcher.c
arch/arm/include/asm/jump_label.h
arch/arm/include/asm/kvm_arm.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmio.h
arch/arm/include/asm/mach/time.h
arch/arm/include/uapi/asm/kvm.h
arch/arm/kernel/asm-offsets.c
arch/arm/kernel/time.c
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm/kvm/interrupts_head.S
arch/arm/kvm/mmio.c
arch/arm/kvm/mmu.c
arch/arm/kvm/trace.h
arch/arm/mach-omap2/cpuidle44xx.c
arch/arm/mach-tegra/cpuidle-tegra114.c
arch/arm/mach-tegra/cpuidle-tegra20.c
arch/arm/mach-tegra/cpuidle-tegra30.c
arch/arm/plat-omap/counter_32k.c
arch/arm64/include/asm/esr.h
arch/arm64/include/asm/jump_label.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmio.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/vdso.c
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/mips/include/asm/asmmacro-32.h
arch/mips/include/asm/asmmacro.h
arch/mips/include/asm/fpu.h
arch/mips/include/asm/jump_label.h
arch/mips/include/asm/kdebug.h
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/processor.h
arch/mips/include/uapi/asm/kvm.h
arch/mips/kernel/asm-offsets.c
arch/mips/kernel/genex.S
arch/mips/kernel/ptrace.c
arch/mips/kernel/r4k_fpu.S
arch/mips/kernel/traps.c
arch/mips/kvm/Makefile
arch/mips/kvm/emulate.c
arch/mips/kvm/fpu.S [new file with mode: 0644]
arch/mips/kvm/locore.S
arch/mips/kvm/mips.c
arch/mips/kvm/msa.S [new file with mode: 0644]
arch/mips/kvm/stats.c
arch/mips/kvm/tlb.c
arch/mips/kvm/trap_emul.c
arch/mips/lasat/sysctl.c
arch/nios2/include/asm/thread_info.h
arch/nios2/include/uapi/asm/ptrace.h
arch/nios2/kernel/entry.S
arch/nios2/kernel/signal.c
arch/nios2/mm/cacheflush.c
arch/powerpc/include/asm/cputhreads.h
arch/powerpc/kvm/mpic.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/pseries/hvCall.S
arch/powerpc/platforms/pseries/lpar.c
arch/s390/include/asm/jump_label.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/include/uapi/asm/sie.h
arch/s390/kernel/asm-offsets.c
arch/s390/kernel/time.c
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/guestdbg.c
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/s390/kvm/sigp.c
arch/sparc/include/asm/jump_label.h
arch/sparc/kernel/pci.c
arch/sparc/kernel/time_32.c
arch/tile/kernel/time.c
arch/x86/Kconfig
arch/x86/boot/compressed/aslr.c
arch/x86/boot/compressed/head_32.S
arch/x86/boot/compressed/head_64.S
arch/x86/boot/compressed/misc.c
arch/x86/boot/compressed/misc.h
arch/x86/boot/string.c
arch/x86/boot/video-mode.c
arch/x86/boot/video.c
arch/x86/boot/video.h
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/crypto/crc32c-pcl-intel-asm_64.S
arch/x86/crypto/twofish-x86_64-asm_64.S
arch/x86/ia32/Makefile
arch/x86/ia32/ia32_signal.c
arch/x86/ia32/ia32entry.S
arch/x86/ia32/nosyscall.c [deleted file]
arch/x86/ia32/sys_ia32.c
arch/x86/ia32/syscall_ia32.c [deleted file]
arch/x86/include/asm/alternative-asm.h
arch/x86/include/asm/alternative.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/barrier.h
arch/x86/include/asm/calling.h
arch/x86/include/asm/compat.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/dwarf2.h
arch/x86/include/asm/efi.h
arch/x86/include/asm/elf.h
arch/x86/include/asm/fpu-internal.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/insn.h
arch/x86/include/asm/iommu_table.h
arch/x86/include/asm/irqflags.h
arch/x86/include/asm/jump_label.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/microcode.h
arch/x86/include/asm/microcode_intel.h
arch/x86/include/asm/mwait.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/ptrace.h
arch/x86/include/asm/pvclock.h
arch/x86/include/asm/segment.h
arch/x86/include/asm/setup.h
arch/x86/include/asm/sigcontext.h
arch/x86/include/asm/sighandling.h
arch/x86/include/asm/smap.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/special_insns.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/uaccess_64.h
arch/x86/include/uapi/asm/bootparam.h
arch/x86/include/uapi/asm/ptrace-abi.h
arch/x86/include/uapi/asm/ptrace.h
arch/x86/include/uapi/asm/sigcontext.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/x2apic_cluster.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce-internal.h
arch/x86/kernel/cpu/mcheck/mce-severity.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/mce_intel.c
arch/x86/kernel/cpu/microcode/amd.c
arch/x86/kernel/cpu/microcode/core_early.c
arch/x86/kernel/cpu/microcode/intel.c
arch/x86/kernel/cpu/microcode/intel_early.c
arch/x86/kernel/cpu/microcode/intel_lib.c
arch/x86/kernel/cpu/mkcapflags.sh
arch/x86/kernel/cpu/perf_event.c
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/crash.c
arch/x86/kernel/devicetree.c
arch/x86/kernel/dumpstack.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/e820.c
arch/x86/kernel/early_printk.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/head64.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/i387.c
arch/x86/kernel/ioport.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/irqinit.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/module.c
arch/x86/kernel/perf_regs.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/pvclock.c
arch/x86/kernel/reboot.c
arch/x86/kernel/relocate_kernel_32.S
arch/x86/kernel/relocate_kernel_64.S
arch/x86/kernel/setup.c
arch/x86/kernel/signal.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/sys_x86_64.c
arch/x86/kernel/syscall_32.c
arch/x86/kernel/time.c
arch/x86/kernel/traps.c
arch/x86/kernel/uprobes.c
arch/x86/kernel/vm86_32.c
arch/x86/kernel/vsyscall_gtod.c
arch/x86/kernel/xsave.c
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8254.h
arch/x86/kvm/i8259.c
arch/x86/kvm/ioapic.c
arch/x86/kvm/ioapic.h
arch/x86/kvm/irq.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/pmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/lguest/boot.c
arch/x86/lib/atomic64_cx8_32.S
arch/x86/lib/checksum_32.S
arch/x86/lib/clear_page_64.S
arch/x86/lib/copy_page_64.S
arch/x86/lib/copy_user_64.S
arch/x86/lib/csum-copy_64.S
arch/x86/lib/insn.c
arch/x86/lib/memcpy_64.S
arch/x86/lib/memmove_64.S
arch/x86/lib/memset_64.S
arch/x86/lib/msr-reg.S
arch/x86/lib/rwsem.S
arch/x86/lib/thunk_32.S
arch/x86/lib/thunk_64.S
arch/x86/lib/usercopy_64.c
arch/x86/lib/x86-opcode-map.txt
arch/x86/mm/fault.c
arch/x86/mm/init.c
arch/x86/mm/init_64.c
arch/x86/mm/numa.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/pgtable.c
arch/x86/oprofile/backtrace.c
arch/x86/platform/efi/efi-bgrt.c
arch/x86/platform/efi/efi.c
arch/x86/platform/efi/efi_32.c
arch/x86/platform/efi/efi_64.c
arch/x86/platform/intel-quark/imr_selftest.c
arch/x86/platform/uv/tlb_uv.c
arch/x86/power/cpu.c
arch/x86/syscalls/syscall_32.tbl
arch/x86/syscalls/syscall_64.tbl
arch/x86/um/asm/barrier.h
arch/x86/um/sys_call_table_64.c
arch/x86/vdso/vclock_gettime.c
arch/x86/xen/enlighten.c
arch/x86/xen/p2m.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/xen-asm_64.S
block/blk-mq.c
block/blk-settings.c
drivers/acpi/acpi_pad.c
drivers/acpi/processor_idle.c
drivers/ata/libata-core.c
drivers/char/ipmi/ipmi_powernv.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/char/ipmi/ipmi_ssif.c
drivers/clocksource/arm_arch_timer.c
drivers/clocksource/dw_apb_timer_of.c
drivers/clocksource/em_sti.c
drivers/clocksource/sh_cmt.c
drivers/clocksource/sh_tmu.c
drivers/clocksource/sun4i_timer.c
drivers/clocksource/tegra20_timer.c
drivers/clocksource/time-efm32.c
drivers/clocksource/timer-atmel-pit.c
drivers/clocksource/timer-sun5i.c
drivers/cpufreq/cpufreq.c
drivers/cpuidle/cpuidle.c
drivers/cpuidle/driver.c
drivers/cpuidle/sysfs.c
drivers/dma/bcm2835-dma.c
drivers/dma/cppi41.c
drivers/dma/dma-jz4740.c
drivers/dma/dmaengine.c
drivers/dma/edma.c
drivers/dma/moxart-dma.c
drivers/dma/omap-dma.c
drivers/firmware/dmi_scan.c
drivers/firmware/efi/libstub/arm-stub.c
drivers/firmware/efi/libstub/efistub.h
drivers/firmware/efi/libstub/fdt.c
drivers/gpio/gpio-mpc8xxx.c
drivers/gpio/gpio-syscon.c
drivers/gpio/gpiolib-acpi.c
drivers/gpu/drm/drm_crtc.c
drivers/gpu/drm/drm_edid_load.c
drivers/gpu/drm/drm_probe_helper.c
drivers/gpu/drm/exynos/exynos_drm_fimd.c
drivers/gpu/drm/exynos/exynos_mixer.c
drivers/gpu/drm/i915/i915_drv.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem_execbuffer.c
drivers/gpu/drm/i915/intel_sprite.c
drivers/gpu/drm/radeon/cikd.h
drivers/gpu/drm/radeon/radeon.h
drivers/gpu/drm/radeon/radeon_bios.c
drivers/gpu/drm/radeon/radeon_mn.c
drivers/gpu/drm/radeon/radeon_pm.c
drivers/gpu/drm/radeon/radeon_ring.c
drivers/gpu/drm/radeon/radeon_ttm.c
drivers/gpu/drm/radeon/vce_v2_0.c
drivers/idle/intel_idle.c
drivers/iio/accel/bma180.c
drivers/iio/accel/bmc150-accel.c
drivers/iio/accel/kxcjk-1013.c
drivers/iio/adc/Kconfig
drivers/iio/adc/at91_adc.c
drivers/iio/adc/ti_am335x_adc.c
drivers/iio/adc/vf610_adc.c
drivers/iio/gyro/bmg160.c
drivers/iio/imu/adis_trigger.c
drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c
drivers/iio/imu/kmx61.c
drivers/iio/industrialio-core.c
drivers/iio/industrialio-event.c
drivers/iio/proximity/sx9500.c
drivers/infiniband/core/umem.c
drivers/input/mouse/alps.c
drivers/input/mouse/synaptics.c
drivers/iommu/arm-smmu.c
drivers/iommu/intel-iommu.c
drivers/iommu/ipmmu-vmsa.c
drivers/irqchip/irq-gic-v3-its.c
drivers/lguest/Kconfig
drivers/md/md.c
drivers/md/raid0.c
drivers/media/dvb-frontends/rtl2832.c
drivers/media/pci/cx23885/cx23885-417.c
drivers/media/platform/s5p-jpeg/jpeg-core.c
drivers/media/platform/s5p-jpeg/jpeg-hw-exynos3250.c
drivers/media/platform/s5p-mfc/s5p_mfc.c
drivers/media/platform/s5p-mfc/s5p_mfc_common.h
drivers/media/platform/s5p-mfc/s5p_mfc_opr.h
drivers/media/platform/s5p-mfc/s5p_mfc_opr_v5.c
drivers/media/platform/s5p-mfc/s5p_mfc_opr_v6.c
drivers/media/platform/s5p-tv/Kconfig
drivers/media/platform/sh_veu.c
drivers/media/platform/soc_camera/atmel-isi.c
drivers/media/platform/soc_camera/soc_camera.c
drivers/media/usb/dvb-usb-v2/rtl28xxu.c
drivers/media/usb/gspca/Kconfig
drivers/media/v4l2-core/videobuf2-core.c
drivers/media/v4l2-core/videobuf2-dma-contig.c
drivers/misc/enclosure.c
drivers/misc/sgi-xp/xpc_main.c
drivers/net/bonding/bond_main.c
drivers/net/can/flexcan.c
drivers/net/can/usb/gs_usb.c
drivers/net/can/usb/kvaser_usb.c
drivers/net/can/usb/peak_usb/pcan_ucan.h
drivers/net/can/usb/peak_usb/pcan_usb_fd.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
drivers/net/ethernet/chelsio/cxgb4/sge.c
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h
drivers/net/ethernet/chelsio/cxgb4vf/sge.c
drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/freescale/ucc_geth.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/mellanox/mlx4/cmd.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/eq.c
drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
drivers/net/ethernet/rocker/rocker.c
drivers/net/ipvlan/ipvlan.h
drivers/net/ipvlan/ipvlan_core.c
drivers/net/ipvlan/ipvlan_main.c
drivers/net/usb/asix_common.c
drivers/net/usb/cdc_ether.c
drivers/net/usb/cdc_ncm.c
drivers/net/usb/r8152.c
drivers/net/usb/sr9800.c
drivers/net/usb/usbnet.c
drivers/net/wireless/ath/ath9k/beacon.c
drivers/net/wireless/ath/ath9k/common.h
drivers/net/wireless/ath/ath9k/hw.c
drivers/net/wireless/brcm80211/brcmfmac/feature.c
drivers/net/wireless/iwlwifi/dvm/dev.h
drivers/net/wireless/iwlwifi/dvm/mac80211.c
drivers/net/wireless/iwlwifi/dvm/ucode.c
drivers/net/wireless/iwlwifi/iwl-drv.c
drivers/net/wireless/iwlwifi/mvm/rs.c
drivers/net/wireless/iwlwifi/mvm/time-event.c
drivers/net/wireless/iwlwifi/mvm/tx.c
drivers/net/wireless/iwlwifi/pcie/drv.c
drivers/net/wireless/rtlwifi/pci.c
drivers/net/xen-netfront.c
drivers/of/address.c
drivers/pci/host/pcie-designware.c
drivers/pci/host/pcie-spear13xx.c
drivers/pci/hotplug/cpci_hotplug_pci.c
drivers/pci/pci-acpi.c
drivers/pci/pcie/aer/aerdrv_errprint.c
drivers/rtc/class.c
drivers/rtc/interface.c
drivers/rtc/rtc-ab3100.c
drivers/rtc/rtc-mc13xxx.c
drivers/rtc/rtc-mxc.c
drivers/rtc/rtc-test.c
drivers/rtc/systohc.c
drivers/scsi/be2iscsi/be_main.c
drivers/scsi/scsi_lib.c
drivers/staging/iio/Kconfig
drivers/staging/iio/magnetometer/hmc5843_core.c
drivers/target/iscsi/iscsi_target.c
drivers/target/target_core_device.c
drivers/thermal/st/st_thermal.c
drivers/thermal/st/st_thermal_memmap.c
drivers/thermal/st/st_thermal_syscfg.c
drivers/thermal/thermal_core.c
drivers/tty/serial/fsl_lpuart.c
drivers/tty/serial/samsung.c
drivers/usb/host/xhci-hub.c
drivers/usb/host/xhci-pci.c
drivers/usb/isp1760/isp1760-udc.c
drivers/usb/serial/ftdi_sio.c
drivers/usb/serial/ftdi_sio_ids.h
drivers/usb/serial/keyspan_pda.c
drivers/xen/Kconfig
drivers/xen/balloon.c
fs/aio.c
fs/cifs/cifsencrypt.c
fs/cifs/connect.c
fs/cifs/file.c
fs/cifs/inode.c
fs/cifs/smb2misc.c
fs/cifs/smb2ops.c
fs/cifs/smb2pdu.c
fs/fs-writeback.c
fs/locks.c
fs/nfsd/blocklayout.c
fs/nfsd/blocklayoutxdr.c
fs/nfsd/nfs4layouts.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfscache.c
fs/ocfs2/file.c
include/kvm/arm_arch_timer.h
include/kvm/arm_vgic.h
include/kvm/iodev.h [new file with mode: 0644]
include/linux/blk_types.h
include/linux/clockchips.h
include/linux/clocksource.h
include/linux/compiler.h
include/linux/cpuidle.h
include/linux/dmapool.h
include/linux/efi.h
include/linux/fs.h
include/linux/init.h
include/linux/irq_work.h
include/linux/irqchip/arm-gic-v3.h
include/linux/jump_label.h
include/linux/kvm_host.h
include/linux/lcm.h
include/linux/mmzone.h
include/linux/netdevice.h
include/linux/rtc.h
include/linux/sched.h
include/linux/seqlock.h
include/linux/stddef.h
include/linux/sunrpc/debug.h
include/linux/tick.h
include/linux/timekeeper_internal.h
include/linux/timekeeping.h
include/linux/usb/usbnet.h
include/linux/vfio.h
include/linux/writeback.h
include/media/atmel-isi.h
include/net/ip.h
include/net/ip6_route.h
include/net/sock.h
include/uapi/linux/input.h
include/uapi/linux/kvm.h
include/uapi/linux/nfsd/export.h
kernel/cpu.c
kernel/futex.c
kernel/locking/mcs_spinlock.h
kernel/locking/mutex.c
kernel/locking/osq_lock.c
kernel/locking/rtmutex.c
kernel/locking/rwsem-spinlock.c
kernel/locking/rwsem-xadd.c
kernel/locking/rwsem.c
kernel/locking/rwsem.h [new file with mode: 0644]
kernel/module.c
kernel/power/snapshot.c
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/idle.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sysctl.c
kernel/time/Kconfig
kernel/time/Makefile
kernel/time/clockevents.c
kernel/time/clocksource.c
kernel/time/hrtimer.c
kernel/time/jiffies.c
kernel/time/ntp.c
kernel/time/sched_clock.c
kernel/time/tick-broadcast.c
kernel/time/tick-common.c
kernel/time/tick-internal.h
kernel/time/tick-oneshot.c
kernel/time/tick-sched.c
kernel/time/tick-sched.h [new file with mode: 0644]
kernel/time/timekeeping.c
kernel/time/timekeeping.h
kernel/time/timer.c
kernel/time/timer_list.c
lib/Kconfig.debug
lib/lcm.c
lib/lockref.c
lib/nlattr.c
mm/mremap.c
net/ceph/messenger.c
net/core/dev.c
net/core/fib_rules.c
net/core/net_namespace.c
net/core/rtnetlink.c
net/core/sock.c
net/decnet/dn_rules.c
net/dsa/dsa.c
net/ipv4/fib_frontend.c
net/ipv4/ipmr.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv6/fib6_rules.c
net/ipv6/ip6_output.c
net/ipv6/ip6mr.c
net/ipv6/ndisc.c
net/ipv6/tcp_ipv6.c
net/iucv/af_iucv.c
net/l2tp/l2tp_core.c
net/mac80211/agg-rx.c
net/mac80211/rx.c
net/mac80211/sta_info.h
net/openvswitch/vport.c
net/sunrpc/clnt.c
net/sunrpc/debugfs.c
net/sunrpc/sunrpc_syms.c
net/sunrpc/xprt.c
net/tipc/core.c
sound/firewire/bebob/bebob_maudio.c
sound/pci/hda/patch_realtek.c
sound/soc/codecs/pcm512x.c
sound/usb/mixer_quirks.c
sound/usb/quirks.c
tools/perf/bench/mem-memcpy-x86-64-asm-def.h
tools/perf/bench/mem-memcpy-x86-64-asm.S
tools/perf/bench/mem-memcpy.c
tools/perf/bench/mem-memset-x86-64-asm-def.h
tools/perf/bench/mem-memset-x86-64-asm.S
tools/perf/util/include/asm/alternative-asm.h
tools/testing/selftests/Makefile
tools/testing/selftests/breakpoints/Makefile
tools/testing/selftests/cpu-hotplug/Makefile
tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh [new file with mode: 0755]
tools/testing/selftests/cpu-hotplug/on-off-test.sh [deleted file]
tools/testing/selftests/efivarfs/Makefile
tools/testing/selftests/efivarfs/efivarfs.sh [changed mode: 0644->0755]
tools/testing/selftests/exec/Makefile
tools/testing/selftests/firmware/Makefile
tools/testing/selftests/firmware/fw_filesystem.sh [changed mode: 0644->0755]
tools/testing/selftests/firmware/fw_userhelper.sh [changed mode: 0644->0755]
tools/testing/selftests/ftrace/Makefile
tools/testing/selftests/ftrace/test.d/00basic/basic4.tc
tools/testing/selftests/ftrace/test.d/event/event-enable.tc
tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc
tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc
tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc
tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc
tools/testing/selftests/gen_kselftest_tar.sh [new file with mode: 0755]
tools/testing/selftests/ipc/Makefile
tools/testing/selftests/kcmp/Makefile
tools/testing/selftests/kselftest_install.sh [new file with mode: 0755]
tools/testing/selftests/lib.mk [new file with mode: 0644]
tools/testing/selftests/memfd/Makefile
tools/testing/selftests/memory-hotplug/Makefile
tools/testing/selftests/memory-hotplug/mem-on-off-test.sh [new file with mode: 0755]
tools/testing/selftests/memory-hotplug/on-off-test.sh [deleted file]
tools/testing/selftests/mount/.gitignore [new file with mode: 0644]
tools/testing/selftests/mount/Makefile
tools/testing/selftests/mqueue/Makefile
tools/testing/selftests/net/Makefile
tools/testing/selftests/net/run_afpackettests [changed mode: 0644->0755]
tools/testing/selftests/net/run_netsocktests [changed mode: 0644->0755]
tools/testing/selftests/powerpc/Makefile
tools/testing/selftests/powerpc/copyloops/Makefile
tools/testing/selftests/powerpc/mm/Makefile
tools/testing/selftests/powerpc/pmu/Makefile
tools/testing/selftests/powerpc/pmu/ebb/Makefile
tools/testing/selftests/powerpc/primitives/Makefile
tools/testing/selftests/powerpc/stringloops/Makefile
tools/testing/selftests/powerpc/tm/Makefile
tools/testing/selftests/ptrace/Makefile
tools/testing/selftests/size/Makefile
tools/testing/selftests/sysctl/Makefile
tools/testing/selftests/sysctl/run_numerictests [changed mode: 0644->0755]
tools/testing/selftests/sysctl/run_stringtests [changed mode: 0644->0755]
tools/testing/selftests/timers/Makefile
tools/testing/selftests/timers/alarmtimer-suspend.c [new file with mode: 0644]
tools/testing/selftests/timers/change_skew.c [new file with mode: 0644]
tools/testing/selftests/timers/clocksource-switch.c [new file with mode: 0644]
tools/testing/selftests/timers/inconsistency-check.c [new file with mode: 0644]
tools/testing/selftests/timers/leap-a-day.c [new file with mode: 0644]
tools/testing/selftests/timers/leapcrash.c [new file with mode: 0644]
tools/testing/selftests/timers/mqueue-lat.c [new file with mode: 0644]
tools/testing/selftests/timers/nanosleep.c [new file with mode: 0644]
tools/testing/selftests/timers/nsleep-lat.c [new file with mode: 0644]
tools/testing/selftests/timers/posix_timers.c
tools/testing/selftests/timers/raw_skew.c [new file with mode: 0644]
tools/testing/selftests/timers/rtctest.c [new file with mode: 0644]
tools/testing/selftests/timers/set-2038.c [new file with mode: 0644]
tools/testing/selftests/timers/set-tai.c [new file with mode: 0644]
tools/testing/selftests/timers/set-timer-lat.c [new file with mode: 0644]
tools/testing/selftests/timers/skew_consistency.c [new file with mode: 0644]
tools/testing/selftests/timers/threadtest.c [new file with mode: 0644]
tools/testing/selftests/timers/valid-adjtimex.c [new file with mode: 0644]
tools/testing/selftests/user/Makefile
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/run_vmtests [changed mode: 0644->0755]
tools/testing/selftests/x86/.gitignore [new file with mode: 0644]
tools/testing/selftests/x86/Makefile [new file with mode: 0644]
tools/testing/selftests/x86/run_x86_tests.sh [new file with mode: 0644]
tools/testing/selftests/x86/sigreturn.c [new file with mode: 0644]
tools/testing/selftests/x86/trivial_32bit_program.c [new file with mode: 0644]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/vgic-v2-emul.c
virt/kvm/arm/vgic-v3-emul.c
virt/kvm/arm/vgic.c
virt/kvm/arm/vgic.h
virt/kvm/coalesced_mmio.c
virt/kvm/eventfd.c
virt/kvm/iodev.h [deleted file]
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt
index f51861bcb07bbbc226e0d1af2897f6b49bcf82c5..e550c8b98139974343de5e4890c94f642f0a53d3 100644 (file)
                        APEI Error INJection
                        ~~~~~~~~~~~~~~~~~~~~
 
-EINJ provides a hardware error injection mechanism
-It is very useful for debugging and testing of other APEI and RAS features.
+EINJ provides a hardware error injection mechanism. It is very useful
+for debugging and testing APEI and RAS features in general.
 
-To use EINJ, make sure the following are enabled in your kernel
+You need to check whether your BIOS supports EINJ first. For that, look
+for early boot messages similar to this one:
+
+ACPI: EINJ 0x000000007370A000 000150 (v01 INTEL           00000001 INTL 00000001)
+
+which shows that the BIOS is exposing an EINJ table - it is the
+mechanism through which the injection is done.
+
+Alternatively, look in /sys/firmware/acpi/tables for an "EINJ" file,
+which is a different representation of the same thing.
+
+It doesn't necessarily mean that EINJ is not supported if those above
+don't exist: before you give up, go into BIOS setup to see if the BIOS
+has an option to enable error injection. Look for something called WHEA
+or similar. Often, you need to enable an ACPI5 support option prior, in
+order to see the APEI,EINJ,... functionality supported and exposed by
+the BIOS menu.
+
+To use EINJ, make sure the following options are enabled in your kernel
 configuration:
 
 CONFIG_DEBUG_FS
 CONFIG_ACPI_APEI
 CONFIG_ACPI_APEI_EINJ
 
-The user interface of EINJ is debug file system, under the
-directory apei/einj. The following files are provided.
+The EINJ user interface is in <debugfs mount point>/apei/einj.
+
+The following files belong to it:
 
 - available_error_type
-  Reading this file returns the error injection capability of the
-  platform, that is, which error types are supported. The error type
-  definition is as follow, the left field is the error type value, the
-  right field is error description.
-
-    0x00000001 Processor Correctable
-    0x00000002 Processor Uncorrectable non-fatal
-    0x00000004 Processor Uncorrectable fatal
-    0x00000008  Memory Correctable
-    0x00000010  Memory Uncorrectable non-fatal
-    0x00000020  Memory Uncorrectable fatal
-    0x00000040 PCI Express Correctable
-    0x00000080 PCI Express Uncorrectable fatal
-    0x00000100 PCI Express Uncorrectable non-fatal
-    0x00000200 Platform Correctable
-    0x00000400 Platform Uncorrectable non-fatal
-    0x00000800 Platform Uncorrectable fatal
-
-  The format of file contents are as above, except there are only the
-  available error type lines.
+
+  This file shows which error types are supported:
+
+  Error Type Value     Error Description
+  ================     =================
+  0x00000001           Processor Correctable
+  0x00000002           Processor Uncorrectable non-fatal
+  0x00000004           Processor Uncorrectable fatal
+  0x00000008           Memory Correctable
+  0x00000010           Memory Uncorrectable non-fatal
+  0x00000020           Memory Uncorrectable fatal
+  0x00000040           PCI Express Correctable
+  0x00000080           PCI Express Uncorrectable fatal
+  0x00000100           PCI Express Uncorrectable non-fatal
+  0x00000200           Platform Correctable
+  0x00000400           Platform Uncorrectable non-fatal
+  0x00000800           Platform Uncorrectable fatal
+
+  The format of the file contents is as above, except that only the
+  available error types are present.
 
 - error_type
-  This file is used to set the error type value. The error type value
-  is defined in "available_error_type" description.
+
+  Set the value of the error type being injected. Possible error types
+  are defined in the file available_error_type above.
 
 - error_inject
-  Write any integer to this file to trigger the error
-  injection. Before this, please specify all necessary error
-  parameters.
+
+  Write any integer to this file to trigger the error injection. Make
+  sure you have specified all necessary error parameters, i.e. this
+  write should be the last step when injecting errors.
 
 - flags
-  Present for kernel version 3.13 and above. Used to specify which
-  of param{1..4} are valid and should be used by BIOS during injection.
-  Value is a bitmask as specified in ACPI5.0 spec for the
+
+  Present for kernel versions 3.13 and above. Used to specify which
+  of param{1..4} are valid and should be used by the firmware during
+  injection. Value is a bitmask as specified in ACPI5.0 spec for the
   SET_ERROR_TYPE_WITH_ADDRESS data structure:
-       Bit 0 - Processor APIC field valid (see param3 below)
-       Bit 1 - Memory address and mask valid (param1 and param2)
-       Bit 2 - PCIe (seg,bus,dev,fn) valid (param4 below)
-  If set to zero, legacy behaviour is used where the type of injection
-  specifies just one bit set, and param1 is multiplexed.
+
+       Bit 0 - Processor APIC field valid (see param3 below).
+       Bit 1 - Memory address and mask valid (param1 and param2).
+       Bit 2 - PCIe (seg,bus,dev,fn) valid (see param4 below).
+
+  If set to zero, legacy behavior is mimicked where the type of
+  injection specifies just one bit set, and param1 is multiplexed.
 
 - param1
-  This file is used to set the first error parameter value. Effect of
-  parameter depends on error_type specified. For example, if error
-  type is memory related type, the param1 should be a valid physical
-  memory address. [Unless "flag" is set - see above]
+
+  This file is used to set the first error parameter value. Its effect
+  depends on the error type specified in error_type. For example, if
+  error type is memory related type, the param1 should be a valid
+  physical memory address. [Unless "flag" is set - see above]
 
 - param2
-  This file is used to set the second error parameter value. Effect of
-  parameter depends on error_type specified. For example, if error
-  type is memory related type, the param2 should be a physical memory
-  address mask. Linux requires page or narrower granularity, say,
-  0xfffffffffffff000.
+
+  Same use as param1 above. For example, if error type is of memory
+  related type, then param2 should be a physical memory address mask.
+  Linux requires page or narrower granularity, say, 0xfffffffffffff000.
 
 - param3
-  Used when the 0x1 bit is set in "flag" to specify the APIC id
+
+  Used when the 0x1 bit is set in "flags" to specify the APIC id
 
 - param4
-  Used when the 0x4 bit is set in "flag" to specify target PCIe device
+  Used when the 0x4 bit is set in "flags" to specify target PCIe device
 
 - notrigger
-  The EINJ mechanism is a two step process. First inject the error, then
-  perform some actions to trigger it. Setting "notrigger" to 1 skips the
-  trigger phase, which *may* allow the user to cause the error in some other
-  context by a simple access to the cpu, memory location, or device that is
-  the target of the error injection. Whether this actually works depends
-  on what operations the BIOS actually includes in the trigger phase.
-
-BIOS versions based in the ACPI 4.0 specification have limited options
-to control where the errors are injected.  Your BIOS may support an
-extension (enabled with the param_extension=1 module parameter, or
-boot command line einj.param_extension=1). This allows the address
-and mask for memory injections to be specified by the param1 and
-param2 files in apei/einj.
-
-BIOS versions using the ACPI 5.0 specification have more control over
-the target of the injection. For processor related errors (type 0x1,
-0x2 and 0x4) the APICID of the target should be provided using the
-param1 file in apei/einj. For memory errors (type 0x8, 0x10 and 0x20)
-the address is set using param1 with a mask in param2 (0x0 is equivalent
-to all ones). For PCI express errors (type 0x40, 0x80 and 0x100) the
-segment, bus, device and function are specified using param1:
+
+  The error injection mechanism is a two-step process. First inject the
+  error, then perform some actions to trigger it. Setting "notrigger"
+  to 1 skips the trigger phase, which *may* allow the user to cause the
+  error in some other context by a simple access to the CPU, memory
+  location, or device that is the target of the error injection. Whether
+  this actually works depends on what operations the BIOS actually
+  includes in the trigger phase.
+
+BIOS versions based on the ACPI 4.0 specification have limited options
+in controlling where the errors are injected. Your BIOS may support an
+extension (enabled with the param_extension=1 module parameter, or boot
+command line einj.param_extension=1). This allows the address and mask
+for memory injections to be specified by the param1 and param2 files in
+apei/einj.
+
+BIOS versions based on the ACPI 5.0 specification have more control over
+the target of the injection. For processor-related errors (type 0x1, 0x2
+and 0x4), you can set flags to 0x3 (param3 for bit 0, and param1 and
+param2 for bit 1) so that you have more information added to the error
+signature being injected. The actual data passed is this:
+
+       memory_address = param1;
+       memory_address_range = param2;
+       apicid = param3;
+       pcie_sbdf = param4;
+
+For memory errors (type 0x8, 0x10 and 0x20) the address is set using
+param1 with a mask in param2 (0x0 is equivalent to all ones). For PCI
+express errors (type 0x40, 0x80 and 0x100) the segment, bus, device and
+function are specified using param1:
 
          31     24 23    16 15    11 10      8  7        0
        +-------------------------------------------------+
        | segment |   bus  | device | function | reserved |
        +-------------------------------------------------+
 
-An ACPI 5.0 BIOS may also allow vendor specific errors to be injected.
+Anyway, you get the idea; if there's doubt, just take a look at the code
+in drivers/acpi/apei/einj.c.
+
+An ACPI 5.0 BIOS may also allow vendor-specific errors to be injected.
 In this case a file named vendor will contain identifying information
 from the BIOS that hopefully will allow an application wishing to use
-the vendor specific extension to tell that they are running on a BIOS
+the vendor-specific extension to tell that they are running on a BIOS
 that supports it. All vendor extensions have the 0x80000000 bit set in
 error_type. A file vendor_flags controls the interpretation of param1
 and param2 (1 = PROCESSOR, 2 = MEMORY, 4 = PCI). See your BIOS vendor
 documentation for details (and expect changes to this API if vendors
 creativity in using this feature expands beyond our expectations).
 
-Example:
+
+An error injection example:
+
 # cd /sys/kernel/debug/apei/einj
 # cat available_error_type             # See which errors can be injected
 0x00000002     Processor Uncorrectable non-fatal
 0x00000008     Memory Correctable
 0x00000010     Memory Uncorrectable non-fatal
 # echo 0x12345000 > param1             # Set memory address for injection
-# echo 0xfffffffffffff000 > param2     # Mask - anywhere in this page
+# echo $((-1 << 12)) > param2          # Mask 0xfffffffffffff000 - anywhere in this page
 # echo 0x8 > error_type                        # Choose correctable memory error
 # echo 1 > error_inject                        # Inject now
 
+You should see something like this in dmesg:
+
+[22715.830801] EDAC sbridge MC3: HANDLING MCE MEMORY ERROR
+[22715.834759] EDAC sbridge MC3: CPU 0: Machine Check Event: 0 Bank 7: 8c00004000010090
+[22715.834759] EDAC sbridge MC3: TSC 0
+[22715.834759] EDAC sbridge MC3: ADDR 12345000 EDAC sbridge MC3: MISC 144780c86
+[22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0
+[22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 -  area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0)
 
 For more information about EINJ, please refer to ACPI specification
 version 4.0, section 17.5 and ACPI 5.0, section 18.6.
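
The same flow can also be driven programmatically. The following is a minimal
C sketch of the debugfs interface described above, mirroring the shell example;
the einj_write() helper and the use of flags=0x2 (memory address/mask valid)
are illustrative assumptions, and error handling is abbreviated:

/*
 * Minimal sketch: inject a correctable memory error through the debugfs
 * files described above. Assumes debugfs is mounted at /sys/kernel/debug
 * and that 0x8 appears in available_error_type.
 */
#include <stdio.h>
#include <stdlib.h>

static void einj_write(const char *file, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/apei/einj/%s", file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	einj_write("error_type", "0x8");		/* Memory Correctable       */
	einj_write("flags", "0x2");			/* param1/param2 are valid  */
	einj_write("param1", "0x12345000");		/* physical address         */
	einj_write("param2", "0xfffffffffffff000");	/* mask: anywhere in page   */
	einj_write("error_inject", "1");		/* last step: trigger it    */
	return 0;
}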
diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt
index e124847443f87c9465ec82dc03ce792d2d1cf5c4..f0b4cd72411d66bf12aeec3d64335d6641b26674 100644 (file)
@@ -19,7 +19,9 @@ the parent DSA node. The maximum number of allowed child nodes is 4
 (DSA_MAX_SWITCHES).
 Each of these switch child nodes should have the following required properties:
 
-- reg                  : Describes the switch address on the MII bus
+- reg                  : Contains two fields. The first one describes the
+                         address on the MII bus. The second is the switch
+                         number that must be unique in cascaded configurations
 - #address-cells       : Must be 1
 - #size-cells          : Must be 0
 
diff --git a/Documentation/devicetree/bindings/thermal/rcar-thermal.txt b/Documentation/devicetree/bindings/thermal/rcar-thermal.txt
index 43404b197933262859e79686d5a8fa25ab12e5f3..332e625f6ed01cb4e442c0ea530ab29eb92f2a77 100644 (file)
@@ -4,7 +4,7 @@ Required properties:
 - compatible           : "renesas,thermal-<soctype>", "renesas,rcar-thermal"
                          as fallback.
                          Examples with soctypes are:
-                           - "renesas,thermal-r8a73a4" (R-Mobile AP6)
+                           - "renesas,thermal-r8a73a4" (R-Mobile APE6)
                            - "renesas,thermal-r8a7779" (R-Car H1)
                            - "renesas,thermal-r8a7790" (R-Car H2)
                            - "renesas,thermal-r8a7791" (R-Car M2-W)
diff --git a/Documentation/input/alps.txt b/Documentation/input/alps.txt
index a63e5e013a8cddee63b1d3520dd1c2c73e80dc31..92ae734c00c348ab810373e0dc838a92462c932f 100644 (file)
@@ -114,6 +114,9 @@ ALPS Absolute Mode - Protocol Version 2
  byte 4:  0   y6   y5   y4   y3   y2   y1   y0
  byte 5:  0   z6   z5   z4   z3   z2   z1   z0
 
+Protocol Version 2 DualPoint devices send standard PS/2 mouse packets for
+the DualPoint Stick.
+
 Dualpoint device -- interleaved packet format
 ---------------------------------------------
 
@@ -127,6 +130,11 @@ Dualpoint device -- interleaved packet format
  byte 7:    0   y6   y5   y4   y3   y2   y1   y0
  byte 8:    0   z6   z5   z4   z3   z2   z1   z0
 
+Devices which use the interleaving format normally send standard PS/2 mouse
+packets for the DualPoint Stick + ALPS Absolute Mode packets for the
+touchpad, switching to the interleaved packet format when both the stick and
+the touchpad are used at the same time.
+
 ALPS Absolute Mode - Protocol Version 3
 ---------------------------------------
 
diff --git a/Documentation/input/event-codes.txt b/Documentation/input/event-codes.txt
index c587a966413e8597da3241f851db92f6e6df1d81..96705616f5820a6d48d6cfda497c7fcf30917e2a 100644 (file)
@@ -294,6 +294,12 @@ accordingly. This property does not affect kernel behavior.
 The kernel does not provide button emulation for such devices but treats
 them as any other INPUT_PROP_BUTTONPAD device.
 
+INPUT_PROP_ACCELEROMETER
+-------------------------
+Directional axes on this device (absolute and/or relative x, y, z) represent
+accelerometer data. All other axes retain their meaning. A device must not mix
+regular directional axes and accelerometer axes on the same event node.
+
 Guidelines:
 ==========
 The guidelines below ensure proper single-touch and multi-finger functionality.
diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
index 7b4f59c09ee2301d077446f5f9ce9c2b96aa2551..b85d000faeb4067c9ab1ed06690105d459a40a4e 100644 (file)
@@ -312,9 +312,12 @@ ABS_MT_TOOL_TYPE
 
 The type of approaching tool. A lot of kernel drivers cannot distinguish
 between different tool types, such as a finger or a pen. In such cases, the
-event should be omitted. The protocol currently supports MT_TOOL_FINGER and
-MT_TOOL_PEN [2]. For type B devices, this event is handled by input core;
-drivers should instead use input_mt_report_slot_state().
+event should be omitted. The protocol currently supports MT_TOOL_FINGER,
+MT_TOOL_PEN, and MT_TOOL_PALM [2]. For type B devices, this event is handled
+by input core; drivers should instead use input_mt_report_slot_state().
+A contact's ABS_MT_TOOL_TYPE may change over time while still touching the
+device, because the firmware may not be able to determine which tool is being
+used when it first appears.
 
 ABS_MT_BLOB_ID
 
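
As a hedged illustration of the type B handling mentioned in the hunk above, a
driver reporting a palm contact goes through input_mt_report_slot_state()
rather than emitting ABS_MT_TOOL_TYPE itself; the sketch below assumes a
driver that has already set up slots with input_mt_init_slots():

#include <linux/input/mt.h>

/* Illustrative only: report one contact in a given slot. */
static void report_contact(struct input_dev *dev, int slot,
			   int x, int y, bool is_palm)
{
	input_mt_slot(dev, slot);
	/* The input core emits ABS_MT_TOOL_TYPE for type B devices. */
	input_mt_report_slot_state(dev,
				   is_palm ? MT_TOOL_PALM : MT_TOOL_FINGER,
				   true);
	input_report_abs(dev, ABS_MT_POSITION_X, x);
	input_report_abs(dev, ABS_MT_POSITION_Y, y);
	input_sync(dev);
}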
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a62a7b48466f3cbe5a08d3e1a632eadc3d2..01aa47d3b6ab607e6a66f2a6a827b4b1afe4da73 100644 (file)
@@ -1036,7 +1036,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        Format: {"off" | "on" | "skip[mbr]"}
 
        efi=            [EFI]
-                       Format: { "old_map", "nochunk", "noruntime" }
+                       Format: { "old_map", "nochunk", "noruntime", "debug" }
                        old_map [X86-64]: switch to the old ioremap-based EFI
                        runtime services mapping. 32-bit still uses this one by
                        default.
@@ -1044,6 +1044,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        boot stub, as chunking can cause problems with some
                        firmware implementations.
                        noruntime : disable EFI runtime services support
+                       debug: enable misc debug output
 
        efi_no_storage_paranoia [EFI; X86]
                        Using this parameter you can use more than 50% of
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
index 596b60c08b7451a9739d22aa2a29cb3c5c2559c9..8446f1ea1410b87b071047dc310a787a92606c31 100644 (file)
@@ -204,266 +204,4 @@ Some common examples:
 
     *  RTC_PIE_ON, RTC_PIE_OFF: These are also emulated by the generic code.
 
-If all else fails, check out the rtc-test.c driver!
-
-
--------------------- 8< ---------------- 8< -----------------------------
-
-/*
- *      Real Time Clock Driver Test/Example Program
- *
- *      Compile with:
- *                  gcc -s -Wall -Wstrict-prototypes rtctest.c -o rtctest
- *
- *      Copyright (C) 1996, Paul Gortmaker.
- *
- *      Released under the GNU General Public License, version 2,
- *      included herein by reference.
- *
- */
-
-#include <stdio.h>
-#include <linux/rtc.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <errno.h>
-
-
-/*
- * This expects the new RTC class driver framework, working with
- * clocks that will often not be clones of what the PC-AT had.
- * Use the command line to specify another RTC if you need one.
- */
-static const char default_rtc[] = "/dev/rtc0";
-
-
-int main(int argc, char **argv)
-{
-       int i, fd, retval, irqcount = 0;
-       unsigned long tmp, data;
-       struct rtc_time rtc_tm;
-       const char *rtc = default_rtc;
-
-       switch (argc) {
-       case 2:
-               rtc = argv[1];
-               /* FALLTHROUGH */
-       case 1:
-               break;
-       default:
-               fprintf(stderr, "usage:  rtctest [rtcdev]\n");
-               return 1;
-       }
-
-       fd = open(rtc, O_RDONLY);
-
-       if (fd ==  -1) {
-               perror(rtc);
-               exit(errno);
-       }
-
-       fprintf(stderr, "\n\t\t\tRTC Driver Test Example.\n\n");
-
-       /* Turn on update interrupts (one per second) */
-       retval = ioctl(fd, RTC_UIE_ON, 0);
-       if (retval == -1) {
-               if (errno == ENOTTY) {
-                       fprintf(stderr,
-                               "\n...Update IRQs not supported.\n");
-                       goto test_READ;
-               }
-               perror("RTC_UIE_ON ioctl");
-               exit(errno);
-       }
-
-       fprintf(stderr, "Counting 5 update (1/sec) interrupts from reading %s:",
-                       rtc);
-       fflush(stderr);
-       for (i=1; i<6; i++) {
-               /* This read will block */
-               retval = read(fd, &data, sizeof(unsigned long));
-               if (retval == -1) {
-                       perror("read");
-                       exit(errno);
-               }
-               fprintf(stderr, " %d",i);
-               fflush(stderr);
-               irqcount++;
-       }
-
-       fprintf(stderr, "\nAgain, from using select(2) on /dev/rtc:");
-       fflush(stderr);
-       for (i=1; i<6; i++) {
-               struct timeval tv = {5, 0};     /* 5 second timeout on select */
-               fd_set readfds;
-
-               FD_ZERO(&readfds);
-               FD_SET(fd, &readfds);
-               /* The select will wait until an RTC interrupt happens. */
-               retval = select(fd+1, &readfds, NULL, NULL, &tv);
-               if (retval == -1) {
-                       perror("select");
-                       exit(errno);
-               }
-               /* This read won't block unlike the select-less case above. */
-               retval = read(fd, &data, sizeof(unsigned long));
-               if (retval == -1) {
-                       perror("read");
-                       exit(errno);
-               }
-               fprintf(stderr, " %d",i);
-               fflush(stderr);
-               irqcount++;
-       }
-
-       /* Turn off update interrupts */
-       retval = ioctl(fd, RTC_UIE_OFF, 0);
-       if (retval == -1) {
-               perror("RTC_UIE_OFF ioctl");
-               exit(errno);
-       }
-
-test_READ:
-       /* Read the RTC time/date */
-       retval = ioctl(fd, RTC_RD_TIME, &rtc_tm);
-       if (retval == -1) {
-               perror("RTC_RD_TIME ioctl");
-               exit(errno);
-       }
-
-       fprintf(stderr, "\n\nCurrent RTC date/time is %d-%d-%d, %02d:%02d:%02d.\n",
-               rtc_tm.tm_mday, rtc_tm.tm_mon + 1, rtc_tm.tm_year + 1900,
-               rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
-
-       /* Set the alarm to 5 sec in the future, and check for rollover */
-       rtc_tm.tm_sec += 5;
-       if (rtc_tm.tm_sec >= 60) {
-               rtc_tm.tm_sec %= 60;
-               rtc_tm.tm_min++;
-       }
-       if (rtc_tm.tm_min == 60) {
-               rtc_tm.tm_min = 0;
-               rtc_tm.tm_hour++;
-       }
-       if (rtc_tm.tm_hour == 24)
-               rtc_tm.tm_hour = 0;
-
-       retval = ioctl(fd, RTC_ALM_SET, &rtc_tm);
-       if (retval == -1) {
-               if (errno == ENOTTY) {
-                       fprintf(stderr,
-                               "\n...Alarm IRQs not supported.\n");
-                       goto test_PIE;
-               }
-               perror("RTC_ALM_SET ioctl");
-               exit(errno);
-       }
-
-       /* Read the current alarm settings */
-       retval = ioctl(fd, RTC_ALM_READ, &rtc_tm);
-       if (retval == -1) {
-               perror("RTC_ALM_READ ioctl");
-               exit(errno);
-       }
-
-       fprintf(stderr, "Alarm time now set to %02d:%02d:%02d.\n",
-               rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
-
-       /* Enable alarm interrupts */
-       retval = ioctl(fd, RTC_AIE_ON, 0);
-       if (retval == -1) {
-               perror("RTC_AIE_ON ioctl");
-               exit(errno);
-       }
-
-       fprintf(stderr, "Waiting 5 seconds for alarm...");
-       fflush(stderr);
-       /* This blocks until the alarm ring causes an interrupt */
-       retval = read(fd, &data, sizeof(unsigned long));
-       if (retval == -1) {
-               perror("read");
-               exit(errno);
-       }
-       irqcount++;
-       fprintf(stderr, " okay. Alarm rang.\n");
-
-       /* Disable alarm interrupts */
-       retval = ioctl(fd, RTC_AIE_OFF, 0);
-       if (retval == -1) {
-               perror("RTC_AIE_OFF ioctl");
-               exit(errno);
-       }
-
-test_PIE:
-       /* Read periodic IRQ rate */
-       retval = ioctl(fd, RTC_IRQP_READ, &tmp);
-       if (retval == -1) {
-               /* not all RTCs support periodic IRQs */
-               if (errno == ENOTTY) {
-                       fprintf(stderr, "\nNo periodic IRQ support\n");
-                       goto done;
-               }
-               perror("RTC_IRQP_READ ioctl");
-               exit(errno);
-       }
-       fprintf(stderr, "\nPeriodic IRQ rate is %ldHz.\n", tmp);
-
-       fprintf(stderr, "Counting 20 interrupts at:");
-       fflush(stderr);
-
-       /* The frequencies 128Hz, 256Hz, ... 8192Hz are only allowed for root. */
-       for (tmp=2; tmp<=64; tmp*=2) {
-
-               retval = ioctl(fd, RTC_IRQP_SET, tmp);
-               if (retval == -1) {
-                       /* not all RTCs can change their periodic IRQ rate */
-                       if (errno == ENOTTY) {
-                               fprintf(stderr,
-                                       "\n...Periodic IRQ rate is fixed\n");
-                               goto done;
-                       }
-                       perror("RTC_IRQP_SET ioctl");
-                       exit(errno);
-               }
-
-               fprintf(stderr, "\n%ldHz:\t", tmp);
-               fflush(stderr);
-
-               /* Enable periodic interrupts */
-               retval = ioctl(fd, RTC_PIE_ON, 0);
-               if (retval == -1) {
-                       perror("RTC_PIE_ON ioctl");
-                       exit(errno);
-               }
-
-               for (i=1; i<21; i++) {
-                       /* This blocks */
-                       retval = read(fd, &data, sizeof(unsigned long));
-                       if (retval == -1) {
-                               perror("read");
-                               exit(errno);
-                       }
-                       fprintf(stderr, " %d",i);
-                       fflush(stderr);
-                       irqcount++;
-               }
-
-               /* Disable periodic interrupts */
-               retval = ioctl(fd, RTC_PIE_OFF, 0);
-               if (retval == -1) {
-                       perror("RTC_PIE_OFF ioctl");
-                       exit(errno);
-               }
-       }
-
-done:
-       fprintf(stderr, "\n\n\t\t\t *** Test complete ***\n");
-
-       close(fd);
-
-       return 0;
-}
+If all else fails, check out the tools/testing/selftests/timers/rtctest.c test!
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b112efc816f155093ef80815db788a2cdddf0ecc..bc9f6fe44e27614c2f26f155385fe0d5d13d2e3b 100644 (file)
@@ -997,7 +997,7 @@ for vm-wide capabilities.
 4.38 KVM_GET_MP_STATE
 
 Capability: KVM_CAP_MP_STATE
-Architectures: x86, s390
+Architectures: x86, s390, arm, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_mp_state (out)
 Returns: 0 on success; -1 on error
@@ -1011,7 +1011,7 @@ uniprocessor guests).
 
 Possible values are:
 
- - KVM_MP_STATE_RUNNABLE:        the vcpu is currently running [x86]
+ - KVM_MP_STATE_RUNNABLE:        the vcpu is currently running [x86,arm/arm64]
  - KVM_MP_STATE_UNINITIALIZED:   the vcpu is an application processor (AP)
                                  which has not yet received an INIT signal [x86]
  - KVM_MP_STATE_INIT_RECEIVED:   the vcpu has received an INIT signal, and is
@@ -1020,7 +1020,7 @@ Possible values are:
                                  is waiting for an interrupt [x86]
  - KVM_MP_STATE_SIPI_RECEIVED:   the vcpu has just received a SIPI (vector
                                  accessible via KVM_GET_VCPU_EVENTS) [x86]
- - KVM_MP_STATE_STOPPED:         the vcpu is stopped [s390]
+ - KVM_MP_STATE_STOPPED:         the vcpu is stopped [s390,arm/arm64]
  - KVM_MP_STATE_CHECK_STOP:      the vcpu is in a special error state [s390]
  - KVM_MP_STATE_OPERATING:       the vcpu is operating (running or halted)
                                  [s390]
@@ -1031,11 +1031,15 @@ On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 in-kernel irqchip, the multiprocessing state must be maintained by userspace on
 these architectures.
 
+For arm/arm64:
+
+The only states that are valid are KVM_MP_STATE_STOPPED and
+KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.
 
 4.39 KVM_SET_MP_STATE
 
 Capability: KVM_CAP_MP_STATE
-Architectures: x86, s390
+Architectures: x86, s390, arm, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_mp_state (in)
 Returns: 0 on success; -1 on error
@@ -1047,6 +1051,10 @@ On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 in-kernel irqchip, the multiprocessing state must be maintained by userspace on
 these architectures.
 
+For arm/arm64:
+
+The only states that are valid are KVM_MP_STATE_STOPPED and
+KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.
 
 4.40 KVM_SET_IDENTITY_MAP_ADDR
 
@@ -1967,15 +1975,25 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_STATUS       | 32
   MIPS  | KVM_REG_MIPS_CP0_CAUSE        | 32
   MIPS  | KVM_REG_MIPS_CP0_EPC          | 64
+  MIPS  | KVM_REG_MIPS_CP0_PRID         | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG       | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG1      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG2      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG3      | 32
+  MIPS  | KVM_REG_MIPS_CP0_CONFIG4      | 32
+  MIPS  | KVM_REG_MIPS_CP0_CONFIG5      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG7      | 32
   MIPS  | KVM_REG_MIPS_CP0_ERROREPC     | 64
   MIPS  | KVM_REG_MIPS_COUNT_CTL        | 64
   MIPS  | KVM_REG_MIPS_COUNT_RESUME     | 64
   MIPS  | KVM_REG_MIPS_COUNT_HZ         | 64
+  MIPS  | KVM_REG_MIPS_FPR_32(0..31)    | 32
+  MIPS  | KVM_REG_MIPS_FPR_64(0..31)    | 64
+  MIPS  | KVM_REG_MIPS_VEC_128(0..31)   | 128
+  MIPS  | KVM_REG_MIPS_FCR_IR           | 32
+  MIPS  | KVM_REG_MIPS_FCR_CSR          | 32
+  MIPS  | KVM_REG_MIPS_MSA_IR           | 32
+  MIPS  | KVM_REG_MIPS_MSA_CSR          | 32
 
 ARM registers are mapped using the lower 32 bits.  The upper 16 of that
 is the register group type, or coprocessor number:
@@ -2029,6 +2047,25 @@ patterns depending on whether they're 32-bit or 64-bit registers:
 MIPS KVM control registers (see above) have the following id bit patterns:
   0x7030 0000 0002 <reg:16>
 
+MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following
+id bit patterns depending on the size of the register being accessed. They are
+always accessed according to the current guest FPU mode (Status.FR and
+Config5.FRE), i.e. as the guest would see them, and they become unpredictable
+if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector
+registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they
+overlap the FPU registers:
+  0x7020 0000 0003 00 <0:3> <reg:5> (32-bit FPU registers)
+  0x7030 0000 0003 00 <0:3> <reg:5> (64-bit FPU registers)
+  0x7040 0000 0003 00 <0:3> <reg:5> (128-bit MSA vector registers)
+
+MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the
+following id bit patterns:
+  0x7020 0000 0003 01 <0:3> <reg:5>
+
+MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the
+following id bit patterns:
+  0x7020 0000 0003 02 <0:3> <reg:5>
+
 
 4.69 KVM_GET_ONE_REG
 
@@ -2234,7 +2271,7 @@ into the hash PTE second double word).
 4.75 KVM_IRQFD
 
 Capability: KVM_CAP_IRQFD
-Architectures: x86 s390
+Architectures: x86 s390 arm arm64
 Type: vm ioctl
 Parameters: struct kvm_irqfd (in)
 Returns: 0 on success, -1 on error
@@ -2260,6 +2297,10 @@ Note that closing the resamplefd is not sufficient to disable the
 irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
 and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
 
+On ARM/ARM64, the gsi field in the kvm_irqfd struct specifies the Shared
+Peripheral Interrupt (SPI) index, such that the GIC interrupt ID is
+given by gsi + 32.
+
 4.76 KVM_PPC_ALLOCATE_HTAB
 
 Capability: KVM_CAP_PPC_ALLOC_HTAB
@@ -2716,6 +2757,227 @@ The fields in each entry are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination
 
+4.89 KVM_S390_MEM_OP
+
+Capability: KVM_CAP_S390_MEM_OP
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_mem_op (in)
+Returns: = 0 on success,
+         < 0 on generic error (e.g. -EFAULT or -ENOMEM),
+         > 0 if an exception occurred while walking the page tables
+
+Read or write data from/to the logical (virtual) memory of a VCPU.
+
+Parameters are specified via the following structure:
+
+struct kvm_s390_mem_op {
+       __u64 gaddr;            /* the guest address */
+       __u64 flags;            /* flags */
+       __u32 size;             /* amount of bytes */
+       __u32 op;               /* type of operation */
+       __u64 buf;              /* buffer in userspace */
+       __u8 ar;                /* the access register number */
+       __u8 reserved[31];      /* should be set to 0 */
+};
+
+The type of operation is specified in the "op" field. It is either
+KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or
+KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The
+KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check
+whether the corresponding memory access would create an access exception
+(without touching the data in the memory at the destination). In case an
+access exception occurred while walking the MMU tables of the guest, the
+ioctl returns a positive error number to indicate the type of exception.
+This exception is also raised directly at the corresponding VCPU if the
+flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field.
+
+The start address of the memory region has to be specified in the "gaddr"
+field, and the length of the region in the "size" field. "buf" is the buffer
+supplied by the userspace application where the read data should be written
+to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written
+is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL
+when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access
+register number to be used.
+
+The "reserved" field is meant for future extensions. It is not used by
+KVM with the currently defined set of flags.
+
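
A minimal userspace sketch of the ioctl described above, assuming a vcpu file
descriptor obtained via KVM_CREATE_VCPU (error handling abbreviated):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: read 'len' bytes from guest logical memory. */
static int read_guest(int vcpu_fd, __u64 gaddr, void *buf, __u32 len)
{
	struct kvm_s390_mem_op op;

	memset(&op, 0, sizeof(op));		/* also clears reserved[] */
	op.gaddr = gaddr;			/* guest logical address  */
	op.size  = len;
	op.op    = KVM_S390_MEMOP_LOGICAL_READ;
	op.buf   = (__u64)(unsigned long)buf;
	op.ar    = 0;				/* access register 0      */

	/* A return value > 0 indicates a guest access exception. */
	return ioctl(vcpu_fd, KVM_S390_MEM_OP, &op);
}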
+4.90 KVM_S390_GET_SKEYS
+
+Capability: KVM_CAP_S390_SKEYS
+Architectures: s390
+Type: vm ioctl
+Parameters: struct kvm_s390_skeys
+Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage
+         keys, negative value on error
+
+This ioctl is used to get guest storage key values on the s390
+architecture. The ioctl takes parameters via the kvm_s390_skeys struct.
+
+struct kvm_s390_skeys {
+       __u64 start_gfn;
+       __u64 count;
+       __u64 skeydata_addr;
+       __u32 flags;
+       __u32 reserved[9];
+};
+
+The start_gfn field is the number of the first guest frame whose storage keys
+you want to get.
+
+The count field is the number of consecutive frames (starting from start_gfn)
+whose storage keys to get. The count field must be at least 1 and the maximum
+allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range
+will cause the ioctl to return -EINVAL.
+
+The skeydata_addr field is the address to a buffer large enough to hold count
+bytes. This buffer will be filled with storage key data by the ioctl.
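+
+For example, fetching the keys of the first 128 guest frames could look like
+the following sketch (error handling omitted; "vm_fd" is assumed to be an
+open VM file descriptor):
+
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int get_skeys(int vm_fd, uint8_t *keys /* at least 128 bytes */)
+{
+       struct kvm_s390_skeys args;
+
+       memset(&args, 0, sizeof(args));
+       args.start_gfn = 0;
+       args.count = 128;
+       args.skeydata_addr = (uint64_t)(uintptr_t)keys;
+       return ioctl(vm_fd, KVM_S390_GET_SKEYS, &args);
+}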
+
+4.91 KVM_S390_SET_SKEYS
+
+Capability: KVM_CAP_S390_SKEYS
+Architectures: s390
+Type: vm ioctl
+Parameters: struct kvm_s390_skeys
+Returns: 0 on success, negative value on error
+
+This ioctl is used to set guest storage key values on the s390
+architecture. The ioctl takes parameters via the kvm_s390_skeys struct.
+See section on KVM_S390_GET_SKEYS for struct definition.
+
+The start_gfn field is the number of the first guest frame whose storage keys
+you want to set.
+
+The count field is the number of consecutive frames (starting from start_gfn)
+whose storage keys to set. The count field must be at least 1 and the maximum
+allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range
+will cause the ioctl to return -EINVAL.
+
+The skeydata_addr field is the address to a buffer containing count bytes of
+storage keys. Each byte in the buffer will be set as the storage key for a
+single frame starting at start_gfn for count frames.
+
+Note: If any architecturally invalid key value is found in the given data then
+the ioctl will return -EINVAL.
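+
+A corresponding sketch for setting keys (same caveats as the
+KVM_S390_GET_SKEYS example above):
+
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int set_skeys(int vm_fd, uint64_t start_gfn, uint8_t *keys,
+                    uint64_t count)
+{
+       struct kvm_s390_skeys args;
+
+       memset(&args, 0, sizeof(args));
+       args.start_gfn = start_gfn;
+       args.count = count;     /* one key byte per frame */
+       args.skeydata_addr = (uint64_t)(uintptr_t)keys;
+       return ioctl(vm_fd, KVM_S390_SET_SKEYS, &args);
+}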
+
+4.92 KVM_S390_IRQ
+
+Capability: KVM_CAP_S390_INJECT_IRQ
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_irq (in)
+Returns: 0 on success, -1 on error
+Errors:
+  EINVAL: interrupt type is invalid
+          type is KVM_S390_SIGP_STOP and flag parameter is invalid value
+          type is KVM_S390_INT_EXTERNAL_CALL and code is larger than the
+            maximum number of VCPUs
+  EBUSY:  type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped
+          type is KVM_S390_SIGP_STOP and a stop irq is already pending
+          type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt
+            is already pending
+
+Allows injecting an interrupt into the guest.
+
+Using struct kvm_s390_irq as a parameter allows injecting an additional
+payload that is not possible via KVM_S390_INTERRUPT.
+
+Interrupt parameters are passed via kvm_s390_irq:
+
+struct kvm_s390_irq {
+       __u64 type;
+       union {
+               struct kvm_s390_io_info io;
+               struct kvm_s390_ext_info ext;
+               struct kvm_s390_pgm_info pgm;
+               struct kvm_s390_emerg_info emerg;
+               struct kvm_s390_extcall_info extcall;
+               struct kvm_s390_prefix_info prefix;
+               struct kvm_s390_stop_info stop;
+               struct kvm_s390_mchk_info mchk;
+               char reserved[64];
+       } u;
+};
+
+type can be one of the following:
+
+KVM_S390_SIGP_STOP - sigp stop; parameter in .stop
+KVM_S390_PROGRAM_INT - program check; parameters in .pgm
+KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix
+KVM_S390_RESTART - restart; no parameters
+KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters
+KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters
+KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg
+KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall
+KVM_S390_MCHK - machine check interrupt; parameters in .mchk
+
+Note that the vcpu ioctl is asynchronous to vcpu execution.
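+
+For example, injecting a restart interrupt (which carries no payload) could
+be sketched as follows (error handling omitted; "vcpu_fd" is assumed to be
+an open vcpu file descriptor):
+
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int inject_restart(int vcpu_fd)
+{
+       struct kvm_s390_irq irq;
+
+       memset(&irq, 0, sizeof(irq));
+       irq.type = KVM_S390_RESTART;    /* no parameters in the union */
+       return ioctl(vcpu_fd, KVM_S390_IRQ, &irq);
+}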
+
+4.94 KVM_S390_GET_IRQ_STATE
+
+Capability: KVM_CAP_S390_IRQ_STATE
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_irq_state (out)
+Returns: >= number of bytes copied into buffer,
+         -EINVAL if buffer size is 0,
+         -ENOBUFS if buffer size is too small to fit all pending interrupts,
+         -EFAULT if the buffer address was invalid
+
+This ioctl allows userspace to retrieve the complete state of all currently
+pending interrupts in a single buffer. Use cases include migration
+and introspection. The parameter structure contains the address of a
+userspace buffer and its length:
+
+struct kvm_s390_irq_state {
+       __u64 buf;
+       __u32 flags;
+       __u32 len;
+       __u32 reserved[4];
+};
+
+Userspace passes in the above struct and for each pending interrupt a
+struct kvm_s390_irq is copied to the provided buffer.
+
+If -ENOBUFS is returned, the buffer provided was too small and userspace
+may retry with a bigger buffer.
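+
+A retrieval sketch that grows the buffer on -ENOBUFS (illustrative only;
+"vcpu_fd" is assumed to be an open vcpu file descriptor, and *bufp must be
+NULL or heap-allocated on entry):
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+/* Returns the number of bytes copied into *bufp, or a negative value. */
+static int get_irq_state(int vcpu_fd, void **bufp)
+{
+       struct kvm_s390_irq_state state;
+       uint32_t len = 16 * sizeof(struct kvm_s390_irq);
+       int rc;
+
+       for (;;) {
+               void *buf = realloc(*bufp, len);
+
+               if (!buf)
+                       return -ENOMEM;
+               *bufp = buf;
+
+               memset(&state, 0, sizeof(state));
+               state.buf = (uint64_t)(uintptr_t)buf;
+               state.len = len;
+               rc = ioctl(vcpu_fd, KVM_S390_GET_IRQ_STATE, &state);
+               if (rc >= 0 || errno != ENOBUFS)
+                       return rc < 0 ? -errno : rc;
+               len *= 2;       /* buffer too small, retry with more room */
+       }
+}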
+
+4.95 KVM_S390_SET_IRQ_STATE
+
+Capability: KVM_CAP_S390_IRQ_STATE
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_irq_state (in)
+Returns: 0 on success,
+         -EFAULT if the buffer address was invalid,
+         -EINVAL for an invalid buffer length (see below),
+         -EBUSY if there were already interrupts pending,
+         errors occurring when actually injecting the
+          interrupt. See KVM_S390_IRQ.
+
+This ioctl allows userspace to set the complete state of all cpu-local
+interrupts currently pending for the vcpu. It is intended for restoring
+interrupt state after a migration. The input parameter is a userspace buffer
+containing a struct kvm_s390_irq_state:
+
+struct kvm_s390_irq_state {
+       __u64 buf;
+       __u32 len;
+       __u32 pad;
+};
+
+The userspace memory referenced by buf contains a struct kvm_s390_irq
+for each interrupt to be injected into the guest.
+If one of the interrupts cannot be injected for some reason,
+the ioctl aborts.
+
+len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0
+and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
+which corresponds to the maximum number of possibly pending cpu-local
+interrupts.
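+
+A restore sketch (illustrative only; "buf" and "len" are assumed to hold the
+data previously saved via KVM_S390_GET_IRQ_STATE):
+
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int set_irq_state(int vcpu_fd, void *buf, uint32_t len)
+{
+       struct kvm_s390_irq_state state;
+
+       memset(&state, 0, sizeof(state));
+       state.buf = (uint64_t)(uintptr_t)buf;
+       state.len = len;        /* multiple of sizeof(struct kvm_s390_irq) */
+       return ioctl(vcpu_fd, KVM_S390_SET_IRQ_STATE, &state);
+}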
+
 5. The kvm_run structure
 ------------------------
 
@@ -3189,6 +3451,31 @@ Parameters: none
 This capability enables the in-kernel irqchip for s390. Please refer to
 "4.24 KVM_CREATE_IRQCHIP" for details.
 
+6.9 KVM_CAP_MIPS_FPU
+
+Architectures: mips
+Target: vcpu
+Parameters: args[0] is reserved for future use (should be 0).
+
+This capability allows the use of the host Floating Point Unit by the guest. It
+allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is
+done, the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed
+(depending on the current guest FPU register mode), and the Status.FR and
+Config5.FRE bits are accessible via the KVM API and also from the guest,
+depending on whether they are supported by the FPU.
+
+6.10 KVM_CAP_MIPS_MSA
+
+Architectures: mips
+Target: vcpu
+Parameters: args[0] is reserved for future use (should be 0).
+
+This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest.
+It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest.
+Once this is done, the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can
+be accessed, and the Config5.MSAEn bit is accessible via the KVM API and also
+from the guest.
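+
+Both capabilities are enabled per vcpu with KVM_ENABLE_CAP; a sketch for the
+FPU case (error handling omitted; "vcpu_fd" is assumed to be an open vcpu
+file descriptor):
+
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+
+static int enable_guest_fpu(int vcpu_fd)
+{
+       struct kvm_enable_cap cap;
+
+       memset(&cap, 0, sizeof(cap));
+       cap.cap = KVM_CAP_MIPS_FPU;     /* args[0] stays 0 (reserved) */
+       return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
+}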
+
 7. Capabilities that can be enabled on VMs
 ------------------------------------------
 
@@ -3248,3 +3535,41 @@ All other orders will be handled completely in user space.
 Only privileged operation exceptions will be checked for in the kernel (or even
 in the hardware prior to interception). If this capability is not enabled, the
 old way of handling SIGP orders is used (partially in kernel and user space).
+
+7.3 KVM_CAP_S390_VECTOR_REGISTERS
+
+Architectures: s390
+Parameters: none
+Returns: 0 on success, negative value on error
+
+Allows use of the vector registers introduced with the z13 processor, and
+provides for their synchronization between host and user space. Returns
+-EINVAL if the machine does not support vectors.
+
+7.4 KVM_CAP_S390_USER_STSI
+
+Architectures: s390
+Parameters: none
+
+This capability allows post-handlers for the STSI instruction. After
+initial handling in the kernel, KVM exits to user space with
+KVM_EXIT_S390_STSI to allow user space to insert further data.
+
+Before exiting to userspace, KVM handlers should fill in the s390_stsi field
+of vcpu->run:
+struct {
+       __u64 addr;
+       __u8 ar;
+       __u8 reserved;
+       __u8 fc;
+       __u8 sel1;
+       __u16 sel2;
+} s390_stsi;
+
+@addr - guest address of STSI SYSIB
+@fc   - function code
+@sel1 - selector 1
+@sel2 - selector 2
+@ar   - access register number
+
+KVM handlers should exit to userspace with rc = -EREMOTE.
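+
+On the user space side the exit might be consumed roughly as follows (a
+sketch only; "run" is the mmap'ed struct kvm_run of the vcpu, and
+patch_sysib() is a hypothetical VMM helper that rewrites the SYSIB at the
+reported guest address):
+
+#include <stdint.h>
+#include <linux/kvm.h>
+
+/* Hypothetical helper provided by the VMM, not part of the KVM API. */
+void patch_sysib(uint8_t fc, uint8_t sel1, uint16_t sel2, uint64_t addr,
+                uint8_t ar);
+
+static void handle_vcpu_stsi(struct kvm_run *run)
+{
+       if (run->exit_reason != KVM_EXIT_S390_STSI)
+               return;
+       /* fc/sel1/sel2 identify which SYSIB the guest asked for */
+       patch_sysib(run->s390_stsi.fc, run->s390_stsi.sel1,
+                   run->s390_stsi.sel2, run->s390_stsi.addr,
+                   run->s390_stsi.ar);
+}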
index 4ceef53164b0289237238c3cf29d83e38e5d34de..d1ad9d5cae467ceb2c1169ce8b53d70078aedf27 100644 (file)
@@ -27,6 +27,9 @@ Groups:
     Copies all floating interrupts into a buffer provided by userspace.
     When the buffer is too small it returns -ENOMEM, which is the indication
     for userspace to try again with a bigger buffer.
+    -ENOBUFS is returned when the allocation of a kernelspace buffer fails.
+    -EFAULT is returned when copying data to userspace fails.
     All interrupts remain pending, i.e. are not deleted from the list of
     currently pending interrupts.
     attr->addr contains the userspace address of the buffer into which all
index a75e3adaa39da277fb89150fb1d31daf8d1296ef..88b85899d30953a6be096a9e045fcf54be3676b4 100644 (file)
@@ -406,6 +406,12 @@ Protocol:  2.00+
        - If 0, the protected-mode code is loaded at 0x10000.
        - If 1, the protected-mode code is loaded at 0x100000.
 
+  Bit 1 (kernel internal): KASLR_FLAG
+       - Used internally by the compressed kernel to communicate
+         KASLR status to kernel proper.
+         If 1, KASLR enabled.
+         If 0, KASLR disabled.
+
   Bit 5 (write): QUIET_FLAG
        - If 0, print early messages.
        - If 1, suppress early messages.
index 1de6afa8ee51c747b592e6a10fce1cf742d2bc96..b84686826b23cca2bb9699d6fae5d9b8cae35afa 100644 (file)
@@ -637,8 +637,7 @@ F:      drivers/gpu/drm/radeon/radeon_kfd.h
 F:      include/uapi/linux/kfd_ioctl.h
 
 AMD MICROCODE UPDATE SUPPORT
-M:     Andreas Herrmann <herrmann.der.user@googlemail.com>
-L:     amd64-microcode@amd64.org
+M:     Borislav Petkov <bp@alien8.de>
 S:     Maintained
 F:     arch/x86/kernel/cpu/microcode/amd*
 
@@ -5095,7 +5094,7 @@ S:        Supported
 F:     drivers/platform/x86/intel_menlow.c
 
 INTEL IA32 MICROCODE UPDATE SUPPORT
-M:     Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+M:     Borislav Petkov <bp@alien8.de>
 S:     Maintained
 F:     arch/x86/kernel/cpu/microcode/core*
 F:     arch/x86/kernel/cpu/microcode/intel*
@@ -5136,22 +5135,21 @@ M:      Deepak Saxena <dsaxena@plexity.net>
 S:     Maintained
 F:     drivers/char/hw_random/ixp4xx-rng.c
 
-INTEL ETHERNET DRIVERS (e100/e1000/e1000e/fm10k/igb/igbvf/ixgb/ixgbe/ixgbevf/i40e/i40evf)
+INTEL ETHERNET DRIVERS
 M:     Jeff Kirsher <jeffrey.t.kirsher@intel.com>
-M:     Jesse Brandeburg <jesse.brandeburg@intel.com>
-M:     Bruce Allan <bruce.w.allan@intel.com>
-M:     Carolyn Wyborny <carolyn.wyborny@intel.com>
-M:     Don Skidmore <donald.c.skidmore@intel.com>
-M:     Greg Rose <gregory.v.rose@intel.com>
-M:     Matthew Vick <matthew.vick@intel.com>
-M:     John Ronciak <john.ronciak@intel.com>
-M:     Mitch Williams <mitch.a.williams@intel.com>
-M:     Linux NICS <linux.nics@intel.com>
-L:     e1000-devel@lists.sourceforge.net
+R:     Jesse Brandeburg <jesse.brandeburg@intel.com>
+R:     Shannon Nelson <shannon.nelson@intel.com>
+R:     Carolyn Wyborny <carolyn.wyborny@intel.com>
+R:     Don Skidmore <donald.c.skidmore@intel.com>
+R:     Matthew Vick <matthew.vick@intel.com>
+R:     John Ronciak <john.ronciak@intel.com>
+R:     Mitch Williams <mitch.a.williams@intel.com>
+L:     intel-wired-lan@lists.osuosl.org
 W:     http://www.intel.com/support/feedback.htm
 W:     http://e1000.sourceforge.net/
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net.git
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next.git
+Q:     http://patchwork.ozlabs.org/project/intel-wired-lan/list/
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git
 S:     Supported
 F:     Documentation/networking/e100.txt
 F:     Documentation/networking/e1000.txt
@@ -5593,6 +5591,8 @@ S:        Supported
 F:     Documentation/*/kvm*.txt
 F:     Documentation/virtual/kvm/
 F:     arch/*/kvm/
+F:     arch/x86/kernel/kvm.c
+F:     arch/x86/kernel/kvmclock.c
 F:     arch/*/include/asm/kvm*
 F:     include/linux/kvm*
 F:     include/uapi/linux/kvm*
@@ -8561,6 +8561,7 @@ F:        include/uapi/linux/timex.h
 F:     kernel/time/clocksource.c
 F:     kernel/time/time*.c
 F:     kernel/time/ntp.c
+F:     tools/testing/selftests/timers/
 
 SC1200 WDT DRIVER
 M:     Zwane Mwaikambo <zwanem@gmail.com>
index da36a3be7969049870d969ae4a623fd7a0c15ee1..9b76ce1e08bbb80d15e3f2ee4859157323a23ee0 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 0
 SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION =
 NAME = Hurr durr I'ma sheep
 
 # *DOCUMENTATION*
@@ -779,6 +779,7 @@ KBUILD_ARFLAGS := $(call ar-option,D)
 # check for 'asm goto'
 ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
        KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+       KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
 include $(srctree)/scripts/Makefile.kasan
index c8d284d8521fc715a01122787294dd4fa2dbcbf5..f535a3fd0f60cc9651e89b83cb821b510374d087 100644 (file)
@@ -116,7 +116,7 @@ alpha_rtc_set_time(struct device *dev, struct rtc_time *tm)
 }
 
 static int
-alpha_rtc_set_mmss(struct device *dev, unsigned long nowtime)
+alpha_rtc_set_mmss(struct device *dev, time64_t nowtime)
 {
        int retval = 0;
        int real_seconds, real_minutes, cmos_minutes;
@@ -211,7 +211,7 @@ alpha_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 static const struct rtc_class_ops alpha_rtc_ops = {
        .read_time = alpha_rtc_read_time,
        .set_time = alpha_rtc_set_time,
-       .set_mmss = alpha_rtc_set_mmss,
+       .set_mmss64 = alpha_rtc_set_mmss,
        .ioctl = alpha_rtc_ioctl,
 };
 
@@ -276,7 +276,7 @@ do_remote_mmss(void *data)
 }
 
 static int
-remote_set_mmss(struct device *dev, unsigned long now)
+remote_set_mmss(struct device *dev, time64_t now)
 {
        union remote_data x;
        if (smp_processor_id() != boot_cpuid) {
@@ -290,7 +290,7 @@ remote_set_mmss(struct device *dev, unsigned long now)
 static const struct rtc_class_ops remote_rtc_ops = {
        .read_time = remote_read_time,
        .set_time = remote_set_time,
-       .set_mmss = remote_set_mmss,
+       .set_mmss64 = remote_set_mmss,
        .ioctl = alpha_rtc_ioctl,
 };
 #endif
index 6eaddc47c43dfbd60f52b6d49c425acbd55cab62..37dc0fe1093fb24bb26b1c852c568bd8fcd3f6d2 100644 (file)
@@ -151,8 +151,6 @@ static int bL_switch_to(unsigned int new_cluster_id)
        unsigned int mpidr, this_cpu, that_cpu;
        unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
        struct completion inbound_alive;
-       struct tick_device *tdev;
-       enum clock_event_mode tdev_mode;
        long volatile *handshake_ptr;
        int ipi_nr, ret;
 
@@ -219,13 +217,7 @@ static int bL_switch_to(unsigned int new_cluster_id)
        /* redirect GIC's SGIs to our counterpart */
        gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
 
-       tdev = tick_get_device(this_cpu);
-       if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
-               tdev = NULL;
-       if (tdev) {
-               tdev_mode = tdev->evtdev->mode;
-               clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
-       }
+       tick_suspend_local();
 
        ret = cpu_pm_enter();
 
@@ -251,11 +243,7 @@ static int bL_switch_to(unsigned int new_cluster_id)
 
        ret = cpu_pm_exit();
 
-       if (tdev) {
-               clockevents_set_mode(tdev->evtdev, tdev_mode);
-               clockevents_program_event(tdev->evtdev,
-                                         tdev->evtdev->next_event, 1);
-       }
+       tick_resume_local();
 
        trace_cpu_migrate_finish(ktime_get_real_ns(), ib_mpidr);
        local_fiq_enable();
index 70f9b9bfb1f9646a1bdfe3ab597c5b9c26e80b7e..5f337dc5c1087f49ebf8eae9fde6051d7c9d45be 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _ASM_ARM_JUMP_LABEL_H
 #define _ASM_ARM_JUMP_LABEL_H
 
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
 #include <linux/types.h>
 
@@ -27,8 +27,6 @@ l_yes:
        return true;
 }
 
-#endif /* __KERNEL__ */
-
 typedef u32 jump_label_t;
 
 struct jump_entry {
@@ -37,4 +35,5 @@ struct jump_entry {
        jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
index 816db0bf2dd8addbd9844488b5a72d4495be72c7..d995821f1698c67bc3e57e2073af9cdc3353fab4 100644 (file)
 #define HSR_COND       (0xfU << HSR_COND_SHIFT)
 
 #define FSC_FAULT      (0x04)
+#define FSC_ACCESS     (0x08)
 #define FSC_PERM       (0x0c)
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
index 41008cd7c53f6b39d1476d5a46dc610c578e8d5c..d71607c16601b6b1e1a595e32562195ccd63f5b1 100644 (file)
@@ -27,6 +27,8 @@
 #include <asm/fpstate.h>
 #include <kvm/arm_arch_timer.h>
 
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
 #if defined(CONFIG_KVM_ARM_MAX_VCPUS)
 #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
 #else
@@ -165,19 +167,10 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 /* We do not have shadow page tables, hence the empty hooks */
-static inline int kvm_age_hva(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
-{
-       return 0;
-}
-
-static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       return 0;
-}
-
 static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
                                                         unsigned long address)
 {
index 3f83db2f6cf053cb083c15b5fa910b5d7ae7ee61..d8e90c8cb5fa0ab4c0486cb6d9fd53d576456291 100644 (file)
@@ -28,28 +28,6 @@ struct kvm_decode {
        bool sign_extend;
 };
 
-/*
- * The in-kernel MMIO emulation code wants to use a copy of run->mmio,
- * which is an anonymous type. Use our own type instead.
- */
-struct kvm_exit_mmio {
-       phys_addr_t     phys_addr;
-       u8              data[8];
-       u32             len;
-       bool            is_write;
-       void            *private;
-};
-
-static inline void kvm_prepare_mmio(struct kvm_run *run,
-                                   struct kvm_exit_mmio *mmio)
-{
-       run->mmio.phys_addr     = mmio->phys_addr;
-       run->mmio.len           = mmio->len;
-       run->mmio.is_write      = mmio->is_write;
-       memcpy(run->mmio.data, mmio->data, mmio->len);
-       run->exit_reason        = KVM_EXIT_MMIO;
-}
-
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa);
index 90c12e1e695c97682c229174a975bb6fdb97403e..0f79e4dec7f98ccddb8429a1e6b262ca146c0d38 100644 (file)
@@ -12,8 +12,7 @@
 
 extern void timer_tick(void);
 
-struct timespec;
-typedef void (*clock_access_fn)(struct timespec *);
+typedef void (*clock_access_fn)(struct timespec64 *);
 extern int register_persistent_clock(clock_access_fn read_boot,
                                     clock_access_fn read_persistent);
 
index 0db25bc328643de55ded82f3b9583a748038d97d..2499867dd0d86d79477e85ac0933228b9bbcf6a2 100644 (file)
@@ -198,6 +198,9 @@ struct kvm_arch_memory_slot {
 /* Highest supported SPI, from VGIC_NR_IRQS */
 #define KVM_ARM_IRQ_GIC_MAX            127
 
+/* One single KVM irqchip, ie. the VGIC */
+#define KVM_NR_IRQCHIPS          1
+
 /* PSCI interface */
 #define KVM_PSCI_FN_BASE               0x95c1ba5e
 #define KVM_PSCI_FN(n)                 (KVM_PSCI_FN_BASE + (n))
index 2d2d6087b9b105d5dadcd66f9821deefe50d1e66..488eaac56028f59ed08fc4030de25dd7d6a59ccb 100644 (file)
@@ -190,7 +190,6 @@ int main(void)
   DEFINE(VCPU_HxFAR,           offsetof(struct kvm_vcpu, arch.fault.hxfar));
   DEFINE(VCPU_HPFAR,           offsetof(struct kvm_vcpu, arch.fault.hpfar));
   DEFINE(VCPU_HYP_PC,          offsetof(struct kvm_vcpu, arch.fault.hyp_pc));
-#ifdef CONFIG_KVM_ARM_VGIC
   DEFINE(VCPU_VGIC_CPU,                offsetof(struct kvm_vcpu, arch.vgic_cpu));
   DEFINE(VGIC_V2_CPU_HCR,      offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
   DEFINE(VGIC_V2_CPU_VMCR,     offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
@@ -200,14 +199,11 @@ int main(void)
   DEFINE(VGIC_V2_CPU_APR,      offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
   DEFINE(VGIC_V2_CPU_LR,       offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
   DEFINE(VGIC_CPU_NR_LR,       offsetof(struct vgic_cpu, nr_lr));
-#ifdef CONFIG_KVM_ARM_TIMER
   DEFINE(VCPU_TIMER_CNTV_CTL,  offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
   DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
   DEFINE(KVM_TIMER_CNTVOFF,    offsetof(struct kvm, arch.timer.cntvoff));
   DEFINE(KVM_TIMER_ENABLED,    offsetof(struct kvm, arch.timer.enabled));
-#endif
   DEFINE(KVM_VGIC_VCTRL,       offsetof(struct kvm, arch.vgic.vctrl_base));
-#endif
   DEFINE(KVM_VTTBR,            offsetof(struct kvm, arch.vttbr));
 #endif
   return 0; 
index 0cc7e58c47cc79fd53ab4feac1b9440300e7a562..a66e37e211a9a8cbfabf85d9213f6b73db417085 100644 (file)
@@ -76,7 +76,7 @@ void timer_tick(void)
 }
 #endif
 
-static void dummy_clock_access(struct timespec *ts)
+static void dummy_clock_access(struct timespec64 *ts)
 {
        ts->tv_sec = 0;
        ts->tv_nsec = 0;
@@ -85,12 +85,12 @@ static void dummy_clock_access(struct timespec *ts)
 static clock_access_fn __read_persistent_clock = dummy_clock_access;
 static clock_access_fn __read_boot_clock = dummy_clock_access;;
 
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
 {
        __read_persistent_clock(ts);
 }
 
-void read_boot_clock(struct timespec *ts)
+void read_boot_clock64(struct timespec64 *ts)
 {
        __read_boot_clock(ts);
 }
index 338ace78ed18611bcb4aea64baeb490fcf2bca05..f1f79d1043096093a780e46e4a68cf3796a153ac 100644 (file)
@@ -18,6 +18,7 @@ if VIRTUALIZATION
 
 config KVM
        bool "Kernel-based Virtual Machine (KVM) support"
+       depends on MMU && OF
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select HAVE_KVM_CPU_RELAX_INTERCEPT
@@ -26,10 +27,12 @@ config KVM
        select KVM_ARM_HOST
        select KVM_GENERIC_DIRTYLOG_READ_PROTECT
        select SRCU
-       depends on ARM_VIRT_EXT && ARM_LPAE
+       select MMU_NOTIFIER
+       select HAVE_KVM_EVENTFD
+       select HAVE_KVM_IRQFD
+       depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER
        ---help---
-         Support hosting virtualized guest machines. You will also
-         need to select one or more of the processor modules below.
+         Support hosting virtualized guest machines.
 
          This module provides access to the hardware capabilities through
          a character device node named /dev/kvm.
@@ -37,10 +40,7 @@ config KVM
          If unsure, say N.
 
 config KVM_ARM_HOST
-       bool "KVM host support for ARM cpus."
-       depends on KVM
-       depends on MMU
-       select  MMU_NOTIFIER
+       bool
        ---help---
          Provides host support for ARM processors.
 
@@ -55,20 +55,4 @@ config KVM_ARM_MAX_VCPUS
          large, so only choose a reasonable number that you expect to
          actually use.
 
-config KVM_ARM_VGIC
-       bool "KVM support for Virtual GIC"
-       depends on KVM_ARM_HOST && OF
-       select HAVE_KVM_IRQCHIP
-       default y
-       ---help---
-         Adds support for a hardware assisted, in-kernel GIC emulation.
-
-config KVM_ARM_TIMER
-       bool "KVM support for Architected Timers"
-       depends on KVM_ARM_VGIC && ARM_ARCH_TIMER
-       select HAVE_KVM_IRQCHIP
-       default y
-       ---help---
-         Adds support for the Architected Timers in virtual machines
-
 endif # VIRTUALIZATION
index 443b8bea43e93e862653f5f153d3ce7759da5528..139e46c08b6ec5daff4a3692a569027cf23c0347 100644 (file)
@@ -7,7 +7,7 @@ ifeq ($(plus_virt),+virt)
        plus_virt_def := -DREQUIRES_VIRT=1
 endif
 
-ccflags-y += -Ivirt/kvm -Iarch/arm/kvm
+ccflags-y += -Iarch/arm/kvm
 CFLAGS_arm.o := -I. $(plus_virt_def)
 CFLAGS_mmu.o := -I.
 
@@ -15,12 +15,12 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
 AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
 
 KVM := ../../../virt/kvm
-kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
-obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
-obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
-obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
-obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
+obj-y += $(KVM)/arm/vgic.o
+obj-y += $(KVM)/arm/vgic-v2.o
+obj-y += $(KVM)/arm/vgic-v2-emul.o
+obj-y += $(KVM)/arm/arch_timer.o
index 5560f74f9eeef1e3e4d2c9c39fc672e539eee93f..6f536451ab784e99966a308c0892e8614214591f 100644 (file)
@@ -61,8 +61,6 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
 static u8 kvm_next_vmid;
 static DEFINE_SPINLOCK(kvm_vmid_lock);
 
-static bool vgic_present;
-
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
        BUG_ON(preemptible());
@@ -173,8 +171,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        int r;
        switch (ext) {
        case KVM_CAP_IRQCHIP:
-               r = vgic_present;
-               break;
+       case KVM_CAP_IRQFD:
+       case KVM_CAP_IOEVENTFD:
        case KVM_CAP_DEVICE_CTRL:
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_SYNC_MMU:
@@ -183,6 +181,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ARM_PSCI:
        case KVM_CAP_ARM_PSCI_0_2:
        case KVM_CAP_READONLY_MEM:
+       case KVM_CAP_MP_STATE:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -268,7 +267,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       return 0;
+       return kvm_timer_should_fire(vcpu);
 }
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
@@ -313,13 +312,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       return -EINVAL;
+       if (vcpu->arch.pause)
+               mp_state->mp_state = KVM_MP_STATE_STOPPED;
+       else
+               mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+
+       return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       return -EINVAL;
+       switch (mp_state->mp_state) {
+       case KVM_MP_STATE_RUNNABLE:
+               vcpu->arch.pause = false;
+               break;
+       case KVM_MP_STATE_STOPPED:
+               vcpu->arch.pause = true;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
 }
 
 /**
@@ -452,6 +467,11 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+       return vgic_initialized(kvm);
+}
+
 static void vcpu_pause(struct kvm_vcpu *vcpu)
 {
        wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
@@ -831,8 +851,6 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 
        switch (dev_id) {
        case KVM_ARM_DEVICE_VGIC_V2:
-               if (!vgic_present)
-                       return -ENXIO;
                return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
        default:
                return -ENODEV;
@@ -847,10 +865,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
        switch (ioctl) {
        case KVM_CREATE_IRQCHIP: {
-               if (vgic_present)
-                       return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-               else
-                       return -ENXIO;
+               return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
        }
        case KVM_ARM_SET_DEVICE_ADDR: {
                struct kvm_arm_device_addr dev_addr;
@@ -1035,10 +1050,6 @@ static int init_hyp_mode(void)
        if (err)
                goto out_free_context;
 
-#ifdef CONFIG_KVM_ARM_VGIC
-               vgic_present = true;
-#endif
-
        /*
         * Init HYP architected timer support
         */
index 384bab67c4629a9bece251d5577c9bf908f8348f..d503fbb787d362752b9b6b688b2829e19b675095 100644 (file)
@@ -109,22 +109,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return -EINVAL;
 }
 
-#ifndef CONFIG_KVM_ARM_TIMER
-
-#define NUM_TIMER_REGS 0
-
-static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
-{
-       return 0;
-}
-
-static bool is_timer_reg(u64 index)
-{
-       return false;
-}
-
-#else
-
 #define NUM_TIMER_REGS 3
 
 static bool is_timer_reg(u64 index)
@@ -152,8 +136,6 @@ static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
        return 0;
 }
 
-#endif
-
 static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 {
        void __user *uaddr = (void __user *)(long)reg->addr;
index 14d488388480ea50a80d24b18bbed9636c8d1c25..35e4a3a0c476cd9730afa52192bea5b3ff6f20fd 100644 (file)
@@ -402,7 +402,6 @@ vcpu        .req    r0              @ vcpu pointer always in r0
  * Assumes vcpu pointer in vcpu reg
  */
 .macro save_vgic_state
-#ifdef CONFIG_KVM_ARM_VGIC
        /* Get VGIC VCTRL base into r2 */
        ldr     r2, [vcpu, #VCPU_KVM]
        ldr     r2, [r2, #KVM_VGIC_VCTRL]
@@ -460,7 +459,6 @@ ARM_BE8(rev r6, r6  )
        subs    r4, r4, #1
        bne     1b
 2:
-#endif
 .endm
 
 /*
@@ -469,7 +467,6 @@ ARM_BE8(rev r6, r6  )
  * Assumes vcpu pointer in vcpu reg
  */
 .macro restore_vgic_state
-#ifdef CONFIG_KVM_ARM_VGIC
        /* Get VGIC VCTRL base into r2 */
        ldr     r2, [vcpu, #VCPU_KVM]
        ldr     r2, [r2, #KVM_VGIC_VCTRL]
@@ -501,7 +498,6 @@ ARM_BE8(rev r6, r6  )
        subs    r4, r4, #1
        bne     1b
 2:
-#endif
 .endm
 
 #define CNTHCTL_PL1PCTEN       (1 << 0)
@@ -515,7 +511,6 @@ ARM_BE8(rev r6, r6  )
  * Clobbers r2-r5
  */
 .macro save_timer_state
-#ifdef CONFIG_KVM_ARM_TIMER
        ldr     r4, [vcpu, #VCPU_KVM]
        ldr     r2, [r4, #KVM_TIMER_ENABLED]
        cmp     r2, #0
@@ -537,7 +532,6 @@ ARM_BE8(rev r6, r6  )
        mcrr    p15, 4, r2, r2, c14     @ CNTVOFF
 
 1:
-#endif
        @ Allow physical timer/counter access for the host
        mrc     p15, 4, r2, c14, c1, 0  @ CNTHCTL
        orr     r2, r2, #(CNTHCTL_PL1PCEN | CNTHCTL_PL1PCTEN)
@@ -559,7 +553,6 @@ ARM_BE8(rev r6, r6  )
        bic     r2, r2, #CNTHCTL_PL1PCEN
        mcr     p15, 4, r2, c14, c1, 0  @ CNTHCTL
 
-#ifdef CONFIG_KVM_ARM_TIMER
        ldr     r4, [vcpu, #VCPU_KVM]
        ldr     r2, [r4, #KVM_TIMER_ENABLED]
        cmp     r2, #0
@@ -579,7 +572,6 @@ ARM_BE8(rev r6, r6  )
        and     r2, r2, #3
        mcr     p15, 0, r2, c14, c3, 1  @ CNTV_CTL
 1:
-#endif
 .endm
 
 .equ vmentry,  0
index 5d3bfc0eb3f000cb41cb217eb7fdc2611da85fc9..974b1c606d044c239bfa14ffbdf66f0fc982c4fb 100644 (file)
@@ -121,12 +121,11 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 0;
 }
 
-static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-                     struct kvm_exit_mmio *mmio)
+static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
 {
        unsigned long rt;
-       int len;
-       bool is_write, sign_extend;
+       int access_size;
+       bool sign_extend;
 
        if (kvm_vcpu_dabt_isextabt(vcpu)) {
                /* cache operation on I/O addr, tell guest unsupported */
@@ -140,17 +139,15 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                return 1;
        }
 
-       len = kvm_vcpu_dabt_get_as(vcpu);
-       if (unlikely(len < 0))
-               return len;
+       access_size = kvm_vcpu_dabt_get_as(vcpu);
+       if (unlikely(access_size < 0))
+               return access_size;
 
-       is_write = kvm_vcpu_dabt_iswrite(vcpu);
+       *is_write = kvm_vcpu_dabt_iswrite(vcpu);
        sign_extend = kvm_vcpu_dabt_issext(vcpu);
        rt = kvm_vcpu_dabt_get_rd(vcpu);
 
-       mmio->is_write = is_write;
-       mmio->phys_addr = fault_ipa;
-       mmio->len = len;
+       *len = access_size;
        vcpu->arch.mmio_decode.sign_extend = sign_extend;
        vcpu->arch.mmio_decode.rt = rt;
 
@@ -165,20 +162,20 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa)
 {
-       struct kvm_exit_mmio mmio;
        unsigned long data;
        unsigned long rt;
        int ret;
+       bool is_write;
+       int len;
+       u8 data_buf[8];
 
        /*
-        * Prepare MMIO operation. First stash it in a private
-        * structure that we can use for in-kernel emulation. If the
-        * kernel can't handle it, copy it into run->mmio and let user
-        * space do its magic.
+        * Prepare MMIO operation. First decode the syndrome data we get
+        * from the CPU. Then try if some in-kernel emulation feels
+        * responsible, otherwise let user space do its magic.
         */
-
        if (kvm_vcpu_dabt_isvalid(vcpu)) {
-               ret = decode_hsr(vcpu, fault_ipa, &mmio);
+               ret = decode_hsr(vcpu, &is_write, &len);
                if (ret)
                        return ret;
        } else {
@@ -188,21 +185,34 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 
        rt = vcpu->arch.mmio_decode.rt;
 
-       if (mmio.is_write) {
-               data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt),
-                                              mmio.len);
+       if (is_write) {
+               data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), len);
+
+               trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
+               mmio_write_buf(data_buf, len, data);
 
-               trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, mmio.len,
-                              fault_ipa, data);
-               mmio_write_buf(mmio.data, mmio.len, data);
+               ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+                                      data_buf);
        } else {
-               trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, mmio.len,
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
                               fault_ipa, 0);
+
+               ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+                                     data_buf);
        }
 
-       if (vgic_handle_mmio(vcpu, run, &mmio))
+       /* Now prepare kvm_run for the potential return to userland. */
+       run->mmio.is_write      = is_write;
+       run->mmio.phys_addr     = fault_ipa;
+       run->mmio.len           = len;
+       memcpy(run->mmio.data, data_buf, len);
+
+       if (!ret) {
+               /* We handled the access successfully in the kernel. */
+               kvm_handle_mmio_return(vcpu, run);
                return 1;
+       }
 
-       kvm_prepare_mmio(run, &mmio);
+       run->exit_reason        = KVM_EXIT_MMIO;
        return 0;
 }
index 5656d79c5a44f4d2ca816e15b647abf29a114e0b..15b050d46fc968afdc53029ada4b7d945ee23515 100644 (file)
@@ -1330,10 +1330,51 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 out_unlock:
        spin_unlock(&kvm->mmu_lock);
+       kvm_set_pfn_accessed(pfn);
        kvm_release_pfn_clean(pfn);
        return ret;
 }
 
+/*
+ * Resolve the access fault by making the page young again.
+ * Note that because the faulting entry is guaranteed not to be
+ * cached in the TLB, we don't need to invalidate anything.
+ */
+static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
+{
+       pmd_t *pmd;
+       pte_t *pte;
+       pfn_t pfn;
+       bool pfn_valid = false;
+
+       trace_kvm_access_fault(fault_ipa);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+
+       pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
+       if (!pmd || pmd_none(*pmd))     /* Nothing there */
+               goto out;
+
+       if (kvm_pmd_huge(*pmd)) {       /* THP, HugeTLB */
+               *pmd = pmd_mkyoung(*pmd);
+               pfn = pmd_pfn(*pmd);
+               pfn_valid = true;
+               goto out;
+       }
+
+       pte = pte_offset_kernel(pmd, fault_ipa);
+       if (pte_none(*pte))             /* Nothing there either */
+               goto out;
+
+       *pte = pte_mkyoung(*pte);       /* Just a page... */
+       pfn = pte_pfn(*pte);
+       pfn_valid = true;
+out:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (pfn_valid)
+               kvm_set_pfn_accessed(pfn);
+}
+
 /**
  * kvm_handle_guest_abort - handles all 2nd stage aborts
  * @vcpu:      the VCPU pointer
@@ -1364,7 +1405,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
        /* Check the stage-2 fault is trans. fault or write fault */
        fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
-       if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
+       if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
+           fault_status != FSC_ACCESS) {
                kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
                        kvm_vcpu_trap_get_class(vcpu),
                        (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1400,6 +1442,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
        /* Userspace should not be able to register out-of-bounds IPAs */
        VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
 
+       if (fault_status == FSC_ACCESS) {
+               handle_access_fault(vcpu, fault_ipa);
+               ret = 1;
+               goto out_unlock;
+       }
+
        ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
        if (ret == 0)
                ret = 1;
@@ -1408,15 +1456,16 @@ out_unlock:
        return ret;
 }
 
-static void handle_hva_to_gpa(struct kvm *kvm,
-                             unsigned long start,
-                             unsigned long end,
-                             void (*handler)(struct kvm *kvm,
-                                             gpa_t gpa, void *data),
-                             void *data)
+static int handle_hva_to_gpa(struct kvm *kvm,
+                            unsigned long start,
+                            unsigned long end,
+                            int (*handler)(struct kvm *kvm,
+                                           gpa_t gpa, void *data),
+                            void *data)
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
+       int ret = 0;
 
        slots = kvm_memslots(kvm);
 
@@ -1440,14 +1489,17 @@ static void handle_hva_to_gpa(struct kvm *kvm,
 
                for (; gfn < gfn_end; ++gfn) {
                        gpa_t gpa = gfn << PAGE_SHIFT;
-                       handler(kvm, gpa, data);
+                       ret |= handler(kvm, gpa, data);
                }
        }
+
+       return ret;
 }
 
-static void kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
+static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
        unmap_stage2_range(kvm, gpa, PAGE_SIZE);
+       return 0;
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1473,7 +1525,7 @@ int kvm_unmap_hva_range(struct kvm *kvm,
        return 0;
 }
 
-static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
+static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
        pte_t *pte = (pte_t *)data;
 
@@ -1485,6 +1537,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
         * through this calling path.
         */
        stage2_set_pte(kvm, NULL, gpa, pte, 0);
+       return 0;
 }
 
 
@@ -1501,6 +1554,67 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
        handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
 }
 
+static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
+{
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pmd = stage2_get_pmd(kvm, NULL, gpa);
+       if (!pmd || pmd_none(*pmd))     /* Nothing there */
+               return 0;
+
+       if (kvm_pmd_huge(*pmd)) {       /* THP, HugeTLB */
+               if (pmd_young(*pmd)) {
+                       *pmd = pmd_mkold(*pmd);
+                       return 1;
+               }
+
+               return 0;
+       }
+
+       pte = pte_offset_kernel(pmd, gpa);
+       if (pte_none(*pte))
+               return 0;
+
+       if (pte_young(*pte)) {
+               *pte = pte_mkold(*pte); /* Just a page... */
+               return 1;
+       }
+
+       return 0;
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
+{
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pmd = stage2_get_pmd(kvm, NULL, gpa);
+       if (!pmd || pmd_none(*pmd))     /* Nothing there */
+               return 0;
+
+       if (kvm_pmd_huge(*pmd))         /* THP, HugeTLB */
+               return pmd_young(*pmd);
+
+       pte = pte_offset_kernel(pmd, gpa);
+       if (!pte_none(*pte))            /* Just a page... */
+               return pte_young(*pte);
+
+       return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+       trace_kvm_age_hva(start, end);
+       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       trace_kvm_test_age_hva(hva);
+       return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
+}
+
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
        mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
index 6817664b46b80419047066686a47a8bc7953ebeb..0ec35392d2083ac4d8df391de0670d6ea6be6378 100644 (file)
@@ -68,6 +68,21 @@ TRACE_EVENT(kvm_guest_fault,
                  __entry->hxfar, __entry->vcpu_pc)
 );
 
+TRACE_EVENT(kvm_access_fault,
+       TP_PROTO(unsigned long ipa),
+       TP_ARGS(ipa),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  ipa             )
+       ),
+
+       TP_fast_assign(
+               __entry->ipa            = ipa;
+       ),
+
+       TP_printk("IPA: %lx", __entry->ipa)
+);
+
 TRACE_EVENT(kvm_irq_line,
        TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
        TP_ARGS(type, vcpu_idx, irq_num, level),
@@ -210,6 +225,39 @@ TRACE_EVENT(kvm_set_spte_hva,
        TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
 );
 
+TRACE_EVENT(kvm_age_hva,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+);
+
 TRACE_EVENT(kvm_hvc,
        TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
        TP_ARGS(vcpu_pc, r0, imm),
index 01e398a868bcbf9d0e33456e8fc70c0a2996848d..57d429830e09b4ceffa71efc3b3b88f6a88c68cd 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/cpuidle.h>
 #include <linux/cpu_pm.h>
 #include <linux/export.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 
 #include <asm/cpuidle.h>
 #include <asm/proc-fns.h>
@@ -84,7 +84,6 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
 {
        struct idle_statedata *cx = state_ptr + index;
        u32 mpuss_can_lose_context = 0;
-       int cpu_id = smp_processor_id();
 
        /*
         * CPU0 has to wait and stay ON until CPU1 is OFF state.
@@ -112,7 +111,7 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
        mpuss_can_lose_context = (cx->mpu_state == PWRDM_POWER_RET) &&
                                 (cx->mpu_logic_state == PWRDM_POWER_OFF);
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu_id);
+       tick_broadcast_enter();
 
        /*
         * Call idle CPU PM enter notifier chain so that
@@ -169,7 +168,7 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev,
        if (dev->cpu == 0 && mpuss_can_lose_context)
                cpu_cluster_pm_exit();
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu_id);
+       tick_broadcast_exit();
 
 fail:
        cpuidle_coupled_parallel_barrier(dev, &abort_barrier);
@@ -184,8 +183,7 @@ fail:
  */
 static void omap_setup_broadcast_timer(void *arg)
 {
-       int cpu = smp_processor_id();
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ON, &cpu);
+       tick_broadcast_enable();
 }
 
 static struct cpuidle_driver omap4_idle_driver = {
index f2b586d7b15dfe7cade115ecca4f785324eb15e0..155807fa6fdd0f5a3078064391ac5bb310ba3f53 100644 (file)
@@ -15,7 +15,7 @@
  */
 
 #include <asm/firmware.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/cpuidle.h>
 #include <linux/cpu_pm.h>
 #include <linux/kernel.h>
@@ -44,7 +44,7 @@ static int tegra114_idle_power_down(struct cpuidle_device *dev,
        tegra_set_cpu_in_lp2();
        cpu_pm_enter();
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+       tick_broadcast_enter();
 
        call_firmware_op(prepare_idle);
 
@@ -52,7 +52,7 @@ static int tegra114_idle_power_down(struct cpuidle_device *dev,
        if (call_firmware_op(do_idle, 0) == -ENOSYS)
                cpu_suspend(0, tegra30_sleep_cpu_secondary_finish);
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+       tick_broadcast_exit();
 
        cpu_pm_exit();
        tegra_clear_cpu_in_lp2();
index 4f25a7c7ca0fed7b74c8aab8b0f4281021f9e175..48844ae6c3a119b8493aaffef1c805cf929c7111 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 #include <linux/clk/tegra.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/cpuidle.h>
 #include <linux/cpu_pm.h>
 #include <linux/kernel.h>
@@ -136,11 +136,11 @@ static bool tegra20_cpu_cluster_power_down(struct cpuidle_device *dev,
        if (tegra20_reset_cpu_1() || !tegra_cpu_rail_off_ready())
                return false;
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+       tick_broadcast_enter();
 
        tegra_idle_lp2_last();
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+       tick_broadcast_exit();
 
        if (cpu_online(1))
                tegra20_wake_cpu1_from_reset();
@@ -153,13 +153,13 @@ static bool tegra20_idle_enter_lp2_cpu_1(struct cpuidle_device *dev,
                                         struct cpuidle_driver *drv,
                                         int index)
 {
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+       tick_broadcast_enter();
 
        cpu_suspend(0, tegra20_sleep_cpu_secondary_finish);
 
        tegra20_cpu_clear_resettable();
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+       tick_broadcast_exit();
 
        return true;
 }
index f8815ed65d9d5227b3f21ddea6c316bb9e24723a..84d809a3cba3b2f720d261ece0dc07df14ec72cf 100644 (file)
@@ -20,7 +20,7 @@
  */
 
 #include <linux/clk/tegra.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/cpuidle.h>
 #include <linux/cpu_pm.h>
 #include <linux/kernel.h>
@@ -76,11 +76,11 @@ static bool tegra30_cpu_cluster_power_down(struct cpuidle_device *dev,
                return false;
        }
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+       tick_broadcast_enter();
 
        tegra_idle_lp2_last();
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+       tick_broadcast_exit();
 
        return true;
 }
@@ -90,13 +90,13 @@ static bool tegra30_cpu_core_power_down(struct cpuidle_device *dev,
                                        struct cpuidle_driver *drv,
                                        int index)
 {
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+       tick_broadcast_enter();
 
        smp_wmb();
 
        cpu_suspend(0, tegra30_sleep_cpu_secondary_finish);
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+       tick_broadcast_exit();
 
        return true;
 }
index 61b4d705c26720eb88f2dbcf7eab72361d31d14d..2438b96004c1c36013cb8e55fe2fb4b2eb180663 100644 (file)
@@ -44,24 +44,20 @@ static u64 notrace omap_32k_read_sched_clock(void)
 }
 
 /**
- * omap_read_persistent_clock -  Return time from a persistent clock.
+ * omap_read_persistent_clock64 -  Return time from a persistent clock.
  *
  * Reads the time from a source which isn't disabled during PM, the
  * 32k sync timer.  Convert the cycles elapsed since last read into
- * nsecs and adds to a monotonically increasing timespec.
+ * nsecs and adds to a monotonically increasing timespec64.
  */
-static struct timespec persistent_ts;
+static struct timespec64 persistent_ts;
 static cycles_t cycles;
 static unsigned int persistent_mult, persistent_shift;
-static DEFINE_SPINLOCK(read_persistent_clock_lock);
 
-static void omap_read_persistent_clock(struct timespec *ts)
+static void omap_read_persistent_clock64(struct timespec64 *ts)
 {
        unsigned long long nsecs;
        cycles_t last_cycles;
-       unsigned long flags;
-
-       spin_lock_irqsave(&read_persistent_clock_lock, flags);
 
        last_cycles = cycles;
        cycles = sync32k_cnt_reg ? readl_relaxed(sync32k_cnt_reg) : 0;
@@ -69,11 +65,9 @@ static void omap_read_persistent_clock(struct timespec *ts)
        nsecs = clocksource_cyc2ns(cycles - last_cycles,
                                        persistent_mult, persistent_shift);
 
-       timespec_add_ns(&persistent_ts, nsecs);
+       timespec64_add_ns(&persistent_ts, nsecs);
 
        *ts = persistent_ts;
-
-       spin_unlock_irqrestore(&read_persistent_clock_lock, flags);
 }
 
 /**
@@ -103,7 +97,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase)
 
        /*
         * 120000 rough estimate from the calculations in
-        * __clocksource_updatefreq_scale.
+        * __clocksource_update_freq_scale.
         */
        clocks_calc_mult_shift(&persistent_mult, &persistent_shift,
                        32768, NSEC_PER_SEC, 120000);
@@ -116,7 +110,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase)
        }
 
        sched_clock_register(omap_32k_read_sched_clock, 32, 32768);
-       register_persistent_clock(NULL, omap_read_persistent_clock);
+       register_persistent_clock(NULL, omap_read_persistent_clock64);
        pr_info("OMAP clocksource: 32k_counter at 32768 Hz\n");
 
        return 0;
index 92bbae38159821cb6ce45ab3dab456c7eedced15..70522450ca2342a66b3ec27167bb94e08590f6d1 100644 (file)
@@ -90,6 +90,7 @@
 #define ESR_ELx_FSC            (0x3F)
 #define ESR_ELx_FSC_TYPE       (0x3C)
 #define ESR_ELx_FSC_EXTABT     (0x10)
+#define ESR_ELx_FSC_ACCESS     (0x08)
 #define ESR_ELx_FSC_FAULT      (0x04)
 #define ESR_ELx_FSC_PERM       (0x0C)
 #define ESR_ELx_CV             (UL(1) << 24)
index 076a1c714049ffede6cf8fb3d46c6efcf3ac7f1f..c0e5165c2f76d3c1b979bbaac11121f6e27885da 100644 (file)
  */
 #ifndef __ASM_JUMP_LABEL_H
 #define __ASM_JUMP_LABEL_H
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <asm/insn.h>
 
-#ifdef __KERNEL__
-
 #define JUMP_LABEL_NOP_SIZE            AARCH64_INSN_SIZE
 
 static __always_inline bool arch_static_branch(struct static_key *key)
@@ -39,8 +40,6 @@ l_yes:
        return true;
 }
 
-#endif /* __KERNEL__ */
-
 typedef u64 jump_label_t;
 
 struct jump_entry {
@@ -49,4 +48,5 @@ struct jump_entry {
        jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif /* __ASM_JUMP_LABEL_H */
index 54bb4ba974417e269656d50adb524654851fbbd2..ac6fafb95fe71e48048fe3831f226853f2f4914d 100644 (file)
 
 /* For compatibility with fault code shared with 32-bit */
 #define FSC_FAULT      ESR_ELx_FSC_FAULT
+#define FSC_ACCESS     ESR_ELx_FSC_ACCESS
 #define FSC_PERM       ESR_ELx_FSC_PERM
 
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
index 8ac3c70fe3c6ae7f234c5505a4a3c0e5bbd6c4ff..f0f58c9beec0e87c8c7eefa7a8ba52ba518e07c3 100644 (file)
@@ -28,6 +28,8 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmio.h>
 
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
 #if defined(CONFIG_KVM_ARM_MAX_VCPUS)
 #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
 #else
@@ -177,19 +179,10 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_unmap_hva_range(struct kvm *kvm,
                        unsigned long start, unsigned long end);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 /* We do not have shadow page tables, hence the empty hooks */
-static inline int kvm_age_hva(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
-{
-       return 0;
-}
-
-static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       return 0;
-}
-
 static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
                                                         unsigned long address)
 {
index 9f52beb7cb1355e1ebf129b899830b7b463d67b2..889c908ee631b526594b5dfc32ef5dfde15480df 100644 (file)
@@ -31,28 +31,6 @@ struct kvm_decode {
        bool sign_extend;
 };
 
-/*
- * The in-kernel MMIO emulation code wants to use a copy of run->mmio,
- * which is an anonymous type. Use our own type instead.
- */
-struct kvm_exit_mmio {
-       phys_addr_t     phys_addr;
-       u8              data[8];
-       u32             len;
-       bool            is_write;
-       void            *private;
-};
-
-static inline void kvm_prepare_mmio(struct kvm_run *run,
-                                   struct kvm_exit_mmio *mmio)
-{
-       run->mmio.phys_addr     = mmio->phys_addr;
-       run->mmio.len           = mmio->len;
-       run->mmio.is_write      = mmio->is_write;
-       memcpy(run->mmio.data, mmio->data, mmio->len);
-       run->exit_reason        = KVM_EXIT_MMIO;
-}
-
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa);
index 3ef77a4660187ace735216639ffc1e8f8d11a38b..c154c0b7eb604ae09214beed276063f222af69ae 100644 (file)
@@ -191,6 +191,9 @@ struct kvm_arch_memory_slot {
 /* Highest supported SPI, from VGIC_NR_IRQS */
 #define KVM_ARM_IRQ_GIC_MAX            127
 
+/* One single KVM irqchip, ie. the VGIC */
+#define KVM_NR_IRQCHIPS          1
+
 /* PSCI interface */
 #define KVM_PSCI_FN_BASE               0x95c1ba5e
 #define KVM_PSCI_FN(n)                 (KVM_PSCI_FN_BASE + (n))
index 32aeea083d93b2391122ad9d1c49c3925121e38b..ec37ab3f524f303419d2cc3a82b79c119e61de1d 100644 (file)
@@ -200,7 +200,7 @@ up_fail:
 void update_vsyscall(struct timekeeper *tk)
 {
        struct timespec xtime_coarse;
-       u32 use_syscall = strcmp(tk->tkr.clock->name, "arch_sys_counter");
+       u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter");
 
        ++vdso_data->tb_seq_count;
        smp_wmb();
@@ -213,11 +213,11 @@ void update_vsyscall(struct timekeeper *tk)
        vdso_data->wtm_clock_nsec               = tk->wall_to_monotonic.tv_nsec;
 
        if (!use_syscall) {
-               vdso_data->cs_cycle_last        = tk->tkr.cycle_last;
+               vdso_data->cs_cycle_last        = tk->tkr_mono.cycle_last;
                vdso_data->xtime_clock_sec      = tk->xtime_sec;
-               vdso_data->xtime_clock_nsec     = tk->tkr.xtime_nsec;
-               vdso_data->cs_mult              = tk->tkr.mult;
-               vdso_data->cs_shift             = tk->tkr.shift;
+               vdso_data->xtime_clock_nsec     = tk->tkr_mono.xtime_nsec;
+               vdso_data->cs_mult              = tk->tkr_mono.mult;
+               vdso_data->cs_shift             = tk->tkr_mono.shift;
        }
 
        smp_wmb();
index f5590c81d95f9e494bd82b5dda127762d5ea01f7..5105e297ed5fef43509f264a001138d299fb5cba 100644 (file)
@@ -18,6 +18,7 @@ if VIRTUALIZATION
 
 config KVM
        bool "Kernel-based Virtual Machine (KVM) support"
+       depends on OF
        select MMU_NOTIFIER
        select PREEMPT_NOTIFIERS
        select ANON_INODES
@@ -25,10 +26,10 @@ config KVM
        select HAVE_KVM_ARCH_TLB_FLUSH_ALL
        select KVM_MMIO
        select KVM_ARM_HOST
-       select KVM_ARM_VGIC
-       select KVM_ARM_TIMER
        select KVM_GENERIC_DIRTYLOG_READ_PROTECT
        select SRCU
+       select HAVE_KVM_EVENTFD
+       select HAVE_KVM_IRQFD
        ---help---
          Support hosting virtualized guest machines.
 
@@ -50,17 +51,4 @@ config KVM_ARM_MAX_VCPUS
          large, so only choose a reasonable number that you expect to
          actually use.
 
-config KVM_ARM_VGIC
-       bool
-       depends on KVM_ARM_HOST && OF
-       select HAVE_KVM_IRQCHIP
-       ---help---
-         Adds support for a hardware assisted, in-kernel GIC emulation.
-
-config KVM_ARM_TIMER
-       bool
-       depends on KVM_ARM_VGIC
-       ---help---
-         Adds support for the Architected Timers in virtual machines.
-
 endif # VIRTUALIZATION
index 4e6e09ee4033503088d686af976a6f7b0f3ee46a..d5904f876cdb535a373c6299beeec09bb5538331 100644 (file)
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-ccflags-y += -Ivirt/kvm -Iarch/arm64/kvm
+ccflags-y += -Iarch/arm64/kvm
 CFLAGS_arm.o := -I.
 CFLAGS_mmu.o := -I.
 
@@ -11,7 +11,7 @@ ARM=../../../arch/arm/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
@@ -19,11 +19,11 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
 
-kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o
-kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o
-kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o
-kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v2-switch.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
index cdac7b3eeaf7fa6524b8f6ffb92edc7209eef4c5..80386470d3a4414da1a6ff17d14b624ddd417ad7 100644 (file)
        .set push
        SET_HARDFLOAT
        cfc1    \tmp,  fcr31
-       swc1    $f0,  THREAD_FPR0_LS64(\thread)
-       swc1    $f1,  THREAD_FPR1_LS64(\thread)
-       swc1    $f2,  THREAD_FPR2_LS64(\thread)
-       swc1    $f3,  THREAD_FPR3_LS64(\thread)
-       swc1    $f4,  THREAD_FPR4_LS64(\thread)
-       swc1    $f5,  THREAD_FPR5_LS64(\thread)
-       swc1    $f6,  THREAD_FPR6_LS64(\thread)
-       swc1    $f7,  THREAD_FPR7_LS64(\thread)
-       swc1    $f8,  THREAD_FPR8_LS64(\thread)
-       swc1    $f9,  THREAD_FPR9_LS64(\thread)
-       swc1    $f10, THREAD_FPR10_LS64(\thread)
-       swc1    $f11, THREAD_FPR11_LS64(\thread)
-       swc1    $f12, THREAD_FPR12_LS64(\thread)
-       swc1    $f13, THREAD_FPR13_LS64(\thread)
-       swc1    $f14, THREAD_FPR14_LS64(\thread)
-       swc1    $f15, THREAD_FPR15_LS64(\thread)
-       swc1    $f16, THREAD_FPR16_LS64(\thread)
-       swc1    $f17, THREAD_FPR17_LS64(\thread)
-       swc1    $f18, THREAD_FPR18_LS64(\thread)
-       swc1    $f19, THREAD_FPR19_LS64(\thread)
-       swc1    $f20, THREAD_FPR20_LS64(\thread)
-       swc1    $f21, THREAD_FPR21_LS64(\thread)
-       swc1    $f22, THREAD_FPR22_LS64(\thread)
-       swc1    $f23, THREAD_FPR23_LS64(\thread)
-       swc1    $f24, THREAD_FPR24_LS64(\thread)
-       swc1    $f25, THREAD_FPR25_LS64(\thread)
-       swc1    $f26, THREAD_FPR26_LS64(\thread)
-       swc1    $f27, THREAD_FPR27_LS64(\thread)
-       swc1    $f28, THREAD_FPR28_LS64(\thread)
-       swc1    $f29, THREAD_FPR29_LS64(\thread)
-       swc1    $f30, THREAD_FPR30_LS64(\thread)
-       swc1    $f31, THREAD_FPR31_LS64(\thread)
+       swc1    $f0,  THREAD_FPR0(\thread)
+       swc1    $f1,  THREAD_FPR1(\thread)
+       swc1    $f2,  THREAD_FPR2(\thread)
+       swc1    $f3,  THREAD_FPR3(\thread)
+       swc1    $f4,  THREAD_FPR4(\thread)
+       swc1    $f5,  THREAD_FPR5(\thread)
+       swc1    $f6,  THREAD_FPR6(\thread)
+       swc1    $f7,  THREAD_FPR7(\thread)
+       swc1    $f8,  THREAD_FPR8(\thread)
+       swc1    $f9,  THREAD_FPR9(\thread)
+       swc1    $f10, THREAD_FPR10(\thread)
+       swc1    $f11, THREAD_FPR11(\thread)
+       swc1    $f12, THREAD_FPR12(\thread)
+       swc1    $f13, THREAD_FPR13(\thread)
+       swc1    $f14, THREAD_FPR14(\thread)
+       swc1    $f15, THREAD_FPR15(\thread)
+       swc1    $f16, THREAD_FPR16(\thread)
+       swc1    $f17, THREAD_FPR17(\thread)
+       swc1    $f18, THREAD_FPR18(\thread)
+       swc1    $f19, THREAD_FPR19(\thread)
+       swc1    $f20, THREAD_FPR20(\thread)
+       swc1    $f21, THREAD_FPR21(\thread)
+       swc1    $f22, THREAD_FPR22(\thread)
+       swc1    $f23, THREAD_FPR23(\thread)
+       swc1    $f24, THREAD_FPR24(\thread)
+       swc1    $f25, THREAD_FPR25(\thread)
+       swc1    $f26, THREAD_FPR26(\thread)
+       swc1    $f27, THREAD_FPR27(\thread)
+       swc1    $f28, THREAD_FPR28(\thread)
+       swc1    $f29, THREAD_FPR29(\thread)
+       swc1    $f30, THREAD_FPR30(\thread)
+       swc1    $f31, THREAD_FPR31(\thread)
        sw      \tmp, THREAD_FCR31(\thread)
        .set pop
        .endm
        .set push
        SET_HARDFLOAT
        lw      \tmp, THREAD_FCR31(\thread)
-       lwc1    $f0,  THREAD_FPR0_LS64(\thread)
-       lwc1    $f1,  THREAD_FPR1_LS64(\thread)
-       lwc1    $f2,  THREAD_FPR2_LS64(\thread)
-       lwc1    $f3,  THREAD_FPR3_LS64(\thread)
-       lwc1    $f4,  THREAD_FPR4_LS64(\thread)
-       lwc1    $f5,  THREAD_FPR5_LS64(\thread)
-       lwc1    $f6,  THREAD_FPR6_LS64(\thread)
-       lwc1    $f7,  THREAD_FPR7_LS64(\thread)
-       lwc1    $f8,  THREAD_FPR8_LS64(\thread)
-       lwc1    $f9,  THREAD_FPR9_LS64(\thread)
-       lwc1    $f10, THREAD_FPR10_LS64(\thread)
-       lwc1    $f11, THREAD_FPR11_LS64(\thread)
-       lwc1    $f12, THREAD_FPR12_LS64(\thread)
-       lwc1    $f13, THREAD_FPR13_LS64(\thread)
-       lwc1    $f14, THREAD_FPR14_LS64(\thread)
-       lwc1    $f15, THREAD_FPR15_LS64(\thread)
-       lwc1    $f16, THREAD_FPR16_LS64(\thread)
-       lwc1    $f17, THREAD_FPR17_LS64(\thread)
-       lwc1    $f18, THREAD_FPR18_LS64(\thread)
-       lwc1    $f19, THREAD_FPR19_LS64(\thread)
-       lwc1    $f20, THREAD_FPR20_LS64(\thread)
-       lwc1    $f21, THREAD_FPR21_LS64(\thread)
-       lwc1    $f22, THREAD_FPR22_LS64(\thread)
-       lwc1    $f23, THREAD_FPR23_LS64(\thread)
-       lwc1    $f24, THREAD_FPR24_LS64(\thread)
-       lwc1    $f25, THREAD_FPR25_LS64(\thread)
-       lwc1    $f26, THREAD_FPR26_LS64(\thread)
-       lwc1    $f27, THREAD_FPR27_LS64(\thread)
-       lwc1    $f28, THREAD_FPR28_LS64(\thread)
-       lwc1    $f29, THREAD_FPR29_LS64(\thread)
-       lwc1    $f30, THREAD_FPR30_LS64(\thread)
-       lwc1    $f31, THREAD_FPR31_LS64(\thread)
+       lwc1    $f0,  THREAD_FPR0(\thread)
+       lwc1    $f1,  THREAD_FPR1(\thread)
+       lwc1    $f2,  THREAD_FPR2(\thread)
+       lwc1    $f3,  THREAD_FPR3(\thread)
+       lwc1    $f4,  THREAD_FPR4(\thread)
+       lwc1    $f5,  THREAD_FPR5(\thread)
+       lwc1    $f6,  THREAD_FPR6(\thread)
+       lwc1    $f7,  THREAD_FPR7(\thread)
+       lwc1    $f8,  THREAD_FPR8(\thread)
+       lwc1    $f9,  THREAD_FPR9(\thread)
+       lwc1    $f10, THREAD_FPR10(\thread)
+       lwc1    $f11, THREAD_FPR11(\thread)
+       lwc1    $f12, THREAD_FPR12(\thread)
+       lwc1    $f13, THREAD_FPR13(\thread)
+       lwc1    $f14, THREAD_FPR14(\thread)
+       lwc1    $f15, THREAD_FPR15(\thread)
+       lwc1    $f16, THREAD_FPR16(\thread)
+       lwc1    $f17, THREAD_FPR17(\thread)
+       lwc1    $f18, THREAD_FPR18(\thread)
+       lwc1    $f19, THREAD_FPR19(\thread)
+       lwc1    $f20, THREAD_FPR20(\thread)
+       lwc1    $f21, THREAD_FPR21(\thread)
+       lwc1    $f22, THREAD_FPR22(\thread)
+       lwc1    $f23, THREAD_FPR23(\thread)
+       lwc1    $f24, THREAD_FPR24(\thread)
+       lwc1    $f25, THREAD_FPR25(\thread)
+       lwc1    $f26, THREAD_FPR26(\thread)
+       lwc1    $f27, THREAD_FPR27(\thread)
+       lwc1    $f28, THREAD_FPR28(\thread)
+       lwc1    $f29, THREAD_FPR29(\thread)
+       lwc1    $f30, THREAD_FPR30(\thread)
+       lwc1    $f31, THREAD_FPR31(\thread)
        ctc1    \tmp, fcr31
        .set pop
        .endm
index 0cae4595e985bbc3d8043b3bb85aef66c582615b..6156ac8c4cfb9a854bf3ed3a5546606216161118 100644 (file)
        .set    push
        SET_HARDFLOAT
        cfc1    \tmp, fcr31
-       sdc1    $f0,  THREAD_FPR0_LS64(\thread)
-       sdc1    $f2,  THREAD_FPR2_LS64(\thread)
-       sdc1    $f4,  THREAD_FPR4_LS64(\thread)
-       sdc1    $f6,  THREAD_FPR6_LS64(\thread)
-       sdc1    $f8,  THREAD_FPR8_LS64(\thread)
-       sdc1    $f10, THREAD_FPR10_LS64(\thread)
-       sdc1    $f12, THREAD_FPR12_LS64(\thread)
-       sdc1    $f14, THREAD_FPR14_LS64(\thread)
-       sdc1    $f16, THREAD_FPR16_LS64(\thread)
-       sdc1    $f18, THREAD_FPR18_LS64(\thread)
-       sdc1    $f20, THREAD_FPR20_LS64(\thread)
-       sdc1    $f22, THREAD_FPR22_LS64(\thread)
-       sdc1    $f24, THREAD_FPR24_LS64(\thread)
-       sdc1    $f26, THREAD_FPR26_LS64(\thread)
-       sdc1    $f28, THREAD_FPR28_LS64(\thread)
-       sdc1    $f30, THREAD_FPR30_LS64(\thread)
+       sdc1    $f0,  THREAD_FPR0(\thread)
+       sdc1    $f2,  THREAD_FPR2(\thread)
+       sdc1    $f4,  THREAD_FPR4(\thread)
+       sdc1    $f6,  THREAD_FPR6(\thread)
+       sdc1    $f8,  THREAD_FPR8(\thread)
+       sdc1    $f10, THREAD_FPR10(\thread)
+       sdc1    $f12, THREAD_FPR12(\thread)
+       sdc1    $f14, THREAD_FPR14(\thread)
+       sdc1    $f16, THREAD_FPR16(\thread)
+       sdc1    $f18, THREAD_FPR18(\thread)
+       sdc1    $f20, THREAD_FPR20(\thread)
+       sdc1    $f22, THREAD_FPR22(\thread)
+       sdc1    $f24, THREAD_FPR24(\thread)
+       sdc1    $f26, THREAD_FPR26(\thread)
+       sdc1    $f28, THREAD_FPR28(\thread)
+       sdc1    $f30, THREAD_FPR30(\thread)
        sw      \tmp, THREAD_FCR31(\thread)
        .set    pop
        .endm
        .set    push
        .set    mips64r2
        SET_HARDFLOAT
-       sdc1    $f1,  THREAD_FPR1_LS64(\thread)
-       sdc1    $f3,  THREAD_FPR3_LS64(\thread)
-       sdc1    $f5,  THREAD_FPR5_LS64(\thread)
-       sdc1    $f7,  THREAD_FPR7_LS64(\thread)
-       sdc1    $f9,  THREAD_FPR9_LS64(\thread)
-       sdc1    $f11, THREAD_FPR11_LS64(\thread)
-       sdc1    $f13, THREAD_FPR13_LS64(\thread)
-       sdc1    $f15, THREAD_FPR15_LS64(\thread)
-       sdc1    $f17, THREAD_FPR17_LS64(\thread)
-       sdc1    $f19, THREAD_FPR19_LS64(\thread)
-       sdc1    $f21, THREAD_FPR21_LS64(\thread)
-       sdc1    $f23, THREAD_FPR23_LS64(\thread)
-       sdc1    $f25, THREAD_FPR25_LS64(\thread)
-       sdc1    $f27, THREAD_FPR27_LS64(\thread)
-       sdc1    $f29, THREAD_FPR29_LS64(\thread)
-       sdc1    $f31, THREAD_FPR31_LS64(\thread)
+       sdc1    $f1,  THREAD_FPR1(\thread)
+       sdc1    $f3,  THREAD_FPR3(\thread)
+       sdc1    $f5,  THREAD_FPR5(\thread)
+       sdc1    $f7,  THREAD_FPR7(\thread)
+       sdc1    $f9,  THREAD_FPR9(\thread)
+       sdc1    $f11, THREAD_FPR11(\thread)
+       sdc1    $f13, THREAD_FPR13(\thread)
+       sdc1    $f15, THREAD_FPR15(\thread)
+       sdc1    $f17, THREAD_FPR17(\thread)
+       sdc1    $f19, THREAD_FPR19(\thread)
+       sdc1    $f21, THREAD_FPR21(\thread)
+       sdc1    $f23, THREAD_FPR23(\thread)
+       sdc1    $f25, THREAD_FPR25(\thread)
+       sdc1    $f27, THREAD_FPR27(\thread)
+       sdc1    $f29, THREAD_FPR29(\thread)
+       sdc1    $f31, THREAD_FPR31(\thread)
        .set    pop
        .endm
 
        .set    push
        SET_HARDFLOAT
        lw      \tmp, THREAD_FCR31(\thread)
-       ldc1    $f0,  THREAD_FPR0_LS64(\thread)
-       ldc1    $f2,  THREAD_FPR2_LS64(\thread)
-       ldc1    $f4,  THREAD_FPR4_LS64(\thread)
-       ldc1    $f6,  THREAD_FPR6_LS64(\thread)
-       ldc1    $f8,  THREAD_FPR8_LS64(\thread)
-       ldc1    $f10, THREAD_FPR10_LS64(\thread)
-       ldc1    $f12, THREAD_FPR12_LS64(\thread)
-       ldc1    $f14, THREAD_FPR14_LS64(\thread)
-       ldc1    $f16, THREAD_FPR16_LS64(\thread)
-       ldc1    $f18, THREAD_FPR18_LS64(\thread)
-       ldc1    $f20, THREAD_FPR20_LS64(\thread)
-       ldc1    $f22, THREAD_FPR22_LS64(\thread)
-       ldc1    $f24, THREAD_FPR24_LS64(\thread)
-       ldc1    $f26, THREAD_FPR26_LS64(\thread)
-       ldc1    $f28, THREAD_FPR28_LS64(\thread)
-       ldc1    $f30, THREAD_FPR30_LS64(\thread)
+       ldc1    $f0,  THREAD_FPR0(\thread)
+       ldc1    $f2,  THREAD_FPR2(\thread)
+       ldc1    $f4,  THREAD_FPR4(\thread)
+       ldc1    $f6,  THREAD_FPR6(\thread)
+       ldc1    $f8,  THREAD_FPR8(\thread)
+       ldc1    $f10, THREAD_FPR10(\thread)
+       ldc1    $f12, THREAD_FPR12(\thread)
+       ldc1    $f14, THREAD_FPR14(\thread)
+       ldc1    $f16, THREAD_FPR16(\thread)
+       ldc1    $f18, THREAD_FPR18(\thread)
+       ldc1    $f20, THREAD_FPR20(\thread)
+       ldc1    $f22, THREAD_FPR22(\thread)
+       ldc1    $f24, THREAD_FPR24(\thread)
+       ldc1    $f26, THREAD_FPR26(\thread)
+       ldc1    $f28, THREAD_FPR28(\thread)
+       ldc1    $f30, THREAD_FPR30(\thread)
        ctc1    \tmp, fcr31
        .endm
 
        .set    push
        .set    mips64r2
        SET_HARDFLOAT
-       ldc1    $f1,  THREAD_FPR1_LS64(\thread)
-       ldc1    $f3,  THREAD_FPR3_LS64(\thread)
-       ldc1    $f5,  THREAD_FPR5_LS64(\thread)
-       ldc1    $f7,  THREAD_FPR7_LS64(\thread)
-       ldc1    $f9,  THREAD_FPR9_LS64(\thread)
-       ldc1    $f11, THREAD_FPR11_LS64(\thread)
-       ldc1    $f13, THREAD_FPR13_LS64(\thread)
-       ldc1    $f15, THREAD_FPR15_LS64(\thread)
-       ldc1    $f17, THREAD_FPR17_LS64(\thread)
-       ldc1    $f19, THREAD_FPR19_LS64(\thread)
-       ldc1    $f21, THREAD_FPR21_LS64(\thread)
-       ldc1    $f23, THREAD_FPR23_LS64(\thread)
-       ldc1    $f25, THREAD_FPR25_LS64(\thread)
-       ldc1    $f27, THREAD_FPR27_LS64(\thread)
-       ldc1    $f29, THREAD_FPR29_LS64(\thread)
-       ldc1    $f31, THREAD_FPR31_LS64(\thread)
+       ldc1    $f1,  THREAD_FPR1(\thread)
+       ldc1    $f3,  THREAD_FPR3(\thread)
+       ldc1    $f5,  THREAD_FPR5(\thread)
+       ldc1    $f7,  THREAD_FPR7(\thread)
+       ldc1    $f9,  THREAD_FPR9(\thread)
+       ldc1    $f11, THREAD_FPR11(\thread)
+       ldc1    $f13, THREAD_FPR13(\thread)
+       ldc1    $f15, THREAD_FPR15(\thread)
+       ldc1    $f17, THREAD_FPR17(\thread)
+       ldc1    $f19, THREAD_FPR19(\thread)
+       ldc1    $f21, THREAD_FPR21(\thread)
+       ldc1    $f23, THREAD_FPR23(\thread)
+       ldc1    $f25, THREAD_FPR25(\thread)
+       ldc1    $f27, THREAD_FPR27(\thread)
+       ldc1    $f29, THREAD_FPR29(\thread)
+       ldc1    $f31, THREAD_FPR31(\thread)
        .set    pop
        .endm
 
        .endm
 
 #ifdef TOOLCHAIN_SUPPORTS_MSA
+       .macro  _cfcmsa rd, cs
+       .set    push
+       .set    mips32r2
+       .set    msa
+       cfcmsa  \rd, $\cs
+       .set    pop
+       .endm
+
+       .macro  _ctcmsa cd, rs
+       .set    push
+       .set    mips32r2
+       .set    msa
+       ctcmsa  $\cd, \rs
+       .set    pop
+       .endm
+
        .macro  ld_d    wd, off, base
        .set    push
        .set    mips32r2
        .set    pop
        .endm
 
-       .macro  copy_u_w        rd, ws, n
+       .macro  copy_u_w        ws, n
        .set    push
        .set    mips32r2
        .set    msa
-       copy_u.w \rd, $w\ws[\n]
+       copy_u.w $1, $w\ws[\n]
        .set    pop
        .endm
 
-       .macro  copy_u_d        rd, ws, n
+       .macro  copy_u_d        ws, n
        .set    push
        .set    mips64r2
        .set    msa
-       copy_u.d \rd, $w\ws[\n]
+       copy_u.d $1, $w\ws[\n]
        .set    pop
        .endm
 
-       .macro  insert_w        wd, n, rs
+       .macro  insert_w        wd, n
        .set    push
        .set    mips32r2
        .set    msa
-       insert.w $w\wd[\n], \rs
+       insert.w $w\wd[\n], $1
        .set    pop
        .endm
 
-       .macro  insert_d        wd, n, rs
+       .macro  insert_d        wd, n
        .set    push
        .set    mips64r2
        .set    msa
-       insert.d $w\wd[\n], \rs
+       insert.d $w\wd[\n], $1
        .set    pop
        .endm
 #else
        /*
         * Temporary until all toolchains in use include MSA support.
         */
-       .macro  cfcmsa  rd, cs
+       .macro  _cfcmsa rd, cs
        .set    push
        .set    noat
        SET_HARDFLOAT
        .set    pop
        .endm
 
-       .macro  ctcmsa  cd, rs
+       .macro  _ctcmsa cd, rs
        .set    push
        .set    noat
        SET_HARDFLOAT
        .set    pop
        .endm
 
-       .macro  copy_u_w        rd, ws, n
+       .macro  copy_u_w        ws, n
        .set    push
        .set    noat
        SET_HARDFLOAT
        .insn
        .word   COPY_UW_MSA_INSN | (\n << 16) | (\ws << 11)
-       /* move triggers an assembler bug... */
-       or      \rd, $1, zero
        .set    pop
        .endm
 
-       .macro  copy_u_d        rd, ws, n
+       .macro  copy_u_d        ws, n
        .set    push
        .set    noat
        SET_HARDFLOAT
        .insn
        .word   COPY_UD_MSA_INSN | (\n << 16) | (\ws << 11)
-       /* move triggers an assembler bug... */
-       or      \rd, $1, zero
        .set    pop
        .endm
 
-       .macro  insert_w        wd, n, rs
+       .macro  insert_w        wd, n
        .set    push
        .set    noat
        SET_HARDFLOAT
-       /* move triggers an assembler bug... */
-       or      $1, \rs, zero
        .word   INSERT_W_MSA_INSN | (\n << 16) | (\wd << 6)
        .set    pop
        .endm
 
-       .macro  insert_d        wd, n, rs
+       .macro  insert_d        wd, n
        .set    push
        .set    noat
        SET_HARDFLOAT
-       /* move triggers an assembler bug... */
-       or      $1, \rs, zero
        .word   INSERT_D_MSA_INSN | (\n << 16) | (\wd << 6)
        .set    pop
        .endm
        .set    push
        .set    noat
        SET_HARDFLOAT
-       cfcmsa  $1, MSA_CSR
+       _cfcmsa $1, MSA_CSR
        sw      $1, THREAD_MSA_CSR(\thread)
        .set    pop
        .endm
        .set    noat
        SET_HARDFLOAT
        lw      $1, THREAD_MSA_CSR(\thread)
-       ctcmsa  MSA_CSR, $1
+       _ctcmsa MSA_CSR, $1
        .set    pop
        ld_d    0, THREAD_FPR0, \thread
        ld_d    1, THREAD_FPR1, \thread
        insert_w \wd, 2
        insert_w \wd, 3
 #endif
-       .if     31-\wd
-       msa_init_upper  (\wd+1)
-       .endif
        .endm
 
        .macro  msa_init_all_upper
        SET_HARDFLOAT
        not     $1, zero
        msa_init_upper  0
+       msa_init_upper  1
+       msa_init_upper  2
+       msa_init_upper  3
+       msa_init_upper  4
+       msa_init_upper  5
+       msa_init_upper  6
+       msa_init_upper  7
+       msa_init_upper  8
+       msa_init_upper  9
+       msa_init_upper  10
+       msa_init_upper  11
+       msa_init_upper  12
+       msa_init_upper  13
+       msa_init_upper  14
+       msa_init_upper  15
+       msa_init_upper  16
+       msa_init_upper  17
+       msa_init_upper  18
+       msa_init_upper  19
+       msa_init_upper  20
+       msa_init_upper  21
+       msa_init_upper  22
+       msa_init_upper  23
+       msa_init_upper  24
+       msa_init_upper  25
+       msa_init_upper  26
+       msa_init_upper  27
+       msa_init_upper  28
+       msa_init_upper  29
+       msa_init_upper  30
+       msa_init_upper  31
        .set    pop
        .endm
 
index dd083e999b08a14ffdbef46d5f5f4a0731e9f18e..b104ad9d655f2da157544fcf783a225377cb996d 100644 (file)
@@ -48,6 +48,12 @@ enum fpu_mode {
 #define FPU_FR_MASK            0x1
 };
 
+#define __disable_fpu()                                                        \
+do {                                                                   \
+       clear_c0_status(ST0_CU1);                                       \
+       disable_fpu_hazard();                                           \
+} while (0)
+
 static inline int __enable_fpu(enum fpu_mode mode)
 {
        int fr;
@@ -86,7 +92,12 @@ fr_common:
                enable_fpu_hazard();
 
                /* check FR has the desired value */
-               return (!!(read_c0_status() & ST0_FR) == !!fr) ? 0 : SIGFPE;
+               if (!!(read_c0_status() & ST0_FR) == !!fr)
+                       return 0;
+
+               /* unsupported FR value */
+               __disable_fpu();
+               return SIGFPE;
 
        default:
                BUG();
@@ -95,12 +106,6 @@ fr_common:
        return SIGFPE;
 }
 
-#define __disable_fpu()                                                        \
-do {                                                                   \
-       clear_c0_status(ST0_CU1);                                       \
-       disable_fpu_hazard();                                           \
-} while (0)
-
 #define clear_fpu_owner()      clear_thread_flag(TIF_USEDFPU)
 
 static inline int __is_fpu_owner(void)
@@ -170,6 +175,7 @@ static inline void lose_fpu(int save)
                }
                disable_msa();
                clear_thread_flag(TIF_USEDMSA);
+               __disable_fpu();
        } else if (is_fpu_owner()) {
                if (save)
                        _save_fp(current);
index fdbff44e5482cd20acc5efc798091894a3292c4f..608aa57799c8194a88d39d2e9b6d9968e072ac8c 100644 (file)
@@ -8,9 +8,9 @@
 #ifndef _ASM_MIPS_JUMP_LABEL_H
 #define _ASM_MIPS_JUMP_LABEL_H
 
-#include <linux/types.h>
+#ifndef __ASSEMBLY__
 
-#ifdef __KERNEL__
+#include <linux/types.h>
 
 #define JUMP_LABEL_NOP_SIZE 4
 
@@ -39,8 +39,6 @@ l_yes:
        return true;
 }
 
-#endif /* __KERNEL__ */
-
 #ifdef CONFIG_64BIT
 typedef u64 jump_label_t;
 #else
@@ -53,4 +51,5 @@ struct jump_entry {
        jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif /* _ASM_MIPS_JUMP_LABEL_H */
index 6a9af5fcb5d72ef7878dc9581f366b07568a1c0f..cba22ab7ad4d5fd9087aeb8181c6c45ba3398a8c 100644 (file)
@@ -10,7 +10,8 @@ enum die_val {
        DIE_RI,
        DIE_PAGE_FAULT,
        DIE_BREAK,
-       DIE_SSTEPBP
+       DIE_SSTEPBP,
+       DIE_MSAFP
 };
 
 #endif /* _ASM_MIPS_KDEBUG_H */
index ac4fc716062b791003c76f5572d56863bcdcb2cd..4c25823563fe16dfe8f4008351c111eb0dd5c4ad 100644 (file)
 
 /* MIPS KVM register ids */
 #define MIPS_CP0_32(_R, _S)                                    \
-       (KVM_REG_MIPS | KVM_REG_SIZE_U32 | 0x10000 | (8 * (_R) + (_S)))
+       (KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
 
 #define MIPS_CP0_64(_R, _S)                                    \
-       (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x10000 | (8 * (_R) + (_S)))
+       (KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U64 | (8 * (_R) + (_S)))
 
 #define KVM_REG_MIPS_CP0_INDEX         MIPS_CP0_32(0, 0)
 #define KVM_REG_MIPS_CP0_ENTRYLO0      MIPS_CP0_64(2, 0)
 #define KVM_REG_MIPS_CP0_STATUS                MIPS_CP0_32(12, 0)
 #define KVM_REG_MIPS_CP0_CAUSE         MIPS_CP0_32(13, 0)
 #define KVM_REG_MIPS_CP0_EPC           MIPS_CP0_64(14, 0)
+#define KVM_REG_MIPS_CP0_PRID          MIPS_CP0_32(15, 0)
 #define KVM_REG_MIPS_CP0_EBASE         MIPS_CP0_64(15, 1)
 #define KVM_REG_MIPS_CP0_CONFIG                MIPS_CP0_32(16, 0)
 #define KVM_REG_MIPS_CP0_CONFIG1       MIPS_CP0_32(16, 1)
 #define KVM_REG_MIPS_CP0_CONFIG2       MIPS_CP0_32(16, 2)
 #define KVM_REG_MIPS_CP0_CONFIG3       MIPS_CP0_32(16, 3)
+#define KVM_REG_MIPS_CP0_CONFIG4       MIPS_CP0_32(16, 4)
+#define KVM_REG_MIPS_CP0_CONFIG5       MIPS_CP0_32(16, 5)
 #define KVM_REG_MIPS_CP0_CONFIG7       MIPS_CP0_32(16, 7)
 #define KVM_REG_MIPS_CP0_XCONTEXT      MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC      MIPS_CP0_64(30, 0)
@@ -119,6 +122,10 @@ struct kvm_vcpu_stat {
        u32 syscall_exits;
        u32 resvd_inst_exits;
        u32 break_inst_exits;
+       u32 trap_inst_exits;
+       u32 msa_fpe_exits;
+       u32 fpe_exits;
+       u32 msa_disabled_exits;
        u32 flush_dcache_exits;
        u32 halt_successful_poll;
        u32 halt_wakeup;
@@ -138,6 +145,10 @@ enum kvm_mips_exit_types {
        SYSCALL_EXITS,
        RESVD_INST_EXITS,
        BREAK_INST_EXITS,
+       TRAP_INST_EXITS,
+       MSA_FPE_EXITS,
+       FPE_EXITS,
+       MSA_DISABLED_EXITS,
        FLUSH_DCACHE_EXITS,
        MAX_KVM_MIPS_EXIT_TYPES
 };
@@ -206,6 +217,8 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG1_SEL   1
 #define MIPS_CP0_CONFIG2_SEL   2
 #define MIPS_CP0_CONFIG3_SEL   3
+#define MIPS_CP0_CONFIG4_SEL   4
+#define MIPS_CP0_CONFIG5_SEL   5
 
 /* Config0 register bits */
 #define CP0C0_M                        31
@@ -262,31 +275,6 @@ struct mips_coproc {
 #define CP0C3_SM               1
 #define CP0C3_TL               0
 
-/* Have config1, Cacheable, noncoherent, write-back, write allocate*/
-#define MIPS_CONFIG0                                           \
-  ((1 << CP0C0_M) | (0x3 << CP0C0_K0))
-
-/* Have config2, no coprocessor2 attached, no MDMX support attached,
-   no performance counters, watch registers present,
-   no code compression, EJTAG present, no FPU, no watch registers */
-#define MIPS_CONFIG1                                           \
-((1 << CP0C1_M) |                                              \
- (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) |         \
- (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) |         \
- (0 << CP0C1_FP))
-
-/* Have config3, no tertiary/secondary caches implemented */
-#define MIPS_CONFIG2                                           \
-((1 << CP0C2_M))
-
-/* No config4, no DSP ASE, no large physaddr (PABITS),
-   no external interrupt controller, no vectored interrupts,
-   no 1kb pages, no SmartMIPS ASE, no trace logic */
-#define MIPS_CONFIG3                                           \
-((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) |       \
- (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) |     \
- (0 << CP0C3_SM) | (0 << CP0C3_TL))
-
 /* MMU types, the first four entries have the same layout as the
    CP0C0_MT field.  */
 enum mips_mmu_types {
@@ -321,7 +309,9 @@ enum mips_mmu_types {
  */
 #define T_TRAP                 13      /* Trap instruction */
 #define T_VCEI                 14      /* Virtual coherency exception */
+#define T_MSAFPE               14      /* MSA floating point exception */
 #define T_FPE                  15      /* Floating point exception */
+#define T_MSADIS               21      /* MSA disabled exception */
 #define T_WATCH                        23      /* Watch address reference */
 #define T_VCED                 31      /* Virtual coherency data */
 
@@ -374,6 +364,9 @@ struct kvm_mips_tlb {
        long tlb_lo1;
 };
 
+#define KVM_MIPS_FPU_FPU       0x1
+#define KVM_MIPS_FPU_MSA       0x2
+
 #define KVM_MIPS_GUEST_TLB_SIZE        64
 struct kvm_vcpu_arch {
        void *host_ebase, *guest_ebase;
@@ -395,6 +388,8 @@ struct kvm_vcpu_arch {
 
        /* FPU State */
        struct mips_fpu_struct fpu;
+       /* Which FPU state is loaded (KVM_MIPS_FPU_*) */
+       unsigned int fpu_inuse;
 
        /* COP0 State */
        struct mips_coproc *cop0;
@@ -441,6 +436,9 @@ struct kvm_vcpu_arch {
 
        /* WAIT executed */
        int wait;
+
+       u8 fpu_enabled;
+       u8 msa_enabled;
 };
 
 
@@ -482,11 +480,15 @@ struct kvm_vcpu_arch {
 #define kvm_read_c0_guest_config1(cop0)                (cop0->reg[MIPS_CP0_CONFIG][1])
 #define kvm_read_c0_guest_config2(cop0)                (cop0->reg[MIPS_CP0_CONFIG][2])
 #define kvm_read_c0_guest_config3(cop0)                (cop0->reg[MIPS_CP0_CONFIG][3])
+#define kvm_read_c0_guest_config4(cop0)                (cop0->reg[MIPS_CP0_CONFIG][4])
+#define kvm_read_c0_guest_config5(cop0)                (cop0->reg[MIPS_CP0_CONFIG][5])
 #define kvm_read_c0_guest_config7(cop0)                (cop0->reg[MIPS_CP0_CONFIG][7])
 #define kvm_write_c0_guest_config(cop0, val)   (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
 #define kvm_write_c0_guest_config1(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
 #define kvm_write_c0_guest_config2(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
 #define kvm_write_c0_guest_config3(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
+#define kvm_write_c0_guest_config4(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][4] = (val))
+#define kvm_write_c0_guest_config5(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][5] = (val))
 #define kvm_write_c0_guest_config7(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
 #define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
 #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
@@ -567,6 +569,31 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
        kvm_set_c0_guest_ebase(cop0, ((val) & (change)));               \
 }
 
+/* Helpers */
+
+static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
+{
+       return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) &&
+               vcpu->fpu_enabled;
+}
+
+static inline bool kvm_mips_guest_has_fpu(struct kvm_vcpu_arch *vcpu)
+{
+       return kvm_mips_guest_can_have_fpu(vcpu) &&
+               kvm_read_c0_guest_config1(vcpu->cop0) & MIPS_CONF1_FP;
+}
+
+static inline bool kvm_mips_guest_can_have_msa(struct kvm_vcpu_arch *vcpu)
+{
+       return (!__builtin_constant_p(cpu_has_msa) || cpu_has_msa) &&
+               vcpu->msa_enabled;
+}
+
+static inline bool kvm_mips_guest_has_msa(struct kvm_vcpu_arch *vcpu)
+{
+       return kvm_mips_guest_can_have_msa(vcpu) &&
+               kvm_read_c0_guest_config3(vcpu->cop0) & MIPS_CONF3_MSA;
+}
 
 struct kvm_mips_callbacks {
        int (*handle_cop_unusable)(struct kvm_vcpu *vcpu);
@@ -578,6 +605,10 @@ struct kvm_mips_callbacks {
        int (*handle_syscall)(struct kvm_vcpu *vcpu);
        int (*handle_res_inst)(struct kvm_vcpu *vcpu);
        int (*handle_break)(struct kvm_vcpu *vcpu);
+       int (*handle_trap)(struct kvm_vcpu *vcpu);
+       int (*handle_msa_fpe)(struct kvm_vcpu *vcpu);
+       int (*handle_fpe)(struct kvm_vcpu *vcpu);
+       int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
        int (*vm_init)(struct kvm *kvm);
        int (*vcpu_init)(struct kvm_vcpu *vcpu);
        int (*vcpu_setup)(struct kvm_vcpu *vcpu);
@@ -596,6 +627,8 @@ struct kvm_mips_callbacks {
                           const struct kvm_one_reg *reg, s64 *v);
        int (*set_one_reg)(struct kvm_vcpu *vcpu,
                           const struct kvm_one_reg *reg, s64 v);
+       int (*vcpu_get_regs)(struct kvm_vcpu *vcpu);
+       int (*vcpu_set_regs)(struct kvm_vcpu *vcpu);
 };
 extern struct kvm_mips_callbacks *kvm_mips_callbacks;
 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
@@ -606,6 +639,19 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
 /* Trampoline ASM routine to start running in "Guest" context */
 extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
 
+/* FPU/MSA context management */
+void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu);
+void __kvm_restore_fpu(struct kvm_vcpu_arch *vcpu);
+void __kvm_restore_fcsr(struct kvm_vcpu_arch *vcpu);
+void __kvm_save_msa(struct kvm_vcpu_arch *vcpu);
+void __kvm_restore_msa(struct kvm_vcpu_arch *vcpu);
+void __kvm_restore_msa_upper(struct kvm_vcpu_arch *vcpu);
+void __kvm_restore_msacsr(struct kvm_vcpu_arch *vcpu);
+void kvm_own_fpu(struct kvm_vcpu *vcpu);
+void kvm_own_msa(struct kvm_vcpu *vcpu);
+void kvm_drop_fpu(struct kvm_vcpu *vcpu);
+void kvm_lose_fpu(struct kvm_vcpu *vcpu);
+
 /* TLB handling */
 uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
 
@@ -711,6 +757,26 @@ extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
                                                     struct kvm_run *run,
                                                     struct kvm_vcpu *vcpu);
 
+extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
+                                                      uint32_t *opc,
+                                                      struct kvm_run *run,
+                                                      struct kvm_vcpu *vcpu);
+
+extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
+                                                        uint32_t *opc,
+                                                        struct kvm_run *run,
+                                                        struct kvm_vcpu *vcpu);
+
+extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
+                                                     uint32_t *opc,
+                                                     struct kvm_run *run,
+                                                     struct kvm_vcpu *vcpu);
+
+extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
+                                                        uint32_t *opc,
+                                                        struct kvm_run *run,
+                                                        struct kvm_vcpu *vcpu);
+
 extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
                                                         struct kvm_run *run);
 
@@ -749,6 +815,11 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst,
                                            struct kvm_run *run,
                                            struct kvm_vcpu *vcpu);
 
+unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu);
+unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu);
+unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
+unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
+
 /* Dynamic binary translation */
 extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
                                      struct kvm_vcpu *vcpu);
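
The helpers added above combine three conditions before the guest is treated as owning an FPU or MSA unit: the host must have the feature, userland must have enabled it (fpu_enabled/msa_enabled), and the guest's Config1.FP or Config3.MSA bit must be set. A minimal, hypothetical sketch of how a caller could pair them with the new fpu_inuse ownership flags; maybe_own_guest_fpu() itself is illustrative and not part of this patch:

/* Hypothetical caller, sketched against the declarations above; not from the patch.
 * kvm_own_fpu()/kvm_own_msa() are assumed to make the corresponding hardware
 * context available to the guest and to update fpu_inuse themselves.
 */
static void maybe_own_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (kvm_mips_guest_has_msa(&vcpu->arch)) {
		/* Guest has MSA: take vector ownership if not already resident. */
		if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA))
			kvm_own_msa(vcpu);
	} else if (kvm_mips_guest_has_fpu(&vcpu->arch)) {
		/* Scalar FP only. */
		if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU))
			kvm_own_fpu(vcpu);
	}
}
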
index b5dcbee01fd7a52641584cbbf8b80848f7c6f4b9..9b3b48e21c221ffdcfc04ec9cae6165576043225 100644 (file)
@@ -105,7 +105,7 @@ union fpureg {
 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 # define FPR_IDX(width, idx)   (idx)
 #else
-# define FPR_IDX(width, idx)   ((FPU_REG_WIDTH / (width)) - 1 - (idx))
+# define FPR_IDX(width, idx)   ((idx) ^ ((64 / (width)) - 1))
 #endif
 
 #define BUILD_FPR_ACCESS(width) \
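
The new big-endian FPR_IDX() XORs the index with the number of elements per 64-bit doubleword minus one, so a 64-bit access always maps to its own element and only sub-doubleword (32-bit) elements are swapped; that is what lets the swc1/sdc1 macros earlier in the series use the plain THREAD_FPRn offsets instead of the removed THREAD_FPRn_LS64 ones. A standalone check of the arithmetic (the program itself is illustrative only):

#include <assert.h>

/* Big-endian form of the macro as added above. */
#define FPR_IDX(width, idx)	((idx) ^ ((64 / (width)) - 1))

int main(void)
{
	assert(FPR_IDX(64, 0) == 0);	/* 64-bit view: identity mapping */
	assert(FPR_IDX(32, 0) == 1);	/* 32-bit halves swap inside a doubleword */
	assert(FPR_IDX(32, 1) == 0);
	return 0;
}
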
index 2c04b6d9ff85380de722745e934944411a5e33d3..6985eb59b08534581f7b4316655367182cb6f64f 100644 (file)
@@ -36,77 +36,85 @@ struct kvm_regs {
 
 /*
  * for KVM_GET_FPU and KVM_SET_FPU
- *
- * If Status[FR] is zero (32-bit FPU), the upper 32-bits of the FPRs
- * are zero filled.
  */
 struct kvm_fpu {
-       __u64 fpr[32];
-       __u32 fir;
-       __u32 fccr;
-       __u32 fexr;
-       __u32 fenr;
-       __u32 fcsr;
-       __u32 pad;
 };
 
 
 /*
- * For MIPS, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access CP0
+ * For MIPS, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various
  * registers.  The id field is broken down as follows:
  *
- *  bits[2..0]   - Register 'sel' index.
- *  bits[7..3]   - Register 'rd'  index.
- *  bits[15..8]  - Must be zero.
- *  bits[31..16] - 1 -> CP0 registers.
- *  bits[51..32] - Must be zero.
  *  bits[63..52] - As per linux/kvm.h
+ *  bits[51..32] - Must be zero.
+ *  bits[31..16] - Register set.
+ *
+ * Register set = 0: GP registers from kvm_regs (see definitions below).
+ *
+ * Register set = 1: CP0 registers.
+ *  bits[15..8]  - Must be zero.
+ *  bits[7..3]   - Register 'rd'  index.
+ *  bits[2..0]   - Register 'sel' index.
+ *
+ * Register set = 2: KVM specific registers (see definitions below).
+ *
+ * Register set = 3: FPU / MSA registers (see definitions below).
  *
  * Other sets registers may be added in the future.  Each set would
  * have its own identifier in bits[31..16].
- *
- * The registers defined in struct kvm_regs are also accessible, the
- * id values for these are below.
  */
 
-#define KVM_REG_MIPS_R0 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0)
-#define KVM_REG_MIPS_R1 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 1)
-#define KVM_REG_MIPS_R2 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 2)
-#define KVM_REG_MIPS_R3 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 3)
-#define KVM_REG_MIPS_R4 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 4)
-#define KVM_REG_MIPS_R5 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 5)
-#define KVM_REG_MIPS_R6 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 6)
-#define KVM_REG_MIPS_R7 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 7)
-#define KVM_REG_MIPS_R8 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 8)
-#define KVM_REG_MIPS_R9 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 9)
-#define KVM_REG_MIPS_R10 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 10)
-#define KVM_REG_MIPS_R11 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 11)
-#define KVM_REG_MIPS_R12 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 12)
-#define KVM_REG_MIPS_R13 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 13)
-#define KVM_REG_MIPS_R14 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 14)
-#define KVM_REG_MIPS_R15 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 15)
-#define KVM_REG_MIPS_R16 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 16)
-#define KVM_REG_MIPS_R17 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 17)
-#define KVM_REG_MIPS_R18 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 18)
-#define KVM_REG_MIPS_R19 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 19)
-#define KVM_REG_MIPS_R20 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 20)
-#define KVM_REG_MIPS_R21 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 21)
-#define KVM_REG_MIPS_R22 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 22)
-#define KVM_REG_MIPS_R23 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 23)
-#define KVM_REG_MIPS_R24 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 24)
-#define KVM_REG_MIPS_R25 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 25)
-#define KVM_REG_MIPS_R26 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 26)
-#define KVM_REG_MIPS_R27 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 27)
-#define KVM_REG_MIPS_R28 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 28)
-#define KVM_REG_MIPS_R29 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 29)
-#define KVM_REG_MIPS_R30 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 30)
-#define KVM_REG_MIPS_R31 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 31)
-
-#define KVM_REG_MIPS_HI (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 32)
-#define KVM_REG_MIPS_LO (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 33)
-#define KVM_REG_MIPS_PC (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 34)
-
-/* KVM specific control registers */
+#define KVM_REG_MIPS_GP                (KVM_REG_MIPS | 0x0000000000000000ULL)
+#define KVM_REG_MIPS_CP0       (KVM_REG_MIPS | 0x0000000000010000ULL)
+#define KVM_REG_MIPS_KVM       (KVM_REG_MIPS | 0x0000000000020000ULL)
+#define KVM_REG_MIPS_FPU       (KVM_REG_MIPS | 0x0000000000030000ULL)
+
+
+/*
+ * KVM_REG_MIPS_GP - General purpose registers from kvm_regs.
+ */
+
+#define KVM_REG_MIPS_R0                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  0)
+#define KVM_REG_MIPS_R1                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  1)
+#define KVM_REG_MIPS_R2                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  2)
+#define KVM_REG_MIPS_R3                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  3)
+#define KVM_REG_MIPS_R4                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  4)
+#define KVM_REG_MIPS_R5                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  5)
+#define KVM_REG_MIPS_R6                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  6)
+#define KVM_REG_MIPS_R7                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  7)
+#define KVM_REG_MIPS_R8                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  8)
+#define KVM_REG_MIPS_R9                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 |  9)
+#define KVM_REG_MIPS_R10       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 10)
+#define KVM_REG_MIPS_R11       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 11)
+#define KVM_REG_MIPS_R12       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 12)
+#define KVM_REG_MIPS_R13       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 13)
+#define KVM_REG_MIPS_R14       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 14)
+#define KVM_REG_MIPS_R15       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 15)
+#define KVM_REG_MIPS_R16       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 16)
+#define KVM_REG_MIPS_R17       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 17)
+#define KVM_REG_MIPS_R18       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 18)
+#define KVM_REG_MIPS_R19       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 19)
+#define KVM_REG_MIPS_R20       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 20)
+#define KVM_REG_MIPS_R21       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 21)
+#define KVM_REG_MIPS_R22       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 22)
+#define KVM_REG_MIPS_R23       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 23)
+#define KVM_REG_MIPS_R24       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 24)
+#define KVM_REG_MIPS_R25       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 25)
+#define KVM_REG_MIPS_R26       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 26)
+#define KVM_REG_MIPS_R27       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 27)
+#define KVM_REG_MIPS_R28       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 28)
+#define KVM_REG_MIPS_R29       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 29)
+#define KVM_REG_MIPS_R30       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 30)
+#define KVM_REG_MIPS_R31       (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 31)
+
+#define KVM_REG_MIPS_HI                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 32)
+#define KVM_REG_MIPS_LO                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 33)
+#define KVM_REG_MIPS_PC                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34)
+
+
+/*
+ * KVM_REG_MIPS_KVM - KVM specific control registers.
+ */
 
 /*
  * CP0_Count control
@@ -118,8 +126,7 @@ struct kvm_fpu {
  *        safely without losing time or guest timer interrupts.
  * Other: Reserved, do not change.
  */
-#define KVM_REG_MIPS_COUNT_CTL         (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
-                                        0x20000 | 0)
+#define KVM_REG_MIPS_COUNT_CTL     (KVM_REG_MIPS_KVM | KVM_REG_SIZE_U64 | 0)
 #define KVM_REG_MIPS_COUNT_CTL_DC      0x00000001
 
 /*
@@ -131,15 +138,46 @@ struct kvm_fpu {
  * emulated.
  * Modifications to times in the future are rejected.
  */
-#define KVM_REG_MIPS_COUNT_RESUME      (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
-                                        0x20000 | 1)
+#define KVM_REG_MIPS_COUNT_RESUME   (KVM_REG_MIPS_KVM | KVM_REG_SIZE_U64 | 1)
 /*
  * CP0_Count rate in Hz
  * Specifies the rate of the CP0_Count timer in Hz. Modifications occur without
  * discontinuities in CP0_Count.
  */
-#define KVM_REG_MIPS_COUNT_HZ          (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
-                                        0x20000 | 2)
+#define KVM_REG_MIPS_COUNT_HZ      (KVM_REG_MIPS_KVM | KVM_REG_SIZE_U64 | 2)
+
+
+/*
+ * KVM_REG_MIPS_FPU - Floating Point and MIPS SIMD Architecture (MSA) registers.
+ *
+ *  bits[15..8]  - Register subset (see definitions below).
+ *  bits[7..5]   - Must be zero.
+ *  bits[4..0]   - Register number within register subset.
+ */
+
+#define KVM_REG_MIPS_FPR       (KVM_REG_MIPS_FPU | 0x0000000000000000ULL)
+#define KVM_REG_MIPS_FCR       (KVM_REG_MIPS_FPU | 0x0000000000000100ULL)
+#define KVM_REG_MIPS_MSACR     (KVM_REG_MIPS_FPU | 0x0000000000000200ULL)
+
+/*
+ * KVM_REG_MIPS_FPR - Floating point / Vector registers.
+ */
+#define KVM_REG_MIPS_FPR_32(n) (KVM_REG_MIPS_FPR | KVM_REG_SIZE_U32  | (n))
+#define KVM_REG_MIPS_FPR_64(n) (KVM_REG_MIPS_FPR | KVM_REG_SIZE_U64  | (n))
+#define KVM_REG_MIPS_VEC_128(n)        (KVM_REG_MIPS_FPR | KVM_REG_SIZE_U128 | (n))
+
+/*
+ * KVM_REG_MIPS_FCR - Floating point control registers.
+ */
+#define KVM_REG_MIPS_FCR_IR    (KVM_REG_MIPS_FCR | KVM_REG_SIZE_U32 |  0)
+#define KVM_REG_MIPS_FCR_CSR   (KVM_REG_MIPS_FCR | KVM_REG_SIZE_U32 | 31)
+
+/*
+ * KVM_REG_MIPS_MSACR - MIPS SIMD Architecture (MSA) control registers.
+ */
+#define KVM_REG_MIPS_MSA_IR     (KVM_REG_MIPS_MSACR | KVM_REG_SIZE_U32 |  0)
+#define KVM_REG_MIPS_MSA_CSR    (KVM_REG_MIPS_MSACR | KVM_REG_SIZE_U32 |  1)
+
 
 /*
  * KVM MIPS specific structures and definitions
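
Since every register set now hangs off the same id encoding, userland reads and writes them through the generic KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. A hedged userspace sketch reading the guest FCSR via the new FPU set; the vcpu_fd parameter and error handling are illustrative, and the example assumes the header above is the one installed on the build system:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read the guest's FP control/status register (FCSR) from a vcpu fd. */
static int read_guest_fcsr(int vcpu_fd, uint32_t *fcsr)
{
	uint32_t val = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_MIPS_FCR_CSR,	/* FPU set, FCR subset, index 31 */
		.addr = (uintptr_t)&val,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
		return -1;

	*fcsr = val;
	return 0;
}
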
index 750d67ac41e9b19affe066d5be8d1f56f7363041..e59fd7cfac9e35b2eeb90912c58504b9ed435606 100644 (file)
@@ -167,72 +167,6 @@ void output_thread_fpu_defines(void)
        OFFSET(THREAD_FPR30, task_struct, thread.fpu.fpr[30]);
        OFFSET(THREAD_FPR31, task_struct, thread.fpu.fpr[31]);
 
-       /* the least significant 64 bits of each FP register */
-       OFFSET(THREAD_FPR0_LS64, task_struct,
-              thread.fpu.fpr[0].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR1_LS64, task_struct,
-              thread.fpu.fpr[1].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR2_LS64, task_struct,
-              thread.fpu.fpr[2].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR3_LS64, task_struct,
-              thread.fpu.fpr[3].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR4_LS64, task_struct,
-              thread.fpu.fpr[4].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR5_LS64, task_struct,
-              thread.fpu.fpr[5].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR6_LS64, task_struct,
-              thread.fpu.fpr[6].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR7_LS64, task_struct,
-              thread.fpu.fpr[7].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR8_LS64, task_struct,
-              thread.fpu.fpr[8].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR9_LS64, task_struct,
-              thread.fpu.fpr[9].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR10_LS64, task_struct,
-              thread.fpu.fpr[10].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR11_LS64, task_struct,
-              thread.fpu.fpr[11].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR12_LS64, task_struct,
-              thread.fpu.fpr[12].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR13_LS64, task_struct,
-              thread.fpu.fpr[13].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR14_LS64, task_struct,
-              thread.fpu.fpr[14].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR15_LS64, task_struct,
-              thread.fpu.fpr[15].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR16_LS64, task_struct,
-              thread.fpu.fpr[16].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR17_LS64, task_struct,
-              thread.fpu.fpr[17].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR18_LS64, task_struct,
-              thread.fpu.fpr[18].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR19_LS64, task_struct,
-              thread.fpu.fpr[19].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR20_LS64, task_struct,
-              thread.fpu.fpr[20].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR21_LS64, task_struct,
-              thread.fpu.fpr[21].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR22_LS64, task_struct,
-              thread.fpu.fpr[22].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR23_LS64, task_struct,
-              thread.fpu.fpr[23].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR24_LS64, task_struct,
-              thread.fpu.fpr[24].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR25_LS64, task_struct,
-              thread.fpu.fpr[25].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR26_LS64, task_struct,
-              thread.fpu.fpr[26].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR27_LS64, task_struct,
-              thread.fpu.fpr[27].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR28_LS64, task_struct,
-              thread.fpu.fpr[28].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR29_LS64, task_struct,
-              thread.fpu.fpr[29].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR30_LS64, task_struct,
-              thread.fpu.fpr[30].val64[FPR_IDX(64, 0)]);
-       OFFSET(THREAD_FPR31_LS64, task_struct,
-              thread.fpu.fpr[31].val64[FPR_IDX(64, 0)]);
-
        OFFSET(THREAD_FCR31, task_struct, thread.fpu.fcr31);
        OFFSET(THREAD_MSA_CSR, task_struct, thread.fpu.msacsr);
        BLANK();
@@ -470,6 +404,45 @@ void output_kvm_defines(void)
        OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
        OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
        OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
+       BLANK();
+
+       OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
+       OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
+       OFFSET(VCPU_FPR2, kvm_vcpu_arch, fpu.fpr[2]);
+       OFFSET(VCPU_FPR3, kvm_vcpu_arch, fpu.fpr[3]);
+       OFFSET(VCPU_FPR4, kvm_vcpu_arch, fpu.fpr[4]);
+       OFFSET(VCPU_FPR5, kvm_vcpu_arch, fpu.fpr[5]);
+       OFFSET(VCPU_FPR6, kvm_vcpu_arch, fpu.fpr[6]);
+       OFFSET(VCPU_FPR7, kvm_vcpu_arch, fpu.fpr[7]);
+       OFFSET(VCPU_FPR8, kvm_vcpu_arch, fpu.fpr[8]);
+       OFFSET(VCPU_FPR9, kvm_vcpu_arch, fpu.fpr[9]);
+       OFFSET(VCPU_FPR10, kvm_vcpu_arch, fpu.fpr[10]);
+       OFFSET(VCPU_FPR11, kvm_vcpu_arch, fpu.fpr[11]);
+       OFFSET(VCPU_FPR12, kvm_vcpu_arch, fpu.fpr[12]);
+       OFFSET(VCPU_FPR13, kvm_vcpu_arch, fpu.fpr[13]);
+       OFFSET(VCPU_FPR14, kvm_vcpu_arch, fpu.fpr[14]);
+       OFFSET(VCPU_FPR15, kvm_vcpu_arch, fpu.fpr[15]);
+       OFFSET(VCPU_FPR16, kvm_vcpu_arch, fpu.fpr[16]);
+       OFFSET(VCPU_FPR17, kvm_vcpu_arch, fpu.fpr[17]);
+       OFFSET(VCPU_FPR18, kvm_vcpu_arch, fpu.fpr[18]);
+       OFFSET(VCPU_FPR19, kvm_vcpu_arch, fpu.fpr[19]);
+       OFFSET(VCPU_FPR20, kvm_vcpu_arch, fpu.fpr[20]);
+       OFFSET(VCPU_FPR21, kvm_vcpu_arch, fpu.fpr[21]);
+       OFFSET(VCPU_FPR22, kvm_vcpu_arch, fpu.fpr[22]);
+       OFFSET(VCPU_FPR23, kvm_vcpu_arch, fpu.fpr[23]);
+       OFFSET(VCPU_FPR24, kvm_vcpu_arch, fpu.fpr[24]);
+       OFFSET(VCPU_FPR25, kvm_vcpu_arch, fpu.fpr[25]);
+       OFFSET(VCPU_FPR26, kvm_vcpu_arch, fpu.fpr[26]);
+       OFFSET(VCPU_FPR27, kvm_vcpu_arch, fpu.fpr[27]);
+       OFFSET(VCPU_FPR28, kvm_vcpu_arch, fpu.fpr[28]);
+       OFFSET(VCPU_FPR29, kvm_vcpu_arch, fpu.fpr[29]);
+       OFFSET(VCPU_FPR30, kvm_vcpu_arch, fpu.fpr[30]);
+       OFFSET(VCPU_FPR31, kvm_vcpu_arch, fpu.fpr[31]);
+
+       OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31);
+       OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr);
+       BLANK();
+
        OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
        OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
        OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
index 2ebaabe3af1513269e100d8bcffa9e8e9cb1f2c8..af42e7003f12d025cd31e2a5d167f2f4b158d37a 100644 (file)
@@ -360,12 +360,15 @@ NESTED(nmi_handler, PT_SIZE, sp)
        .set    mips1
        SET_HARDFLOAT
        cfc1    a1, fcr31
-       li      a2, ~(0x3f << 12)
-       and     a2, a1
-       ctc1    a2, fcr31
        .set    pop
-       TRACE_IRQS_ON
-       STI
+       CLI
+       TRACE_IRQS_OFF
+       .endm
+
+       .macro  __build_clear_msa_fpe
+       _cfcmsa a1, MSA_CSR
+       CLI
+       TRACE_IRQS_OFF
        .endm
 
        .macro  __build_clear_ade
@@ -426,7 +429,7 @@ NESTED(nmi_handler, PT_SIZE, sp)
        BUILD_HANDLER cpu cpu sti silent                /* #11 */
        BUILD_HANDLER ov ov sti silent                  /* #12 */
        BUILD_HANDLER tr tr sti silent                  /* #13 */
-       BUILD_HANDLER msa_fpe msa_fpe sti silent        /* #14 */
+       BUILD_HANDLER msa_fpe msa_fpe msa_fpe silent    /* #14 */
        BUILD_HANDLER fpe fpe fpe silent                /* #15 */
        BUILD_HANDLER ftlb ftlb none silent             /* #16 */
        BUILD_HANDLER msa msa sti silent                /* #21 */
index 51045281259403c55fcefac09d510f874a3047bb..7da6e324dd354a77991c4fe2dd998c07b998d956 100644 (file)
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
+static void init_fp_ctx(struct task_struct *target)
+{
+       /* If FP has been used then the target already has context */
+       if (tsk_used_math(target))
+               return;
+
+       /* Begin with data registers set to all 1s... */
+       memset(&target->thread.fpu.fpr, ~0, sizeof(target->thread.fpu.fpr));
+
+       /* ...and FCSR zeroed */
+       target->thread.fpu.fcr31 = 0;
+
+       /*
+        * Record that the target has "used" math, such that the context
+        * just initialised, and any modifications made by the caller,
+        * aren't discarded.
+        */
+       set_stopped_child_used_math(target);
+}
+
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -142,6 +162,7 @@ int ptrace_setfpregs(struct task_struct *child, __u32 __user *data)
        if (!access_ok(VERIFY_READ, data, 33 * 8))
                return -EIO;
 
+       init_fp_ctx(child);
        fregs = get_fpu_regs(child);
 
        for (i = 0; i < 32; i++) {
@@ -439,6 +460,8 @@ static int fpr_set(struct task_struct *target,
 
        /* XXX fcr31  */
 
+       init_fp_ctx(target);
+
        if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
                return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                          &target->thread.fpu,
@@ -660,12 +683,7 @@ long arch_ptrace(struct task_struct *child, long request,
                case FPR_BASE ... FPR_BASE + 31: {
                        union fpureg *fregs = get_fpu_regs(child);
 
-                       if (!tsk_used_math(child)) {
-                               /* FP not yet used  */
-                               memset(&child->thread.fpu, ~0,
-                                      sizeof(child->thread.fpu));
-                               child->thread.fpu.fcr31 = 0;
-                       }
+                       init_fp_ctx(child);
 #ifdef CONFIG_32BIT
                        if (test_thread_flag(TIF_32BIT_FPREGS)) {
                                /*
index 676c5030a953bf9cca5ad038a7526d3b94ce372d..1d88af26ba82a0c3ee58ff8ff3b2b1661dad9455 100644 (file)
@@ -34,7 +34,6 @@
        .endm
 
        .set    noreorder
-       .set    MIPS_ISA_ARCH_LEVEL_RAW
 
 LEAF(_save_fp_context)
        .set    push
@@ -103,6 +102,7 @@ LEAF(_save_fp_context)
        /* Save 32-bit process floating point context */
 LEAF(_save_fp_context32)
        .set push
+       .set MIPS_ISA_ARCH_LEVEL_RAW
        SET_HARDFLOAT
        cfc1    t1, fcr31
 
index 33984c04b60b710516f1b0bfb88aa52aaa04629f..5b4d711f878da251a101526671a80283503736a4 100644 (file)
@@ -701,6 +701,13 @@ asmlinkage void do_ov(struct pt_regs *regs)
 
 int process_fpemu_return(int sig, void __user *fault_addr)
 {
+       /*
+        * We can't allow the emulated instruction to leave any of the cause
+        * bits set in FCSR. If they were then the kernel would take an FP
+        * exception when restoring FP context.
+        */
+       current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;
+
        if (sig == SIGSEGV || sig == SIGBUS) {
                struct siginfo si = {0};
                si.si_addr = fault_addr;
@@ -781,6 +788,11 @@ asmlinkage void do_fpe(struct pt_regs *regs, unsigned long fcr31)
        if (notify_die(DIE_FP, "FP exception", regs, 0, regs_to_trapnr(regs),
                       SIGFPE) == NOTIFY_STOP)
                goto out;
+
+       /* Clear FCSR.Cause before enabling interrupts */
+       write_32bit_cp1_register(CP1_STATUS, fcr31 & ~FPU_CSR_ALL_X);
+       local_irq_enable();
+
        die_if_kernel("FP exception in kernel code", regs);
 
        if (fcr31 & FPU_CSR_UNI_X) {
@@ -804,18 +816,12 @@ asmlinkage void do_fpe(struct pt_regs *regs, unsigned long fcr31)
                sig = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 1,
                                               &fault_addr);
 
-               /*
-                * We can't allow the emulated instruction to leave any of
-                * the cause bit set in $fcr31.
-                */
-               current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;
+               /* If something went wrong, signal */
+               process_fpemu_return(sig, fault_addr);
 
                /* Restore the hardware register state */
                own_fpu(1);     /* Using the FPU again.  */
 
-               /* If something went wrong, signal */
-               process_fpemu_return(sig, fault_addr);
-
                goto out;
        } else if (fcr31 & FPU_CSR_INV_X)
                info.si_code = FPE_FLTINV;
@@ -1392,13 +1398,22 @@ out:
        exception_exit(prev_state);
 }
 
-asmlinkage void do_msa_fpe(struct pt_regs *regs)
+asmlinkage void do_msa_fpe(struct pt_regs *regs, unsigned int msacsr)
 {
        enum ctx_state prev_state;
 
        prev_state = exception_enter();
+       if (notify_die(DIE_MSAFP, "MSA FP exception", regs, 0,
+                      regs_to_trapnr(regs), SIGFPE) == NOTIFY_STOP)
+               goto out;
+
+       /* Clear MSACSR.Cause before enabling interrupts */
+       write_msa_csr(msacsr & ~MSA_CSR_CAUSEF);
+       local_irq_enable();
+
        die_if_kernel("do_msa_fpe invoked from kernel context!", regs);
        force_sig(SIGFPE, current);
+out:
        exception_exit(prev_state);
 }
 
index 401fe027c2612cf774fa53173420f93c072ea3c7..637ebbebd549701c1a0a67e7e3a8fbb9d2cb61f0 100644 (file)
@@ -1,13 +1,15 @@
 # Makefile for KVM support for MIPS
 #
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
 
-kvm-objs := $(common-objs) mips.o emulate.o locore.o \
+common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
+
+kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
            interrupt.o stats.o commpage.o \
-           dyntrans.o trap_emul.o
+           dyntrans.o trap_emul.o fpu.o
 
 obj-$(CONFIG_KVM)      += kvm.o
 obj-y                  += callback.o tlb.o
index fb3e8dfd1ff647263c0ca93a1d2ba6c5fdbd906b..6230f376a44e7ab6f09041c4b805e54e59468489 100644 (file)
@@ -884,6 +884,84 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+/**
+ * kvm_mips_config1_wrmask() - Find mask of writable bits in guest Config1
+ * @vcpu:      Virtual CPU.
+ *
+ * Finds the mask of bits which are writable in the guest's Config1 CP0
+ * register, by userland (currently read-only to the guest).
+ */
+unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu)
+{
+       unsigned int mask = 0;
+
+       /* Permit FPU to be present if FPU is supported */
+       if (kvm_mips_guest_can_have_fpu(&vcpu->arch))
+               mask |= MIPS_CONF1_FP;
+
+       return mask;
+}
+
+/**
+ * kvm_mips_config3_wrmask() - Find mask of writable bits in guest Config3
+ * @vcpu:      Virtual CPU.
+ *
+ * Finds the mask of bits which are writable in the guest's Config3 CP0
+ * register, by userland (currently read-only to the guest).
+ */
+unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
+{
+       /* Config4 is optional */
+       unsigned int mask = MIPS_CONF_M;
+
+       /* Permit MSA to be present if MSA is supported */
+       if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+               mask |= MIPS_CONF3_MSA;
+
+       return mask;
+}
+
+/**
+ * kvm_mips_config4_wrmask() - Find mask of writable bits in guest Config4
+ * @vcpu:      Virtual CPU.
+ *
+ * Finds the mask of bits which are writable in the guest's Config4 CP0
+ * register, by userland (currently read-only to the guest).
+ */
+unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
+{
+       /* Config5 is optional */
+       return MIPS_CONF_M;
+}
+
+/**
+ * kvm_mips_config5_wrmask() - Find mask of writable bits in guest Config5
+ * @vcpu:      Virtual CPU.
+ *
+ * Finds the mask of bits which are writable in the guest's Config5 CP0
+ * register, by the guest itself.
+ */
+unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
+{
+       unsigned int mask = 0;
+
+       /* Permit MSAEn changes if MSA supported and enabled */
+       if (kvm_mips_guest_has_msa(&vcpu->arch))
+               mask |= MIPS_CONF5_MSAEN;
+
+       /*
+        * Permit guest FPU mode changes if FPU is enabled and the relevant
+        * feature exists according to FIR register.
+        */
+       if (kvm_mips_guest_has_fpu(&vcpu->arch)) {
+               if (cpu_has_fre)
+                       mask |= MIPS_CONF5_FRE;
+               /* We don't support UFR or UFE */
+       }
+
+       return mask;
+}
+
 enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                           uint32_t cause, struct kvm_run *run,
                                           struct kvm_vcpu *vcpu)
@@ -1021,18 +1099,114 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
                                kvm_mips_write_compare(vcpu,
                                                       vcpu->arch.gprs[rt]);
                        } else if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
-                               kvm_write_c0_guest_status(cop0,
-                                                         vcpu->arch.gprs[rt]);
+                               unsigned int old_val, val, change;
+
+                               old_val = kvm_read_c0_guest_status(cop0);
+                               val = vcpu->arch.gprs[rt];
+                               change = val ^ old_val;
+
+                               /* Make sure that the NMI bit is never set */
+                               val &= ~ST0_NMI;
+
+                               /*
+                                * Don't allow CU1 or FR to be set unless the
+                                * FPU capability is enabled and exists in the
+                                * guest configuration.
+                                */
+                               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                                       val &= ~(ST0_CU1 | ST0_FR);
+
+                               /*
+                                * Also don't allow FR to be set if host doesn't
+                                * support it.
+                                */
+                               if (!(current_cpu_data.fpu_id & MIPS_FPIR_F64))
+                                       val &= ~ST0_FR;
+
+
+                               /* Handle changes in FPU mode */
+                               preempt_disable();
+
+                               /*
+                                * FPU and Vector register state is made
+                                * UNPREDICTABLE by a change of FR, so don't
+                                * even bother saving it.
+                                */
+                               if (change & ST0_FR)
+                                       kvm_drop_fpu(vcpu);
+
+                               /*
+                                * If MSA state is already live, it is undefined
+                                * how it interacts with FR=0 FPU state, and we
+                                * don't want to hit reserved instruction
+                                * exceptions trying to save the MSA state later
+                                * when CU=1 && FR=1, so play it safe and save
+                                * it first.
+                                */
+                               if (change & ST0_CU1 && !(val & ST0_FR) &&
+                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+                                       kvm_lose_fpu(vcpu);
+
                                /*
-                                * Make sure that CU1 and NMI bits are
-                                * never set
+                                * Propagate CU1 (FPU enable) changes
+                                * immediately if the FPU context is already
+                                * loaded. When disabling we leave the context
+                                * loaded so it can be quickly enabled again in
+                                * the near future.
                                 */
-                               kvm_clear_c0_guest_status(cop0,
-                                                         (ST0_CU1 | ST0_NMI));
+                               if (change & ST0_CU1 &&
+                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                                       change_c0_status(ST0_CU1, val);
+
+                               preempt_enable();
+
+                               kvm_write_c0_guest_status(cop0, val);
 
 #ifdef CONFIG_KVM_MIPS_DYN_TRANS
-                               kvm_mips_trans_mtc0(inst, opc, vcpu);
+                               /*
+                                * If FPU present, we need CU1/FR bits to take
+                                * effect fairly soon.
+                                */
+                               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                                       kvm_mips_trans_mtc0(inst, opc, vcpu);
 #endif
+                       } else if ((rd == MIPS_CP0_CONFIG) && (sel == 5)) {
+                               unsigned int old_val, val, change, wrmask;
+
+                               old_val = kvm_read_c0_guest_config5(cop0);
+                               val = vcpu->arch.gprs[rt];
+
+                               /* Only a few bits are writable in Config5 */
+                               wrmask = kvm_mips_config5_wrmask(vcpu);
+                               change = (val ^ old_val) & wrmask;
+                               val = old_val ^ change;
+
+
+                               /* Handle changes in FPU/MSA modes */
+                               preempt_disable();
+
+                               /*
+                                * Propagate FRE changes immediately if the FPU
+                                * context is already loaded.
+                                */
+                               if (change & MIPS_CONF5_FRE &&
+                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                                       change_c0_config5(MIPS_CONF5_FRE, val);
+
+                               /*
+                                * Propagate MSAEn changes immediately if the
+                                * MSA context is already loaded. When disabling
+                                * we leave the context loaded so it can be
+                                * quickly enabled again in the near future.
+                                */
+                               if (change & MIPS_CONF5_MSAEN &&
+                                   vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+                                       change_c0_config5(MIPS_CONF5_MSAEN,
+                                                         val);
+
+                               preempt_enable();
+
+                               kvm_write_c0_guest_config5(cop0, val);
                        } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
                                uint32_t old_cause, new_cause;
 
@@ -1970,6 +2144,146 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
        return er;
 }
 
+enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
+                                               uint32_t *opc,
+                                               struct kvm_run *run,
+                                               struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       enum emulation_result er = EMULATE_DONE;
+
+       if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
+               /* save old pc */
+               kvm_write_c0_guest_epc(cop0, arch->pc);
+               kvm_set_c0_guest_status(cop0, ST0_EXL);
+
+               if (cause & CAUSEF_BD)
+                       kvm_set_c0_guest_cause(cop0, CAUSEF_BD);
+               else
+                       kvm_clear_c0_guest_cause(cop0, CAUSEF_BD);
+
+               kvm_debug("Delivering TRAP @ pc %#lx\n", arch->pc);
+
+               kvm_change_c0_guest_cause(cop0, (0xff),
+                                         (T_TRAP << CAUSEB_EXCCODE));
+
+               /* Set PC to the exception entry point */
+               arch->pc = KVM_GUEST_KSEG0 + 0x180;
+
+       } else {
+               kvm_err("Trying to deliver TRAP when EXL is already set\n");
+               er = EMULATE_FAIL;
+       }
+
+       return er;
+}
+
+enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
+                                                 uint32_t *opc,
+                                                 struct kvm_run *run,
+                                                 struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       enum emulation_result er = EMULATE_DONE;
+
+       if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
+               /* save old pc */
+               kvm_write_c0_guest_epc(cop0, arch->pc);
+               kvm_set_c0_guest_status(cop0, ST0_EXL);
+
+               if (cause & CAUSEF_BD)
+                       kvm_set_c0_guest_cause(cop0, CAUSEF_BD);
+               else
+                       kvm_clear_c0_guest_cause(cop0, CAUSEF_BD);
+
+               kvm_debug("Delivering MSAFPE @ pc %#lx\n", arch->pc);
+
+               kvm_change_c0_guest_cause(cop0, (0xff),
+                                         (T_MSAFPE << CAUSEB_EXCCODE));
+
+               /* Set PC to the exception entry point */
+               arch->pc = KVM_GUEST_KSEG0 + 0x180;
+
+       } else {
+               kvm_err("Trying to deliver MSAFPE when EXL is already set\n");
+               er = EMULATE_FAIL;
+       }
+
+       return er;
+}
+
+enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
+                                              uint32_t *opc,
+                                              struct kvm_run *run,
+                                              struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       enum emulation_result er = EMULATE_DONE;
+
+       if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
+               /* save old pc */
+               kvm_write_c0_guest_epc(cop0, arch->pc);
+               kvm_set_c0_guest_status(cop0, ST0_EXL);
+
+               if (cause & CAUSEF_BD)
+                       kvm_set_c0_guest_cause(cop0, CAUSEF_BD);
+               else
+                       kvm_clear_c0_guest_cause(cop0, CAUSEF_BD);
+
+               kvm_debug("Delivering FPE @ pc %#lx\n", arch->pc);
+
+               kvm_change_c0_guest_cause(cop0, (0xff),
+                                         (T_FPE << CAUSEB_EXCCODE));
+
+               /* Set PC to the exception entry point */
+               arch->pc = KVM_GUEST_KSEG0 + 0x180;
+
+       } else {
+               kvm_err("Trying to deliver FPE when EXL is already set\n");
+               er = EMULATE_FAIL;
+       }
+
+       return er;
+}
+
+enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
+                                                 uint32_t *opc,
+                                                 struct kvm_run *run,
+                                                 struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       enum emulation_result er = EMULATE_DONE;
+
+       if ((kvm_read_c0_guest_status(cop0) & ST0_EXL) == 0) {
+               /* save old pc */
+               kvm_write_c0_guest_epc(cop0, arch->pc);
+               kvm_set_c0_guest_status(cop0, ST0_EXL);
+
+               if (cause & CAUSEF_BD)
+                       kvm_set_c0_guest_cause(cop0, CAUSEF_BD);
+               else
+                       kvm_clear_c0_guest_cause(cop0, CAUSEF_BD);
+
+               kvm_debug("Delivering MSADIS @ pc %#lx\n", arch->pc);
+
+               kvm_change_c0_guest_cause(cop0, (0xff),
+                                         (T_MSADIS << CAUSEB_EXCCODE));
+
+               /* Set PC to the exception entry point */
+               arch->pc = KVM_GUEST_KSEG0 + 0x180;
+
+       } else {
+               kvm_err("Trying to deliver MSADIS when EXL is already set\n");
+               er = EMULATE_FAIL;
+       }
+
+       return er;
+}
+
 /* ll/sc, rdhwr, sync emulation */
 
 #define OPCODE 0xfc000000
@@ -2176,6 +2490,10 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
                case T_SYSCALL:
                case T_BREAK:
                case T_RES_INST:
+               case T_TRAP:
+               case T_MSAFPE:
+               case T_FPE:
+               case T_MSADIS:
                        break;
 
                case T_COP_UNUSABLE:
diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S
new file mode 100644 (file)
index 0000000..531fbf5
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * FPU context handling code for KVM.
+ *
+ * Copyright (C) 2015 Imagination Technologies Ltd.
+ */
+
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/fpregdef.h>
+#include <asm/mipsregs.h>
+#include <asm/regdef.h>
+
+       .set    noreorder
+       .set    noat
+
+LEAF(__kvm_save_fpu)
+       .set    push
+       .set    mips64r2
+       SET_HARDFLOAT
+       mfc0    t0, CP0_STATUS
+       sll     t0, t0, 5                       # is Status.FR set?
+       bgez    t0, 1f                          # no: skip odd doubles
+        nop
+       sdc1    $f1,  VCPU_FPR1(a0)
+       sdc1    $f3,  VCPU_FPR3(a0)
+       sdc1    $f5,  VCPU_FPR5(a0)
+       sdc1    $f7,  VCPU_FPR7(a0)
+       sdc1    $f9,  VCPU_FPR9(a0)
+       sdc1    $f11, VCPU_FPR11(a0)
+       sdc1    $f13, VCPU_FPR13(a0)
+       sdc1    $f15, VCPU_FPR15(a0)
+       sdc1    $f17, VCPU_FPR17(a0)
+       sdc1    $f19, VCPU_FPR19(a0)
+       sdc1    $f21, VCPU_FPR21(a0)
+       sdc1    $f23, VCPU_FPR23(a0)
+       sdc1    $f25, VCPU_FPR25(a0)
+       sdc1    $f27, VCPU_FPR27(a0)
+       sdc1    $f29, VCPU_FPR29(a0)
+       sdc1    $f31, VCPU_FPR31(a0)
+1:     sdc1    $f0,  VCPU_FPR0(a0)
+       sdc1    $f2,  VCPU_FPR2(a0)
+       sdc1    $f4,  VCPU_FPR4(a0)
+       sdc1    $f6,  VCPU_FPR6(a0)
+       sdc1    $f8,  VCPU_FPR8(a0)
+       sdc1    $f10, VCPU_FPR10(a0)
+       sdc1    $f12, VCPU_FPR12(a0)
+       sdc1    $f14, VCPU_FPR14(a0)
+       sdc1    $f16, VCPU_FPR16(a0)
+       sdc1    $f18, VCPU_FPR18(a0)
+       sdc1    $f20, VCPU_FPR20(a0)
+       sdc1    $f22, VCPU_FPR22(a0)
+       sdc1    $f24, VCPU_FPR24(a0)
+       sdc1    $f26, VCPU_FPR26(a0)
+       sdc1    $f28, VCPU_FPR28(a0)
+       jr      ra
+        sdc1   $f30, VCPU_FPR30(a0)
+       .set    pop
+       END(__kvm_save_fpu)
+
+LEAF(__kvm_restore_fpu)
+       .set    push
+       .set    mips64r2
+       SET_HARDFLOAT
+       mfc0    t0, CP0_STATUS
+       sll     t0, t0, 5                       # is Status.FR set?
+       bgez    t0, 1f                          # no: skip odd doubles
+        nop
+       ldc1    $f1,  VCPU_FPR1(a0)
+       ldc1    $f3,  VCPU_FPR3(a0)
+       ldc1    $f5,  VCPU_FPR5(a0)
+       ldc1    $f7,  VCPU_FPR7(a0)
+       ldc1    $f9,  VCPU_FPR9(a0)
+       ldc1    $f11, VCPU_FPR11(a0)
+       ldc1    $f13, VCPU_FPR13(a0)
+       ldc1    $f15, VCPU_FPR15(a0)
+       ldc1    $f17, VCPU_FPR17(a0)
+       ldc1    $f19, VCPU_FPR19(a0)
+       ldc1    $f21, VCPU_FPR21(a0)
+       ldc1    $f23, VCPU_FPR23(a0)
+       ldc1    $f25, VCPU_FPR25(a0)
+       ldc1    $f27, VCPU_FPR27(a0)
+       ldc1    $f29, VCPU_FPR29(a0)
+       ldc1    $f31, VCPU_FPR31(a0)
+1:     ldc1    $f0,  VCPU_FPR0(a0)
+       ldc1    $f2,  VCPU_FPR2(a0)
+       ldc1    $f4,  VCPU_FPR4(a0)
+       ldc1    $f6,  VCPU_FPR6(a0)
+       ldc1    $f8,  VCPU_FPR8(a0)
+       ldc1    $f10, VCPU_FPR10(a0)
+       ldc1    $f12, VCPU_FPR12(a0)
+       ldc1    $f14, VCPU_FPR14(a0)
+       ldc1    $f16, VCPU_FPR16(a0)
+       ldc1    $f18, VCPU_FPR18(a0)
+       ldc1    $f20, VCPU_FPR20(a0)
+       ldc1    $f22, VCPU_FPR22(a0)
+       ldc1    $f24, VCPU_FPR24(a0)
+       ldc1    $f26, VCPU_FPR26(a0)
+       ldc1    $f28, VCPU_FPR28(a0)
+       jr      ra
+        ldc1   $f30, VCPU_FPR30(a0)
+       .set    pop
+       END(__kvm_restore_fpu)
+
+LEAF(__kvm_restore_fcsr)
+       .set    push
+       SET_HARDFLOAT
+       lw      t0, VCPU_FCR31(a0)
+       /*
+        * The ctc1 must stay at this offset in __kvm_restore_fcsr.
+        * See kvm_mips_csr_die_notify() which handles t0 containing a value
+        * which triggers an FP Exception, which must be stepped over and
+        * ignored since the set cause bits must remain there for the guest.
+        */
+       ctc1    t0, fcr31
+       jr      ra
+        nop
+       .set    pop
+       END(__kvm_restore_fcsr)
index 4a68b176d6e4f8dff9680a2c22a41019be1d574f..c567240386a0f10818b0e06433f4b33b1bf5e6dc 100644 (file)
@@ -36,6 +36,8 @@
 #define PT_HOST_USERLOCAL   PT_EPC
 
 #define CP0_DDATA_LO        $28,3
+#define CP0_CONFIG3         $16,3
+#define CP0_CONFIG5         $16,5
 #define CP0_EBASE           $15,1
 
 #define CP0_INTCTL          $12,1
@@ -353,6 +355,42 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
        LONG_L  k0, VCPU_HOST_EBASE(k1)
        mtc0    k0,CP0_EBASE
 
+       /*
+        * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
+        * trigger FPE for pending exceptions.
+        */
+       .set    at
+       and     v1, v0, ST0_CU1
+       beqz    v1, 1f
+        nop
+       .set    push
+       SET_HARDFLOAT
+       cfc1    t0, fcr31
+       sw      t0, VCPU_FCR31(k1)
+       ctc1    zero,fcr31
+       .set    pop
+       .set    noat
+1:
+
+#ifdef CONFIG_CPU_HAS_MSA
+       /*
+        * If MSA is enabled, save MSACSR and clear it so that later
+        * instructions don't trigger MSAFPE for pending exceptions.
+        */
+       mfc0    t0, CP0_CONFIG3
+       ext     t0, t0, 28, 1 /* MIPS_CONF3_MSAP */
+       beqz    t0, 1f
+        nop
+       mfc0    t0, CP0_CONFIG5
+       ext     t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */
+       beqz    t0, 1f
+        nop
+       _cfcmsa t0, MSA_CSR
+       sw      t0, VCPU_MSA_CSR(k1)
+       _ctcmsa MSA_CSR, zero
+1:
+#endif
+
        /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
        .set    at
        and     v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
index c9eccf5df912037e2b71bbb4a7dddd2a1d2d866e..bb68e8d520e83b5a30b74b22ae3292b1dd1469e1 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
@@ -48,6 +49,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "syscall",      VCPU_STAT(syscall_exits),      KVM_STAT_VCPU },
        { "resvd_inst",   VCPU_STAT(resvd_inst_exits),   KVM_STAT_VCPU },
        { "break_inst",   VCPU_STAT(break_inst_exits),   KVM_STAT_VCPU },
+       { "trap_inst",    VCPU_STAT(trap_inst_exits),    KVM_STAT_VCPU },
+       { "msa_fpe",      VCPU_STAT(msa_fpe_exits),      KVM_STAT_VCPU },
+       { "fpe",          VCPU_STAT(fpe_exits),          KVM_STAT_VCPU },
+       { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU },
        { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
        { "halt_wakeup",  VCPU_STAT(halt_wakeup),        KVM_STAT_VCPU },
@@ -504,10 +509,13 @@ static u64 kvm_mips_get_one_regs[] = {
        KVM_REG_MIPS_CP0_STATUS,
        KVM_REG_MIPS_CP0_CAUSE,
        KVM_REG_MIPS_CP0_EPC,
+       KVM_REG_MIPS_CP0_PRID,
        KVM_REG_MIPS_CP0_CONFIG,
        KVM_REG_MIPS_CP0_CONFIG1,
        KVM_REG_MIPS_CP0_CONFIG2,
        KVM_REG_MIPS_CP0_CONFIG3,
+       KVM_REG_MIPS_CP0_CONFIG4,
+       KVM_REG_MIPS_CP0_CONFIG5,
        KVM_REG_MIPS_CP0_CONFIG7,
        KVM_REG_MIPS_CP0_ERROREPC,
 
@@ -520,10 +528,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
                            const struct kvm_one_reg *reg)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct mips_fpu_struct *fpu = &vcpu->arch.fpu;
        int ret;
        s64 v;
+       s64 vs[2];
+       unsigned int idx;
 
        switch (reg->id) {
+       /* General purpose registers */
        case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31:
                v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0];
                break;
@@ -537,6 +549,67 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
                v = (long)vcpu->arch.pc;
                break;
 
+       /* Floating point registers */
+       case KVM_REG_MIPS_FPR_32(0) ... KVM_REG_MIPS_FPR_32(31):
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_FPR_32(0);
+               /* Odd singles in top of even double when FR=0 */
+               if (kvm_read_c0_guest_status(cop0) & ST0_FR)
+                       v = get_fpr32(&fpu->fpr[idx], 0);
+               else
+                       v = get_fpr32(&fpu->fpr[idx & ~1], idx & 1);
+               break;
+       case KVM_REG_MIPS_FPR_64(0) ... KVM_REG_MIPS_FPR_64(31):
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_FPR_64(0);
+               /* Can't access odd doubles in FR=0 mode */
+               if (idx & 1 && !(kvm_read_c0_guest_status(cop0) & ST0_FR))
+                       return -EINVAL;
+               v = get_fpr64(&fpu->fpr[idx], 0);
+               break;
+       case KVM_REG_MIPS_FCR_IR:
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               v = boot_cpu_data.fpu_id;
+               break;
+       case KVM_REG_MIPS_FCR_CSR:
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               v = fpu->fcr31;
+               break;
+
+       /* MIPS SIMD Architecture (MSA) registers */
+       case KVM_REG_MIPS_VEC_128(0) ... KVM_REG_MIPS_VEC_128(31):
+               if (!kvm_mips_guest_has_msa(&vcpu->arch))
+                       return -EINVAL;
+               /* Can't access MSA registers in FR=0 mode */
+               if (!(kvm_read_c0_guest_status(cop0) & ST0_FR))
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_VEC_128(0);
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+               /* least significant byte first */
+               vs[0] = get_fpr64(&fpu->fpr[idx], 0);
+               vs[1] = get_fpr64(&fpu->fpr[idx], 1);
+#else
+               /* most significant byte first */
+               vs[0] = get_fpr64(&fpu->fpr[idx], 1);
+               vs[1] = get_fpr64(&fpu->fpr[idx], 0);
+#endif
+               break;
+       case KVM_REG_MIPS_MSA_IR:
+               if (!kvm_mips_guest_has_msa(&vcpu->arch))
+                       return -EINVAL;
+               v = boot_cpu_data.msa_id;
+               break;
+       case KVM_REG_MIPS_MSA_CSR:
+               if (!kvm_mips_guest_has_msa(&vcpu->arch))
+                       return -EINVAL;
+               v = fpu->msacsr;
+               break;
+
+       /* Co-processor 0 registers */
        case KVM_REG_MIPS_CP0_INDEX:
                v = (long)kvm_read_c0_guest_index(cop0);
                break;
@@ -573,8 +646,8 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_EPC:
                v = (long)kvm_read_c0_guest_epc(cop0);
                break;
-       case KVM_REG_MIPS_CP0_ERROREPC:
-               v = (long)kvm_read_c0_guest_errorepc(cop0);
+       case KVM_REG_MIPS_CP0_PRID:
+               v = (long)kvm_read_c0_guest_prid(cop0);
                break;
        case KVM_REG_MIPS_CP0_CONFIG:
                v = (long)kvm_read_c0_guest_config(cop0);
@@ -588,9 +661,18 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_CONFIG3:
                v = (long)kvm_read_c0_guest_config3(cop0);
                break;
+       case KVM_REG_MIPS_CP0_CONFIG4:
+               v = (long)kvm_read_c0_guest_config4(cop0);
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG5:
+               v = (long)kvm_read_c0_guest_config5(cop0);
+               break;
        case KVM_REG_MIPS_CP0_CONFIG7:
                v = (long)kvm_read_c0_guest_config7(cop0);
                break;
+       case KVM_REG_MIPS_CP0_ERROREPC:
+               v = (long)kvm_read_c0_guest_errorepc(cop0);
+               break;
        /* registers to be handled specially */
        case KVM_REG_MIPS_CP0_COUNT:
        case KVM_REG_MIPS_COUNT_CTL:
@@ -612,6 +694,10 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
                u32 v32 = (u32)v;
 
                return put_user(v32, uaddr32);
+       } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) {
+               void __user *uaddr = (void __user *)(long)reg->addr;
+
+               return copy_to_user(uaddr, vs, 16);
        } else {
                return -EINVAL;
        }
@@ -621,7 +707,10 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
                            const struct kvm_one_reg *reg)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       u64 v;
+       struct mips_fpu_struct *fpu = &vcpu->arch.fpu;
+       s64 v;
+       s64 vs[2];
+       unsigned int idx;
 
        if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) {
                u64 __user *uaddr64 = (u64 __user *)(long)reg->addr;
@@ -635,11 +724,16 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
                if (get_user(v32, uaddr32) != 0)
                        return -EFAULT;
                v = (s64)v32;
+       } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) {
+               void __user *uaddr = (void __user *)(long)reg->addr;
+
+               return copy_from_user(vs, uaddr, 16);
        } else {
                return -EINVAL;
        }
 
        switch (reg->id) {
+       /* General purpose registers */
        case KVM_REG_MIPS_R0:
                /* Silently ignore requests to set $0 */
                break;
@@ -656,6 +750,64 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
                vcpu->arch.pc = v;
                break;
 
+       /* Floating point registers */
+       case KVM_REG_MIPS_FPR_32(0) ... KVM_REG_MIPS_FPR_32(31):
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_FPR_32(0);
+               /* Odd singles in top of even double when FR=0 */
+               if (kvm_read_c0_guest_status(cop0) & ST0_FR)
+                       set_fpr32(&fpu->fpr[idx], 0, v);
+               else
+                       set_fpr32(&fpu->fpr[idx & ~1], idx & 1, v);
+               break;
+       case KVM_REG_MIPS_FPR_64(0) ... KVM_REG_MIPS_FPR_64(31):
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_FPR_64(0);
+               /* Can't access odd doubles in FR=0 mode */
+               if (idx & 1 && !(kvm_read_c0_guest_status(cop0) & ST0_FR))
+                       return -EINVAL;
+               set_fpr64(&fpu->fpr[idx], 0, v);
+               break;
+       case KVM_REG_MIPS_FCR_IR:
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               /* Read-only */
+               break;
+       case KVM_REG_MIPS_FCR_CSR:
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                       return -EINVAL;
+               fpu->fcr31 = v;
+               break;
+
+       /* MIPS SIMD Architecture (MSA) registers */
+       case KVM_REG_MIPS_VEC_128(0) ... KVM_REG_MIPS_VEC_128(31):
+               if (!kvm_mips_guest_has_msa(&vcpu->arch))
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_VEC_128(0);
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+               /* least significant byte first */
+               set_fpr64(&fpu->fpr[idx], 0, vs[0]);
+               set_fpr64(&fpu->fpr[idx], 1, vs[1]);
+#else
+               /* most significant byte first */
+               set_fpr64(&fpu->fpr[idx], 1, vs[0]);
+               set_fpr64(&fpu->fpr[idx], 0, vs[1]);
+#endif
+               break;
+       case KVM_REG_MIPS_MSA_IR:
+               if (!kvm_mips_guest_has_msa(&vcpu->arch))
+                       return -EINVAL;
+               /* Read-only */
+               break;
+       case KVM_REG_MIPS_MSA_CSR:
+               if (!kvm_mips_guest_has_msa(&vcpu->arch))
+                       return -EINVAL;
+               fpu->msacsr = v;
+               break;
+
+       /* Co-processor 0 registers */
        case KVM_REG_MIPS_CP0_INDEX:
                kvm_write_c0_guest_index(cop0, v);
                break;
@@ -686,6 +838,9 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_EPC:
                kvm_write_c0_guest_epc(cop0, v);
                break;
+       case KVM_REG_MIPS_CP0_PRID:
+               kvm_write_c0_guest_prid(cop0, v);
+               break;
        case KVM_REG_MIPS_CP0_ERROREPC:
                kvm_write_c0_guest_errorepc(cop0, v);
                break;
@@ -693,6 +848,12 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        case KVM_REG_MIPS_CP0_COUNT:
        case KVM_REG_MIPS_CP0_COMPARE:
        case KVM_REG_MIPS_CP0_CAUSE:
+       case KVM_REG_MIPS_CP0_CONFIG:
+       case KVM_REG_MIPS_CP0_CONFIG1:
+       case KVM_REG_MIPS_CP0_CONFIG2:
+       case KVM_REG_MIPS_CP0_CONFIG3:
+       case KVM_REG_MIPS_CP0_CONFIG4:
+       case KVM_REG_MIPS_CP0_CONFIG5:
        case KVM_REG_MIPS_COUNT_CTL:
        case KVM_REG_MIPS_COUNT_RESUME:
        case KVM_REG_MIPS_COUNT_HZ:
@@ -703,6 +864,33 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+                                    struct kvm_enable_cap *cap)
+{
+       int r = 0;
+
+       if (!kvm_vm_ioctl_check_extension(vcpu->kvm, cap->cap))
+               return -EINVAL;
+       if (cap->flags)
+               return -EINVAL;
+       if (cap->args[0])
+               return -EINVAL;
+
+       switch (cap->cap) {
+       case KVM_CAP_MIPS_FPU:
+               vcpu->arch.fpu_enabled = true;
+               break;
+       case KVM_CAP_MIPS_MSA:
+               vcpu->arch.msa_enabled = true;
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+
+       return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
                         unsigned long arg)
 {
@@ -760,6 +948,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
                        r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
                        break;
                }
+       case KVM_ENABLE_CAP: {
+               struct kvm_enable_cap cap;
+
+               r = -EFAULT;
+               if (copy_from_user(&cap, argp, sizeof(cap)))
+                       goto out;
+               r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+               break;
+       }
        default:
                r = -ENOIOCTLCMD;
        }
@@ -868,11 +1065,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
        switch (ext) {
        case KVM_CAP_ONE_REG:
+       case KVM_CAP_ENABLE_CAP:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
+       case KVM_CAP_MIPS_FPU:
+               r = !!cpu_has_fpu;
+               break;
+       case KVM_CAP_MIPS_MSA:
+               /*
+                * We don't support MSA vector partitioning yet:
+                * 1) It would require explicit support which can't be tested
+                *    yet due to lack of support in current hardware.
+                * 2) It extends the state that would need to be saved/restored
+                *    by e.g. QEMU for migration.
+                *
+                * When vector partitioning hardware becomes available, support
+                * could be added by requiring a flag when enabling
+                * KVM_CAP_MIPS_MSA capability to indicate that userland knows
+                * to save/restore the appropriate extra state.
+                */
+               r = cpu_has_msa && !(boot_cpu_data.msa_id & MSA_IR_WRPF);
+               break;
        default:
                r = 0;
                break;
@@ -1119,6 +1335,30 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                ret = kvm_mips_callbacks->handle_break(vcpu);
                break;
 
+       case T_TRAP:
+               ++vcpu->stat.trap_inst_exits;
+               trace_kvm_exit(vcpu, TRAP_INST_EXITS);
+               ret = kvm_mips_callbacks->handle_trap(vcpu);
+               break;
+
+       case T_MSAFPE:
+               ++vcpu->stat.msa_fpe_exits;
+               trace_kvm_exit(vcpu, MSA_FPE_EXITS);
+               ret = kvm_mips_callbacks->handle_msa_fpe(vcpu);
+               break;
+
+       case T_FPE:
+               ++vcpu->stat.fpe_exits;
+               trace_kvm_exit(vcpu, FPE_EXITS);
+               ret = kvm_mips_callbacks->handle_fpe(vcpu);
+               break;
+
+       case T_MSADIS:
+               ++vcpu->stat.msa_disabled_exits;
+               trace_kvm_exit(vcpu, MSA_DISABLED_EXITS);
+               ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
+               break;
+
        default:
                kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x  BadVaddr: %#lx Status: %#lx\n",
                        exccode, opc, kvm_get_inst(opc, vcpu), badvaddr,
@@ -1146,12 +1386,233 @@ skip_emul:
                }
        }
 
+       if (ret == RESUME_GUEST) {
+               /*
+                * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
+                * is live), restore FCR31 / MSACSR.
+                *
+                * This should be before returning to the guest exception
+                * vector, as it may well cause an [MSA] FP exception if there
+                * are pending exception bits unmasked (see
+                * kvm_mips_csr_die_notify() for how that is handled).
+                */
+               if (kvm_mips_guest_has_fpu(&vcpu->arch) &&
+                   read_c0_status() & ST0_CU1)
+                       __kvm_restore_fcsr(&vcpu->arch);
+
+               if (kvm_mips_guest_has_msa(&vcpu->arch) &&
+                   read_c0_config5() & MIPS_CONF5_MSAEN)
+                       __kvm_restore_msacsr(&vcpu->arch);
+       }
+
        /* Disable HTW before returning to guest or host */
        htw_stop();
 
        return ret;
 }
 
+/* Enable FPU for guest and restore context */
+void kvm_own_fpu(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned int sr, cfg5;
+
+       preempt_disable();
+
+       sr = kvm_read_c0_guest_status(cop0);
+
+       /*
+        * If MSA state is already live, it is undefined how it interacts with
+        * FR=0 FPU state, and we don't want to hit reserved instruction
+        * exceptions trying to save the MSA state later when CU=1 && FR=1, so
+        * play it safe and save it first.
+        *
+        * In theory we shouldn't ever hit this case since kvm_lose_fpu() should
+        * get called when guest CU1 is set; however, we can't trust the guest
+        * not to clobber the status register directly via the commpage.
+        */
+       if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) &&
+           vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+               kvm_lose_fpu(vcpu);
+
+       /*
+        * Enable FPU for guest
+        * We set FR and FRE according to guest context
+        */
+       change_c0_status(ST0_CU1 | ST0_FR, sr);
+       if (cpu_has_fre) {
+               cfg5 = kvm_read_c0_guest_config5(cop0);
+               change_c0_config5(MIPS_CONF5_FRE, cfg5);
+       }
+       enable_fpu_hazard();
+
+       /* If guest FPU state not active, restore it now */
+       if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) {
+               __kvm_restore_fpu(&vcpu->arch);
+               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+       }
+
+       preempt_enable();
+}
+
+#ifdef CONFIG_CPU_HAS_MSA
+/* Enable MSA for guest and restore context */
+void kvm_own_msa(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned int sr, cfg5;
+
+       preempt_disable();
+
+       /*
+        * Enable FPU if enabled in guest, since we're restoring FPU context
+        * anyway. We set FR and FRE according to guest context.
+        */
+       if (kvm_mips_guest_has_fpu(&vcpu->arch)) {
+               sr = kvm_read_c0_guest_status(cop0);
+
+               /*
+                * If FR=0 FPU state is already live, it is undefined how it
+                * interacts with MSA state, so play it safe and save it first.
+                */
+               if (!(sr & ST0_FR) &&
+                   (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU |
+                               KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU)
+                       kvm_lose_fpu(vcpu);
+
+               change_c0_status(ST0_CU1 | ST0_FR, sr);
+               if (sr & ST0_CU1 && cpu_has_fre) {
+                       cfg5 = kvm_read_c0_guest_config5(cop0);
+                       change_c0_config5(MIPS_CONF5_FRE, cfg5);
+               }
+       }
+
+       /* Enable MSA for guest */
+       set_c0_config5(MIPS_CONF5_MSAEN);
+       enable_fpu_hazard();
+
+       switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) {
+       case KVM_MIPS_FPU_FPU:
+               /*
+                * Guest FPU state already loaded, only restore upper MSA state
+                */
+               __kvm_restore_msa_upper(&vcpu->arch);
+               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+               break;
+       case 0:
+               /* Neither FPU nor MSA is active yet, restore full MSA state */
+               __kvm_restore_msa(&vcpu->arch);
+               vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+               if (kvm_mips_guest_has_fpu(&vcpu->arch))
+                       vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+               break;
+       default:
+               break;
+       }
+
+       preempt_enable();
+}
+#endif
+
+/* Drop FPU & MSA without saving it */
+void kvm_drop_fpu(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+               disable_msa();
+               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA;
+       }
+       if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+               clear_c0_status(ST0_CU1 | ST0_FR);
+               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+       }
+       preempt_enable();
+}
+
+/* Save and disable FPU & MSA */
+void kvm_lose_fpu(struct kvm_vcpu *vcpu)
+{
+       /*
+        * FPU & MSA get disabled in root context (hardware) when they are disabled
+        * in guest context (software), but the register state in the hardware
+        * may still be in use. This is why we explicitly re-enable the hardware
+        * before saving.
+        */
+
+       preempt_disable();
+       if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+               set_c0_config5(MIPS_CONF5_MSAEN);
+               enable_fpu_hazard();
+
+               __kvm_save_msa(&vcpu->arch);
+
+               /* Disable MSA & FPU */
+               disable_msa();
+               if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+                       clear_c0_status(ST0_CU1 | ST0_FR);
+               vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA);
+       } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+               set_c0_status(ST0_CU1);
+               enable_fpu_hazard();
+
+               __kvm_save_fpu(&vcpu->arch);
+               vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+
+               /* Disable FPU */
+               clear_c0_status(ST0_CU1 | ST0_FR);
+       }
+       preempt_enable();
+}
+
+/*
+ * Step over a specific ctc1 to FCSR and a specific ctcmsa to MSACSR which are
+ * used to restore guest FCSR/MSACSR state and may trigger a "harmless" FP/MSAFP
+ * exception if cause bits are set in the value being written.
+ */
+static int kvm_mips_csr_die_notify(struct notifier_block *self,
+                                  unsigned long cmd, void *ptr)
+{
+       struct die_args *args = (struct die_args *)ptr;
+       struct pt_regs *regs = args->regs;
+       unsigned long pc;
+
+       /* Only interested in FPE and MSAFPE */
+       if (cmd != DIE_FP && cmd != DIE_MSAFP)
+               return NOTIFY_DONE;
+
+       /* Return immediately if guest context isn't active */
+       if (!(current->flags & PF_VCPU))
+               return NOTIFY_DONE;
+
+       /* Should never get here from user mode */
+       BUG_ON(user_mode(regs));
+
+       pc = instruction_pointer(regs);
+       switch (cmd) {
+       case DIE_FP:
+               /* match 2nd instruction in __kvm_restore_fcsr */
+               if (pc != (unsigned long)&__kvm_restore_fcsr + 4)
+                       return NOTIFY_DONE;
+               break;
+       case DIE_MSAFP:
+               /* match 2nd/3rd instruction in __kvm_restore_msacsr */
+               if (!cpu_has_msa ||
+                   pc < (unsigned long)&__kvm_restore_msacsr + 4 ||
+                   pc > (unsigned long)&__kvm_restore_msacsr + 8)
+                       return NOTIFY_DONE;
+               break;
+       }
+
+       /* Move PC forward a little and continue executing */
+       instruction_pointer(regs) += 4;
+
+       return NOTIFY_STOP;
+}
+
+static struct notifier_block kvm_mips_csr_die_notifier = {
+       .notifier_call = kvm_mips_csr_die_notify,
+};
+
 int __init kvm_mips_init(void)
 {
        int ret;
@@ -1161,6 +1622,8 @@ int __init kvm_mips_init(void)
        if (ret)
                return ret;
 
+       register_die_notifier(&kvm_mips_csr_die_notifier);
+
        /*
         * On MIPS, kernel modules are executed from "mapped space", which
         * requires TLBs. The TLB handling code is statically linked with
@@ -1173,7 +1636,6 @@ int __init kvm_mips_init(void)
        kvm_mips_release_pfn_clean = kvm_release_pfn_clean;
        kvm_mips_is_error_pfn = is_error_pfn;
 
-       pr_info("KVM/MIPS Initialized\n");
        return 0;
 }
 
@@ -1185,7 +1647,7 @@ void __exit kvm_mips_exit(void)
        kvm_mips_release_pfn_clean = NULL;
        kvm_mips_is_error_pfn = NULL;
 
-       pr_info("KVM/MIPS unloaded\n");
+       unregister_die_notifier(&kvm_mips_csr_die_notifier);
 }
 
 module_init(kvm_mips_init);
diff --git a/arch/mips/kvm/msa.S b/arch/mips/kvm/msa.S
new file mode 100644 (file)
index 0000000..d02f0c6
--- /dev/null
@@ -0,0 +1,161 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * MIPS SIMD Architecture (MSA) context handling code for KVM.
+ *
+ * Copyright (C) 2015 Imagination Technologies Ltd.
+ */
+
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asmmacro.h>
+#include <asm/regdef.h>
+
+       .set    noreorder
+       .set    noat
+
+LEAF(__kvm_save_msa)
+       st_d    0,  VCPU_FPR0,  a0
+       st_d    1,  VCPU_FPR1,  a0
+       st_d    2,  VCPU_FPR2,  a0
+       st_d    3,  VCPU_FPR3,  a0
+       st_d    4,  VCPU_FPR4,  a0
+       st_d    5,  VCPU_FPR5,  a0
+       st_d    6,  VCPU_FPR6,  a0
+       st_d    7,  VCPU_FPR7,  a0
+       st_d    8,  VCPU_FPR8,  a0
+       st_d    9,  VCPU_FPR9,  a0
+       st_d    10, VCPU_FPR10, a0
+       st_d    11, VCPU_FPR11, a0
+       st_d    12, VCPU_FPR12, a0
+       st_d    13, VCPU_FPR13, a0
+       st_d    14, VCPU_FPR14, a0
+       st_d    15, VCPU_FPR15, a0
+       st_d    16, VCPU_FPR16, a0
+       st_d    17, VCPU_FPR17, a0
+       st_d    18, VCPU_FPR18, a0
+       st_d    19, VCPU_FPR19, a0
+       st_d    20, VCPU_FPR20, a0
+       st_d    21, VCPU_FPR21, a0
+       st_d    22, VCPU_FPR22, a0
+       st_d    23, VCPU_FPR23, a0
+       st_d    24, VCPU_FPR24, a0
+       st_d    25, VCPU_FPR25, a0
+       st_d    26, VCPU_FPR26, a0
+       st_d    27, VCPU_FPR27, a0
+       st_d    28, VCPU_FPR28, a0
+       st_d    29, VCPU_FPR29, a0
+       st_d    30, VCPU_FPR30, a0
+       st_d    31, VCPU_FPR31, a0
+       jr      ra
+        nop
+       END(__kvm_save_msa)
+
+LEAF(__kvm_restore_msa)
+       ld_d    0,  VCPU_FPR0,  a0
+       ld_d    1,  VCPU_FPR1,  a0
+       ld_d    2,  VCPU_FPR2,  a0
+       ld_d    3,  VCPU_FPR3,  a0
+       ld_d    4,  VCPU_FPR4,  a0
+       ld_d    5,  VCPU_FPR5,  a0
+       ld_d    6,  VCPU_FPR6,  a0
+       ld_d    7,  VCPU_FPR7,  a0
+       ld_d    8,  VCPU_FPR8,  a0
+       ld_d    9,  VCPU_FPR9,  a0
+       ld_d    10, VCPU_FPR10, a0
+       ld_d    11, VCPU_FPR11, a0
+       ld_d    12, VCPU_FPR12, a0
+       ld_d    13, VCPU_FPR13, a0
+       ld_d    14, VCPU_FPR14, a0
+       ld_d    15, VCPU_FPR15, a0
+       ld_d    16, VCPU_FPR16, a0
+       ld_d    17, VCPU_FPR17, a0
+       ld_d    18, VCPU_FPR18, a0
+       ld_d    19, VCPU_FPR19, a0
+       ld_d    20, VCPU_FPR20, a0
+       ld_d    21, VCPU_FPR21, a0
+       ld_d    22, VCPU_FPR22, a0
+       ld_d    23, VCPU_FPR23, a0
+       ld_d    24, VCPU_FPR24, a0
+       ld_d    25, VCPU_FPR25, a0
+       ld_d    26, VCPU_FPR26, a0
+       ld_d    27, VCPU_FPR27, a0
+       ld_d    28, VCPU_FPR28, a0
+       ld_d    29, VCPU_FPR29, a0
+       ld_d    30, VCPU_FPR30, a0
+       ld_d    31, VCPU_FPR31, a0
+       jr      ra
+        nop
+       END(__kvm_restore_msa)
+
+       .macro  kvm_restore_msa_upper   wr, off, base
+       .set    push
+       .set    noat
+#ifdef CONFIG_64BIT
+       ld      $1, \off(\base)
+       insert_d \wr, 1
+#elif defined(CONFIG_CPU_LITTLE_ENDIAN)
+       lw      $1, \off(\base)
+       insert_w \wr, 2
+       lw      $1, (\off+4)(\base)
+       insert_w \wr, 3
+#else /* CONFIG_CPU_BIG_ENDIAN */
+       lw      $1, (\off+4)(\base)
+       insert_w \wr, 2
+       lw      $1, \off(\base)
+       insert_w \wr, 3
+#endif
+       .set    pop
+       .endm
+
+LEAF(__kvm_restore_msa_upper)
+       kvm_restore_msa_upper   0,  VCPU_FPR0 +8, a0
+       kvm_restore_msa_upper   1,  VCPU_FPR1 +8, a0
+       kvm_restore_msa_upper   2,  VCPU_FPR2 +8, a0
+       kvm_restore_msa_upper   3,  VCPU_FPR3 +8, a0
+       kvm_restore_msa_upper   4,  VCPU_FPR4 +8, a0
+       kvm_restore_msa_upper   5,  VCPU_FPR5 +8, a0
+       kvm_restore_msa_upper   6,  VCPU_FPR6 +8, a0
+       kvm_restore_msa_upper   7,  VCPU_FPR7 +8, a0
+       kvm_restore_msa_upper   8,  VCPU_FPR8 +8, a0
+       kvm_restore_msa_upper   9,  VCPU_FPR9 +8, a0
+       kvm_restore_msa_upper   10, VCPU_FPR10+8, a0
+       kvm_restore_msa_upper   11, VCPU_FPR11+8, a0
+       kvm_restore_msa_upper   12, VCPU_FPR12+8, a0
+       kvm_restore_msa_upper   13, VCPU_FPR13+8, a0
+       kvm_restore_msa_upper   14, VCPU_FPR14+8, a0
+       kvm_restore_msa_upper   15, VCPU_FPR15+8, a0
+       kvm_restore_msa_upper   16, VCPU_FPR16+8, a0
+       kvm_restore_msa_upper   17, VCPU_FPR17+8, a0
+       kvm_restore_msa_upper   18, VCPU_FPR18+8, a0
+       kvm_restore_msa_upper   19, VCPU_FPR19+8, a0
+       kvm_restore_msa_upper   20, VCPU_FPR20+8, a0
+       kvm_restore_msa_upper   21, VCPU_FPR21+8, a0
+       kvm_restore_msa_upper   22, VCPU_FPR22+8, a0
+       kvm_restore_msa_upper   23, VCPU_FPR23+8, a0
+       kvm_restore_msa_upper   24, VCPU_FPR24+8, a0
+       kvm_restore_msa_upper   25, VCPU_FPR25+8, a0
+       kvm_restore_msa_upper   26, VCPU_FPR26+8, a0
+       kvm_restore_msa_upper   27, VCPU_FPR27+8, a0
+       kvm_restore_msa_upper   28, VCPU_FPR28+8, a0
+       kvm_restore_msa_upper   29, VCPU_FPR29+8, a0
+       kvm_restore_msa_upper   30, VCPU_FPR30+8, a0
+       kvm_restore_msa_upper   31, VCPU_FPR31+8, a0
+       jr      ra
+        nop
+       END(__kvm_restore_msa_upper)
+
+LEAF(__kvm_restore_msacsr)
+       lw      t0, VCPU_MSA_CSR(a0)
+       /*
+        * The ctcmsa must stay at this offset in __kvm_restore_msacsr.
+        * See kvm_mips_csr_die_notify() which handles t0 containing a value
+        * which triggers an MSA FP Exception, which must be stepped over and
+        * ignored since the set cause bits must remain there for the guest.
+        */
+       _ctcmsa MSA_CSR, t0
+       jr      ra
+        nop
+       END(__kvm_restore_msacsr)
index a74d6024c5ad5f5c7e0701a309e246bd42532992..888bb67070ac6d1139a7f06f5958e180680511ef 100644 (file)
@@ -25,6 +25,10 @@ char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
        "System Call",
        "Reserved Inst",
        "Break Inst",
+       "Trap Inst",
+       "MSA FPE",
+       "FPE",
+       "MSA Disabled",
        "D-Cache Flushes",
 };
 
index b6beb0e07b1b3b535f7625d61100e6c0087de00d..aed0ac2a4972cd1daf0f2992db6c100e9912fb70 100644 (file)
@@ -733,6 +733,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                }
        }
 
+       /* restore guest state to registers */
+       kvm_mips_callbacks->vcpu_set_regs(vcpu);
+
        local_irq_restore(flags);
 
 }
@@ -751,6 +754,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        vcpu->arch.preempt_entryhi = read_c0_entryhi();
        vcpu->arch.last_sched_cpu = cpu;
 
+       /* save guest state in registers */
+       kvm_mips_callbacks->vcpu_get_regs(vcpu);
+
        if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
             ASID_VERSION_MASK)) {
                kvm_debug("%s: Dropping MMU Context:  %#lx\n", __func__,
index fd7257b70e656fcb8c53d72b552f240f5b255ce6..d836ed5b0bc7ea38e36350304a6238a520e0d74d 100644 (file)
@@ -39,16 +39,30 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
 
 static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 {
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
        struct kvm_run *run = vcpu->run;
        uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
        unsigned long cause = vcpu->arch.host_cp0_cause;
        enum emulation_result er = EMULATE_DONE;
        int ret = RESUME_GUEST;
 
-       if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1)
-               er = kvm_mips_emulate_fpu_exc(cause, opc, run, vcpu);
-       else
+       if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) {
+               /* FPU Unusable */
+               if (!kvm_mips_guest_has_fpu(&vcpu->arch) ||
+                   (kvm_read_c0_guest_status(cop0) & ST0_CU1) == 0) {
+                       /*
+                        * Unusable/no FPU in guest:
+                        * deliver guest COP1 Unusable Exception
+                        */
+                       er = kvm_mips_emulate_fpu_exc(cause, opc, run, vcpu);
+               } else {
+                       /* Restore FPU state */
+                       kvm_own_fpu(vcpu);
+                       er = EMULATE_DONE;
+               }
+       } else {
                er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
+       }
 
        switch (er) {
        case EMULATE_DONE:
@@ -330,6 +344,107 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
        return ret;
 }
 
+static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
+       unsigned long cause = vcpu->arch.host_cp0_cause;
+       enum emulation_result er = EMULATE_DONE;
+       int ret = RESUME_GUEST;
+
+       er = kvm_mips_emulate_trap_exc(cause, opc, run, vcpu);
+       if (er == EMULATE_DONE) {
+               ret = RESUME_GUEST;
+       } else {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+       }
+       return ret;
+}
+
+static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
+       unsigned long cause = vcpu->arch.host_cp0_cause;
+       enum emulation_result er = EMULATE_DONE;
+       int ret = RESUME_GUEST;
+
+       er = kvm_mips_emulate_msafpe_exc(cause, opc, run, vcpu);
+       if (er == EMULATE_DONE) {
+               ret = RESUME_GUEST;
+       } else {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+       }
+       return ret;
+}
+
+static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
+       unsigned long cause = vcpu->arch.host_cp0_cause;
+       enum emulation_result er = EMULATE_DONE;
+       int ret = RESUME_GUEST;
+
+       er = kvm_mips_emulate_fpe_exc(cause, opc, run, vcpu);
+       if (er == EMULATE_DONE) {
+               ret = RESUME_GUEST;
+       } else {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+       }
+       return ret;
+}
+
+/**
+ * kvm_trap_emul_handle_msa_disabled() - Guest used MSA while disabled in root.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Handle when the guest attempts to use MSA when it is disabled.
+ */
+static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct kvm_run *run = vcpu->run;
+       uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+       unsigned long cause = vcpu->arch.host_cp0_cause;
+       enum emulation_result er = EMULATE_DONE;
+       int ret = RESUME_GUEST;
+
+       if (!kvm_mips_guest_has_msa(&vcpu->arch) ||
+           (kvm_read_c0_guest_status(cop0) & (ST0_CU1 | ST0_FR)) == ST0_CU1) {
+               /*
+                * No MSA in guest, or FPU enabled and not in FR=1 mode,
+                * guest reserved instruction exception
+                */
+               er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
+       } else if (!(kvm_read_c0_guest_config5(cop0) & MIPS_CONF5_MSAEN)) {
+               /* MSA disabled by guest, guest MSA disabled exception */
+               er = kvm_mips_emulate_msadis_exc(cause, opc, run, vcpu);
+       } else {
+               /* Restore MSA/FPU state */
+               kvm_own_msa(vcpu);
+               er = EMULATE_DONE;
+       }
+
+       switch (er) {
+       case EMULATE_DONE:
+               ret = RESUME_GUEST;
+               break;
+
+       case EMULATE_FAIL:
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+               break;
+
+       default:
+               BUG();
+       }
+       return ret;
+}
+
 static int kvm_trap_emul_vm_init(struct kvm *kvm)
 {
        return 0;
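The trap_emul hunks above add lazy FPU context handling: a Coprocessor 1 Unusable exit is reflected back into the guest unless the guest actually has an FPU and has set Status.CU1, in which case the host restores the FPU context instead (the new MSA-disabled handler follows the same shape). A minimal userspace sketch of that decision, with the guest state mocked as booleans and kvm_own_fpu()/exception delivery replaced by an enum result:

#include <stdbool.h>
#include <stdio.h>

enum cop1_action { REFLECT_FPU_EXC, RESTORE_FPU_CTX };

/*
 * Mirror of the decision in kvm_trap_emul_handle_cop_unusable() above:
 * restore the FPU context only when the guest both has an FPU and has
 * enabled it via Status.CU1; otherwise deliver a COP1 Unusable exception.
 */
static enum cop1_action cop1_unusable_action(bool guest_has_fpu, bool cu1_set)
{
        if (!guest_has_fpu || !cu1_set)
                return REFLECT_FPU_EXC;
        return RESTORE_FPU_CTX;
}

int main(void)
{
        printf("%s\n", cop1_unusable_action(true, true) == RESTORE_FPU_CTX ?
               "restore FPU context" : "deliver COP1 Unusable");
        return 0;
}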
@@ -351,8 +466,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
         * guest will come up as expected, for now we simulate a MIPS 24kc
         */
        kvm_write_c0_guest_prid(cop0, 0x00019300);
-       kvm_write_c0_guest_config(cop0,
-                                 MIPS_CONFIG0 | (0x1 << CP0C0_AR) |
+       /* Have config1, Cacheable, noncoherent, write-back, write allocate */
+       kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) |
+                                 (0x1 << CP0C0_AR) |
                                  (MMU_TYPE_R4000 << CP0C0_MT));
 
        /* Read the cache characteristics from the host Config1 Register */
@@ -368,10 +484,18 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
              (1 << CP0C1_WR) | (1 << CP0C1_CA));
        kvm_write_c0_guest_config1(cop0, config1);
 
-       kvm_write_c0_guest_config2(cop0, MIPS_CONFIG2);
-       /* MIPS_CONFIG2 | (read_c0_config2() & 0xfff) */
-       kvm_write_c0_guest_config3(cop0, MIPS_CONFIG3 | (0 << CP0C3_VInt) |
-                                        (1 << CP0C3_ULRI));
+       /* Have config3, no tertiary/secondary caches implemented */
+       kvm_write_c0_guest_config2(cop0, MIPS_CONF_M);
+       /* MIPS_CONF_M | (read_c0_config2() & 0xfff) */
+
+       /* Have config4, UserLocal */
+       kvm_write_c0_guest_config3(cop0, MIPS_CONF_M | MIPS_CONF3_ULRI);
+
+       /* Have config5 */
+       kvm_write_c0_guest_config4(cop0, MIPS_CONF_M);
+
+       /* No config6 */
+       kvm_write_c0_guest_config5(cop0, 0);
 
        /* Set Wait IE/IXMT Ignore in Config7, IAR, AR */
        kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10));
@@ -416,6 +540,7 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        int ret = 0;
+       unsigned int cur, change;
 
        switch (reg->id) {
        case KVM_REG_MIPS_CP0_COUNT:
@@ -444,6 +569,44 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
                        kvm_write_c0_guest_cause(cop0, v);
                }
                break;
+       case KVM_REG_MIPS_CP0_CONFIG:
+               /* read-only for now */
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG1:
+               cur = kvm_read_c0_guest_config1(cop0);
+               change = (cur ^ v) & kvm_mips_config1_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       kvm_write_c0_guest_config1(cop0, v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG2:
+               /* read-only for now */
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG3:
+               cur = kvm_read_c0_guest_config3(cop0);
+               change = (cur ^ v) & kvm_mips_config3_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       kvm_write_c0_guest_config3(cop0, v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG4:
+               cur = kvm_read_c0_guest_config4(cop0);
+               change = (cur ^ v) & kvm_mips_config4_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       kvm_write_c0_guest_config4(cop0, v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG5:
+               cur = kvm_read_c0_guest_config5(cop0);
+               change = (cur ^ v) & kvm_mips_config5_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       kvm_write_c0_guest_config5(cop0, v);
+               }
+               break;
        case KVM_REG_MIPS_COUNT_CTL:
                ret = kvm_mips_set_count_ctl(vcpu, v);
                break;
@@ -459,6 +622,18 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
        return ret;
 }
 
+static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu)
+{
+       kvm_lose_fpu(vcpu);
+
+       return 0;
+}
+
+static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
 static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        /* exit handlers */
        .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable,
@@ -470,6 +645,10 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .handle_syscall = kvm_trap_emul_handle_syscall,
        .handle_res_inst = kvm_trap_emul_handle_res_inst,
        .handle_break = kvm_trap_emul_handle_break,
+       .handle_trap = kvm_trap_emul_handle_trap,
+       .handle_msa_fpe = kvm_trap_emul_handle_msa_fpe,
+       .handle_fpe = kvm_trap_emul_handle_fpe,
+       .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled,
 
        .vm_init = kvm_trap_emul_vm_init,
        .vcpu_init = kvm_trap_emul_vcpu_init,
@@ -483,6 +662,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .irq_clear = kvm_mips_irq_clear_cb,
        .get_one_reg = kvm_trap_emul_get_one_reg,
        .set_one_reg = kvm_trap_emul_set_one_reg,
+       .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
+       .vcpu_set_regs = kvm_trap_emul_vcpu_set_regs,
 };
 
 int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
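The KVM_REG_MIPS_CP0_CONFIGn cases added to kvm_trap_emul_set_one_reg() above all use the same idiom: XOR the current and requested values, keep only the bits allowed by the per-register write mask, and apply that change. A standalone sketch of the idiom, with a made-up mask value for illustration:

#include <stdio.h>

/*
 * Apply only the writable bits of a requested value, as the CONFIGn
 * cases above do with kvm_mips_configN_wrmask().
 */
static unsigned int apply_writable_bits(unsigned int cur, unsigned int req,
                                        unsigned int wrmask)
{
        unsigned int change = (cur ^ req) & wrmask;

        return cur ^ change;
}

int main(void)
{
        /* Hypothetical register with only bits 0-3 writable. */
        unsigned int cur = 0xa0, req = 0xff, wrmask = 0x0f;

        printf("0x%x\n", apply_writable_bits(cur, req, wrmask)); /* 0xaf */
        return 0;
}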
index 3b7f65cc42187e95627bdd14b461be0e60e7342b..cf9b4633257eb86dd14cea3656b94104ce8d909a 100644 (file)
@@ -75,11 +75,11 @@ static int rtctmp;
 int proc_dolasatrtc(struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
 {
-       struct timespec ts;
+       struct timespec64 ts;
        int r;
 
        if (!write) {
-               read_persistent_clock(&ts);
+               read_persistent_clock64(&ts);
                rtctmp = ts.tv_sec;
                /* check for time < 0 and set to 0 */
                if (rtctmp < 0)
index 1f266575beb51c459432ac918e96e0b3b810ae85..a16e55cbd8ad99dce9f448d14c3a4ddeb20b8e42 100644 (file)
@@ -47,7 +47,6 @@ struct thread_info {
                                                  0-0x7FFFFFFF for user-thead
                                                  0-0xFFFFFFFF for kernel-thread
                                                */
-       struct restart_block    restart_block;
        struct pt_regs          *regs;
 };
 
@@ -64,9 +63,6 @@ struct thread_info {
        .cpu            = 0,                    \
        .preempt_count  = INIT_PREEMPT_COUNT,   \
        .addr_limit     = KERNEL_DS,            \
-       .restart_block  = {                     \
-               .fn = do_no_restart_syscall,    \
-       },                                      \
 }
 
 #define init_thread_info       (init_thread_union.thread_info)
index 71a330597adff689dcae092d63839aad1fb8b7d3..eff00e67c0a245f5208ece2f662f9055f328f3c6 100644 (file)
 #define PTR_IPENDING   37
 #define PTR_CPUID      38
 #define PTR_CTL6       39
-#define PTR_CTL7       40
+#define PTR_EXCEPTION  40
 #define PTR_PTEADDR    41
 #define PTR_TLBACC     42
 #define PTR_TLBMISC    43
+#define PTR_ECCINJ     44
+#define PTR_BADADDR    45
+#define PTR_CONFIG     46
+#define PTR_MPUBASE    47
+#define PTR_MPUACC     48
 
-#define NUM_PTRACE_REG (PTR_TLBMISC + 1)
+#define NUM_PTRACE_REG (PTR_MPUACC + 1)
 
 /* User structures for general purpose registers.  */
 struct user_pt_regs {
index 7729bd3f2e79d48937e7dbead303640610ca24ae..27b006c52e12e1193fd392033f5804c1c42c56da 100644 (file)
@@ -161,7 +161,7 @@ ENTRY(inthandler)
  ***********************************************************************
  */
 ENTRY(handle_trap)
-       ldw     r24, -4(ea)     /* instruction that caused the exception */
+       ldwio   r24, -4(ea)     /* instruction that caused the exception */
        srli    r24, r24, 4
        andi    r24, r24, 0x7c
        movia   r9,trap_table
index dda41e4fe7070885ee7ab77e4c5e9e18e51dd0a3..20662b0f6c9e30cd52279ce3274bcbc463fa5236 100644 (file)
@@ -43,7 +43,7 @@ static inline int rt_restore_ucontext(struct pt_regs *regs,
        int err;
 
        /* Always make any pending restarted system calls return -EINTR */
-       current_thread_info()->restart_block.fn = do_no_restart_syscall;
+       current->restart_block.fn = do_no_restart_syscall;
 
        err = __get_user(temp, &uc->uc_mcontext.version);
        if (temp != MCONTEXT_VERSION)
index 2ae482b4266931cbcdd28267630b4f7bd2f56074..796642932e2ef446a9e78b72e7fecccc4ed14647 100644 (file)
@@ -23,9 +23,6 @@ static void __flush_dcache(unsigned long start, unsigned long end)
        end += (cpuinfo.dcache_line_size - 1);
        end &= ~(cpuinfo.dcache_line_size - 1);
 
-       if (end > start + cpuinfo.dcache_size)
-               end = start + cpuinfo.dcache_size;
-
        for (addr = start; addr < end; addr += cpuinfo.dcache_line_size) {
                __asm__ __volatile__ ("   flushda 0(%0)\n"
                                        : /* Outputs */
index 2bf8e9307be9833b5bf9af51defdfada8feae68d..4c8ad592ae3351d96f7e225411d9f551657c5bc7 100644 (file)
@@ -55,7 +55,7 @@ static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads)
 
 static inline int cpu_nr_cores(void)
 {
-       return NR_CPUS >> threads_shift;
+       return nr_cpu_ids >> threads_shift;
 }
 
 static inline cpumask_t cpu_online_cores_map(void)
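cpu_nr_cores() above now divides the runtime nr_cpu_ids rather than the compile-time NR_CPUS by the threads-per-core shift. A worked example of the arithmetic, assuming (hypothetically) 64 possible CPU ids and 8 threads per core (threads_shift = 3):

#include <stdio.h>

int main(void)
{
        unsigned int nr_cpu_ids = 64;   /* assumed runtime value */
        unsigned int threads_shift = 3; /* assumed: 8 threads per core */

        /* Same arithmetic as cpu_nr_cores() in the hunk above. */
        printf("%u cores\n", nr_cpu_ids >> threads_shift); /* 8 cores */
        return 0;
}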
index 39b3a8f816f28d0ecd61ca8a110d22bb99fb4507..6249cdc834d14977ffe5344a4b15857de01abe79 100644 (file)
@@ -34,7 +34,7 @@
 #include <asm/kvm_para.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_ppc.h>
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #define MAX_CPU     32
 #define MAX_SRC     256
@@ -289,11 +289,6 @@ static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
        clear_bit(n_IRQ, q->queue);
 }
 
-static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ)
-{
-       return test_bit(n_IRQ, q->queue);
-}
-
 static void IRQ_check(struct openpic *opp, struct irq_queue *q)
 {
        int irq = -1;
@@ -1374,8 +1369,9 @@ static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
        return -ENXIO;
 }
 
-static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
-                        int len, void *ptr)
+static int kvm_mpic_read(struct kvm_vcpu *vcpu,
+                        struct kvm_io_device *this,
+                        gpa_t addr, int len, void *ptr)
 {
        struct openpic *opp = container_of(this, struct openpic, mmio);
        int ret;
@@ -1415,8 +1411,9 @@ static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
        return ret;
 }
 
-static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
-                         int len, const void *ptr)
+static int kvm_mpic_write(struct kvm_vcpu *vcpu,
+                         struct kvm_io_device *this,
+                         gpa_t addr, int len, const void *ptr)
 {
        struct openpic *opp = container_of(this, struct openpic, mmio);
        int ret;
index 27c0face86f45cdac10ac74ba309de04094a8367..24bfe401373e44aad58268c95caf7e2a5e09198e 100644 (file)
@@ -807,7 +807,7 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-       ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+       ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
                              bytes, &run->mmio.data);
 
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -880,7 +880,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-       ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+       ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
                               bytes, &run->mmio.data);
 
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
index 0509bca5e830b656c8a553c047740b68b172f4b1..fcbe899fe299303d8e96d3faf0ba74eb5dc1ee53 100644 (file)
@@ -9,11 +9,11 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/jump_label.h>
 #include <asm/ppc_asm.h>
 #include <asm/hvcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/opal.h>
-#include <asm/jump_label.h>
 
        .section        ".text"
 
index ccd53f91e8aa8e4f3b30bc550f55a63f509818a8..74b5b8e239c8235ac61b529b8085cc3ceff096ed 100644 (file)
@@ -7,12 +7,12 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#include <linux/jump_label.h>
 #include <asm/hvcall.h>
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/ptrace.h>
-#include <asm/jump_label.h>
 
        .section        ".text"
        
index b5682fd6c9846b2cdb259720e35a2326cf7c45a1..b7a67e3d2201e4d5988e9cb3df651651c12024c8 100644 (file)
@@ -26,7 +26,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/console.h>
 #include <linux/export.h>
-#include <linux/static_key.h>
+#include <linux/jump_label.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
index 58642fd29c878c8d1fb5d384ecf58427b553552e..2b77e235b5fbdf54707551e067da19bd9ffb3a11 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_S390_JUMP_LABEL_H
 #define _ASM_S390_JUMP_LABEL_H
 
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 
 #define JUMP_LABEL_NOP_SIZE 6
@@ -39,4 +41,5 @@ struct jump_entry {
        jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
index f407bbf5ee94ca5e2f6122951e52ce2d7db1a7ef..d01fc588b5c378fddc46eba49e28b4de4be1f1a9 100644 (file)
@@ -172,7 +172,9 @@ struct kvm_s390_sie_block {
        __u32   fac;                    /* 0x01a0 */
        __u8    reserved1a4[20];        /* 0x01a4 */
        __u64   cbrlo;                  /* 0x01b8 */
-       __u8    reserved1c0[30];        /* 0x01c0 */
+       __u8    reserved1c0[8];         /* 0x01c0 */
+       __u32   ecd;                    /* 0x01c8 */
+       __u8    reserved1cc[18];        /* 0x01cc */
        __u64   pp;                     /* 0x01de */
        __u8    reserved1e6[2];         /* 0x01e6 */
        __u64   itdba;                  /* 0x01e8 */
@@ -183,11 +185,17 @@ struct kvm_s390_itdb {
        __u8    data[256];
 } __packed;
 
+struct kvm_s390_vregs {
+       __vector128 vrs[32];
+       __u8    reserved200[512];       /* for future vector expansion */
+} __packed;
+
 struct sie_page {
        struct kvm_s390_sie_block sie_block;
        __u8 reserved200[1024];         /* 0x0200 */
        struct kvm_s390_itdb itdb;      /* 0x0600 */
-       __u8 reserved700[2304];         /* 0x0700 */
+       __u8 reserved700[1280];         /* 0x0700 */
+       struct kvm_s390_vregs vregs;    /* 0x0c00 */
 } __packed;
 
 struct kvm_vcpu_stat {
@@ -238,6 +246,7 @@ struct kvm_vcpu_stat {
        u32 instruction_sigp_stop;
        u32 instruction_sigp_stop_store_status;
        u32 instruction_sigp_store_status;
+       u32 instruction_sigp_store_adtl_status;
        u32 instruction_sigp_arch;
        u32 instruction_sigp_prefix;
        u32 instruction_sigp_restart;
@@ -270,6 +279,7 @@ struct kvm_vcpu_stat {
 #define PGM_SPECIAL_OPERATION          0x13
 #define PGM_OPERAND                    0x15
 #define PGM_TRACE_TABEL                        0x16
+#define PGM_VECTOR_PROCESSING          0x1b
 #define PGM_SPACE_SWITCH               0x1c
 #define PGM_HFP_SQUARE_ROOT            0x1d
 #define PGM_PC_TRANSLATION_SPEC                0x1f
@@ -334,6 +344,11 @@ enum irq_types {
        IRQ_PEND_COUNT
 };
 
+/* We have 2M for virtio device descriptor pages. Smallest amount of
+ * memory per page is 24 bytes (1 queue), so (2048*1024) / 24 = 87381
+ */
+#define KVM_S390_MAX_VIRTIO_IRQS 87381
+
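The KVM_S390_MAX_VIRTIO_IRQS comment above derives the limit from a 2 MiB budget at 24 bytes per pending interrupt; the integer division does come out at 87381, as a trivial check shows:

#include <stdio.h>

int main(void)
{
        /*
         * 2 MiB of descriptor space, 24 bytes per pending virtio
         * interrupt, as stated in the comment above.
         */
        printf("%lu\n", (2048UL * 1024) / 24); /* prints 87381 */
        return 0;
}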
 /*
  * Repressible (non-floating) machine check interrupts
  * subclass bits in MCIC
@@ -411,13 +426,32 @@ struct kvm_s390_local_interrupt {
        unsigned long pending_irqs;
 };
 
+#define FIRQ_LIST_IO_ISC_0 0
+#define FIRQ_LIST_IO_ISC_1 1
+#define FIRQ_LIST_IO_ISC_2 2
+#define FIRQ_LIST_IO_ISC_3 3
+#define FIRQ_LIST_IO_ISC_4 4
+#define FIRQ_LIST_IO_ISC_5 5
+#define FIRQ_LIST_IO_ISC_6 6
+#define FIRQ_LIST_IO_ISC_7 7
+#define FIRQ_LIST_PFAULT   8
+#define FIRQ_LIST_VIRTIO   9
+#define FIRQ_LIST_COUNT   10
+#define FIRQ_CNTR_IO       0
+#define FIRQ_CNTR_SERVICE  1
+#define FIRQ_CNTR_VIRTIO   2
+#define FIRQ_CNTR_PFAULT   3
+#define FIRQ_MAX_COUNT     4
+
 struct kvm_s390_float_interrupt {
+       unsigned long pending_irqs;
        spinlock_t lock;
-       struct list_head list;
-       atomic_t active;
+       struct list_head lists[FIRQ_LIST_COUNT];
+       int counters[FIRQ_MAX_COUNT];
+       struct kvm_s390_mchk_info mchk;
+       struct kvm_s390_ext_info srv_signal;
        int next_rr_cpu;
        unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
-       unsigned int irq_count;
 };
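struct kvm_s390_float_interrupt above replaces the single interrupt list with one list per source — eight I/O subclasses plus pfault and virtio — and per-type counters. A hedged sketch of how such a layout would typically be indexed; the isc_to_list() helper below is invented for illustration and is not part of this patch:

#include <stdio.h>

#define FIRQ_LIST_IO_ISC_0      0
#define FIRQ_LIST_PFAULT        8
#define FIRQ_LIST_VIRTIO        9
#define FIRQ_LIST_COUNT         10

/* Assumed helper: map an I/O interruption subclass (0..7) to its list slot. */
static int isc_to_list(int isc)
{
        return FIRQ_LIST_IO_ISC_0 + isc;
}

int main(void)
{
        printf("ISC 5 -> list %d of %d\n", isc_to_list(5), FIRQ_LIST_COUNT);
        return 0;
}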
 
 struct kvm_hw_wp_info_arch {
@@ -465,6 +499,7 @@ struct kvm_vcpu_arch {
        s390_fp_regs      host_fpregs;
        unsigned int      host_acrs[NUM_ACRS];
        s390_fp_regs      guest_fpregs;
+       struct kvm_s390_vregs   *host_vregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
@@ -553,6 +588,7 @@ struct kvm_arch{
        int use_cmma;
        int user_cpu_state_ctrl;
        int user_sigp;
+       int user_stsi;
        struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
        wait_queue_head_t ipte_wq;
        int ipte_lock_count;
index 9c77e60b9a269a59bcde0d4fe0a5fb77348f1520..ef1a5fcc6c66bbf5705173b41371378c4b541483 100644 (file)
@@ -150,6 +150,7 @@ struct kvm_guest_debug_arch {
 #define KVM_SYNC_CRS    (1UL << 3)
 #define KVM_SYNC_ARCH0  (1UL << 4)
 #define KVM_SYNC_PFAULT (1UL << 5)
+#define KVM_SYNC_VRS    (1UL << 6)
 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
        __u64 prefix;   /* prefix register */
@@ -164,6 +165,9 @@ struct kvm_sync_regs {
        __u64 pft;      /* pfault token [PFAULT] */
        __u64 pfs;      /* pfault select [PFAULT] */
        __u64 pfc;      /* pfault compare [PFAULT] */
+       __u64 vrs[32][2];       /* vector registers */
+       __u8  reserved[512];    /* for future vector expansion */
+       __u32 fpc;      /* only valid with vector registers */
 };
 
 #define KVM_REG_S390_TODPR     (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1)
index d4096fdfc6ab45b02eda2f0a7258da5b98f46930..ee69c0854c8891067b67a0d8920e7f3cf671f308 100644 (file)
  * and returns a key, which can be used to find a mnemonic name
  * of the instruction in the icpt_insn_codes table.
  */
-#define icpt_insn_decoder(insn)                        \
+#define icpt_insn_decoder(insn) (              \
        INSN_DECODE_IPA0(0x01, insn, 48, 0xff)  \
        INSN_DECODE_IPA0(0xaa, insn, 48, 0x0f)  \
        INSN_DECODE_IPA0(0xb2, insn, 48, 0xff)  \
        INSN_DECODE_IPA0(0xe5, insn, 48, 0xff)  \
        INSN_DECODE_IPA0(0xeb, insn, 16, 0xff)  \
        INSN_DECODE_IPA0(0xc8, insn, 48, 0x0f)  \
-       INSN_DECODE(insn)
+       INSN_DECODE(insn))
 
 #endif /* _UAPI_ASM_S390_SIE_H */
index e07e91605353003084ff19511c1c520e8bca1dd7..8dc4db10d1608e81eafad41e5f87d1c92af7bbf5 100644 (file)
@@ -171,6 +171,7 @@ int main(void)
 #else /* CONFIG_32BIT */
        DEFINE(__LC_DATA_EXC_CODE, offsetof(struct _lowcore, data_exc_code));
        DEFINE(__LC_MCCK_FAIL_STOR_ADDR, offsetof(struct _lowcore, failing_storage_address));
+       DEFINE(__LC_VX_SAVE_AREA_ADDR, offsetof(struct _lowcore, vector_save_area_addr));
        DEFINE(__LC_EXT_PARAMS2, offsetof(struct _lowcore, ext_params2));
        DEFINE(SAVE_AREA_BASE, offsetof(struct _lowcore, floating_pt_save_area));
        DEFINE(__LC_PASTE, offsetof(struct _lowcore, paste));
index 20660dddb2d67f1e4ebdd16a2a89ff99602935f1..170ddd2018b31667df8619b471df42b7fb562705 100644 (file)
@@ -215,20 +215,20 @@ void update_vsyscall(struct timekeeper *tk)
 {
        u64 nsecps;
 
-       if (tk->tkr.clock != &clocksource_tod)
+       if (tk->tkr_mono.clock != &clocksource_tod)
                return;
 
        /* Make userspace gettimeofday spin until we're done. */
        ++vdso_data->tb_update_count;
        smp_wmb();
-       vdso_data->xtime_tod_stamp = tk->tkr.cycle_last;
+       vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
        vdso_data->xtime_clock_sec = tk->xtime_sec;
-       vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec;
+       vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
        vdso_data->wtom_clock_sec =
                tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
-       vdso_data->wtom_clock_nsec = tk->tkr.xtime_nsec +
-               + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr.shift);
-       nsecps = (u64) NSEC_PER_SEC << tk->tkr.shift;
+       vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
+               + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+       nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
        while (vdso_data->wtom_clock_nsec >= nsecps) {
                vdso_data->wtom_clock_nsec -= nsecps;
                vdso_data->wtom_clock_sec++;
@@ -236,7 +236,7 @@ void update_vsyscall(struct timekeeper *tk)
 
        vdso_data->xtime_coarse_sec = tk->xtime_sec;
        vdso_data->xtime_coarse_nsec =
-               (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+               (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        vdso_data->wtom_coarse_sec =
                vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
        vdso_data->wtom_coarse_nsec =
@@ -246,8 +246,8 @@ void update_vsyscall(struct timekeeper *tk)
                vdso_data->wtom_coarse_sec++;
        }
 
-       vdso_data->tk_mult = tk->tkr.mult;
-       vdso_data->tk_shift = tk->tkr.shift;
+       vdso_data->tk_mult = tk->tkr_mono.mult;
+       vdso_data->tk_shift = tk->tkr_mono.shift;
        smp_wmb();
        ++vdso_data->tb_update_count;
 }
@@ -283,7 +283,7 @@ void __init time_init(void)
        if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt))
                panic("Couldn't request external interrupt 0x1406");
 
-       if (clocksource_register(&clocksource_tod) != 0)
+       if (__clocksource_register(&clocksource_tod) != 0)
                panic("Could not register TOD clock source");
 
        /* Enable TOD clock interrupts on the boot cpu. */
index 9254afff250c968682db79c453d0542868bfdd9c..fc7ec95848c39c527c2a24ee723c9f45624e31a0 100644 (file)
@@ -77,7 +77,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
 
        if (vcpu->run->s.regs.gprs[rx] & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm));
+       rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)
@@ -213,7 +213,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
         * - gpr 3 contains the virtqueue index (passed as datamatch)
         * - gpr 4 contains the index on the bus (optionally)
         */
-       ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
+       ret = kvm_io_bus_write_cookie(vcpu, KVM_VIRTIO_CCW_NOTIFY_BUS,
                                      vcpu->run->s.regs.gprs[2] & 0xffffffff,
                                      8, &vcpu->run->s.regs.gprs[3],
                                      vcpu->run->s.regs.gprs[4]);
@@ -230,7 +230,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
 
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
 {
-       int code = kvm_s390_get_base_disp_rs(vcpu) & 0xffff;
+       int code = kvm_s390_get_base_disp_rs(vcpu, NULL) & 0xffff;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
index 267523cac6de7860cda644017e2b52ccfc7ea8d4..a7559f7207df3a0ac62d0fc16b199f3b4c6b6dac 100644 (file)
@@ -10,6 +10,7 @@
 #include <asm/pgtable.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
+#include <asm/switch_to.h>
 
 union asce {
        unsigned long val;
@@ -207,6 +208,54 @@ union raddress {
        unsigned long pfra : 52; /* Page-Frame Real Address */
 };
 
+union alet {
+       u32 val;
+       struct {
+               u32 reserved : 7;
+               u32 p        : 1;
+               u32 alesn    : 8;
+               u32 alen     : 16;
+       };
+};
+
+union ald {
+       u32 val;
+       struct {
+               u32     : 1;
+               u32 alo : 24;
+               u32 all : 7;
+       };
+};
+
+struct ale {
+       unsigned long i      : 1; /* ALEN-Invalid Bit */
+       unsigned long        : 5;
+       unsigned long fo     : 1; /* Fetch-Only Bit */
+       unsigned long p      : 1; /* Private Bit */
+       unsigned long alesn  : 8; /* Access-List-Entry Sequence Number */
+       unsigned long aleax  : 16; /* Access-List-Entry Authorization Index */
+       unsigned long        : 32;
+       unsigned long        : 1;
+       unsigned long asteo  : 25; /* ASN-Second-Table-Entry Origin */
+       unsigned long        : 6;
+       unsigned long astesn : 32; /* ASTE Sequence Number */
+} __packed;
+
+struct aste {
+       unsigned long i      : 1; /* ASX-Invalid Bit */
+       unsigned long ato    : 29; /* Authority-Table Origin */
+       unsigned long        : 1;
+       unsigned long b      : 1; /* Base-Space Bit */
+       unsigned long ax     : 16; /* Authorization Index */
+       unsigned long atl    : 12; /* Authority-Table Length */
+       unsigned long        : 2;
+       unsigned long ca     : 1; /* Controlled-ASN Bit */
+       unsigned long ra     : 1; /* Reusable-ASN Bit */
+       unsigned long asce   : 64; /* Address-Space-Control Element */
+       unsigned long ald    : 32;
+       unsigned long astesn : 32;
+       /* .. more fields there */
+} __packed;
 
 int ipte_lock_held(struct kvm_vcpu *vcpu)
 {
@@ -307,15 +356,157 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
                ipte_unlock_simple(vcpu);
 }
 
-static unsigned long get_vcpu_asce(struct kvm_vcpu *vcpu)
+static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
+                         int write)
+{
+       union alet alet;
+       struct ale ale;
+       struct aste aste;
+       unsigned long ald_addr, authority_table_addr;
+       union ald ald;
+       int eax, rc;
+       u8 authority_table;
+
+       if (ar >= NUM_ACRS)
+               return -EINVAL;
+
+       save_access_regs(vcpu->run->s.regs.acrs);
+       alet.val = vcpu->run->s.regs.acrs[ar];
+
+       if (ar == 0 || alet.val == 0) {
+               asce->val = vcpu->arch.sie_block->gcr[1];
+               return 0;
+       } else if (alet.val == 1) {
+               asce->val = vcpu->arch.sie_block->gcr[7];
+               return 0;
+       }
+
+       if (alet.reserved)
+               return PGM_ALET_SPECIFICATION;
+
+       if (alet.p)
+               ald_addr = vcpu->arch.sie_block->gcr[5];
+       else
+               ald_addr = vcpu->arch.sie_block->gcr[2];
+       ald_addr &= 0x7fffffc0;
+
+       rc = read_guest_real(vcpu, ald_addr + 16, &ald.val, sizeof(union ald));
+       if (rc)
+               return rc;
+
+       if (alet.alen / 8 > ald.all)
+               return PGM_ALEN_TRANSLATION;
+
+       if (0x7fffffff - ald.alo * 128 < alet.alen * 16)
+               return PGM_ADDRESSING;
+
+       rc = read_guest_real(vcpu, ald.alo * 128 + alet.alen * 16, &ale,
+                            sizeof(struct ale));
+       if (rc)
+               return rc;
+
+       if (ale.i == 1)
+               return PGM_ALEN_TRANSLATION;
+       if (ale.alesn != alet.alesn)
+               return PGM_ALE_SEQUENCE;
+
+       rc = read_guest_real(vcpu, ale.asteo * 64, &aste, sizeof(struct aste));
+       if (rc)
+               return rc;
+
+       if (aste.i)
+               return PGM_ASTE_VALIDITY;
+       if (aste.astesn != ale.astesn)
+               return PGM_ASTE_SEQUENCE;
+
+       if (ale.p == 1) {
+               eax = (vcpu->arch.sie_block->gcr[8] >> 16) & 0xffff;
+               if (ale.aleax != eax) {
+                       if (eax / 16 > aste.atl)
+                               return PGM_EXTENDED_AUTHORITY;
+
+                       authority_table_addr = aste.ato * 4 + eax / 4;
+
+                       rc = read_guest_real(vcpu, authority_table_addr,
+                                            &authority_table,
+                                            sizeof(u8));
+                       if (rc)
+                               return rc;
+
+                       if ((authority_table & (0x40 >> ((eax & 3) * 2))) == 0)
+                               return PGM_EXTENDED_AUTHORITY;
+               }
+       }
+
+       if (ale.fo == 1 && write)
+               return PGM_PROTECTION;
+
+       asce->val = aste.asce;
+       return 0;
+}
+
+struct trans_exc_code_bits {
+       unsigned long addr : 52; /* Translation-exception Address */
+       unsigned long fsi  : 2;  /* Access Exception Fetch/Store Indication */
+       unsigned long      : 6;
+       unsigned long b60  : 1;
+       unsigned long b61  : 1;
+       unsigned long as   : 2;  /* ASCE Identifier */
+};
+
+enum {
+       FSI_UNKNOWN = 0, /* Unknown whether fetch or store */
+       FSI_STORE   = 1, /* Exception was due to store operation */
+       FSI_FETCH   = 2  /* Exception was due to fetch operation */
+};
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+                        ar_t ar, int write)
 {
+       int rc;
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
+       struct trans_exc_code_bits *tec_bits;
+
+       memset(pgm, 0, sizeof(*pgm));
+       tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+       tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
+       tec_bits->as = psw_bits(*psw).as;
+
+       if (!psw_bits(*psw).t) {
+               asce->val = 0;
+               asce->r = 1;
+               return 0;
+       }
+
        switch (psw_bits(vcpu->arch.sie_block->gpsw).as) {
        case PSW_AS_PRIMARY:
-               return vcpu->arch.sie_block->gcr[1];
+               asce->val = vcpu->arch.sie_block->gcr[1];
+               return 0;
        case PSW_AS_SECONDARY:
-               return vcpu->arch.sie_block->gcr[7];
+               asce->val = vcpu->arch.sie_block->gcr[7];
+               return 0;
        case PSW_AS_HOME:
-               return vcpu->arch.sie_block->gcr[13];
+               asce->val = vcpu->arch.sie_block->gcr[13];
+               return 0;
+       case PSW_AS_ACCREG:
+               rc = ar_translation(vcpu, asce, ar, write);
+               switch (rc) {
+               case PGM_ALEN_TRANSLATION:
+               case PGM_ALE_SEQUENCE:
+               case PGM_ASTE_VALIDITY:
+               case PGM_ASTE_SEQUENCE:
+               case PGM_EXTENDED_AUTHORITY:
+                       vcpu->arch.pgm.exc_access_id = ar;
+                       break;
+               case PGM_PROTECTION:
+                       tec_bits->b60 = 1;
+                       tec_bits->b61 = 1;
+                       break;
+               }
+               if (rc > 0)
+                       pgm->code = rc;
+               return rc;
        }
        return 0;
 }
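ar_translation() above resolves an access register to an ASCE: access register 0 or an all-zero ALET selects the primary ASCE (CR1), ALET 1 the secondary ASCE (CR7), and anything else requires the full access-list/ASTE walk. A reduced sketch of just that special-case selection; the enum and function name are invented for the example:

#include <stdio.h>

enum asce_source {
        ASCE_PRIMARY_CR1,
        ASCE_SECONDARY_CR7,
        ASCE_FROM_ACCESS_LIST,
};

/*
 * Mirror of the early exits in ar_translation() above: access register 0
 * or an all-zero ALET yields the primary ASCE, ALET 1 the secondary ASCE,
 * anything else needs the full access-list/ASTE walk.
 */
static enum asce_source resolve_alet(unsigned int ar, unsigned int alet)
{
        if (ar == 0 || alet == 0)
                return ASCE_PRIMARY_CR1;
        if (alet == 1)
                return ASCE_SECONDARY_CR7;
        return ASCE_FROM_ACCESS_LIST;
}

int main(void)
{
        printf("%d %d %d\n", resolve_alet(0, 5), resolve_alet(3, 1),
               resolve_alet(3, 0x42)); /* 0 1 2 */
        return 0;
}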
@@ -330,10 +521,11 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  * @vcpu: virtual cpu
  * @gva: guest virtual address
  * @gpa: points to where guest physical (absolute) address should be stored
+ * @asce: effective asce
  * @write: indicates if access is a write access
  *
  * Translate a guest virtual address into a guest absolute address by means
- * of dynamic address translation as specified by the architecuture.
+ * of dynamic address translation as specified by the architecture.
  * If the resulting absolute address is not available in the configuration
  * an addressing exception is indicated and @gpa will not be changed.
  *
@@ -345,7 +537,8 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  *           by the architecture
  */
 static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
-                                    unsigned long *gpa, int write)
+                                    unsigned long *gpa, const union asce asce,
+                                    int write)
 {
        union vaddress vaddr = {.addr = gva};
        union raddress raddr = {.addr = gva};
@@ -354,12 +547,10 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
        union ctlreg0 ctlreg0;
        unsigned long ptr;
        int edat1, edat2;
-       union asce asce;
 
        ctlreg0.val = vcpu->arch.sie_block->gcr[0];
        edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
        edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
-       asce.val = get_vcpu_asce(vcpu);
        if (asce.r)
                goto real_address;
        ptr = asce.origin * 4096;
@@ -506,48 +697,30 @@ static inline int is_low_address(unsigned long ga)
        return (ga & ~0x11fful) == 0;
 }
 
-static int low_address_protection_enabled(struct kvm_vcpu *vcpu)
+static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
+                                         const union asce asce)
 {
        union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
-       union asce asce;
 
        if (!ctlreg0.lap)
                return 0;
-       asce.val = get_vcpu_asce(vcpu);
        if (psw_bits(*psw).t && asce.p)
                return 0;
        return 1;
 }
 
-struct trans_exc_code_bits {
-       unsigned long addr : 52; /* Translation-exception Address */
-       unsigned long fsi  : 2;  /* Access Exception Fetch/Store Indication */
-       unsigned long      : 7;
-       unsigned long b61  : 1;
-       unsigned long as   : 2;  /* ASCE Identifier */
-};
-
-enum {
-       FSI_UNKNOWN = 0, /* Unknown wether fetch or store */
-       FSI_STORE   = 1, /* Exception was due to store operation */
-       FSI_FETCH   = 2  /* Exception was due to fetch operation */
-};
-
 static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
                            unsigned long *pages, unsigned long nr_pages,
-                           int write)
+                           const union asce asce, int write)
 {
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
        struct trans_exc_code_bits *tec_bits;
        int lap_enabled, rc;
 
-       memset(pgm, 0, sizeof(*pgm));
        tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
-       tec_bits->as = psw_bits(*psw).as;
-       lap_enabled = low_address_protection_enabled(vcpu);
+       lap_enabled = low_address_protection_enabled(vcpu, asce);
        while (nr_pages) {
                ga = kvm_s390_logical_to_effective(vcpu, ga);
                tec_bits->addr = ga >> PAGE_SHIFT;
@@ -557,7 +730,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
                }
                ga &= PAGE_MASK;
                if (psw_bits(*psw).t) {
-                       rc = guest_translate(vcpu, ga, pages, write);
+                       rc = guest_translate(vcpu, ga, pages, asce, write);
                        if (rc < 0)
                                return rc;
                        if (rc == PGM_PROTECTION)
@@ -578,7 +751,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
        return 0;
 }
 
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
                 unsigned long len, int write)
 {
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -591,20 +764,19 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
 
        if (!len)
                return 0;
-       /* Access register mode is not supported yet. */
-       if (psw_bits(*psw).t && psw_bits(*psw).as == PSW_AS_ACCREG)
-               return -EOPNOTSUPP;
+       rc = get_vcpu_asce(vcpu, &asce, ar, write);
+       if (rc)
+               return rc;
        nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
        pages = pages_array;
        if (nr_pages > ARRAY_SIZE(pages_array))
                pages = vmalloc(nr_pages * sizeof(unsigned long));
        if (!pages)
                return -ENOMEM;
-       asce.val = get_vcpu_asce(vcpu);
        need_ipte_lock = psw_bits(*psw).t && !asce.r;
        if (need_ipte_lock)
                ipte_lock(vcpu);
-       rc = guest_page_range(vcpu, ga, pages, nr_pages, write);
+       rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, write);
        for (idx = 0; idx < nr_pages && !rc; idx++) {
                gpa = *(pages + idx) + (ga & ~PAGE_MASK);
                _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -652,7 +824,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  * Note: The IPTE lock is not taken during this function, so the caller
  * has to take care of this.
  */
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
+int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
                            unsigned long *gpa, int write)
 {
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
@@ -661,26 +833,21 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
        union asce asce;
        int rc;
 
-       /* Access register mode is not supported yet. */
-       if (psw_bits(*psw).t && psw_bits(*psw).as == PSW_AS_ACCREG)
-               return -EOPNOTSUPP;
-
        gva = kvm_s390_logical_to_effective(vcpu, gva);
-       memset(pgm, 0, sizeof(*pgm));
        tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-       tec->as = psw_bits(*psw).as;
-       tec->fsi = write ? FSI_STORE : FSI_FETCH;
+       rc = get_vcpu_asce(vcpu, &asce, ar, write);
        tec->addr = gva >> PAGE_SHIFT;
-       if (is_low_address(gva) && low_address_protection_enabled(vcpu)) {
+       if (rc)
+               return rc;
+       if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
                if (write) {
                        rc = pgm->code = PGM_PROTECTION;
                        return rc;
                }
        }
 
-       asce.val = get_vcpu_asce(vcpu);
        if (psw_bits(*psw).t && !asce.r) {      /* Use DAT? */
-               rc = guest_translate(vcpu, gva, gpa, write);
+               rc = guest_translate(vcpu, gva, gpa, asce, write);
                if (rc > 0) {
                        if (rc == PGM_PROTECTION)
                                tec->b61 = 1;
@@ -697,28 +864,51 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
 }
 
 /**
- * kvm_s390_check_low_addr_protection - check for low-address protection
- * @ga: Guest address
+ * check_gva_range - test a range of guest virtual addresses for accessibility
+ */
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+                   unsigned long length, int is_write)
+{
+       unsigned long gpa;
+       unsigned long currlen;
+       int rc = 0;
+
+       ipte_lock(vcpu);
+       while (length > 0 && !rc) {
+               currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
+               rc = guest_translate_address(vcpu, gva, ar, &gpa, is_write);
+               gva += currlen;
+               length -= currlen;
+       }
+       ipte_unlock(vcpu);
+
+       return rc;
+}
+
+/**
+ * kvm_s390_check_low_addr_prot_real - check for low-address protection
+ * @gra: Guest real address
  *
  * Checks whether an address is subject to low-address protection and set
  * up vcpu->arch.pgm accordingly if necessary.
  *
  * Return: 0 if no protection exception, or PGM_PROTECTION if protected.
  */
-int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga)
+int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 {
        struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
        psw_t *psw = &vcpu->arch.sie_block->gpsw;
        struct trans_exc_code_bits *tec_bits;
+       union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
 
-       if (!is_low_address(ga) || !low_address_protection_enabled(vcpu))
+       if (!ctlreg0.lap || !is_low_address(gra))
                return 0;
 
        memset(pgm, 0, sizeof(*pgm));
        tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
        tec_bits->fsi = FSI_STORE;
        tec_bits->as = psw_bits(*psw).as;
-       tec_bits->addr = ga >> PAGE_SHIFT;
+       tec_bits->addr = gra >> PAGE_SHIFT;
        pgm->code = PGM_PROTECTION;
 
        return pgm->code;
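check_gva_range() above walks a guest virtual range one page at a time so that each chunk can be translated (and any fault reported) before the access proceeds. The chunking arithmetic on its own, as a small sketch with PAGE_SIZE fixed at 4 KiB and the translation replaced by a print:

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long gva = 0x10ff0, length = 0x2100, currlen;

        /*
         * Same loop shape as check_gva_range(): clamp each step to the
         * end of the current page, then advance.
         */
        while (length > 0) {
                currlen = min_ul(length, PAGE_SIZE - (gva % PAGE_SIZE));
                printf("translate gva=0x%lx len=0x%lx\n", gva, currlen);
                gva += currlen;
                length -= currlen;
        }
        return 0;
}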
index 0149cf15058ab9e8d12918192353884ac4ce8f4f..ef03726cc6611acd1e52fb6970e2e802a1730cbd 100644 (file)
@@ -156,9 +156,11 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 }
 
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
-                           unsigned long *gpa, int write);
+                           ar_t ar, unsigned long *gpa, int write);
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+                   unsigned long length, int is_write);
 
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
                 unsigned long len, int write);
 
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
@@ -168,6 +170,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  * write_guest - copy data from kernel space to guest space
  * @vcpu: virtual cpu
  * @ga: guest address
+ * @ar: access register
  * @data: source address in kernel space
  * @len: number of bytes to copy
  *
@@ -176,8 +179,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  * If DAT is off data will be copied to guest real or absolute memory.
  * If DAT is on data will be copied to the address space as specified by
  * the address space bits of the PSW:
- * Primary, secondory or home space (access register mode is currently not
- * implemented).
+ * Primary, secondary, home space or access register mode.
  * The addressing mode of the PSW is also inspected, so that address wrap
  * around is taken into account for 24-, 31- and 64-bit addressing mode,
  * if the to be copied data crosses page boundaries in guest address space.
@@ -210,16 +212,17 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  *      if data has been changed in guest space in case of an exception.
  */
 static inline __must_check
-int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
                unsigned long len)
 {
-       return access_guest(vcpu, ga, data, len, 1);
+       return access_guest(vcpu, ga, ar, data, len, 1);
 }
 
 /**
  * read_guest - copy data from guest space to kernel space
  * @vcpu: virtual cpu
  * @ga: guest address
+ * @ar: access register
  * @data: destination address in kernel space
  * @len: number of bytes to copy
  *
@@ -229,10 +232,10 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
  * data will be copied from guest space to kernel space.
  */
 static inline __must_check
-int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
               unsigned long len)
 {
-       return access_guest(vcpu, ga, data, len, 0);
+       return access_guest(vcpu, ga, ar, data, len, 0);
 }
 
 /**
@@ -330,6 +333,6 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 void ipte_lock(struct kvm_vcpu *vcpu);
 void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
-int kvm_s390_check_low_addr_protection(struct kvm_vcpu *vcpu, unsigned long ga);
+int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
 #endif /* __KVM_S390_GACCESS_H */
index 3e8d4092ce30f10a6bd4b9e486e926889a937b2e..e97b3455d7e6bfbb7b12c606a5224c5efeba37d4 100644 (file)
@@ -191,8 +191,8 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
        if (!wp_info->old_data)
                return -ENOMEM;
        /* try to backup the original value */
-       ret = read_guest(vcpu, wp_info->phys_addr, wp_info->old_data,
-                        wp_info->len);
+       ret = read_guest_abs(vcpu, wp_info->phys_addr, wp_info->old_data,
+                            wp_info->len);
        if (ret) {
                kfree(wp_info->old_data);
                wp_info->old_data = NULL;
@@ -362,8 +362,8 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu)
                        continue;
 
                /* refetch the wp data and compare it to the old value */
-               if (!read_guest(vcpu, wp_info->phys_addr, temp,
-                               wp_info->len)) {
+               if (!read_guest_abs(vcpu, wp_info->phys_addr, temp,
+                                   wp_info->len)) {
                        if (memcmp(temp, wp_info->old_data, wp_info->len)) {
                                kfree(temp);
                                return wp_info;
index bebd2157edd019448ff772c9ac75a778e7e124f4..9e3779e3e496314a4e3f15823c152867c3a3cadd 100644 (file)
@@ -165,6 +165,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
                pgm_info->mon_class_nr = vcpu->arch.sie_block->mcn;
                pgm_info->mon_code = vcpu->arch.sie_block->tecmc;
                break;
+       case PGM_VECTOR_PROCESSING:
        case PGM_DATA:
                pgm_info->data_exc_code = vcpu->arch.sie_block->dxc;
                break;
@@ -319,7 +320,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 
        /* Make sure that the source is paged-in */
        rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
-                                    &srcaddr, 0);
+                                    reg2, &srcaddr, 0);
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
@@ -328,7 +329,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 
        /* Make sure that the destination is paged-in */
        rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
-                                    &dstaddr, 1);
+                                    reg1, &dstaddr, 1);
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
index 073b5f387d1dd3484186dd69dcfc5aae63d90b21..9de47265ef73da07ffd7ef37337bf2e44e59bd46 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * handling kvm guest interrupts
  *
- * Copyright IBM Corp. 2008,2014
+ * Copyright IBM Corp. 2008, 2015
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
 #include <linux/signal.h>
 #include <linux/slab.h>
 #include <linux/bitmap.h>
+#include <linux/vmalloc.h>
 #include <asm/asm-offsets.h>
+#include <asm/dis.h>
 #include <asm/uaccess.h>
 #include <asm/sclp.h>
+#include <asm/isc.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "trace-s390.h"
 #define PFAULT_DONE 0x0680
 #define VIRTIO_PARAM 0x0d00
 
-static int is_ioint(u64 type)
-{
-       return ((type & 0xfffe0000u) != 0xfffe0000u);
-}
-
 int psw_extint_disabled(struct kvm_vcpu *vcpu)
 {
        return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
@@ -72,70 +70,45 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static u64 int_word_to_isc_bits(u32 int_word)
+static int ckc_irq_pending(struct kvm_vcpu *vcpu)
+{
+       if (!(vcpu->arch.sie_block->ckc <
+             get_tod_clock_fast() + vcpu->arch.sie_block->epoch))
+               return 0;
+       return ckc_interrupts_enabled(vcpu);
+}
+
+static int cpu_timer_interrupts_enabled(struct kvm_vcpu *vcpu)
+{
+       return !psw_extint_disabled(vcpu) &&
+              (vcpu->arch.sie_block->gcr[0] & 0x400ul);
+}
+
+static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
+{
+       return (vcpu->arch.sie_block->cputm >> 63) &&
+              cpu_timer_interrupts_enabled(vcpu);
+}
+
+static inline int is_ioirq(unsigned long irq_type)
 {
-       u8 isc = (int_word & 0x38000000) >> 27;
+       return ((irq_type >= IRQ_PEND_IO_ISC_0) &&
+               (irq_type <= IRQ_PEND_IO_ISC_7));
+}
 
+static uint64_t isc_to_isc_bits(int isc)
+{
        return (0x80 >> isc) << 24;
 }
 
-static int __must_check __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
-                                     struct kvm_s390_interrupt_info *inti)
+static inline u8 int_word_to_isc(u32 int_word)
 {
-       switch (inti->type) {
-       case KVM_S390_INT_EXTERNAL_CALL:
-               if (psw_extint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
-                       return 1;
-               return 0;
-       case KVM_S390_INT_EMERGENCY:
-               if (psw_extint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[0] & 0x4000ul)
-                       return 1;
-               return 0;
-       case KVM_S390_INT_CLOCK_COMP:
-               return ckc_interrupts_enabled(vcpu);
-       case KVM_S390_INT_CPU_TIMER:
-               if (psw_extint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[0] & 0x400ul)
-                       return 1;
-               return 0;
-       case KVM_S390_INT_SERVICE:
-       case KVM_S390_INT_PFAULT_INIT:
-       case KVM_S390_INT_PFAULT_DONE:
-       case KVM_S390_INT_VIRTIO:
-               if (psw_extint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[0] & 0x200ul)
-                       return 1;
-               return 0;
-       case KVM_S390_PROGRAM_INT:
-       case KVM_S390_SIGP_STOP:
-       case KVM_S390_SIGP_SET_PREFIX:
-       case KVM_S390_RESTART:
-               return 1;
-       case KVM_S390_MCHK:
-               if (psw_mchk_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
-                       return 1;
-               return 0;
-       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               if (psw_ioint_disabled(vcpu))
-                       return 0;
-               if (vcpu->arch.sie_block->gcr[6] &
-                   int_word_to_isc_bits(inti->io.io_int_word))
-                       return 1;
-               return 0;
-       default:
-               printk(KERN_WARNING "illegal interrupt type %llx\n",
-                      inti->type);
-               BUG();
-       }
-       return 0;
+       return (int_word & 0x38000000) >> 27;
+}
+
+static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu)
+{
+       return vcpu->kvm->arch.float_int.pending_irqs;
 }
 
 static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
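isc_to_isc_bits() in the hunk above maps an I/O interruption subclass to the single mask bit tested against CR6: (0x80 >> isc) << 24 gives 0x80000000 for ISC 0 down to 0x01000000 for ISC 7. A quick standalone check of the formula:

#include <stdio.h>

static unsigned long isc_to_isc_bits(int isc)
{
        /* Same expression as in the hunk above. */
        return (0x80UL >> isc) << 24;
}

int main(void)
{
        int isc;

        for (isc = 0; isc <= 7; isc++)
                printf("ISC %d -> 0x%08lx\n", isc, isc_to_isc_bits(isc));
        /* ISC 0 -> 0x80000000 ... ISC 7 -> 0x01000000 */
        return 0;
}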
@@ -143,12 +116,31 @@ static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
        return vcpu->arch.local_int.pending_irqs;
 }
 
-static unsigned long deliverable_local_irqs(struct kvm_vcpu *vcpu)
+static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
+                                  unsigned long active_mask)
+{
+       int i;
+
+       for (i = 0; i <= MAX_ISC; i++)
+               if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i)))
+                       active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i));
+
+       return active_mask;
+}
+
+static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 {
-       unsigned long active_mask = pending_local_irqs(vcpu);
+       unsigned long active_mask;
+
+       active_mask = pending_local_irqs(vcpu);
+       active_mask |= pending_floating_irqs(vcpu);
 
        if (psw_extint_disabled(vcpu))
                active_mask &= ~IRQ_PEND_EXT_MASK;
+       if (psw_ioint_disabled(vcpu))
+               active_mask &= ~IRQ_PEND_IO_MASK;
+       else
+               active_mask = disable_iscs(vcpu, active_mask);
        if (!(vcpu->arch.sie_block->gcr[0] & 0x2000ul))
                __clear_bit(IRQ_PEND_EXT_EXTERNAL, &active_mask);
        if (!(vcpu->arch.sie_block->gcr[0] & 0x4000ul))
@@ -157,8 +149,13 @@ static unsigned long deliverable_local_irqs(struct kvm_vcpu *vcpu)
                __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask);
        if (!(vcpu->arch.sie_block->gcr[0] & 0x400ul))
                __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask);
+       if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul))
+               __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask);
        if (psw_mchk_disabled(vcpu))
                active_mask &= ~IRQ_PEND_MCHK_MASK;
+       if (!(vcpu->arch.sie_block->gcr[14] &
+             vcpu->kvm->arch.float_int.mchk.cr14))
+               __clear_bit(IRQ_PEND_MCHK_REP, &active_mask);
 
        /*
         * STOP irqs will never be actively delivered. They are triggered via
@@ -200,6 +197,16 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
        atomic_set_mask(flag, &vcpu->arch.sie_block->cpuflags);
 }
 
+static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
+{
+       if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK))
+               return;
+       else if (psw_ioint_disabled(vcpu))
+               __set_cpuflag(vcpu, CPUSTAT_IO_INT);
+       else
+               vcpu->arch.sie_block->lctl |= LCTL_CR6;
+}
+
 static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
 {
        if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK))
@@ -226,47 +233,17 @@ static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu)
                __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
 }
 
-/* Set interception request for non-deliverable local interrupts */
-static void set_intercept_indicators_local(struct kvm_vcpu *vcpu)
+/* Set interception request for non-deliverable interrupts */
+static void set_intercept_indicators(struct kvm_vcpu *vcpu)
 {
+       set_intercept_indicators_io(vcpu);
        set_intercept_indicators_ext(vcpu);
        set_intercept_indicators_mchk(vcpu);
        set_intercept_indicators_stop(vcpu);
 }
 
-static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
-                                     struct kvm_s390_interrupt_info *inti)
-{
-       switch (inti->type) {
-       case KVM_S390_INT_SERVICE:
-       case KVM_S390_INT_PFAULT_DONE:
-       case KVM_S390_INT_VIRTIO:
-               if (psw_extint_disabled(vcpu))
-                       __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
-               else
-                       vcpu->arch.sie_block->lctl |= LCTL_CR0;
-               break;
-       case KVM_S390_MCHK:
-               if (psw_mchk_disabled(vcpu))
-                       vcpu->arch.sie_block->ictl |= ICTL_LPSW;
-               else
-                       vcpu->arch.sie_block->lctl |= LCTL_CR14;
-               break;
-       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               if (psw_ioint_disabled(vcpu))
-                       __set_cpuflag(vcpu, CPUSTAT_IO_INT);
-               else
-                       vcpu->arch.sie_block->lctl |= LCTL_CR6;
-               break;
-       default:
-               BUG();
-       }
-}
-
 static u16 get_ilc(struct kvm_vcpu *vcpu)
 {
-       const unsigned short table[] = { 2, 4, 4, 6 };
-
        switch (vcpu->arch.sie_block->icptcode) {
        case ICPT_INST:
        case ICPT_INSTPROGI:
@@ -274,7 +251,7 @@ static u16 get_ilc(struct kvm_vcpu *vcpu)
        case ICPT_PARTEXEC:
        case ICPT_IOINST:
                /* last instruction only stored for these icptcodes */
-               return table[vcpu->arch.sie_block->ipa >> 14];
+               return insn_length(vcpu->arch.sie_block->ipa >> 8);
        case ICPT_PROGI:
                return vcpu->arch.sie_block->pgmilc;
        default:
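get_ilc() now derives the instruction-length code from the full opcode byte (ipa >> 8) via insn_length() instead of indexing a {2, 4, 4, 6} table with the top two opcode bits (ipa >> 14); both encode the architected rule that opcode bits 0-1 select a 2-, 4- or 6-byte instruction. A minimal sketch of that rule, with insn_len() as a stand-in for the kernel helper:

#include <assert.h>

/* stand-in for insn_length(): s390 instruction length from the opcode's top two bits */
static unsigned int insn_len(unsigned char opcode)
{
        switch (opcode >> 6) {
        case 0:
                return 2;       /* e.g. 0x0a, SVC            */
        case 1:
        case 2:
                return 4;       /* e.g. 0xb2xx instructions  */
        default:
                return 6;       /* e.g. 0xe3 RXY format      */
        }
}

int main(void)
{
        assert(insn_len(0x0a) == 2);
        assert(insn_len(0xb2) == 4);
        assert(insn_len(0xe3) == 6);
        return 0;
}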
@@ -350,38 +327,72 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu)
 
 static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
 {
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-       struct kvm_s390_mchk_info mchk;
-       int rc;
+       struct kvm_s390_mchk_info mchk = {};
+       unsigned long adtl_status_addr;
+       int deliver = 0;
+       int rc = 0;
 
+       spin_lock(&fi->lock);
        spin_lock(&li->lock);
-       mchk = li->irq.mchk;
+       if (test_bit(IRQ_PEND_MCHK_EX, &li->pending_irqs) ||
+           test_bit(IRQ_PEND_MCHK_REP, &li->pending_irqs)) {
+               /*
+                * If there was an exigent machine check pending, then any
+                * repressible machine checks that might have been pending
+                * are indicated along with it, so always clear bits for
+                * repressible and exigent interrupts
+                */
+               mchk = li->irq.mchk;
+               clear_bit(IRQ_PEND_MCHK_EX, &li->pending_irqs);
+               clear_bit(IRQ_PEND_MCHK_REP, &li->pending_irqs);
+               memset(&li->irq.mchk, 0, sizeof(mchk));
+               deliver = 1;
+       }
        /*
-        * If there was an exigent machine check pending, then any repressible
-        * machine checks that might have been pending are indicated along
-        * with it, so always clear both bits
+        * We indicate floating repressible conditions along with
+        * other pending conditions. Channel Report Pending and Channel
+        * Subsystem damage are the only two and are indicated by
+        * bits in mcic and masked in cr14.
         */
-       clear_bit(IRQ_PEND_MCHK_EX, &li->pending_irqs);
-       clear_bit(IRQ_PEND_MCHK_REP, &li->pending_irqs);
-       memset(&li->irq.mchk, 0, sizeof(mchk));
+       if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
+               mchk.mcic |= fi->mchk.mcic;
+               mchk.cr14 |= fi->mchk.cr14;
+               memset(&fi->mchk, 0, sizeof(mchk));
+               deliver = 1;
+       }
        spin_unlock(&li->lock);
+       spin_unlock(&fi->lock);
 
-       VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
-                  mchk.mcic);
-       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK,
-                                        mchk.cr14, mchk.mcic);
-
-       rc  = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED);
-       rc |= put_guest_lc(vcpu, mchk.mcic,
-                          (u64 __user *) __LC_MCCK_CODE);
-       rc |= put_guest_lc(vcpu, mchk.failing_storage_address,
-                          (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
-                            &mchk.fixed_logout, sizeof(mchk.fixed_logout));
-       rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       if (deliver) {
+               VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
+                          mchk.mcic);
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
+                                                KVM_S390_MCHK,
+                                                mchk.cr14, mchk.mcic);
+
+               rc  = kvm_s390_vcpu_store_status(vcpu,
+                                                KVM_S390_STORE_STATUS_PREFIXED);
+               rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR,
+                                   &adtl_status_addr,
+                                   sizeof(unsigned long));
+               rc |= kvm_s390_vcpu_store_adtl_status(vcpu,
+                                                     adtl_status_addr);
+               rc |= put_guest_lc(vcpu, mchk.mcic,
+                                  (u64 __user *) __LC_MCCK_CODE);
+               rc |= put_guest_lc(vcpu, mchk.failing_storage_address,
+                                  (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
+                                    &mchk.fixed_logout,
+                                    sizeof(mchk.fixed_logout));
+               rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
+                                    &vcpu->arch.sie_block->gpsw,
+                                    sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
+                                   &vcpu->arch.sie_block->gpsw,
+                                   sizeof(psw_t));
+       }
        return rc ? -EFAULT : 0;
 }
 
@@ -484,7 +495,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_pgm_info pgm_info;
-       int rc = 0;
+       int rc = 0, nullifying = false;
        u16 ilc = get_ilc(vcpu);
 
        spin_lock(&li->lock);
@@ -509,6 +520,8 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        case PGM_LX_TRANSLATION:
        case PGM_PRIMARY_AUTHORITY:
        case PGM_SECONDARY_AUTHORITY:
+               nullifying = true;
+               /* fall through */
        case PGM_SPACE_SWITCH:
                rc = put_guest_lc(vcpu, pgm_info.trans_exc_code,
                                  (u64 *)__LC_TRANS_EXC_CODE);
@@ -521,6 +534,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        case PGM_EXTENDED_AUTHORITY:
                rc = put_guest_lc(vcpu, pgm_info.exc_access_id,
                                  (u8 *)__LC_EXC_ACCESS_ID);
+               nullifying = true;
                break;
        case PGM_ASCE_TYPE:
        case PGM_PAGE_TRANSLATION:
@@ -534,6 +548,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
                                   (u8 *)__LC_EXC_ACCESS_ID);
                rc |= put_guest_lc(vcpu, pgm_info.op_access_id,
                                   (u8 *)__LC_OP_ACCESS_ID);
+               nullifying = true;
                break;
        case PGM_MONITOR:
                rc = put_guest_lc(vcpu, pgm_info.mon_class_nr,
@@ -541,6 +556,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
                rc |= put_guest_lc(vcpu, pgm_info.mon_code,
                                   (u64 *)__LC_MON_CODE);
                break;
+       case PGM_VECTOR_PROCESSING:
        case PGM_DATA:
                rc = put_guest_lc(vcpu, pgm_info.data_exc_code,
                                  (u32 *)__LC_DATA_EXC_CODE);
@@ -551,6 +567,15 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
                rc |= put_guest_lc(vcpu, pgm_info.exc_access_id,
                                   (u8 *)__LC_EXC_ACCESS_ID);
                break;
+       case PGM_STACK_FULL:
+       case PGM_STACK_EMPTY:
+       case PGM_STACK_SPECIFICATION:
+       case PGM_STACK_TYPE:
+       case PGM_STACK_OPERATION:
+       case PGM_TRACE_TABEL:
+       case PGM_CRYPTO_OPERATION:
+               nullifying = true;
+               break;
        }
 
        if (pgm_info.code & PGM_PER) {
@@ -564,7 +589,12 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
                                   (u8 *) __LC_PER_ACCESS_ID);
        }
 
+       if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
+               kvm_s390_rewind_psw(vcpu, ilc);
+
        rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+       rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
+                                (u64 *) __LC_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
                           (u16 *)__LC_PGM_INT_CODE);
        rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
@@ -574,16 +604,27 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        return rc ? -EFAULT : 0;
 }
 
-static int __must_check __deliver_service(struct kvm_vcpu *vcpu,
-                                         struct kvm_s390_interrupt_info *inti)
+static int __must_check __deliver_service(struct kvm_vcpu *vcpu)
 {
-       int rc;
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_ext_info ext;
+       int rc = 0;
+
+       spin_lock(&fi->lock);
+       if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) {
+               spin_unlock(&fi->lock);
+               return 0;
+       }
+       ext = fi->srv_signal;
+       memset(&fi->srv_signal, 0, sizeof(ext));
+       clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
 
        VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
-                  inti->ext.ext_params);
+                  ext.ext_params);
        vcpu->stat.deliver_service_signal++;
-       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
-                                        inti->ext.ext_params, 0);
+       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
+                                        ext.ext_params, 0);
 
        rc  = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE);
        rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR);
@@ -591,106 +632,146 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu,
                             &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
        rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= put_guest_lc(vcpu, inti->ext.ext_params,
+       rc |= put_guest_lc(vcpu, ext.ext_params,
                           (u32 *)__LC_EXT_PARAMS);
+
        return rc ? -EFAULT : 0;
 }
 
-static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu,
-                                          struct kvm_s390_interrupt_info *inti)
+static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu)
 {
-       int rc;
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_interrupt_info *inti;
+       int rc = 0;
 
-       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
-                                        KVM_S390_INT_PFAULT_DONE, 0,
-                                        inti->ext.ext_params2);
+       spin_lock(&fi->lock);
+       inti = list_first_entry_or_null(&fi->lists[FIRQ_LIST_PFAULT],
+                                       struct kvm_s390_interrupt_info,
+                                       list);
+       if (inti) {
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
+                               KVM_S390_INT_PFAULT_DONE, 0,
+                               inti->ext.ext_params2);
+               list_del(&inti->list);
+               fi->counters[FIRQ_CNTR_PFAULT] -= 1;
+       }
+       if (list_empty(&fi->lists[FIRQ_LIST_PFAULT]))
+               clear_bit(IRQ_PEND_PFAULT_DONE, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
 
-       rc  = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, (u16 *)__LC_EXT_INT_CODE);
-       rc |= put_guest_lc(vcpu, PFAULT_DONE, (u16 *)__LC_EXT_CPU_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
-                          (u64 *)__LC_EXT_PARAMS2);
+       if (inti) {
+               rc  = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE,
+                               (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, PFAULT_DONE,
+                               (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                               &vcpu->arch.sie_block->gpsw,
+                               sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                               &vcpu->arch.sie_block->gpsw,
+                               sizeof(psw_t));
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
+                               (u64 *)__LC_EXT_PARAMS2);
+               kfree(inti);
+       }
        return rc ? -EFAULT : 0;
 }
 
-static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu,
-                                        struct kvm_s390_interrupt_info *inti)
+static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu)
 {
-       int rc;
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_interrupt_info *inti;
+       int rc = 0;
 
-       VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
-                  inti->ext.ext_params, inti->ext.ext_params2);
-       vcpu->stat.deliver_virtio_interrupt++;
-       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
-                                        inti->ext.ext_params,
-                                        inti->ext.ext_params2);
+       spin_lock(&fi->lock);
+       inti = list_first_entry_or_null(&fi->lists[FIRQ_LIST_VIRTIO],
+                                       struct kvm_s390_interrupt_info,
+                                       list);
+       if (inti) {
+               VCPU_EVENT(vcpu, 4,
+                          "interrupt: virtio parm:%x,parm64:%llx",
+                          inti->ext.ext_params, inti->ext.ext_params2);
+               vcpu->stat.deliver_virtio_interrupt++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
+                               inti->type,
+                               inti->ext.ext_params,
+                               inti->ext.ext_params2);
+               list_del(&inti->list);
+               fi->counters[FIRQ_CNTR_VIRTIO] -= 1;
+       }
+       if (list_empty(&fi->lists[FIRQ_LIST_VIRTIO]))
+               clear_bit(IRQ_PEND_VIRTIO, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
 
-       rc  = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, (u16 *)__LC_EXT_INT_CODE);
-       rc |= put_guest_lc(vcpu, VIRTIO_PARAM, (u16 *)__LC_EXT_CPU_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= put_guest_lc(vcpu, inti->ext.ext_params,
-                          (u32 *)__LC_EXT_PARAMS);
-       rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
-                          (u64 *)__LC_EXT_PARAMS2);
+       if (inti) {
+               rc  = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE,
+                               (u16 *)__LC_EXT_INT_CODE);
+               rc |= put_guest_lc(vcpu, VIRTIO_PARAM,
+                               (u16 *)__LC_EXT_CPU_ADDR);
+               rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
+                               &vcpu->arch.sie_block->gpsw,
+                               sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
+                               &vcpu->arch.sie_block->gpsw,
+                               sizeof(psw_t));
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params,
+                               (u32 *)__LC_EXT_PARAMS);
+               rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
+                               (u64 *)__LC_EXT_PARAMS2);
+               kfree(inti);
+       }
        return rc ? -EFAULT : 0;
 }
 
 static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
-                                    struct kvm_s390_interrupt_info *inti)
+                                    unsigned long irq_type)
 {
-       int rc;
+       struct list_head *isc_list;
+       struct kvm_s390_float_interrupt *fi;
+       struct kvm_s390_interrupt_info *inti = NULL;
+       int rc = 0;
 
-       VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
-       vcpu->stat.deliver_io_int++;
-       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
-                                        ((__u32)inti->io.subchannel_id << 16) |
-                                               inti->io.subchannel_nr,
-                                        ((__u64)inti->io.io_int_parm << 32) |
-                                               inti->io.io_int_word);
-
-       rc  = put_guest_lc(vcpu, inti->io.subchannel_id,
-                          (u16 *)__LC_SUBCHANNEL_ID);
-       rc |= put_guest_lc(vcpu, inti->io.subchannel_nr,
-                          (u16 *)__LC_SUBCHANNEL_NR);
-       rc |= put_guest_lc(vcpu, inti->io.io_int_parm,
-                          (u32 *)__LC_IO_INT_PARM);
-       rc |= put_guest_lc(vcpu, inti->io.io_int_word,
-                          (u32 *)__LC_IO_INT_WORD);
-       rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       return rc ? -EFAULT : 0;
-}
+       fi = &vcpu->kvm->arch.float_int;
 
-static int __must_check __deliver_mchk_floating(struct kvm_vcpu *vcpu,
-                                          struct kvm_s390_interrupt_info *inti)
-{
-       struct kvm_s390_mchk_info *mchk = &inti->mchk;
-       int rc;
+       spin_lock(&fi->lock);
+       isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0];
+       inti = list_first_entry_or_null(isc_list,
+                                       struct kvm_s390_interrupt_info,
+                                       list);
+       if (inti) {
+               VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
+               vcpu->stat.deliver_io_int++;
+               trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
+                               inti->type,
+                               ((__u32)inti->io.subchannel_id << 16) |
+                               inti->io.subchannel_nr,
+                               ((__u64)inti->io.io_int_parm << 32) |
+                               inti->io.io_int_word);
+               list_del(&inti->list);
+               fi->counters[FIRQ_CNTR_IO] -= 1;
+       }
+       if (list_empty(isc_list))
+               clear_bit(irq_type, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
+
+       if (inti) {
+               rc  = put_guest_lc(vcpu, inti->io.subchannel_id,
+                               (u16 *)__LC_SUBCHANNEL_ID);
+               rc |= put_guest_lc(vcpu, inti->io.subchannel_nr,
+                               (u16 *)__LC_SUBCHANNEL_NR);
+               rc |= put_guest_lc(vcpu, inti->io.io_int_parm,
+                               (u32 *)__LC_IO_INT_PARM);
+               rc |= put_guest_lc(vcpu, inti->io.io_int_word,
+                               (u32 *)__LC_IO_INT_WORD);
+               rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
+                               &vcpu->arch.sie_block->gpsw,
+                               sizeof(psw_t));
+               rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
+                               &vcpu->arch.sie_block->gpsw,
+                               sizeof(psw_t));
+               kfree(inti);
+       }
 
-       VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
-                  mchk->mcic);
-       trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK,
-                                        mchk->cr14, mchk->mcic);
-
-       rc  = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED);
-       rc |= put_guest_lc(vcpu, mchk->mcic,
-                       (u64 __user *) __LC_MCCK_CODE);
-       rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
-                       (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
-       rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
-                            &mchk->fixed_logout, sizeof(mchk->fixed_logout));
-       rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
-                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
-                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
        return rc ? -EFAULT : 0;
 }
 
@@ -698,6 +779,7 @@ typedef int (*deliver_irq_t)(struct kvm_vcpu *vcpu);
 
 static const deliver_irq_t deliver_irq_funcs[] = {
        [IRQ_PEND_MCHK_EX]        = __deliver_machine_check,
+       [IRQ_PEND_MCHK_REP]       = __deliver_machine_check,
        [IRQ_PEND_PROG]           = __deliver_prog,
        [IRQ_PEND_EXT_EMERGENCY]  = __deliver_emergency_signal,
        [IRQ_PEND_EXT_EXTERNAL]   = __deliver_external_call,
@@ -706,36 +788,11 @@ static const deliver_irq_t deliver_irq_funcs[] = {
        [IRQ_PEND_RESTART]        = __deliver_restart,
        [IRQ_PEND_SET_PREFIX]     = __deliver_set_prefix,
        [IRQ_PEND_PFAULT_INIT]    = __deliver_pfault_init,
+       [IRQ_PEND_EXT_SERVICE]    = __deliver_service,
+       [IRQ_PEND_PFAULT_DONE]    = __deliver_pfault_done,
+       [IRQ_PEND_VIRTIO]         = __deliver_virtio,
 };
 
-static int __must_check __deliver_floating_interrupt(struct kvm_vcpu *vcpu,
-                                          struct kvm_s390_interrupt_info *inti)
-{
-       int rc;
-
-       switch (inti->type) {
-       case KVM_S390_INT_SERVICE:
-               rc = __deliver_service(vcpu, inti);
-               break;
-       case KVM_S390_INT_PFAULT_DONE:
-               rc = __deliver_pfault_done(vcpu, inti);
-               break;
-       case KVM_S390_INT_VIRTIO:
-               rc = __deliver_virtio(vcpu, inti);
-               break;
-       case KVM_S390_MCHK:
-               rc = __deliver_mchk_floating(vcpu, inti);
-               break;
-       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               rc = __deliver_io(vcpu, inti);
-               break;
-       default:
-               BUG();
-       }
-
-       return rc;
-}
-
 /* Check whether an external call is pending (deliverable or not) */
 int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
 {
@@ -751,21 +808,9 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
 
 int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
 {
-       struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
-       struct kvm_s390_interrupt_info  *inti;
        int rc;
 
-       rc = !!deliverable_local_irqs(vcpu);
-
-       if ((!rc) && atomic_read(&fi->active)) {
-               spin_lock(&fi->lock);
-               list_for_each_entry(inti, &fi->list, list)
-                       if (__interrupt_is_deliverable(vcpu, inti)) {
-                               rc = 1;
-                               break;
-                       }
-               spin_unlock(&fi->lock);
-       }
+       rc = !!deliverable_irqs(vcpu);
 
        if (!rc && kvm_cpu_has_pending_timer(vcpu))
                rc = 1;
@@ -784,12 +829,7 @@ int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       if (!(vcpu->arch.sie_block->ckc <
-             get_tod_clock_fast() + vcpu->arch.sie_block->epoch))
-               return 0;
-       if (!ckc_interrupts_enabled(vcpu))
-               return 0;
-       return 1;
+       return ckc_irq_pending(vcpu) || cpu_timer_irq_pending(vcpu);
 }
 
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
@@ -884,60 +924,45 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
 int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-       struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
-       struct kvm_s390_interrupt_info  *n, *inti = NULL;
        deliver_irq_t func;
-       int deliver;
        int rc = 0;
        unsigned long irq_type;
-       unsigned long deliverable_irqs;
+       unsigned long irqs;
 
        __reset_intercept_indicators(vcpu);
 
        /* pending ckc conditions might have been invalidated */
        clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
-       if (kvm_cpu_has_pending_timer(vcpu))
+       if (ckc_irq_pending(vcpu))
                set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
 
+       /* pending cpu timer conditions might have been invalidated */
+       clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
+       if (cpu_timer_irq_pending(vcpu))
+               set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
+
        do {
-               deliverable_irqs = deliverable_local_irqs(vcpu);
+               irqs = deliverable_irqs(vcpu);
                /* bits are in the order of interrupt priority */
-               irq_type = find_first_bit(&deliverable_irqs, IRQ_PEND_COUNT);
+               irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT);
                if (irq_type == IRQ_PEND_COUNT)
                        break;
-               func = deliver_irq_funcs[irq_type];
-               if (!func) {
-                       WARN_ON_ONCE(func == NULL);
-                       clear_bit(irq_type, &li->pending_irqs);
-                       continue;
+               if (is_ioirq(irq_type)) {
+                       rc = __deliver_io(vcpu, irq_type);
+               } else {
+                       func = deliver_irq_funcs[irq_type];
+                       if (!func) {
+                               WARN_ON_ONCE(func == NULL);
+                               clear_bit(irq_type, &li->pending_irqs);
+                               continue;
+                       }
+                       rc = func(vcpu);
                }
-               rc = func(vcpu);
-       } while (!rc && irq_type != IRQ_PEND_COUNT);
+               if (rc)
+                       break;
+       } while (!rc);
 
-       set_intercept_indicators_local(vcpu);
-
-       if (!rc && atomic_read(&fi->active)) {
-               do {
-                       deliver = 0;
-                       spin_lock(&fi->lock);
-                       list_for_each_entry_safe(inti, n, &fi->list, list) {
-                               if (__interrupt_is_deliverable(vcpu, inti)) {
-                                       list_del(&inti->list);
-                                       fi->irq_count--;
-                                       deliver = 1;
-                                       break;
-                               }
-                               __set_intercept_indicator(vcpu, inti);
-                       }
-                       if (list_empty(&fi->list))
-                               atomic_set(&fi->active, 0);
-                       spin_unlock(&fi->lock);
-                       if (deliver) {
-                               rc = __deliver_floating_interrupt(vcpu, inti);
-                               kfree(inti);
-                       }
-               } while (!rc && deliver);
-       }
+       set_intercept_indicators(vcpu);
 
        return rc;
 }
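kvm_s390_deliver_pending_interrupts() now walks one combined pending mask with find_first_bit(), so the bit positions of the IRQ_PEND_* enum directly encode delivery priority and floating interrupts are interleaved with local ones in that order. A minimal sketch of the lowest-bit-first scan, using a GCC builtin as a stand-in for find_first_bit():

#include <stdio.h>

int main(void)
{
        /* pretend bits 2, 5 and 9 of the pending-IRQ word are set */
        unsigned long pending = (1UL << 5) | (1UL << 2) | (1UL << 9);

        while (pending) {
                int irq_type = __builtin_ctzl(pending);     /* lowest set bit */

                printf("deliver irq_type %d\n", irq_type);  /* 2, then 5, then 9 */
                pending &= ~(1UL << irq_type);
        }
        return 0;
}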
@@ -1172,80 +1197,182 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm,
+                                                 int isc, u32 schid)
+{
+       struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+       struct list_head *isc_list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
+       struct kvm_s390_interrupt_info *iter;
+       u16 id = (schid & 0xffff0000U) >> 16;
+       u16 nr = schid & 0x0000ffffU;
 
+       spin_lock(&fi->lock);
+       list_for_each_entry(iter, isc_list, list) {
+               if (schid && (id != iter->io.subchannel_id ||
+                             nr != iter->io.subchannel_nr))
+                       continue;
+               /* found an appropriate entry */
+               list_del_init(&iter->list);
+               fi->counters[FIRQ_CNTR_IO] -= 1;
+               if (list_empty(isc_list))
+                       clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+               spin_unlock(&fi->lock);
+               return iter;
+       }
+       spin_unlock(&fi->lock);
+       return NULL;
+}
+
+/*
+ * Dequeue and return an I/O interrupt matching any of the interruption
+ * subclasses as designated by the isc mask in cr6 and the schid (if != 0).
+ */
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
-                                                   u64 cr6, u64 schid)
+                                                   u64 isc_mask, u32 schid)
+{
+       struct kvm_s390_interrupt_info *inti = NULL;
+       int isc;
+
+       for (isc = 0; isc <= MAX_ISC && !inti; isc++) {
+               if (isc_mask & isc_to_isc_bits(isc))
+                       inti = get_io_int(kvm, isc, schid);
+       }
+       return inti;
+}
+
+#define SCCB_MASK 0xFFFFFFF8
+#define SCCB_EVENT_PENDING 0x3
+
+static int __inject_service(struct kvm *kvm,
+                            struct kvm_s390_interrupt_info *inti)
+{
+       struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+
+       spin_lock(&fi->lock);
+       fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING;
+       /*
+        * Early versions of the QEMU s390 bios will inject several
+        * service interrupts one after another without handling a
+        * condition code indicating busy.
+        * We will silently ignore those superfluous sccb values.
+        * A future version of QEMU will take care of serialization
+        * of servc requests.
+        */
+       if (fi->srv_signal.ext_params & SCCB_MASK)
+               goto out;
+       fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_MASK;
+       set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs);
+out:
+       spin_unlock(&fi->lock);
+       kfree(inti);
+       return 0;
+}
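__inject_service() folds every injected service signal into the single fi->srv_signal slot: the SCCB_EVENT_PENDING bits always accumulate, while a new SCCB address (the SCCB_MASK part of ext_params) is only latched when no address is pending yet, which is what silently drops the superfluous injections mentioned in the comment. A small sketch of that merge, with merge_srv() as a purely illustrative helper:

#include <stdio.h>

#define SCCB_MASK          0xFFFFFFF8u
#define SCCB_EVENT_PENDING 0x3u

/* illustrative helper mirroring the merge in __inject_service() */
static unsigned int merge_srv(unsigned int latched, unsigned int ext_params)
{
        latched |= ext_params & SCCB_EVENT_PENDING;
        if (!(latched & SCCB_MASK))
                latched |= ext_params & SCCB_MASK;
        return latched;
}

int main(void)
{
        unsigned int srv = 0;

        srv = merge_srv(srv, 0x12345678);          /* first SCCB is latched      */
        srv = merge_srv(srv, 0x87654320);          /* second address is dropped  */
        srv = merge_srv(srv, SCCB_EVENT_PENDING);  /* pending bits still merge   */
        printf("srv_signal.ext_params = %#x\n", srv);  /* 0x1234567b */
        return 0;
}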
+
+static int __inject_virtio(struct kvm *kvm,
+                           struct kvm_s390_interrupt_info *inti)
+{
+       struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+
+       spin_lock(&fi->lock);
+       if (fi->counters[FIRQ_CNTR_VIRTIO] >= KVM_S390_MAX_VIRTIO_IRQS) {
+               spin_unlock(&fi->lock);
+               return -EBUSY;
+       }
+       fi->counters[FIRQ_CNTR_VIRTIO] += 1;
+       list_add_tail(&inti->list, &fi->lists[FIRQ_LIST_VIRTIO]);
+       set_bit(IRQ_PEND_VIRTIO, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
+       return 0;
+}
+
+static int __inject_pfault_done(struct kvm *kvm,
+                                struct kvm_s390_interrupt_info *inti)
+{
+       struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+
+       spin_lock(&fi->lock);
+       if (fi->counters[FIRQ_CNTR_PFAULT] >=
+               (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)) {
+               spin_unlock(&fi->lock);
+               return -EBUSY;
+       }
+       fi->counters[FIRQ_CNTR_PFAULT] += 1;
+       list_add_tail(&inti->list, &fi->lists[FIRQ_LIST_PFAULT]);
+       set_bit(IRQ_PEND_PFAULT_DONE, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
+       return 0;
+}
+
+#define CR_PENDING_SUBCLASS 28
+static int __inject_float_mchk(struct kvm *kvm,
+                               struct kvm_s390_interrupt_info *inti)
+{
+       struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+
+       spin_lock(&fi->lock);
+       fi->mchk.cr14 |= inti->mchk.cr14 & (1UL << CR_PENDING_SUBCLASS);
+       fi->mchk.mcic |= inti->mchk.mcic;
+       set_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs);
+       spin_unlock(&fi->lock);
+       kfree(inti);
+       return 0;
+}
+
+static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
        struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info *inti, *iter;
+       struct list_head *list;
+       int isc;
 
-       if ((!schid && !cr6) || (schid && cr6))
-               return NULL;
        fi = &kvm->arch.float_int;
        spin_lock(&fi->lock);
-       inti = NULL;
-       list_for_each_entry(iter, &fi->list, list) {
-               if (!is_ioint(iter->type))
-                       continue;
-               if (cr6 &&
-                   ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
-                       continue;
-               if (schid) {
-                       if (((schid & 0x00000000ffff0000) >> 16) !=
-                           iter->io.subchannel_id)
-                               continue;
-                       if ((schid & 0x000000000000ffff) !=
-                           iter->io.subchannel_nr)
-                               continue;
-               }
-               inti = iter;
-               break;
-       }
-       if (inti) {
-               list_del_init(&inti->list);
-               fi->irq_count--;
+       if (fi->counters[FIRQ_CNTR_IO] >= KVM_S390_MAX_FLOAT_IRQS) {
+               spin_unlock(&fi->lock);
+               return -EBUSY;
        }
-       if (list_empty(&fi->list))
-               atomic_set(&fi->active, 0);
+       fi->counters[FIRQ_CNTR_IO] += 1;
+
+       isc = int_word_to_isc(inti->io.io_int_word);
+       list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
+       list_add_tail(&inti->list, list);
+       set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
        spin_unlock(&fi->lock);
-       return inti;
+       return 0;
 }
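__inject_io() now queues each I/O interrupt on the per-ISC list selected by int_word_to_isc() and marks the matching IRQ_PEND_IO_ISC_* bit, replacing the old single sorted list. A minimal sketch of the subclass extraction, assuming the ISC occupies bits 2-4 of the I/O interruption word (the 0x38000000 mask also used by int_word_to_isc_bits()):

#include <stdio.h>

/* stand-in for int_word_to_isc(); assumes the ISC sits in bits 2-4 of the word */
static int int_word_to_isc(unsigned int int_word)
{
        return (int_word & 0x38000000) >> 27;
}

int main(void)
{
        unsigned int io_int_word = 0x18000000;   /* example word carrying ISC 3 */

        printf("queue on fi->lists[FIRQ_LIST_IO_ISC_0 + %d]\n",
               int_word_to_isc(io_int_word));
        return 0;
}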
 
 static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
        struct kvm_s390_local_interrupt *li;
        struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info *iter;
        struct kvm_vcpu *dst_vcpu = NULL;
        int sigcpu;
-       int rc = 0;
+       u64 type = READ_ONCE(inti->type);
+       int rc;
 
        fi = &kvm->arch.float_int;
-       spin_lock(&fi->lock);
-       if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
+
+       switch (type) {
+       case KVM_S390_MCHK:
+               rc = __inject_float_mchk(kvm, inti);
+               break;
+       case KVM_S390_INT_VIRTIO:
+               rc = __inject_virtio(kvm, inti);
+               break;
+       case KVM_S390_INT_SERVICE:
+               rc = __inject_service(kvm, inti);
+               break;
+       case KVM_S390_INT_PFAULT_DONE:
+               rc = __inject_pfault_done(kvm, inti);
+               break;
+       case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+               rc = __inject_io(kvm, inti);
+               break;
+       default:
                rc = -EINVAL;
-               goto unlock_fi;
        }
-       fi->irq_count++;
-       if (!is_ioint(inti->type)) {
-               list_add_tail(&inti->list, &fi->list);
-       } else {
-               u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+       if (rc)
+               return rc;
 
-               /* Keep I/O interrupts sorted in isc order. */
-               list_for_each_entry(iter, &fi->list, list) {
-                       if (!is_ioint(iter->type))
-                               continue;
-                       if (int_word_to_isc_bits(iter->io.io_int_word)
-                           <= isc_bits)
-                               continue;
-                       break;
-               }
-               list_add_tail(&inti->list, &iter->list);
-       }
-       atomic_set(&fi->active, 1);
-       if (atomic_read(&kvm->online_vcpus) == 0)
-               goto unlock_fi;
        sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
        if (sigcpu == KVM_MAX_VCPUS) {
                do {
@@ -1257,7 +1384,7 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
        dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
        li = &dst_vcpu->arch.local_int;
        spin_lock(&li->lock);
-       switch (inti->type) {
+       switch (type) {
        case KVM_S390_MCHK:
                atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
                break;
@@ -1270,9 +1397,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
        }
        spin_unlock(&li->lock);
        kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
-unlock_fi:
-       spin_unlock(&fi->lock);
-       return rc;
+       return 0;
+
 }
 
 int kvm_s390_inject_vm(struct kvm *kvm,
@@ -1332,10 +1458,10 @@ int kvm_s390_inject_vm(struct kvm *kvm,
        return rc;
 }
 
-void kvm_s390_reinject_io_int(struct kvm *kvm,
+int kvm_s390_reinject_io_int(struct kvm *kvm,
                              struct kvm_s390_interrupt_info *inti)
 {
-       __inject_vm(kvm, inti);
+       return __inject_vm(kvm, inti);
 }
 
 int s390int_to_s390irq(struct kvm_s390_interrupt *s390int,
@@ -1388,12 +1514,10 @@ void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu)
        spin_unlock(&li->lock);
 }
 
-int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
+static int do_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 {
-       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        int rc;
 
-       spin_lock(&li->lock);
        switch (irq->type) {
        case KVM_S390_PROGRAM_INT:
                VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)",
@@ -1433,83 +1557,130 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
        default:
                rc = -EINVAL;
        }
+
+       return rc;
+}
+
+int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
+{
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       int rc;
+
+       spin_lock(&li->lock);
+       rc = do_inject_vcpu(vcpu, irq);
        spin_unlock(&li->lock);
        if (!rc)
                kvm_s390_vcpu_wakeup(vcpu);
        return rc;
 }
 
-void kvm_s390_clear_float_irqs(struct kvm *kvm)
+static inline void clear_irq_list(struct list_head *_list)
 {
-       struct kvm_s390_float_interrupt *fi;
-       struct kvm_s390_interrupt_info  *n, *inti = NULL;
+       struct kvm_s390_interrupt_info *inti, *n;
 
-       fi = &kvm->arch.float_int;
-       spin_lock(&fi->lock);
-       list_for_each_entry_safe(inti, n, &fi->list, list) {
+       list_for_each_entry_safe(inti, n, _list, list) {
                list_del(&inti->list);
                kfree(inti);
        }
-       fi->irq_count = 0;
-       atomic_set(&fi->active, 0);
-       spin_unlock(&fi->lock);
 }
 
-static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
-                                  u8 *addr)
+static void inti_to_irq(struct kvm_s390_interrupt_info *inti,
+                      struct kvm_s390_irq *irq)
 {
-       struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
-       struct kvm_s390_irq irq = {0};
-
-       irq.type = inti->type;
+       irq->type = inti->type;
        switch (inti->type) {
        case KVM_S390_INT_PFAULT_INIT:
        case KVM_S390_INT_PFAULT_DONE:
        case KVM_S390_INT_VIRTIO:
-       case KVM_S390_INT_SERVICE:
-               irq.u.ext = inti->ext;
+               irq->u.ext = inti->ext;
                break;
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-               irq.u.io = inti->io;
+               irq->u.io = inti->io;
                break;
-       case KVM_S390_MCHK:
-               irq.u.mchk = inti->mchk;
-               break;
-       default:
-               return -EINVAL;
        }
+}
 
-       if (copy_to_user(uptr, &irq, sizeof(irq)))
-               return -EFAULT;
+void kvm_s390_clear_float_irqs(struct kvm *kvm)
+{
+       struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+       int i;
 
-       return 0;
-}
+       spin_lock(&fi->lock);
+       for (i = 0; i < FIRQ_LIST_COUNT; i++)
+               clear_irq_list(&fi->lists[i]);
+       for (i = 0; i < FIRQ_MAX_COUNT; i++)
+               fi->counters[i] = 0;
+       spin_unlock(&fi->lock);
+}
 
-static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
+static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
 {
        struct kvm_s390_interrupt_info *inti;
        struct kvm_s390_float_interrupt *fi;
+       struct kvm_s390_irq *buf;
+       struct kvm_s390_irq *irq;
+       int max_irqs;
        int ret = 0;
        int n = 0;
+       int i;
+
+       if (len > KVM_S390_FLIC_MAX_BUFFER || len == 0)
+               return -EINVAL;
+
+       /*
+        * We are already using -ENOMEM to signal
+        * userspace it may retry with a bigger buffer,
+        * so we need to use something else for this case
+        */
+       buf = vzalloc(len);
+       if (!buf)
+               return -ENOBUFS;
+
+       max_irqs = len / sizeof(struct kvm_s390_irq);
 
        fi = &kvm->arch.float_int;
        spin_lock(&fi->lock);
-
-       list_for_each_entry(inti, &fi->list, list) {
-               if (len < sizeof(struct kvm_s390_irq)) {
+       for (i = 0; i < FIRQ_LIST_COUNT; i++) {
+               list_for_each_entry(inti, &fi->lists[i], list) {
+                       if (n == max_irqs) {
+                               /* signal userspace to try again */
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       inti_to_irq(inti, &buf[n]);
+                       n++;
+               }
+       }
+       if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) {
+               if (n == max_irqs) {
                        /* signal userspace to try again */
                        ret = -ENOMEM;
-                       break;
+                       goto out;
                }
-               ret = copy_irq_to_user(inti, buf);
-               if (ret)
-                       break;
-               buf += sizeof(struct kvm_s390_irq);
-               len -= sizeof(struct kvm_s390_irq);
+               irq = (struct kvm_s390_irq *) &buf[n];
+               irq->type = KVM_S390_INT_SERVICE;
+               irq->u.ext = fi->srv_signal;
                n++;
        }
+       if (test_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
+               if (n == max_irqs) {
+                       /* signal userspace to try again */
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               irq = (struct kvm_s390_irq *) &buf[n];
+               irq->type = KVM_S390_MCHK;
+               irq->u.mchk = fi->mchk;
+               n++;
+       }
 
+out:
        spin_unlock(&fi->lock);
+       if (!ret && n > 0) {
+               if (copy_to_user(usrbuf, buf, sizeof(struct kvm_s390_irq) * n))
+                       ret = -EFAULT;
+       }
+       vfree(buf);
 
        return ret < 0 ? ret : n;
 }
@@ -1520,7 +1691,7 @@ static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 
        switch (attr->group) {
        case KVM_DEV_FLIC_GET_ALL_IRQS:
-               r = get_all_floating_irqs(dev->kvm, (u8 *) attr->addr,
+               r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr,
                                          attr->attr);
                break;
        default:
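get_all_floating_irqs() is reached through the FLIC device's KVM_DEV_FLIC_GET_ALL_IRQS attribute, with attr->attr carrying the buffer length and attr->addr the user buffer; -ENOMEM tells userspace to retry with a larger buffer, while allocation failure is now reported as -ENOBUFS. A hedged userspace sketch of that retry loop, assuming the number of copied interrupts is propagated as the ioctl return value and that flic_fd is a FLIC device fd obtained via KVM_CREATE_DEVICE:

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* returns the number of floating IRQs copied into *out, or -1 on error */
static int fetch_floating_irqs(int flic_fd, struct kvm_s390_irq **out)
{
        unsigned long len = 64 * sizeof(struct kvm_s390_irq);
        struct kvm_s390_irq *buf = NULL;
        int n;

        do {
                free(buf);
                buf = malloc(len);
                if (!buf)
                        return -1;

                struct kvm_device_attr attr = {
                        .group = KVM_DEV_FLIC_GET_ALL_IRQS,
                        .attr  = len,                        /* buffer length */
                        .addr  = (__u64)(unsigned long)buf,
                };
                n = ioctl(flic_fd, KVM_GET_DEVICE_ATTR, &attr);
                len *= 2;               /* -ENOMEM: buffer was too small */
        } while (n < 0 && errno == ENOMEM);

        if (n < 0) {
                free(buf);
                return -1;
        }
        *out = buf;
        return n;
}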
@@ -1952,3 +2123,143 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
 {
        return -EINVAL;
 }
+
+int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *irqstate, int len)
+{
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       struct kvm_s390_irq *buf;
+       int r = 0;
+       int n;
+
+       buf = vmalloc(len);
+       if (!buf)
+               return -ENOMEM;
+
+       if (copy_from_user((void *) buf, irqstate, len)) {
+               r = -EFAULT;
+               goto out_free;
+       }
+
+       /*
+        * Don't allow setting the interrupt state
+        * when there are already interrupts pending
+        */
+       spin_lock(&li->lock);
+       if (li->pending_irqs) {
+               r = -EBUSY;
+               goto out_unlock;
+       }
+
+       for (n = 0; n < len / sizeof(*buf); n++) {
+               r = do_inject_vcpu(vcpu, &buf[n]);
+               if (r)
+                       break;
+       }
+
+out_unlock:
+       spin_unlock(&li->lock);
+out_free:
+       vfree(buf);
+
+       return r;
+}
+
+static void store_local_irq(struct kvm_s390_local_interrupt *li,
+                           struct kvm_s390_irq *irq,
+                           unsigned long irq_type)
+{
+       switch (irq_type) {
+       case IRQ_PEND_MCHK_EX:
+       case IRQ_PEND_MCHK_REP:
+               irq->type = KVM_S390_MCHK;
+               irq->u.mchk = li->irq.mchk;
+               break;
+       case IRQ_PEND_PROG:
+               irq->type = KVM_S390_PROGRAM_INT;
+               irq->u.pgm = li->irq.pgm;
+               break;
+       case IRQ_PEND_PFAULT_INIT:
+               irq->type = KVM_S390_INT_PFAULT_INIT;
+               irq->u.ext = li->irq.ext;
+               break;
+       case IRQ_PEND_EXT_EXTERNAL:
+               irq->type = KVM_S390_INT_EXTERNAL_CALL;
+               irq->u.extcall = li->irq.extcall;
+               break;
+       case IRQ_PEND_EXT_CLOCK_COMP:
+               irq->type = KVM_S390_INT_CLOCK_COMP;
+               break;
+       case IRQ_PEND_EXT_CPU_TIMER:
+               irq->type = KVM_S390_INT_CPU_TIMER;
+               break;
+       case IRQ_PEND_SIGP_STOP:
+               irq->type = KVM_S390_SIGP_STOP;
+               irq->u.stop = li->irq.stop;
+               break;
+       case IRQ_PEND_RESTART:
+               irq->type = KVM_S390_RESTART;
+               break;
+       case IRQ_PEND_SET_PREFIX:
+               irq->type = KVM_S390_SIGP_SET_PREFIX;
+               irq->u.prefix = li->irq.prefix;
+               break;
+       }
+}
+
+int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
+{
+       uint8_t sigp_ctrl = vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+       unsigned long sigp_emerg_pending[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       unsigned long pending_irqs;
+       struct kvm_s390_irq irq;
+       unsigned long irq_type;
+       int cpuaddr;
+       int n = 0;
+
+       spin_lock(&li->lock);
+       pending_irqs = li->pending_irqs;
+       memcpy(&sigp_emerg_pending, &li->sigp_emerg_pending,
+              sizeof(sigp_emerg_pending));
+       spin_unlock(&li->lock);
+
+       for_each_set_bit(irq_type, &pending_irqs, IRQ_PEND_COUNT) {
+               memset(&irq, 0, sizeof(irq));
+               if (irq_type == IRQ_PEND_EXT_EMERGENCY)
+                       continue;
+               if (n + sizeof(irq) > len)
+                       return -ENOBUFS;
+               store_local_irq(&vcpu->arch.local_int, &irq, irq_type);
+               if (copy_to_user(&buf[n], &irq, sizeof(irq)))
+                       return -EFAULT;
+               n += sizeof(irq);
+       }
+
+       if (test_bit(IRQ_PEND_EXT_EMERGENCY, &pending_irqs)) {
+               for_each_set_bit(cpuaddr, sigp_emerg_pending, KVM_MAX_VCPUS) {
+                       memset(&irq, 0, sizeof(irq));
+                       if (n + sizeof(irq) > len)
+                               return -ENOBUFS;
+                       irq.type = KVM_S390_INT_EMERGENCY;
+                       irq.u.emerg.code = cpuaddr;
+                       if (copy_to_user(&buf[n], &irq, sizeof(irq)))
+                               return -EFAULT;
+                       n += sizeof(irq);
+               }
+       }
+
+       if ((sigp_ctrl & SIGP_CTRL_C) &&
+           (atomic_read(&vcpu->arch.sie_block->cpuflags) &
+            CPUSTAT_ECALL_PEND)) {
+               if (n + sizeof(irq) > len)
+                       return -ENOBUFS;
+               memset(&irq, 0, sizeof(irq));
+               irq.type = KVM_S390_INT_EXTERNAL_CALL;
+               irq.u.extcall.code = sigp_ctrl & SIGP_CTRL_SCN_MASK;
+               if (copy_to_user(&buf[n], &irq, sizeof(irq)))
+                       return -EFAULT;
+               n += sizeof(irq);
+       }
+
+       return n;
+}
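kvm_s390_set_irq_state() and kvm_s390_get_irq_state() back the new KVM_S390_SET_IRQ_STATE / KVM_S390_GET_IRQ_STATE VCPU ioctls (advertised via KVM_CAP_S390_IRQ_STATE), so userspace can migrate a VCPU's locally pending interrupts. A hedged sketch, assuming struct kvm_s390_irq_state carries a buf pointer and a len in bytes, and that the GET ioctl returns the number of bytes written:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* 256 entries is an arbitrary illustration; pick a real bound in production */
#define IRQ_STATE_BUF_LEN (256 * sizeof(struct kvm_s390_irq))

static int migrate_vcpu_irq_state(int src_vcpu_fd, int dst_vcpu_fd)
{
        char buf[IRQ_STATE_BUF_LEN];
        struct kvm_s390_irq_state state = {
                .buf = (__u64)(unsigned long)buf,
                .len = sizeof(buf),
        };
        int n;

        /* -1 with errno ENOBUFS when the buffer is too small */
        n = ioctl(src_vcpu_fd, KVM_S390_GET_IRQ_STATE, &state);
        if (n < 0)
                return n;

        state.len = n;                  /* bytes actually filled in */
        /* fails with EBUSY if the destination already has IRQs pending */
        return ioctl(dst_vcpu_fd, KVM_S390_SET_IRQ_STATE, &state);
}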
index 19e17bd7aec09b2662874a3925e3d55f4e4207f4..afa2bd750ffc814d36300bc491a0a6eff516bda4 100644 (file)
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
+#include <linux/vmalloc.h>
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
 #include <asm/pgtable.h>
 #include <asm/nmi.h>
 #include <asm/switch_to.h>
+#include <asm/isc.h>
 #include <asm/sclp.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "trace.h"
 #include "trace-s390.h"
 
+#define MEM_OP_MAX_SIZE 65536  /* Maximum transfer size for KVM_S390_MEM_OP */
+#define LOCAL_IRQS 32
+#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
+                          (KVM_MAX_VCPUS + LOCAL_IRQS))
+
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -87,6 +94,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
        { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
        { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
+       { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
        { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
        { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
        { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
@@ -101,8 +109,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[] = {
-       0xff82fffbf4fc2000UL,
-       0x005c000000000000UL,
+       0xffe6fffbfcfdfc40UL,
+       0x205c800000000000UL,
 };
 
 unsigned long kvm_s390_fac_list_mask_size(void)
@@ -171,9 +179,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_IRQCHIP:
        case KVM_CAP_VM_ATTRIBUTES:
        case KVM_CAP_MP_STATE:
+       case KVM_CAP_S390_INJECT_IRQ:
        case KVM_CAP_S390_USER_SIGP:
+       case KVM_CAP_S390_USER_STSI:
+       case KVM_CAP_S390_SKEYS:
+       case KVM_CAP_S390_IRQ_STATE:
                r = 1;
                break;
+       case KVM_CAP_S390_MEM_OP:
+               r = MEM_OP_MAX_SIZE;
+               break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
                r = KVM_MAX_VCPUS;
@@ -184,6 +199,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_COW:
                r = MACHINE_HAS_ESOP;
                break;
+       case KVM_CAP_S390_VECTOR_REGISTERS:
+               r = MACHINE_HAS_VX;
+               break;
        default:
                r = 0;
        }
@@ -264,6 +282,18 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
                kvm->arch.user_sigp = 1;
                r = 0;
                break;
+       case KVM_CAP_S390_VECTOR_REGISTERS:
+               if (MACHINE_HAS_VX) {
+                       set_kvm_facility(kvm->arch.model.fac->mask, 129);
+                       set_kvm_facility(kvm->arch.model.fac->list, 129);
+                       r = 0;
+               } else
+                       r = -EINVAL;
+               break;
+       case KVM_CAP_S390_USER_STSI:
+               kvm->arch.user_stsi = 1;
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
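KVM_CAP_S390_VECTOR_REGISTERS is enabled per VM through KVM_ENABLE_CAP and simply sets facility 129 in the guest's facility mask and list; it fails with -EINVAL when the host lacks the vector facility. A hedged userspace sketch (vm_fd is a KVM VM fd; enabling before any VCPU is created is assumed so the facility propagates to all VCPUs):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_vector_regs(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_S390_VECTOR_REGISTERS,
        };

        /* -1 with errno EINVAL when the host has no vector facility */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}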
@@ -708,6 +738,108 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
        return ret;
 }
 
+static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+{
+       uint8_t *keys;
+       uint64_t hva;
+       unsigned long curkey;
+       int i, r = 0;
+
+       if (args->flags != 0)
+               return -EINVAL;
+
+       /* Is this guest using storage keys? */
+       if (!mm_use_skey(current->mm))
+               return KVM_S390_GET_SKEYS_NONE;
+
+       /* Enforce sane limit on memory allocation */
+       if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
+               return -EINVAL;
+
+       keys = kmalloc_array(args->count, sizeof(uint8_t),
+                            GFP_KERNEL | __GFP_NOWARN);
+       if (!keys)
+               keys = vmalloc(sizeof(uint8_t) * args->count);
+       if (!keys)
+               return -ENOMEM;
+
+       for (i = 0; i < args->count; i++) {
+               hva = gfn_to_hva(kvm, args->start_gfn + i);
+               if (kvm_is_error_hva(hva)) {
+                       r = -EFAULT;
+                       goto out;
+               }
+
+               curkey = get_guest_storage_key(current->mm, hva);
+               if (IS_ERR_VALUE(curkey)) {
+                       r = curkey;
+                       goto out;
+               }
+               keys[i] = curkey;
+       }
+
+       r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+                        sizeof(uint8_t) * args->count);
+       if (r)
+               r = -EFAULT;
+out:
+       kvfree(keys);
+       return r;
+}
+
+static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+{
+       uint8_t *keys;
+       uint64_t hva;
+       int i, r = 0;
+
+       if (args->flags != 0)
+               return -EINVAL;
+
+       /* Enforce sane limit on memory allocation */
+       if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
+               return -EINVAL;
+
+       keys = kmalloc_array(args->count, sizeof(uint8_t),
+                            GFP_KERNEL | __GFP_NOWARN);
+       if (!keys)
+               keys = vmalloc(sizeof(uint8_t) * args->count);
+       if (!keys)
+               return -ENOMEM;
+
+       r = copy_from_user(keys, (uint8_t __user *)args->skeydata_addr,
+                          sizeof(uint8_t) * args->count);
+       if (r) {
+               r = -EFAULT;
+               goto out;
+       }
+
+       /* Enable storage key handling for the guest */
+       s390_enable_skey();
+
+       for (i = 0; i < args->count; i++) {
+               hva = gfn_to_hva(kvm, args->start_gfn + i);
+               if (kvm_is_error_hva(hva)) {
+                       r = -EFAULT;
+                       goto out;
+               }
+
+               /* Lowest order bit is reserved */
+               if (keys[i] & 0x01) {
+                       r = -EINVAL;
+                       goto out;
+               }
+
+               r = set_guest_storage_key(current->mm, hva,
+                                         (unsigned long)keys[i], 0);
+               if (r)
+                       goto out;
+       }
+out:
+       kvfree(keys);
+       return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -767,6 +899,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_s390_vm_has_attr(kvm, &attr);
                break;
        }
+       case KVM_S390_GET_SKEYS: {
+               struct kvm_s390_skeys args;
+
+               r = -EFAULT;
+               if (copy_from_user(&args, argp,
+                                  sizeof(struct kvm_s390_skeys)))
+                       break;
+               r = kvm_s390_get_skeys(kvm, &args);
+               break;
+       }
+       case KVM_S390_SET_SKEYS: {
+               struct kvm_s390_skeys args;
+
+               r = -EFAULT;
+               if (copy_from_user(&args, argp,
+                                  sizeof(struct kvm_s390_skeys)))
+                       break;
+               r = kvm_s390_set_skeys(kvm, &args);
+               break;
+       }
        default:
                r = -ENOTTY;
        }
@@ -887,7 +1039,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        kvm->arch.dbf = debug_register(debug_name, 8, 2, 8 * sizeof(long));
        if (!kvm->arch.dbf)
-               goto out_nodbf;
+               goto out_err;
 
        /*
         * The architectural maximum amount of facilities is 16 kbit. To store
@@ -899,7 +1051,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.model.fac =
                (struct kvm_s390_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
        if (!kvm->arch.model.fac)
-               goto out_nofac;
+               goto out_err;
 
        /* Populate the facility mask initially. */
        memcpy(kvm->arch.model.fac->mask, S390_lowcore.stfle_fac_list,
@@ -919,10 +1071,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.model.ibc = sclp_get_ibc() & 0x0fff;
 
        if (kvm_s390_crypto_init(kvm) < 0)
-               goto out_crypto;
+               goto out_err;
 
        spin_lock_init(&kvm->arch.float_int.lock);
-       INIT_LIST_HEAD(&kvm->arch.float_int.list);
+       for (i = 0; i < FIRQ_LIST_COUNT; i++)
+               INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]);
        init_waitqueue_head(&kvm->arch.ipte_wq);
        mutex_init(&kvm->arch.ipte_mutex);
 
@@ -934,7 +1087,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        } else {
                kvm->arch.gmap = gmap_alloc(current->mm, (1UL << 44) - 1);
                if (!kvm->arch.gmap)
-                       goto out_nogmap;
+                       goto out_err;
                kvm->arch.gmap->private = kvm;
                kvm->arch.gmap->pfault_enabled = 0;
        }
@@ -946,15 +1099,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        spin_lock_init(&kvm->arch.start_stop_lock);
 
        return 0;
-out_nogmap:
+out_err:
        kfree(kvm->arch.crypto.crycb);
-out_crypto:
        free_page((unsigned long)kvm->arch.model.fac);
-out_nofac:
        debug_unregister(kvm->arch.dbf);
-out_nodbf:
        free_page((unsigned long)(kvm->arch.sca));
-out_err:
        return rc;
 }
 
@@ -1034,6 +1183,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                                    KVM_SYNC_CRS |
                                    KVM_SYNC_ARCH0 |
                                    KVM_SYNC_PFAULT;
+       if (test_kvm_facility(vcpu->kvm, 129))
+               vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
 
        if (kvm_is_ucontrol(vcpu->kvm))
                return __kvm_ucontrol_vcpu_init(vcpu);
@@ -1044,10 +1195,18 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        save_fp_ctl(&vcpu->arch.host_fpregs.fpc);
-       save_fp_regs(vcpu->arch.host_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129))
+               save_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
+       else
+               save_fp_regs(vcpu->arch.host_fpregs.fprs);
        save_access_regs(vcpu->arch.host_acrs);
-       restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               restore_fp_ctl(&vcpu->run->s.regs.fpc);
+               restore_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       } else {
+               restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
+               restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       }
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@ -1057,11 +1216,19 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
-       save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       save_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               save_fp_ctl(&vcpu->run->s.regs.fpc);
+               save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       } else {
+               save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
+               save_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       }
        save_access_regs(vcpu->run->s.regs.acrs);
        restore_fp_ctl(&vcpu->arch.host_fpregs.fpc);
-       restore_fp_regs(vcpu->arch.host_fpregs.fprs);
+       if (test_kvm_facility(vcpu->kvm, 129))
+               restore_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
+       else
+               restore_fp_regs(vcpu->arch.host_fpregs.fprs);
        restore_access_regs(vcpu->arch.host_acrs);
 }
 
@@ -1129,6 +1296,15 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
+{
+       struct kvm_s390_cpu_model *model = &vcpu->kvm->arch.model;
+
+       vcpu->arch.cpu_id = model->cpu_id;
+       vcpu->arch.sie_block->ibc = model->ibc;
+       vcpu->arch.sie_block->fac = (int) (long) model->fac->list;
+}
+
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        int rc = 0;
@@ -1137,6 +1313,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
                                                    CPUSTAT_SM |
                                                    CPUSTAT_STOPPED |
                                                    CPUSTAT_GED);
+       kvm_s390_vcpu_setup_model(vcpu);
+
        vcpu->arch.sie_block->ecb   = 6;
        if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
                vcpu->arch.sie_block->ecb |= 0x10;
@@ -1147,8 +1325,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
                vcpu->arch.sie_block->eca |= 1;
        if (sclp_has_sigpif())
                vcpu->arch.sie_block->eca |= 0x10000000U;
-       vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE |
-                                     ICTL_TPROT;
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               vcpu->arch.sie_block->eca |= 0x00020000;
+               vcpu->arch.sie_block->ecd |= 0x20000000;
+       }
+       vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 
        if (kvm_s390_cmma_enabled(vcpu->kvm)) {
                rc = kvm_s390_vcpu_setup_cmma(vcpu);
@@ -1158,11 +1339,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 
-       mutex_lock(&vcpu->kvm->lock);
-       vcpu->arch.cpu_id = vcpu->kvm->arch.model.cpu_id;
-       vcpu->arch.sie_block->ibc = vcpu->kvm->arch.model.ibc;
-       mutex_unlock(&vcpu->kvm->lock);
-
        kvm_s390_vcpu_crypto_setup(vcpu);
 
        return rc;
@@ -1190,6 +1366,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+       vcpu->arch.host_vregs = &sie_page->vregs;
 
        vcpu->arch.sie_block->icpua = id;
        if (!kvm_is_ucontrol(kvm)) {
@@ -1205,7 +1382,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
                set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
        }
-       vcpu->arch.sie_block->fac = (int) (long) kvm->arch.model.fac->list;
 
        spin_lock_init(&vcpu->arch.local_int.lock);
        vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1725,6 +1901,31 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
+{
+       psw_t *psw = &vcpu->arch.sie_block->gpsw;
+       u8 opcode;
+       int rc;
+
+       VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+       trace_kvm_s390_sie_fault(vcpu);
+
+       /*
+        * We want to inject an addressing exception, which is defined as a
+        * suppressing or terminating exception. However, since we came here
+        * by a DAT access exception, the PSW still points to the faulting
+        * instruction since DAT exceptions are nullifying. So we've got
+        * to look up the current opcode to get the length of the instruction
+        * to be able to forward the PSW.
+        */
+       rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
+       if (rc)
+               return kvm_s390_inject_prog_cond(vcpu, rc);
+       psw->addr = __rewind_psw(*psw, -insn_length(opcode));
+
+       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+}
+
 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 {
        int rc = -1;
@@ -1756,11 +1957,8 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
                }
        }
 
-       if (rc == -1) {
-               VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
-               trace_kvm_s390_sie_fault(vcpu);
-               rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       }
+       if (rc == -1)
+               rc = vcpu_post_run_fault_in_sie(vcpu);
 
        memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
 
@@ -1976,6 +2174,35 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
        return kvm_s390_store_status_unloaded(vcpu, addr);
 }
 
+/*
+ * store additional status at address
+ */
+int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
+                                       unsigned long gpa)
+{
+       /* Only bits 0-53 are used for address formation */
+       if (!(gpa & ~0x3ff))
+               return 0;
+
+       return write_guest_abs(vcpu, gpa & ~0x3ff,
+                              (void *)&vcpu->run->s.regs.vrs, 512);
+}
+
+int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+       if (!test_kvm_facility(vcpu->kvm, 129))
+               return 0;
+
+       /*
+        * The guest VXRS are in the host VXRS due to the lazy
+        * copying in vcpu load/put. Let's update our copies before we save
+        * them into the save area.
+        */
+       save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+
+       return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
+}
+
 static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
        kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
@@ -2100,6 +2327,65 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
        return r;
 }
 
+static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
+                                 struct kvm_s390_mem_op *mop)
+{
+       void __user *uaddr = (void __user *)mop->buf;
+       void *tmpbuf = NULL;
+       int r, srcu_idx;
+       const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
+                                   | KVM_S390_MEMOP_F_CHECK_ONLY;
+
+       if (mop->flags & ~supported_flags)
+               return -EINVAL;
+
+       if (mop->size > MEM_OP_MAX_SIZE)
+               return -E2BIG;
+
+       if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
+               tmpbuf = vmalloc(mop->size);
+               if (!tmpbuf)
+                       return -ENOMEM;
+       }
+
+       srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       switch (mop->op) {
+       case KVM_S390_MEMOP_LOGICAL_READ:
+               if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+                       break;
+               }
+               r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+               if (r == 0) {
+                       if (copy_to_user(uaddr, tmpbuf, mop->size))
+                               r = -EFAULT;
+               }
+               break;
+       case KVM_S390_MEMOP_LOGICAL_WRITE:
+               if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+                       r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+                       break;
+               }
+               if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+                       r = -EFAULT;
+                       break;
+               }
+               r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
+               break;
+       default:
+               r = -EINVAL;
+       }
+
+       srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+
+       if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
+               kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+
+       vfree(tmpbuf);
+       return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
@@ -2109,6 +2395,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        long r;
 
        switch (ioctl) {
+       case KVM_S390_IRQ: {
+               struct kvm_s390_irq s390irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
+                       break;
+               r = kvm_s390_inject_vcpu(vcpu, &s390irq);
+               break;
+       }
        case KVM_S390_INTERRUPT: {
                struct kvm_s390_interrupt s390int;
                struct kvm_s390_irq s390irq;
@@ -2199,6 +2494,47 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
                break;
        }
+       case KVM_S390_MEM_OP: {
+               struct kvm_s390_mem_op mem_op;
+
+               if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0)
+                       r = kvm_s390_guest_mem_op(vcpu, &mem_op);
+               else
+                       r = -EFAULT;
+               break;
+       }
+       case KVM_S390_SET_IRQ_STATE: {
+               struct kvm_s390_irq_state irq_state;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq_state, argp, sizeof(irq_state)))
+                       break;
+               if (irq_state.len > VCPU_IRQS_MAX_BUF ||
+                   irq_state.len == 0 ||
+                   irq_state.len % sizeof(struct kvm_s390_irq) > 0) {
+                       r = -EINVAL;
+                       break;
+               }
+               r = kvm_s390_set_irq_state(vcpu,
+                                          (void __user *) irq_state.buf,
+                                          irq_state.len);
+               break;
+       }
+       case KVM_S390_GET_IRQ_STATE: {
+               struct kvm_s390_irq_state irq_state;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq_state, argp, sizeof(irq_state)))
+                       break;
+               if (irq_state.len == 0) {
+                       r = -EINVAL;
+                       break;
+               }
+               r = kvm_s390_get_irq_state(vcpu,
+                                          (__u8 __user *)  irq_state.buf,
+                                          irq_state.len);
+               break;
+       }
        default:
                r = -ENOTTY;
        }
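
Illustrative sketch only (not part of the patch): the hunk above wires up the new
KVM_S390_GET_SKEYS VM ioctl. A minimal user-space caller could look roughly like the
following; the struct fields mirror the args-> accesses in kvm_s390_get_skeys(), while
vm_fd and npages are placeholders supplied by the caller and error handling is kept to
the bare minimum.

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: read the storage keys of npages guest frames
 * starting at start_gfn into a user-space buffer. */
static int dump_skeys(int vm_fd, uint64_t start_gfn, uint64_t npages)
{
	struct kvm_s390_skeys args = {
		.start_gfn = start_gfn,
		.count     = npages,	/* kernel enforces 1..KVM_S390_SKEYS_MAX */
		.flags     = 0,
	};
	uint8_t *keys = calloc(npages, sizeof(*keys));
	int ret;

	if (!keys)
		return -1;
	args.skeydata_addr = (uint64_t)(uintptr_t)keys;

	ret = ioctl(vm_fd, KVM_S390_GET_SKEYS, &args);
	/* on success, keys[i] holds the key of guest frame start_gfn + i */
	free(keys);
	return ret;
}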
index c34109aa552d9b1a6e5ea66f172b5c3e30ad001b..ca108b90ae5613a15e0d82a68ef0288a93d62792 100644 (file)
@@ -70,16 +70,22 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
        kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 }
 
-static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
+typedef u8 __bitwise ar_t;
+
+static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
 {
        u32 base2 = vcpu->arch.sie_block->ipb >> 28;
        u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
 
+       if (ar)
+               *ar = base2;
+
        return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
 }
 
 static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
-                                             u64 *address1, u64 *address2)
+                                             u64 *address1, u64 *address2,
+                                             ar_t *ar_b1, ar_t *ar_b2)
 {
        u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
        u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
@@ -88,6 +94,11 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
 
        *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1;
        *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
+
+       if (ar_b1)
+               *ar_b1 = base1;
+       if (ar_b2)
+               *ar_b2 = base2;
 }
 
 static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2)
@@ -98,7 +109,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2
                *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
 }
 
-static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
+static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar)
 {
        u32 base2 = vcpu->arch.sie_block->ipb >> 28;
        u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
@@ -107,14 +118,20 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
        if (disp2 & 0x80000)
                disp2+=0xfff00000;
 
+       if (ar)
+               *ar = base2;
+
        return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
 }
 
-static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
+static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar)
 {
        u32 base2 = vcpu->arch.sie_block->ipb >> 28;
        u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
 
+       if (ar)
+               *ar = base2;
+
        return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
 }
 
@@ -125,13 +142,24 @@ static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
        vcpu->arch.sie_block->gpsw.mask |= cc << 44;
 }
 
-/* test availability of facility in a kvm intance */
+/* test availability of facility in a kvm instance */
 static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr)
 {
        return __test_facility(nr, kvm->arch.model.fac->mask) &&
                __test_facility(nr, kvm->arch.model.fac->list);
 }
 
+static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
+{
+       unsigned char *ptr;
+
+       if (nr >= MAX_FACILITY_BIT)
+               return -EINVAL;
+       ptr = (unsigned char *) fac_list + (nr >> 3);
+       *ptr |= (0x80UL >> (nr & 7));
+       return 0;
+}
+
 /* are cpu states controlled by user space */
 static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 {
@@ -150,9 +178,9 @@ int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                                      struct kvm_s390_irq *irq);
 int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
-                                                   u64 cr6, u64 schid);
-void kvm_s390_reinject_io_int(struct kvm *kvm,
-                             struct kvm_s390_interrupt_info *inti);
+                                                   u64 isc_mask, u32 schid);
+int kvm_s390_reinject_io_int(struct kvm *kvm,
+                            struct kvm_s390_interrupt_info *inti);
 int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in intercept.c */
@@ -177,7 +205,10 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 /* implemented in kvm-s390.c */
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
+int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
+                                       unsigned long addr);
 int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
+int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
 void s390_vcpu_block(struct kvm_vcpu *vcpu);
@@ -241,6 +272,10 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu);
 extern struct kvm_device_ops kvm_flic_ops;
 int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu);
 void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu);
+int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu,
+                          void __user *buf, int len);
+int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu,
+                          __u8 __user *buf, int len);
 
 /* implemented in guestdbg.c */
 void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
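
As a quick sanity check (illustrative, not from the patch): set_kvm_facility() above
indexes the STFLE-style facility list MSB-first within each byte, so facility 129 (the
vector facility tested via test_kvm_facility() throughout this series) lands in byte 16
with mask 0x40.

#include <stdio.h>

int main(void)
{
	unsigned long nr = 129;	/* vector facility */

	/* same arithmetic as set_kvm_facility(): byte index and MSB-first bit mask */
	printf("byte %lu, mask 0x%02lx\n", nr >> 3, 0x80UL >> (nr & 7));
	return 0;
}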
index 351116939ea27f2fcf6eb4de60447d86335d0b6e..d22d8ee1ff9d9c6404d653f5c4f8a04b8ddc70da 100644 (file)
@@ -36,15 +36,16 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
        struct kvm_vcpu *cpup;
        s64 hostclk, val;
        int i, rc;
+       ar_t ar;
        u64 op2;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       op2 = kvm_s390_get_base_disp_s(vcpu);
+       op2 = kvm_s390_get_base_disp_s(vcpu, &ar);
        if (op2 & 7)    /* Operand must be on a doubleword boundary */
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       rc = read_guest(vcpu, op2, &val, sizeof(val));
+       rc = read_guest(vcpu, op2, ar, &val, sizeof(val));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
 
@@ -68,20 +69,21 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
        u64 operand2;
        u32 address;
        int rc;
+       ar_t ar;
 
        vcpu->stat.instruction_spx++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       operand2 = kvm_s390_get_base_disp_s(vcpu);
+       operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
 
        /* must be word boundary */
        if (operand2 & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        /* get the value */
-       rc = read_guest(vcpu, operand2, &address, sizeof(address));
+       rc = read_guest(vcpu, operand2, ar, &address, sizeof(address));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
 
@@ -107,13 +109,14 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
        u64 operand2;
        u32 address;
        int rc;
+       ar_t ar;
 
        vcpu->stat.instruction_stpx++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       operand2 = kvm_s390_get_base_disp_s(vcpu);
+       operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
 
        /* must be word boundary */
        if (operand2 & 3)
@@ -122,7 +125,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
        address = kvm_s390_get_prefix(vcpu);
 
        /* get the value */
-       rc = write_guest(vcpu, operand2, &address, sizeof(address));
+       rc = write_guest(vcpu, operand2, ar, &address, sizeof(address));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
 
@@ -136,18 +139,19 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
        u16 vcpu_id = vcpu->vcpu_id;
        u64 ga;
        int rc;
+       ar_t ar;
 
        vcpu->stat.instruction_stap++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       ga = kvm_s390_get_base_disp_s(vcpu);
+       ga = kvm_s390_get_base_disp_s(vcpu, &ar);
 
        if (ga & 1)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       rc = write_guest(vcpu, ga, &vcpu_id, sizeof(vcpu_id));
+       rc = write_guest(vcpu, ga, ar, &vcpu_id, sizeof(vcpu_id));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
 
@@ -207,7 +211,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
        kvm_s390_get_regs_rre(vcpu, NULL, &reg2);
        addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
        addr = kvm_s390_logical_to_effective(vcpu, addr);
-       if (kvm_s390_check_low_addr_protection(vcpu, addr))
+       if (kvm_s390_check_low_addr_prot_real(vcpu, addr))
                return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
        addr = kvm_s390_real_to_abs(vcpu, addr);
 
@@ -229,18 +233,20 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
        struct kvm_s390_interrupt_info *inti;
        unsigned long len;
        u32 tpi_data[3];
-       int cc, rc;
+       int rc;
        u64 addr;
+       ar_t ar;
 
-       rc = 0;
-       addr = kvm_s390_get_base_disp_s(vcpu);
+       addr = kvm_s390_get_base_disp_s(vcpu, &ar);
        if (addr & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       cc = 0;
+
        inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->arch.sie_block->gcr[6], 0);
-       if (!inti)
-               goto no_interrupt;
-       cc = 1;
+       if (!inti) {
+               kvm_s390_set_psw_cc(vcpu, 0);
+               return 0;
+       }
+
        tpi_data[0] = inti->io.subchannel_id << 16 | inti->io.subchannel_nr;
        tpi_data[1] = inti->io.io_int_parm;
        tpi_data[2] = inti->io.io_int_word;
@@ -250,40 +256,51 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
                 * provided area.
                 */
                len = sizeof(tpi_data) - 4;
-               rc = write_guest(vcpu, addr, &tpi_data, len);
-               if (rc)
-                       return kvm_s390_inject_prog_cond(vcpu, rc);
+               rc = write_guest(vcpu, addr, ar, &tpi_data, len);
+               if (rc) {
+                       rc = kvm_s390_inject_prog_cond(vcpu, rc);
+                       goto reinject_interrupt;
+               }
        } else {
                /*
                 * Store the three-word I/O interruption code into
                 * the appropriate lowcore area.
                 */
                len = sizeof(tpi_data);
-               if (write_guest_lc(vcpu, __LC_SUBCHANNEL_ID, &tpi_data, len))
+               if (write_guest_lc(vcpu, __LC_SUBCHANNEL_ID, &tpi_data, len)) {
+                       /* failed writes to the low core are not recoverable */
                        rc = -EFAULT;
+                       goto reinject_interrupt;
+               }
        }
+
+       /* irq was successfully handed to the guest */
+       kfree(inti);
+       kvm_s390_set_psw_cc(vcpu, 1);
+       return 0;
+reinject_interrupt:
        /*
         * If we encounter a problem storing the interruption code, the
         * instruction is suppressed from the guest's view: reinject the
         * interrupt.
         */
-       if (!rc)
+       if (kvm_s390_reinject_io_int(vcpu->kvm, inti)) {
                kfree(inti);
-       else
-               kvm_s390_reinject_io_int(vcpu->kvm, inti);
-no_interrupt:
-       /* Set condition code and we're done. */
-       if (!rc)
-               kvm_s390_set_psw_cc(vcpu, cc);
+               rc = -EFAULT;
+       }
+       /* don't set the cc, a pgm irq was injected or we drop to user space */
        return rc ? -EFAULT : 0;
 }
 
 static int handle_tsch(struct kvm_vcpu *vcpu)
 {
-       struct kvm_s390_interrupt_info *inti;
+       struct kvm_s390_interrupt_info *inti = NULL;
+       const u64 isc_mask = 0xffUL << 24; /* all iscs set */
 
-       inti = kvm_s390_get_io_int(vcpu->kvm, 0,
-                                  vcpu->run->s.regs.gprs[1]);
+       /* a valid schid has at least one bit set */
+       if (vcpu->run->s.regs.gprs[1])
+               inti = kvm_s390_get_io_int(vcpu->kvm, isc_mask,
+                                          vcpu->run->s.regs.gprs[1]);
 
        /*
         * Prepare exit to userspace.
@@ -386,15 +403,16 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
        psw_compat_t new_psw;
        u64 addr;
        int rc;
+       ar_t ar;
 
        if (gpsw->mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       addr = kvm_s390_get_base_disp_s(vcpu);
+       addr = kvm_s390_get_base_disp_s(vcpu, &ar);
        if (addr & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       rc = read_guest(vcpu, addr, &new_psw, sizeof(new_psw));
+       rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        if (!(new_psw.mask & PSW32_MASK_BASE))
@@ -412,14 +430,15 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
        psw_t new_psw;
        u64 addr;
        int rc;
+       ar_t ar;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       addr = kvm_s390_get_base_disp_s(vcpu);
+       addr = kvm_s390_get_base_disp_s(vcpu, &ar);
        if (addr & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-       rc = read_guest(vcpu, addr, &new_psw, sizeof(new_psw));
+       rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        vcpu->arch.sie_block->gpsw = new_psw;
@@ -433,18 +452,19 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
        u64 stidp_data = vcpu->arch.stidp_data;
        u64 operand2;
        int rc;
+       ar_t ar;
 
        vcpu->stat.instruction_stidp++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       operand2 = kvm_s390_get_base_disp_s(vcpu);
+       operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
 
        if (operand2 & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       rc = write_guest(vcpu, operand2, &stidp_data, sizeof(stidp_data));
+       rc = write_guest(vcpu, operand2, ar, &stidp_data, sizeof(stidp_data));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
 
@@ -467,6 +487,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
        for (n = mem->count - 1; n > 0 ; n--)
                memcpy(&mem->vm[n], &mem->vm[n - 1], sizeof(mem->vm[0]));
 
+       memset(&mem->vm[0], 0, sizeof(mem->vm[0]));
        mem->vm[0].cpus_total = cpus;
        mem->vm[0].cpus_configured = cpus;
        mem->vm[0].cpus_standby = 0;
@@ -478,6 +499,17 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
        ASCEBC(mem->vm[0].cpi, 16);
 }
 
+static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar,
+                                u8 fc, u8 sel1, u16 sel2)
+{
+       vcpu->run->exit_reason = KVM_EXIT_S390_STSI;
+       vcpu->run->s390_stsi.addr = addr;
+       vcpu->run->s390_stsi.ar = ar;
+       vcpu->run->s390_stsi.fc = fc;
+       vcpu->run->s390_stsi.sel1 = sel1;
+       vcpu->run->s390_stsi.sel2 = sel2;
+}
+
 static int handle_stsi(struct kvm_vcpu *vcpu)
 {
        int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
@@ -486,6 +518,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
        unsigned long mem = 0;
        u64 operand2;
        int rc = 0;
+       ar_t ar;
 
        vcpu->stat.instruction_stsi++;
        VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
@@ -508,7 +541,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       operand2 = kvm_s390_get_base_disp_s(vcpu);
+       operand2 = kvm_s390_get_base_disp_s(vcpu, &ar);
 
        if (operand2 & 0xfff)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -532,16 +565,20 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
                break;
        }
 
-       rc = write_guest(vcpu, operand2, (void *)mem, PAGE_SIZE);
+       rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
        if (rc) {
                rc = kvm_s390_inject_prog_cond(vcpu, rc);
                goto out;
        }
+       if (vcpu->kvm->arch.user_stsi) {
+               insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2);
+               rc = -EREMOTE;
+       }
        trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
        free_page(mem);
        kvm_s390_set_psw_cc(vcpu, 0);
        vcpu->run->s.regs.gprs[0] = 0;
-       return 0;
+       return rc;
 out_no_data:
        kvm_s390_set_psw_cc(vcpu, 3);
 out:
@@ -670,7 +707,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
-               if (kvm_s390_check_low_addr_protection(vcpu, start))
+               if (kvm_s390_check_low_addr_prot_real(vcpu, start))
                        return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
        }
 
@@ -776,13 +813,14 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
        int reg, rc, nr_regs;
        u32 ctl_array[16];
        u64 ga;
+       ar_t ar;
 
        vcpu->stat.instruction_lctl++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       ga = kvm_s390_get_base_disp_rs(vcpu);
+       ga = kvm_s390_get_base_disp_rs(vcpu, &ar);
 
        if (ga & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -791,7 +829,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
        trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, ga);
 
        nr_regs = ((reg3 - reg1) & 0xf) + 1;
-       rc = read_guest(vcpu, ga, ctl_array, nr_regs * sizeof(u32));
+       rc = read_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u32));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        reg = reg1;
@@ -814,13 +852,14 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu)
        int reg, rc, nr_regs;
        u32 ctl_array[16];
        u64 ga;
+       ar_t ar;
 
        vcpu->stat.instruction_stctl++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       ga = kvm_s390_get_base_disp_rs(vcpu);
+       ga = kvm_s390_get_base_disp_rs(vcpu, &ar);
 
        if (ga & 3)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -836,7 +875,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu)
                        break;
                reg = (reg + 1) % 16;
        } while (1);
-       rc = write_guest(vcpu, ga, ctl_array, nr_regs * sizeof(u32));
+       rc = write_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u32));
        return rc ? kvm_s390_inject_prog_cond(vcpu, rc) : 0;
 }
 
@@ -847,13 +886,14 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
        int reg, rc, nr_regs;
        u64 ctl_array[16];
        u64 ga;
+       ar_t ar;
 
        vcpu->stat.instruction_lctlg++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       ga = kvm_s390_get_base_disp_rsy(vcpu);
+       ga = kvm_s390_get_base_disp_rsy(vcpu, &ar);
 
        if (ga & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -862,7 +902,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
        trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, ga);
 
        nr_regs = ((reg3 - reg1) & 0xf) + 1;
-       rc = read_guest(vcpu, ga, ctl_array, nr_regs * sizeof(u64));
+       rc = read_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u64));
        if (rc)
                return kvm_s390_inject_prog_cond(vcpu, rc);
        reg = reg1;
@@ -884,13 +924,14 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
        int reg, rc, nr_regs;
        u64 ctl_array[16];
        u64 ga;
+       ar_t ar;
 
        vcpu->stat.instruction_stctg++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       ga = kvm_s390_get_base_disp_rsy(vcpu);
+       ga = kvm_s390_get_base_disp_rsy(vcpu, &ar);
 
        if (ga & 7)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -906,7 +947,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
                        break;
                reg = (reg + 1) % 16;
        } while (1);
-       rc = write_guest(vcpu, ga, ctl_array, nr_regs * sizeof(u64));
+       rc = write_guest(vcpu, ga, ar, ctl_array, nr_regs * sizeof(u64));
        return rc ? kvm_s390_inject_prog_cond(vcpu, rc) : 0;
 }
 
@@ -931,13 +972,14 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
        unsigned long hva, gpa;
        int ret = 0, cc = 0;
        bool writable;
+       ar_t ar;
 
        vcpu->stat.instruction_tprot++;
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
+       kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL);
 
        /* we only handle the Linux memory detection case:
         * access key == 0
@@ -946,11 +988,11 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
                return -EOPNOTSUPP;
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
                ipte_lock(vcpu);
-       ret = guest_translate_address(vcpu, address1, &gpa, 1);
+       ret = guest_translate_address(vcpu, address1, ar, &gpa, 1);
        if (ret == PGM_PROTECTION) {
                /* Write protected? Try again with read-only... */
                cc = 1;
-               ret = guest_translate_address(vcpu, address1, &gpa, 0);
+               ret = guest_translate_address(vcpu, address1, ar, &gpa, 0);
        }
        if (ret) {
                if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
index 23b1e86b212245dcf7a95fb4ab82ab99f8749735..72e58bd2bee78162e963dc2bd531e40e03e29d97 100644 (file)
@@ -393,6 +393,9 @@ static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code)
        case SIGP_STORE_STATUS_AT_ADDRESS:
                vcpu->stat.instruction_sigp_store_status++;
                break;
+       case SIGP_STORE_ADDITIONAL_STATUS:
+               vcpu->stat.instruction_sigp_store_adtl_status++;
+               break;
        case SIGP_SET_PREFIX:
                vcpu->stat.instruction_sigp_prefix++;
                break;
@@ -431,7 +434,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-       order_code = kvm_s390_get_base_disp_rs(vcpu);
+       order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
        if (handle_sigp_order_in_user_space(vcpu, order_code))
                return -EOPNOTSUPP;
 
@@ -473,7 +476,7 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
        int r3 = vcpu->arch.sie_block->ipa & 0x000f;
        u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
        struct kvm_vcpu *dest_vcpu;
-       u8 order_code = kvm_s390_get_base_disp_rs(vcpu);
+       u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
 
        trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
 
index ec2e2e2aba7d8f15419aa12bc6f790993f8c54d4..cc9b04a2b11b6394cfcad8b485df1fac6f598583 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _ASM_SPARC_JUMP_LABEL_H
 #define _ASM_SPARC_JUMP_LABEL_H
 
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
 #include <linux/types.h>
 
@@ -22,8 +22,6 @@ l_yes:
        return true;
 }
 
-#endif /* __KERNEL__ */
-
 typedef u32 jump_label_t;
 
 struct jump_entry {
@@ -32,4 +30,5 @@ struct jump_entry {
        jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
index 9ce5afe167ff509288b21605a2f9a35f96ff36dc..b36365f49478c573d715816d53582e2dfb153227 100644 (file)
@@ -639,10 +639,7 @@ static void pci_claim_bus_resources(struct pci_bus *bus)
                                       (unsigned long long)r->end,
                                       (unsigned int)r->flags);
 
-                       if (pci_claim_resource(dev, i) == 0)
-                               continue;
-
-                       pci_claim_bridge_resource(dev, i);
+                       pci_claim_resource(dev, i);
                }
        }
 
index 2f80d23a0a44964ebff804a2ab4428b9f30cb162..18147a5523d947736a3e7c0815a7eeb3526ec271 100644 (file)
@@ -181,17 +181,13 @@ static struct clocksource timer_cs = {
        .rating = 100,
        .read   = timer_cs_read,
        .mask   = CLOCKSOURCE_MASK(64),
-       .shift  = 2,
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 static __init int setup_timer_cs(void)
 {
        timer_cs_enabled = 1;
-       timer_cs.mult = clocksource_hz2mult(sparc_config.clock_rate,
-                                           timer_cs.shift);
-
-       return clocksource_register(&timer_cs);
+       return clocksource_register_hz(&timer_cs, sparc_config.clock_rate);
 }
 
 #ifdef CONFIG_SMP
index d412b0856c0a2622b13b9bc84dc39f65c1b5b08c..00178ecf9aeab4731ffd24d86ff7bc28252e23b6 100644 (file)
@@ -257,34 +257,34 @@ void update_vsyscall_tz(void)
 
 void update_vsyscall(struct timekeeper *tk)
 {
-       if (tk->tkr.clock != &cycle_counter_cs)
+       if (tk->tkr_mono.clock != &cycle_counter_cs)
                return;
 
        write_seqcount_begin(&vdso_data->tb_seq);
 
-       vdso_data->cycle_last           = tk->tkr.cycle_last;
-       vdso_data->mask                 = tk->tkr.mask;
-       vdso_data->mult                 = tk->tkr.mult;
-       vdso_data->shift                = tk->tkr.shift;
+       vdso_data->cycle_last           = tk->tkr_mono.cycle_last;
+       vdso_data->mask                 = tk->tkr_mono.mask;
+       vdso_data->mult                 = tk->tkr_mono.mult;
+       vdso_data->shift                = tk->tkr_mono.shift;
 
        vdso_data->wall_time_sec        = tk->xtime_sec;
-       vdso_data->wall_time_snsec      = tk->tkr.xtime_nsec;
+       vdso_data->wall_time_snsec      = tk->tkr_mono.xtime_nsec;
 
        vdso_data->monotonic_time_sec   = tk->xtime_sec
                                        + tk->wall_to_monotonic.tv_sec;
-       vdso_data->monotonic_time_snsec = tk->tkr.xtime_nsec
+       vdso_data->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
                                        + ((u64)tk->wall_to_monotonic.tv_nsec
-                                               << tk->tkr.shift);
+                                               << tk->tkr_mono.shift);
        while (vdso_data->monotonic_time_snsec >=
-                                       (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+                                       (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
                vdso_data->monotonic_time_snsec -=
-                                       ((u64)NSEC_PER_SEC) << tk->tkr.shift;
+                                       ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
                vdso_data->monotonic_time_sec++;
        }
 
        vdso_data->wall_time_coarse_sec = tk->xtime_sec;
-       vdso_data->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
-                                                tk->tkr.shift);
+       vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
+                                                tk->tkr_mono.shift);
 
        vdso_data->monotonic_time_coarse_sec =
                vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
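
For context (a rough sketch under stated assumptions, not taken from this file): the
tkr_mono fields copied into vdso_data above are consumed by the vDSO reader with the
usual mult/shift conversion. The struct below is a local stand-in for those fields,
cycles would come from the architecture's counter, and the tb_seq retry loop is omitted.

#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Local mirror of the vdso_data fields written above; purely illustrative. */
struct vdso_sample {
	uint64_t cycle_last, mask, mult, shift;
	uint64_t wall_time_sec, wall_time_snsec;
};

static void wall_time_from_sample(const struct vdso_sample *d, uint64_t cycles,
				  uint64_t *sec, uint64_t *nsec)
{
	uint64_t delta = (cycles - d->cycle_last) & d->mask;
	uint64_t ns = d->wall_time_snsec + delta * d->mult;	/* shifted nanoseconds */

	*sec = d->wall_time_sec;
	ns >>= d->shift;
	while (ns >= NSEC_PER_SEC) {
		ns -= NSEC_PER_SEC;
		(*sec)++;
	}
	*nsec = ns;
}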
index b7d31ca5518744983c77bc8339f30756621dfea0..faff6934c05a21f874ce5839f4b9b8542f9c02e5 100644 (file)
@@ -235,12 +235,10 @@ config ARCH_WANT_GENERAL_HUGETLB
        def_bool y
 
 config ZONE_DMA32
-       bool
-       default X86_64
+       def_bool y if X86_64
 
 config AUDIT_ARCH
-       bool
-       default X86_64
+       def_bool y if X86_64
 
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
        def_bool y
@@ -891,7 +889,8 @@ config UP_LATE_INIT
        depends on !SMP && X86_LOCAL_APIC
 
 config X86_UP_APIC
-       bool "Local APIC support on uniprocessors"
+       bool "Local APIC support on uniprocessors" if !PCI_MSI
+       default PCI_MSI
        depends on X86_32 && !SMP && !X86_32_NON_STANDARD
        ---help---
          A local APIC (Advanced Programmable Interrupt Controller) is an
@@ -903,10 +902,6 @@ config X86_UP_APIC
          performance counters), and the NMI watchdog which detects hard
          lockups.
 
-config X86_UP_APIC_MSI
-       def_bool y
-       select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
-
 config X86_UP_IOAPIC
        bool "IO-APIC support on uniprocessors"
        depends on X86_UP_APIC
@@ -925,8 +920,8 @@ config X86_LOCAL_APIC
        select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
 
 config X86_IO_APIC
-       def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
-       depends on X86_LOCAL_APIC
+       def_bool y
+       depends on X86_LOCAL_APIC || X86_UP_IOAPIC
        select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -1145,10 +1140,10 @@ config MICROCODE_OLD_INTERFACE
        depends on MICROCODE
 
 config MICROCODE_INTEL_EARLY
-       def_bool n
+       bool
 
 config MICROCODE_AMD_EARLY
-       def_bool n
+       bool
 
 config MICROCODE_EARLY
        bool "Early load microcode"
@@ -1300,14 +1295,14 @@ config ARCH_DMA_ADDR_T_64BIT
        def_bool y
        depends on X86_64 || HIGHMEM64G
 
-config DIRECT_GBPAGES
-       bool "Enable 1GB pages for kernel pagetables" if EXPERT
-       default y
-       depends on X86_64
+config X86_DIRECT_GBPAGES
+       def_bool y
+       depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
        ---help---
-         Allow the kernel linear mapping to use 1GB pages on CPUs that
-         support it. This can improve the kernel's performance a tiny bit by
-         reducing TLB pressure. If in doubt, say "Y".
+         Certain kernel features effectively disable kernel
+         linear 1 GB mappings (even if the CPU otherwise
+         supports them), so don't confuse the user by printing
+         that we have them enabled.
 
 # Common NUMA Features
 config NUMA
@@ -1747,14 +1742,11 @@ config KEXEC_VERIFY_SIG
        depends on KEXEC_FILE
        ---help---
          This option makes kernel signature verification mandatory for
-         kexec_file_load() syscall. If kernel is signature can not be
-         verified, kexec_file_load() will fail.
-
-         This option enforces signature verification at generic level.
-         One needs to enable signature verification for type of kernel
-         image being loaded to make sure it works. For example, enable
-         bzImage signature verification option to be able to load and
-         verify signatures of bzImage. Otherwise kernel loading will fail.
+         the kexec_file_load() syscall.
+
+         In addition to that option, you need to enable signature
+         verification for the corresponding kernel image type being
+         loaded in order for this to work.
 
 config KEXEC_BZIMAGE_VERIFY_SIG
        bool "Enable bzImage signature verification support"
index bb1376381985edb9f96e49c0a1b0269e56bd0f9e..d7b1f655b3ef4c2e14a4976fa565fe67d0159607 100644 (file)
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
        return slots_fetch_random();
 }
 
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+                                     unsigned char *input,
                                      unsigned long input_size,
                                      unsigned char *output,
                                      unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
        }
 #endif
 
+       boot_params->hdr.loadflags |= KASLR_FLAG;
+
        /* Record the various known unsafe memory ranges. */
        mem_avoid_init((unsigned long)input, input_size,
                       (unsigned long)output, output_size);
index 1d7fbbcc196d6f8b661545130972453f109e27d7..8ef964ddc18ec656b1e3b0f038adf134484dbdd2 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/page_types.h>
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
        __HEAD
 ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
         * Test KEEP_SEGMENTS flag to see if the bootloader is asking
         * us to not reload segments
         */
-       testb   $(1<<6), BP_loadflags(%esi)
+       testb   $KEEP_SEGMENTS, BP_loadflags(%esi)
        jnz     1f
 
        cli
index 6b1766c6c08205f3bda8a527dff88097b48e77e3..b0c0d16ef58d1099342c97aff83767dd35c73691 100644 (file)
@@ -31,6 +31,7 @@
 #include <asm/msr.h>
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
        __HEAD
        .code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
         * Test KEEP_SEGMENTS flag to see if the bootloader is asking
         * us to not reload segments
         */
-       testb $(1<<6), BP_loadflags(%esi)
+       testb $KEEP_SEGMENTS, BP_loadflags(%esi)
        jnz 1f
 
        cli
@@ -164,7 +165,7 @@ ENTRY(startup_32)
        /* After gdt is loaded */
        xorl    %eax, %eax
        lldt    %ax
-       movl    $0x20, %eax
+       movl    $__BOOT_TSS, %eax
        ltr     %ax
 
        /*
index a950864a64dab3d558197c77bef3c56a07961494..a107b935e22fbc9a749ddf975cf72ea187cb1fa2 100644 (file)
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 
        real_mode = rmode;
 
+       /* Clear it for solely in-kernel use */
+       real_mode->hdr.loadflags &= ~KASLR_FLAG;
+
        sanitize_boot_params(real_mode);
 
        if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
         * the entire decompressed kernel plus relocation table, or the
         * entire decompressed kernel plus .bss and .brk sections.
         */
-       output = choose_kernel_location(input_data, input_len, output,
+       output = choose_kernel_location(real_mode, input_data, input_len, output,
                                        output_len > run_size ? output_len
                                                              : run_size);
 
index 04477d68403f1fe6197d82276033ce27338c1bac..89dd0d78013aaff6c889340e0e3caceb4c8f8c88 100644 (file)
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
 
 #if CONFIG_RANDOMIZE_BASE
 /* aslr.c */
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+                                     unsigned char *input,
                                      unsigned long input_size,
                                      unsigned char *output,
                                      unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
 bool has_cpuflag(int flag);
 #else
 static inline
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+                                     unsigned char *input,
                                      unsigned long input_size,
                                      unsigned char *output,
                                      unsigned long output_size)
index 493f3fd9f1391815c29e06c2163f3225b4e52063..318b8465d30204cad7006bf0889672e0da093aed 100644 (file)
@@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2)
        int delta = 0;
 
        while (*s1 || *s2) {
-               delta = *s2 - *s1;
+               delta = *s1 - *s2;
                if (delta)
                        return delta;
                s1++;
index 748e8d06290a668ff5c417197cf5c2bc5c7a16e3..aa8a96b052e30263d60afc6b81a60e387fc55773 100644 (file)
 /*
  * Common variables
  */
-int adapter;                   /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
-u16 video_segment;
+int adapter;           /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
 int force_x, force_y;  /* Don't query the BIOS for cols/rows */
-
 int do_restore;                /* Screen contents changed during mode flip */
 int graphic_mode;      /* Graphic mode with linear frame buffer */
 
index 43eda284d27fe96c2a4d407273f4b8d61bae87e2..05111bb8d018e38c71ae278ee0a78ccff76e34e2 100644 (file)
@@ -17,6 +17,8 @@
 #include "video.h"
 #include "vesa.h"
 
+static u16 video_segment;
+
 static void store_cursor_position(void)
 {
        struct biosregs ireg, oreg;
index 0bb25491262d00f941001d3f7f7b2fc800212250..b54e0328c449e981013dc1edc46d331827e290c4 100644 (file)
@@ -91,7 +91,6 @@ int mode_defined(u16 mode);   /* video.c */
 #define ADAPTER_VGA    2
 
 extern int adapter;
-extern u16 video_segment;
 extern int force_x, force_y;   /* Don't query the BIOS for cols/rows */
 extern int do_restore;         /* Restore screen contents */
 extern int graphic_mode;       /* Graphics mode with linear frame buffer */
index 419819d6dab3b7573196dc685c9bc9ea236a147f..aaa1118bf01e864f50dad66a0e1099bb50de08d7 100644 (file)
@@ -248,7 +248,7 @@ CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_MON=y
 CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_EHCI_TT_NEWSCHED=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_PRINTER=y
index 4c311ddd973bb60686079a5c4d2831cd0bf224c7..315b861065725a4cd154744ee45d5331de6fb828 100644 (file)
@@ -243,7 +243,7 @@ CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_MON=y
 CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_EHCI_TT_NEWSCHED=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_PRINTER=y
index 26d49ebae0404ee74fea7a5e7f6bd7d22b9bad80..225be06edc80982f9509e25af09e1ac0a0232237 100644 (file)
@@ -178,7 +178,7 @@ continue_block:
        ## 2a) PROCESS FULL BLOCKS:
        ################################################################
 full_block:
-       movq    $128,%rax
+       movl    $128,%eax
        lea     128*8*2(block_0), block_1
        lea     128*8*3(block_0), block_2
        add     $128*8*1, block_0
index a039d21986a21c87e9bdf9a41117a726ea3bf77f..a350c990dc86c86bdf5ec4396a9db17dd8dc7bfa 100644 (file)
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
        movq    R1,     8(%rsi)
 
        popq    R1
-       movq    $1,%rax
+       movl    $1,%eax
        ret
 ENDPROC(twofish_enc_blk)
 
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
        movq    R1,     8(%rsi)
 
        popq    R1
-       movq    $1,%rax
+       movl    $1,%eax
        ret
 ENDPROC(twofish_dec_blk)
index e785b422b76686b3bf79d95be5aebd20b961cef8..bb635c6418692f4fea1d02fce3b1c170f1a38b0f 100644 (file)
@@ -3,7 +3,6 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
 
 obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
index d0165c9a293241559c48742d6c0fb8df3e4fcf82..c81d35e6c7f1d91c22734793c006c0f5f33c0c10 100644 (file)
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 }
 
 static int ia32_restore_sigcontext(struct pt_regs *regs,
-                                  struct sigcontext_ia32 __user *sc,
-                                  unsigned int *pax)
+                                  struct sigcontext_ia32 __user *sc)
 {
        unsigned int tmpflags, err = 0;
        void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
                RELOAD_SEG(es);
 
                COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
-               COPY(dx); COPY(cx); COPY(ip);
+               COPY(dx); COPY(cx); COPY(ip); COPY(ax);
                /* Don't touch extended registers */
 
                COPY_SEG_CPL3(cs);
@@ -197,12 +196,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 
                get_user_ex(tmp, &sc->fpstate);
                buf = compat_ptr(tmp);
-
-               get_user_ex(*pax, &sc->ax);
        } get_user_catch(err);
 
        err |= restore_xstate_sig(buf, 1);
 
+       force_iret();
+
        return err;
 }
 
@@ -211,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
        struct pt_regs *regs = current_pt_regs();
        struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
        sigset_t set;
-       unsigned int ax;
 
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                goto badframe;
@@ -224,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
 
        set_current_blocked(&set);
 
-       if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
+       if (ia32_restore_sigcontext(regs, &frame->sc))
                goto badframe;
-       return ax;
+       return regs->ax;
 
 badframe:
        signal_fault(regs, frame, "32bit sigreturn");
@@ -238,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe_ia32 __user *frame;
        sigset_t set;
-       unsigned int ax;
 
        frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
 
@@ -249,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
 
        set_current_blocked(&set);
 
-       if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+       if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
                goto badframe;
 
        if (compat_restore_altstack(&frame->uc.uc_stack))
                goto badframe;
 
-       return ax;
+       return regs->ax;
 
 badframe:
        signal_fault(regs, frame, "32bit rt sigreturn");
index 156ebcab4ada6d54cf8992fcd95398af390f41cc..a821b1cd4fa7a2748cce60ba986ef63d27efb131 100644 (file)
 
        .section .entry.text, "ax"
 
-       .macro IA32_ARG_FIXUP noebp=0
-       movl    %edi,%r8d
-       .if \noebp
-       .else
-       movl    %ebp,%r9d
-       .endif
-       xchg    %ecx,%esi
-       movl    %ebx,%edi
-       movl    %edx,%edx       /* zero extension */
-       .endm 
-
-       /* clobbers %eax */     
-       .macro  CLEAR_RREGS offset=0, _r9=rax
+       /* clobbers %rax */
+       .macro  CLEAR_RREGS _r9=rax
        xorl    %eax,%eax
-       movq    %rax,\offset+R11(%rsp)
-       movq    %rax,\offset+R10(%rsp)
-       movq    %\_r9,\offset+R9(%rsp)
-       movq    %rax,\offset+R8(%rsp)
+       movq    %rax,R11(%rsp)
+       movq    %rax,R10(%rsp)
+       movq    %\_r9,R9(%rsp)
+       movq    %rax,R8(%rsp)
        .endm
 
        /*
         * If it's -1 to make us punt the syscall, then (u32)-1 is still
         * an appropriately invalid value.
         */
-       .macro LOAD_ARGS32 offset, _r9=0
+       .macro LOAD_ARGS32 _r9=0
        .if \_r9
-       movl \offset+16(%rsp),%r9d
+       movl R9(%rsp),%r9d
        .endif
-       movl \offset+40(%rsp),%ecx
-       movl \offset+48(%rsp),%edx
-       movl \offset+56(%rsp),%esi
-       movl \offset+64(%rsp),%edi
+       movl RCX(%rsp),%ecx
+       movl RDX(%rsp),%edx
+       movl RSI(%rsp),%esi
+       movl RDI(%rsp),%edi
        movl %eax,%eax                  /* zero extension */
        .endm
        
@@ -99,54 +88,69 @@ ENDPROC(native_irq_enable_sysexit)
 /*
  * 32bit SYSENTER instruction entry.
  *
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save the old rip (!!!) or rflags.
+ *
  * Arguments:
- * %eax        System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6        
- *     
- * Interrupts off.
- *     
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
- */    
+ */
 ENTRY(ia32_sysenter_target)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
        CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rsp,rbp
-       SWAPGS_UNSAFE_STACK
-       movq    PER_CPU_VAR(kernel_stack), %rsp
-       addq    $(KERNEL_STACK_OFFSET),%rsp
+
        /*
-        * No need to follow this irqs on/off section: the syscall
-        * disabled irqs, here we enable it straight after entry:
+        * Interrupts are off on entry.
+        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+        * it is too small to ever cause noticeable irq latency.
         */
+       SWAPGS_UNSAFE_STACK
+       movq    PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
        ENABLE_INTERRUPTS(CLBR_NONE)
-       movl    %ebp,%ebp               /* zero extension */
-       pushq_cfi $__USER32_DS
-       /*CFI_REL_OFFSET ss,0*/
-       pushq_cfi %rbp
-       CFI_REL_OFFSET rsp,0
-       pushfq_cfi
-       /*CFI_REL_OFFSET rflags,0*/
-       movl    TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
-       CFI_REGISTER rip,r10
-       pushq_cfi $__USER32_CS
-       /*CFI_REL_OFFSET cs,0*/
+
+       /* Zero-extending 32-bit regs, do not remove */
+       movl    %ebp, %ebp
        movl    %eax, %eax
-       pushq_cfi %r10
-       CFI_REL_OFFSET rip,0
-       pushq_cfi %rax
+
+       movl    ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
+       CFI_REGISTER rip,r10
+
+       /* Construct struct pt_regs on stack */
+       pushq_cfi       $__USER32_DS            /* pt_regs->ss */
+       pushq_cfi       %rbp                    /* pt_regs->sp */
+       CFI_REL_OFFSET  rsp,0
+       pushfq_cfi                              /* pt_regs->flags */
+       pushq_cfi       $__USER32_CS            /* pt_regs->cs */
+       pushq_cfi       %r10 /* pt_regs->ip = thread_info->sysenter_return */
+       CFI_REL_OFFSET  rip,0
+       pushq_cfi_reg   rax                     /* pt_regs->orig_ax */
+       pushq_cfi_reg   rdi                     /* pt_regs->di */
+       pushq_cfi_reg   rsi                     /* pt_regs->si */
+       pushq_cfi_reg   rdx                     /* pt_regs->dx */
+       pushq_cfi_reg   rcx                     /* pt_regs->cx */
+       pushq_cfi_reg   rax                     /* pt_regs->ax */
        cld
-       SAVE_ARGS 0,1,0
-       /* no need to do an access_ok check here because rbp has been
-          32bit zero extended */ 
+       sub     $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+       CFI_ADJUST_CFA_OFFSET 10*8
+
+       /*
+        * no need to do an access_ok check here because rbp has been
+        * 32bit zero extended
+        */
        ASM_STAC
 1:     movl    (%rbp),%ebp
        _ASM_EXTABLE(1b,ia32_badarg)
@@ -157,42 +161,80 @@ ENTRY(ia32_sysenter_target)
         * ourselves.  To save a few cycles, we can check whether
         * NT was set instead of doing an unconditional popfq.
         */
-       testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+       testl $X86_EFLAGS_NT,EFLAGS(%rsp)
        jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-       orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-       testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        CFI_REMEMBER_STATE
        jnz  sysenter_tracesys
        cmpq    $(IA32_NR_syscalls-1),%rax
        ja      ia32_badsys
 sysenter_do_call:
-       IA32_ARG_FIXUP
+       /* 32bit syscall -> 64bit C ABI argument conversion */
+       movl    %edi,%r8d       /* arg5 */
+       movl    %ebp,%r9d       /* arg6 */
+       xchg    %ecx,%esi       /* rsi:arg2, rcx:arg4 */
+       movl    %ebx,%edi       /* arg1 */
+       movl    %edx,%edx       /* arg3 (zero extension) */
 sysenter_dispatch:
        call    *ia32_sys_call_table(,%rax,8)
-       movq    %rax,RAX-ARGOFFSET(%rsp)
+       movq    %rax,RAX(%rsp)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl   $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz     sysexit_audit
 sysexit_from_sys_call:
-       andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-       /* clear IF, that popfq doesn't enable interrupts early */
-       andl    $~0x200,EFLAGS-ARGOFFSET(%rsp)
-       movl    RIP-ARGOFFSET(%rsp),%edx                /* User %eip */
-       CFI_REGISTER rip,rdx
-       RESTORE_ARGS 0,24,0,0,0,0
+       /*
+        * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
+        * NMI between STI and SYSEXIT has poorly specified behavior,
+        * and an NMI followed by an IRQ with usergs is fatal.  So
+        * we just pretend we're using SYSEXIT but we really use
+        * SYSRETL instead.
+        *
+        * This code path is still called 'sysexit' because it pairs
+        * with 'sysenter' and it uses the SYSENTER calling convention.
+        */
+       andl    $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+       movl    RIP(%rsp),%ecx          /* User %eip */
+       CFI_REGISTER rip,rcx
+       RESTORE_RSI_RDI
+       xorl    %edx,%edx               /* avoid info leaks */
        xorq    %r8,%r8
        xorq    %r9,%r9
        xorq    %r10,%r10
-       xorq    %r11,%r11
-       popfq_cfi
+       movl    EFLAGS(%rsp),%r11d      /* User eflags */
        /*CFI_RESTORE rflags*/
-       popq_cfi %rcx                           /* User %esp */
-       CFI_REGISTER rsp,rcx
        TRACE_IRQS_ON
-       ENABLE_INTERRUPTS_SYSEXIT32
+
+       /*
+        * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
+        * since it avoids a dicey window with interrupts enabled.
+        */
+       movl    RSP(%rsp),%esp
+
+       /*
+        * USERGS_SYSRET32 does:
+        *  gsbase = user's gs base
+        *  eip = ecx
+        *  rflags = r11
+        *  cs = __USER32_CS
+        *  ss = __USER_DS
+        *
+        * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
+        *
+        *  pop %ebp
+        *  pop %edx
+        *  pop %ecx
+        *
+        * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
+        * avoid info leaks.  R11 ends up with VDSO32_SYSENTER_RETURN's
+        * address (already known to user code), and R12-R15 are
+        * callee-saved and therefore don't contain any interesting
+        * kernel data.
+        */
+       USERGS_SYSRET32
 
        CFI_RESTORE_STATE
 
@@ -205,18 +247,18 @@ sysexit_from_sys_call:
        movl %ebx,%esi                  /* 2nd arg: 1st syscall arg */
        movl %eax,%edi                  /* 1st arg: syscall number */
        call __audit_syscall_entry
-       movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall number */
+       movl RAX(%rsp),%eax     /* reload syscall number */
        cmpq $(IA32_NR_syscalls-1),%rax
        ja ia32_badsys
        movl %ebx,%edi                  /* reload 1st syscall arg */
-       movl RCX-ARGOFFSET(%rsp),%esi   /* reload 2nd syscall arg */
-       movl RDX-ARGOFFSET(%rsp),%edx   /* reload 3rd syscall arg */
-       movl RSI-ARGOFFSET(%rsp),%ecx   /* reload 4th syscall arg */
-       movl RDI-ARGOFFSET(%rsp),%r8d   /* reload 5th syscall arg */
+       movl RCX(%rsp),%esi     /* reload 2nd syscall arg */
+       movl RDX(%rsp),%edx     /* reload 3rd syscall arg */
+       movl RSI(%rsp),%ecx     /* reload 4th syscall arg */
+       movl RDI(%rsp),%r8d     /* reload 5th syscall arg */
        .endm
 
        .macro auditsys_exit exit
-       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz ia32_ret_from_sys_call
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +269,13 @@ sysexit_from_sys_call:
 1:     setbe %al               /* 1 if error, 0 if not */
        movzbl %al,%edi         /* zero-extend that into %edi */
        call __audit_syscall_exit
-       movq RAX-ARGOFFSET(%rsp),%rax   /* reload syscall return value */
+       movq RAX(%rsp),%rax     /* reload syscall return value */
        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jz \exit
-       CLEAR_RREGS -ARGOFFSET
+       CLEAR_RREGS
        jmp int_with_check
        .endm
 
@@ -253,16 +295,16 @@ sysenter_fix_flags:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-       testl   $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl   $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jz      sysenter_auditsys
 #endif
-       SAVE_REST
+       SAVE_EXTRA_REGS
        CLEAR_RREGS
        movq    $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
        movq    %rsp,%rdi        /* &pt_regs -> arg1 */
        call    syscall_trace_enter
-       LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-       RESTORE_REST
+       LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
+       RESTORE_EXTRA_REGS
        cmpq    $(IA32_NR_syscalls-1),%rax
        ja      int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
        jmp     sysenter_do_call
@@ -272,94 +314,128 @@ ENDPROC(ia32_sysenter_target)
 /*
  * 32bit SYSCALL instruction entry.
  *
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
  * Arguments:
- * %eax        System call number.
- * %ebx Arg1
- * %ecx return EIP 
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
- * %esp user stack 
- * 0(%esp) Arg6
- *     
- * Interrupts off.
- *     
+ * eax  system call number
+ * ecx  return address
+ * ebx  arg1
+ * ebp  arg2   (note: not saved in the stack frame, should not be touched)
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * esp  user stack
+ * 0(%esp) arg6
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.     
- */    
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
 ENTRY(ia32_cstar_target)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,KERNEL_STACK_OFFSET
+       CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
+
+       /*
+        * Interrupts are off on entry.
+        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+        * it is too small to ever cause noticeable irq latency.
+        */
        SWAPGS_UNSAFE_STACK
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
        movq    PER_CPU_VAR(kernel_stack),%rsp
-       /*
-        * No need to follow this irqs on/off section: the syscall
-        * disabled irqs and here we enable it straight after entry:
-        */
        ENABLE_INTERRUPTS(CLBR_NONE)
-       SAVE_ARGS 8,0,0
-       movl    %eax,%eax       /* zero extension */
-       movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
-       movq    %rcx,RIP-ARGOFFSET(%rsp)
-       CFI_REL_OFFSET rip,RIP-ARGOFFSET
-       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+
+       /* Zero-extending 32-bit regs, do not remove */
+       movl    %eax,%eax
+
+       /* Construct struct pt_regs on stack */
+       pushq_cfi       $__USER32_DS            /* pt_regs->ss */
+       pushq_cfi       %r8                     /* pt_regs->sp */
+       CFI_REL_OFFSET rsp,0
+       pushq_cfi       %r11                    /* pt_regs->flags */
+       pushq_cfi       $__USER32_CS            /* pt_regs->cs */
+       pushq_cfi       %rcx                    /* pt_regs->ip */
+       CFI_REL_OFFSET rip,0
+       pushq_cfi_reg   rax                     /* pt_regs->orig_ax */
+       pushq_cfi_reg   rdi                     /* pt_regs->di */
+       pushq_cfi_reg   rsi                     /* pt_regs->si */
+       pushq_cfi_reg   rdx                     /* pt_regs->dx */
+       pushq_cfi_reg   rbp                     /* pt_regs->cx */
        movl    %ebp,%ecx
-       movq    $__USER32_CS,CS-ARGOFFSET(%rsp)
-       movq    $__USER32_DS,SS-ARGOFFSET(%rsp)
-       movq    %r11,EFLAGS-ARGOFFSET(%rsp)
-       /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
-       movq    %r8,RSP-ARGOFFSET(%rsp) 
-       CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-       /* no need to do an access_ok check here because r8 has been
-          32bit zero extended */ 
-       /* hardware stack frame is complete now */      
+       pushq_cfi_reg   rax                     /* pt_regs->ax */
+       sub     $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+       CFI_ADJUST_CFA_OFFSET 10*8
+
+       /*
+        * no need to do an access_ok check here because r8 has been
+        * 32bit zero extended
+        */
        ASM_STAC
 1:     movl    (%r8),%r9d
        _ASM_EXTABLE(1b,ia32_badarg)
        ASM_CLAC
-       orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-       testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+       testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        CFI_REMEMBER_STATE
        jnz   cstar_tracesys
        cmpq $IA32_NR_syscalls-1,%rax
        ja  ia32_badsys
 cstar_do_call:
-       IA32_ARG_FIXUP 1
+       /* 32bit syscall -> 64bit C ABI argument conversion */
+       movl    %edi,%r8d       /* arg5 */
+       /* r9 already loaded */ /* arg6 */
+       xchg    %ecx,%esi       /* rsi:arg2, rcx:arg4 */
+       movl    %ebx,%edi       /* arg1 */
+       movl    %edx,%edx       /* arg3 (zero extension) */
 cstar_dispatch:
        call *ia32_sys_call_table(,%rax,8)
-       movq %rax,RAX-ARGOFFSET(%rsp)
+       movq %rax,RAX(%rsp)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz sysretl_audit
 sysretl_from_sys_call:
-       andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-       RESTORE_ARGS 0,-ARG_SKIP,0,0,0
-       movl RIP-ARGOFFSET(%rsp),%ecx
+       andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+       RESTORE_RSI_RDI_RDX
+       movl RIP(%rsp),%ecx
        CFI_REGISTER rip,rcx
-       movl EFLAGS-ARGOFFSET(%rsp),%r11d       
+       movl EFLAGS(%rsp),%r11d
        /*CFI_REGISTER rflags,r11*/
        xorq    %r10,%r10
        xorq    %r9,%r9
        xorq    %r8,%r8
        TRACE_IRQS_ON
-       movl RSP-ARGOFFSET(%rsp),%esp
+       movl RSP(%rsp),%esp
        CFI_RESTORE rsp
+       /*
+        * 64bit->32bit SYSRET restores eip from ecx,
+        * eflags from r11 (but RF and VM bits are forced to 0),
+        * cs and ss are loaded from MSRs.
+        * (Note: 32bit->32bit SYSRET is different: since r11
+        * does not exist, it merely sets eflags.IF=1).
+        */
        USERGS_SYSRET32
-       
+
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
        CFI_RESTORE_STATE
-       movl %r9d,R9-ARGOFFSET(%rsp)    /* register to be clobbered by call */
+       movl %r9d,R9(%rsp)      /* register to be clobbered by call */
        auditsys_entry_common
-       movl R9-ARGOFFSET(%rsp),%r9d    /* reload 6th syscall arg */
+       movl R9(%rsp),%r9d      /* reload 6th syscall arg */
        jmp cstar_dispatch
 
 sysretl_audit:
@@ -368,17 +444,17 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jz cstar_auditsys
 #endif
        xchgl %r9d,%ebp
-       SAVE_REST
-       CLEAR_RREGS 0, r9
+       SAVE_EXTRA_REGS
+       CLEAR_RREGS r9
        movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
        movq %rsp,%rdi        /* &pt_regs -> arg1 */
        call syscall_trace_enter
-       LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
-       RESTORE_REST
+       LOAD_ARGS32   /* reload args from stack in case ptrace changed it */
+       RESTORE_EXTRA_REGS
        xchgl %ebp,%r9d
        cmpq $(IA32_NR_syscalls-1),%rax
        ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,78 +467,94 @@ ia32_badarg:
        jmp ia32_sysret
        CFI_ENDPROC
 
-/* 
- * Emulated IA32 system calls via int 0x80. 
+/*
+ * Emulated IA32 system calls via int 0x80.
  *
- * Arguments:   
- * %eax        System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6   (note: not saved in the stack frame, should not be touched)
  *
  * Notes:
- * Uses the same stack frame as the x86-64 version.    
- * All registers except %eax must be saved (but ptrace may violate that)
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
  * Arguments are zero extended. For system calls that want sign extension and
  * take long arguments a wrapper is needed. Most calls can just be called
  * directly.
- * Assumes it is only called from user space and entered with interrupts off.  
- */                            
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
 
 ENTRY(ia32_syscall)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,SS+8-RIP
-       /*CFI_REL_OFFSET        ss,SS-RIP*/
-       CFI_REL_OFFSET  rsp,RSP-RIP
-       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
-       /*CFI_REL_OFFSET        cs,CS-RIP*/
-       CFI_REL_OFFSET  rip,RIP-RIP
-       PARAVIRT_ADJUST_EXCEPTION_FRAME
-       SWAPGS
+       CFI_DEF_CFA     rsp,5*8
+       /*CFI_REL_OFFSET        ss,4*8 */
+       CFI_REL_OFFSET  rsp,3*8
+       /*CFI_REL_OFFSET        rflags,2*8 */
+       /*CFI_REL_OFFSET        cs,1*8 */
+       CFI_REL_OFFSET  rip,0*8
+
        /*
-        * No need to follow this irqs on/off section: the syscall
-        * disabled irqs and here we enable it straight after entry:
+        * Interrupts are off on entry.
+        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+        * it is too small to ever cause noticeable irq latency.
         */
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
+       SWAPGS
        ENABLE_INTERRUPTS(CLBR_NONE)
-       movl %eax,%eax
-       pushq_cfi %rax
+
+       /* Zero-extending 32-bit regs, do not remove */
+       movl    %eax,%eax
+
+       /* Construct struct pt_regs on stack (iret frame is already on stack) */
+       pushq_cfi_reg   rax                     /* pt_regs->orig_ax */
+       pushq_cfi_reg   rdi                     /* pt_regs->di */
+       pushq_cfi_reg   rsi                     /* pt_regs->si */
+       pushq_cfi_reg   rdx                     /* pt_regs->dx */
+       pushq_cfi_reg   rcx                     /* pt_regs->cx */
+       pushq_cfi_reg   rax                     /* pt_regs->ax */
        cld
-       /* note the registers are not zero extended to the sf.
-          this could be a problem. */
-       SAVE_ARGS 0,1,0
-       orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       sub     $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+       CFI_ADJUST_CFA_OFFSET 10*8
+
+       orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+       testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz ia32_tracesys
        cmpq $(IA32_NR_syscalls-1),%rax
        ja ia32_badsys
 ia32_do_call:
-       IA32_ARG_FIXUP
+       /* 32bit syscall -> 64bit C ABI argument conversion */
+       movl %edi,%r8d  /* arg5 */
+       movl %ebp,%r9d  /* arg6 */
+       xchg %ecx,%esi  /* rsi:arg2, rcx:arg4 */
+       movl %ebx,%edi  /* arg1 */
+       movl %edx,%edx  /* arg3 (zero extension) */
        call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
-       movq %rax,RAX-ARGOFFSET(%rsp)
+       movq %rax,RAX(%rsp)
 ia32_ret_from_sys_call:
-       CLEAR_RREGS -ARGOFFSET
-       jmp int_ret_from_sys_call 
+       CLEAR_RREGS
+       jmp int_ret_from_sys_call
 
-ia32_tracesys:                  
-       SAVE_REST
+ia32_tracesys:
+       SAVE_EXTRA_REGS
        CLEAR_RREGS
        movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
        movq %rsp,%rdi        /* &pt_regs -> arg1 */
        call syscall_trace_enter
-       LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-       RESTORE_REST
+       LOAD_ARGS32     /* reload args from stack in case ptrace changed it */
+       RESTORE_EXTRA_REGS
        cmpq $(IA32_NR_syscalls-1),%rax
        ja  int_ret_from_sys_call       /* ia32_tracesys has set RAX(%rsp) */
        jmp ia32_do_call
 END(ia32_syscall)
 
 ia32_badsys:
-       movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+       movq $0,ORIG_RAX(%rsp)
        movq $-ENOSYS,%rax
        jmp ia32_sysret
 
@@ -479,8 +571,6 @@ GLOBAL(\label)
 
        PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
        PTREGSCALL stub32_sigreturn, sys32_sigreturn
-       PTREGSCALL stub32_execve, compat_sys_execve
-       PTREGSCALL stub32_execveat, compat_sys_execveat
        PTREGSCALL stub32_fork, sys_fork
        PTREGSCALL stub32_vfork, sys_vfork
 
@@ -492,24 +582,23 @@ GLOBAL(stub32_clone)
 
        ALIGN
 ia32_ptregs_common:
-       popq %r11
        CFI_ENDPROC
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
-       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
-       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
-       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
-       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
-       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
-       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
-/*     CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
-/*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
-       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
-/*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
-       SAVE_REST
+       CFI_DEF_CFA     rsp,SIZEOF_PTREGS
+       CFI_REL_OFFSET  rax,RAX
+       CFI_REL_OFFSET  rcx,RCX
+       CFI_REL_OFFSET  rdx,RDX
+       CFI_REL_OFFSET  rsi,RSI
+       CFI_REL_OFFSET  rdi,RDI
+       CFI_REL_OFFSET  rip,RIP
+/*     CFI_REL_OFFSET  cs,CS*/
+/*     CFI_REL_OFFSET  rflags,EFLAGS*/
+       CFI_REL_OFFSET  rsp,RSP
+/*     CFI_REL_OFFSET  ss,SS*/
+       SAVE_EXTRA_REGS 8
        call *%rax
-       RESTORE_REST
-       jmp  ia32_sysret        /* misbalances the return cache */
+       RESTORE_EXTRA_REGS 8
+       ret
        CFI_ENDPROC
 END(ia32_ptregs_common)
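The open-coded "32bit syscall -> 64bit C ABI argument conversion" sequences above follow from the mismatch between the i386 syscall register convention and the x86-64 SysV C ABI used by the 64-bit sys_* handlers. A minimal user-space sketch of that mapping (illustration only, not kernel code):

#include <stdio.h>

int main(void)
{
	/* i386 syscall argument registers, in order arg1..arg6 */
	static const char *const ia32_regs[6]   = { "ebx", "ecx", "edx", "esi", "edi", "ebp" };
	/* x86-64 SysV C ABI argument registers expected by the handlers */
	static const char *const x86_64_regs[6] = { "rdi", "rsi", "rdx", "rcx", "r8", "r9" };

	for (int i = 0; i < 6; i++)
		printf("arg%d: %-3s -> %s\n", i + 1, ia32_regs[i], x86_64_regs[i]);
	return 0;
}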
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644 (file)
index 51ecd5b..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-
-long compat_ni_syscall(void)
-{
-       return -ENOSYS;
-}
index 8e0ceecdc95790d7a53eb3fe5bc1c3867bcb9e7f..719cd702b0a476e13abb42bfa28b0a7911204382 100644 (file)
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
                                advice);
 }
 
-long sys32_vm86_warning(void)
-{
-       struct task_struct *me = current;
-       static char lastcomm[sizeof(me->comm)];
-
-       if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-               compat_printk(KERN_INFO
-                             "%s: vm86 mode not supported on 64 bit kernel\n",
-                             me->comm);
-               strncpy(lastcomm, me->comm, sizeof(lastcomm));
-       }
-       return -ENOSYS;
-}
-
 asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
                                   size_t count)
 {
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644 (file)
index 4754ba0..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/* System call table for ia32 emulation. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void compat_ni_syscall(void);
-
-const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
-       /*
-        * Smells like a compiler bug -- it doesn't work
-        * when the & below is removed.
-        */
-       [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
-#include <asm/syscalls_32.h>
-};
index 372231c22a47a46b1417e5c6739d88eb927f89fd..bdf02eeee76519582b0fe9c35b631852b1b417d9 100644 (file)
        .endm
 #endif
 
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
        .long \orig - .
        .long \alt - .
        .word \feature
        .byte \orig_len
        .byte \alt_len
+       .byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+       \oldinstr
+141:
+       .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+       .pushsection .altinstructions,"a"
+       altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+       .popsection
+
+       .pushsection .altinstr_replacement,"ax"
+143:
+       \newinstr
+144:
+       .popsection
+.endm
+
+#define old_len                        141b-140b
+#define new_len1               144f-143f
+#define new_len2               145f-144f
+
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ */
+#define alt_max_short(a, b)    ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+       \oldinstr
+141:
+       .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+               (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+       .pushsection .altinstructions,"a"
+       altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+       altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+       .popsection
+
+       .pushsection .altinstr_replacement,"ax"
+143:
+       \newinstr1
+144:
+       \newinstr2
+145:
+       .popsection
 .endm
 
 #endif  /*  __ASSEMBLY__  */
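The alt_max_short() expression above is the classic branchless max from the referenced bithacks page, and the .skip directives use the same -((diff) > 0) * (diff) trick to emit 0x90 (NOP) padding only when a replacement is longer than the original. A minimal C sketch of the identity (illustration only; in C a comparison yields 0 or 1, whereas gas yields -1 for true, hence the extra negations in the assembler version):

#include <assert.h>

/* Branchless max: the mask is all-ones when a < b, so the XOR swaps in b. */
static long alt_max_short(long a, long b)
{
	return a ^ ((a ^ b) & -(long)(a < b));
}

int main(void)
{
	assert(alt_max_short(3, 7) == 7);
	assert(alt_max_short(7, 3) == 7);
	assert(alt_max_short(5, 5) == 5);
	return 0;
}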
index 473bdbee378a10ac2030b586dc33d3be74de5a11..ba32af062f61d69164a792630e3257c8cdc6deb5 100644 (file)
@@ -48,8 +48,9 @@ struct alt_instr {
        s32 repl_offset;        /* offset to replacement instruction */
        u16 cpuid;              /* cpuid bit set for replacement */
        u8  instrlen;           /* length of original instruction */
-       u8  replacementlen;     /* length of new instruction, <= instrlen */
-};
+       u8  replacementlen;     /* length of new instruction */
+       u8  padlen;             /* length of build-time padding */
+} __packed;
 
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end)
 }
 #endif /* CONFIG_SMP */
 
-#define OLDINSTR(oldinstr)     "661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num)     "664"#num
+#define e_replacement(num)     "665"#num
 
-#define b_replacement(number)  "663"#number
-#define e_replacement(number)  "664"#number
+#define alt_end_marker         "663"
+#define alt_slen               "662b-661b"
+#define alt_pad_len            alt_end_marker"b-662b"
+#define alt_total_slen         alt_end_marker"b-661b"
+#define alt_rlen(num)          e_replacement(num)"f-"b_replacement(num)"f"
 
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num)                                      \
+       "661:\n\t" oldinstr "\n662:\n"                                  \
+       ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * "          \
+               "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
 
-#define ALTINSTR_ENTRY(feature, number)                                              \
+#define OLDINSTR(oldinstr, num)                                                \
+       __OLDINSTR(oldinstr, num)                                       \
+       alt_end_marker ":\n"
+
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas works with s32s.
+ */
+#define alt_max_short(a, b)    "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2) \
+       "661:\n\t" oldinstr "\n662:\n"                                                          \
+       ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * "  \
+               "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n"  \
+       alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num)                                         \
        " .long 661b - .\n"                             /* label           */ \
-       " .long " b_replacement(number)"f - .\n"        /* new instruction */ \
+       " .long " b_replacement(num)"f - .\n"           /* new instruction */ \
        " .word " __stringify(feature) "\n"             /* feature bit     */ \
-       " .byte " alt_slen "\n"                         /* source len      */ \
-       " .byte " alt_rlen(number) "\n"                 /* replacement len */
-
-#define DISCARD_ENTRY(number)                          /* rlen <= slen */    \
-       " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
+       " .byte " alt_total_slen "\n"                   /* source len      */ \
+       " .byte " alt_rlen(num) "\n"                    /* replacement len */ \
+       " .byte " alt_pad_len "\n"                      /* pad len */
 
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number)        /* replacement */     \
-       b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num)   /* replacement */     \
+       b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
 
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature)                       \
-       OLDINSTR(oldinstr)                                              \
+       OLDINSTR(oldinstr, 1)                                           \
        ".pushsection .altinstructions,\"a\"\n"                         \
        ALTINSTR_ENTRY(feature, 1)                                      \
        ".popsection\n"                                                 \
-       ".pushsection .discard,\"aw\",@progbits\n"                      \
-       DISCARD_ENTRY(1)                                                \
-       ".popsection\n"                                                 \
        ".pushsection .altinstr_replacement, \"ax\"\n"                  \
        ALTINSTR_REPLACEMENT(newinstr, feature, 1)                      \
        ".popsection"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
-       OLDINSTR(oldinstr)                                              \
+       OLDINSTR_2(oldinstr, 1, 2)                                      \
        ".pushsection .altinstructions,\"a\"\n"                         \
        ALTINSTR_ENTRY(feature1, 1)                                     \
        ALTINSTR_ENTRY(feature2, 2)                                     \
        ".popsection\n"                                                 \
-       ".pushsection .discard,\"aw\",@progbits\n"                      \
-       DISCARD_ENTRY(1)                                                \
-       DISCARD_ENTRY(2)                                                \
-       ".popsection\n"                                                 \
        ".pushsection .altinstr_replacement, \"ax\"\n"                  \
        ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)                    \
        ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)                    \
@@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alternative(oldinstr, newinstr, feature)                       \
        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+       asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
 /*
  * Alternative inline assembly with input.
  *
index efc3b22d896eb23b7e37cf9c720065c0b6b0c717..976b86a325e55cedfd28c029455ddecb2c06b6be 100644 (file)
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
 {
        volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
-       alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+       alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
                       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
                       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
@@ -204,7 +204,6 @@ extern void clear_local_APIC(void);
 extern void disconnect_bsp_APIC(int virt_wire_setup);
 extern void disable_local_APIC(void);
 extern void lapic_shutdown(void);
-extern int verify_local_APIC(void);
 extern void sync_Arb_IDs(void);
 extern void init_bsp_APIC(void);
 extern void setup_local_APIC(void);
index 2ab1eb33106eec42eff90d27b98cb698b5c4c835..959e45b81fe29192b0f1c97a65e028e7314f603d 100644 (file)
@@ -95,13 +95,11 @@ do {                                                                        \
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
  * code region.
- *
- * (Could use an alternative three way for this if there was one.)
  */
 static __always_inline void rdtsc_barrier(void)
 {
-       alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-       alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+       alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+                         "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif /* _ASM_X86_BARRIER_H */
index 1f1297b46f833ecd7843bf09a9592e0e5e61ec96..1c8b50edb2db3cac8f54c0c300d393b18bf24342 100644 (file)
@@ -55,143 +55,157 @@ For 32-bit we have the following conventions - kernel is built with
  * for assembly code:
  */
 
-#define R15              0
-#define R14              8
-#define R13             16
-#define R12             24
-#define RBP             32
-#define RBX             40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11             48
-#define R10             56
-#define R9              64
-#define R8              72
-#define RAX             80
-#define RCX             88
-#define RDX             96
-#define RSI            104
-#define RDI            112
-#define ORIG_RAX       120       /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP            128
-#define CS             136
-#define EFLAGS         144
-#define RSP            152
-#define SS             160
-
-#define ARGOFFSET      R11
-
-       .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
-       subq  $9*8+\addskip, %rsp
-       CFI_ADJUST_CFA_OFFSET   9*8+\addskip
-       movq_cfi rdi, 8*8
-       movq_cfi rsi, 7*8
-       movq_cfi rdx, 6*8
-
-       .if \save_rcx
-       movq_cfi rcx, 5*8
-       .endif
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15            0*8
+#define R14            1*8
+#define R13            2*8
+#define R12            3*8
+#define RBP            4*8
+#define RBX            5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11            6*8
+#define R10            7*8
+#define R9             8*8
+#define R8             9*8
+#define RAX            10*8
+#define RCX            11*8
+#define RDX            12*8
+#define RSI            13*8
+#define RDI            14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX       15*8
+/* Return frame for iretq */
+#define RIP            16*8
+#define CS             17*8
+#define EFLAGS         18*8
+#define RSP            19*8
+#define SS             20*8
+
+#define SIZEOF_PTREGS  21*8
+
+       .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
+       subq    $15*8+\addskip, %rsp
+       CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+       .endm
 
-       .if \rax_enosys
-       movq $-ENOSYS, 4*8(%rsp)
-       .else
-       movq_cfi rax, 4*8
+       .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
+       .if \r11
+       movq_cfi r11, 6*8+\offset
        .endif
-
-       .if \save_r891011
-       movq_cfi r8,  3*8
-       movq_cfi r9,  2*8
-       movq_cfi r10, 1*8
-       movq_cfi r11, 0*8
+       .if \r8910
+       movq_cfi r10, 7*8+\offset
+       movq_cfi r9,  8*8+\offset
+       movq_cfi r8,  9*8+\offset
+       .endif
+       .if \rax
+       movq_cfi rax, 10*8+\offset
+       .endif
+       .if \rcx
+       movq_cfi rcx, 11*8+\offset
        .endif
+       movq_cfi rdx, 12*8+\offset
+       movq_cfi rsi, 13*8+\offset
+       movq_cfi rdi, 14*8+\offset
+       .endm
+       .macro SAVE_C_REGS offset=0
+       SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
+       .endm
+       .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
+       SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
+       .endm
+       .macro SAVE_C_REGS_EXCEPT_R891011
+       SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
+       .endm
+       .macro SAVE_C_REGS_EXCEPT_RCX_R891011
+       SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
+       .endm
+       .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
+       SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
+       .endm
+
+       .macro SAVE_EXTRA_REGS offset=0
+       movq_cfi r15, 0*8+\offset
+       movq_cfi r14, 1*8+\offset
+       movq_cfi r13, 2*8+\offset
+       movq_cfi r12, 3*8+\offset
+       movq_cfi rbp, 4*8+\offset
+       movq_cfi rbx, 5*8+\offset
+       .endm
+       .macro SAVE_EXTRA_REGS_RBP offset=0
+       movq_cfi rbp, 4*8+\offset
+       .endm
 
+       .macro RESTORE_EXTRA_REGS offset=0
+       movq_cfi_restore 0*8+\offset, r15
+       movq_cfi_restore 1*8+\offset, r14
+       movq_cfi_restore 2*8+\offset, r13
+       movq_cfi_restore 3*8+\offset, r12
+       movq_cfi_restore 4*8+\offset, rbp
+       movq_cfi_restore 5*8+\offset, rbx
        .endm
 
-#define ARG_SKIP       (9*8)
+       .macro ZERO_EXTRA_REGS
+       xorl    %r15d, %r15d
+       xorl    %r14d, %r14d
+       xorl    %r13d, %r13d
+       xorl    %r12d, %r12d
+       xorl    %ebp, %ebp
+       xorl    %ebx, %ebx
+       .endm
 
-       .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
-                           rstor_r8910=1, rstor_rdx=1
+       .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
        .if \rstor_r11
-       movq_cfi_restore 0*8, r11
+       movq_cfi_restore 6*8, r11
        .endif
-
        .if \rstor_r8910
-       movq_cfi_restore 1*8, r10
-       movq_cfi_restore 2*8, r9
-       movq_cfi_restore 3*8, r8
+       movq_cfi_restore 7*8, r10
+       movq_cfi_restore 8*8, r9
+       movq_cfi_restore 9*8, r8
        .endif
-
        .if \rstor_rax
-       movq_cfi_restore 4*8, rax
+       movq_cfi_restore 10*8, rax
        .endif
-
        .if \rstor_rcx
-       movq_cfi_restore 5*8, rcx
+       movq_cfi_restore 11*8, rcx
        .endif
-
        .if \rstor_rdx
-       movq_cfi_restore 6*8, rdx
-       .endif
-
-       movq_cfi_restore 7*8, rsi
-       movq_cfi_restore 8*8, rdi
-
-       .if ARG_SKIP+\addskip > 0
-       addq $ARG_SKIP+\addskip, %rsp
-       CFI_ADJUST_CFA_OFFSET   -(ARG_SKIP+\addskip)
+       movq_cfi_restore 12*8, rdx
        .endif
+       movq_cfi_restore 13*8, rsi
+       movq_cfi_restore 14*8, rdi
        .endm
-
-       .macro LOAD_ARGS offset, skiprax=0
-       movq \offset(%rsp),    %r11
-       movq \offset+8(%rsp),  %r10
-       movq \offset+16(%rsp), %r9
-       movq \offset+24(%rsp), %r8
-       movq \offset+40(%rsp), %rcx
-       movq \offset+48(%rsp), %rdx
-       movq \offset+56(%rsp), %rsi
-       movq \offset+64(%rsp), %rdi
-       .if \skiprax
-       .else
-       movq \offset+72(%rsp), %rax
-       .endif
+       .macro RESTORE_C_REGS
+       RESTORE_C_REGS_HELPER 1,1,1,1,1
        .endm
-
-#define REST_SKIP      (6*8)
-
-       .macro SAVE_REST
-       subq $REST_SKIP, %rsp
-       CFI_ADJUST_CFA_OFFSET   REST_SKIP
-       movq_cfi rbx, 5*8
-       movq_cfi rbp, 4*8
-       movq_cfi r12, 3*8
-       movq_cfi r13, 2*8
-       movq_cfi r14, 1*8
-       movq_cfi r15, 0*8
+       .macro RESTORE_C_REGS_EXCEPT_RAX
+       RESTORE_C_REGS_HELPER 0,1,1,1,1
        .endm
-
-       .macro RESTORE_REST
-       movq_cfi_restore 0*8, r15
-       movq_cfi_restore 1*8, r14
-       movq_cfi_restore 2*8, r13
-       movq_cfi_restore 3*8, r12
-       movq_cfi_restore 4*8, rbp
-       movq_cfi_restore 5*8, rbx
-       addq $REST_SKIP, %rsp
-       CFI_ADJUST_CFA_OFFSET   -(REST_SKIP)
+       .macro RESTORE_C_REGS_EXCEPT_RCX
+       RESTORE_C_REGS_HELPER 1,0,1,1,1
        .endm
-
-       .macro SAVE_ALL
-       SAVE_ARGS
-       SAVE_REST
+       .macro RESTORE_C_REGS_EXCEPT_R11
+       RESTORE_C_REGS_HELPER 1,1,0,1,1
+       .endm
+       .macro RESTORE_C_REGS_EXCEPT_RCX_R11
+       RESTORE_C_REGS_HELPER 1,0,0,1,1
+       .endm
+       .macro RESTORE_RSI_RDI
+       RESTORE_C_REGS_HELPER 0,0,0,0,0
+       .endm
+       .macro RESTORE_RSI_RDI_RDX
+       RESTORE_C_REGS_HELPER 0,0,0,0,1
        .endm
 
-       .macro RESTORE_ALL addskip=0
-       RESTORE_REST
-       RESTORE_ARGS 1, \addskip
+       .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+       addq $15*8+\addskip, %rsp
+       CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
        .endm
 
        .macro icebp
@@ -210,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with
  */
 
        .macro SAVE_ALL
-       pushl_cfi %eax
-       CFI_REL_OFFSET eax, 0
-       pushl_cfi %ebp
-       CFI_REL_OFFSET ebp, 0
-       pushl_cfi %edi
-       CFI_REL_OFFSET edi, 0
-       pushl_cfi %esi
-       CFI_REL_OFFSET esi, 0
-       pushl_cfi %edx
-       CFI_REL_OFFSET edx, 0
-       pushl_cfi %ecx
-       CFI_REL_OFFSET ecx, 0
-       pushl_cfi %ebx
-       CFI_REL_OFFSET ebx, 0
+       pushl_cfi_reg eax
+       pushl_cfi_reg ebp
+       pushl_cfi_reg edi
+       pushl_cfi_reg esi
+       pushl_cfi_reg edx
+       pushl_cfi_reg ecx
+       pushl_cfi_reg ebx
        .endm
 
        .macro RESTORE_ALL
-       popl_cfi %ebx
-       CFI_RESTORE ebx
-       popl_cfi %ecx
-       CFI_RESTORE ecx
-       popl_cfi %edx
-       CFI_RESTORE edx
-       popl_cfi %esi
-       CFI_RESTORE esi
-       popl_cfi %edi
-       CFI_RESTORE edi
-       popl_cfi %ebp
-       CFI_RESTORE ebp
-       popl_cfi %eax
-       CFI_RESTORE eax
+       popl_cfi_reg ebx
+       popl_cfi_reg ecx
+       popl_cfi_reg edx
+       popl_cfi_reg esi
+       popl_cfi_reg edi
+       popl_cfi_reg ebp
+       popl_cfi_reg eax
        .endm
 
 #endif /* CONFIG_X86_64 */
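The R15..SS offsets and SIZEOF_PTREGS above are meant to mirror struct pt_regs as it sits on the kernel stack. A standalone C sketch of that layout (simplified illustration with hypothetical names, not the kernel header):

#include <stddef.h>

struct pt_regs_sketch {
	/* callee-preserved; only saved when a full pt_regs is needed */
	unsigned long r15, r14, r13, r12, bp, bx;
	/* callee-clobbered; always saved on kernel entry */
	unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;
	/* syscall number, error code, or IRQ number */
	unsigned long orig_ax;
	/* hardware iret frame */
	unsigned long ip, cs, flags, sp, ss;
};

_Static_assert(offsetof(struct pt_regs_sketch, ip) == 16 * 8, "RIP offset");
_Static_assert(offsetof(struct pt_regs_sketch, ss) == 20 * 8, "SS offset");
_Static_assert(sizeof(struct pt_regs_sketch) == 21 * 8, "SIZEOF_PTREGS");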
index 59c6c401f79f16d9b98533275d66d437cbaeff0c..acdee09228b30e020332f0beb7e55e58e65b718e 100644 (file)
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
                sp = task_pt_regs(current)->sp;
        } else {
                /* -128 for the x32 ABI redzone */
-               sp = this_cpu_read(old_rsp) - 128;
+               sp = task_pt_regs(current)->sp - 128;
        }
 
        return (void __user *)round_down(sp - len, 16);
index 90a54851aedc98b29c65856986ade222818381b8..854c04b3c9c264ff84e84508e6a8d5fbf1ff2aa8 100644 (file)
 #define X86_FEATURE_RDSEED     ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX                ( 9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP       ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT    ( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB       ( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
 #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
@@ -418,6 +420,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
                         " .word %P0\n"         /* 1: do replace */
                         " .byte 2b - 1b\n"     /* source len */
                         " .byte 0\n"           /* replacement len */
+                        " .byte 0\n"           /* pad len */
                         ".previous\n"
                         /* skipping size check since replacement size = 0 */
                         : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +435,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
                         " .word %P0\n"         /* feature bit */
                         " .byte 2b - 1b\n"     /* source len */
                         " .byte 0\n"           /* replacement len */
+                        " .byte 0\n"           /* pad len */
                         ".previous\n"
                         /* skipping size check since replacement size = 0 */
                         : : "i" (bit) : : t_no);
@@ -457,6 +461,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
                             " .word %P1\n"             /* feature bit */
                             " .byte 2b - 1b\n"         /* source len */
                             " .byte 4f - 3f\n"         /* replacement len */
+                            " .byte 0\n"               /* pad len */
                             ".previous\n"
                             ".section .discard,\"aw\",@progbits\n"
                             " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -483,31 +488,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 {
 #ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
-               asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+               asm_volatile_goto("1: jmp %l[t_dynamic]\n"
                         "2:\n"
+                        ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+                                "((5f-4f) - (2b-1b)),0x90\n"
+                        "3:\n"
                         ".section .altinstructions,\"a\"\n"
                         " .long 1b - .\n"              /* src offset */
-                        " .long 3f - .\n"              /* repl offset */
+                        " .long 4f - .\n"              /* repl offset */
                         " .word %P1\n"                 /* always replace */
-                        " .byte 2b - 1b\n"             /* src len */
-                        " .byte 4f - 3f\n"             /* repl len */
+                        " .byte 3b - 1b\n"             /* src len */
+                        " .byte 5f - 4f\n"             /* repl len */
+                        " .byte 3b - 2b\n"             /* pad len */
                         ".previous\n"
                         ".section .altinstr_replacement,\"ax\"\n"
-                        "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
-                        "4:\n"
+                        "4: jmp %l[t_no]\n"
+                        "5:\n"
                         ".previous\n"
                         ".section .altinstructions,\"a\"\n"
                         " .long 1b - .\n"              /* src offset */
                         " .long 0\n"                   /* no replacement */
                         " .word %P0\n"                 /* feature bit */
-                        " .byte 2b - 1b\n"             /* src len */
+                        " .byte 3b - 1b\n"             /* src len */
                         " .byte 0\n"                   /* repl len */
+                        " .byte 0\n"                   /* pad len */
                         ".previous\n"
                         : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
                         : : t_dynamic, t_no);
@@ -527,6 +531,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
                             " .word %P2\n"             /* always replace */
                             " .byte 2b - 1b\n"         /* source len */
                             " .byte 4f - 3f\n"         /* replacement len */
+                            " .byte 0\n"               /* pad len */
                             ".previous\n"
                             ".section .discard,\"aw\",@progbits\n"
                             " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +546,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
                             " .word %P1\n"             /* feature bit */
                             " .byte 4b - 3b\n"         /* src len */
                             " .byte 6f - 5f\n"         /* repl len */
+                            " .byte 0\n"               /* pad len */
                             ".previous\n"
                             ".section .discard,\"aw\",@progbits\n"
                             " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
index a94b82e8f156f3888e0ab90ac879e39dd05ccec1..a0bf89fd26470102f6f38031808437f01afe8e1f 100644 (file)
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
  * Pentium F0 0F bugfix can have resulted in the mapped
  * IDT being write-protected.
  */
-#define set_intr_gate(n, addr)                                         \
+#define set_intr_gate_notrace(n, addr)                                 \
        do {                                                            \
                BUG_ON((unsigned)n > 0xFF);                             \
                _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0,        \
                          __KERNEL_CS);                                 \
+       } while (0)
+
+#define set_intr_gate(n, addr)                                         \
+       do {                                                            \
+               set_intr_gate_notrace(n, addr);                         \
                _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
                                0, 0, __KERNEL_CS);                     \
        } while (0)
index f6f15986df6ca0f097cf245f04da7a6c123ad3b7..de1cdaf4d74346040667da81e767a5a32697c7c6 100644 (file)
        CFI_ADJUST_CFA_OFFSET 8
        .endm
 
+       .macro pushq_cfi_reg reg
+       pushq %\reg
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET \reg, 0
+       .endm
+
        .macro popq_cfi reg
        popq \reg
        CFI_ADJUST_CFA_OFFSET -8
        .endm
 
+       .macro popq_cfi_reg reg
+       popq %\reg
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE \reg
+       .endm
+
        .macro pushfq_cfi
        pushfq
        CFI_ADJUST_CFA_OFFSET 8
        CFI_ADJUST_CFA_OFFSET 4
        .endm
 
+       .macro pushl_cfi_reg reg
+       pushl %\reg
+       CFI_ADJUST_CFA_OFFSET 4
+       CFI_REL_OFFSET \reg, 0
+       .endm
+
        .macro popl_cfi reg
        popl \reg
        CFI_ADJUST_CFA_OFFSET -4
        .endm
 
+       .macro popl_cfi_reg reg
+       popl %\reg
+       CFI_ADJUST_CFA_OFFSET -4
+       CFI_RESTORE \reg
+       .endm
+
        .macro pushfl_cfi
        pushfl
        CFI_ADJUST_CFA_OFFSET 4
index 25bce45c6fc42f848bf70b56cf5a37884dd18684..3738b138b843d46467c75a910d916cc79ebad25f 100644 (file)
@@ -2,6 +2,8 @@
 #define _ASM_X86_EFI_H
 
 #include <asm/i387.h>
+#include <asm/pgtable.h>
+
 /*
  * We map the EFI regions needed for runtime services non-contiguously,
  * with preserved alignment on virtual addresses starting from -4G down
@@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
 extern struct efi_scratch efi_scratch;
 extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
-extern void __init efi_call_phys_prolog(void);
-extern void __init efi_call_phys_epilog(void);
+extern pgd_t * __init efi_call_phys_prolog(void);
+extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
index ca3347a9dab5211399e9e93a5e53d71ca1943095..935588d95c82fa35b68e3638d9acfe0a2734b373 100644 (file)
@@ -171,10 +171,11 @@ do {                                              \
 static inline void elf_common_init(struct thread_struct *t,
                                   struct pt_regs *regs, const u16 ds)
 {
-       regs->ax = regs->bx = regs->cx = regs->dx = 0;
-       regs->si = regs->di = regs->bp = 0;
+       /* Commented-out registers are cleared in stub_execve */
+       /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
+       regs->si = regs->di /*= regs->bp*/ = 0;
        regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
-       regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+       /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
        t->fs = t->gs = 0;
        t->fsindex = t->gsindex = 0;
        t->ds = t->es = ds;
@@ -365,6 +366,7 @@ enum align_flags {
 struct va_alignment {
        int flags;
        unsigned long mask;
+       unsigned long bits;
 } ____cacheline_aligned;
 
 extern struct va_alignment va_align;
index 72ba21a8b5fc2ff8b76c6c3e53b331ca6a558339..da5e96756570be6b1794e20d4161f4a540d1a781 100644 (file)
@@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
+/*
+ * Must be run with preemption disabled: this clears the fpu_owner_task
+ * on this CPU.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, its state will still be saved.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+       per_cpu(fpu_owner_task, cpu) = NULL;
+}
+
+/*
+ * Used to indicate that the FPU state in memory is newer than the FPU
+ * state in registers, and the FPU state should be reloaded next time the
+ * task is run. Only safe on the current task, or non-running tasks.
+ */
+static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
+{
+       tsk->thread.fpu.last_cpu = ~0;
+}
+
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+       return new == this_cpu_read_stable(fpu_owner_task) &&
+               cpu == new->thread.fpu.last_cpu;
+}
+
 static inline int is_ia32_compat_frame(void)
 {
        return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void)
 
 static inline void fx_finit(struct i387_fxsave_struct *fx)
 {
-       memset(fx, 0, xstate_size);
        fx->cwd = 0x37f;
        fx->mxcsr = MXCSR_DEFAULT;
 }
@@ -351,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
        __thread_set_has_fpu(tsk);
 }
 
-static inline void __drop_fpu(struct task_struct *tsk)
+static inline void drop_fpu(struct task_struct *tsk)
 {
+       /*
+        * Forget coprocessor state..
+        */
+       preempt_disable();
+       tsk->thread.fpu_counter = 0;
+
        if (__thread_has_fpu(tsk)) {
                /* Ignore delayed exceptions from user space */
                asm volatile("1: fwait\n"
@@ -360,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk)
                             _ASM_EXTABLE(1b, 2b));
                __thread_fpu_end(tsk);
        }
-}
 
-static inline void drop_fpu(struct task_struct *tsk)
-{
-       /*
-        * Forget coprocessor state..
-        */
-       preempt_disable();
-       tsk->thread.fpu_counter = 0;
-       __drop_fpu(tsk);
        clear_stopped_child_used_math(tsk);
        preempt_enable();
 }
 
-static inline void drop_init_fpu(struct task_struct *tsk)
+static inline void restore_init_xstate(void)
+{
+       if (use_xsave())
+               xrstor_state(init_xstate_buf, -1);
+       else
+               fxrstor_checking(&init_xstate_buf->i387);
+}
+
+/*
+ * Reset the FPU state in the eager case and drop it in the lazy case (later use
+ * will reinit it).
+ */
+static inline void fpu_reset_state(struct task_struct *tsk)
 {
        if (!use_eager_fpu())
                drop_fpu(tsk);
-       else {
-               if (use_xsave())
-                       xrstor_state(init_xstate_buf, -1);
-               else
-                       fxrstor_checking(&init_xstate_buf->i387);
-       }
+       else
+               restore_init_xstate();
 }
 
 /*
@@ -400,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk)
  */
 typedef struct { int preload; } fpu_switch_t;
 
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
-       per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-       return new == this_cpu_read_stable(fpu_owner_task) &&
-               cpu == new->thread.fpu.last_cpu;
-}
-
 static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
 {
        fpu_switch_t fpu;
@@ -426,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
         * If the task has used the math, pre-load the FPU on xsave processors
         * or if the past 5 consecutive context-switches used math.
         */
-       fpu.preload = tsk_used_math(new) && (use_eager_fpu() ||
-                                            new->thread.fpu_counter > 5);
+       fpu.preload = tsk_used_math(new) &&
+                     (use_eager_fpu() || new->thread.fpu_counter > 5);
+
        if (__thread_has_fpu(old)) {
                if (!__save_init_fpu(old))
-                       cpu = ~0;
-               old->thread.fpu.last_cpu = cpu;
-               old->thread.fpu.has_fpu = 0;    /* But leave fpu_owner_task! */
+                       task_disable_lazy_fpu_restore(old);
+               else
+                       old->thread.fpu.last_cpu = cpu;
+
+               /* But leave fpu_owner_task! */
+               old->thread.fpu.has_fpu = 0;
 
                /* Don't change CR0.TS if we just switch! */
                if (fpu.preload) {
@@ -443,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
                        stts();
        } else {
                old->thread.fpu_counter = 0;
-               old->thread.fpu.last_cpu = ~0;
+               task_disable_lazy_fpu_restore(old);
                if (fpu.preload) {
                        new->thread.fpu_counter++;
-                       if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
+                       if (fpu_lazy_restore(new, cpu))
                                fpu.preload = 0;
                        else
                                prefetch(new->thread.fpu.state);
@@ -466,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
 {
        if (fpu.preload) {
                if (unlikely(restore_fpu_checking(new)))
-                       drop_init_fpu(new);
+                       fpu_reset_state(new);
        }
 }
 
@@ -495,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
 }
 
 /*
- * Need to be preemption-safe.
+ * Needs to be preemption-safe.
  *
  * NOTE! user_fpu_begin() must be used only immediately before restoring
- * it. This function does not do any save/restore on their own.
+ * the saved state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception;
+ * the task can lose the FPU right after preempt_enable().
  */
 static inline void user_fpu_begin(void)
 {
@@ -519,24 +539,6 @@ static inline void __save_fpu(struct task_struct *tsk)
                fpu_fxsave(&tsk->thread.fpu);
 }
 
-/*
- * These disable preemption on their own and are safe
- */
-static inline void save_init_fpu(struct task_struct *tsk)
-{
-       WARN_ON_ONCE(!__thread_has_fpu(tsk));
-
-       if (use_eager_fpu()) {
-               __save_fpu(tsk);
-               return;
-       }
-
-       preempt_disable();
-       __save_init_fpu(tsk);
-       __thread_fpu_end(tsk);
-       preempt_enable();
-}
-
 /*
  * i387 state interaction
  */
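
For illustration, a small userspace model of the lazy-restore bookkeeping the i387.h hunks above introduce; the struct, the two-CPU owner array and main() are hypothetical stand-ins for the per-CPU fpu_owner_task and thread.fpu.last_cpu fields, not kernel code:

#include <stdio.h>

struct task {
        int last_cpu;                   /* ~0 means "in-memory state is newer" */
};

static struct task *fpu_owner[2];       /* models per-CPU fpu_owner_task */

static void task_disable_lazy_fpu_restore(struct task *t)
{
        t->last_cpu = ~0;               /* force a reload on next use */
}

static int fpu_lazy_restore(struct task *t, int cpu)
{
        /* Registers are still valid only if this task owns them on this very CPU. */
        return t == fpu_owner[cpu] && cpu == t->last_cpu;
}

int main(void)
{
        struct task a = { .last_cpu = 0 };

        fpu_owner[0] = &a;
        printf("still owner on cpu 0: %d\n", fpu_lazy_restore(&a, 0));  /* 1 */

        task_disable_lazy_fpu_restore(&a);      /* e.g. after modifying saved state */
        printf("after disable:        %d\n", fpu_lazy_restore(&a, 0));  /* 0 */
        return 0;
}
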
index 9662290e0b2075ab42608af776abbe4a4219b6fd..e9571ddabc4feb821ae04d47c9d6c3b509178344 100644 (file)
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
 extern __visible void smp_invalidate_interrupt(struct pt_regs *);
 #endif
 
-extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR
-                                   - FIRST_EXTERNAL_VECTOR])(void);
+extern char irq_entries_start[];
 #ifdef CONFIG_TRACING
-#define trace_interrupt interrupt
+#define trace_irq_entries_start irq_entries_start
 #endif
 
 #define VECTOR_UNDEFINED       (-1)
index 47f29b1d18464aa6870ce6add18d5f37af2c28fb..e7814b74caf8235c985d5f49492aa0656cfac053 100644 (file)
@@ -69,7 +69,7 @@ struct insn {
        const insn_byte_t *next_byte;
 };
 
-#define MAX_INSN_SIZE  16
+#define MAX_INSN_SIZE  15
 
 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
 #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
index f42a04735a0a66c7f15cdc8640308861902c753d..e37d6b3ad9831bb05412114bac11f1cd0bf0bac3 100644 (file)
@@ -79,11 +79,12 @@ struct iommu_table_entry {
  *  d). Similar to the 'init', except that this gets called from pci_iommu_init
  *      where we do have a memory allocator.
  *
- * The standard vs the _FINISH differs in that the _FINISH variant will
- * continue detecting other IOMMUs in the call list after the
- * the detection routine returns a positive number. The _FINISH will
- * stop the execution chain. Both will still call the 'init' and
- * 'late_init' functions if they are set.
+ * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
+ * in that the former will continue detecting other IOMMUs in the call
+ * list after the detection routine returns a positive number, while the
+ * latter will stop the execution chain upon first successful detection.
+ * Both variants will still call the 'init' and 'late_init' functions if
+ * they are set.
  */
 #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init)         \
        __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
index 0a8b519226b8feb37368ffbc4ca81011bc031fde..b77f5edb03b0c02dc2047d52d07c9da447dba209 100644 (file)
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
 #define USERGS_SYSRET32                                \
        swapgs;                                 \
        sysretl
-#define ENABLE_INTERRUPTS_SYSEXIT32            \
-       swapgs;                                 \
-       sti;                                    \
-       sysexit
 
 #else
 #define INTERRUPT_RETURN               iret
@@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void)
 
        return arch_irqs_disabled_flags(flags);
 }
+#endif /* !__ASSEMBLY__ */
 
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_TRACE_IRQFLAGS
+#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk;
 #else
-
-#ifdef CONFIG_X86_64
-#define ARCH_LOCKDEP_SYS_EXIT          call lockdep_sys_exit_thunk
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ      \
+#  define TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#  ifdef CONFIG_X86_64
+#    define LOCKDEP_SYS_EXIT           call lockdep_sys_exit_thunk
+#    define LOCKDEP_SYS_EXIT_IRQ \
        TRACE_IRQS_ON; \
        sti; \
-       SAVE_REST; \
-       LOCKDEP_SYS_EXIT; \
-       RESTORE_REST; \
+       call lockdep_sys_exit_thunk; \
        cli; \
        TRACE_IRQS_OFF;
-
-#else
-#define ARCH_LOCKDEP_SYS_EXIT                  \
+#  else
+#    define LOCKDEP_SYS_EXIT \
        pushl %eax;                             \
        pushl %ecx;                             \
        pushl %edx;                             \
@@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void)
        popl %edx;                              \
        popl %ecx;                              \
        popl %eax;
-
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ
-#endif
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk;
-#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk;
+#    define LOCKDEP_SYS_EXIT_IRQ
+#  endif
 #else
-#  define TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#  define LOCKDEP_SYS_EXIT     ARCH_LOCKDEP_SYS_EXIT
-#  define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
-# else
 #  define LOCKDEP_SYS_EXIT
 #  define LOCKDEP_SYS_EXIT_IRQ
-# endif
-
+#endif
 #endif /* __ASSEMBLY__ */
+
 #endif
index 6a2cefb4395a4228cce550ef8b231f3e7158d9d1..a4c1cf7e93f812e85fb56d3f858547192c028c58 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_JUMP_LABEL_H
 #define _ASM_X86_JUMP_LABEL_H
 
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
 #include <linux/stringify.h>
 #include <linux/types.h>
@@ -30,8 +30,6 @@ l_yes:
        return true;
 }
 
-#endif /* __KERNEL__ */
-
 #ifdef CONFIG_X86_64
 typedef u64 jump_label_t;
 #else
@@ -44,4 +42,5 @@ struct jump_entry {
        jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
index a236e39cc385a4a468e745ce0d9fdb1e2d08341b..dea2e7e962e3e0648c9ecaaaffc5cb723b32f299 100644 (file)
@@ -81,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
                (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
-#define IOPL_SHIFT 12
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
@@ -345,6 +340,7 @@ struct kvm_pmu {
 enum {
        KVM_DEBUGREG_BP_ENABLED = 1,
        KVM_DEBUGREG_WONT_EXIT = 2,
+       KVM_DEBUGREG_RELOAD = 4,
 };
 
 struct kvm_vcpu_arch {
@@ -431,6 +427,9 @@ struct kvm_vcpu_arch {
 
        int cpuid_nent;
        struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+
+       int maxphyaddr;
+
        /* emulate context */
 
        struct x86_emulate_ctxt emulate_ctxt;
@@ -550,11 +549,20 @@ struct kvm_arch_memory_slot {
        struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 
+/*
+ * As the 'mode' we use the number of bits allocated in the LDR for the
+ * logical processor ID.  It happens that these are all powers of two.
+ * This makes it very easy to detect cases where the APICs are
+ * configured for multiple modes; in that case, we cannot use the map and
+ * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ */
+#define KVM_APIC_MODE_XAPIC_CLUSTER          4
+#define KVM_APIC_MODE_XAPIC_FLAT             8
+#define KVM_APIC_MODE_X2APIC                16
+
 struct kvm_apic_map {
        struct rcu_head rcu;
-       u8 ldr_bits;
-       /* fields bellow are used to decode ldr values in different modes */
-       u32 cid_shift, cid_mask, lid_mask, broadcast;
+       u8 mode;
        struct kvm_lapic *phys_map[256];
        /* first index is cluster id second is cpu id in a cluster */
        struct kvm_lapic *logical_map[16][16];
@@ -859,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot);
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                       struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -933,6 +943,7 @@ struct x86_emulate_ctxt;
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -1128,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
index e62cf897f7819bc9795ab096a15689b3543e07b2..c1adf33fdd0d6f70f055b9a056bc7787bda7635e 100644 (file)
@@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void)
 
 static inline bool kvm_para_available(void)
 {
-       return 0;
+       return false;
 }
 
 static inline unsigned int kvm_arch_para_features(void)
index 9b3de99dc0044a8b6ccdba0e7523f5b1e8425c03..1f5a86d518db379ea65c100158df0c60988bb810 100644 (file)
@@ -116,6 +116,12 @@ struct mca_config {
        u32 rip_msr;
 };
 
+struct mce_vendor_flags {
+       __u64           overflow_recov  : 1, /* cpuid_ebx(80000007) */
+                       __reserved_0    : 63;
+};
+extern struct mce_vendor_flags mce_flags;
+
 extern struct mca_config mca_cfg;
 extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
 #else
 static inline int mcheck_init(void) { return 0; }
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
@@ -183,11 +191,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 
 enum mcp_flags {
-       MCP_TIMESTAMP = (1 << 0),       /* log time stamp */
-       MCP_UC = (1 << 1),              /* log uncorrected errors */
-       MCP_DONTLOG = (1 << 2),         /* only clear, don't log */
+       MCP_TIMESTAMP   = BIT(0),       /* log time stamp */
+       MCP_UC          = BIT(1),       /* log uncorrected errors */
+       MCP_DONTLOG     = BIT(2),       /* only clear, don't log */
 };
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
 
index 201b520521ed74b6e35b63cda531ae58f75ec674..2fb20d6f7e23b0ccace549901dacf89b51e9c381 100644 (file)
@@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {}
 
 #ifdef CONFIG_MICROCODE_EARLY
 #define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx)       \
+               (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * During the early microcode loading phase on the BSP, boot_cpu_data is not
+ * set up yet, so x86_vendor() is used to get the vendor id for the BSP.
+ *
+ * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To
+ * simplify coding, we still use x86_vendor() to get the vendor id for the AP.
+ *
+ * x86_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_vendor(void)
+{
+       u32 eax = 0x00000000;
+       u32 ebx, ecx = 0, edx;
+
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+
+       if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+               return X86_VENDOR_INTEL;
+
+       if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+               return X86_VENDOR_AMD;
+
+       return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int __x86_family(unsigned int sig)
+{
+       unsigned int x86;
+
+       x86 = (sig >> 8) & 0xf;
+
+       if (x86 == 0xf)
+               x86 += (sig >> 20) & 0xff;
+
+       return x86;
+}
+
+static inline unsigned int x86_family(void)
+{
+       u32 eax = 0x00000001;
+       u32 ebx, ecx = 0, edx;
+
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+
+       return __x86_family(eax);
+}
+
+static inline unsigned int x86_model(unsigned int sig)
+{
+       unsigned int x86, model;
+
+       x86 = __x86_family(sig);
+
+       model = (sig >> 4) & 0xf;
+
+       if (x86 == 0x6 || x86 == 0xf)
+               model += ((sig >> 16) & 0xf) << 4;
+
+       return model;
+}
+
 extern void __init load_ucode_bsp(void);
 extern void load_ucode_ap(void);
 extern int __init save_microcode_in_initrd(void);
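
As a worked example of the family/model decode added above, the following standalone program applies the same bit layout to an assumed CPUID signature of 0x000306c3 and prints family 0x6, model 0x3c:

#include <stdio.h>

static unsigned int example_x86_family(unsigned int sig)
{
        unsigned int x86 = (sig >> 8) & 0xf;

        if (x86 == 0xf)
                x86 += (sig >> 20) & 0xff;              /* add extended family */
        return x86;
}

static unsigned int example_x86_model(unsigned int sig)
{
        unsigned int x86 = example_x86_family(sig);
        unsigned int model = (sig >> 4) & 0xf;

        if (x86 == 0x6 || x86 == 0xf)
                model += ((sig >> 16) & 0xf) << 4;      /* add extended model */
        return model;
}

int main(void)
{
        unsigned int sig = 0x000306c3;                  /* assumed signature */

        printf("family 0x%x, model 0x%x\n",
               example_x86_family(sig), example_x86_model(sig));
        return 0;
}
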
index dd4c20043ce75be7545e963dd98ba38ce662396e..2b9209c46ca939991abed04a1c5d4ef786b0c698 100644 (file)
@@ -56,12 +56,15 @@ struct extended_sigtable {
 
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-extern int
-get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
+extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
 extern int microcode_sanity_check(void *mc, int print_err);
-extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev);
-extern int
-update_match_revision(struct microcode_header_intel *mc_header, int rev);
+extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
+
+static inline int
+revision_is_newer(struct microcode_header_intel *mc_header, int rev)
+{
+       return (mc_header->rev <= rev) ? 0 : 1;
+}
 
 #ifdef CONFIG_MICROCODE_INTEL_EARLY
 extern void __init load_ucode_intel_bsp(void);
index a1410db38a1a682f758d44f384e2947b39203175..653dfa7662e17aa297b1277a81999854ecbea6e2 100644 (file)
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
                     :: "a" (eax), "c" (ecx));
 }
 
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+       trace_hardirqs_on();
+       /* "mwait %eax, %ecx;" */
+       asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+                    :: "a" (eax), "c" (ecx));
+}
+
 /*
  * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
  * which can obviate IPI to trigger checking of need_resched.
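
A hypothetical kernel-context sketch of how __sti_mwait() is typically paired with __monitor() in an idle loop; the enclosing function and its call site are assumptions, only the two helpers come from this header:

#include <linux/sched.h>
#include <asm/mwait.h>

static void example_mwait_idle(void)
{
        /* Arm the monitor on the thread flags, so a wakeup write exits MWAIT. */
        __monitor(&current_thread_info()->flags, 0, 0);
        smp_mb();
        if (!need_resched())
                __sti_mwait(0, 0);      /* STI immediately before MWAIT closes the race */
        else
                local_irq_enable();
}
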
index 965c47d254aa0f68ea83e4dc35cd983cc9ff789b..5f6051d5d139afb8ed126a5a6883fa43f23a2cde 100644 (file)
@@ -976,11 +976,6 @@ extern void default_banner(void);
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),       \
                  CLBR_NONE,                                            \
                  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
-
-#define ENABLE_INTERRUPTS_SYSEXIT32                                    \
-       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),    \
-                 CLBR_NONE,                                            \
-                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
 #endif /* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
index ec1c93588cefd0c4e3a4705c6d966e2555ba95aa..d2203b5d9538ff700784c088b945c506903fab29 100644 (file)
@@ -210,8 +210,23 @@ struct x86_hw_tss {
        unsigned long           sp0;
        unsigned short          ss0, __ss0h;
        unsigned long           sp1;
-       /* ss1 caches MSR_IA32_SYSENTER_CS: */
-       unsigned short          ss1, __ss1h;
+
+       /*
+        * We don't use ring 1, so ss1 is a convenient scratch space in
+        * the same cacheline as sp0.  We use ss1 to cache the value in
+        * MSR_IA32_SYSENTER_CS.  When we context switch
+        * MSR_IA32_SYSENTER_CS, we first check if the new value being
+        * written matches ss1, and, if it's not, then we wrmsr the new
+        * value and update ss1.
+        *
+        * The only reason we context switch MSR_IA32_SYSENTER_CS is
+        * that we set it to zero in vm86 tasks to avoid corrupting the
+        * stack if we were to go through the sysenter path from vm86
+        * mode.
+        */
+       unsigned short          ss1;    /* MSR_IA32_SYSENTER_CS */
+
+       unsigned short          __ss1h;
        unsigned long           sp2;
        unsigned short          ss2, __ss2h;
        unsigned long           __cr3;
@@ -276,13 +291,17 @@ struct tss_struct {
        unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
 
        /*
-        * .. and then another 0x100 bytes for the emergency kernel stack:
+        * Space for the temporary SYSENTER stack:
         */
-       unsigned long           stack[64];
+       unsigned long           SYSENTER_stack[64];
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#endif
 
 /*
  * Save the original ist values for checking stack pointers during debugging
@@ -474,7 +493,6 @@ struct thread_struct {
 #ifdef CONFIG_X86_32
        unsigned long           sysenter_cs;
 #else
-       unsigned long           usersp; /* Copy from PDA */
        unsigned short          es;
        unsigned short          ds;
        unsigned short          fsindex;
@@ -564,6 +582,16 @@ static inline void native_swapgs(void)
 #endif
 }
 
+static inline unsigned long current_top_of_stack(void)
+{
+#ifdef CONFIG_X86_64
+       return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
+#else
+       /* sp0 on x86_32 is special in and around vm86 mode. */
+       return this_cpu_read_stable(cpu_current_top_of_stack);
+#endif
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -761,10 +789,10 @@ extern char                       ignore_fpu_irq;
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
 #ifdef CONFIG_X86_32
-# define BASE_PREFETCH         ASM_NOP4
+# define BASE_PREFETCH         ""
 # define ARCH_HAS_PREFETCH
 #else
-# define BASE_PREFETCH         "prefetcht0 (%1)"
+# define BASE_PREFETCH         "prefetcht0 %P1"
 #endif
 
 /*
@@ -775,10 +803,9 @@ extern char                        ignore_fpu_irq;
  */
 static inline void prefetch(const void *x)
 {
-       alternative_input(BASE_PREFETCH,
-                         "prefetchnta (%1)",
+       alternative_input(BASE_PREFETCH, "prefetchnta %P1",
                          X86_FEATURE_XMM,
-                         "r" (x));
+                         "m" (*(const char *)x));
 }
 
 /*
@@ -788,10 +815,9 @@ static inline void prefetch(const void *x)
  */
 static inline void prefetchw(const void *x)
 {
-       alternative_input(BASE_PREFETCH,
-                         "prefetchw (%1)",
-                         X86_FEATURE_3DNOW,
-                         "r" (x));
+       alternative_input(BASE_PREFETCH, "prefetchw %P1",
+                         X86_FEATURE_3DNOWPREFETCH,
+                         "m" (*(const char *)x));
 }
 
 static inline void spin_lock_prefetch(const void *x)
@@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x)
        prefetchw(x);
 }
 
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+                          TOP_OF_KERNEL_STACK_PADDING)
+
 #ifdef CONFIG_X86_32
 /*
  * User space process size: 3GB (default).
@@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP_MAX          STACK_TOP
 
 #define INIT_THREAD  {                                                   \
-       .sp0                    = sizeof(init_stack) + (long)&init_stack, \
+       .sp0                    = TOP_OF_INIT_STACK,                      \
        .vm86_info              = NULL,                                   \
        .sysenter_cs            = __KERNEL_CS,                            \
        .io_bitmap_ptr          = NULL,                                   \
 }
 
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS  {                                                      \
-       .x86_tss = {                                                      \
-               .sp0            = sizeof(init_stack) + (long)&init_stack, \
-               .ss0            = __KERNEL_DS,                            \
-               .ss1            = __KERNEL_CS,                            \
-               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,               \
-        },                                                               \
-       .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },       \
-}
-
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
-#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info)                                                 \
-({                                                                     \
-       unsigned long *__ptr = (unsigned long *)(info);                 \
-       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
-})
-
 /*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
  * This is necessary to guarantee that the entire "struct pt_regs"
  * is accessible even if the CPU haven't stored the SS/ESP registers
  * on the stack (interrupt gate does not save these registers
@@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
  * "struct pt_regs" is possible, but they may contain the
  * completely wrong values.
  */
-#define task_pt_regs(task)                                             \
-({                                                                     \
-       struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
-       __regs__ - 1;                                                   \
+#define task_pt_regs(task) \
+({                                                                     \
+       unsigned long __ptr = (unsigned long)task_stack_page(task);     \
+       __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;             \
+       ((struct pt_regs *)__ptr) - 1;                                  \
 })
 
 #define KSTK_ESP(task)         (task_pt_regs(task)->sp)
@@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define STACK_TOP_MAX          TASK_SIZE_MAX
 
 #define INIT_THREAD  { \
-       .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#define INIT_TSS  { \
-       .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+       .sp0 = TOP_OF_INIT_STACK \
 }
 
 /*
@@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define task_pt_regs(tsk)      ((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
-/*
- * User space RSP while inside the SYSCALL fast path
- */
-DECLARE_PER_CPU(unsigned long, old_rsp);
-
 #endif /* CONFIG_X86_64 */
 
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
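
The stack layout arithmetic above can be checked with a small userspace calculation; THREAD_SIZE, the pt_regs size and the base address below are assumed values, only TOP_OF_KERNEL_STACK_PADDING (8 on 32-bit) comes from this series:

#include <stdio.h>

#define THREAD_SIZE                     8192UL  /* assumed: two 4K pages */
#define TOP_OF_KERNEL_STACK_PADDING     8UL     /* x86_32 value from this series */
#define SIZEOF_PT_REGS                  68UL    /* assumed x86_32 pt_regs size */

int main(void)
{
        unsigned long stack_page = 0xc1000000UL;        /* assumed stack base */
        unsigned long top  = stack_page + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        unsigned long regs = top - SIZEOF_PT_REGS;      /* what task_pt_regs() returns */

        printf("top of stack (sp0): %#lx\n", top);
        printf("pt_regs at:         %#lx\n", regs);
        return 0;
}
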
index 86fc2bb82287a687bd8ca0e976e8d32761678480..19507ffa5d28e9ce3ddece3856dd9cde4446f7f8 100644 (file)
@@ -31,13 +31,17 @@ struct pt_regs {
 #else /* __i386__ */
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bp;
        unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
        unsigned long dx;
        unsigned long si;
        unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
        unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
        unsigned long ip;
        unsigned long cs;
        unsigned long flags;
@@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 }
 
 /*
- * user_mode_vm(regs) determines whether a register set came from user mode.
- * This is true if V8086 mode was enabled OR if the register set was from
- * protected mode with RPL-3 CS value.  This tricky test checks that with
- * one comparison.  Many places in the kernel can bypass this full check
- * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ * user_mode(regs) determines whether a register set came from user
+ * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
+ * register set was from protected mode with RPL-3 CS value.  This
+ * tricky test checks that with one comparison.
+ *
+ * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
+ * the extra check.
  */
 static inline int user_mode(struct pt_regs *regs)
 {
@@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
 #endif
 }
 
-static inline int user_mode_vm(struct pt_regs *regs)
-{
-#ifdef CONFIG_X86_32
-       return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
-               USER_RPL;
-#else
-       return user_mode(regs);
-#endif
-}
-
 static inline int v8086_mode(struct pt_regs *regs)
 {
 #ifdef CONFIG_X86_32
@@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 #endif
 }
 
-#define current_user_stack_pointer()   this_cpu_read(old_rsp)
-/* ia32 vs. x32 difference */
-#define compat_user_stack_pointer()    \
-       (test_thread_flag(TIF_IA32)     \
-        ? current_pt_regs()->sp        \
-        : this_cpu_read(old_rsp))
+#define current_user_stack_pointer()   current_pt_regs()->sp
+#define compat_user_stack_pointer()    current_pt_regs()->sp
 #endif
 
 #ifdef CONFIG_X86_32
@@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
  */
 #define arch_ptrace_stop_needed(code, info)                            \
 ({                                                                     \
-       set_thread_flag(TIF_NOTIFY_RESUME);                             \
+       force_iret();                                                   \
        false;                                                          \
 })
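
For illustration of the single-comparison user_mode() test retained above, a standalone program with the relevant constants (assumed from the x86 ABI: RPL mask 0x3, user RPL 3, EFLAGS.VM at bit 17):

#include <stdio.h>

#define SEGMENT_RPL_MASK        0x3UL
#define USER_RPL                0x3UL
#define X86_VM_MASK             (1UL << 17)     /* EFLAGS.VM */

static int example_user_mode(unsigned long cs, unsigned long flags)
{
        /* Either an RPL-3 CS or the VM flag makes the OR-ed value >= 3. */
        return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL;
}

int main(void)
{
        printf("kernel CS 0x10: %d\n", example_user_mode(0x10, 0));             /* 0 */
        printf("user CS 0x73:   %d\n", example_user_mode(0x73, 0));             /* 1 */
        printf("vm86 (VM set):  %d\n", example_user_mode(0x0, X86_VM_MASK));    /* 1 */
        return 0;
}
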
 
index d6b078e9fa28a3f4588237cb9a122f5b5ce53162..25b1cc07d49668c8a40306bf2ec81e4e2a11988e 100644 (file)
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
        struct pvclock_vcpu_time_info pvti;
+       u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
index db257a58571f0b47429e2dcc70cf628f1a968fe4..5a9856eb12bad7edb0f9a333870e331f5677d588 100644 (file)
@@ -3,8 +3,10 @@
 
 #include <linux/const.h>
 
-/* Constructor for a conventional segment GDT (or LDT) entry */
-/* This is a macro so it can be used in initializers */
+/*
+ * Constructor for a conventional segment GDT (or LDT) entry.
+ * This is a macro so it can be used in initializers.
+ */
 #define GDT_ENTRY(flags, base, limit)                  \
        ((((base)  & _AC(0xff000000,ULL)) << (56-24)) | \
         (((flags) & _AC(0x0000f0ff,ULL)) << 40) |      \
         (((base)  & _AC(0x00ffffff,ULL)) << 16) |      \
         (((limit) & _AC(0x0000ffff,ULL))))
 
-/* Simple and small GDT entries for booting only */
+/* Simple and small GDT entries for booting only: */
 
 #define GDT_ENTRY_BOOT_CS      2
-#define __BOOT_CS              (GDT_ENTRY_BOOT_CS * 8)
+#define GDT_ENTRY_BOOT_DS      3
+#define GDT_ENTRY_BOOT_TSS     4
+#define __BOOT_CS              (GDT_ENTRY_BOOT_CS*8)
+#define __BOOT_DS              (GDT_ENTRY_BOOT_DS*8)
+#define __BOOT_TSS             (GDT_ENTRY_BOOT_TSS*8)
+
+/*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_RPL_MASK       0x3
 
-#define GDT_ENTRY_BOOT_DS      (GDT_ENTRY_BOOT_CS + 1)
-#define __BOOT_DS              (GDT_ENTRY_BOOT_DS * 8)
+/* User mode is privilege level 3: */
+#define USER_RPL               0x3
 
-#define GDT_ENTRY_BOOT_TSS     (GDT_ENTRY_BOOT_CS + 2)
-#define __BOOT_TSS             (GDT_ENTRY_BOOT_TSS * 8)
+/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
+#define SEGMENT_TI_MASK                0x4
+/* LDT segment has TI set ... */
+#define SEGMENT_LDT            0x4
+/* ... GDT has it cleared */
+#define SEGMENT_GDT            0x0
 
-#define SEGMENT_RPL_MASK       0x3 /*
-                                    * Bottom two bits of selector give the ring
-                                    * privilege level
-                                    */
-#define SEGMENT_TI_MASK                0x4 /* Bit 2 is table indicator (LDT/GDT) */
-#define USER_RPL               0x3 /* User mode is privilege level 3 */
-#define SEGMENT_LDT            0x4 /* LDT segment has TI set... */
-#define SEGMENT_GDT            0x0 /* ... GDT has it cleared */
+#define GDT_ENTRY_INVALID_SEG  0
 
 #ifdef CONFIG_X86_32
 /*
  * The layout of the per-CPU GDT under Linux:
  *
- *   0 - null
+ *   0 - null                                                          <=== cacheline #1
  *   1 - reserved
  *   2 - reserved
  *   3 - reserved
  *
- *   4 - unused                        <==== new cacheline
+ *   4 - unused                                                                <=== cacheline #2
  *   5 - unused
  *
  *  ------- start of TLS (Thread-Local Storage) segments:
  *
  *   6 - TLS segment #1                        [ glibc's TLS segment ]
  *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
- *   8 - TLS segment #3
+ *   8 - TLS segment #3                                                        <=== cacheline #3
  *   9 - reserved
  *  10 - reserved
  *  11 - reserved
  *
  *  ------- start of kernel segments:
  *
- *  12 - kernel code segment           <==== new cacheline
+ *  12 - kernel code segment                                           <=== cacheline #4
  *  13 - kernel data segment
  *  14 - default user CS
  *  15 - default user DS
- *  16 - TSS
+ *  16 - TSS                                                           <=== cacheline #5
  *  17 - LDT
  *  18 - PNPBIOS support (16->32 gate)
  *  19 - PNPBIOS support
- *  20 - PNPBIOS support
+ *  20 - PNPBIOS support                                               <=== cacheline #6
  *  21 - PNPBIOS support
  *  22 - PNPBIOS support
  *  23 - APM BIOS support
- *  24 - APM BIOS support
+ *  24 - APM BIOS support                                              <=== cacheline #7
  *  25 - APM BIOS support
  *
  *  26 - ESPFIX small SS
  *  27 - per-cpu                       [ offset to per-cpu data area ]
- *  28 - stack_canary-20               [ for stack protector ]
+ *  28 - stack_canary-20               [ for stack protector ]         <=== cacheline #8
  *  29 - unused
  *  30 - unused
  *  31 - TSS for double fault handler
  */
-#define GDT_ENTRY_TLS_MIN      6
-#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_TLS_MIN              6
+#define GDT_ENTRY_TLS_MAX              (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
 
+#define GDT_ENTRY_KERNEL_CS            12
+#define GDT_ENTRY_KERNEL_DS            13
 #define GDT_ENTRY_DEFAULT_USER_CS      14
-
 #define GDT_ENTRY_DEFAULT_USER_DS      15
+#define GDT_ENTRY_TSS                  16
+#define GDT_ENTRY_LDT                  17
+#define GDT_ENTRY_PNPBIOS_CS32         18
+#define GDT_ENTRY_PNPBIOS_CS16         19
+#define GDT_ENTRY_PNPBIOS_DS           20
+#define GDT_ENTRY_PNPBIOS_TS1          21
+#define GDT_ENTRY_PNPBIOS_TS2          22
+#define GDT_ENTRY_APMBIOS_BASE         23
+
+#define GDT_ENTRY_ESPFIX_SS            26
+#define GDT_ENTRY_PERCPU               27
+#define GDT_ENTRY_STACK_CANARY         28
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS      31
 
-#define GDT_ENTRY_KERNEL_BASE          (12)
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES                    32
 
-#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE+0)
+/*
+ * Segment selector values corresponding to the above entries:
+ */
 
-#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE+1)
+#define __KERNEL_CS                    (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS                    (GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS                      (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS                      (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __ESPFIX_SS                    (GDT_ENTRY_ESPFIX_SS*8)
 
-#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE+4)
-#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE+5)
+/* segment for calling fn: */
+#define PNP_CS32                       (GDT_ENTRY_PNPBIOS_CS32*8)
+/* code segment for BIOS: */
+#define PNP_CS16                       (GDT_ENTRY_PNPBIOS_CS16*8)
 
-#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE+6)
-#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE+11)
+/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
+#define SEGMENT_IS_PNP_CODE(x)         (((x) & 0xf4) == PNP_CS32)
 
-#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE+14)
-#define __ESPFIX_SS                    (GDT_ENTRY_ESPFIX_SS*8)
+/* data segment for BIOS: */
+#define PNP_DS                         (GDT_ENTRY_PNPBIOS_DS*8)
+/* transfer data segment: */
+#define PNP_TS1                                (GDT_ENTRY_PNPBIOS_TS1*8)
+/* another data segment: */
+#define PNP_TS2                                (GDT_ENTRY_PNPBIOS_TS2*8)
 
-#define GDT_ENTRY_PERCPU               (GDT_ENTRY_KERNEL_BASE+15)
 #ifdef CONFIG_SMP
-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+# define __KERNEL_PERCPU               (GDT_ENTRY_PERCPU*8)
 #else
-#define __KERNEL_PERCPU 0
+# define __KERNEL_PERCPU               0
 #endif
 
-#define GDT_ENTRY_STACK_CANARY         (GDT_ENTRY_KERNEL_BASE+16)
 #ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY          (GDT_ENTRY_STACK_CANARY*8)
+# define __KERNEL_STACK_CANARY         (GDT_ENTRY_STACK_CANARY*8)
 #else
-#define __KERNEL_STACK_CANARY          0
+# define __KERNEL_STACK_CANARY         0
 #endif
 
-#define GDT_ENTRY_DOUBLEFAULT_TSS      31
-
-/*
- * The GDT has 32 entries
- */
-#define GDT_ENTRIES 32
+#else /* 64-bit: */
 
-/* The PnP BIOS entries in the GDT */
-#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
-#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
-#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
-#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
-#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
-
-/* The PnP BIOS selectors */
-#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
-#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
-#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
-#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
-#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+#include <asm/cache.h>
 
+#define GDT_ENTRY_KERNEL32_CS          1
+#define GDT_ENTRY_KERNEL_CS            2
+#define GDT_ENTRY_KERNEL_DS            3
 
 /*
- * Matching rules for certain types of segments.
+ * We cannot use the same code segment descriptor for user and kernel mode,
+ * not even in long flat mode, because of different DPL.
+ *
+ * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
+ * selectors:
+ *
+ *   if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ *   if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ *
+ * ss = STAR.SYSRET_CS+8 (in either case)
+ *
+ * thus USER_DS should be between 32-bit and 64-bit code selectors:
  */
+#define GDT_ENTRY_DEFAULT_USER32_CS    4
+#define GDT_ENTRY_DEFAULT_USER_DS      5
+#define GDT_ENTRY_DEFAULT_USER_CS      6
 
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
-
+/* Needs two entries */
+#define GDT_ENTRY_TSS                  8
+/* Needs two entries */
+#define GDT_ENTRY_LDT                  10
 
-#else
-#include <asm/cache.h>
-
-#define GDT_ENTRY_KERNEL32_CS 1
-#define GDT_ENTRY_KERNEL_CS 2
-#define GDT_ENTRY_KERNEL_DS 3
+#define GDT_ENTRY_TLS_MIN              12
+#define GDT_ENTRY_TLS_MAX              14
 
-#define __KERNEL32_CS   (GDT_ENTRY_KERNEL32_CS * 8)
+/* Abused to load per CPU data from limit */
+#define GDT_ENTRY_PER_CPU              15
 
 /*
- * we cannot use the same code segment descriptor for user and kernel
- * -- not even in the long flat mode, because of different DPL /kkeil
- * The segment offset needs to contain a RPL. Grr. -AK
- * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ * Number of entries in the GDT table:
  */
-#define GDT_ENTRY_DEFAULT_USER32_CS 4
-#define GDT_ENTRY_DEFAULT_USER_DS 5
-#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS   (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
-#define __USER32_DS    __USER_DS
-
-#define GDT_ENTRY_TSS 8        /* needs two entries */
-#define GDT_ENTRY_LDT 10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN 12
-#define GDT_ENTRY_TLS_MAX 14
-
-#define GDT_ENTRY_PER_CPU 15   /* Abused to load per CPU data from limit */
-#define __PER_CPU_SEG  (GDT_ENTRY_PER_CPU * 8 + 3)
+#define GDT_ENTRIES                    16
 
-/* TLS indexes for 64bit - hardcoded in arch_prctl */
-#define FS_TLS 0
-#define GS_TLS 1
-
-#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
-#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
-
-#define GDT_ENTRIES 16
+/*
+ * Segment selector values corresponding to the above entries:
+ *
+ * Note, selectors also need to have a correct RPL,
+ * expressed with the +3 value for user-space selectors:
+ */
+#define __KERNEL32_CS                  (GDT_ENTRY_KERNEL32_CS*8)
+#define __KERNEL_CS                    (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS                    (GDT_ENTRY_KERNEL_DS*8)
+#define __USER32_CS                    (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+#define __USER_DS                      (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER32_DS                    __USER_DS
+#define __USER_CS                      (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __PER_CPU_SEG                  (GDT_ENTRY_PER_CPU*8 + 3)
+
+/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
+#define FS_TLS                         0
+#define GS_TLS                         1
+
+#define GS_TLS_SEL                     ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL                     ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
 
 #endif
 
-#define __KERNEL_CS    (GDT_ENTRY_KERNEL_CS*8)
-#define __KERNEL_DS    (GDT_ENTRY_KERNEL_DS*8)
-#define __USER_DS      (GDT_ENTRY_DEFAULT_USER_DS*8+3)
-#define __USER_CS      (GDT_ENTRY_DEFAULT_USER_CS*8+3)
 #ifndef CONFIG_PARAVIRT
-#define get_kernel_rpl()  0
+# define get_kernel_rpl()              0
 #endif
 
-#define IDT_ENTRIES 256
-#define NUM_EXCEPTION_VECTORS 32
-/* Bitmask of exception vectors which push an error code on the stack */
-#define EXCEPTION_ERRCODE_MASK  0x00027d00
-#define GDT_SIZE (GDT_ENTRIES * 8)
-#define GDT_ENTRY_TLS_ENTRIES 3
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+#define IDT_ENTRIES                    256
+#define NUM_EXCEPTION_VECTORS          32
+
+/* Bitmask of exception vectors which push an error code on the stack: */
+#define EXCEPTION_ERRCODE_MASK         0x00027d00
+
+#define GDT_SIZE                       (GDT_ENTRIES*8)
+#define GDT_ENTRY_TLS_ENTRIES          3
+#define TLS_SIZE                       (GDT_ENTRY_TLS_ENTRIES* 8)
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
+
 extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
 #ifdef CONFIG_TRACING
-#define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handlers early_idt_handlers
 #endif
 
 /*
@@ -228,37 +260,30 @@ do {                                                                      \
 } while (0)
 
 /*
- * Save a segment register away
+ * Save a segment register away:
  */
 #define savesegment(seg, value)                                \
        asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
 
 /*
- * x86_32 user gs accessors.
+ * x86-32 user GS accessors:
  */
 #ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs)      (u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v)   loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk)      ((tsk)->thread.gs)
-#define lazy_save_gs(v)                savesegment(gs, (v))
-#define lazy_load_gs(v)                loadsegment(gs, (v))
-#else  /* X86_32_LAZY_GS */
-#define get_user_gs(regs)      (u16)((regs)->gs)
-#define set_user_gs(regs, v)   do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk)      (task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v)                do { } while (0)
-#define lazy_load_gs(v)                do { } while (0)
-#endif /* X86_32_LAZY_GS */
+# ifdef CONFIG_X86_32_LAZY_GS
+#  define get_user_gs(regs)            (u16)({ unsigned long v; savesegment(gs, v); v; })
+#  define set_user_gs(regs, v)         loadsegment(gs, (unsigned long)(v))
+#  define task_user_gs(tsk)            ((tsk)->thread.gs)
+#  define lazy_save_gs(v)              savesegment(gs, (v))
+#  define lazy_load_gs(v)              loadsegment(gs, (v))
+# else /* X86_32_LAZY_GS */
+#  define get_user_gs(regs)            (u16)((regs)->gs)
+#  define set_user_gs(regs, v)         do { (regs)->gs = (v); } while (0)
+#  define task_user_gs(tsk)            (task_pt_regs(tsk)->gs)
+#  define lazy_save_gs(v)              do { } while (0)
+#  define lazy_load_gs(v)              do { } while (0)
+# endif        /* X86_32_LAZY_GS */
 #endif /* X86_32 */
 
-static inline unsigned long get_limit(unsigned long segment)
-{
-       unsigned long __limit;
-       asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
-       return __limit + 1;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
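
A worked example of the selector encoding used throughout this file (selector = index*8 | TI | RPL), computed for the 64-bit GDT layout above:

#include <stdio.h>

int main(void)
{
        unsigned kernel_cs = 2 * 8;             /* GDT_ENTRY_KERNEL_CS, RPL 0          -> 0x10 */
        unsigned user32_cs = 4 * 8 + 3;         /* GDT_ENTRY_DEFAULT_USER32_CS, RPL 3  -> 0x23 */
        unsigned user_ds   = 5 * 8 + 3;         /* GDT_ENTRY_DEFAULT_USER_DS, RPL 3    -> 0x2b */
        unsigned user_cs   = 6 * 8 + 3;         /* GDT_ENTRY_DEFAULT_USER_CS, RPL 3    -> 0x33 */

        /* __USER_DS sits between the 32-bit and 64-bit user CS, as SYSRET requires. */
        printf("__KERNEL_CS = %#x\n", kernel_cs);
        printf("__USER32_CS = %#x\n", user32_cs);
        printf("__USER_DS   = %#x\n", user_ds);
        printf("__USER_CS   = %#x\n", user_cs);
        return 0;
}
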
 
index ff4e7b236e21d5393bd55c5f53493d44861c70c1..f69e06b283fb9ee03e8704847558aa577aecf221 100644 (file)
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { }
  */
 extern struct boot_params boot_params;
 
+static inline bool kaslr_enabled(void)
+{
+       return !!(boot_params.hdr.loadflags & KASLR_FLAG);
+}
+
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
index 9dfce4e0417d92adc623d32ff93f67109316b451..6fe6b182c9981dd891a9a5bc9a55b3e6591a6f9f 100644 (file)
@@ -57,9 +57,9 @@ struct sigcontext {
        unsigned long ip;
        unsigned long flags;
        unsigned short cs;
-       unsigned short gs;
-       unsigned short fs;
-       unsigned short __pad0;
+       unsigned short __pad2;  /* Was called gs, but was always zero. */
+       unsigned short __pad1;  /* Was called fs, but was always zero. */
+       unsigned short ss;
        unsigned long err;
        unsigned long trapno;
        unsigned long oldmask;
index 7a958164088c10a61aeed98f6353152ebad83ff5..89db46752a8f0e1c78c9403ef4b40a805b488999 100644 (file)
@@ -13,9 +13,7 @@
                         X86_EFLAGS_CF | X86_EFLAGS_RF)
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
-                      unsigned long *pax);
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
 int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                     struct pt_regs *regs, unsigned long mask);
 
index 8d3120f4e27053b3fae6d6334fe7a3a6dee33e82..ba665ebd17bb8f22a39712dbf2a8c398b9e9207f 100644 (file)
 
 #ifdef CONFIG_X86_SMAP
 
-#define ASM_CLAC                                                       \
-       661: ASM_NOP3 ;                                                 \
-       .pushsection .altinstr_replacement, "ax" ;                      \
-       662: __ASM_CLAC ;                                               \
-       .popsection ;                                                   \
-       .pushsection .altinstructions, "a" ;                            \
-       altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;       \
-       .popsection
-
-#define ASM_STAC                                                       \
-       661: ASM_NOP3 ;                                                 \
-       .pushsection .altinstr_replacement, "ax" ;                      \
-       662: __ASM_STAC ;                                               \
-       .popsection ;                                                   \
-       .pushsection .altinstructions, "a" ;                            \
-       altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;       \
-       .popsection
+#define ASM_CLAC \
+       ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+       ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
 
 #else /* CONFIG_X86_SMAP */
 
 static __always_inline void clac(void)
 {
        /* Note: a barrier is implicit in alternative() */
-       alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+       alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
 }
 
 static __always_inline void stac(void)
 {
        /* Note: a barrier is implicit in alternative() */
-       alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+       alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
 }
 
 /* These macros can be used in asm() statements */
 #define ASM_CLAC \
-       ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+       ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
 #define ASM_STAC \
-       ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+       ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
 
 #else /* CONFIG_X86_SMAP */
 
index 8cd1cc3bc8356ffef29e349f08b3de43aeb79506..81d02fc7dafa521e665adf7dce8f657d0036d391 100644 (file)
@@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
+void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
 void native_cpu_die(unsigned int cpu);
index 6a4b00fafb003cbcf4e2bdc62a244ee71dc75a45..aeb4666e0c0a770a7fbb8432b7d133b2dd9e764d 100644 (file)
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include <asm/nops.h>
+
 static inline void native_clts(void)
 {
        asm volatile("clts");
@@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p)
                       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void clwb(volatile void *__p)
+{
+       volatile struct { char x[64]; } *p = __p;
+
+       asm volatile(ALTERNATIVE_2(
+               ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+               ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
+               X86_FEATURE_CLFLUSHOPT,
+               ".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
+               X86_FEATURE_CLWB)
+               : [p] "+m" (*p)
+               : [pax] "a" (p));
+}
+
+static inline void pcommit_sfence(void)
+{
+       alternative(ASM_NOP7,
+                   ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+                   "sfence",
+                   X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile ("nop")
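
A hypothetical kernel-context sketch of how the new clwb()/pcommit_sfence() helpers might be used to write back a buffer cache line by cache line; the function, its caller and the 64-byte line size are assumptions, not part of this diff:

#include <asm/special_insns.h>

static void example_writeback_range(void *buf, unsigned long len)
{
        unsigned long p   = (unsigned long)buf & ~63UL; /* assume 64-byte lines */
        unsigned long end = (unsigned long)buf + len;

        for (; p < end; p += 64)
                clwb((void *)p);        /* CLWB, with CLFLUSH/CLFLUSHOPT fallback */

        pcommit_sfence();               /* order the write-backs (SFENCE, plus PCOMMIT if supported) */
}
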
 
 
index 1d4e4f279a3281e094684ad29bd4b544d37af6ab..ea2dbe82cba3a74e06e269655cdfca8a67253e52 100644 (file)
 #include <asm/percpu.h>
 #include <asm/types.h>
 
+/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack.  We do it because of a nasty
+ * 32-bit corner case.  On x86_32, the hardware stack frame is
+ * variable-length.  Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0.  On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault.  (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
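To make the effect of the padding concrete, a hedged sketch (not code from this patch) of how a task's sp0 relates to the true top of its stack page; task_stack_page() and THREAD_SIZE are existing kernel symbols, the helper and macro names are invented:

/* Illustrative only. */
static inline unsigned long top_of_task_stack_sketch(struct task_struct *task)
{
        return (unsigned long)task_stack_page(task) + THREAD_SIZE;
}

/*
 * sp0 sits TOP_OF_KERNEL_STACK_PADDING bytes below that, so reads of a
 * maximum-length pt_regs ending at sp0 cannot run off the top of the stack.
 */
#define TASK_SP0_SKETCH(task) \
        (top_of_task_stack_sketch(task) - TOP_OF_KERNEL_STACK_PADDING)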
+
 /*
  * low level task data that entry.S needs immediate access to
  * - this struct should fit entirely inside of one cache line
@@ -145,7 +172,6 @@ struct thread_info {
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
 
 #define STACK_WARN             (THREAD_SIZE/8)
-#define KERNEL_STACK_OFFSET    (5*(BITS_PER_LONG/8))
 
 /*
  * macros/functions for gaining access to the thread information structure
@@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
 
 static inline struct thread_info *current_thread_info(void)
 {
-       struct thread_info *ti;
-       ti = (void *)(this_cpu_read_stable(kernel_stack) +
-                     KERNEL_STACK_OFFSET - THREAD_SIZE);
-       return ti;
+       return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
 }
 
 static inline unsigned long current_stack_pointer(void)
@@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void)
 
 #else /* !__ASSEMBLY__ */
 
-/* how to get the thread information struct from ASM */
+/* Load thread_info address into "reg" */
 #define GET_THREAD_INFO(reg) \
        _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
-       _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ;
+       _ASM_SUB $(THREAD_SIZE),reg ;
 
 /*
- * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
- * a certain register (to be used in assembler memory operands).
+ * ASM operand which evaluates to a 'thread_info' address of
+ * the current task, if it is known that "reg" is exactly "off"
+ * bytes below the top of the stack currently.
+ *
+ * ( The kernel stack's size is known at build time, it is usually
+ *   2 or 4 pages, and the bottom of the kernel stack contains
+ *   the thread_info structure. So to access the thread_info very
+ *   quickly from assembly code we can calculate down from the
+ *   top of the kernel stack to the bottom, using constant,
+ *   build-time calculations only. )
+ *
+ * For example, to fetch the current thread_info->flags value into %eax
+ * on x86-64 defconfig kernels, in syscall entry code where RSP is
+ * currently at exactly SIZEOF_PTREGS bytes away from the top of the
+ * stack:
+ *
+ *      mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
+ *
+ * will translate to:
+ *
+ *      8b 84 24 b8 c0 ff ff      mov    -0x3f48(%rsp), %eax
+ *
+ * which is below the current RSP by almost 16K.
  */
-#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
 
 #endif
 
@@ -236,6 +280,16 @@ static inline bool is_ia32_task(void)
 #endif
        return false;
 }
+
+/*
+ * Force syscall return via IRET by making it look as if there was
+ * some work pending. IRET is our most capable (but slowest) syscall
+ * return path, which is able to restore modified SS, CS and certain
+ * EFLAGS values that other (fast) syscall return instructions
+ * are not able to restore properly.
+ */
+#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
+
 #endif /* !__ASSEMBLY__ */
 
 #ifndef __ASSEMBLY__
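As a usage illustration (hedged, not part of this patch): paths that rewrite the saved segment state in pt_regs, sigreturn being the classic case, follow the modification with force_iret() so the exit goes through the slow but fully capable IRET path rather than SYSRET/SYSEXIT. The helper below is invented for illustration:

/* Illustrative only. */
static void set_return_segments_sketch(struct pt_regs *regs,
                                       unsigned long new_cs,
                                       unsigned long new_ss)
{
        regs->cs = new_cs;
        regs->ss = new_ss;

        /* Make sure the modified selectors are really reloaded on return. */
        force_iret();
}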
index 12a26b979bf163008ebd1dc5b7cdff3ca5876f0b..f2f9b39b274ab0c2f81ab6b388f78ba5c881ec5e 100644 (file)
@@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 }
 
 unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+copy_user_handle_tail(char *to, char *from, unsigned len);
 
 #endif /* _ASM_X86_UACCESS_64_H */
index 225b0988043a0a78ac9092a9af7a265122c685cd..ab456dc233b51482c53c0232a97ed71cbc50a990 100644 (file)
@@ -15,6 +15,7 @@
 
 /* loadflags */
 #define LOADED_HIGH    (1<<0)
+#define KASLR_FLAG     (1<<1)
 #define QUIET_FLAG     (1<<5)
 #define KEEP_SEGMENTS  (1<<6)
 #define CAN_USE_HEAP   (1<<7)
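A hedged sketch of how the new flag might be consumed (the helper is invented; boot_params.hdr.loadflags is the existing field the flag lives in):

/* Illustrative only: did the boot stub actually randomize the kernel base? */
static inline bool kaslr_was_applied_sketch(const struct boot_params *bp)
{
        return !!(bp->hdr.loadflags & KASLR_FLAG);
}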
index 7b0a55a8885115386f40a4207838e60ad66abc21..580aee3072e0684082b0f74e1358925efaf4f540 100644 (file)
 #else /* __i386__ */
 
 #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 #define R15 0
 #define R14 8
 #define R13 16
 #define R12 24
 #define RBP 32
 #define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 #define R11 48
 #define R10 56
 #define R9 64
 #define RDX 96
 #define RSI 104
 #define RDI 112
-#define ORIG_RAX 120       /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
 #define RIP 128
 #define CS 136
 #define EFLAGS 144
 #define RSP 152
 #define SS 160
-#define ARGOFFSET R11
 #endif /* __ASSEMBLY__ */
 
 /* top of stack page */
index ac4b9aa4d9996b413eaf71ada8bb6b458e7a83a0..bc16115af39b9e66c048b5e0237180f505535b77 100644 (file)
@@ -41,13 +41,17 @@ struct pt_regs {
 #ifndef __KERNEL__
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long rbp;
        unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
        unsigned long rdx;
        unsigned long rsi;
        unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
        unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
        unsigned long rip;
        unsigned long cs;
        unsigned long eflags;
index d8b9f9081e86fb486c88c34b8359bdfec28dd5fe..16dc4e8a2cd34845042445915f9e9d74d90546c6 100644 (file)
@@ -177,9 +177,24 @@ struct sigcontext {
        __u64 rip;
        __u64 eflags;           /* RFLAGS */
        __u16 cs;
-       __u16 gs;
-       __u16 fs;
-       __u16 __pad0;
+
+       /*
+        * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+        * Linux saved and restored fs and gs in these slots.  This
+        * was counterproductive, as fsbase and gsbase were never
+        * saved, so arch_prctl was presumably unreliable.
+        *
+        * If these slots are ever needed for any other purpose, there
+        * is some risk that very old 64-bit binaries could get
+        * confused.  I doubt that many such binaries still work,
+        * though, since the same patch in 2.5.64 also removed the
+        * 64-bit set_thread_area syscall, so it appears that there is
+        * no TLS API that works in both pre- and post-2.5.64 kernels.
+        */
+       __u16 __pad2;           /* Was gs. */
+       __u16 __pad1;           /* Was fs. */
+
+       __u16 ss;
        __u64 err;
        __u64 trapno;
        __u64 oldmask;
index c5f1a1deb91a904e21d88c2762836dd98500e600..1fe92181ee9ef8b3cb061a6091a732b2b9fa7bf0 100644 (file)
@@ -67,6 +67,7 @@
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_INVEPT              50
+#define EXIT_REASON_RDTSCP              51
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_INVVPID             53
 #define EXIT_REASON_WBINVD              54
index cdb1b70ddad0f026cbe42800571d0869211bf6f5..c887cd944f0c18e849fda278ec50dbee6b731919 100644 (file)
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32)  += i386_ksyms_32.o
 obj-$(CONFIG_X86_64)   += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)   += mcount_64.o
 obj-y                  += syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_IA32_EMULATION)   += syscall_32.o
 obj-$(CONFIG_X86_VSYSCALL_EMULATION)   += vsyscall_64.o vsyscall_emu_64.o
 obj-$(CONFIG_X86_ESPFIX64)     += espfix_64.o
 obj-$(CONFIG_SYSFS)    += ksysfs.o
index 703130f469ecf71978b9d67bf0fb50da9b31cbc9..aef65319316065eab845f35141682c3550f18a22 100644 (file)
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, ...)                              \
-do {                                                   \
-       if (debug_alternative)                          \
-               printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
+#define DPRINTK(fmt, args...)                                          \
+do {                                                                   \
+       if (debug_alternative)                                          \
+               printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);   \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...)                             \
+do {                                                                   \
+       if (unlikely(debug_alternative)) {                              \
+               int j;                                                  \
+                                                                       \
+               if (!(len))                                             \
+                       break;                                          \
+                                                                       \
+               printk(KERN_DEBUG fmt, ##args);                         \
+               for (j = 0; j < (len) - 1; j++)                         \
+                       printk(KERN_CONT "%02hhx ", buf[j]);            \
+               printk(KERN_CONT "%02hhx\n", buf[j]);                   \
+       }                                                               \
 } while (0)
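For a feel of what DUMP_BYTES produces (the address and bytes below are made up, purely for illustration): with debug_alternative set, dumping a 5-byte buffer prints one KERN_DEBUG line followed by the bytes in hex.

/* Hypothetical call site and output: */
DUMP_BYTES(insnbuf, 5, "%p: final_insn: ", instr);
/*
 * -> ffffffff81020304: final_insn: e9 bb c0 ff ff
 */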
 
 /*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
-/* Replace instructions with better alternatives for this CPU type.
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that asymmetric systems where
-   APs have less capabilities than the boot processor are not handled.
-   Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1- or 4-byte displacement?
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+       return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+       u8 *next_rip, *tgt_rip;
+       s32 n_dspl, o_dspl;
+       int repl_len;
+
+       if (a->replacementlen != 5)
+               return;
+
+       o_dspl = *(s32 *)(insnbuf + 1);
+
+       /* next_rip of the replacement JMP */
+       next_rip = repl_insn + a->replacementlen;
+       /* target rip of the replacement JMP */
+       tgt_rip  = next_rip + o_dspl;
+       n_dspl = tgt_rip - orig_insn;
+
+       DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+       if (tgt_rip - orig_insn >= 0) {
+               if (n_dspl - 2 <= 127)
+                       goto two_byte_jmp;
+               else
+                       goto five_byte_jmp;
+       /* negative offset */
+       } else {
+               if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+                       goto two_byte_jmp;
+               else
+                       goto five_byte_jmp;
+       }
+
+two_byte_jmp:
+       n_dspl -= 2;
+
+       insnbuf[0] = 0xeb;
+       insnbuf[1] = (s8)n_dspl;
+       add_nops(insnbuf + 2, 3);
+
+       repl_len = 2;
+       goto done;
+
+five_byte_jmp:
+       n_dspl -= 5;
+
+       insnbuf[0] = 0xe9;
+       *(s32 *)&insnbuf[1] = n_dspl;
+
+       repl_len = 5;
+
+done:
+
+       DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+               n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
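To make the displacement arithmetic above easier to follow, a worked example with invented addresses:

/*
 * Suppose the 5-byte replacement JMP lives at repl_insn = 0x1000 and
 * carries o_dspl = 0x3b, while the original site is orig_insn = 0x2000:
 *
 *   next_rip = 0x1000 + 5      = 0x1005
 *   tgt_rip  = 0x1005 + 0x3b   = 0x1040
 *   n_dspl   = 0x1040 - 0x2000 = -0xfc0   (too large for a signed byte)
 *
 * so the 5-byte form is kept: 0xe9 with rel32 = -0xfc0 - 5, and the
 * patched site jumps from 0x2005 to 0x1040 as intended.  A target within
 * roughly +/-127 bytes of the original site would instead get the 2-byte
 * 0xeb form, padded with three NOPs.
 */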
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+       if (instr[0] != 0x90)
+               return;
+
+       add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+       DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+                  instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
 void __init_or_module apply_alternatives(struct alt_instr *start,
                                         struct alt_instr *end)
 {
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
        u8 *instr, *replacement;
        u8 insnbuf[MAX_PATCH_LEN];
 
-       DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+       DPRINTK("alt table %p -> %p", start, end);
        /*
         * The scan order should be from start to end. A later scanned
-        * alternative code can overwrite a previous scanned alternative code.
+        * alternative code can overwrite previously scanned alternative code.
         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
         * patch code.
         *
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
         * order.
         */
        for (a = start; a < end; a++) {
+               int insnbuf_sz = 0;
+
                instr = (u8 *)&a->instr_offset + a->instr_offset;
                replacement = (u8 *)&a->repl_offset + a->repl_offset;
-               BUG_ON(a->replacementlen > a->instrlen);
                BUG_ON(a->instrlen > sizeof(insnbuf));
                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
-               if (!boot_cpu_has(a->cpuid))
+               if (!boot_cpu_has(a->cpuid)) {
+                       if (a->padlen > 1)
+                               optimize_nops(a, instr);
+
                        continue;
+               }
+
+               DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
+                       a->cpuid >> 5,
+                       a->cpuid & 0x1f,
+                       instr, a->instrlen,
+                       replacement, a->replacementlen, a->padlen);
+
+               DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+               DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
 
                memcpy(insnbuf, replacement, a->replacementlen);
+               insnbuf_sz = a->replacementlen;
 
                /* 0xe8 is a relative jump; fix the offset. */
-               if (*insnbuf == 0xe8 && a->replacementlen == 5)
-                   *(s32 *)(insnbuf + 1) += replacement - instr;
+               if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+                       *(s32 *)(insnbuf + 1) += replacement - instr;
+                       DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+                               *(s32 *)(insnbuf + 1),
+                               (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+               }
+
+               if (a->replacementlen && is_jmp(replacement[0]))
+                       recompute_jump(a, instr, replacement, insnbuf);
 
-               add_nops(insnbuf + a->replacementlen,
-                        a->instrlen - a->replacementlen);
+               if (a->instrlen > a->replacementlen) {
+                       add_nops(insnbuf + a->replacementlen,
+                                a->instrlen - a->replacementlen);
+                       insnbuf_sz += a->instrlen - a->replacementlen;
+               }
+               DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
 
-               text_poke_early(instr, insnbuf, a->instrlen);
+               text_poke_early(instr, insnbuf, insnbuf_sz);
        }
 }
 
 #ifdef CONFIG_SMP
-
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
                                  u8 *text, u8 *text_end)
 {
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
        smp->locks_end  = locks_end;
        smp->text       = text;
        smp->text_end   = text_end;
-       DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
-               __func__, smp->locks, smp->locks_end,
+       DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+               smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);
 
        list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
 
        return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
        if (likely(!bp_patching_in_progress))
                return 0;
 
-       if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+       if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
                return 0;
 
        /* set up the specified breakpoint handler */
index ad3639ae1b9b50a71ff224faaf96e05302eccd9f..dcb52850a28fcbe00a4a25ddf47d6f4ccedf3c9c 100644 (file)
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void)
        local_irq_restore(flags);
 }
 
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
-       unsigned int reg0, reg1;
-
-       /*
-        * The version register is read-only in a real APIC.
-        */
-       reg0 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
-       reg1 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
-       /*
-        * The two version reads above should print the same
-        * numbers.  If the second one is different, then we
-        * poke at a non-APIC.
-        */
-       if (reg1 != reg0)
-               return 0;
-
-       /*
-        * Check if the version looks reasonably.
-        */
-       reg1 = GET_APIC_VERSION(reg0);
-       if (reg1 == 0x00 || reg1 == 0xff)
-               return 0;
-       reg1 = lapic_get_maxlvt();
-       if (reg1 < 0x02 || reg1 == 0xff)
-               return 0;
-
-       /*
-        * The ID register is read/write in a real APIC.
-        */
-       reg0 = apic_read(APIC_ID);
-       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-       apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
-       reg1 = apic_read(APIC_ID);
-       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
-       apic_write(APIC_ID, reg0);
-       if (reg1 != (reg0 ^ apic->apic_id_mask))
-               return 0;
-
-       /*
-        * The next two are just to see if we have sane values.
-        * They're only really relevant if we're in Virtual Wire
-        * compatibility mode, but most boxes are anymore.
-        */
-       reg0 = apic_read(APIC_LVT0);
-       apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
-       reg1 = apic_read(APIC_LVT1);
-       apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
-       return 1;
-}
-
 /**
  * sync_Arb_IDs - synchronize APIC bus arbitration IDs
  */
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void)
                disable_ioapic_support();
 
        default_setup_apic_routing();
-       verify_local_APIC();
        apic_bsp_setup(true);
        return 0;
 }
index e658f21681c82e1ad8fa28bde084c7933939548b..d9d0bd2faaf42cf4b1e894628edb4139ee615780 100644 (file)
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
 
        per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
 
-       __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+       cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
        for_each_online_cpu(cpu) {
                if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
                        continue;
-               __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
-               __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+               cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+               cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
        }
 }
 
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
 
        BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
 
-       __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+       cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
        register_hotcpu_notifier(&x2apic_cpu_notifier);
        return 1;
 }
index 8e9dcfd630e4b539e7936050b7bb1673660ae752..c8d92950bc041bcebcb6a2361ebeac0ce3065279 100644 (file)
@@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void)
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-       int pnodeid, is_uv1, is_uv2, is_uv3;
-
-       is_uv1 = !strcmp(oem_id, "SGI");
-       is_uv2 = !strcmp(oem_id, "SGI2");
-       is_uv3 = !strncmp(oem_id, "SGI3", 4);   /* there are varieties of UV3 */
-       if (is_uv1 || is_uv2 || is_uv3) {
-               uv_hub_info->hub_revision =
-                       (is_uv1 ? UV1_HUB_REVISION_BASE :
-                       (is_uv2 ? UV2_HUB_REVISION_BASE :
-                                 UV3_HUB_REVISION_BASE));
-               pnodeid = early_get_pnodeid();
-               early_get_apic_pnode_shift();
-               x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
-               x86_platform.nmi_init = uv_nmi_init;
-               if (!strcmp(oem_table_id, "UVL"))
-                       uv_system_type = UV_LEGACY_APIC;
-               else if (!strcmp(oem_table_id, "UVX"))
-                       uv_system_type = UV_X2APIC;
-               else if (!strcmp(oem_table_id, "UVH")) {
-                       __this_cpu_write(x2apic_extra_bits,
-                               pnodeid << uvh_apicid.s.pnode_shift);
-                       uv_system_type = UV_NON_UNIQUE_APIC;
-                       uv_set_apicid_hibit();
-                       return 1;
-               }
+       int pnodeid;
+       int uv_apic;
+
+       if (strncmp(oem_id, "SGI", 3) != 0)
+               return 0;
+
+       /*
+        * Determine UV arch type.
+        *   SGI: UV100/1000
+        *   SGI2: UV2000/3000
+        *   SGI3: UV300 (truncated to 4 chars because of different varieties)
+        */
+       uv_hub_info->hub_revision =
+               !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+               !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
+               !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+
+       if (uv_hub_info->hub_revision == 0)
+               goto badbios;
+
+       pnodeid = early_get_pnodeid();
+       early_get_apic_pnode_shift();
+       x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
+       x86_platform.nmi_init = uv_nmi_init;
+
+       if (!strcmp(oem_table_id, "UVX")) {             /* most common */
+               uv_system_type = UV_X2APIC;
+               uv_apic = 0;
+
+       } else if (!strcmp(oem_table_id, "UVH")) {      /* only UV1 systems */
+               uv_system_type = UV_NON_UNIQUE_APIC;
+               __this_cpu_write(x2apic_extra_bits,
+                       pnodeid << uvh_apicid.s.pnode_shift);
+               uv_set_apicid_hibit();
+               uv_apic = 1;
+
+       } else  if (!strcmp(oem_table_id, "UVL")) {     /* only used for */
+               uv_system_type = UV_LEGACY_APIC;        /* very small systems */
+               uv_apic = 0;
+
+       } else {
+               goto badbios;
        }
-       return 0;
+
+       pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
+               oem_id, oem_table_id, uv_system_type,
+               uv_min_hub_revision_id, uv_apic);
+
+       return uv_apic;
+
+badbios:
+       pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
+       pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+       BUG();
 }
 
 enum uv_system_type get_uv_system_type(void)
@@ -854,10 +881,14 @@ void __init uv_system_init(void)
        unsigned long mmr_base, present, paddr;
        unsigned short pnode_mask;
        unsigned char n_lshift;
-       char *hub = (is_uv1_hub() ? "UV1" :
-                   (is_uv2_hub() ? "UV2" :
-                                   "UV3"));
+       char *hub = (is_uv1_hub() ? "UV100/1000" :
+                   (is_uv2_hub() ? "UV2000/3000" :
+                   (is_uv3_hub() ? "UV300" : NULL)));
 
+       if (!hub) {
+               pr_err("UV: Unknown/unsupported UV hub\n");
+               return;
+       }
        pr_info("UV: Found %s hub\n", hub);
        map_low_mmrs();
 
index 3b3b9d33ac1d2f0809db3b16230c840e98774b70..47703aed74cfb7d013b2d577488d68c7280bf8c6 100644 (file)
@@ -68,7 +68,7 @@ void foo(void)
 
        /* Offset from the sysenter stack to tss.sp0 */
        DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-                sizeof(struct tss_struct));
+              offsetofend(struct tss_struct, SYSENTER_stack));
 
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
        BLANK();
index fdcbb4d27c9f80f35a587f4284fa2b958c44e774..5ce6f2da87639c7d373879e43120c9035ba590b9 100644 (file)
@@ -81,6 +81,7 @@ int main(void)
 #undef ENTRY
 
        OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+       OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
        BLANK();
 
        DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
index a220239cea65ca99b3c52ebbfaed6ce973f0dec1..fd470ebf924e574e827f1942b973075d3168a9b6 100644 (file)
@@ -5,6 +5,7 @@
 
 #include <linux/io.h>
 #include <linux/sched.h>
+#include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
                va_align.mask     = (upperbit - 1) & PAGE_MASK;
                va_align.flags    = ALIGN_VA_32 | ALIGN_VA_64;
+
+               /* A random value per boot for bit slice [12:upper_bit) */
+               va_align.bits = get_random_int() & va_align.mask;
        }
 }
 
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
                set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
 
        rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+       /* 3DNow or LM implies PREFETCHW */
+       if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+               if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+                       set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
 }
 
 #ifdef CONFIG_X86_32
index 2346c95c6ab1945077fdde28c129342ffe09749d..3f70538012e2d6ac0949f45983565a20a6ca87a3 100644 (file)
@@ -959,38 +959,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #endif
 }
 
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
-       /* Load these always in case some future AMD CPU supports
-          SYSENTER from compat mode too. */
-       wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-       wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
-       wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif         /* CONFIG_IA32_EMULATION */
-#endif         /* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
 #ifdef CONFIG_X86_32
 void enable_sep_cpu(void)
 {
-       int cpu = get_cpu();
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+       struct tss_struct *tss;
+       int cpu;
 
-       if (!boot_cpu_has(X86_FEATURE_SEP)) {
-               put_cpu();
-               return;
-       }
+       cpu = get_cpu();
+       tss = &per_cpu(cpu_tss, cpu);
+
+       if (!boot_cpu_has(X86_FEATURE_SEP))
+               goto out;
+
+       /*
+        * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+        * see the big comment in struct x86_hw_tss's definition.
+        */
 
        tss->x86_tss.ss1 = __KERNEL_CS;
-       tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
-       wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-       wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
-       wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+       wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+       wrmsr(MSR_IA32_SYSENTER_ESP,
+             (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+             0);
+
+       wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
        put_cpu();
 }
 #endif
@@ -1118,7 +1117,7 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 DEFINE_PER_CPU(unsigned long, kernel_stack) =
-       (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+       (unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 #ifdef CONFIG_X86_64
@@ -1130,8 +1129,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE) __visible;
 
 /*
- * The following four percpu variables are hot.  Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot.  Align current_task to
+ * cacheline size such that they fall in the same cacheline.
  */
 DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
        &init_task;
@@ -1171,10 +1170,23 @@ void syscall_init(void)
         */
        wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
        wrmsrl(MSR_LSTAR, system_call);
-       wrmsrl(MSR_CSTAR, ignore_sysret);
 
 #ifdef CONFIG_IA32_EMULATION
-       syscall32_cpu_init();
+       wrmsrl(MSR_CSTAR, ia32_cstar_target);
+       /*
+        * This only works on Intel CPUs.
+        * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+        * This does not cause SYSENTER to jump to the wrong location, because
+        * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+        */
+       wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+       wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+       wrmsrl(MSR_CSTAR, ignore_sysret);
+       wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+       wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
 #endif
 
        /* Flags to clear on syscall */
@@ -1226,6 +1238,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack.  Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+       (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
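A hedged sketch of the accessor this per-CPU variable is meant to back (the real current_top_of_stack() lives in a header and may differ in detail):

/* Illustrative only. */
static __always_inline unsigned long current_top_of_stack_sketch(void)
{
        return this_cpu_read_stable(cpu_current_top_of_stack);
}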
@@ -1307,7 +1328,7 @@ void cpu_init(void)
         */
        load_ucode_ap();
 
-       t = &per_cpu(init_tss, cpu);
+       t = &per_cpu(cpu_tss, cpu);
        oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1391,7 +1412,7 @@ void cpu_init(void)
 {
        int cpu = smp_processor_id();
        struct task_struct *curr = current;
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
+       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
        struct thread_struct *thread = &curr->thread;
 
        wait_for_master_cpu(cpu);
index 659643376dbf7d9d04b377e85ef04bf1e6bf1ada..edcb0e28c336d085d0ee1011793c98ba6f5a13ae 100644 (file)
@@ -7,16 +7,14 @@
  *     Andi Kleen / Andreas Herrmann   : CPUID4 emulation on AMD.
  */
 
-#include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
+#include <linux/cacheinfo.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sysfs.h>
 #include <linux/pci.h>
 
 #include <asm/processor.h>
-#include <linux/smp.h>
 #include <asm/amd_nb.h>
 #include <asm/smp.h>
 
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
 
 
 enum _cache_type {
-       CACHE_TYPE_NULL = 0,
-       CACHE_TYPE_DATA = 1,
-       CACHE_TYPE_INST = 2,
-       CACHE_TYPE_UNIFIED = 3
+       CTYPE_NULL = 0,
+       CTYPE_DATA = 1,
+       CTYPE_INST = 2,
+       CTYPE_UNIFIED = 3
 };
 
 union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
        struct amd_northbridge *nb;
 };
 
-struct _cpuid4_info {
-       struct _cpuid4_info_regs base;
-       DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
 unsigned short                 num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
 static const unsigned char levels[] = { 1, 1, 2, 3 };
 static const unsigned char types[] = { 1, 2, 3, 3 };
 
+static const enum cache_type cache_type_map[] = {
+       [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+       [CTYPE_DATA] = CACHE_TYPE_DATA,
+       [CTYPE_INST] = CACHE_TYPE_INST,
+       [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
 static void
 amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
                     union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
                (ebx->split.ways_of_associativity + 1) - 1;
 }
 
-struct _cache_attr {
-       struct attribute attr;
-       ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
-       ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
-                        unsigned int);
-};
-
 #if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
 /*
  * L3 cache descriptors
  */
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
        l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
-       int node;
-
-       /* only for L3, and not in virtualized environments */
-       if (index < 3)
-               return;
-
-       node = amd_get_nb_id(smp_processor_id());
-       this_leaf->nb = node_to_amd_nb(node);
-       if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
-               amd_calc_l3_indices(this_leaf->nb);
-}
-
 /*
  * check whether a slot used for disabling an L3 index is occupied.
  * @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
        return -1;
 }
 
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
                                  unsigned int slot)
 {
        int index;
+       struct amd_northbridge *nb = this_leaf->priv;
 
-       if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
-               return -EINVAL;
-
-       index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
+       index = amd_get_l3_disable_slot(nb, slot);
        if (index >= 0)
                return sprintf(buf, "%d\n", index);
 
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 
 #define SHOW_CACHE_DISABLE(slot)                                       \
 static ssize_t                                                         \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf,   \
-                         unsigned int cpu)                             \
+cache_disable_##slot##_show(struct device *dev,                                \
+                           struct device_attribute *attr, char *buf)   \
 {                                                                      \
+       struct cacheinfo *this_leaf = dev_get_drvdata(dev);             \
        return show_cache_disable(this_leaf, buf, slot);                \
 }
 SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
        return 0;
 }
 
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-                                 const char *buf, size_t count,
-                                 unsigned int slot)
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+                                  const char *buf, size_t count,
+                                  unsigned int slot)
 {
        unsigned long val = 0;
        int cpu, err = 0;
+       struct amd_northbridge *nb = this_leaf->priv;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
-               return -EINVAL;
-
-       cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+       cpu = cpumask_first(&this_leaf->shared_cpu_map);
 
        if (kstrtoul(buf, 10, &val) < 0)
                return -EINVAL;
 
-       err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
+       err = amd_set_l3_disable_slot(nb, cpu, slot, val);
        if (err) {
                if (err == -EEXIST)
                        pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 
 #define STORE_CACHE_DISABLE(slot)                                      \
 static ssize_t                                                         \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf,             \
-                          const char *buf, size_t count,               \
-                          unsigned int cpu)                            \
+cache_disable_##slot##_store(struct device *dev,                       \
+                            struct device_attribute *attr,             \
+                            const char *buf, size_t count)             \
 {                                                                      \
+       struct cacheinfo *this_leaf = dev_get_drvdata(dev);             \
        return store_cache_disable(this_leaf, buf, count, slot);        \
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
 
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
-               show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
-               show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
+static ssize_t subcaches_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
 {
-       if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-               return -EINVAL;
+       struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+       int cpu = cpumask_first(&this_leaf->shared_cpu_map);
 
        return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
 }
 
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
-               unsigned int cpu)
+static ssize_t subcaches_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t count)
 {
+       struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+       int cpu = cpumask_first(&this_leaf->shared_cpu_map);
        unsigned long val;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-               return -EINVAL;
-
        if (kstrtoul(buf, 16, &val) < 0)
                return -EINVAL;
 
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
        return count;
 }
 
-static struct _cache_attr subcaches =
-       __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+                              struct attribute *attr, int unused)
+{
+       struct device *dev = kobj_to_dev(kobj);
+       struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+       umode_t mode = attr->mode;
+
+       if (!this_leaf->priv)
+               return 0;
+
+       if ((attr == &dev_attr_subcaches.attr) &&
+           amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+               return mode;
+
+       if ((attr == &dev_attr_cache_disable_0.attr ||
+            attr == &dev_attr_cache_disable_1.attr) &&
+           amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+               return mode;
+
+       return 0;
+}
+
+static struct attribute_group cache_private_group = {
+       .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+       int n = 1;
+       static struct attribute **amd_l3_attrs;
+
+       if (amd_l3_attrs) /* already initialized */
+               return;
+
+       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+               n += 2;
+       if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+               n += 1;
+
+       amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+       if (!amd_l3_attrs)
+               return;
+
+       n = 0;
+       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+               amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+               amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+       }
+       if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+               amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
 
+       cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+       struct amd_northbridge *nb = this_leaf->priv;
+
+       if (this_leaf->level < 3 || !nb)
+               return NULL;
+
+       if (nb && nb->l3_cache.indices)
+               init_amd_l3_attrs();
+
+       return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+       int node;
+
+       /* only for L3, and not in virtualized environments */
+       if (index < 3)
+               return;
+
+       node = amd_get_nb_id(smp_processor_id());
+       this_leaf->nb = node_to_amd_nb(node);
+       if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+               amd_calc_l3_indices(this_leaf->nb);
+}
 #else
 #define amd_init_l3_cache(x, y)
 #endif  /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
                cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
        }
 
-       if (eax.split.type == CACHE_TYPE_NULL)
+       if (eax.split.type == CTYPE_NULL)
                return -EIO; /* better error ? */
 
        this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
                /* Do cpuid(op) loop to find out num_cache_leaves */
                cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
                cache_eax.full = eax;
-       } while (cache_eax.split.type != CACHE_TYPE_NULL);
+       } while (cache_eax.split.type != CTYPE_NULL);
        return i;
 }
 
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
                        switch (this_leaf.eax.split.level) {
                        case 1:
-                               if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+                               if (this_leaf.eax.split.type == CTYPE_DATA)
                                        new_l1d = this_leaf.size/1024;
-                               else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+                               else if (this_leaf.eax.split.type == CTYPE_INST)
                                        new_l1i = this_leaf.size/1024;
                                break;
                        case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
        return l2;
 }
 
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y)  (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+                                   struct _cpuid4_info_regs *base)
 {
-       struct _cpuid4_info *this_leaf;
+       struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+       struct cacheinfo *this_leaf;
        int i, sibling;
 
        if (cpu_has_topoext) {
                unsigned int apicid, nshared, first, last;
 
-               if (!per_cpu(ici_cpuid4_info, cpu))
-                       return 0;
-
-               this_leaf = CPUID4_INFO_IDX(cpu, index);
-               nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+               this_leaf = this_cpu_ci->info_list + index;
+               nshared = base->eax.split.num_threads_sharing + 1;
                apicid = cpu_data(cpu).apicid;
                first = apicid - (apicid % nshared);
                last = first + nshared - 1;
 
                for_each_online_cpu(i) {
+                       this_cpu_ci = get_cpu_cacheinfo(i);
+                       if (!this_cpu_ci->info_list)
+                               continue;
+
                        apicid = cpu_data(i).apicid;
                        if ((apicid < first) || (apicid > last))
                                continue;
-                       if (!per_cpu(ici_cpuid4_info, i))
-                               continue;
-                       this_leaf = CPUID4_INFO_IDX(i, index);
+
+                       this_leaf = this_cpu_ci->info_list + index;
 
                        for_each_online_cpu(sibling) {
                                apicid = cpu_data(sibling).apicid;
                                if ((apicid < first) || (apicid > last))
                                        continue;
-                               set_bit(sibling, this_leaf->shared_cpu_map);
+                               cpumask_set_cpu(sibling,
+                                               &this_leaf->shared_cpu_map);
                        }
                }
        } else if (index == 3) {
                for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
-                       if (!per_cpu(ici_cpuid4_info, i))
+                       this_cpu_ci = get_cpu_cacheinfo(i);
+                       if (!this_cpu_ci->info_list)
                                continue;
-                       this_leaf = CPUID4_INFO_IDX(i, index);
+                       this_leaf = this_cpu_ci->info_list + index;
                        for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
                                if (!cpu_online(sibling))
                                        continue;
-                               set_bit(sibling, this_leaf->shared_cpu_map);
+                               cpumask_set_cpu(sibling,
+                                               &this_leaf->shared_cpu_map);
                        }
                }
        } else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
        return 1;
 }
 
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+                                struct _cpuid4_info_regs *base)
 {
-       struct _cpuid4_info *this_leaf, *sibling_leaf;
+       struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+       struct cacheinfo *this_leaf, *sibling_leaf;
        unsigned long num_threads_sharing;
        int index_msb, i;
        struct cpuinfo_x86 *c = &cpu_data(cpu);
 
        if (c->x86_vendor == X86_VENDOR_AMD) {
-               if (cache_shared_amd_cpu_map_setup(cpu, index))
+               if (__cache_amd_cpumap_setup(cpu, index, base))
                        return;
        }
 
-       this_leaf = CPUID4_INFO_IDX(cpu, index);
-       num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
+       this_leaf = this_cpu_ci->info_list + index;
+       num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
 
+       cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
        if (num_threads_sharing == 1)
-               cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
-       else {
-               index_msb = get_count_order(num_threads_sharing);
-
-               for_each_online_cpu(i) {
-                       if (cpu_data(i).apicid >> index_msb ==
-                           c->apicid >> index_msb) {
-                               cpumask_set_cpu(i,
-                                       to_cpumask(this_leaf->shared_cpu_map));
-                               if (i != cpu && per_cpu(ici_cpuid4_info, i))  {
-                                       sibling_leaf =
-                                               CPUID4_INFO_IDX(i, index);
-                                       cpumask_set_cpu(cpu, to_cpumask(
-                                               sibling_leaf->shared_cpu_map));
-                               }
-                       }
-               }
-       }
-}
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-       struct _cpuid4_info     *this_leaf, *sibling_leaf;
-       int sibling;
-
-       this_leaf = CPUID4_INFO_IDX(cpu, index);
-       for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
-               sibling_leaf = CPUID4_INFO_IDX(sibling, index);
-               cpumask_clear_cpu(cpu,
-                                 to_cpumask(sibling_leaf->shared_cpu_map));
-       }
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void free_cache_attributes(unsigned int cpu)
-{
-       int i;
-
-       for (i = 0; i < num_cache_leaves; i++)
-               cache_remove_shared_cpu_map(cpu, i);
-
-       kfree(per_cpu(ici_cpuid4_info, cpu));
-       per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void get_cpu_leaves(void *_retval)
-{
-       int j, *retval = _retval, cpu = smp_processor_id();
+               return;
 
-       /* Do cpuid and store the results */
-       for (j = 0; j < num_cache_leaves; j++) {
-               struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
+       index_msb = get_count_order(num_threads_sharing);
 
-               *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
-               if (unlikely(*retval < 0)) {
-                       int i;
+       for_each_online_cpu(i)
+               if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+                       struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
 
-                       for (i = 0; i < j; i++)
-                               cache_remove_shared_cpu_map(cpu, i);
-                       break;
+                       if (i == cpu || !sib_cpu_ci->info_list)
+                               continue;/* skip if itself or no cacheinfo */
+                       sibling_leaf = sib_cpu_ci->info_list + index;
+                       cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+                       cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
                }
-               cache_shared_cpu_map_setup(cpu, j);
-       }
 }
 
-static int detect_cache_attributes(unsigned int cpu)
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+                        struct _cpuid4_info_regs *base)
 {
-       int                     retval;
-
-       if (num_cache_leaves == 0)
-               return -ENOENT;
-
-       per_cpu(ici_cpuid4_info, cpu) = kzalloc(
-           sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
-       if (per_cpu(ici_cpuid4_info, cpu) == NULL)
-               return -ENOMEM;
-
-       smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
-       if (retval) {
-               kfree(per_cpu(ici_cpuid4_info, cpu));
-               per_cpu(ici_cpuid4_info, cpu) = NULL;
-       }
-
-       return retval;
+       this_leaf->level = base->eax.split.level;
+       this_leaf->type = cache_type_map[base->eax.split.type];
+       this_leaf->coherency_line_size =
+                               base->ebx.split.coherency_line_size + 1;
+       this_leaf->ways_of_associativity =
+                               base->ebx.split.ways_of_associativity + 1;
+       this_leaf->size = base->size;
+       this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+       this_leaf->physical_line_partition =
+                               base->ebx.split.physical_line_partition + 1;
+       this_leaf->priv = base->nb;
 }
 
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
-       struct kobject kobj;
-       unsigned int cpu;
-       unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y)                (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val)                          \
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
-                               unsigned int cpu)                       \
-{                                                                      \
-       return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
-                        unsigned int cpu)
-{
-       return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
-                                       int type, char *buf)
-{
-       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-       int ret;
-
-       if (type)
-               ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
-                               cpumask_pr_args(mask));
-       else
-               ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
-                               cpumask_pr_args(mask));
-       buf[ret++] = '\n';
-       buf[ret] = '\0';
-       return ret;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
-                                         unsigned int cpu)
+static int __init_cache_level(unsigned int cpu)
 {
-       return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
-                                          unsigned int cpu)
-{
-       return show_shared_cpu_map_func(leaf, 1, buf);
-}
+       struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
 
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
-                        unsigned int cpu)
-{
-       switch (this_leaf->base.eax.split.type) {
-       case CACHE_TYPE_DATA:
-               return sprintf(buf, "Data\n");
-       case CACHE_TYPE_INST:
-               return sprintf(buf, "Instruction\n");
-       case CACHE_TYPE_UNIFIED:
-               return sprintf(buf, "Unified\n");
-       default:
-               return sprintf(buf, "Unknown\n");
-       }
-}
-
-#define to_object(k)   container_of(k, struct _index_kobject, kobj)
-#define to_attr(a)     container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
-       __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
-       &type.attr,
-       &level.attr,
-       &coherency_line_size.attr,
-       &physical_line_partition.attr,
-       &ways_of_associativity.attr,
-       &number_of_sets.attr,
-       &size.attr,
-       &shared_cpu_map.attr,
-       &shared_cpu_list.attr,
-       NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute **amd_l3_attrs(void)
-{
-       static struct attribute **attrs;
-       int n;
-
-       if (attrs)
-               return attrs;
-
-       n = ARRAY_SIZE(default_attrs);
-
-       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
-               n += 2;
-
-       if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-               n += 1;
-
-       attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
-       if (attrs == NULL)
-               return attrs = default_attrs;
-
-       for (n = 0; default_attrs[n]; n++)
-               attrs[n] = default_attrs[n];
-
-       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
-               attrs[n++] = &cache_disable_0.attr;
-               attrs[n++] = &cache_disable_1.attr;
-       }
-
-       if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-               attrs[n++] = &subcaches.attr;
-
-       return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
-       struct _cache_attr *fattr = to_attr(attr);
-       struct _index_kobject *this_leaf = to_object(kobj);
-       ssize_t ret;
-
-       ret = fattr->show ?
-               fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
-                       buf, this_leaf->cpu) :
-               0;
-       return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
-                    const char *buf, size_t count)
-{
-       struct _cache_attr *fattr = to_attr(attr);
-       struct _index_kobject *this_leaf = to_object(kobj);
-       ssize_t ret;
-
-       ret = fattr->store ?
-               fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
-                       buf, count, this_leaf->cpu) :
-               0;
-       return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
-       .show   = show,
-       .store  = store,
-};
-
-static struct kobj_type ktype_cache = {
-       .sysfs_ops      = &sysfs_ops,
-       .default_attrs  = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
-       .sysfs_ops      = &sysfs_ops,
-};
-
-static void cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
-       kfree(per_cpu(ici_cache_kobject, cpu));
-       kfree(per_cpu(ici_index_kobject, cpu));
-       per_cpu(ici_cache_kobject, cpu) = NULL;
-       per_cpu(ici_index_kobject, cpu) = NULL;
-       free_cache_attributes(cpu);
-}
-
-static int cpuid4_cache_sysfs_init(unsigned int cpu)
-{
-       int err;
-
-       if (num_cache_leaves == 0)
+       if (!num_cache_leaves)
                return -ENOENT;
-
-       err = detect_cache_attributes(cpu);
-       if (err)
-               return err;
-
-       /* Allocate all required memory */
-       per_cpu(ici_cache_kobject, cpu) =
-               kzalloc(sizeof(struct kobject), GFP_KERNEL);
-       if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
-               goto err_out;
-
-       per_cpu(ici_index_kobject, cpu) = kzalloc(
-           sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
-       if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
-               goto err_out;
-
+       if (!this_cpu_ci)
+               return -EINVAL;
+       this_cpu_ci->num_levels = 3;
+       this_cpu_ci->num_leaves = num_cache_leaves;
        return 0;
-
-err_out:
-       cpuid4_cache_sysfs_exit(cpu);
-       return -ENOMEM;
 }
 
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int cache_add_dev(struct device *dev)
+static int __populate_cache_leaves(unsigned int cpu)
 {
-       unsigned int cpu = dev->id;
-       unsigned long i, j;
-       struct _index_kobject *this_object;
-       struct _cpuid4_info   *this_leaf;
-       int retval;
-
-       retval = cpuid4_cache_sysfs_init(cpu);
-       if (unlikely(retval < 0))
-               return retval;
-
-       retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
-                                     &ktype_percpu_entry,
-                                     &dev->kobj, "%s", "cache");
-       if (retval < 0) {
-               cpuid4_cache_sysfs_exit(cpu);
-               return retval;
-       }
+       unsigned int idx, ret;
+       struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+       struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+       struct _cpuid4_info_regs id4_regs = {};
 
-       for (i = 0; i < num_cache_leaves; i++) {
-               this_object = INDEX_KOBJECT_PTR(cpu, i);
-               this_object->cpu = cpu;
-               this_object->index = i;
-
-               this_leaf = CPUID4_INFO_IDX(cpu, i);
-
-               ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
-               if (this_leaf->base.nb)
-                       ktype_cache.default_attrs = amd_l3_attrs();
-#endif
-               retval = kobject_init_and_add(&(this_object->kobj),
-                                             &ktype_cache,
-                                             per_cpu(ici_cache_kobject, cpu),
-                                             "index%1lu", i);
-               if (unlikely(retval)) {
-                       for (j = 0; j < i; j++)
-                               kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
-                       kobject_put(per_cpu(ici_cache_kobject, cpu));
-                       cpuid4_cache_sysfs_exit(cpu);
-                       return retval;
-               }
-               kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+       for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+               ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+               if (ret)
+                       return ret;
+               ci_leaf_init(this_leaf++, &id4_regs);
+               __cache_cpumap_setup(cpu, idx, &id4_regs);
        }
-       cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
-       kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
        return 0;
 }
 
-static void cache_remove_dev(struct device *dev)
-{
-       unsigned int cpu = dev->id;
-       unsigned long i;
-
-       if (per_cpu(ici_cpuid4_info, cpu) == NULL)
-               return;
-       if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
-               return;
-       cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
-       for (i = 0; i < num_cache_leaves; i++)
-               kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
-       kobject_put(per_cpu(ici_cache_kobject, cpu));
-       cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
-                                 unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-       struct device *dev;
-
-       dev = get_cpu_device(cpu);
-       switch (action) {
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               cache_add_dev(dev);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               cache_remove_dev(dev);
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block cacheinfo_cpu_notifier = {
-       .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
-       int i, err = 0;
-
-       if (num_cache_leaves == 0)
-               return 0;
-
-       cpu_notifier_register_begin();
-       for_each_online_cpu(i) {
-               struct device *dev = get_cpu_device(i);
-
-               err = cache_add_dev(dev);
-               if (err)
-                       goto out;
-       }
-       __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-
-out:
-       cpu_notifier_register_done();
-       return err;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
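
The hunks above finish moving x86 over to the generic cacheinfo framework: __init_cache_level() reports the number of cache levels and leaves, and __populate_cache_leaves() fills each struct cacheinfo from the raw CPUID leaf 4 registers via ci_leaf_init(), where every EBX/ECX field is stored minus one. A minimal user-space sketch of that decoding, with made-up register values and helper names that are not kernel code:

#include <stdio.h>

int main(void)
{
	/* Hypothetical CPUID.(EAX=4,ECX=n) output for a 256K, 8-way cache. */
	unsigned int ebx = 0x01c0003f;
	unsigned int ecx = 0x000001ff;

	unsigned int line_size  = (ebx & 0xfff) + 1;          /* bits 11:0  */
	unsigned int partitions = ((ebx >> 12) & 0x3ff) + 1;  /* bits 21:12 */
	unsigned int ways       = ((ebx >> 22) & 0x3ff) + 1;  /* bits 31:22 */
	unsigned int sets       = ecx + 1;

	printf("%u-way, %uB lines, %u sets -> %uK\n", ways, line_size, sets,
	       ways * partitions * line_size * sets / 1024);
	return 0;
}
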
index 10b46906767fd4857389570322757fa89bbc8e6c..fe32074b865b1686ba9e0e2cdc3ccb5ebe99ddfe 100644 (file)
@@ -14,6 +14,7 @@ enum severity_level {
 };
 
 #define ATTR_LEN               16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
 
 /* One object for each MCE bank, shared by all CPUs */
 struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
        char                    attrname[ATTR_LEN];     /* attribute name */
 };
 
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
 extern struct mce_bank *mce_banks;
 extern mce_banks_t mce_banks_ce_disabled;
 
 #ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
 void mce_intel_hcpu_update(unsigned long cpu);
 void cmci_disable_bank(int bank);
 #else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
 static inline void mce_intel_hcpu_update(unsigned long cpu) { }
 static inline void cmci_disable_bank(int bank) { }
 #endif
index 8bb433043a7f6877beaa8b38ccb8ec9684069212..9c682c222071db1960ba848c24c76567306d0542 100644 (file)
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
        return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
 }
 
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+       enum context ctx = error_context(m);
+
+       /* Processor Context Corrupt, no need to fumble too much, die! */
+       if (m->status & MCI_STATUS_PCC)
+               return MCE_PANIC_SEVERITY;
+
+       if (m->status & MCI_STATUS_UC) {
+
+               /*
+                * On older systems where overflow_recov flag is not present, we
+                * should simply panic if an error overflow occurs. If
+                * overflow_recov flag is present and set, then software can try
+                * to at least kill process to prolong system operation.
+                */
+               if (mce_flags.overflow_recov) {
+                       /* software can try to contain */
+                       if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
+                               return MCE_PANIC_SEVERITY;
+
+                       /* kill current process */
+                       return MCE_AR_SEVERITY;
+               } else {
+                       /* at least one error was not logged */
+                       if (m->status & MCI_STATUS_OVER)
+                               return MCE_PANIC_SEVERITY;
+               }
+
+               /*
+                * For any other case, return MCE_UC_SEVERITY so that we log the
+                * error and exit #MC handler.
+                */
+               return MCE_UC_SEVERITY;
+       }
+
+       /*
+        * deferred error: poll handler catches these and adds to mce_ring so
+        * memory-failure can take recovery actions.
+        */
+       if (m->status & MCI_STATUS_DEFERRED)
+               return MCE_DEFERRED_SEVERITY;
+
+       /*
+        * corrected error: poll handler catches these and passes responsibility
+        * of decoding the error to EDAC
+        */
+       return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
 {
        enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
        enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
        }
 }
 
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+                   mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+               mce_severity = mce_severity_amd;
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void *s_start(struct seq_file *f, loff_t *pos)
 {
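
The change above splits severity grading per vendor: mce_severity is now a function pointer that defaults to mce_severity_intel and is switched to mce_severity_amd by mcheck_vendor_init_severity() when the boot CPU is an AMD part. A stripped-down user-space model of that dispatch; the graders and the vendor check below are stubs, not the kernel logic:

#include <stdio.h>

enum severity { SEV_KEEP, SEV_AR, SEV_UC, SEV_PANIC };

static int grade_intel(unsigned long long status) { (void)status; return SEV_KEEP; }
static int grade_amd(unsigned long long status)   { (void)status; return SEV_UC; }

/* Default grader, analogous to mce_severity defaulting to the Intel one. */
static int (*grade)(unsigned long long status) = grade_intel;

static void vendor_init_severity(int is_amd)
{
	if (is_amd)
		grade = grade_amd;	/* swap in the AMD grader once, at init */
}

int main(void)
{
	vendor_init_severity(1);
	printf("severity: %d\n", grade(0x8000000000000000ULL));
	return 0;
}
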
index 3c036cb4a370cad399cac636f97840b40c62bd4d..e535533d5ab89313ba51937ad8dd5740413f119e 100644 (file)
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
 
-#define SPINUNIT 100   /* 100ns */
+#define SPINUNIT               100     /* 100ns */
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
 struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
        .bootlog  = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int                     cpu_missing;
 
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
 /*
  * MCA banks polled by the period polling timer for corrected events.
  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
  * is already totally * confused. In this case it's likely it will
  * not fully execute the machine check handler either.
  */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
+       bool error_logged = false;
        struct mce m;
        int severity;
        int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
                if (!(m.status & MCI_STATUS_VAL))
                        continue;
 
-               this_cpu_write(mce_polled_error, 1);
+
                /*
                 * Uncorrected or signalled events are handled by the exception
                 * handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
-               if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+               if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+                       error_logged = true;
                        mce_log(&m);
+               }
 
                /*
                 * Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
         */
 
        sync_core();
+
+       return error_logged;
 }
 EXPORT_SYMBOL_GPL(machine_check_poll);
 
@@ -813,7 +816,7 @@ static void mce_reign(void)
         * other CPUs.
         */
        if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
-               mce_panic("Fatal Machine check", m, msg);
+               mce_panic("Fatal machine check", m, msg);
 
        /*
         * For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
         * source or one CPU is hung. Panic.
         */
        if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
-               mce_panic("Machine check from unknown source", NULL, NULL);
+               mce_panic("Fatal machine check from unknown source", NULL, NULL);
 
        /*
         * Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
  */
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
 
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
        return interval;
 }
 
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
-       mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
 
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
 {
-       unsigned long *v = this_cpu_ptr(&mce_polled_error);
+       unsigned long when = jiffies + interval;
+       unsigned long flags;
+
+       local_irq_save(flags);
 
-       return test_and_clear_bit(0, v);
+       if (timer_pending(t)) {
+               if (time_before(when, t->expires))
+                       mod_timer_pinned(t, when);
+       } else {
+               t->expires = round_jiffies(when);
+               add_timer_on(t, smp_processor_id());
+       }
+
+       local_irq_restore(flags);
 }
 
 static void mce_timer_fn(unsigned long data)
 {
        struct timer_list *t = this_cpu_ptr(&mce_timer);
+       int cpu = smp_processor_id();
        unsigned long iv;
-       int notify;
 
-       WARN_ON(smp_processor_id() != data);
+       WARN_ON(cpu != data);
+
+       iv = __this_cpu_read(mce_next_interval);
 
        if (mce_available(this_cpu_ptr(&cpu_info))) {
-               machine_check_poll(MCP_TIMESTAMP,
-                               this_cpu_ptr(&mce_poll_banks));
-               mce_intel_cmci_poll();
+               machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+               if (mce_intel_cmci_poll()) {
+                       iv = mce_adjust_timer(iv);
+                       goto done;
+               }
        }
 
        /*
-        * Alert userspace if needed.  If we logged an MCE, reduce the
-        * polling interval, otherwise increase the polling interval.
+        * Alert userspace if needed. If we logged an MCE, reduce the polling
+        * interval, otherwise increase the polling interval.
         */
-       iv = __this_cpu_read(mce_next_interval);
-       notify = mce_notify_irq();
-       notify |= cmc_error_seen();
-       if (notify) {
+       if (mce_notify_irq())
                iv = max(iv / 2, (unsigned long) HZ/100);
-       } else {
+       else
                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-               iv = mce_adjust_timer(iv);
-       }
+
+done:
        __this_cpu_write(mce_next_interval, iv);
-       /* Might have become 0 after CMCI storm subsided */
-       if (iv) {
-               t->expires = jiffies + iv;
-               add_timer_on(t, smp_processor_id());
-       }
+       __restart_timer(t, iv);
 }
 
 /*
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
 void mce_timer_kick(unsigned long interval)
 {
        struct timer_list *t = this_cpu_ptr(&mce_timer);
-       unsigned long when = jiffies + interval;
        unsigned long iv = __this_cpu_read(mce_next_interval);
 
-       if (timer_pending(t)) {
-               if (time_before(when, t->expires))
-                       mod_timer_pinned(t, when);
-       } else {
-               t->expires = round_jiffies(when);
-               add_timer_on(t, smp_processor_id());
-       }
+       __restart_timer(t, interval);
+
        if (interval < iv)
                __this_cpu_write(mce_next_interval, interval);
 }
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
                 * Various K7s with broken bank 0 around. Always disable
                 * by default.
                 */
-                if (c->x86 == 6 && cfg->banks > 0)
+               if (c->x86 == 6 && cfg->banks > 0)
                        mce_banks[0].ctl = 0;
 
-                /*
-                 * Turn off MC4_MISC thresholding banks on those models since
-                 * they're not supported there.
-                 */
-                if (c->x86 == 0x15 &&
-                    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
-                        int i;
-                        u64 val, hwcr;
-                        bool need_toggle;
-                        u32 msrs[] = {
+               /*
+                * overflow_recov is supported for F15h Models 00h-0fh
+                * even though we don't have a CPUID bit for it.
+                */
+               if (c->x86 == 0x15 && c->x86_model <= 0xf)
+                       mce_flags.overflow_recov = 1;
+
+               /*
+                * Turn off MC4_MISC thresholding banks on those models since
+                * they're not supported there.
+                */
+               if (c->x86 == 0x15 &&
+                   (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+                       int i;
+                       u64 hwcr;
+                       bool need_toggle;
+                       u32 msrs[] = {
                                0x00000413, /* MC4_MISC0 */
                                0xc0000408, /* MC4_MISC1 */
-                        };
+                       };
 
-                        rdmsrl(MSR_K7_HWCR, hwcr);
+                       rdmsrl(MSR_K7_HWCR, hwcr);
 
-                        /* McStatusWrEn has to be set */
-                        need_toggle = !(hwcr & BIT(18));
+                       /* McStatusWrEn has to be set */
+                       need_toggle = !(hwcr & BIT(18));
 
-                        if (need_toggle)
-                                wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+                       if (need_toggle)
+                               wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
 
-                        for (i = 0; i < ARRAY_SIZE(msrs); i++) {
-                                rdmsrl(msrs[i], val);
+                       /* Clear CntP bit safely */
+                       for (i = 0; i < ARRAY_SIZE(msrs); i++)
+                               msr_clear_bit(msrs[i], 62);
 
-                                /* CntP bit set? */
-                                if (val & BIT_64(62)) {
-                                       val &= ~BIT_64(62);
-                                       wrmsrl(msrs[i], val);
-                                }
-                        }
-
-                        /* restore old settings */
-                        if (need_toggle)
-                                wrmsrl(MSR_K7_HWCR, hwcr);
-                }
+                       /* restore old settings */
+                       if (need_toggle)
+                               wrmsrl(MSR_K7_HWCR, hwcr);
+               }
        }
 
        if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
-               mce_adjust_timer = mce_intel_adjust_timer;
+               mce_adjust_timer = cmci_intel_adjust_timer;
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
+               mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
                break;
        default:
                break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
        mcheck_intel_therm_init();
+       mcheck_vendor_init_severity();
 
        return 0;
 }
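
The timer rework above keeps the adaptive polling rule intact: if the poll or mce_notify_irq() found work, the next interval is halved (but never below HZ/100 jiffies); otherwise it is doubled up to check_interval seconds. A rough user-space model of that arithmetic, with round_jiffies_relative() omitted and illustrative HZ/ceiling values:

#include <stdio.h>

#define HZ		1000
#define CHECK_INTERVAL	(5 * 60)	/* seconds, like INITIAL_CHECK_INTERVAL */

static unsigned long next_interval(unsigned long iv, int work_seen)
{
	unsigned long min_iv = HZ / 100;
	unsigned long max_iv = (unsigned long)CHECK_INTERVAL * HZ;

	if (work_seen)
		return iv / 2 > min_iv ? iv / 2 : min_iv;	/* poll 2x faster */
	return iv * 2 < max_iv ? iv * 2 : max_iv;		/* back off 2x    */
}

int main(void)
{
	unsigned long iv = (unsigned long)CHECK_INTERVAL * HZ;

	iv = next_interval(iv, 1);	/* an event was logged */
	iv = next_interval(iv, 0);	/* quiet again         */
	printf("next poll in %lu jiffies\n", iv);
	return 0;
}
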
index f1c3769bbd6433344968bf5f78ad223170ef6878..55ad9b37cae853ce0d50f193dc7eb82a49207fee 100644 (file)
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
        return (bank == 4);
 }
 
-static const char * const bank4_names(struct threshold_block *b)
+static const char *bank4_names(const struct threshold_block *b)
 {
        switch (b->address) {
        /* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                        if (!b.interrupt_capable)
                                goto init;
 
+                       b.interrupt_enable = 1;
                        new     = (high & MASK_LVTOFF_HI) >> 20;
                        offset  = setup_APIC_mce(offset, new);
 
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
 log:
        mce_setup(&m);
        rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+       if (!(m.status & MCI_STATUS_VAL))
+               return;
        m.misc = ((u64)high << 32) | low;
        m.bank = bank;
        mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
        b->interrupt_capable    = lvt_interrupt_supported(bank, high);
        b->threshold_limit      = THRESHOLD_MAX;
 
-       if (b->interrupt_capable)
+       if (b->interrupt_capable) {
                threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
-       else
+               b->interrupt_enable = 1;
+       } else {
                threshold_ktype.default_attrs[2] = NULL;
+       }
 
        INIT_LIST_HEAD(&b->miscj);
 
index b3c97bafc1238fedd30f1ec0a3bab2ba7ee05cb8..b4a41cf030edab7dbfae7dc67560ba63eabc2309 100644 (file)
  */
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 
+/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
 /*
  * cmci_discover_lock protects against parallel discovery attempts
  * which could race against each other.
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD         1
 #define CMCI_POLL_INTERVAL     (30 * HZ)
-#define CMCI_STORM_INTERVAL    (1 * HZ)
+#define CMCI_STORM_INTERVAL    (HZ)
 #define CMCI_STORM_THRESHOLD   15
 
 static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
        return !!(cap & MCG_CMCI_P);
 }
 
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
 {
        if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
-               return;
-       machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+               return false;
+
+       /*
+        * Reset the counter if we've logged an error in the last poll
+        * during the storm.
+        */
+       if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+               this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+       else
+               this_cpu_dec(cmci_backoff_cnt);
+
+       return true;
 }
 
 void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
        per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
 }
 
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
 {
-       int r;
-
-       if (interval < CMCI_POLL_INTERVAL)
-               return interval;
+       if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+           (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+               mce_notify_irq();
+               return CMCI_STORM_INTERVAL;
+       }
 
        switch (__this_cpu_read(cmci_storm_state)) {
        case CMCI_STORM_ACTIVE:
+
                /*
                 * We switch back to interrupt mode once the poll timer has
-                * silenced itself. That means no events recorded and the
-                * timer interval is back to our poll interval.
+                * silenced itself. That means no events recorded and the timer
+                * interval is back to our poll interval.
                 */
                __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
-               r = atomic_sub_return(1, &cmci_storm_on_cpus);
-               if (r == 0)
+               if (!atomic_sub_return(1, &cmci_storm_on_cpus))
                        pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
                /* FALLTHROUGH */
 
        case CMCI_STORM_SUBSIDED:
                /*
-                * We wait for all cpus to go back to SUBSIDED
-                * state. When that happens we switch back to
-                * interrupt mode.
+                * We wait for all CPUs to go back to SUBSIDED state. When that
+                * happens we switch back to interrupt mode.
                 */
                if (!atomic_read(&cmci_storm_on_cpus)) {
                        __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
                }
                return CMCI_POLL_INTERVAL;
        default:
-               /*
-                * We have shiny weather. Let the poll do whatever it
-                * thinks.
-                */
+
+               /* We have shiny weather. Let the poll do whatever it thinks. */
                return interval;
        }
 }
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
        cmci_storm_disable_banks();
        __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
        r = atomic_add_return(1, &cmci_storm_on_cpus);
-       mce_timer_kick(CMCI_POLL_INTERVAL);
+       mce_timer_kick(CMCI_STORM_INTERVAL);
+       this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
 
        if (r == 1)
                pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
 {
        if (cmci_storm_detect())
                return;
+
        machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
        mce_notify_irq();
 }
@@ -286,6 +306,7 @@ void cmci_recheck(void)
 
        if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
                return;
+
        local_irq_save(flags);
        machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
        local_irq_restore(flags);
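
The new cmci_backoff_cnt ties leaving storm mode to actual poll results: every poll that logs an error re-arms the counter to INITIAL_CHECK_INTERVAL, quiet polls drain it, and cmci_intel_adjust_timer() keeps the fast CMCI_STORM_INTERVAL only while the counter is still positive. A small user-space model of that behaviour; the per-CPU storage and the storm state machine are left out:

#include <stdio.h>

#define INITIAL_CHECK_INTERVAL	(5 * 60)

static int backoff;

/* Returns non-zero while we should stay in fast (storm) polling mode. */
static int poll(int error_logged)
{
	if (error_logged)
		backoff = INITIAL_CHECK_INTERVAL;
	else if (backoff > 0)
		backoff--;

	return backoff > 0;
}

int main(void)
{
	int quiet_polls = 0;

	poll(1);			/* storm poll logged an error */
	while (poll(0))			/* then only quiet polls      */
		quiet_polls++;
	printf("storm over after %d quiet polls\n", quiet_polls);
	return 0;
}
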
index bfbbe6195e2da7e798323d014cfc5ae2d406d0f3..12829c3ced3c549c982a3072f825c2ed1b1c7758 100644 (file)
@@ -21,7 +21,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/firmware.h>
-#include <linux/pci_ids.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/kernel.h>
index d45df4bd16abec23e9725132ab2f9e649237f564..a413a69cbd744f2e2873434ee20b66e86fb466dd 100644 (file)
 #include <asm/processor.h>
 #include <asm/cmdline.h>
 
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx)       \
-               (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
-       u32 eax = 0x00000000;
-       u32 ebx, ecx = 0, edx;
-
-       native_cpuid(&eax, &ebx, &ecx, &edx);
-
-       if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
-               return X86_VENDOR_INTEL;
-
-       if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
-               return X86_VENDOR_AMD;
-
-       return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
-       u32 eax = 0x00000001;
-       u32 ebx, ecx = 0, edx;
-       int x86;
-
-       native_cpuid(&eax, &ebx, &ecx, &edx);
-
-       x86 = (eax >> 8) & 0xf;
-       if (x86 == 15)
-               x86 += (eax >> 20) & 0xff;
-
-       return x86;
-}
-
 static bool __init check_loader_disabled_bsp(void)
 {
 #ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
 
 void __init load_ucode_bsp(void)
 {
-       int vendor, x86;
+       int vendor, family;
 
        if (check_loader_disabled_bsp())
                return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
                return;
 
        vendor = x86_vendor();
-       x86 = x86_family();
+       family = x86_family();
 
        switch (vendor) {
        case X86_VENDOR_INTEL:
-               if (x86 >= 6)
+               if (family >= 6)
                        load_ucode_intel_bsp();
                break;
        case X86_VENDOR_AMD:
-               if (x86 >= 0x10)
+               if (family >= 0x10)
                        load_ucode_amd_bsp();
                break;
        default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
 
 void load_ucode_ap(void)
 {
-       int vendor, x86;
+       int vendor, family;
 
        if (check_loader_disabled_ap())
                return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
                return;
 
        vendor = x86_vendor();
-       x86 = x86_family();
+       family = x86_family();
 
        switch (vendor) {
        case X86_VENDOR_INTEL:
-               if (x86 >= 6)
+               if (family >= 6)
                        load_ucode_intel_ap();
                break;
        case X86_VENDOR_AMD:
-               if (x86 >= 0x10)
+               if (family >= 0x10)
                        load_ucode_amd_ap();
                break;
        default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
 
 void reload_early_microcode(void)
 {
-       int vendor, x86;
+       int vendor, family;
 
        vendor = x86_vendor();
-       x86 = x86_family();
+       family = x86_family();
 
        switch (vendor) {
        case X86_VENDOR_INTEL:
-               if (x86 >= 6)
+               if (family >= 6)
                        reload_ucode_intel();
                break;
        case X86_VENDOR_AMD:
-               if (x86 >= 0x10)
+               if (family >= 0x10)
                        reload_ucode_amd();
                break;
        default:
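
The BSP/AP/reload paths above still dispatch on vendor and family (Intel needs family >= 6, AMD family >= 0x10); the local x86_vendor()/x86_family() copies are simply dropped in favour of the common helpers, and the variable is renamed from 'x86' to 'family'. For reference, a user-space sketch of the vendor check those helpers perform: CPUID leaf 0 packs the vendor string into EBX, EDX, ECX. The register values below are what a GenuineIntel part returns, and the little-endian reassembly is an illustration, not kernel code:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int ebx = 0x756e6547;	/* "Genu" */
	unsigned int edx = 0x49656e69;	/* "ineI" */
	unsigned int ecx = 0x6c65746e;	/* "ntel" */
	char vendor[13];

	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';

	printf("%s -> take the %s loader path\n", vendor,
	       strcmp(vendor, "GenuineIntel") ? "AMD/unknown" : "Intel");
	return 0;
}
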
index 746e7fd08aad7082747ee1d9dca80570f1a00e3b..a41beadb3db9a396e5b74795e62a49648b367870 100644 (file)
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
        cpf = cpu_sig.pf;
        crev = cpu_sig.rev;
 
-       return get_matching_microcode(csig, cpf, mc_intel, crev);
+       return get_matching_microcode(csig, cpf, crev, mc_intel);
 }
 
 static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 
                csig = uci->cpu_sig.sig;
                cpf = uci->cpu_sig.pf;
-               if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+               if (get_matching_microcode(csig, cpf, new_rev, mc)) {
                        vfree(new_mc);
                        new_rev = mc_header.rev;
                        new_mc  = mc;
index 420eb933189ca487110607475ddbf33be8e8267b..2f49ab4ac0ae137d7ab0b851cf4b9e751d58922c 100644 (file)
  *     as published by the Free Software Foundation; either version
  *     2 of the License, or (at your option) any later version.
  */
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
@@ -28,6 +36,9 @@
 #include <asm/tlbflush.h>
 #include <asm/setup.h>
 
+#undef pr_fmt
+#define pr_fmt(fmt)    "microcode: " fmt
+
 static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
 static struct mc_saved_data {
        unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
 } mc_saved_data;
 
 static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
-                            unsigned int mc_saved_count,
-                            struct ucode_cpu_info *uci)
+load_microcode_early(struct microcode_intel **saved,
+                    unsigned int num_saved, struct ucode_cpu_info *uci)
 {
        struct microcode_intel *ucode_ptr, *new_mc = NULL;
-       int new_rev = uci->cpu_sig.rev;
-       enum ucode_state state = UCODE_OK;
-       unsigned int mc_size;
-       struct microcode_header_intel *mc_header;
-       unsigned int csig = uci->cpu_sig.sig;
-       unsigned int cpf = uci->cpu_sig.pf;
-       int i;
+       struct microcode_header_intel *mc_hdr;
+       int new_rev, ret, i;
 
-       for (i = 0; i < mc_saved_count; i++) {
-               ucode_ptr = mc_saved_p[i];
+       new_rev = uci->cpu_sig.rev;
 
-               mc_header = (struct microcode_header_intel *)ucode_ptr;
-               mc_size = get_totalsize(mc_header);
-               if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
-                       new_rev = mc_header->rev;
-                       new_mc  = ucode_ptr;
-               }
-       }
+       for (i = 0; i < num_saved; i++) {
+               ucode_ptr = saved[i];
+               mc_hdr    = (struct microcode_header_intel *)ucode_ptr;
 
-       if (!new_mc) {
-               state = UCODE_NFOUND;
-               goto out;
+               ret = get_matching_microcode(uci->cpu_sig.sig,
+                                            uci->cpu_sig.pf,
+                                            new_rev,
+                                            ucode_ptr);
+               if (!ret)
+                       continue;
+
+               new_rev = mc_hdr->rev;
+               new_mc  = ucode_ptr;
        }
 
+       if (!new_mc)
+               return UCODE_NFOUND;
+
        uci->mc = (struct microcode_intel *)new_mc;
-out:
-       return state;
+       return UCODE_OK;
 }
 
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
-                 unsigned long *mc_saved_in_initrd,
-                 unsigned long initrd_start, int mc_saved_count)
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+                 unsigned long off, int num_saved)
 {
        int i;
 
-       for (i = 0; i < mc_saved_count; i++)
-               mc_saved[i] = (struct microcode_intel *)
-                             (mc_saved_in_initrd[i] + initrd_start);
+       for (i = 0; i < num_saved; i++)
+               mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
 }
 
 #ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
 #endif
 
 static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
-              unsigned long *mc_saved_in_initrd,
-              unsigned long initrd_start,
-              struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+              unsigned long initrd_start, struct ucode_cpu_info *uci)
 {
        struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
        unsigned int count = mc_saved_data->mc_saved_count;
 
        if (!mc_saved_data->mc_saved) {
-               microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
-                                 initrd_start, count);
+               copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
 
-               return generic_load_microcode_early(mc_saved_tmp, count, uci);
+               return load_microcode_early(mc_saved_tmp, count, uci);
        } else {
 #ifdef CONFIG_X86_32
                microcode_phys(mc_saved_tmp, mc_saved_data);
-               return generic_load_microcode_early(mc_saved_tmp, count, uci);
+               return load_microcode_early(mc_saved_tmp, count, uci);
 #else
-               return generic_load_microcode_early(mc_saved_data->mc_saved,
+               return load_microcode_early(mc_saved_data->mc_saved,
                                                    count, uci);
 #endif
        }
 }
 
-static u8 get_x86_family(unsigned long sig)
-{
-       u8 x86;
-
-       x86 = (sig >> 8) & 0xf;
-
-       if (x86 == 0xf)
-               x86 += (sig >> 20) & 0xff;
-
-       return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
-       u8 x86, x86_model;
-
-       x86 = get_x86_family(sig);
-       x86_model = (sig >> 4) & 0xf;
-
-       if (x86 == 0x6 || x86 == 0xf)
-               x86_model += ((sig >> 16) & 0xf) << 4;
-
-       return x86_model;
-}
-
 /*
  * Given CPU signature and a microcode patch, this function finds if the
  * microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
 matching_model_microcode(struct microcode_header_intel *mc_header,
                        unsigned long sig)
 {
-       u8 x86, x86_model;
-       u8 x86_ucode, x86_model_ucode;
+       unsigned int fam, model;
+       unsigned int fam_ucode, model_ucode;
        struct extended_sigtable *ext_header;
        unsigned long total_size = get_totalsize(mc_header);
        unsigned long data_size = get_datasize(mc_header);
        int ext_sigcount, i;
        struct extended_signature *ext_sig;
 
-       x86 = get_x86_family(sig);
-       x86_model = get_x86_model(sig);
+       fam   = __x86_family(sig);
+       model = x86_model(sig);
 
-       x86_ucode = get_x86_family(mc_header->sig);
-       x86_model_ucode = get_x86_model(mc_header->sig);
+       fam_ucode   = __x86_family(mc_header->sig);
+       model_ucode = x86_model(mc_header->sig);
 
-       if (x86 == x86_ucode && x86_model == x86_model_ucode)
+       if (fam == fam_ucode && model == model_ucode)
                return UCODE_OK;
 
        /* Look for ext. headers: */
        if (total_size <= data_size + MC_HEADER_SIZE)
                return UCODE_NFOUND;
 
-       ext_header = (struct extended_sigtable *)
-                    mc_header + data_size + MC_HEADER_SIZE;
+       ext_header   = (void *) mc_header + data_size + MC_HEADER_SIZE;
+       ext_sig      = (void *)ext_header + EXT_HEADER_SIZE;
        ext_sigcount = ext_header->count;
-       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
        for (i = 0; i < ext_sigcount; i++) {
-               x86_ucode = get_x86_family(ext_sig->sig);
-               x86_model_ucode = get_x86_model(ext_sig->sig);
+               fam_ucode   = __x86_family(ext_sig->sig);
+               model_ucode = x86_model(ext_sig->sig);
 
-               if (x86 == x86_ucode && x86_model == x86_model_ucode)
+               if (fam == fam_ucode && model == model_ucode)
                        return UCODE_OK;
 
                ext_sig++;
        }
-
        return UCODE_NFOUND;
 }
 
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
               unsigned int mc_saved_count)
 {
        int i, j;
-       struct microcode_intel **mc_saved_p;
+       struct microcode_intel **saved_ptr;
        int ret;
 
        if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
        /*
         * Copy new microcode data.
         */
-       mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
-                            GFP_KERNEL);
-       if (!mc_saved_p)
+       saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+       if (!saved_ptr)
                return -ENOMEM;
 
        for (i = 0; i < mc_saved_count; i++) {
-               struct microcode_intel *mc = mc_saved_src[i];
-               struct microcode_header_intel *mc_header = &mc->hdr;
-               unsigned long mc_size = get_totalsize(mc_header);
-               mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
-               if (!mc_saved_p[i]) {
-                       ret = -ENOMEM;
-                       goto err;
-               }
+               struct microcode_header_intel *mc_hdr;
+               struct microcode_intel *mc;
+               unsigned long size;
+
                if (!mc_saved_src[i]) {
                        ret = -EINVAL;
                        goto err;
                }
-               memcpy(mc_saved_p[i], mc, mc_size);
+
+               mc     = mc_saved_src[i];
+               mc_hdr = &mc->hdr;
+               size   = get_totalsize(mc_hdr);
+
+               saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+               if (!saved_ptr[i]) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               memcpy(saved_ptr[i], mc, size);
        }
 
        /*
         * Point to newly saved microcode.
         */
-       mc_saved_data->mc_saved = mc_saved_p;
+       mc_saved_data->mc_saved = saved_ptr;
        mc_saved_data->mc_saved_count = mc_saved_count;
 
        return 0;
 
 err:
        for (j = 0; j <= i; j++)
-               kfree(mc_saved_p[j]);
-       kfree(mc_saved_p);
+               kfree(saved_ptr[j]);
+       kfree(saved_ptr);
 
        return ret;
 }
@@ -257,48 +239,45 @@ err:
  * - or if it is a newly discovered microcode patch.
  *
  * The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
  */
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
-                    unsigned int *mc_saved_count_p)
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+                            u8 *ucode_ptr, unsigned int num_saved)
 {
-       int i;
-       int found = 0;
-       unsigned int mc_saved_count = *mc_saved_count_p;
-       struct microcode_header_intel *mc_header;
+       struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+       unsigned int sig, pf, new_rev;
+       int found = 0, i;
+
+       mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+       for (i = 0; i < num_saved; i++) {
+               mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+               sig          = mc_saved_hdr->sig;
+               pf           = mc_saved_hdr->pf;
+               new_rev      = mc_hdr->rev;
+
+               if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+                       continue;
+
+               found = 1;
+
+               if (!revision_is_newer(mc_hdr, new_rev))
+                       continue;
 
-       mc_header = (struct microcode_header_intel *)ucode_ptr;
-       for (i = 0; i < mc_saved_count; i++) {
-               unsigned int sig, pf;
-               unsigned int new_rev;
-               struct microcode_header_intel *mc_saved_header =
-                            (struct microcode_header_intel *)mc_saved[i];
-               sig = mc_saved_header->sig;
-               pf = mc_saved_header->pf;
-               new_rev = mc_header->rev;
-
-               if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
-                       found = 1;
-                       if (update_match_revision(mc_header, new_rev)) {
-                               /*
-                                * Found an older ucode saved before.
-                                * Replace the older one with this newer
-                                * one.
-                                */
-                               mc_saved[i] =
-                                       (struct microcode_intel *)ucode_ptr;
-                               break;
-                       }
-               }
-       }
-       if (i >= mc_saved_count && !found)
                /*
-                * This ucode is first time discovered in ucode file.
-                * Save it to memory.
+                * Found an older ucode saved earlier. Replace it with
+                * this newer one.
                 */
-               mc_saved[mc_saved_count++] =
-                                (struct microcode_intel *)ucode_ptr;
+               mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+               break;
+       }
+
+       /* Newly detected microcode, save it to memory. */
+       if (i >= num_saved && !found)
+               mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
 
-       *mc_saved_count_p = mc_saved_count;
+       return num_saved;
 }
 
 /*
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
                        continue;
                }
 
-               _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+               mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
 
                ucode_ptr += mc_size;
        }
@@ -372,7 +351,7 @@ out:
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
        unsigned int val[2];
-       u8 x86, x86_model;
+       unsigned int family, model;
        struct cpu_signature csig;
        unsigned int eax, ebx, ecx, edx;
 
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
        native_cpuid(&eax, &ebx, &ecx, &edx);
        csig.sig = eax;
 
-       x86 = get_x86_family(csig.sig);
-       x86_model = get_x86_model(csig.sig);
+       family = __x86_family(csig.sig);
+       model  = x86_model(csig.sig);
 
-       if ((x86_model >= 5) || (x86 > 6)) {
+       if ((model >= 5) || (family > 6)) {
                /* get processor flags from MSR 0x17 */
                native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
                csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
        sig = uci.cpu_sig.sig;
        pf = uci.cpu_sig.pf;
        rev = uci.cpu_sig.rev;
-       pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
-                smp_processor_id(), sig, pf, rev);
+       pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
 
        for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
                struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
                if (total_size <= data_size + MC_HEADER_SIZE)
                        continue;
 
-               ext_header = (struct extended_sigtable *)
-                            mc_saved_header + data_size + MC_HEADER_SIZE;
+               ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
                ext_sigcount = ext_header->count;
                ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
         * Save the microcode patch mc in mc_save_tmp structure if it's a newer
         * version.
         */
-
-       _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+       mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
 
        /*
         * Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
 
 static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
 static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
-               struct mc_saved_data *mc_saved_data,
-               unsigned long *mc_saved_in_initrd,
-               struct ucode_cpu_info *uci)
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+              unsigned long start, unsigned long size,
+              struct ucode_cpu_info *uci)
 {
-       unsigned int size = end - start + 1;
        struct cpio_data cd;
        long offset = 0;
 #ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
        if (!cd.data)
                return UCODE_ERROR;
 
-
        return get_matching_model_microcode(0, start, cd.data, cd.size,
-                                           mc_saved_data, mc_saved_in_initrd,
-                                           uci);
+                                           mc_saved_data, initrd, uci);
 }
 
 /*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
        if (count == 0)
                return ret;
 
-       microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+       copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
        ret = save_microcode(&mc_saved_data, mc_saved, count);
        if (ret)
                pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
 
 static void __init
 _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
-                     unsigned long *mc_saved_in_initrd,
-                     unsigned long initrd_start_early,
-                     unsigned long initrd_end_early,
-                     struct ucode_cpu_info *uci)
+                     unsigned long *initrd,
+                     unsigned long start, unsigned long size)
 {
+       struct ucode_cpu_info uci;
        enum ucode_state ret;
 
-       collect_cpu_info_early(uci);
-       scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
-                      mc_saved_in_initrd, uci);
+       collect_cpu_info_early(&uci);
 
-       ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
-                            initrd_start_early, uci);
+       ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+       if (ret != UCODE_OK)
+               return;
 
-       if (ret == UCODE_OK)
-               apply_microcode_early(uci, true);
+       ret = load_microcode(mc_saved_data, initrd, start, &uci);
+       if (ret != UCODE_OK)
+               return;
+
+       apply_microcode_early(&uci, true);
 }
 
-void __init
-load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(void)
 {
-       u64 ramdisk_image, ramdisk_size;
-       unsigned long initrd_start_early, initrd_end_early;
-       struct ucode_cpu_info uci;
+       u64 start, size;
 #ifdef CONFIG_X86_32
-       struct boot_params *boot_params_p;
+       struct boot_params *p;
 
-       boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
-       ramdisk_image = boot_params_p->hdr.ramdisk_image;
-       ramdisk_size  = boot_params_p->hdr.ramdisk_size;
-       initrd_start_early = ramdisk_image;
-       initrd_end_early = initrd_start_early + ramdisk_size;
+       p       = (struct boot_params *)__pa_nodebug(&boot_params);
+       start   = p->hdr.ramdisk_image;
+       size    = p->hdr.ramdisk_size;
 
        _load_ucode_intel_bsp(
-               (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
-               (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
-               initrd_start_early, initrd_end_early, &uci);
+                       (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+                       (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+                       start, size);
 #else
-       ramdisk_image = boot_params.hdr.ramdisk_image;
-       ramdisk_size  = boot_params.hdr.ramdisk_size;
-       initrd_start_early = ramdisk_image + PAGE_OFFSET;
-       initrd_end_early = initrd_start_early + ramdisk_size;
-
-       _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
-                             initrd_start_early, initrd_end_early,
-                             &uci);
+       start   = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+       size    = boot_params.hdr.ramdisk_size;
+
+       _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
 #endif
 }
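
The CONFIG_X86_32 branch above passes every global through __pa_nodebug() because this code runs before paging is enabled, so the linker's virtual addresses of &boot_params, &mc_saved_data and &mc_saved_in_initrd cannot be dereferenced directly. A minimal sketch of the idea; the PAGE_OFFSET value below is an illustrative assumption, not taken from a real config:

/*
 * Sketch: converting a global's link-time virtual address to a physical
 * address before paging is up.  The kernel helper is __pa_nodebug();
 * PAGE_OFFSET_EXAMPLE stands in for the configured PAGE_OFFSET.
 */
#include <stdint.h>

#define PAGE_OFFSET_EXAMPLE 0xc0000000UL   /* typical 32-bit 3G/1G split */

static inline void *early_phys_addr(void *vaddr)
{
        return (void *)((uintptr_t)vaddr - PAGE_OFFSET_EXAMPLE);
}

The 64-bit branch needs no such conversion here; it only adds PAGE_OFFSET to the ramdisk start because it already runs with paging enabled.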
 
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
        struct ucode_cpu_info uci;
        unsigned long *mc_saved_in_initrd_p;
        unsigned long initrd_start_addr;
+       enum ucode_state ret;
 #ifdef CONFIG_X86_32
        unsigned long *initrd_start_p;
 
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
                return;
 
        collect_cpu_info_early(&uci);
-       load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
-                      initrd_start_addr, &uci);
+       ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+                            initrd_start_addr, &uci);
+
+       if (ret != UCODE_OK)
+               return;
+
        apply_microcode_early(&uci, true);
 }
 
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
 
        collect_cpu_info_early(&uci);
 
-       ret = generic_load_microcode_early(mc_saved_data.mc_saved,
-                                          mc_saved_data.mc_saved_count, &uci);
+       ret = load_microcode_early(mc_saved_data.mc_saved,
+                                  mc_saved_data.mc_saved_count, &uci);
        if (ret != UCODE_OK)
                return;
 
index ce69320d017907aa2d8e25c4fe7ecef47c725005..cd47a510a3f174233300d8763705b6f200faf9f4 100644 (file)
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
        return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
 }
 
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
-       return (mc_header->rev <= rev) ? 0 : 1;
-}
-
 int microcode_sanity_check(void *mc, int print_err)
 {
        unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
 EXPORT_SYMBOL_GPL(microcode_sanity_check);
 
 /*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
 {
        struct microcode_header_intel *mc_header = mc;
        struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
 }
 
 /*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
 {
-       struct microcode_header_intel *mc_header = mc;
+       struct microcode_header_intel *mc_hdr = mc;
 
-       if (!update_match_revision(mc_header, rev))
+       if (!revision_is_newer(mc_hdr, rev))
                return 0;
 
-       return get_matching_sig(csig, cpf, mc, rev);
+       return get_matching_sig(csig, cpf, rev, mc);
 }
 EXPORT_SYMBOL_GPL(get_matching_microcode);
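
With the argument order now (csig, cpf, rev, mc), a caller scanning several candidate blobs looks roughly like the sketch below. The blob array, count and function name are hypothetical; only get_matching_microcode() itself comes from this file:

/*
 * Hypothetical caller: walk candidate microcode blobs and keep the best
 * match for the running CPU.  pick_best_patch() is illustrative only.
 */
static struct microcode_header_intel *
pick_best_patch(void **blobs, int nblobs, unsigned int csig, int cpf, int rev)
{
        struct microcode_header_intel *best = NULL;
        int i;

        for (i = 0; i < nblobs; i++) {
                struct microcode_header_intel *mc = blobs[i];

                /* rejects older revisions and foreign signatures */
                if (!get_matching_microcode(csig, cpf, rev, mc))
                        continue;

                best = mc;
                rev  = mc->rev;   /* only accept strictly newer blobs from here on */
        }
        return best;
}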
index 36d99a337b49f56398ca29d900638ddcedee277b..3f20710a5b23b7f7456e99bb7f654ce703bb6abf 100644 (file)
@@ -6,7 +6,7 @@
 IN=$1
 OUT=$2
 
-function dump_array()
+dump_array()
 {
        ARRAY=$1
        SIZE=$2
index b71a7f86d68aca8ba6dd864e9c8cf15e2b3d28ba..e2888a3ad1e3c3ffd0f2584f34c66f5de0608e19 100644 (file)
@@ -2146,6 +2146,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  */
 static unsigned long code_segment_base(struct pt_regs *regs)
 {
+       /*
+        * For IA32 we look at the GDT/LDT segment base to convert the
+        * effective IP to a linear address.
+        */
+
+#ifdef CONFIG_X86_32
        /*
         * If we are in VM86 mode, add the segment offset to convert to a
         * linear address.
@@ -2153,18 +2159,12 @@ static unsigned long code_segment_base(struct pt_regs *regs)
        if (regs->flags & X86_VM_MASK)
                return 0x10 * regs->cs;
 
-       /*
-        * For IA32 we look at the GDT/LDT segment base to convert the
-        * effective IP to a linear address.
-        */
-#ifdef CONFIG_X86_32
        if (user_mode(regs) && regs->cs != __USER_CS)
                return get_segment_base(regs->cs);
 #else
-       if (test_thread_flag(TIF_IA32)) {
-               if (user_mode(regs) && regs->cs != __USER32_CS)
-                       return get_segment_base(regs->cs);
-       }
+       if (user_mode(regs) && !user_64bit_mode(regs) &&
+           regs->cs != __USER32_CS)
+               return get_segment_base(regs->cs);
 #endif
        return 0;
 }
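
The value returned by code_segment_base() is meant to be added to the register IP; a consumer does the equivalent of the one-liner below (the helper name is made up for illustration):

/* Illustration: turning an effective IP into a linear address.  In VM86
 * mode the base is the segment shifted left by 4 (0x10 * cs above); for
 * flat 32/64-bit user code the base is 0 and this reduces to regs->ip. */
static unsigned long linear_ip(struct pt_regs *regs)
{
        return code_segment_base(regs) + regs->ip;
}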
index 498b6d967138b1fff29659e81813c77e70ff1f58..258990688a5e999557de7f3b5398d7eccdc45ebb 100644 (file)
@@ -212,11 +212,11 @@ static struct event_constraint intel_hsw_event_constraints[] = {
        INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
        INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
        /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x08a3, 0x4),
+       INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
        /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-       INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4),
+       INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
        /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-       INTEL_EVENT_CONSTRAINT(0x04a3, 0xf),
+       INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
        EVENT_CONSTRAINT_END
 };
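
The switch from INTEL_EVENT_CONSTRAINT to INTEL_UEVENT_CONSTRAINT matters because the CYCLE_ACTIVITY sub-events share event code 0xa3 and differ only in their umask. A rough sketch of the matching logic, with mask values as commonly defined in the perf headers (treat them as assumptions):

#include <stdint.h>

#define EVENTSEL_EVENT 0x000000ffULL    /* event code byte */
#define EVENTSEL_UMASK 0x0000ff00ULL    /* unit mask byte  */

/* A constraint applies when the configured event matches the constraint's
 * code under the constraint's mask. */
static int constraint_applies(uint64_t config, uint64_t code, uint64_t mask)
{
        return (config & mask) == (code & mask);
}

/*
 * constraint_applies(0x04a3, 0x08a3, EVENTSEL_EVENT)                  -> 1: too loose,
 *         every CYCLE_ACTIVITY umask would inherit the 0x08a3 constraint.
 * constraint_applies(0x04a3, 0x08a3, EVENTSEL_EVENT | EVENTSEL_UMASK) -> 0: the umask
 *         is compared as well, which is what the UEVENT variant does.
 */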
 
@@ -1649,11 +1649,11 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
        if (c)
                return c;
 
-       c = intel_pebs_constraints(event);
+       c = intel_shared_regs_constraints(cpuc, event);
        if (c)
                return c;
 
-       c = intel_shared_regs_constraints(cpuc, event);
+       c = intel_pebs_constraints(event);
        if (c)
                return c;
 
index aceb2f90c7166afcfa844cd7da58bfe6358efaa6..c76d3e37c6e1dc99a7083f05a2f79b7cd82968e1 100644 (file)
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #ifdef CONFIG_X86_32
        struct pt_regs fixed_regs;
 
-       if (!user_mode_vm(regs)) {
+       if (!user_mode(regs)) {
                crash_fixup_ss_esp(&fixed_regs, regs);
                regs = &fixed_regs;
        }
index 3d3503351242b7c6d8dcc0d32350e36e63da1939..6367a780cc8ca891b9513d2e4688717c8d5d3207 100644 (file)
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
        initial_boot_params = dt = early_memremap(initial_dtb, map_len);
        size = of_get_flat_dt_size();
        if (map_len < size) {
-               early_iounmap(dt, map_len);
+               early_memunmap(dt, map_len);
                initial_boot_params = dt = early_memremap(initial_dtb, size);
                map_len = size;
        }
 
        unflatten_and_copy_device_tree();
-       early_iounmap(dt, map_len);
+       early_memunmap(dt, map_len);
 }
 #else
 static inline void x86_flattree_get_config(void) { }
index cf3df1d8d039e5d1689417e04c56bcb7057ae754..9c30acfadae24757cca11f00513d089e488da78d 100644 (file)
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64;
 int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
 static int die_counter;
 
-static void printk_stack_address(unsigned long address, int reliable)
+static void printk_stack_address(unsigned long address, int reliable,
+               void *data)
 {
-       pr_cont(" [<%p>] %s%pB\n",
-               (void *)address, reliable ? "" : "? ", (void *)address);
+       printk("%s [<%p>] %s%pB\n",
+               (char *)data, (void *)address, reliable ? "" : "? ",
+               (void *)address);
 }
 
 void printk_address(unsigned long address)
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name)
 static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
        touch_nmi_watchdog();
-       printk(data);
-       printk_stack_address(addr, reliable);
+       printk_stack_address(addr, reliable, data);
 }
 
 static const struct stacktrace_ops print_trace_ops = {
@@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
        print_modules();
        show_regs(regs);
 #ifdef CONFIG_X86_32
-       if (user_mode_vm(regs)) {
+       if (user_mode(regs)) {
                sp = regs->sp;
                ss = regs->ss & 0xffff;
        } else {
@@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err)
        unsigned long flags = oops_begin();
        int sig = SIGSEGV;
 
-       if (!user_mode_vm(regs))
+       if (!user_mode(regs))
                report_bug(regs->ip, regs);
 
        if (__die(str, regs, err))
index 5abd4cd4230c69f3ff4730e97a1297be40013c44..464ffd69b92e9ef376b9c534aec3c12973d6ad7a 100644 (file)
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        for (i = 0; i < kstack_depth_to_print; i++) {
                if (kstack_end(stack))
                        break;
-               if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-                       pr_cont("\n");
-               pr_cont(" %08lx", *stack++);
+               if ((i % STACKSLOTS_PER_LINE) == 0) {
+                       if (i != 0)
+                               pr_cont("\n");
+                       printk("%s %08lx", log_lvl, *stack++);
+               } else
+                       pr_cont(" %08lx", *stack++);
                touch_nmi_watchdog();
        }
        pr_cont("\n");
@@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs)
        int i;
 
        show_regs_print_info(KERN_EMERG);
-       __show_regs(regs, !user_mode_vm(regs));
+       __show_regs(regs, !user_mode(regs));
 
        /*
         * When in-kernel, we also print out the stack and code at the
         * time of the fault..
         */
-       if (!user_mode_vm(regs)) {
+       if (!user_mode(regs)) {
                unsigned int code_prologue = code_bytes * 43 / 64;
                unsigned int code_len = code_bytes;
                unsigned char c;
index ff86f19b575849fca7e20a4086e09f798ae8291d..5f1c6266eb3028579f5036161dc240387dcb124f 100644 (file)
@@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                                pr_cont(" <EOI> ");
                        }
                } else {
-               if (((long) stack & (THREAD_SIZE-1)) == 0)
+               if (kstack_end(stack))
                        break;
                }
-               if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-                       pr_cont("\n");
-               pr_cont(" %016lx", *stack++);
+               if ((i % STACKSLOTS_PER_LINE) == 0) {
+                       if (i != 0)
+                               pr_cont("\n");
+                       printk("%s %016lx", log_lvl, *stack++);
+               } else
+                       pr_cont(" %016lx", *stack++);
                touch_nmi_watchdog();
        }
        preempt_enable();
index 46201deee923fffd686244546dc7039dbaab8b2b..7d46bb2603346b41dfb924eee28975cd76e704f3 100644 (file)
@@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
        extmap = (struct e820entry *)(sdata->data);
        __append_e820_map(extmap, entries);
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-       early_iounmap(sdata, data_len);
+       early_memunmap(sdata, data_len);
        printk(KERN_INFO "e820: extended physical RAM map:\n");
        e820_print_map("extended");
 }
index a62536a1be889019081696b2e7df908b0a4ef9fb..49ff55ef9b26cba11af5d9b36c694926ea3e7f6b 100644 (file)
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8;  /* ttyS0 */
 #define DLL             0       /*  Divisor Latch Low         */
 #define DLH             1       /*  Divisor latch High        */
 
-static void mem32_serial_out(unsigned long addr, int offset, int value)
-{
-       uint32_t *vaddr = (uint32_t *)addr;
-       /* shift implied by pointer type */
-       writel(value, vaddr + offset);
-}
-
-static unsigned int mem32_serial_in(unsigned long addr, int offset)
-{
-       uint32_t *vaddr = (uint32_t *)addr;
-       /* shift implied by pointer type */
-       return readl(vaddr + offset);
-}
-
 static unsigned int io_serial_in(unsigned long addr, int offset)
 {
        return inb(addr + offset);
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s)
 }
 
 #ifdef CONFIG_PCI
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+       u32 *vaddr = (u32 *)addr;
+       /* shift implied by pointer type */
+       writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+       u32 *vaddr = (u32 *)addr;
+       /* shift implied by pointer type */
+       return readl(vaddr + offset);
+}
+
 /*
  * early_pci_serial_init()
  *
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s)
        unsigned divisor;
        unsigned long baud = DEFAULT_BAUD;
        u8 bus, slot, func;
-       uint32_t classcode, bar0;
-       uint16_t cmdreg;
+       u32 classcode, bar0;
+       u16 cmdreg;
        char *e;
 
 
index 31e2d5bf3e38887ca06402bff6c647b9aa9a3c5c..1c309763e32197d255b72e3f976aa948ab67f431 100644 (file)
@@ -395,10 +395,13 @@ sysenter_past_esp:
        /*CFI_REL_OFFSET cs, 0*/
        /*
         * Push current_thread_info()->sysenter_return to the stack.
-        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
-        * pushed above; +8 corresponds to copy_thread's esp0 setting.
+        * A tiny bit of offset fixup is necessary: TI_sysenter_return
+        * is relative to thread_info, which is at the bottom of the
+        * kernel stack page.  4*4 means the 4 words pushed above;
+        * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+        * and THREAD_SIZE takes us to the bottom.
         */
-       pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+       pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
        CFI_REL_OFFSET eip, 0
 
        pushl_cfi %eax
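
Spelled out in C, the operand of that pushl works out as follows; the three constants are illustrative stand-ins, the real values come from asm-offsets.c and the Kconfig-selected THREAD_SIZE:

#include <stdint.h>

/* Illustrative values only. */
#define THREAD_SIZE_EX                 8192u   /* kernel stack size       */
#define TOP_OF_KERNEL_STACK_PADDING_EX 8u      /* padding above pt_regs   */
#define TI_SYSENTER_RETURN_EX          0x20u   /* offset of sysenter_return in thread_info */

static uintptr_t sysenter_return_slot(uintptr_t esp_after_pushes)
{
        uintptr_t esp_at_entry = esp_after_pushes + 4 * 4;                      /* undo the 4 pushes    */
        uintptr_t stack_top    = esp_at_entry + TOP_OF_KERNEL_STACK_PADDING_EX; /* top of kernel stack  */
        uintptr_t thread_info  = stack_top - THREAD_SIZE_EX;                    /* bottom = thread_info */

        return thread_info + TI_SYSENTER_RETURN_EX;
}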
@@ -432,7 +435,7 @@ sysenter_after_call:
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        testl $_TIF_ALLWORK_MASK, %ecx
-       jne sysexit_audit
+       jnz sysexit_audit
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
        movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
 
 sysexit_audit:
        testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-       jne syscall_exit_work
+       jnz syscall_exit_work
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_ANY)
        movl %eax,%edx          /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-       jne syscall_exit_work
+       jnz syscall_exit_work
        movl PT_EAX(%esp),%eax  /* reload syscall return value */
        jmp sysenter_exit
 #endif
@@ -510,7 +513,7 @@ syscall_exit:
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        testl $_TIF_ALLWORK_MASK, %ecx  # current->work
-       jne syscall_exit_work
+       jnz syscall_exit_work
 
 restore_all:
        TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig:                             # deal with pending signals and
 #ifdef CONFIG_VM86
        testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
        movl %esp, %eax
-       jne work_notifysig_v86          # returning to kernel-space or
+       jnz work_notifysig_v86          # returning to kernel-space or
                                        # vm86-space
 1:
 #else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
 .endm
 
 /*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
  */
-.section .init.rodata,"a"
-ENTRY(interrupt)
-.section .entry.text, "ax"
-       .p2align 5
-       .p2align CONFIG_X86_L1_CACHE_SHIFT
+       .align 8
 ENTRY(irq_entries_start)
        RING0_INT_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
-       .balign 32
-  .rept        7
-    .if vector < FIRST_SYSTEM_VECTOR
-      .if vector <> FIRST_EXTERNAL_VECTOR
+    vector=FIRST_EXTERNAL_VECTOR
+    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+       pushl_cfi $(~vector+0x80)       /* Note: always in signed byte range */
+    vector=vector+1
+       jmp     common_interrupt
        CFI_ADJUST_CFA_OFFSET -4
-      .endif
-1:     pushl_cfi $(~vector+0x80)       /* Note: always in signed byte range */
-      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
-       jmp 2f
-      .endif
-      .previous
-       .long 1b
-      .section .entry.text, "ax"
-vector=vector+1
-    .endif
-  .endr
-2:     jmp common_interrupt
-.endr
+       .align  8
+    .endr
 END(irq_entries_start)
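
Each generated stub pushes ~vector + 0x80 so the immediate stays in signed-byte range; common_interrupt then adds -0x80 back, and the C side inverts the stored value. Roughly what the handler ends up computing (do_IRQ works from pt_regs->orig_ax; the helper name here is made up):

/* Recover the interrupt vector from the value the stub pushed. */
static unsigned int vector_from_orig_ax(unsigned long orig_ax)
{
        /* orig_ax holds ~vector after the +0x80/-0x80 round trip */
        return (unsigned int)(~orig_ax & 0xff);
}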
 
-.previous
-END(interrupt)
-.previous
-
 /*
  * the CPU automatically disables interrupts when executing an IRQ vector,
  * so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
        pushl_cfi $0
 #ifdef CONFIG_X86_INVD_BUG
        /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:   pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
-       altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663:   pushl $do_simd_coprocessor_error
-664:
-.previous
+       ALTERNATIVE "pushl_cfi $do_general_protection", \
+                   "pushl $do_simd_coprocessor_error", \
+                   X86_FEATURE_XMM
 #else
        pushl_cfi $do_simd_coprocessor_error
 #endif
@@ -1240,20 +1216,13 @@ error_code:
        /*CFI_REL_OFFSET es, 0*/
        pushl_cfi %ds
        /*CFI_REL_OFFSET ds, 0*/
-       pushl_cfi %eax
-       CFI_REL_OFFSET eax, 0
-       pushl_cfi %ebp
-       CFI_REL_OFFSET ebp, 0
-       pushl_cfi %edi
-       CFI_REL_OFFSET edi, 0
-       pushl_cfi %esi
-       CFI_REL_OFFSET esi, 0
-       pushl_cfi %edx
-       CFI_REL_OFFSET edx, 0
-       pushl_cfi %ecx
-       CFI_REL_OFFSET ecx, 0
-       pushl_cfi %ebx
-       CFI_REL_OFFSET ebx, 0
+       pushl_cfi_reg eax
+       pushl_cfi_reg ebp
+       pushl_cfi_reg edi
+       pushl_cfi_reg esi
+       pushl_cfi_reg edx
+       pushl_cfi_reg ecx
+       pushl_cfi_reg ebx
        cld
        movl $(__KERNEL_PERCPU), %ecx
        movl %ecx, %fs
index 2babb393915e76dbeb8a1b757305b819fddd324b..c7b238494b31f267b9c4fc3f4a74d519fb7b3aa5 100644 (file)
  * NOTE: This code handles signal-recognition, which happens every time
  * after an interrupt and after each system call.
  *
- * Normal syscalls and interrupts don't save a full stack frame, this is
- * only done for syscall tracing, signals or fork/exec et.al.
- *
  * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
  * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
  *
  * Some macro usage:
  * - CFI macros are used to generate dwarf2 unwind information for better
  * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
  * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
  * - idtentry - Define exception entry points.
  */
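
For reference, the "iret frame" named above is the hardware-defined tail of pt_regs, the five slots the CPU itself pushes and pops. A standalone rendering, with field names following ptrace.h; the struct itself is only illustrative:

struct iret_frame_sketch {
        unsigned long ip;       /* RIP    - lowest address                     */
        unsigned long cs;
        unsigned long flags;    /* RFLAGS                                      */
        unsigned long sp;       /* RSP                                         */
        unsigned long ss;       /* SS     - highest address, top of the stack  */
};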
        .section .entry.text, "ax"
 
 
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif
-
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret64)
        swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
 #endif /* CONFIG_PARAVIRT */
 
 
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ
 #ifdef CONFIG_TRACE_IRQFLAGS
-       bt   $9,EFLAGS-\offset(%rsp)    /* interrupts off? */
+       bt   $9,EFLAGS(%rsp)    /* interrupts off? */
        jnc  1f
        TRACE_IRQS_ON
 1:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
        call debug_stack_reset
 .endm
 
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
-       bt   $9,EFLAGS-\offset(%rsp)    /* interrupts off? */
+.macro TRACE_IRQS_IRETQ_DEBUG
+       bt   $9,EFLAGS(%rsp)    /* interrupts off? */
        jnc  1f
        TRACE_IRQS_ON_DEBUG
 1:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
 #endif
 
 /*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
-       /* %rsp:at FRAMEEND */
-       .macro FIXUP_TOP_OF_STACK tmp offset=0
-       movq PER_CPU_VAR(old_rsp),\tmp
-       movq \tmp,RSP+\offset(%rsp)
-       movq $__USER_DS,SS+\offset(%rsp)
-       movq $__USER_CS,CS+\offset(%rsp)
-       movq RIP+\offset(%rsp),\tmp  /* get rip */
-       movq \tmp,RCX+\offset(%rsp)  /* copy it to rcx as sysret would do */
-       movq R11+\offset(%rsp),\tmp  /* get eflags */
-       movq \tmp,EFLAGS+\offset(%rsp)
-       .endm
-
-       .macro RESTORE_TOP_OF_STACK tmp offset=0
-       movq RSP+\offset(%rsp),\tmp
-       movq \tmp,PER_CPU_VAR(old_rsp)
-       movq EFLAGS+\offset(%rsp),\tmp
-       movq \tmp,R11+\offset(%rsp)
-       .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
+ * empty frame
  */
        .macro EMPTY_FRAME start=1 offset=0
        .if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
  * initial frame state for interrupts (and exceptions without error code)
  */
        .macro INTR_FRAME start=1 offset=0
-       EMPTY_FRAME \start, SS+8+\offset-RIP
-       /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
-       CFI_REL_OFFSET rsp, RSP+\offset-RIP
-       /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
-       /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
-       CFI_REL_OFFSET rip, RIP+\offset-RIP
+       EMPTY_FRAME \start, 5*8+\offset
+       /*CFI_REL_OFFSET ss, 4*8+\offset*/
+       CFI_REL_OFFSET rsp, 3*8+\offset
+       /*CFI_REL_OFFSET rflags, 2*8+\offset*/
+       /*CFI_REL_OFFSET cs, 1*8+\offset*/
+       CFI_REL_OFFSET rip, 0*8+\offset
        .endm
 
 /*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
  * with vector already pushed)
  */
        .macro XCPT_FRAME start=1 offset=0
-       INTR_FRAME \start, RIP+\offset-ORIG_RAX
-       .endm
-
-/*
- * frame that enables calling into C.
- */
-       .macro PARTIAL_FRAME start=1 offset=0
-       XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
-       CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
-       CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
-       CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
-       CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
-       CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
-       CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
-       CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
-       CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
-       CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+       INTR_FRAME \start, 1*8+\offset
        .endm
 
 /*
  * frame that enables passing a complete pt_regs to a C function.
  */
        .macro DEFAULT_FRAME start=1 offset=0
-       PARTIAL_FRAME \start, R11+\offset-R15
+       XCPT_FRAME \start, ORIG_RAX+\offset
+       CFI_REL_OFFSET rdi, RDI+\offset
+       CFI_REL_OFFSET rsi, RSI+\offset
+       CFI_REL_OFFSET rdx, RDX+\offset
+       CFI_REL_OFFSET rcx, RCX+\offset
+       CFI_REL_OFFSET rax, RAX+\offset
+       CFI_REL_OFFSET r8, R8+\offset
+       CFI_REL_OFFSET r9, R9+\offset
+       CFI_REL_OFFSET r10, R10+\offset
+       CFI_REL_OFFSET r11, R11+\offset
        CFI_REL_OFFSET rbx, RBX+\offset
        CFI_REL_OFFSET rbp, RBP+\offset
        CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
        CFI_REL_OFFSET r15, R15+\offset
        .endm
 
-ENTRY(save_paranoid)
-       XCPT_FRAME 1 RDI+8
-       cld
-       movq %rdi, RDI+8(%rsp)
-       movq %rsi, RSI+8(%rsp)
-       movq_cfi rdx, RDX+8
-       movq_cfi rcx, RCX+8
-       movq_cfi rax, RAX+8
-       movq %r8, R8+8(%rsp)
-       movq %r9, R9+8(%rsp)
-       movq %r10, R10+8(%rsp)
-       movq %r11, R11+8(%rsp)
-       movq_cfi rbx, RBX+8
-       movq %rbp, RBP+8(%rsp)
-       movq %r12, R12+8(%rsp)
-       movq %r13, R13+8(%rsp)
-       movq %r14, R14+8(%rsp)
-       movq %r15, R15+8(%rsp)
-       movl $1,%ebx
-       movl $MSR_GS_BASE,%ecx
-       rdmsr
-       testl %edx,%edx
-       js 1f   /* negative -> in kernel */
-       SWAPGS
-       xorl %ebx,%ebx
-1:     ret
-       CFI_ENDPROC
-END(save_paranoid)
-
 /*
- * A newly forked process directly context switches into this address.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
-       DEFAULT_FRAME
-
-       LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
-       pushq_cfi $0x0002
-       popfq_cfi                               # reset kernel eflags
-
-       call schedule_tail                      # rdi: 'prev' task parameter
-
-       GET_THREAD_INFO(%rcx)
-
-       RESTORE_REST
-
-       testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
-       jz   1f
-
-       /*
-        * By the time we get here, we have no idea whether our pt_regs,
-        * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-        * the slow path, or one of the ia32entry paths.
-        * Use int_ret_from_sys_call to return, since it can safely handle
-        * all of the above.
-        */
-       jmp  int_ret_from_sys_call
-
-1:
-       subq $REST_SKIP, %rsp   # leave space for volatiles
-       CFI_ADJUST_CFA_OFFSET   REST_SKIP
-       movq %rbp, %rdi
-       call *%rbx
-       movl $0, RAX(%rsp)
-       RESTORE_REST
-       jmp int_ret_from_sys_call
-       CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
  *
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer.  However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * Registers on entry:
  * rax  system call number
+ * rcx  return address
+ * r11  saved rflags (note: r11 is a callee-clobbered register in C ABI)
  * rdi  arg0
- * rcx  return address for syscall/sysret, C arg3
  * rsi  arg1
  * rdx  arg2
- * r10  arg3   (--> moved to rcx for C)
+ * r10  arg3 (needs to be moved to rcx to conform to C ABI)
  * r8   arg4
  * r9   arg5
- * r11  eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
  *
- * Interrupts are off on entry.
  * Only called from user space.
  *
- * XXX if we had a free scratch register we could save the RSP into the stack frame
- *      and report it properly in ps. Unfortunately we haven't.
- *
- * When user can change the frames always force IRET. That is because
+ * When the user can change pt_regs->foo, always force IRET. That is because
  * it deals with uncanonical addresses better. SYSRET has trouble
  * with them due to bugs in both AMD and Intel CPUs.
  */
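
Seen from user space, the register convention above corresponds to the usual raw-SYSCALL idiom. A minimal sketch invoking write(2), with rcx and r11 listed as clobbers precisely because the instruction overwrites them with the return RIP and the saved RFLAGS:

/* Minimal user-space sketch of the 64-bit SYSCALL ABI described above. */
static long raw_write(int fd, const void *buf, unsigned long len)
{
        long ret;

        __asm__ volatile ("syscall"
                          : "=a" (ret)                     /* rax: return value         */
                          : "a" (1UL),                     /* rax: __NR_write           */
                            "D" (fd), "S" (buf), "d" (len) /* rdi, rsi, rdx: arguments  */
                          : "rcx", "r11", "memory");       /* clobbered by SYSCALL      */
        return ret;
}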
@@ -324,9 +198,15 @@ END(ret_from_fork)
 ENTRY(system_call)
        CFI_STARTPROC   simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,KERNEL_STACK_OFFSET
+       CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
+
+       /*
+        * Interrupts are off on entry.
+        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON;
+        * it is too small to ever cause noticeable irq latency.
+        */
        SWAPGS_UNSAFE_STACK
        /*
         * A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
         */
 GLOBAL(system_call_after_swapgs)
 
-       movq    %rsp,PER_CPU_VAR(old_rsp)
+       movq    %rsp,PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(kernel_stack),%rsp
+
+       /* Construct struct pt_regs on stack */
+       pushq_cfi $__USER_DS                    /* pt_regs->ss */
+       pushq_cfi PER_CPU_VAR(rsp_scratch)      /* pt_regs->sp */
        /*
-        * No need to follow this irqs off/on section - it's straight
-        * and short:
+        * Re-enable interrupts.
+        * We use 'rsp_scratch' as a scratch space, hence the irq-off block above
+        * must execute atomically in the face of possible interrupt-driven
+        * task preemption. We must enable interrupts only after we're done
+        * with using rsp_scratch:
         */
        ENABLE_INTERRUPTS(CLBR_NONE)
-       SAVE_ARGS 8, 0, rax_enosys=1
-       movq_cfi rax,(ORIG_RAX-ARGOFFSET)
-       movq  %rcx,RIP-ARGOFFSET(%rsp)
-       CFI_REL_OFFSET rip,RIP-ARGOFFSET
-       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       pushq_cfi       %r11                    /* pt_regs->flags */
+       pushq_cfi       $__USER_CS              /* pt_regs->cs */
+       pushq_cfi       %rcx                    /* pt_regs->ip */
+       CFI_REL_OFFSET rip,0
+       pushq_cfi_reg   rax                     /* pt_regs->orig_ax */
+       pushq_cfi_reg   rdi                     /* pt_regs->di */
+       pushq_cfi_reg   rsi                     /* pt_regs->si */
+       pushq_cfi_reg   rdx                     /* pt_regs->dx */
+       pushq_cfi_reg   rcx                     /* pt_regs->cx */
+       pushq_cfi       $-ENOSYS                /* pt_regs->ax */
+       pushq_cfi_reg   r8                      /* pt_regs->r8 */
+       pushq_cfi_reg   r9                      /* pt_regs->r9 */
+       pushq_cfi_reg   r10                     /* pt_regs->r10 */
+       pushq_cfi_reg   r11                     /* pt_regs->r11 */
+       sub     $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
+       CFI_ADJUST_CFA_OFFSET 6*8
+
+       testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
        jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
        andl $__SYSCALL_MASK,%eax
        cmpl $__NR_syscall_max,%eax
 #endif
-       ja ret_from_sys_call  /* and return regs->ax */
+       ja      1f      /* return -ENOSYS (already in pt_regs->ax) */
        movq %r10,%rcx
-       call *sys_call_table(,%rax,8)  # XXX:    rip relative
-       movq %rax,RAX-ARGOFFSET(%rsp)
+       call *sys_call_table(,%rax,8)
+       movq %rax,RAX(%rsp)
+1:
 /*
- * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Syscall return path ending with SYSRET (fast path).
+ * Has incompletely filled pt_regs.
  */
-ret_from_sys_call:
        LOCKDEP_SYS_EXIT
+       /*
+        * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON;
+        * it is too small to ever cause noticeable irq latency.
+        */
        DISABLE_INTERRUPTS(CLBR_NONE)
-       TRACE_IRQS_OFF
 
        /*
         * We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
         * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
         * very bad.
         */
-       testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-       jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+       testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       jnz int_ret_from_sys_call_irqs_off      /* Go to the slow path */
 
        CFI_REMEMBER_STATE
-       /*
-        * sysretq will re-enable interrupts:
-        */
-       TRACE_IRQS_ON
-       movq RIP-ARGOFFSET(%rsp),%rcx
+
+       RESTORE_C_REGS_EXCEPT_RCX_R11
+       movq    RIP(%rsp),%rcx
        CFI_REGISTER    rip,rcx
-       RESTORE_ARGS 1,-ARG_SKIP,0
+       movq    EFLAGS(%rsp),%r11
        /*CFI_REGISTER  rflags,r11*/
-       movq    PER_CPU_VAR(old_rsp), %rsp
+       movq    RSP(%rsp),%rsp
+       /*
+        * 64bit SYSRET restores rip from rcx,
+        * rflags from r11 (but RF and VM bits are forced to 0),
+        * cs and ss are loaded from MSRs.
+        * Restoration of rflags re-enables interrupts.
+        */
        USERGS_SYSRET64
 
        CFI_RESTORE_STATE
 
-int_ret_from_sys_call_fixup:
-       FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-       jmp int_ret_from_sys_call_irqs_off
-
-       /* Do syscall tracing */
+       /* Do syscall entry tracing */
 tracesys:
-       leaq -REST_SKIP(%rsp), %rdi
-       movq $AUDIT_ARCH_X86_64, %rsi
+       movq %rsp, %rdi
+       movl $AUDIT_ARCH_X86_64, %esi
        call syscall_trace_enter_phase1
        test %rax, %rax
        jnz tracesys_phase2             /* if needed, run the slow path */
-       LOAD_ARGS 0                     /* else restore clobbered regs */
+       RESTORE_C_REGS_EXCEPT_RAX       /* else restore clobbered regs */
+       movq ORIG_RAX(%rsp), %rax
        jmp system_call_fastpath        /*      and return to the fast path */
 
 tracesys_phase2:
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %rdi
+       SAVE_EXTRA_REGS
        movq %rsp, %rdi
-       movq $AUDIT_ARCH_X86_64, %rsi
+       movl $AUDIT_ARCH_X86_64, %esi
        movq %rax,%rdx
        call syscall_trace_enter_phase2
 
        /*
-        * Reload arg registers from stack in case ptrace changed them.
+        * Reload registers from stack in case ptrace changed them.
         * We don't reload %rax because syscall_trace_entry_phase2() returned
         * the value it wants us to use in the table lookup.
         */
-       LOAD_ARGS ARGOFFSET, 1
-       RESTORE_REST
+       RESTORE_C_REGS_EXCEPT_RAX
+       RESTORE_EXTRA_REGS
 #if __SYSCALL_MASK == ~0
        cmpq $__NR_syscall_max,%rax
 #else
        andl $__SYSCALL_MASK,%eax
        cmpl $__NR_syscall_max,%eax
 #endif
-       ja   int_ret_from_sys_call      /* RAX(%rsp) is already set */
+       ja      1f      /* return -ENOSYS (already in pt_regs->ax) */
        movq %r10,%rcx  /* fixup for C */
        call *sys_call_table(,%rax,8)
-       movq %rax,RAX-ARGOFFSET(%rsp)
-       /* Use IRET because user could have changed frame */
+       movq %rax,RAX(%rsp)
+1:
+       /* Use IRET because user could have changed pt_regs->foo */
 
 /*
  * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct iret frame.
  */
 GLOBAL(int_ret_from_sys_call)
        DISABLE_INTERRUPTS(CLBR_NONE)
+int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
        TRACE_IRQS_OFF
-int_ret_from_sys_call_irqs_off:
        movl $_TIF_ALLWORK_MASK,%edi
        /* edi: mask to check */
 GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
        movl TI_flags(%rcx),%edx
        andl %edi,%edx
        jnz   int_careful
-       andl    $~TS_COMPAT,TI_status(%rcx)
-       jmp   retint_swapgs
+       andl    $~TS_COMPAT,TI_status(%rcx)
+       jmp     syscall_return
 
        /* Either reschedule or signal or syscall exit tracking needed. */
        /* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
        TRACE_IRQS_OFF
        jmp int_with_check
 
-       /* handle signals and tracing -- both require a full stack frame */
+       /* handle signals and tracing -- both require a full pt_regs */
 int_very_careful:
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
-       SAVE_REST
+       SAVE_EXTRA_REGS
        /* Check for syscall exit trace */
        testl $_TIF_WORK_SYSCALL_EXIT,%edx
        jz int_signal
@@ -492,86 +395,192 @@ int_signal:
        call do_notify_resume
 1:     movl $_TIF_WORK_MASK,%edi
 int_restore_rest:
-       RESTORE_REST
+       RESTORE_EXTRA_REGS
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        jmp int_with_check
+
+syscall_return:
+       /* The IRETQ could re-enable interrupts: */
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       TRACE_IRQS_IRETQ
+
+       /*
+        * Try to use SYSRET instead of IRET if we're returning to
+        * a completely clean 64-bit userspace context.
+        */
+       movq RCX(%rsp),%rcx
+       cmpq %rcx,RIP(%rsp)             /* RCX == RIP */
+       jne opportunistic_sysret_failed
+
+       /*
+        * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+        * in kernel space.  This essentially lets the user take over
+        * the kernel, since userspace controls RSP.  It's not worth
+        * testing for canonicalness exactly -- this check detects any
+        * of the 17 high bits set, which is true for non-canonical
+        * or kernel addresses.  (This will pessimize vsyscall=native.
+        * Big deal.)
+        *
+        * If virtual addresses ever become wider, this will need
+        * to be updated to remain correct on both old and new CPUs.
+        */
+       .ifne __VIRTUAL_MASK_SHIFT - 47
+       .error "virtual address width changed -- SYSRET checks need update"
+       .endif
+       shr $__VIRTUAL_MASK_SHIFT, %rcx
+       jnz opportunistic_sysret_failed
+
+       cmpq $__USER_CS,CS(%rsp)        /* CS must match SYSRET */
+       jne opportunistic_sysret_failed
+
+       movq R11(%rsp),%r11
+       cmpq %r11,EFLAGS(%rsp)          /* R11 == RFLAGS */
+       jne opportunistic_sysret_failed
+
+       /*
+        * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
+        * restoring TF results in a trap from userspace immediately after
+        * SYSRET.  This would cause an infinite loop whenever #DB happens
+        * with register state that satisfies the opportunistic SYSRET
+        * conditions.  For example, single-stepping this user code:
+        *
+        *           movq $stuck_here,%rcx
+        *           pushfq
+        *           popq %r11
+        *   stuck_here:
+        *
+        * would never get past 'stuck_here'.
+        */
+       testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+       jnz opportunistic_sysret_failed
+
+       /* nothing to check for RSP */
+
+       cmpq $__USER_DS,SS(%rsp)        /* SS must match SYSRET */
+       jne opportunistic_sysret_failed
+
+       /*
+        * We win!  This label is here just for ease of understanding
+        * perf profiles.  Nothing jumps here.
+        */
+syscall_return_via_sysret:
+       CFI_REMEMBER_STATE
+       /* r11 is already restored (see code above) */
+       RESTORE_C_REGS_EXCEPT_R11
+       movq RSP(%rsp),%rsp
+       USERGS_SYSRET64
+       CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+       SWAPGS
+       jmp     restore_c_regs_and_iret
        CFI_ENDPROC
 END(system_call)
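
The chain of checks above can be read as a single predicate. A C paraphrase, with the pt_regs mirror struct and flag constants written out as assumptions (the real code works directly on the stack frame):

#include <stdbool.h>
#include <stdint.h>

#define EFLAGS_TF  (1UL << 8)           /* trap flag   */
#define EFLAGS_RF  (1UL << 16)          /* resume flag */
#define VIRT_SHIFT 47                   /* 48-bit virtual addresses assumed */

struct sysret_regs {                    /* hypothetical mirror of the relevant pt_regs slots */
        uint64_t ip, cx, flags, r11, cs, ss;
};

static bool can_return_via_sysret(const struct sysret_regs *r,
                                  uint64_t user_cs, uint64_t user_ds)
{
        if (r->cx != r->ip)                     /* SYSRET loads RIP from RCX          */
                return false;
        if (r->cx >> VIRT_SHIFT)                /* non-canonical or kernel address    */
                return false;
        if (r->r11 != r->flags)                 /* SYSRET loads RFLAGS from R11       */
                return false;
        if (r->r11 & (EFLAGS_RF | EFLAGS_TF))   /* RF can't be restored; TF re-traps  */
                return false;
        if (r->cs != user_cs || r->ss != user_ds)
                return false;
        return true;                            /* eligible for the SYSRET fast path  */
}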
 
+
        .macro FORK_LIKE func
 ENTRY(stub_\func)
        CFI_STARTPROC
-       popq    %r11                    /* save return address */
-       PARTIAL_FRAME 0
-       SAVE_REST
-       pushq   %r11                    /* put it back on stack */
-       FIXUP_TOP_OF_STACK %r11, 8
-       DEFAULT_FRAME 0 8               /* offset 8: return address */
-       call sys_\func
-       RESTORE_TOP_OF_STACK %r11, 8
-       ret $REST_SKIP          /* pop extended registers */
+       DEFAULT_FRAME 0, 8              /* offset 8: return address */
+       SAVE_EXTRA_REGS 8
+       jmp sys_\func
        CFI_ENDPROC
 END(stub_\func)
        .endm
 
-       .macro FIXED_FRAME label,func
-ENTRY(\label)
-       CFI_STARTPROC
-       PARTIAL_FRAME 0 8               /* offset 8: return address */
-       FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
-       call \func
-       RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
-       ret
-       CFI_ENDPROC
-END(\label)
-       .endm
-
        FORK_LIKE  clone
        FORK_LIKE  fork
        FORK_LIKE  vfork
-       FIXED_FRAME stub_iopl, sys_iopl
 
 ENTRY(stub_execve)
        CFI_STARTPROC
-       addq $8, %rsp
-       PARTIAL_FRAME 0
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
-       call sys_execve
-       movq %rax,RAX(%rsp)
-       RESTORE_REST
-       jmp int_ret_from_sys_call
+       DEFAULT_FRAME 0, 8
+       call    sys_execve
+return_from_execve:
+       testl   %eax, %eax
+       jz      1f
+       /* exec failed, can use fast SYSRET code path in this case */
+       ret
+1:
+       /* must use IRET code path (pt_regs->cs may have changed) */
+       addq    $8, %rsp
+       CFI_ADJUST_CFA_OFFSET -8
+       ZERO_EXTRA_REGS
+       movq    %rax,RAX(%rsp)
+       jmp     int_ret_from_sys_call
        CFI_ENDPROC
 END(stub_execve)
-
-ENTRY(stub_execveat)
+/*
+ * Remaining execve stubs are only 7 bytes long.
+ * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
+ */
+       .align  8
+GLOBAL(stub_execveat)
        CFI_STARTPROC
-       addq $8, %rsp
-       PARTIAL_FRAME 0
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
-       call sys_execveat
-       RESTORE_TOP_OF_STACK %r11
-       movq %rax,RAX(%rsp)
-       RESTORE_REST
-       jmp int_ret_from_sys_call
+       DEFAULT_FRAME 0, 8
+       call    sys_execveat
+       jmp     return_from_execve
        CFI_ENDPROC
 END(stub_execveat)
 
+#ifdef CONFIG_X86_X32_ABI
+       .align  8
+GLOBAL(stub_x32_execve)
+       CFI_STARTPROC
+       DEFAULT_FRAME 0, 8
+       call    compat_sys_execve
+       jmp     return_from_execve
+       CFI_ENDPROC
+END(stub_x32_execve)
+       .align  8
+GLOBAL(stub_x32_execveat)
+       CFI_STARTPROC
+       DEFAULT_FRAME 0, 8
+       call    compat_sys_execveat
+       jmp     return_from_execve
+       CFI_ENDPROC
+END(stub_x32_execveat)
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+       .align  8
+GLOBAL(stub32_execve)
+       CFI_STARTPROC
+       call    compat_sys_execve
+       jmp     return_from_execve
+       CFI_ENDPROC
+END(stub32_execve)
+       .align  8
+GLOBAL(stub32_execveat)
+       CFI_STARTPROC
+       call    compat_sys_execveat
+       jmp     return_from_execve
+       CFI_ENDPROC
+END(stub32_execveat)
+#endif
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
  */
 ENTRY(stub_rt_sigreturn)
        CFI_STARTPROC
-       addq $8, %rsp
-       PARTIAL_FRAME 0
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
+       DEFAULT_FRAME 0, 8
+       /*
+        * SAVE_EXTRA_REGS result is not normally needed:
+        * sigreturn overwrites all pt_regs->GPREGS.
+        * But sigreturn can fail (!), and there is no easy way to detect that.
+        * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
+        * we SAVE_EXTRA_REGS here.
+        */
+       SAVE_EXTRA_REGS 8
        call sys_rt_sigreturn
-       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-       RESTORE_REST
+return_from_stub:
+       addq    $8, %rsp
+       CFI_ADJUST_CFA_OFFSET -8
+       RESTORE_EXTRA_REGS
+       movq %rax,RAX(%rsp)
        jmp int_ret_from_sys_call
        CFI_ENDPROC
 END(stub_rt_sigreturn)
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
 #ifdef CONFIG_X86_X32_ABI
 ENTRY(stub_x32_rt_sigreturn)
        CFI_STARTPROC
-       addq $8, %rsp
-       PARTIAL_FRAME 0
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
+       DEFAULT_FRAME 0, 8
+       SAVE_EXTRA_REGS 8
        call sys32_x32_rt_sigreturn
-       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-       RESTORE_REST
-       jmp int_ret_from_sys_call
+       jmp  return_from_stub
        CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
+#endif
 
-ENTRY(stub_x32_execve)
-       CFI_STARTPROC
-       addq $8, %rsp
-       PARTIAL_FRAME 0
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
-       call compat_sys_execve
-       RESTORE_TOP_OF_STACK %r11
-       movq %rax,RAX(%rsp)
-       RESTORE_REST
-       jmp int_ret_from_sys_call
-       CFI_ENDPROC
-END(stub_x32_execve)
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+       DEFAULT_FRAME
 
-ENTRY(stub_x32_execveat)
-       CFI_STARTPROC
-       addq $8, %rsp
-       PARTIAL_FRAME 0
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
-       call compat_sys_execveat
-       RESTORE_TOP_OF_STACK %r11
-       movq %rax,RAX(%rsp)
-       RESTORE_REST
+       LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+       pushq_cfi $0x0002
+       popfq_cfi                               # reset kernel eflags
+
+       call schedule_tail                      # rdi: 'prev' task parameter
+
+       RESTORE_EXTRA_REGS
+
+       testl $3,CS(%rsp)                       # from kernel_thread?
+
+       /*
+        * By the time we get here, we have no idea whether our pt_regs,
+        * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+        * the slow path, or one of the ia32entry paths.
+        * Use IRET code path to return, since it can safely handle
+        * all of the above.
+        */
+       jnz     int_ret_from_sys_call
+
+       /* We came from kernel_thread */
+       /* nb: we depend on RESTORE_EXTRA_REGS above */
+       movq %rbp, %rdi
+       call *%rbx
+       movl $0, RAX(%rsp)
+       RESTORE_EXTRA_REGS
        jmp int_ret_from_sys_call
        CFI_ENDPROC
-END(stub_x32_execveat)
-
-#endif
+END(ret_from_fork)
 
 /*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
  */
-       .section .init.rodata,"a"
-ENTRY(interrupt)
-       .section .entry.text
-       .p2align 5
-       .p2align CONFIG_X86_L1_CACHE_SHIFT
+       .align 8
 ENTRY(irq_entries_start)
        INTR_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
-       .balign 32
-  .rept        7
-    .if vector < FIRST_SYSTEM_VECTOR
-      .if vector <> FIRST_EXTERNAL_VECTOR
+    vector=FIRST_EXTERNAL_VECTOR
+    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+       pushq_cfi $(~vector+0x80)       /* Note: always in signed byte range */
+    vector=vector+1
+       jmp     common_interrupt
        CFI_ADJUST_CFA_OFFSET -8
-      .endif
-1:     pushq_cfi $(~vector+0x80)       /* Note: always in signed byte range */
-      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
-       jmp 2f
-      .endif
-      .previous
-       .quad 1b
-      .section .entry.text
-vector=vector+1
-    .endif
-  .endr
-2:     jmp common_interrupt
-.endr
+       .align  8
+    .endr
        CFI_ENDPROC
 END(irq_entries_start)
 
-.previous
-END(interrupt)
-.previous
-
 /*
  * Interrupt entry/exit.
  *
@@ -669,47 +662,45 @@ END(interrupt)
 
 /* 0(%rsp): ~(interrupt number) */
        .macro interrupt func
-       /* reserve pt_regs for scratch regs and rbp */
-       subq $ORIG_RAX-RBP, %rsp
-       CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
        cld
-       /* start from rbp in pt_regs and jump over */
-       movq_cfi rdi, (RDI-RBP)
-       movq_cfi rsi, (RSI-RBP)
-       movq_cfi rdx, (RDX-RBP)
-       movq_cfi rcx, (RCX-RBP)
-       movq_cfi rax, (RAX-RBP)
-       movq_cfi  r8,  (R8-RBP)
-       movq_cfi  r9,  (R9-RBP)
-       movq_cfi r10, (R10-RBP)
-       movq_cfi r11, (R11-RBP)
-
-       /* Save rbp so that we can unwind from get_irq_regs() */
-       movq_cfi rbp, 0
-
-       /* Save previous stack value */
-       movq %rsp, %rsi
+       /*
+        * Since nothing in interrupt handling code touches r12...r15 members
+        * of "struct pt_regs", and since interrupts can nest, we can save
+        * four stack slots and simultaneously provide
+        * an unwind-friendly stack layout by saving "truncated" pt_regs
+        * exactly up to rbp slot, without these members.
+        */
+       ALLOC_PT_GPREGS_ON_STACK -RBP
+       SAVE_C_REGS -RBP
+       /* this goes to 0(%rsp) for unwinder, not for saving the value: */
+       SAVE_EXTRA_REGS_RBP -RBP
+
+       leaq -RBP(%rsp),%rdi    /* arg1 for \func (pointer to pt_regs) */
 
-       leaq -RBP(%rsp),%rdi    /* arg1 for handler */
-       testl $3, CS-RBP(%rsi)
+       testl $3, CS-RBP(%rsp)
        je 1f
        SWAPGS
+1:
        /*
+        * Save previous stack pointer, optionally switch to interrupt stack.
         * irq_count is used to check if a CPU is already on an interrupt stack
         * or not. While this is essentially redundant with preempt_count it is
         * a little cheaper to use a separate counter in the PDA (short of
         * moving irq_enter into assembly, which would be too much work)
         */
-1:     incl PER_CPU_VAR(irq_count)
+       movq %rsp, %rsi
+       incl PER_CPU_VAR(irq_count)
        cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
        CFI_DEF_CFA_REGISTER    rsi
-
-       /* Store previous stack value */
        pushq %rsi
+       /*
+        * For debugger:
+        * "CFA (Current Frame Address) is the value on stack + offset"
+        */
        CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
-                       0x77 /* DW_OP_breg7 */, 0, \
+                       0x77 /* DW_OP_breg7 (rsp) */, 0, \
                        0x06 /* DW_OP_deref */, \
-                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
+                       0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
                        0x22 /* DW_OP_plus */
        /* We entered an interrupt context - irqs are off: */
        TRACE_IRQS_OFF
@@ -727,7 +718,7 @@ common_interrupt:
        ASM_CLAC
        addq $-0x80,(%rsp)              /* Adjust vector to [-256,-1] range */
        interrupt do_IRQ
-       /* 0(%rsp): old_rsp-ARGOFFSET */
+       /* 0(%rsp): old RSP */
 ret_from_intr:
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
 
        /* Restore saved previous stack */
        popq %rsi
-       CFI_DEF_CFA rsi,SS+8-RBP        /* reg/off reset after def_cfa_expr */
-       leaq ARGOFFSET-RBP(%rsi), %rsp
+       CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+       /* return code expects complete pt_regs - adjust rsp accordingly: */
+       leaq -RBP(%rsi),%rsp
        CFI_DEF_CFA_REGISTER    rsp
-       CFI_ADJUST_CFA_OFFSET   RBP-ARGOFFSET
+       CFI_ADJUST_CFA_OFFSET   RBP
 
-exit_intr:
-       GET_THREAD_INFO(%rcx)
-       testl $3,CS-ARGOFFSET(%rsp)
+       testl $3,CS(%rsp)
        je retint_kernel
-
        /* Interrupt came from user space */
+
+       GET_THREAD_INFO(%rcx)
        /*
-        * Has a correct top of stack, but a partial stack frame
         * %rcx: thread info. Interrupts off.
         */
 retint_with_reschedule:
@@ -766,70 +756,34 @@ retint_swapgs:            /* return to user-space */
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_IRETQ
 
-       /*
-        * Try to use SYSRET instead of IRET if we're returning to
-        * a completely clean 64-bit userspace context.
-        */
-       movq (RCX-R11)(%rsp), %rcx
-       cmpq %rcx,(RIP-R11)(%rsp)               /* RCX == RIP */
-       jne opportunistic_sysret_failed
-
-       /*
-        * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
-        * in kernel space.  This essentially lets the user take over
-        * the kernel, since userspace controls RSP.  It's not worth
-        * testing for canonicalness exactly -- this check detects any
-        * of the 17 high bits set, which is true for non-canonical
-        * or kernel addresses.  (This will pessimize vsyscall=native.
-        * Big deal.)
-        *
-        * If virtual addresses ever become wider, this will need
-        * to be updated to remain correct on both old and new CPUs.
-        */
-       .ifne __VIRTUAL_MASK_SHIFT - 47
-       .error "virtual address width changed -- sysret checks need update"
-       .endif
-       shr $__VIRTUAL_MASK_SHIFT, %rcx
-       jnz opportunistic_sysret_failed
-
-       cmpq $__USER_CS,(CS-R11)(%rsp)          /* CS must match SYSRET */
-       jne opportunistic_sysret_failed
-
-       movq (R11-ARGOFFSET)(%rsp), %r11
-       cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)      /* R11 == RFLAGS */
-       jne opportunistic_sysret_failed
-
-       testq $X86_EFLAGS_RF,%r11               /* sysret can't restore RF */
-       jnz opportunistic_sysret_failed
-
-       /* nothing to check for RSP */
-
-       cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)    /* SS must match SYSRET */
-       jne opportunistic_sysret_failed
-
-       /*
-        * We win!  This label is here just for ease of understanding
-        * perf profiles.  Nothing jumps here.
-        */
-irq_return_via_sysret:
-       CFI_REMEMBER_STATE
-       RESTORE_ARGS 1,8,1
-       movq (RSP-RIP)(%rsp),%rsp
-       USERGS_SYSRET64
-       CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
        SWAPGS
-       jmp restore_args
+       jmp     restore_c_regs_and_iret
 
-retint_restore_args:   /* return to kernel space */
-       DISABLE_INTERRUPTS(CLBR_ANY)
+/* Returning to kernel space */
+retint_kernel:
+#ifdef CONFIG_PREEMPT
+       /* Interrupts are off */
+       /* Check if we need preemption */
+       bt      $9,EFLAGS(%rsp) /* interrupts were off? */
+       jnc     1f
+0:     cmpl    $0,PER_CPU_VAR(__preempt_count)
+       jnz     1f
+       call    preempt_schedule_irq
+       jmp     0b
+1:
+#endif
        /*
         * The iretq could re-enable interrupts:
         */
        TRACE_IRQS_IRETQ
-restore_args:
-       RESTORE_ARGS 1,8,1
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
+       RESTORE_C_REGS
+       REMOVE_PT_GPREGS_FROM_STACK 8
 
 irq_return:
        INTERRUPT_RETURN
@@ -900,28 +854,17 @@ retint_signal:
        jz    retint_swapgs
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_NONE)
-       SAVE_REST
+       SAVE_EXTRA_REGS
        movq $-1,ORIG_RAX(%rsp)
        xorl %esi,%esi          # oldset
        movq %rsp,%rdi          # &pt_regs
        call do_notify_resume
-       RESTORE_REST
+       RESTORE_EXTRA_REGS
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        GET_THREAD_INFO(%rcx)
        jmp retint_with_reschedule
 
-#ifdef CONFIG_PREEMPT
-       /* Returning to kernel space. Check if we need preemption */
-       /* rcx:  threadinfo. interrupts off. */
-ENTRY(retint_kernel)
-       cmpl $0,PER_CPU_VAR(__preempt_count)
-       jnz  retint_restore_args
-       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
-       jnc  retint_restore_args
-       call preempt_schedule_irq
-       jmp exit_intr
-#endif
        CFI_ENDPROC
 END(common_interrupt)
 
@@ -1010,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
 /*
  * Exception entry points.
  */
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -1032,8 +975,7 @@ ENTRY(\sym)
        pushq_cfi $-1                   /* ORIG_RAX: no syscall to restart */
        .endif
 
-       subq $ORIG_RAX-R15, %rsp
-       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+       ALLOC_PT_GPREGS_ON_STACK
 
        .if \paranoid
        .if \paranoid == 1
@@ -1041,10 +983,11 @@ ENTRY(\sym)
        testl $3, CS(%rsp)              /* If coming from userspace, switch */
        jnz 1f                          /* stacks. */
        .endif
-       call save_paranoid
+       call paranoid_entry
        .else
        call error_entry
        .endif
+       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
        DEFAULT_FRAME 0
 
@@ -1066,19 +1009,20 @@ ENTRY(\sym)
        .endif
 
        .if \shift_ist != -1
-       subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+       subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
        .endif
 
        call \do_sym
 
        .if \shift_ist != -1
-       addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+       addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
        .endif
 
+       /* these procedures expect "no swapgs" flag in ebx */
        .if \paranoid
-       jmp paranoid_exit               /* %ebx: no swapgs flag */
+       jmp paranoid_exit
        .else
-       jmp error_exit                  /* %ebx: no swapgs flag */
+       jmp error_exit
        .endif
 
        .if \paranoid == 1
@@ -1282,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
        addq $0x30,%rsp
        CFI_ADJUST_CFA_OFFSET -0x30
        pushq_cfi $-1 /* orig_ax = -1 => not a system call */
-       SAVE_ALL
+       ALLOC_PT_GPREGS_ON_STACK
+       SAVE_C_REGS
+       SAVE_EXTRA_REGS
        jmp error_exit
        CFI_ENDPROC
 END(xen_failsafe_callback)
@@ -1314,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
 #endif
 
-       /*
-        * "Paranoid" exit path from exception stack.  This is invoked
-        * only on return from non-NMI IST interrupts that came
-        * from kernel space.
-        *
-        * We may be returning to very strange contexts (e.g. very early
-        * in syscall entry), so checking for preemption here would
-        * be complicated.  Fortunately, we there's no good reason
-        * to try to handle preemption here.
-        */
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+       XCPT_FRAME 1 15*8
+       cld
+       SAVE_C_REGS 8
+       SAVE_EXTRA_REGS 8
+       movl $1,%ebx
+       movl $MSR_GS_BASE,%ecx
+       rdmsr
+       testl %edx,%edx
+       js 1f   /* negative -> in kernel */
+       SWAPGS
+       xorl %ebx,%ebx
+1:     ret
+       CFI_ENDPROC
+END(paranoid_entry)
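
The check in paranoid_entry relies on rdmsr leaving the 64-bit MSR_GS_BASE value in EDX:EAX and on kernel GS bases living in the negative half of the canonical address space, so testing the sign of EDX is enough. A hedged, stand-alone C rendering of that test (illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors "testl %edx,%edx; js 1f": kernel GS if the high half is negative. */
static bool gs_base_is_kernel(uint64_t gs_base)
{
	uint32_t edx = (uint32_t)(gs_base >> 32);	/* high half, as rdmsr returns it */

	return (int32_t)edx < 0;			/* sign bit set -> already on kernel gs */
}

int main(void)
{
	printf("user gs base:   %d\n", gs_base_is_kernel(0x00007f1234560000ULL));	/* 0 */
	printf("kernel gs base: %d\n", gs_base_is_kernel(0xffff880000000000ULL));	/* 1 */
	return 0;
}
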
 
-       /* ebx: no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack.  This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated.  Fortunately, there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(paranoid_exit)
        DEFAULT_FRAME
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF_DEBUG
        testl %ebx,%ebx                         /* swapgs needed? */
-       jnz paranoid_restore
-       TRACE_IRQS_IRETQ 0
+       jnz paranoid_exit_no_swapgs
+       TRACE_IRQS_IRETQ
        SWAPGS_UNSAFE_STACK
-       RESTORE_ALL 8
-       INTERRUPT_RETURN
-paranoid_restore:
-       TRACE_IRQS_IRETQ_DEBUG 0
-       RESTORE_ALL 8
+       jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+       TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+       RESTORE_EXTRA_REGS
+       RESTORE_C_REGS
+       REMOVE_PT_GPREGS_FROM_STACK 8
        INTERRUPT_RETURN
        CFI_ENDPROC
 END(paranoid_exit)
 
 /*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(error_entry)
-       XCPT_FRAME
-       CFI_ADJUST_CFA_OFFSET 15*8
-       /* oldrax contains error code */
+       XCPT_FRAME 1 15*8
        cld
-       movq %rdi, RDI+8(%rsp)
-       movq %rsi, RSI+8(%rsp)
-       movq %rdx, RDX+8(%rsp)
-       movq %rcx, RCX+8(%rsp)
-       movq %rax, RAX+8(%rsp)
-       movq  %r8,  R8+8(%rsp)
-       movq  %r9,  R9+8(%rsp)
-       movq %r10, R10+8(%rsp)
-       movq %r11, R11+8(%rsp)
-       movq_cfi rbx, RBX+8
-       movq %rbp, RBP+8(%rsp)
-       movq %r12, R12+8(%rsp)
-       movq %r13, R13+8(%rsp)
-       movq %r14, R14+8(%rsp)
-       movq %r15, R15+8(%rsp)
+       SAVE_C_REGS 8
+       SAVE_EXTRA_REGS 8
        xorl %ebx,%ebx
        testl $3,CS+8(%rsp)
        je error_kernelspace
@@ -1376,12 +1329,12 @@ error_sti:
        TRACE_IRQS_OFF
        ret
 
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here.  B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+       /*
+        * There are two places in the kernel that can potentially fault with
+        * usergs. Handle them here.  B stepping K8s sometimes report a
+        * truncated RIP for IRET exceptions returning to compat mode. Check
+        * for these here too.
+        */
 error_kernelspace:
        CFI_REL_OFFSET rcx, RCX+8
        incl %ebx
@@ -1411,11 +1364,11 @@ error_bad_iret:
 END(error_entry)
 
 
-/* ebx:        no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(error_exit)
        DEFAULT_FRAME
        movl %ebx,%eax
-       RESTORE_REST
+       RESTORE_EXTRA_REGS
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        GET_THREAD_INFO(%rcx)
@@ -1430,19 +1383,7 @@ ENTRY(error_exit)
        CFI_ENDPROC
 END(error_exit)
 
-/*
- * Test if a given stack is an NMI stack or not.
- */
-       .macro test_in_nmi reg stack nmi_ret normal_ret
-       cmpq %\reg, \stack
-       ja \normal_ret
-       subq $EXCEPTION_STKSZ, %\reg
-       cmpq %\reg, \stack
-       jb \normal_ret
-       jmp \nmi_ret
-       .endm
-
-       /* runs on exception stack */
+/* Runs on exception stack */
 ENTRY(nmi)
        INTR_FRAME
        PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1478,7 +1419,7 @@ ENTRY(nmi)
         * NMI.
         */
 
-       /* Use %rdx as out temp variable throughout */
+       /* Use %rdx as our temp variable throughout */
        pushq_cfi %rdx
        CFI_REL_OFFSET rdx, 0
 
@@ -1503,8 +1444,17 @@ ENTRY(nmi)
         * We check the variable because the first NMI could be in a
         * breakpoint routine using a breakpoint stack.
         */
-       lea 6*8(%rsp), %rdx
-       test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+       lea     6*8(%rsp), %rdx
+       /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+       cmpq    %rdx, 4*8(%rsp)
+       /* If the stack pointer is above the NMI stack, this is a normal NMI */
+       ja      first_nmi
+       subq    $EXCEPTION_STKSZ, %rdx
+       cmpq    %rdx, 4*8(%rsp)
+       /* If it is below the NMI stack, it is a normal NMI */
+       jb      first_nmi
+       /* Ah, it is within the NMI stack, treat it as nested */
+
        CFI_REMEMBER_STATE
 
 nested_nmi:
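
The comparison sequence open-coded a few lines above replaces the removed test_in_nmi macro. A hedged, stand-alone C rendering of that range check; EXCEPTION_STKSZ and the parameter names are illustrative, with prev_sp corresponding to what the asm reads from 4*8(%rsp) and nmi_stack_top to the lea 6*8(%rsp) result:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXCEPTION_STKSZ 4096UL	/* illustrative; the real size depends on the config */

static bool came_from_nmi_stack(uint64_t nmi_stack_top, uint64_t prev_sp)
{
	if (prev_sp > nmi_stack_top)				/* cmpq %rdx,4*8(%rsp); ja first_nmi */
		return false;
	if (prev_sp < nmi_stack_top - EXCEPTION_STKSZ)		/* subq; cmpq; jb first_nmi */
		return false;
	return true;						/* inside the NMI stack: nested NMI */
}

int main(void)
{
	uint64_t top = 0xffff880000008000ULL;

	printf("%d %d\n", came_from_nmi_stack(top, top - 0x100),	/* 1: nested */
			  came_from_nmi_stack(top, top + 0x100));	/* 0: first NMI */
	return 0;
}
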
@@ -1597,7 +1547,7 @@ first_nmi:
        .rept 5
        pushq_cfi 11*8(%rsp)
        .endr
-       CFI_DEF_CFA_OFFSET SS+8-RIP
+       CFI_DEF_CFA_OFFSET 5*8
 
        /* Everything up to here is safe from nested NMIs */
 
@@ -1625,7 +1575,7 @@ repeat_nmi:
        pushq_cfi -6*8(%rsp)
        .endr
        subq $(5*8), %rsp
-       CFI_DEF_CFA_OFFSET SS+8-RIP
+       CFI_DEF_CFA_OFFSET 5*8
 end_repeat_nmi:
 
        /*
@@ -1634,16 +1584,16 @@ end_repeat_nmi:
         * so that we repeat another NMI.
         */
        pushq_cfi $-1           /* ORIG_RAX: no syscall to restart */
-       subq $ORIG_RAX-R15, %rsp
-       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+       ALLOC_PT_GPREGS_ON_STACK
+
        /*
-        * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+        * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
         * as we should not be calling schedule in NMI context.
         * Even with normal interrupts enabled. An NMI should not be
         * setting NEED_RESCHED or anything that normal interrupts and
         * exceptions might do.
         */
-       call save_paranoid
+       call paranoid_entry
        DEFAULT_FRAME 0
 
        /*
@@ -1674,8 +1624,10 @@ end_repeat_nmi:
 nmi_swapgs:
        SWAPGS_UNSAFE_STACK
 nmi_restore:
+       RESTORE_EXTRA_REGS
+       RESTORE_C_REGS
        /* Pop the extra iret frame at once */
-       RESTORE_ALL 6*8
+       REMOVE_PT_GPREGS_FROM_STACK 6*8
 
        /* Clear the NMI executing stack variable */
        movq $0, 5*8(%rsp)
index c4f8d4659070db99ce190543186bc4a4ac5d2ac9..2b55ee6db053c79fbe91a6119e613075be54111b 100644 (file)
@@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
         */
        load_ucode_bsp();
 
-       if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
-               early_printk("Kernel alive\n");
-
        clear_page(init_level4_pgt);
        /* set init_level4_pgt kernel high mapping*/
        init_level4_pgt[511] = early_level4_pgt[511];
index f36bd42d6f0c8b5fc5cd75dcf133b35a1f6bfe3b..d031bad9e07eadf3a80bc69a449cd13a44ed8080 100644 (file)
@@ -22,6 +22,7 @@
 #include <asm/cpufeature.h>
 #include <asm/percpu.h>
 #include <asm/nops.h>
+#include <asm/bootparam.h>
 
 /* Physical address */
 #define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
        
        /* test KEEP_SEGMENTS flag to see if the bootloader is asking
                us to not reload segments */
-       testb $(1<<6), BP_loadflags(%esi)
+       testb $KEEP_SEGMENTS, BP_loadflags(%esi)
        jnz 2f
 
 /*
index 6fd514d9f69a267a7813ca2b04657991429932d5..ae6588b301c248b3c281a1e072802e6764e9ac44 100644 (file)
@@ -1,5 +1,5 @@
 /*
- *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *  linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
  *
  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
         * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either directly from a 64bit bootloader, or from
-        * arch/x86_64/boot/compressed/head.S.
+        * arch/x86/boot/compressed/head_64.S.
         *
         * We only come here initially at boot; nothing else comes here.
         *
@@ -146,7 +146,7 @@ startup_64:
        leaq    level2_kernel_pgt(%rip), %rdi
        leaq    4096(%rdi), %r8
        /* See if it is a valid page table entry */
-1:     testq   $1, 0(%rdi)
+1:     testb   $1, 0(%rdi)
        jz      2f
        addq    %rbp, 0(%rdi)
        /* Go to the next page */
index d5651fce0b71af6c15226483b6a06398ccbeb8a0..367f39d35e9cb98300fa368d3689d2ce143e71a6 100644 (file)
@@ -42,8 +42,8 @@ void kernel_fpu_enable(void)
  * be set (so that the clts/stts pair does nothing that is
  * visible in the interrupted kernel thread).
  *
- * Except for the eagerfpu case when we return 1 unless we've already
- * been eager and saved the state in kernel_fpu_begin().
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
  */
 static inline bool interrupted_kernel_fpu_idle(void)
 {
@@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
                return false;
 
        if (use_eager_fpu())
-               return __thread_has_fpu(current);
+               return true;
 
        return !__thread_has_fpu(current) &&
                (read_cr0() & X86_CR0_TS);
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
 static inline bool interrupted_user_mode(void)
 {
        struct pt_regs *regs = get_irq_regs();
-       return regs && user_mode_vm(regs);
+       return regs && user_mode(regs);
 }
 
 /*
@@ -94,9 +94,10 @@ void __kernel_fpu_begin(void)
 
        if (__thread_has_fpu(me)) {
                __save_init_fpu(me);
-       } else if (!use_eager_fpu()) {
+       } else {
                this_cpu_write(fpu_owner_task, NULL);
-               clts();
+               if (!use_eager_fpu())
+                       clts();
        }
 }
 EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -107,7 +108,7 @@ void __kernel_fpu_end(void)
 
        if (__thread_has_fpu(me)) {
                if (WARN_ON(restore_fpu_checking(me)))
-                       drop_init_fpu(me);
+                       fpu_reset_state(me);
        } else if (!use_eager_fpu()) {
                stts();
        }
@@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk)
 {
        preempt_disable();
        if (__thread_has_fpu(tsk)) {
-               __save_init_fpu(tsk);
-               __thread_fpu_end(tsk);
-       } else
-               tsk->thread.fpu_counter = 0;
+               if (use_eager_fpu()) {
+                       __save_fpu(tsk);
+               } else {
+                       __save_init_fpu(tsk);
+                       __thread_fpu_end(tsk);
+               }
+       }
        preempt_enable();
 }
 EXPORT_SYMBOL(unlazy_fpu);
@@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu)
                return;
        }
 
+       memset(fpu->state, 0, xstate_size);
+
        if (cpu_has_fxsr) {
                fx_finit(&fpu->state->fxsave);
        } else {
                struct i387_fsave_struct *fp = &fpu->state->fsave;
-               memset(fp, 0, xstate_size);
                fp->cwd = 0xffff037fu;
                fp->swd = 0xffff0000u;
                fp->twd = 0xffffffffu;
@@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk)
        if (tsk_used_math(tsk)) {
                if (cpu_has_fpu && tsk == current)
                        unlazy_fpu(tsk);
-               tsk->thread.fpu.last_cpu = ~0;
+               task_disable_lazy_fpu_restore(tsk);
                return 0;
        }
 
@@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
                unsigned int pos, unsigned int count,
                void *kbuf, void __user *ubuf)
 {
+       struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
        int ret;
 
        if (!cpu_has_xsave)
@@ -350,14 +356,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
         * memory layout in the thread struct, so that we can copy the entire
         * xstateregs to the user using one user_regset_copyout().
         */
-       memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
-              xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
+       memcpy(&xsave->i387.sw_reserved,
+               xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
        /*
         * Copy the xstate memory layout.
         */
-       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                 &target->thread.fpu.state->xsave, 0, -1);
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
        return ret;
 }
 
@@ -365,8 +369,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
                  unsigned int pos, unsigned int count,
                  const void *kbuf, const void __user *ubuf)
 {
+       struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
        int ret;
-       struct xsave_hdr_struct *xsave_hdr;
 
        if (!cpu_has_xsave)
                return -ENODEV;
@@ -375,22 +379,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
        if (ret)
                return ret;
 
-       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                &target->thread.fpu.state->xsave, 0, -1);
-
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
        /*
         * mxcsr reserved bits must be masked to zero for security reasons.
         */
-       target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
-       xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
-
-       xsave_hdr->xstate_bv &= pcntxt_mask;
+       xsave->i387.mxcsr &= mxcsr_feature_mask;
+       xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
        /*
         * These bits must be zero.
         */
-       memset(xsave_hdr->reserved, 0, 48);
-
+       memset(&xsave->xsave_hdr.reserved, 0, 48);
        return ret;
 }
 
index 4ddaf66ea35f696eac6afce6bb43f01d2d84f1aa..37dae792dbbed00b480aa67a33cb0b2ae1699428 100644 (file)
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
         * because the ->io_bitmap_max value must match the bitmap
         * contents:
         */
-       tss = &per_cpu(init_tss, get_cpu());
+       tss = &per_cpu(cpu_tss, get_cpu());
 
        if (turn_on)
                bitmap_clear(t->io_bitmap_ptr, from, num);
index 67b1cbe0093adba1141f8d9ebda29ad34dc9d23e..e5952c22553241e2ceea5d5fd6f1f7b758cc960e 100644 (file)
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
 
        this_cpu = smp_processor_id();
        cpumask_copy(&online_new, cpu_online_mask);
-       cpu_clear(this_cpu, online_new);
+       cpumask_clear_cpu(this_cpu, &online_new);
 
        this_count = 0;
        for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
 
                        data = irq_desc_get_irq_data(desc);
                        cpumask_copy(&affinity_new, data->affinity);
-                       cpu_clear(this_cpu, affinity_new);
+                       cpumask_clear_cpu(this_cpu, &affinity_new);
 
                        /* Do not count inactive or per-cpu irqs. */
                        if (!irq_has_action(irq) || irqd_is_per_cpu(data))
index 28d28f5eb8f49c2a9b8f84997e1dd0dc8841ece0..f9fd86a7fcc7d1bc8c2cc920fc5b5037f5859336 100644 (file)
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
        if (unlikely(!desc))
                return false;
 
-       if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
+       if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
                if (unlikely(overflow))
                        print_stack_overflow();
                desc->handle_irq(irq, desc);
index e4b503d5558c5d435eddc3c3c5be8c55f1de558d..394e643d7830fc01d4da516bd79cab1dc0aa6962 100644 (file)
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
        u64 estack_top, estack_bottom;
        u64 curbase = (u64)task_stack_page(current);
 
-       if (user_mode_vm(regs))
+       if (user_mode(regs))
                return;
 
        if (regs->sp >= curbase + sizeof(struct thread_info) +
index 70e181ea1eac1f2da444482e6714e61b52d5a19e..cd10a64372647c3579ba6717db49c6cd63c6353a 100644 (file)
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
 #endif
        for_each_clear_bit_from(i, used_vectors, first_system_vector) {
                /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-               set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+               set_intr_gate(i, irq_entries_start +
+                               8 * (i - FIRST_EXTERNAL_VECTOR));
        }
 #ifdef CONFIG_X86_LOCAL_APIC
        for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
index 7ec1d5f8d28339bce0b74191a1d458c6c8e5d5df..d6178d9791db7966e8bc188e3df16c03233c3da3 100644 (file)
@@ -72,7 +72,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
        { "bx", 8, offsetof(struct pt_regs, bx) },
        { "cx", 8, offsetof(struct pt_regs, cx) },
        { "dx", 8, offsetof(struct pt_regs, dx) },
-       { "si", 8, offsetof(struct pt_regs, dx) },
+       { "si", 8, offsetof(struct pt_regs, si) },
        { "di", 8, offsetof(struct pt_regs, di) },
        { "bp", 8, offsetof(struct pt_regs, bp) },
        { "sp", 8, offsetof(struct pt_regs, sp) },
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
 #ifdef CONFIG_X86_32
        switch (regno) {
        case GDB_SS:
-               if (!user_mode_vm(regs))
+               if (!user_mode(regs))
                        *(unsigned long *)mem = __KERNEL_DS;
                break;
        case GDB_SP:
-               if (!user_mode_vm(regs))
+               if (!user_mode(regs))
                        *(unsigned long *)mem = kernel_stack_pointer(regs);
                break;
        case GDB_GS:
index 4e3d5a9621fe0052fac43d5ad6c109b9d3f54447..24d079604fd53afcab905a518c139e3ef2f13d71 100644 (file)
@@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
        struct kprobe *p;
        struct kprobe_ctlblk *kcb;
 
-       if (user_mode_vm(regs))
+       if (user_mode(regs))
                return 0;
 
        addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
        struct die_args *args = data;
        int ret = NOTIFY_DONE;
 
-       if (args->regs && user_mode_vm(args->regs))
+       if (args->regs && user_mode(args->regs))
                return ret;
 
        if (val == DIE_GPF) {
index d1ac80b72c72184a0b999c2b299b5e265d26de7a..005c03e93fc54c7907e8e9e2bc1d771902b1c3ca 100644 (file)
@@ -33,6 +33,7 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/setup.h>
 
 #if 0
 #define DEBUGP(fmt, ...)                               \
@@ -47,21 +48,13 @@ do {                                                        \
 
 #ifdef CONFIG_RANDOMIZE_BASE
 static unsigned long module_load_offset;
-static int randomize_modules = 1;
 
 /* Mutex protects the module_load_offset. */
 static DEFINE_MUTEX(module_kaslr_mutex);
 
-static int __init parse_nokaslr(char *p)
-{
-       randomize_modules = 0;
-       return 0;
-}
-early_param("nokaslr", parse_nokaslr);
-
 static unsigned long int get_module_load_offset(void)
 {
-       if (randomize_modules) {
+       if (kaslr_enabled()) {
                mutex_lock(&module_kaslr_mutex);
                /*
                 * Calculate the module_load_offset the first time this
index 781861cc5ee8d7b9bbd27e9b13c380da59bb06c0..da8cb987b97312f6b37b5a967ce70fc5e75e0205 100644 (file)
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
        }
 
        /*
-        * RIP, flags, and the argument registers are usually saved.
-        * orig_ax is probably okay, too.
+        * These registers are always saved on 64-bit syscall entry.
+        * On 32-bit entry points, they are saved too except r8..r11.
         */
        regs_user_copy->ip = user_regs->ip;
+       regs_user_copy->ax = user_regs->ax;
        regs_user_copy->cx = user_regs->cx;
        regs_user_copy->dx = user_regs->dx;
        regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
        regs_user_copy->r11 = user_regs->r11;
        regs_user_copy->orig_ax = user_regs->orig_ax;
        regs_user_copy->flags = user_regs->flags;
+       regs_user_copy->sp = user_regs->sp;
+       regs_user_copy->cs = user_regs->cs;
+       regs_user_copy->ss = user_regs->ss;
 
        /*
-        * Don't even try to report the "rest" regs.
+        * Most system calls don't save these registers, don't report them.
         */
        regs_user_copy->bx = -1;
        regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 
        /*
         * For this to be at all useful, we need a reasonable guess for
-        * sp and the ABI.  Be careful: we're in NMI context, and we're
+        * the ABI.  Be careful: we're in NMI context, and we're
         * considering current to be the current task, so we should
         * be careful not to look at any other percpu variables that might
         * change during context switches.
         */
-       if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
-           task_thread_info(current)->status & TS_COMPAT) {
-               /* Easy case: we're in a compat syscall. */
-               regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
-               regs_user_copy->sp = user_regs->sp;
-               regs_user_copy->cs = user_regs->cs;
-               regs_user_copy->ss = user_regs->ss;
-       } else if (user_regs->orig_ax != -1) {
-               /*
-                * We're probably in a 64-bit syscall.
-                * Warning: this code is severely racy.  At least it's better
-                * than just blindly copying user_regs.
-                */
-               regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
-               regs_user_copy->sp = this_cpu_read(old_rsp);
-               regs_user_copy->cs = __USER_CS;
-               regs_user_copy->ss = __USER_DS;
-               regs_user_copy->cx = -1;  /* usually contains garbage */
-       } else {
-               /* We're probably in an interrupt or exception. */
-               regs_user->abi = user_64bit_mode(user_regs) ?
-                       PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
-               regs_user_copy->sp = user_regs->sp;
-               regs_user_copy->cs = user_regs->cs;
-               regs_user_copy->ss = user_regs->ss;
-       }
+       regs_user->abi = user_64bit_mode(user_regs) ?
+               PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
 
        regs_user->regs = regs_user_copy;
 }
index 046e2d620bbe7be507808e0f7188c45249f2d69c..8213da62b1b79c1c37798b598494add6802881d7 100644 (file)
@@ -9,7 +9,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/random.h>
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
@@ -24,6 +24,7 @@
 #include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
+#include <asm/mwait.h>
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+       .x86_tss = {
+               .sp0 = TOP_OF_INIT_STACK,
+#ifdef CONFIG_X86_32
+               .ss0 = __KERNEL_DS,
+               .ss1 = __KERNEL_CS,
+               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+#endif
+        },
+#ifdef CONFIG_X86_32
+        /*
+         * Note that the .io_bitmap member must be extra-big. This is because
+         * the CPU will access an additional byte beyond the end of the IO
+         * permission bitmap. The extra byte must be all 1 bits, and must
+         * be within the limit.
+         */
+       .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
 
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 
        dst->thread.fpu_counter = 0;
        dst->thread.fpu.has_fpu = 0;
-       dst->thread.fpu.last_cpu = ~0;
        dst->thread.fpu.state = NULL;
+       task_disable_lazy_fpu_restore(dst);
        if (tsk_used_math(src)) {
                int err = fpu_alloc(&dst->thread.fpu);
                if (err)
@@ -109,7 +129,7 @@ void exit_thread(void)
        unsigned long *bp = t->io_bitmap_ptr;
 
        if (bp) {
-               struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+               struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
 
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
@@ -131,13 +151,18 @@ void flush_thread(void)
 
        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
-       drop_init_fpu(tsk);
-       /*
-        * Free the FPU state for non xsave platforms. They get reallocated
-        * lazily at the first use.
-        */
-       if (!use_eager_fpu())
+
+       if (!use_eager_fpu()) {
+               /* FPU state will be reallocated lazily at the first use. */
+               drop_fpu(tsk);
                free_thread_xstate(tsk);
+       } else if (!used_math()) {
+               /* kthread execs. TODO: cleanup this horror. */
+               if (WARN_ON(init_fpu(tsk)))
+                       force_sig(SIGKILL, tsk);
+               user_fpu_begin();
+               restore_init_xstate();
+       }
 }
 
 static void hard_disable_TSC(void)
@@ -377,14 +402,11 @@ static void amd_e400_idle(void)
 
                if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
                        cpumask_set_cpu(cpu, amd_e400_c1e_mask);
-                       /*
-                        * Force broadcast so ACPI can not interfere.
-                        */
-                       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
-                                          &cpu);
+                       /* Force broadcast so ACPI can not interfere. */
+                       tick_broadcast_force();
                        pr_info("Switch to broadcast mode on CPU%d\n", cpu);
                }
-               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+               tick_broadcast_enter();
 
                default_idle();
 
@@ -393,12 +415,59 @@ static void amd_e400_idle(void)
                 * called with interrupts disabled.
                 */
                local_irq_disable();
-               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+               tick_broadcast_exit();
                local_irq_enable();
        } else
                default_idle();
 }
 
+/*
+ * Intel Core2 and older machines prefer MWAIT over HALT for C1.
+ * We can't rely on cpuidle installing MWAIT, because it will not load
+ * on systems that support only C1 -- so the boot default must be MWAIT.
+ *
+ * Some AMD machines are the opposite, they depend on using HALT.
+ *
+ * So for default C1, which is used during boot until cpuidle loads,
+ * use MWAIT-C1 on Intel HW that has it, else use HALT.
+ */
+static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+{
+       if (c->x86_vendor != X86_VENDOR_INTEL)
+               return 0;
+
+       if (!cpu_has(c, X86_FEATURE_MWAIT))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * MONITOR/MWAIT with no hints, used for the default C1 state.
+ * This invokes MWAIT with interrupts enabled and no flags,
+ * which is backwards compatible with the original MWAIT implementation.
+ */
+
+static void mwait_idle(void)
+{
+       if (!current_set_polling_and_test()) {
+               if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
+                       smp_mb(); /* quirk */
+                       clflush((void *)&current_thread_info()->flags);
+                       smp_mb(); /* quirk */
+               }
+
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               if (!need_resched())
+                       __sti_mwait(0, 0);
+               else
+                       local_irq_enable();
+       } else {
+               local_irq_enable();
+       }
+       __current_clr_polling();
+}
+
 void select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
@@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
                /* E400: APIC timer interrupt does not wake up CPU from C1e */
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
+       } else if (prefer_mwait_c1_over_halt(c)) {
+               pr_info("using mwait in idle threads\n");
+               x86_idle = mwait_idle;
        } else
                x86_idle = default_idle;
 }
index 603c4f99cb5a17f83e65c3066a5642a4bb9d0f42..8ed2106b06da63e0a8e0dcf561aad7a1fc112e40 100644 (file)
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
        unsigned long sp;
        unsigned short ss, gs;
 
-       if (user_mode_vm(regs)) {
+       if (user_mode(regs)) {
                sp = regs->sp;
                ss = regs->ss & 0xffff;
                gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        regs->flags             = X86_EFLAGS_IF;
-       /*
-        * force it to the iret return path by making it look as if there was
-        * some work pending.
-        */
-       set_thread_flag(TIF_NOTIFY_RESUME);
+       force_iret();
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
@@ -248,18 +244,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
        fpu_switch_t fpu;
 
        /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
        fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-       /*
-        * Reload esp0.
-        */
-       load_sp0(tss, next);
-
        /*
         * Save away %gs. No need to save %fs, as it was saved on the
         * stack on entry.  No need to save %es and %ds, as those are
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         */
        arch_end_context_switch(next_p);
 
+       /*
+        * Reload esp0, kernel_stack, and current_top_of_stack.  This changes
+        * current_thread_info().
+        */
+       load_sp0(tss, next);
        this_cpu_write(kernel_stack,
-                 (unsigned long)task_stack_page(next_p) +
-                 THREAD_SIZE - KERNEL_STACK_OFFSET);
+                      (unsigned long)task_stack_page(next_p) +
+                      THREAD_SIZE);
+       this_cpu_write(cpu_current_top_of_stack,
+                      (unsigned long)task_stack_page(next_p) +
+                      THREAD_SIZE);
 
        /*
         * Restore %gs if needed (which is common)
index 67fcc43577d279faa02941dd56dbf36f270a9832..4baaa972f52aaed15b3dddd94b4df4f653126d81 100644 (file)
@@ -52,7 +52,7 @@
 
 asmlinkage extern void ret_from_fork(void);
 
-__visible DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
        p->thread.sp = (unsigned long) childregs;
-       p->thread.usersp = me->thread.usersp;
        set_tsk_thread_flag(p, TIF_FORK);
        p->thread.io_bitmap_ptr = NULL;
 
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
         */
        if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_IA32_EMULATION
-               if (test_thread_flag(TIF_IA32))
+               if (is_ia32_task())
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
-       current->thread.usersp  = new_sp;
        regs->ip                = new_ip;
        regs->sp                = new_sp;
-       this_cpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
+       force_iret();
 }
 
 void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;
 
        fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-       /* Reload esp0 and ss1. */
-       load_sp0(tss, next);
-
        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /*
         * Switch the PDA and FPU contexts.
         */
-       prev->usersp = this_cpu_read(old_rsp);
-       this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);
 
        /*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
 
+       /* Reload esp0 and ss1.  This changes current_thread_info(). */
+       load_sp0(tss, next);
+
        this_cpu_write(kernel_stack,
-                 (unsigned long)task_stack_page(next_p) +
-                 THREAD_SIZE - KERNEL_STACK_OFFSET);
+               (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
 
        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
 
 unsigned long KSTK_ESP(struct task_struct *task)
 {
-       return (test_tsk_thread_flag(task, TIF_IA32)) ?
-                       (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+       return task_pt_regs(task)->sp;
 }
index e510618b2e91a7969bb8cf6c74a35f59e4bf1bea..a7bc794807195af79b6c15054b1941867d373198 100644 (file)
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
        case offsetof(struct user_regs_struct,cs):
                if (unlikely(value == 0))
                        return -EIO;
-#ifdef CONFIG_IA32_EMULATION
-               if (test_tsk_thread_flag(task, TIF_IA32))
-                       task_pt_regs(task)->cs = value;
-#endif
+               task_pt_regs(task)->cs = value;
                break;
        case offsetof(struct user_regs_struct,ss):
                if (unlikely(value == 0))
                        return -EIO;
-#ifdef CONFIG_IA32_EMULATION
-               if (test_tsk_thread_flag(task, TIF_IA32))
-                       task_pt_regs(task)->ss = value;
-#endif
+               task_pt_regs(task)->ss = value;
                break;
        }
 
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
        memset(info, 0, sizeof(*info));
        info->si_signo = SIGTRAP;
        info->si_code = si_code;
-       info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+       info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
 }
 
 void user_single_step_siginfo(struct task_struct *tsk,
index 2f355d229a587771680b28080d92fd06f345d7e7..e5ecd20e72dd56d82447c94c17e6e85ae29eba90 100644 (file)
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+       if (!pvclock_vdso_info) {
+               BUG();
+               return NULL;
+       }
+
+       return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+       return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
 #ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+                               void *v)
+{
+       struct task_migration_notifier *mn = v;
+       struct pvclock_vsyscall_time_info *pvti;
+
+       pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+       /* this is NULL when pvclock vsyscall is not initialized */
+       if (unlikely(pvti == NULL))
+               return NOTIFY_DONE;
+
+       pvti->migrate_count++;
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+       .notifier_call = pvclock_task_migrate,
+};
+
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
        WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
+       pvclock_vdso_info = i;
+
        for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
                __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
                             __pa(i) + (idx*PAGE_SIZE),
                             PAGE_KERNEL_VVAR);
        }
 
+
+       register_task_migration_notifier(&pvclock_migrate);
+
        return 0;
 }
 #endif
index bae6c609888e7fdff25784d5bd96fd8dcd5ea88a..86db4bcd7ce52bcb74a5bf42efcd8e7152488cf1 100644 (file)
@@ -183,6 +183,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                },
        },
 
+       /* ASRock */
+       {       /* Handle problems with rebooting on ASRock Q1900DC-ITX */
+               .callback = set_pci_reboot,
+               .ident = "ASRock Q1900DC-ITX",
+               .matches = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+                       DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+               },
+       },
+
        /* ASUS */
        {       /* Handle problems with rebooting on ASUS P4S800 */
                .callback = set_bios_reboot,
index e13f8e7c22a68c3d9590b8deb69e116d2f625f68..77630d57e7bf30ac7f281e15c440cd7b7bae37a9 100644 (file)
@@ -226,23 +226,23 @@ swap_pages:
        movl    (%ebx), %ecx
        addl    $4, %ebx
 1:
-       testl   $0x1,   %ecx  /* is it a destination page */
+       testb   $0x1, %cl     /* is it a destination page */
        jz      2f
        movl    %ecx,   %edi
        andl    $0xfffff000, %edi
        jmp     0b
 2:
-       testl   $0x2,   %ecx  /* is it an indirection page */
+       testb   $0x2, %cl    /* is it an indirection page */
        jz      2f
        movl    %ecx,   %ebx
        andl    $0xfffff000, %ebx
        jmp     0b
 2:
-       testl   $0x4,   %ecx /* is it the done indicator */
+       testb   $0x4, %cl    /* is it the done indicator */
        jz      2f
        jmp     3f
 2:
-       testl   $0x8,   %ecx /* is it the source indicator */
+       testb   $0x8, %cl    /* is it the source indicator */
        jz      0b           /* Ignore it otherwise */
        movl    %ecx,   %esi /* For every source page do a copy */
        andl    $0xfffff000, %esi
index 3fd2c693e4752d01e071de68ef57e2db8b47b605..98111b38ebfd6eb9949242c5aae7b18bbbdb4489 100644 (file)
@@ -123,7 +123,7 @@ identity_mapped:
         * Set cr4 to a known state:
         *  - physical address extension enabled
         */
-       movq    $X86_CR4_PAE, %rax
+       movl    $X86_CR4_PAE, %eax
        movq    %rax, %cr4
 
        jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
        movq    (%rbx), %rcx
        addq    $8,     %rbx
 1:
-       testq   $0x1,   %rcx  /* is it a destination page? */
+       testb   $0x1,   %cl   /* is it a destination page? */
        jz      2f
        movq    %rcx,   %rdi
        andq    $0xfffffffffffff000, %rdi
        jmp     0b
 2:
-       testq   $0x2,   %rcx  /* is it an indirection page? */
+       testb   $0x2,   %cl   /* is it an indirection page? */
        jz      2f
        movq    %rcx,   %rbx
        andq    $0xfffffffffffff000, %rbx
        jmp     0b
 2:
-       testq   $0x4,   %rcx  /* is it the done indicator? */
+       testb   $0x4,   %cl   /* is it the done indicator? */
        jz      2f
        jmp     3f
 2:
-       testq   $0x8,   %rcx  /* is it the source indicator? */
+       testb   $0x8,   %cl   /* is it the source indicator? */
        jz      0b            /* Ignore it otherwise */
        movq    %rcx,   %rsi  /* For every source page do a copy */
        andq    $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
        movq    %rsi, %rax
 
        movq    %r10, %rdi
-       movq    $512,   %rcx
+       movl    $512, %ecx
        rep ; movsq
 
        movq    %rax, %rdi
        movq    %rdx, %rsi
-       movq    $512,   %rcx
+       movl    $512, %ecx
        rep ; movsq
 
        movq    %rdx, %rdi
        movq    %r10, %rsi
-       movq    $512,   %rcx
+       movl    $512, %ecx
        rep ; movsq
 
        lea     PAGE_SIZE(%rax), %rsi
index 0a2421cca01fad095bbb7caa8e7c779d910d751b..d74ac33290ae3eeef46b923c4556d644b72f0d5a 100644 (file)
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void)
                mapaddr = ramdisk_image & PAGE_MASK;
                p = early_memremap(mapaddr, clen+slop);
                memcpy(q, p+slop, clen);
-               early_iounmap(p, clen+slop);
+               early_memunmap(p, clen+slop);
                q += clen;
                ramdisk_image += clen;
                ramdisk_size  -= clen;
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void)
                data_len = data->len + sizeof(struct setup_data);
                data_type = data->type;
                pa_next = data->next;
-               early_iounmap(data, sizeof(*data));
+               early_memunmap(data, sizeof(*data));
 
                switch (data_type) {
                case SETUP_E820_EXT:
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void)
                         E820_RAM, E820_RESERVED_KERN);
                found = 1;
                pa_data = data->next;
-               early_iounmap(data, sizeof(*data));
+               early_memunmap(data, sizeof(*data));
        }
        if (!found)
                return;
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
                data = early_memremap(pa_data, sizeof(*data));
                memblock_reserve(pa_data, sizeof(*data) + data->len);
                pa_data = data->next;
-               early_iounmap(data, sizeof(*data));
+               early_memunmap(data, sizeof(*data));
        }
 }
 
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
 static int
 dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
 {
-       pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
-                "(relocation range: 0x%lx-0x%lx)\n",
-                (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
-                __START_KERNEL_map, MODULES_VADDR-1);
+       if (kaslr_enabled()) {
+               pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+                        (unsigned long)&_text - __START_KERNEL,
+                        __START_KERNEL,
+                        __START_KERNEL_map,
+                        MODULES_VADDR-1);
+       } else {
+               pr_emerg("Kernel Offset: disabled\n");
+       }
 
        return 0;
 }
index e5042463c1bca59107e107117984c37753d732c3..3e581865c8e2a048bbc307a9d3fc4e04ad14b4d7 100644 (file)
@@ -61,8 +61,7 @@
        regs->seg = GET_SEG(seg) | 3;                   \
 } while (0)
 
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
-                      unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 {
        void __user *buf;
        unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 #endif /* CONFIG_X86_32 */
 
                COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
-               COPY(dx); COPY(cx); COPY(ip);
+               COPY(dx); COPY(cx); COPY(ip); COPY(ax);
 
 #ifdef CONFIG_X86_64
                COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
                COPY(r15);
 #endif /* CONFIG_X86_64 */
 
-#ifdef CONFIG_X86_32
                COPY_SEG_CPL3(cs);
                COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
-               /* Kernel saves and restores only the CS segment register on signals,
-                * which is the bare minimum needed to allow mixed 32/64-bit code.
-                * App's signal handler can save/restore other segments if needed. */
-               COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
 
                get_user_ex(tmpflags, &sc->flags);
                regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
                regs->orig_ax = -1;             /* disable syscall checks */
 
                get_user_ex(buf, &sc->fpstate);
-
-               get_user_ex(*pax, &sc->ax);
        } get_user_catch(err);
 
        err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
 
+       force_iret();
+
        return err;
 }
 
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 #else /* !CONFIG_X86_32 */
                put_user_ex(regs->flags, &sc->flags);
                put_user_ex(regs->cs, &sc->cs);
-               put_user_ex(0, &sc->gs);
-               put_user_ex(0, &sc->fs);
+               put_user_ex(0, &sc->__pad2);
+               put_user_ex(0, &sc->__pad1);
+               put_user_ex(regs->ss, &sc->ss);
 #endif /* CONFIG_X86_32 */
 
                put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
        regs->sp = (unsigned long)frame;
 
-       /* Set up the CS register to run signal handlers in 64-bit mode,
-          even if the handler happens to be interrupting 32-bit code. */
+       /*
+        * Set up the CS and SS registers to run signal handlers in
+        * 64-bit mode, even if the handler happens to be interrupting
+        * 32-bit or 16-bit code.
+        *
+        * SS is subtle.  In 64-bit mode, we don't need any particular
+        * SS descriptor, but we do need SS to be valid.  It's possible
+        * that the old SS is entirely bogus -- this can happen if the
+        * signal we're trying to deliver is #GP or #SS caused by a bad
+        * SS value.
+        */
        regs->cs = __USER_CS;
+       regs->ss = __USER_DS;
 
        return 0;
 }
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
 {
        struct pt_regs *regs = current_pt_regs();
        struct sigframe __user *frame;
-       unsigned long ax;
        sigset_t set;
 
        frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
 
        set_current_blocked(&set);
 
-       if (restore_sigcontext(regs, &frame->sc, &ax))
+       if (restore_sigcontext(regs, &frame->sc))
                goto badframe;
-       return ax;
+       return regs->ax;
 
 badframe:
        signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
 {
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe __user *frame;
-       unsigned long ax;
        sigset_t set;
 
        frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void)
 
        set_current_blocked(&set);
 
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
                goto badframe;
 
        if (restore_altstack(&frame->uc.uc_stack))
                goto badframe;
 
-       return ax;
+       return regs->ax;
 
 badframe:
        signal_fault(regs, frame, "rt_sigreturn");
@@ -679,7 +680,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
                 * Ensure the signal handler starts with the new fpu state.
                 */
                if (used_math())
-                       drop_init_fpu(current);
+                       fpu_reset_state(current);
        }
        signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
 }
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe_x32 __user *frame;
        sigset_t set;
-       unsigned long ax;
 
        frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
 
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
 
        set_current_blocked(&set);
 
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
                goto badframe;
 
        if (compat_restore_altstack(&frame->uc.uc_stack))
                goto badframe;
 
-       return ax;
+       return regs->ax;
 
 badframe:
        signal_fault(regs, frame, "x32 rt_sigreturn");
index febc6aabc72e049443f68c167622d50cd8344f16..7035f6b21c3f99126f5362a8ba5ced4bbe565111 100644 (file)
@@ -779,6 +779,26 @@ out:
        return boot_error;
 }
 
+void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+       /* Just in case we booted with a single CPU. */
+       alternatives_enable_smp();
+
+       per_cpu(current_task, cpu) = idle;
+
+#ifdef CONFIG_X86_32
+       /* Stack for startup_32 can be just as for start_secondary onwards */
+       irq_ctx_init(cpu);
+       per_cpu(cpu_current_top_of_stack, cpu) =
+               (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+#else
+       clear_tsk_thread_flag(idle, TIF_FORK);
+       initial_gs = per_cpu_offset(cpu);
+#endif
+       per_cpu(kernel_stack, cpu) =
+               (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+}
+
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
        int cpu0_nmi_registered = 0;
        unsigned long timeout;
 
-       /* Just in case we booted with a single CPU. */
-       alternatives_enable_smp();
-
        idle->thread.sp = (unsigned long) (((struct pt_regs *)
                          (THREAD_SIZE +  task_stack_page(idle))) - 1);
-       per_cpu(current_task, cpu) = idle;
 
-#ifdef CONFIG_X86_32
-       /* Stack for startup_32 can be just as for start_secondary onwards */
-       irq_ctx_init(cpu);
-#else
-       clear_tsk_thread_flag(idle, TIF_FORK);
-       initial_gs = per_cpu_offset(cpu);
-#endif
-       per_cpu(kernel_stack, cpu) =
-               (unsigned long)task_stack_page(idle) -
-               KERNEL_STACK_OFFSET + THREAD_SIZE;
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
        stack_start  = idle->thread.sp;
@@ -953,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
        /* the FPU context is blank, nobody can own it */
        __cpu_disable_lazy_restore(cpu);
 
+       common_cpu_up(cpu, tidle);
+
        err = do_boot_cpu(apicid, cpu, tidle);
        if (err) {
                pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
@@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
                return SMP_NO_APIC;
        }
 
-       verify_local_APIC();
-
        /*
         * If SMP should be disabled, then really disable it!
         */
index 30277e27431acde9a9320e0b1be4470bddb40e3a..10e0272d789a189b7215100a1d66a676d9b4bbfa 100644 (file)
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
        return va_align.mask;
 }
 
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+       return va_align.bits & get_align_mask();
+}
+
 unsigned long align_vdso_addr(unsigned long addr)
 {
        unsigned long align_mask = get_align_mask();
-       return (addr + align_mask) & ~align_mask;
+       addr = (addr + align_mask) & ~align_mask;
+       return addr | get_align_bits();
 }
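
A toy, user-space illustration of the per-boot alignment trick described in the comment above get_align_bits(): round the address up to the alignment window, then OR in the boot-time random bits that fall inside that window. The mask and the "random" value below are made-up constants, not the kernel's va_align fields:

#include <stdint.h>
#include <stdio.h>

#define ALIGN_MASK 0x7fffUL			/* pretend a 32K alignment window */

static uint64_t per_boot_bits = 0x3000;		/* pretend boot-time random value */

static uint64_t align_with_bits(uint64_t addr)
{
	addr = (addr + ALIGN_MASK) & ~ALIGN_MASK;	/* round up to the window */
	return addr | (per_boot_bits & ALIGN_MASK);	/* same offset for the whole boot */
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)align_with_bits(0x7f0000001234ULL));
	return 0;
}
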
 
 static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
        info.length = len;
        info.low_limit = begin;
        info.high_limit = end;
-       info.align_mask = filp ? get_align_mask() : 0;
+       info.align_mask = 0;
        info.align_offset = pgoff << PAGE_SHIFT;
+       if (filp) {
+               info.align_mask = get_align_mask();
+               info.align_offset += get_align_bits();
+       }
        return vm_unmapped_area(&info);
 }
 
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = mm->mmap_base;
-       info.align_mask = filp ? get_align_mask() : 0;
+       info.align_mask = 0;
        info.align_offset = pgoff << PAGE_SHIFT;
+       if (filp) {
+               info.align_mask = get_align_mask();
+               info.align_offset += get_align_bits();
+       }
        addr = vm_unmapped_area(&info);
        if (!(addr & ~PAGE_MASK))
                return addr;
index e9bcd57d8a9eb862212351527de3597040e611ba..3777189c4a19f04d1b7118ce2fae228e86356592 100644 (file)
@@ -5,21 +5,29 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#ifdef CONFIG_IA32_EMULATION
+#define SYM(sym, compat) compat
+#else
+#define SYM(sym, compat) sym
+#define ia32_sys_call_table sys_call_table
+#define __NR_ia32_syscall_max __NR_syscall_max
+#endif
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
-#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
 
 typedef asmlinkage void (*sys_call_ptr_t)(void);
 
 extern asmlinkage void sys_ni_syscall(void);
 
-__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
        /*
         * Smells like a compiler bug -- it doesn't work
         * when the & below is removed.
         */
-       [0 ... __NR_syscall_max] = &sys_ni_syscall,
+       [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
 #include <asm/syscalls_32.h>
 };
index 25adc0e16eaa6349e78bd909860f6710705acb8e..d39c09119db6d2bf9e7ed6f1921f26f45dee3019 100644 (file)
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
 {
        unsigned long pc = instruction_pointer(regs);
 
-       if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+       if (!user_mode(regs) && in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
                return *(unsigned long *)(regs->bp + sizeof(long));
 #else
index 4ff5d162ff9fd55381259ff8dd96f84064ecea72..f4fa991406cd78463299631466298e46cf2ffcea 100644 (file)
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 {
        enum ctx_state prev_state;
 
-       if (user_mode_vm(regs)) {
+       if (user_mode(regs)) {
                /* Other than that, we're just an exception. */
                prev_state = exception_enter();
        } else {
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
        /* Must be before exception_exit. */
        preempt_count_sub(HARDIRQ_OFFSET);
 
-       if (user_mode_vm(regs))
+       if (user_mode(regs))
                return exception_exit(prev_state);
        else
                rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  *
  * IST exception handlers normally cannot schedule.  As a special
  * exception, if the exception interrupted userspace code (i.e.
- * user_mode_vm(regs) would return true) and the exception was not
+ * user_mode(regs) would return true) and the exception was not
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
 {
-       BUG_ON(!user_mode_vm(regs));
+       BUG_ON(!user_mode(regs));
 
        /*
         * Sanity check: we need to be on the normal thread stack.  This
         * will catch asm bugs and any attempt to use ist_preempt_enable
         * from double_fault.
         */
-       BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
-               & ~(THREAD_SIZE - 1)) != 0);
+       BUG_ON((unsigned long)(current_top_of_stack() -
+                              current_stack_pointer()) >= THREAD_SIZE);
 
        preempt_count_sub(HARDIRQ_OFFSET);
 }
@@ -194,8 +194,7 @@ static nokprobe_inline int
 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
                  struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_X86_32
-       if (regs->flags & X86_VM_MASK) {
+       if (v8086_mode(regs)) {
                /*
                 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
                 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
                }
                return -1;
        }
-#endif
+
        if (!user_mode(regs)) {
                if (!fixup_exception(regs)) {
                        tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
                goto exit;
        conditional_sti(regs);
 
-       if (!user_mode_vm(regs))
+       if (!user_mode(regs))
                die("bounds", regs, error_code);
 
        if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
        prev_state = exception_enter();
        conditional_sti(regs);
 
-#ifdef CONFIG_X86_32
-       if (regs->flags & X86_VM_MASK) {
+       if (v8086_mode(regs)) {
                local_irq_enable();
                handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
                goto exit;
        }
-#endif
 
        tsk = current;
        if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
        /* Copy the remainder of the stack from the current stack. */
        memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
 
-       BUG_ON(!user_mode_vm(&new_stack->regs));
+       BUG_ON(!user_mode(&new_stack->regs));
        return new_stack;
 }
 NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         * then it's very likely the result of an icebp/int01 trap.
         * User wants a sigtrap for that.
         */
-       if (!dr6 && user_mode_vm(regs))
+       if (!dr6 && user_mode(regs))
                user_icebp = 1;
 
        /* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        /* It's safe to allow irq's after DR6 has been saved */
        preempt_conditional_sti(regs);
 
-       if (regs->flags & X86_VM_MASK) {
+       if (v8086_mode(regs)) {
                handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
                                        X86_TRAP_DB);
                preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
                return;
        conditional_sti(regs);
 
-       if (!user_mode_vm(regs))
+       if (!user_mode(regs))
        {
                if (!fixup_exception(regs)) {
                        task->thread.error_code = error_code;
@@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
        /*
         * Save the info for the exception handler and clear the error.
         */
-       save_init_fpu(task);
+       unlazy_fpu(task);
        task->thread.trap_nr = trapnr;
        task->thread.error_code = error_code;
        info.si_signo = SIGFPE;
@@ -863,7 +860,7 @@ void math_state_restore(void)
        kernel_fpu_disable();
        __thread_fpu_begin(tsk);
        if (unlikely(restore_fpu_checking(tsk))) {
-               drop_init_fpu(tsk);
+               fpu_reset_state(tsk);
                force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
        } else {
                tsk->thread.fpu_counter++;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 /* Set of traps needed for early debugging. */
 void __init early_trap_init(void)
 {
-       set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+       /*
+        * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
+        * is ready in cpu_init() <-- trap_init(). Before trap_init(),
+        * CPU runs at ring 0 so it is impossible to hit an invalid
+        * stack.  Using the original stack works well enough at this
+        * early stage. DEBUG_STACK will be equipped after cpu_init() in
+        * trap_init().
+        *
+        * We don't need to set trace_idt_table like set_intr_gate(),
+        * since we don't have trace_debug and it will be reset to
+        * 'debug' in trap_init() by set_intr_gate_ist().
+        */
+       set_intr_gate_notrace(X86_TRAP_DB, debug);
        /* int3 can be called from all */
-       set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+       set_system_intr_gate(X86_TRAP_BP, &int3);
 #ifdef CONFIG_X86_32
        set_intr_gate(X86_TRAP_PF, page_fault);
 #endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
         */
        cpu_init();
 
+       /*
+        * X86_TRAP_DB and X86_TRAP_BP have been set
+        * in early_trap_init(). However, IST works only after
+        * cpu_init() loads TSS. See comments in early_trap_init().
+        */
+       set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+       /* int3 can be called from all */
+       set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+
        x86_init.irqs.trap_init();
 
 #ifdef CONFIG_X86_64
index 81f8adb0679e548d31af5982297c85141e3693f9..0b81ad67da07fa36e57577de5f7165ab320a55a1 100644 (file)
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
        int ret = NOTIFY_DONE;
 
        /* We are only interested in userspace traps */
-       if (regs && !user_mode_vm(regs))
+       if (regs && !user_mode(regs))
                return NOTIFY_DONE;
 
        switch (val) {
index e8edcf52e06911fe5446e543f40368ea69114225..fc9db6ef2a95937b1abd4f37f3a466043ba818d4 100644 (file)
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
                do_exit(SIGSEGV);
        }
 
-       tss = &per_cpu(init_tss, get_cpu());
+       tss = &per_cpu(cpu_tss, get_cpu());
        current->thread.sp0 = current->thread.saved_sp0;
        current->thread.sysenter_cs = __KERNEL_CS;
        load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        tsk->thread.saved_fs = info->regs32->fs;
        tsk->thread.saved_gs = get_user_gs(info->regs32);
 
-       tss = &per_cpu(init_tss, get_cpu());
+       tss = &per_cpu(cpu_tss, get_cpu());
        tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
        if (cpu_has_sep)
                tsk->thread.sysenter_cs = 0;
index c7d791f32b98e4f8598684306f086c6193ff6e93..51e3304169951619362ea4a1494716e4f20696bf 100644 (file)
@@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
        gtod_write_begin(vdata);
 
        /* copy vsyscall data */
-       vdata->vclock_mode      = tk->tkr.clock->archdata.vclock_mode;
-       vdata->cycle_last       = tk->tkr.cycle_last;
-       vdata->mask             = tk->tkr.mask;
-       vdata->mult             = tk->tkr.mult;
-       vdata->shift            = tk->tkr.shift;
+       vdata->vclock_mode      = tk->tkr_mono.clock->archdata.vclock_mode;
+       vdata->cycle_last       = tk->tkr_mono.cycle_last;
+       vdata->mask             = tk->tkr_mono.mask;
+       vdata->mult             = tk->tkr_mono.mult;
+       vdata->shift            = tk->tkr_mono.shift;
 
        vdata->wall_time_sec            = tk->xtime_sec;
-       vdata->wall_time_snsec          = tk->tkr.xtime_nsec;
+       vdata->wall_time_snsec          = tk->tkr_mono.xtime_nsec;
 
        vdata->monotonic_time_sec       = tk->xtime_sec
                                        + tk->wall_to_monotonic.tv_sec;
-       vdata->monotonic_time_snsec     = tk->tkr.xtime_nsec
+       vdata->monotonic_time_snsec     = tk->tkr_mono.xtime_nsec
                                        + ((u64)tk->wall_to_monotonic.tv_nsec
-                                               << tk->tkr.shift);
+                                               << tk->tkr_mono.shift);
        while (vdata->monotonic_time_snsec >=
-                                       (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+                                       (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
                vdata->monotonic_time_snsec -=
-                                       ((u64)NSEC_PER_SEC) << tk->tkr.shift;
+                                       ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
                vdata->monotonic_time_sec++;
        }
 
        vdata->wall_time_coarse_sec     = tk->xtime_sec;
-       vdata->wall_time_coarse_nsec    = (long)(tk->tkr.xtime_nsec >>
-                                                tk->tkr.shift);
+       vdata->wall_time_coarse_nsec    = (long)(tk->tkr_mono.xtime_nsec >>
+                                                tk->tkr_mono.shift);
 
        vdata->monotonic_time_coarse_sec =
                vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
index cdc6cf90307800abb83f1b4b516ba389212c3dd0..87a815b85f3e5c1b9ee6db83224ad954eef1df75 100644 (file)
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
                         config_enabled(CONFIG_IA32_EMULATION));
 
        if (!buf) {
-               drop_init_fpu(tsk);
+               fpu_reset_state(tsk);
                return 0;
        }
 
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
                 */
                user_fpu_begin();
                if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
-                       drop_init_fpu(tsk);
+                       fpu_reset_state(tsk);
                        return -1;
                }
        }
@@ -678,19 +678,13 @@ void xsave_init(void)
        this_func();
 }
 
-static inline void __init eager_fpu_init_bp(void)
-{
-       current->thread.fpu.state =
-           alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
-       if (!init_xstate_buf)
-               setup_init_fpu_buf();
-}
-
-void eager_fpu_init(void)
+/*
+ * setup_init_fpu_buf() is __init and it is OK to call it here because
+ * init_xstate_buf will be unset only once during boot.
+ */
+void __init_refok eager_fpu_init(void)
 {
-       static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
-
-       clear_used_math();
+       WARN_ON(used_math());
        current_thread_info()->status = 0;
 
        if (eagerfpu == ENABLE)
@@ -701,21 +695,8 @@ void eager_fpu_init(void)
                return;
        }
 
-       if (boot_func) {
-               boot_func();
-               boot_func = NULL;
-       }
-
-       /*
-        * This is same as math_state_restore(). But use_xsave() is
-        * not yet patched to use math_state_restore().
-        */
-       init_fpu(current);
-       __thread_fpu_begin(current);
-       if (cpu_has_xsave)
-               xrstor_state(init_xstate_buf, -1);
-       else
-               fxrstor_checking(&init_xstate_buf->i387);
+       if (!init_xstate_buf)
+               setup_init_fpu_buf();
 }
 
 /*
index 08f790dfadc9fb90dc3591687aaa93cc73fe1f88..16e8f962eaadf9d8f0d1cb4942288ebcd3f12a94 100644 (file)
@@ -1,5 +1,5 @@
 
-ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
index 8a80737ee6e6ec14bc7d9a6ffe08d9f580d3c890..59b69f6a2844cdce101a69c3bb34eb7ccd30556f 100644 (file)
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                ((best->eax & 0xff00) >> 8) != 0)
                return -EINVAL;
 
+       /* Update physical-address width */
+       vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
        kvm_pmu_cpuid_update(vcpu);
        return 0;
 }
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
        }
 }
 
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+       if (!best || best->eax < 0x80000008)
+               goto not_found;
+       best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+       if (best)
+               return best->eax & 0xff;
+not_found:
+       return 36;
+}
+EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
+
 /* when an old userspace process fills a new kernel module */
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                             struct kvm_cpuid *cpuid,
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
 
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
-       struct kvm_cpuid_entry2 *best;
-
-       best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
-       if (!best || best->eax < 0x80000008)
-               goto not_found;
-       best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-       if (best)
-               return best->eax & 0xff;
-not_found:
-       return 36;
-}
-EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
-
 /*
  * If no match is found, check whether we exceed the vCPU's limit
  * and return the content of the highest valid _standard_ leaf instead.
index 4452eedfaedd0a4849b54d4137edd5e85b24ae46..c3b1ad9fca818befb9e5920f7eb7c0d2b703d245 100644 (file)
@@ -20,13 +20,19 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
                              struct kvm_cpuid_entry2 __user *entries);
 void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.maxphyaddr;
+}
 
 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
 
        if (!static_cpu_has(X86_FEATURE_XSAVE))
-               return 0;
+               return false;
 
        best = kvm_find_cpuid_entry(vcpu, 1, 0);
        return best && (best->ecx & bit(X86_FEATURE_XSAVE));
index 106c01557f2b63706eca28e462a3b072b590f0c5..630bcb0d7a045b4930213eac1c1bbee3ef7d0ebe 100644 (file)
@@ -248,27 +248,7 @@ struct mode_dual {
        struct opcode mode64;
 };
 
-/* EFLAGS bit definitions. */
-#define EFLG_ID (1<<21)
-#define EFLG_VIP (1<<20)
-#define EFLG_VIF (1<<19)
-#define EFLG_AC (1<<18)
-#define EFLG_VM (1<<17)
-#define EFLG_RF (1<<16)
-#define EFLG_IOPL (3<<12)
-#define EFLG_NT (1<<14)
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_IF (1<<9)
-#define EFLG_TF (1<<8)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#define EFLG_RESERVED_ONE_MASK 2
 
 enum x86_transfer_type {
        X86_TRANSFER_NONE,
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+                    X86_EFLAGS_PF|X86_EFLAGS_CF)
 
 #ifdef CONFIG_X86_64
 #define ON64(x) x
@@ -478,6 +459,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask)
        *dest = (*dest & ~mask) | (src & mask);
 }
 
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+       /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+       switch (bytes) {
+       case 1:
+               *(u8 *)reg = (u8)val;
+               break;
+       case 2:
+               *(u16 *)reg = (u16)val;
+               break;
+       case 4:
+               *reg = (u32)val;
+               break;  /* 64b: zero-extend */
+       case 8:
+               *reg = val;
+               break;
+       }
+}
+
 static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
        return (1UL << (ctxt->ad_bytes << 3)) - 1;
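A self-contained sketch of the assign_register() semantics factored out above, showing why the 4-byte case zero-extends while narrower writes leave the upper register bytes alone (little-endian, as on x86; the values are purely illustrative):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the switch in assign_register() above. */
static void assign_register(unsigned long *reg, uint64_t val, int bytes)
{
        switch (bytes) {
        case 1: *(uint8_t *)reg  = (uint8_t)val;  break;   /* low byte only */
        case 2: *(uint16_t *)reg = (uint16_t)val; break;   /* low word only */
        case 4: *reg = (uint32_t)val;             break;   /* zero-extends */
        case 8: *reg = val;                       break;
        }
}

int main(void)
{
        unsigned long rax = 0x1122334455667788UL;

        assign_register(&rax, 0xAA, 1);         /* 0x11223344556677aa */
        printf("%#lx\n", rax);
        assign_register(&rax, 0xDEADBEEF, 4);   /* 0xdeadbeef: upper half cleared */
        printf("%#lx\n", rax);
        return 0;
}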
@@ -943,6 +943,22 @@ FASTOP2(xadd);
 
 FASTOP2R(cmp, cmp_r);
 
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+       /* If src is zero, do not writeback, but update flags */
+       if (ctxt->src.val == 0)
+               ctxt->dst.type = OP_NONE;
+       return fastop(ctxt, em_bsf);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+       /* If src is zero, do not writeback, but update flags */
+       if (ctxt->src.val == 0)
+               ctxt->dst.type = OP_NONE;
+       return fastop(ctxt, em_bsr);
+}
+
 static u8 test_cc(unsigned int condition, unsigned long flags)
 {
        u8 rc;
@@ -1399,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
                unsigned int in_page, n;
                unsigned int count = ctxt->rep_prefix ?
                        address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
-               in_page = (ctxt->eflags & EFLG_DF) ?
+               in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
                        offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
                        PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
                n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
@@ -1412,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
        }
 
        if (ctxt->rep_prefix && (ctxt->d & String) &&
-           !(ctxt->eflags & EFLG_DF)) {
+           !(ctxt->eflags & X86_EFLAGS_DF)) {
                ctxt->dst.data = rc->data + rc->pos;
                ctxt->dst.type = OP_MEM_STR;
                ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1691,21 +1707,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 static void write_register_operand(struct operand *op)
 {
-       /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-       switch (op->bytes) {
-       case 1:
-               *(u8 *)op->addr.reg = (u8)op->val;
-               break;
-       case 2:
-               *(u16 *)op->addr.reg = (u16)op->val;
-               break;
-       case 4:
-               *op->addr.reg = (u32)op->val;
-               break;  /* 64b: zero-extend */
-       case 8:
-               *op->addr.reg = op->val;
-               break;
-       }
+       return assign_register(op->addr.reg, op->val, op->bytes);
 }
 
 static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
@@ -1792,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 {
        int rc;
        unsigned long val, change_mask;
-       int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+       int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
        int cpl = ctxt->ops->cpl(ctxt);
 
        rc = emulate_pop(ctxt, &val, len);
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
-               | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
+       change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                     X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+                     X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+                     X86_EFLAGS_AC | X86_EFLAGS_ID;
 
        switch(ctxt->mode) {
        case X86EMUL_MODE_PROT64:
        case X86EMUL_MODE_PROT32:
        case X86EMUL_MODE_PROT16:
                if (cpl == 0)
-                       change_mask |= EFLG_IOPL;
+                       change_mask |= X86_EFLAGS_IOPL;
                if (cpl <= iopl)
-                       change_mask |= EFLG_IF;
+                       change_mask |= X86_EFLAGS_IF;
                break;
        case X86EMUL_MODE_VM86:
                if (iopl < 3)
                        return emulate_gp(ctxt, 0);
-               change_mask |= EFLG_IF;
+               change_mask |= X86_EFLAGS_IF;
                break;
        default: /* real mode */
-               change_mask |= (EFLG_IOPL | EFLG_IF);
+               change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
                break;
        }
 
@@ -1918,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-       ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM;
+       ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
        return em_push(ctxt);
 }
 
@@ -1926,6 +1930,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 {
        int rc = X86EMUL_CONTINUE;
        int reg = VCPU_REGS_RDI;
+       u32 val;
 
        while (reg >= VCPU_REGS_RAX) {
                if (reg == VCPU_REGS_RSP) {
@@ -1933,9 +1938,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
                        --reg;
                }
 
-               rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
+               rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
                if (rc != X86EMUL_CONTINUE)
                        break;
+               assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
                --reg;
        }
        return rc;
@@ -1956,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
-       ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+       ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
 
        ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
        rc = em_push(ctxt);
@@ -2022,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
        unsigned long temp_eip = 0;
        unsigned long temp_eflags = 0;
        unsigned long cs = 0;
-       unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
-                            EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
-                            EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
-       unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+       unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+                            X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+                            X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+                            X86_EFLAGS_AC | X86_EFLAGS_ID |
+                            X86_EFLAGS_FIXED;
+       unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+                                 X86_EFLAGS_VIP;
 
        /* TODO: Add stack limit check */
 
@@ -2054,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 
        ctxt->_eip = temp_eip;
 
-
        if (ctxt->op_bytes == 4)
                ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
        else if (ctxt->op_bytes == 2) {
@@ -2063,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
        }
 
        ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
-       ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+       ctxt->eflags |= X86_EFLAGS_FIXED;
        ctxt->ops->set_nmi_mask(ctxt, false);
 
        return rc;
@@ -2145,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
            ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
                *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
                *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
-               ctxt->eflags &= ~EFLG_ZF;
+               ctxt->eflags &= ~X86_EFLAGS_ZF;
        } else {
                ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
                        (u32) reg_read(ctxt, VCPU_REGS_RBX);
 
-               ctxt->eflags |= EFLG_ZF;
+               ctxt->eflags |= X86_EFLAGS_ZF;
        }
        return X86EMUL_CONTINUE;
 }
@@ -2222,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
        ctxt->src.val = ctxt->dst.orig_val;
        fastop(ctxt, em_cmp);
 
-       if (ctxt->eflags & EFLG_ZF) {
+       if (ctxt->eflags & X86_EFLAGS_ZF) {
                /* Success: write back to memory; no update of EAX */
                ctxt->src.type = OP_NONE;
                ctxt->dst.val = ctxt->src.orig_val;
@@ -2381,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 
                ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
                ctxt->eflags &= ~msr_data;
-               ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+               ctxt->eflags |= X86_EFLAGS_FIXED;
 #endif
        } else {
                /* legacy mode */
                ops->get_msr(ctxt, MSR_STAR, &msr_data);
                ctxt->_eip = (u32)msr_data;
 
-               ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
+               ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
        }
 
        return X86EMUL_CONTINUE;
@@ -2425,8 +2434,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
        if ((msr_data & 0xfffc) == 0x0)
                return emulate_gp(ctxt, 0);
 
-       ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
-       cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK;
+       ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+       cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
        ss_sel = cs_sel + 8;
        if (efer & EFER_LMA) {
                cs.d = 0;
@@ -2493,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
                        return emulate_gp(ctxt, 0);
                break;
        }
-       cs_sel |= SELECTOR_RPL_MASK;
-       ss_sel |= SELECTOR_RPL_MASK;
+       cs_sel |= SEGMENT_RPL_MASK;
+       ss_sel |= SEGMENT_RPL_MASK;
 
        ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
        ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
@@ -2512,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
                return false;
        if (ctxt->mode == X86EMUL_MODE_VM86)
                return true;
-       iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+       iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
        return ctxt->ops->cpl(ctxt) > iopl;
 }
 
@@ -2782,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
                return ret;
        ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
                                        X86_TRANSFER_TASK_SWITCH, NULL);
-       if (ret != X86EMUL_CONTINUE)
-               return ret;
 
-       return X86EMUL_CONTINUE;
+       return ret;
 }
 
 static int task_switch_32(struct x86_emulate_ctxt *ctxt,
@@ -2954,7 +2961,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
                struct operand *op)
 {
-       int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
+       int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
 
        register_address_increment(ctxt, reg, df * op->bytes);
        op->addr.mem.ea = register_address(ctxt, reg);
@@ -3323,7 +3330,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
        return X86EMUL_CONTINUE;
 }
 
-static int em_vmcall(struct x86_emulate_ctxt *ctxt)
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
 {
        int rc = ctxt->ops->fix_hypercall(ctxt);
 
@@ -3395,17 +3402,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
        return em_lgdt_lidt(ctxt, true);
 }
 
-static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
-{
-       int rc;
-
-       rc = ctxt->ops->fix_hypercall(ctxt);
-
-       /* Disable writeback. */
-       ctxt->dst.type = OP_NONE;
-       return rc;
-}
-
 static int em_lidt(struct x86_emulate_ctxt *ctxt)
 {
        return em_lgdt_lidt(ctxt, false);
@@ -3504,7 +3500,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt)
 {
        u32 flags;
 
-       flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;
+       flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+               X86_EFLAGS_SF;
        flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
 
        ctxt->eflags &= ~0xffUL;
@@ -3769,7 +3766,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 
 static const struct opcode group7_rm0[] = {
        N,
-       I(SrcNone | Priv | EmulateOnUD, em_vmcall),
+       I(SrcNone | Priv | EmulateOnUD, em_hypercall),
        N, N, N, N, N, N,
 };
 
@@ -3781,7 +3778,7 @@ static const struct opcode group7_rm1[] = {
 
 static const struct opcode group7_rm3[] = {
        DIP(SrcNone | Prot | Priv,              vmrun,          check_svme_pa),
-       II(SrcNone  | Prot | EmulateOnUD,       em_vmmcall,     vmmcall),
+       II(SrcNone  | Prot | EmulateOnUD,       em_hypercall,   vmmcall),
        DIP(SrcNone | Prot | Priv,              vmload,         check_svme_pa),
        DIP(SrcNone | Prot | Priv,              vmsave,         check_svme_pa),
        DIP(SrcNone | Prot | Priv,              stgi,           check_svme),
@@ -4192,7 +4189,8 @@ static const struct opcode twobyte_table[256] = {
        N, N,
        G(BitOp, group8),
        F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
-       F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
+       I(DstReg | SrcMem | ModRM, em_bsf_c),
+       I(DstReg | SrcMem | ModRM, em_bsr_c),
        D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
        /* 0xC0 - 0xC7 */
        F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
@@ -4759,9 +4757,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
        if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
             (ctxt->b == 0xae) || (ctxt->b == 0xaf))
            && (((ctxt->rep_prefix == REPE_PREFIX) &&
-                ((ctxt->eflags & EFLG_ZF) == 0))
+                ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
                || ((ctxt->rep_prefix == REPNE_PREFIX) &&
-                   ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+                   ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
                return true;
 
        return false;
@@ -4913,7 +4911,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
                        /* All REP prefixes have the same first termination condition */
                        if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
                                ctxt->eip = ctxt->_eip;
-                               ctxt->eflags &= ~EFLG_RF;
+                               ctxt->eflags &= ~X86_EFLAGS_RF;
                                goto done;
                        }
                }
@@ -4963,9 +4961,9 @@ special_insn:
        }
 
        if (ctxt->rep_prefix && (ctxt->d & String))
-               ctxt->eflags |= EFLG_RF;
+               ctxt->eflags |= X86_EFLAGS_RF;
        else
-               ctxt->eflags &= ~EFLG_RF;
+               ctxt->eflags &= ~X86_EFLAGS_RF;
 
        if (ctxt->execute) {
                if (ctxt->d & Fastop) {
@@ -5014,7 +5012,7 @@ special_insn:
                rc = emulate_int(ctxt, ctxt->src.val);
                break;
        case 0xce:              /* into */
-               if (ctxt->eflags & EFLG_OF)
+               if (ctxt->eflags & X86_EFLAGS_OF)
                        rc = emulate_int(ctxt, 4);
                break;
        case 0xe9: /* jmp rel */
@@ -5027,19 +5025,19 @@ special_insn:
                break;
        case 0xf5:      /* cmc */
                /* complement carry flag from eflags reg */
-               ctxt->eflags ^= EFLG_CF;
+               ctxt->eflags ^= X86_EFLAGS_CF;
                break;
        case 0xf8: /* clc */
-               ctxt->eflags &= ~EFLG_CF;
+               ctxt->eflags &= ~X86_EFLAGS_CF;
                break;
        case 0xf9: /* stc */
-               ctxt->eflags |= EFLG_CF;
+               ctxt->eflags |= X86_EFLAGS_CF;
                break;
        case 0xfc: /* cld */
-               ctxt->eflags &= ~EFLG_DF;
+               ctxt->eflags &= ~X86_EFLAGS_DF;
                break;
        case 0xfd: /* std */
-               ctxt->eflags |= EFLG_DF;
+               ctxt->eflags |= X86_EFLAGS_DF;
                break;
        default:
                goto cannot_emulate;
@@ -5100,7 +5098,7 @@ writeback:
                        }
                        goto done; /* skip rip writeback */
                }
-               ctxt->eflags &= ~EFLG_RF;
+               ctxt->eflags &= ~X86_EFLAGS_RF;
        }
 
        ctxt->eip = ctxt->_eip;
@@ -5137,8 +5135,7 @@ twobyte_insn:
        case 0x40 ... 0x4f:     /* cmov */
                if (test_cc(ctxt->b, ctxt->eflags))
                        ctxt->dst.val = ctxt->src.val;
-               else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
-                        ctxt->op_bytes != 4)
+               else if (ctxt->op_bytes != 4)
                        ctxt->dst.type = OP_NONE; /* no writeback */
                break;
        case 0x80 ... 0x8f: /* jnz rel, etc*/
index 298781d4cfb44b7c6d6536d6d2779ada2eeb150a..4dce6f8b6129ebea2432840154cc97efd513947b 100644 (file)
@@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr)
                (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
 }
 
-static int pit_ioport_write(struct kvm_io_device *this,
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+                               struct kvm_io_device *this,
                            gpa_t addr, int len, const void *data)
 {
        struct kvm_pit *pit = dev_to_pit(this);
@@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
        return 0;
 }
 
-static int pit_ioport_read(struct kvm_io_device *this,
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+                          struct kvm_io_device *this,
                           gpa_t addr, int len, void *data)
 {
        struct kvm_pit *pit = dev_to_pit(this);
@@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
        return 0;
 }
 
-static int speaker_ioport_write(struct kvm_io_device *this,
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+                               struct kvm_io_device *this,
                                gpa_t addr, int len, const void *data)
 {
        struct kvm_pit *pit = speaker_to_pit(this);
@@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this,
        return 0;
 }
 
-static int speaker_ioport_read(struct kvm_io_device *this,
-                              gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+                                  struct kvm_io_device *this,
+                                  gpa_t addr, int len, void *data)
 {
        struct kvm_pit *pit = speaker_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
index dd1b16b611b0ae6c9d2386a7e690e56a774f0b74..c84990b42b5b189550eecc904f781c0e3ecadf3d 100644 (file)
@@ -3,7 +3,7 @@
 
 #include <linux/kthread.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 struct kvm_kpit_channel_state {
        u32 count; /* can be 65536 */
index 9541ba34126b90123ddfe383453145ddfcf789c4..fef922ff263589de97348e76ee4faeaeb5aaeef2 100644 (file)
@@ -529,42 +529,42 @@ static int picdev_read(struct kvm_pic *s,
        return 0;
 }
 
-static int picdev_master_write(struct kvm_io_device *dev,
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                               gpa_t addr, int len, const void *val)
 {
        return picdev_write(container_of(dev, struct kvm_pic, dev_master),
                            addr, len, val);
 }
 
-static int picdev_master_read(struct kvm_io_device *dev,
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                              gpa_t addr, int len, void *val)
 {
        return picdev_read(container_of(dev, struct kvm_pic, dev_master),
                            addr, len, val);
 }
 
-static int picdev_slave_write(struct kvm_io_device *dev,
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                              gpa_t addr, int len, const void *val)
 {
        return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
                            addr, len, val);
 }
 
-static int picdev_slave_read(struct kvm_io_device *dev,
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                             gpa_t addr, int len, void *val)
 {
        return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
                            addr, len, val);
 }
 
-static int picdev_eclr_write(struct kvm_io_device *dev,
+static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                             gpa_t addr, int len, const void *val)
 {
        return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
                            addr, len, val);
 }
 
-static int picdev_eclr_read(struct kvm_io_device *dev,
+static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                            gpa_t addr, int len, void *val)
 {
        return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
index 46d4449772bc714daa658ea6424fb45659095c70..28146f03c51421ce12f728d69613ded0a65699fd 100644 (file)
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
 
        old_irr = ioapic->irr;
        ioapic->irr |= mask;
+       if (edge)
+               ioapic->irr_delivered &= ~mask;
        if ((edge && old_irr == ioapic->irr) ||
            (!edge && entry.fields.remote_irr)) {
                ret = 0;
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
        irqe.shorthand = 0;
 
        if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
-               ioapic->irr &= ~(1 << irq);
+               ioapic->irr_delivered |= 1 << irq;
 
        if (irq == RTC_GSI && line_status) {
                /*
@@ -473,13 +475,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
        }
 }
 
-bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
-{
-       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-       smp_rmb();
-       return test_bit(vector, ioapic->handled_vectors);
-}
-
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
 {
        struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
@@ -500,8 +495,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
                 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
 }
 
-static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
-                           void *val)
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+                               gpa_t addr, int len, void *val)
 {
        struct kvm_ioapic *ioapic = to_ioapic(this);
        u32 result;
@@ -543,8 +538,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
        return 0;
 }
 
-static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-                            const void *val)
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+                                gpa_t addr, int len, const void *val)
 {
        struct kvm_ioapic *ioapic = to_ioapic(this);
        u32 data;
@@ -599,6 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
        ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
        ioapic->ioregsel = 0;
        ioapic->irr = 0;
+       ioapic->irr_delivered = 0;
        ioapic->id = 0;
        memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
        rtc_irq_eoi_tracking_reset(ioapic);
@@ -656,6 +652,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 
        spin_lock(&ioapic->lock);
        memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+       state->irr &= ~ioapic->irr_delivered;
        spin_unlock(&ioapic->lock);
        return 0;
 }
@@ -669,6 +666,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
        spin_lock(&ioapic->lock);
        memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
        ioapic->irr = 0;
+       ioapic->irr_delivered = 0;
        update_handled_vectors(ioapic);
        kvm_vcpu_request_scan_ioapic(kvm);
        kvm_ioapic_inject_all(ioapic, state->irr);
index c2e36d934af4d96ffceb2f1edb78601a2a848f67..ca0b0b4e625603687bbbe0343ddc0feb11dadbd7 100644 (file)
@@ -3,7 +3,7 @@
 
 #include <linux/kvm_host.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 struct kvm;
 struct kvm_vcpu;
@@ -77,6 +77,7 @@ struct kvm_ioapic {
        struct rtc_status rtc_status;
        struct delayed_work eoi_inject;
        u32 irq_eoi[IOAPIC_NUM_PINS];
+       u32 irr_delivered;
 };
 
 #ifdef DEBUG
@@ -97,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
        return kvm->arch.vioapic;
 }
 
+static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+       smp_rmb();
+       return test_bit(vector, ioapic->handled_vectors);
+}
+
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                int short_hand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
                        int trigger_mode);
-bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
index 2d03568e9498356716b7504c195c71a912819f4d..ad68c73008c57f0c1926f0cec9e83f79fe070252 100644 (file)
@@ -27,7 +27,7 @@
 #include <linux/kvm_host.h>
 #include <linux/spinlock.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 #include "ioapic.h"
 #include "lapic.h"
 
index 4ee827d7bf36f730c25d358f709aa99cda93260a..d67206a7b99a689a4d7361de8bd8fc1b9ab02c1a 100644 (file)
@@ -133,6 +133,28 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
        return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
+/* The logical map is definitely wrong if we have multiple
+ * modes at the same time.  (Physical map is always right.)
+ */
+static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
+{
+       return !(map->mode & (map->mode - 1));
+}
+
+static inline void
+apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+{
+       unsigned lid_bits;
+
+       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
+       BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
+       BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
+       lid_bits = map->mode;
+
+       *cid = dest_id >> lid_bits;
+       *lid = dest_id & ((1 << lid_bits) - 1);
+}
+
 static void recalculate_apic_map(struct kvm *kvm)
 {
        struct kvm_apic_map *new, *old = NULL;
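For orientation, a standalone sketch of the cid/lid split introduced above. It assumes only what the BUILD_BUG_ON()s in the hunk assert: the mode constants are 4 (xAPIC cluster), 8 (xAPIC flat) and 16 (x2APIC), and that value doubles as the number of logical-ID bits:

#include <stdio.h>

/* Assumed mode values, mirroring the BUILD_BUG_ON()s above. */
enum { XAPIC_CLUSTER = 4, XAPIC_FLAT = 8, X2APIC = 16 };

static void split_logical_id(unsigned int mode, unsigned int dest_id,
                             unsigned int *cid, unsigned int *lid)
{
        *cid = dest_id >> mode;                 /* cluster index */
        *lid = dest_id & ((1u << mode) - 1);    /* logical id within the cluster */
}

int main(void)
{
        unsigned int cid, lid;

        split_logical_id(XAPIC_CLUSTER, 0x23, &cid, &lid);
        printf("cluster=%u lid=%#x\n", cid, lid);   /* cluster=2 lid=0x3 */
        return 0;
}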
@@ -146,48 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm)
        if (!new)
                goto out;
 
-       new->ldr_bits = 8;
-       /* flat mode is default */
-       new->cid_shift = 8;
-       new->cid_mask = 0;
-       new->lid_mask = 0xff;
-       new->broadcast = APIC_BROADCAST;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               struct kvm_lapic *apic = vcpu->arch.apic;
-
-               if (!kvm_apic_present(vcpu))
-                       continue;
-
-               if (apic_x2apic_mode(apic)) {
-                       new->ldr_bits = 32;
-                       new->cid_shift = 16;
-                       new->cid_mask = new->lid_mask = 0xffff;
-                       new->broadcast = X2APIC_BROADCAST;
-               } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
-                       if (kvm_apic_get_reg(apic, APIC_DFR) ==
-                                                       APIC_DFR_CLUSTER) {
-                               new->cid_shift = 4;
-                               new->cid_mask = 0xf;
-                               new->lid_mask = 0xf;
-                       } else {
-                               new->cid_shift = 8;
-                               new->cid_mask = 0;
-                               new->lid_mask = 0xff;
-                       }
-               }
-
-               /*
-                * All APICs have to be configured in the same mode by an OS.
-                * We take advatage of this while building logical id loockup
-                * table. After reset APICs are in software disabled mode, so if
-                * we find apic with different setting we assume this is the mode
-                * OS wants all apics to be in; build lookup table accordingly.
-                */
-               if (kvm_apic_sw_enabled(apic))
-                       break;
-       }
-
        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct kvm_lapic *apic = vcpu->arch.apic;
                u16 cid, lid;
@@ -198,11 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm)
 
                aid = kvm_apic_id(apic);
                ldr = kvm_apic_get_reg(apic, APIC_LDR);
-               cid = apic_cluster_id(new, ldr);
-               lid = apic_logical_id(new, ldr);
 
                if (aid < ARRAY_SIZE(new->phys_map))
                        new->phys_map[aid] = apic;
+
+               if (apic_x2apic_mode(apic)) {
+                       new->mode |= KVM_APIC_MODE_X2APIC;
+               } else if (ldr) {
+                       ldr = GET_APIC_LOGICAL_ID(ldr);
+                       if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+                               new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
+                       else
+                               new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+               }
+
+               if (!kvm_apic_logical_map_valid(new))
+                       continue;
+
+               apic_logical_id(new, ldr, &cid, &lid);
+
                if (lid && cid < ARRAY_SIZE(new->logical_map))
                        new->logical_map[cid][ffs(lid) - 1] = apic;
        }
@@ -588,15 +582,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
        apic_update_ppr(apic);
 }
 
-static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
 {
-       return dest == (apic_x2apic_mode(apic) ?
-                       X2APIC_BROADCAST : APIC_BROADCAST);
+       if (apic_x2apic_mode(apic))
+               return mda == X2APIC_BROADCAST;
+
+       return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
 }
 
-static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
 {
-       return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
+       if (kvm_apic_broadcast(apic, mda))
+               return true;
+
+       if (apic_x2apic_mode(apic))
+               return mda == kvm_apic_id(apic);
+
+       return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
 }
 
 static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -613,6 +615,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
                       && (logical_id & mda & 0xffff) != 0;
 
        logical_id = GET_APIC_LOGICAL_ID(logical_id);
+       mda = GET_APIC_DEST_FIELD(mda);
 
        switch (kvm_apic_get_reg(apic, APIC_DFR)) {
        case APIC_DFR_FLAT:
@@ -627,10 +630,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
        }
 }
 
+/* KVM APIC implementation has two quirks
+ *  - dest always begins at 0 while xAPIC MDA has offset 24,
+ *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+ */
+static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
+                                              struct kvm_lapic *target)
+{
+       bool ipi = source != NULL;
+       bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
+
+       if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+               return X2APIC_BROADCAST;
+
+       return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+}
+
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, unsigned int dest, int dest_mode)
 {
        struct kvm_lapic *target = vcpu->arch.apic;
+       u32 mda = kvm_apic_mda(dest, source, target);
 
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x\n",
@@ -640,9 +660,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
        switch (short_hand) {
        case APIC_DEST_NOSHORT:
                if (dest_mode == APIC_DEST_PHYSICAL)
-                       return kvm_apic_match_physical_addr(target, dest);
+                       return kvm_apic_match_physical_addr(target, mda);
                else
-                       return kvm_apic_match_logical_addr(target, dest);
+                       return kvm_apic_match_logical_addr(target, mda);
        case APIC_DEST_SELF:
                return target == source;
        case APIC_DEST_ALLINC:
@@ -664,6 +684,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        struct kvm_lapic **dst;
        int i;
        bool ret = false;
+       bool x2apic_ipi = src && apic_x2apic_mode(src);
 
        *r = -1;
 
@@ -675,15 +696,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        if (irq->shorthand)
                return false;
 
+       if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+               return false;
+
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.apic_map);
 
        if (!map)
                goto out;
 
-       if (irq->dest_id == map->broadcast)
-               goto out;
-
        ret = true;
 
        if (irq->dest_mode == APIC_DEST_PHYSICAL) {
@@ -692,16 +713,20 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
                dst = &map->phys_map[irq->dest_id];
        } else {
-               u32 mda = irq->dest_id << (32 - map->ldr_bits);
-               u16 cid = apic_cluster_id(map, mda);
+               u16 cid;
+
+               if (!kvm_apic_logical_map_valid(map)) {
+                       ret = false;
+                       goto out;
+               }
+
+               apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
 
                if (cid >= ARRAY_SIZE(map->logical_map))
                        goto out;
 
                dst = map->logical_map[cid];
 
-               bitmap = apic_logical_id(map, mda);
-
                if (irq->delivery_mode == APIC_DM_LOWEST) {
                        int l = -1;
                        for_each_set_bit(i, &bitmap, 16) {
@@ -1037,7 +1062,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
            addr < apic->base_address + LAPIC_MMIO_LENGTH;
 }
 
-static int apic_mmio_read(struct kvm_io_device *this,
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
                           gpa_t address, int len, void *data)
 {
        struct kvm_lapic *apic = to_lapic(this);
@@ -1357,7 +1382,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
        return ret;
 }
 
-static int apic_mmio_write(struct kvm_io_device *this,
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
                            gpa_t address, int len, const void *data)
 {
        struct kvm_lapic *apic = to_lapic(this);
@@ -1497,8 +1522,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                return;
        }
 
-       if (!kvm_vcpu_is_bsp(apic->vcpu))
-               value &= ~MSR_IA32_APICBASE_BSP;
        vcpu->arch.apic_base = value;
 
        /* update jump label if enable bit changes */
index 0bc6c656625b8377df4ed6ed18da0ece953cfc38..9d28383fc1e70cc3437ae2eb062ff9433c659f96 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef __KVM_X86_LAPIC_H
 #define __KVM_X86_LAPIC_H
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #include <linux/kvm_host.h>
 
@@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
        return kvm_x86_ops->vm_has_apicv(kvm);
 }
 
-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
-{
-       u16 cid;
-       ldr >>= 32 - map->ldr_bits;
-       cid = (ldr >> map->cid_shift) & map->cid_mask;
-
-       return cid;
-}
-
-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
-       ldr >>= (32 - map->ldr_bits);
-       return ldr & map->lid_mask;
-}
-
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.apic->pending_events;
index cee759299a356dd720a22095ea4cb684c89fcdd3..146f295ee32214a6f4ad40a58e53b61a3d1fa06d 100644 (file)
@@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                kvm_flush_remote_tlbs(kvm);
 }
 
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+               unsigned long *rmapp)
+{
+       u64 *sptep;
+       struct rmap_iterator iter;
+       int need_tlb_flush = 0;
+       pfn_t pfn;
+       struct kvm_mmu_page *sp;
+
+       for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+               BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+               sp = page_header(__pa(sptep));
+               pfn = spte_to_pfn(*sptep);
+
+               /*
+                * Only EPT supported for now; otherwise, one would need to
+                * find out efficiently whether the guest page tables are
+                * also using huge pages.
+                */
+               if (sp->role.direct &&
+                       !kvm_is_reserved_pfn(pfn) &&
+                       PageTransCompound(pfn_to_page(pfn))) {
+                       drop_spte(kvm, sptep);
+                       sptep = rmap_get_first(*rmapp, &iter);
+                       need_tlb_flush = 1;
+               } else
+                       sptep = rmap_get_next(&iter);
+       }
+
+       return need_tlb_flush;
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                       struct kvm_memory_slot *memslot)
+{
+       bool flush = false;
+       unsigned long *rmapp;
+       unsigned long last_index, index;
+       gfn_t gfn_start, gfn_end;
+
+       spin_lock(&kvm->mmu_lock);
+
+       gfn_start = memslot->base_gfn;
+       gfn_end = memslot->base_gfn + memslot->npages - 1;
+
+       if (gfn_start >= gfn_end)
+               goto out;
+
+       rmapp = memslot->arch.rmap[0];
+       last_index = gfn_to_index(gfn_end, memslot->base_gfn,
+                                       PT_PAGE_TABLE_LEVEL);
+
+       for (index = 0; index <= last_index; ++index, ++rmapp) {
+               if (*rmapp)
+                       flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
+
+               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                       if (flush) {
+                               kvm_flush_remote_tlbs(kvm);
+                               flush = false;
+                       }
+                       cond_resched_lock(&kvm->mmu_lock);
+               }
+       }
+
+       if (flush)
+               kvm_flush_remote_tlbs(kvm);
+
+out:
+       spin_unlock(&kvm->mmu_lock);
+}
+
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot)
 {
index 8e6b7d869d2f7f34432a1f1685606eab7945561a..29fbf9dfdc549f47f1e189f58c5d5adfeb7a5fd1 100644 (file)
@@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping {
 };
 
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 7};
+static int fixed_pmc_events[] = {1, 0, 7};
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
index cc618c882f900ad21cb4de57d94daa91a5f4ec4c..ce741b8650f6ece694fb47e1750d153291a1803f 100644 (file)
@@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
        svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
                                   MSR_IA32_APICBASE_ENABLE;
-       if (kvm_vcpu_is_bsp(&svm->vcpu))
+       if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
                svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
        svm_init_osvw(&svm->vcpu);
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm)
 static int halt_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
-       skip_emulated_instruction(&svm->vcpu);
        return kvm_emulate_halt(&svm->vcpu);
 }
 
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-       skip_emulated_instruction(&svm->vcpu);
        kvm_emulate_hypercall(&svm->vcpu);
        return 1;
 }
@@ -2757,11 +2755,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
 
-       trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
-                         vcpu->arch.regs[VCPU_REGS_RAX]);
+       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
+                         kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
 
        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-       kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+       kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
 
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
        skip_emulated_instruction(&svm->vcpu);
@@ -2770,12 +2768,18 @@ static int invlpga_interception(struct vcpu_svm *svm)
 
 static int skinit_interception(struct vcpu_svm *svm)
 {
-       trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+       trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
 
        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+       kvm_emulate_wbinvd(&svm->vcpu);
+       return 1;
+}
+
 static int xsetbv_interception(struct vcpu_svm *svm)
 {
        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
@@ -2902,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
+static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+                                           unsigned long val)
 {
        unsigned long cr0 = svm->vcpu.arch.cr0;
        bool ret = false;
@@ -2940,7 +2945,10 @@ static int cr_interception(struct vcpu_svm *svm)
                return emulate_on_interception(svm);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
-       cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+       if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+               cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+       else
+               cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
 
        err = 0;
        if (cr >= 16) { /* mov to cr */
@@ -3133,7 +3141,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
 static int rdmsr_interception(struct vcpu_svm *svm)
 {
-       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+       u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
        u64 data;
 
        if (svm_get_msr(&svm->vcpu, ecx, &data)) {
@@ -3142,8 +3150,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
        } else {
                trace_kvm_msr_read(ecx, data);
 
-               svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
-               svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
+               kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
+               kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
                svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
                skip_emulated_instruction(&svm->vcpu);
        }
@@ -3246,9 +3254,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 static int wrmsr_interception(struct vcpu_svm *svm)
 {
        struct msr_data msr;
-       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-       u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
-               | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+       u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+       u64 data = kvm_read_edx_eax(&svm->vcpu);
 
        msr.data = data;
        msr.index = ecx;
@@ -3325,7 +3332,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
        [SVM_EXIT_READ_CR8]                     = cr_interception,
-       [SVM_EXIT_CR0_SEL_WRITE]                = emulate_on_interception,
+       [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
@@ -3376,7 +3383,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
-       [SVM_EXIT_WBINVD]                       = emulate_on_interception,
+       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
        [SVM_EXIT_MONITOR]                      = monitor_interception,
        [SVM_EXIT_MWAIT]                        = mwait_interception,
        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
@@ -3555,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
            || !svm_exit_handlers[exit_code]) {
-               WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code);
+               WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
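
For illustration only (not part of the patch): the rdmsr/wrmsr hunks above switch from poking vcpu->arch.regs[] directly to kvm_register_read()/kvm_register_write() and kvm_read_edx_eax(), which just split or join the 64-bit MSR value across EDX:EAX the way the RDMSR/WRMSR instructions do. A self-contained sketch of that split/join:

#include <stdint.h>
#include <stdio.h>

/* WRMSR takes its 64-bit operand in EDX:EAX; RDMSR returns it the same way. */
uint64_t join_edx_eax(uint32_t eax, uint32_t edx)
{
        return ((uint64_t)edx << 32) | eax;
}

void split_edx_eax(uint64_t data, uint32_t *eax, uint32_t *edx)
{
        *eax = (uint32_t)data;          /* low 32 bits  */
        *edx = (uint32_t)(data >> 32);  /* high 32 bits */
}

int main(void)
{
        uint32_t eax, edx;

        split_edx_eax(0x123456789abcdef0ULL, &eax, &edx);
        printf("%#llx\n", (unsigned long long)join_edx_eax(eax, edx));
        return 0;
}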
index ae4f6d35d19c268315745741150dd6d1a7df5222..f5e8dce8046c56b5273e9aa043754f98a7dee7d7 100644 (file)
@@ -2470,6 +2470,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
        vmx->nested.nested_vmx_secondary_ctls_low = 0;
        vmx->nested.nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_RDTSCP |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -3268,8 +3269,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
                 * default value.
                 */
                if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-                       save->selector &= ~SELECTOR_RPL_MASK;
-               save->dpl = save->selector & SELECTOR_RPL_MASK;
+                       save->selector &= ~SEGMENT_RPL_MASK;
+               save->dpl = save->selector & SEGMENT_RPL_MASK;
                save->s = 1;
        }
        vmx_set_segment(vcpu, save, seg);
@@ -3842,7 +3843,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
        unsigned int cs_rpl;
 
        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-       cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+       cs_rpl = cs.selector & SEGMENT_RPL_MASK;
 
        if (cs.unusable)
                return false;
@@ -3870,7 +3871,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
        unsigned int ss_rpl;
 
        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-       ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+       ss_rpl = ss.selector & SEGMENT_RPL_MASK;
 
        if (ss.unusable)
                return true;
@@ -3892,7 +3893,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
        unsigned int rpl;
 
        vmx_get_segment(vcpu, &var, seg);
-       rpl = var.selector & SELECTOR_RPL_MASK;
+       rpl = var.selector & SEGMENT_RPL_MASK;
 
        if (var.unusable)
                return true;
@@ -3919,7 +3920,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
 
        if (tr.unusable)
                return false;
-       if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
+       if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
                return false;
        if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
                return false;
@@ -3937,7 +3938,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
 
        if (ldtr.unusable)
                return true;
-       if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
+       if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
                return false;
        if (ldtr.type != 2)
                return false;
@@ -3954,8 +3955,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
 
-       return ((cs.selector & SELECTOR_RPL_MASK) ==
-                (ss.selector & SELECTOR_RPL_MASK));
+       return ((cs.selector & SEGMENT_RPL_MASK) ==
+                (ss.selector & SEGMENT_RPL_MASK));
 }
 
 /*
@@ -4711,7 +4712,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(&vmx->vcpu, 0);
        apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
-       if (kvm_vcpu_is_bsp(&vmx->vcpu))
+       if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
                apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
        apic_base_msr.host_initiated = true;
        kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
@@ -5006,7 +5007,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
                if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
                        if (vcpu->arch.halt_request) {
                                vcpu->arch.halt_request = 0;
-                               return kvm_emulate_halt(vcpu);
+                               return kvm_vcpu_halt(vcpu);
                        }
                        return 1;
                }
@@ -5071,6 +5072,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        }
 
        if (is_invalid_opcode(intr_info)) {
+               if (is_guest_mode(vcpu)) {
+                       kvm_queue_exception(vcpu, UD_VECTOR);
+                       return 1;
+               }
                er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5090,9 +5095,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
            !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-               vcpu->run->internal.ndata = 2;
+               vcpu->run->internal.ndata = 3;
                vcpu->run->internal.data[0] = vect_info;
                vcpu->run->internal.data[1] = intr_info;
+               vcpu->run->internal.data[2] = error_code;
                return 0;
        }
 
@@ -5533,13 +5539,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 
 static int handle_halt(struct kvm_vcpu *vcpu)
 {
-       skip_emulated_instruction(vcpu);
        return kvm_emulate_halt(vcpu);
 }
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-       skip_emulated_instruction(vcpu);
        kvm_emulate_hypercall(vcpu);
        return 1;
 }
@@ -5570,7 +5574,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
 
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
-       skip_emulated_instruction(vcpu);
        kvm_emulate_wbinvd(vcpu);
        return 1;
 }
@@ -5828,7 +5831,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        gpa_t gpa;
 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-       if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+       if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                skip_emulated_instruction(vcpu);
                return 1;
        }
@@ -5909,7 +5912,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
-                       ret = kvm_emulate_halt(vcpu);
+                       ret = kvm_vcpu_halt(vcpu);
                        goto out;
                }
 
@@ -7318,21 +7321,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
                else if (port < 0x10000)
                        bitmap = vmcs12->io_bitmap_b;
                else
-                       return 1;
+                       return true;
                bitmap += (port & 0x7fff) / 8;
 
                if (last_bitmap != bitmap)
                        if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
-                               return 1;
+                               return true;
                if (b & (1 << (port & 7)))
-                       return 1;
+                       return true;
 
                port++;
                size--;
                last_bitmap = bitmap;
        }
 
-       return 0;
+       return false;
 }
 
 /*
@@ -7348,7 +7351,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
        gpa_t bitmap;
 
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
-               return 1;
+               return true;
 
        /*
         * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
@@ -7367,10 +7370,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
        if (msr_index < 1024*8) {
                unsigned char b;
                if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
-                       return 1;
+                       return true;
                return 1 & (b >> (msr_index & 7));
        } else
-               return 1; /* let L1 handle the wrong parameter */
+               return true; /* let L1 handle the wrong parameter */
 }
 
 /*
@@ -7392,7 +7395,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
                case 0:
                        if (vmcs12->cr0_guest_host_mask &
                            (val ^ vmcs12->cr0_read_shadow))
-                               return 1;
+                               return true;
                        break;
                case 3:
                        if ((vmcs12->cr3_target_count >= 1 &&
@@ -7403,37 +7406,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
                                        vmcs12->cr3_target_value2 == val) ||
                                (vmcs12->cr3_target_count >= 4 &&
                                        vmcs12->cr3_target_value3 == val))
-                               return 0;
+                               return false;
                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
-                               return 1;
+                               return true;
                        break;
                case 4:
                        if (vmcs12->cr4_guest_host_mask &
                            (vmcs12->cr4_read_shadow ^ val))
-                               return 1;
+                               return true;
                        break;
                case 8:
                        if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
-                               return 1;
+                               return true;
                        break;
                }
                break;
        case 2: /* clts */
                if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
                    (vmcs12->cr0_read_shadow & X86_CR0_TS))
-                       return 1;
+                       return true;
                break;
        case 1: /* mov from cr */
                switch (cr) {
                case 3:
                        if (vmcs12->cpu_based_vm_exec_control &
                            CPU_BASED_CR3_STORE_EXITING)
-                               return 1;
+                               return true;
                        break;
                case 8:
                        if (vmcs12->cpu_based_vm_exec_control &
                            CPU_BASED_CR8_STORE_EXITING)
-                               return 1;
+                               return true;
                        break;
                }
                break;
@@ -7444,14 +7447,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
                 */
                if (vmcs12->cr0_guest_host_mask & 0xe &
                    (val ^ vmcs12->cr0_read_shadow))
-                       return 1;
+                       return true;
                if ((vmcs12->cr0_guest_host_mask & 0x1) &&
                    !(vmcs12->cr0_read_shadow & 0x1) &&
                    (val & 0x1))
-                       return 1;
+                       return true;
                break;
        }
-       return 0;
+       return false;
 }
 
 /*
@@ -7474,48 +7477,48 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                                KVM_ISA_VMX);
 
        if (vmx->nested.nested_run_pending)
-               return 0;
+               return false;
 
        if (unlikely(vmx->fail)) {
                pr_info_ratelimited("%s failed vm entry %x\n", __func__,
                                    vmcs_read32(VM_INSTRUCTION_ERROR));
-               return 1;
+               return true;
        }
 
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
                if (!is_exception(intr_info))
-                       return 0;
+                       return false;
                else if (is_page_fault(intr_info))
                        return enable_ept;
                else if (is_no_device(intr_info) &&
                         !(vmcs12->guest_cr0 & X86_CR0_TS))
-                       return 0;
+                       return false;
                return vmcs12->exception_bitmap &
                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
        case EXIT_REASON_EXTERNAL_INTERRUPT:
-               return 0;
+               return false;
        case EXIT_REASON_TRIPLE_FAULT:
-               return 1;
+               return true;
        case EXIT_REASON_PENDING_INTERRUPT:
                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
        case EXIT_REASON_NMI_WINDOW:
                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
        case EXIT_REASON_TASK_SWITCH:
-               return 1;
+               return true;
        case EXIT_REASON_CPUID:
                if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
-                       return 0;
-               return 1;
+                       return false;
+               return true;
        case EXIT_REASON_HLT:
                return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
        case EXIT_REASON_INVD:
-               return 1;
+               return true;
        case EXIT_REASON_INVLPG:
                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
        case EXIT_REASON_RDPMC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
-       case EXIT_REASON_RDTSC:
+       case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
@@ -7527,7 +7530,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
                 */
-               return 1;
+               return true;
        case EXIT_REASON_CR_ACCESS:
                return nested_vmx_exit_handled_cr(vcpu, vmcs12);
        case EXIT_REASON_DR_ACCESS:
@@ -7538,7 +7541,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_MSR_WRITE:
                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
        case EXIT_REASON_INVALID_STATE:
-               return 1;
+               return true;
        case EXIT_REASON_MWAIT_INSTRUCTION:
                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
        case EXIT_REASON_MONITOR_INSTRUCTION:
@@ -7548,7 +7551,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                        nested_cpu_has2(vmcs12,
                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
        case EXIT_REASON_MCE_DURING_VMENTRY:
-               return 0;
+               return false;
        case EXIT_REASON_TPR_BELOW_THRESHOLD:
                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
        case EXIT_REASON_APIC_ACCESS:
@@ -7557,7 +7560,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_APIC_WRITE:
        case EXIT_REASON_EOI_INDUCED:
                /* apic_write and eoi_induced should exit unconditionally. */
-               return 1;
+               return true;
        case EXIT_REASON_EPT_VIOLATION:
                /*
                 * L0 always deals with the EPT violation. If nested EPT is
@@ -7565,7 +7568,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * missing in the guest EPT table (EPT12), the EPT violation
                 * will be injected with nested_ept_inject_page_fault()
                 */
-               return 0;
+               return false;
        case EXIT_REASON_EPT_MISCONFIG:
                /*
                 * L2 never uses directly L1's EPT, but rather L0's own EPT
@@ -7573,11 +7576,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * (EPT on EPT). So any problems with the structure of the
                 * table is L0's fault.
                 */
-               return 0;
+               return false;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
-               return 1;
+               return true;
        case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
                /*
                 * This should never happen, since it is not possible to
@@ -7587,7 +7590,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
        default:
-               return 1;
+               return true;
        }
 }
 
@@ -8522,6 +8525,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
                                                exec_control);
                        }
                }
+               if (nested && !vmx->rdtscp_enabled)
+                       vmx->nested.nested_vmx_secondary_ctls_high &=
+                               ~SECONDARY_EXEC_RDTSCP;
        }
 
        /* Exposing INVPCID only when PCID is exposed */
@@ -8622,10 +8628,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                                        struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               /* TODO: Also verify bits beyond physical address width are 0 */
-               if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
+               if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
+                   vmcs12->apic_access_addr >> maxphyaddr)
                        return false;
 
                /*
@@ -8641,8 +8648,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
        }
 
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-               /* TODO: Also verify bits beyond physical address width are 0 */
-               if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
+               if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
+                   vmcs12->virtual_apic_page_addr >> maxphyaddr)
                        return false;
 
                if (vmx->nested.virtual_apic_page) /* shouldn't happen */
@@ -8665,7 +8672,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
        }
 
        if (nested_cpu_has_posted_intr(vmcs12)) {
-               if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+               if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
+                   vmcs12->posted_intr_desc_addr >> maxphyaddr)
                        return false;
 
                if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@ -8864,9 +8872,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
                                       unsigned long count_field,
-                                      unsigned long addr_field,
-                                      int maxphyaddr)
+                                      unsigned long addr_field)
 {
+       int maxphyaddr;
        u64 count, addr;
 
        if (vmcs12_read_any(vcpu, count_field, &count) ||
@@ -8876,6 +8884,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
        }
        if (count == 0)
                return 0;
+       maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
                pr_warn_ratelimited(
@@ -8889,19 +8898,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
                                                struct vmcs12 *vmcs12)
 {
-       int maxphyaddr;
-
        if (vmcs12->vm_exit_msr_load_count == 0 &&
            vmcs12->vm_exit_msr_store_count == 0 &&
            vmcs12->vm_entry_msr_load_count == 0)
                return 0; /* Fast path */
-       maxphyaddr = cpuid_maxphyaddr(vcpu);
        if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
-                                       VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+                                       VM_EXIT_MSR_LOAD_ADDR) ||
            nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
-                                       VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+                                       VM_EXIT_MSR_STORE_ADDR) ||
            nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
-                                       VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+                                       VM_ENTRY_MSR_LOAD_ADDR))
                return -EINVAL;
        return 0;
 }
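
For illustration only (not part of the patch): the nested_get_vmcs12_pages() and nested_vmx_check_msr_switch() hunks here replace the old "TODO: verify bits beyond physical address width" with real checks, rejecting any vmcs12-supplied address that is misaligned or has bits set at or above the guest's reported physical address width. A user-space sketch of that validation, assuming maxphyaddr is the width from CPUID 0x80000008 (EAX bits 7:0) and align is a power of two:

#include <stdbool.h>
#include <stdint.h>

/* Accept an address range only if it is aligned and both its first and
 * last byte lie below 1 << maxphyaddr. */
bool gpa_range_valid(uint64_t addr, uint64_t len,
                     uint64_t align, unsigned int maxphyaddr)
{
        if (addr & (align - 1))                      /* misaligned         */
                return false;
        if (addr >> maxphyaddr)                      /* start out of range */
                return false;
        if (len && ((addr + len - 1) >> maxphyaddr)) /* end out of range   */
                return false;
        return true;
}

The MSR switch hunk corresponds to align = 16 and len = count * sizeof(struct vmx_msr_entry); the APIC-page hunks only validate the start address (len = 0 here).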
@@ -9151,8 +9157,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                                 SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                                  SECONDARY_EXEC_APIC_REGISTER_VIRT);
+                                 SECONDARY_EXEC_APIC_REGISTER_VIRT);
                if (nested_cpu_has(vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9385,7 +9392,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        }
 
        if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-               /*TODO: Also verify bits beyond physical address width are 0*/
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
@@ -9524,7 +9530,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        vmcs12->launch_state = 1;
 
        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
-               return kvm_emulate_halt(vcpu);
+               return kvm_vcpu_halt(vcpu);
 
        vmx->nested.nested_run_pending = 1;
 
index 32bf19ef3115f65c9dffc23a655be2763babcaff..e1a81267f3f632e971d1fc63cfe687be2e7c60f3 100644 (file)
@@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+               for (i = 0; i < KVM_NR_DB_REGS; i++)
+                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+       }
+}
+
 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
 {
        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
@@ -1070,19 +1081,19 @@ static void update_pvclock_gtod(struct timekeeper *tk)
        struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
        u64 boot_ns;
 
-       boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
+       boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
 
        write_seqcount_begin(&vdata->seq);
 
        /* copy pvclock gtod data */
-       vdata->clock.vclock_mode        = tk->tkr.clock->archdata.vclock_mode;
-       vdata->clock.cycle_last         = tk->tkr.cycle_last;
-       vdata->clock.mask               = tk->tkr.mask;
-       vdata->clock.mult               = tk->tkr.mult;
-       vdata->clock.shift              = tk->tkr.shift;
+       vdata->clock.vclock_mode        = tk->tkr_mono.clock->archdata.vclock_mode;
+       vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
+       vdata->clock.mask               = tk->tkr_mono.mask;
+       vdata->clock.mult               = tk->tkr_mono.mult;
+       vdata->clock.shift              = tk->tkr_mono.shift;
 
        vdata->boot_ns                  = boot_ns;
-       vdata->nsec_base                = tk->tkr.xtime_nsec;
+       vdata->nsec_base                = tk->tkr_mono.xtime_nsec;
 
        write_seqcount_end(&vdata->seq);
 }
@@ -3149,6 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+       kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
        kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = dbgregs->dr7;
@@ -4114,8 +4126,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
        do {
                n = min(len, 8);
                if (!(vcpu->arch.apic &&
-                     !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
-                   && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+                     !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+                   && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
                handled += n;
                addr += n;
@@ -4134,8 +4146,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
        do {
                n = min(len, 8);
                if (!(vcpu->arch.apic &&
-                     !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
-                   && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+                     !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+                                        addr, n, v))
+                   && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
                trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
                handled += n;
@@ -4475,7 +4488,8 @@ mmio:
        return X86EMUL_CONTINUE;
 }
 
-int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+                       unsigned long addr,
                        void *val, unsigned int bytes,
                        struct x86_exception *exception,
                        const struct read_write_emulator_ops *ops)
@@ -4538,7 +4552,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
                                   exception, &read_emultor);
 }
 
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
                            unsigned long addr,
                            const void *val,
                            unsigned int bytes,
@@ -4629,10 +4643,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
        int r;
 
        if (vcpu->arch.pio.in)
-               r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+               r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
                                    vcpu->arch.pio.size, pd);
        else
-               r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+               r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
                                     vcpu->arch.pio.port, vcpu->arch.pio.size,
                                     pd);
        return r;
@@ -4705,7 +4719,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
        kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
 }
 
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 {
        if (!need_emulate_wbinvd(vcpu))
                return X86EMUL_CONTINUE;
@@ -4722,19 +4736,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
                wbinvd();
        return X86EMUL_CONTINUE;
 }
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+       return kvm_emulate_wbinvd_noskip(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
 
+
+
 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-       kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
+       kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                          unsigned long *dest)
 {
        return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
 }
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                          unsigned long value)
 {
 
        return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
@@ -5816,7 +5840,7 @@ void kvm_arch_exit(void)
        free_percpu(shared_msrs);
 }
 
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.halt_exits;
        if (irqchip_in_kernel(vcpu->kvm)) {
@@ -5827,6 +5851,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
                return 0;
        }
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+       return kvm_vcpu_halt(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@ -5903,7 +5934,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
        lapic_irq.dest_id = apicid;
 
        lapic_irq.delivery_mode = APIC_DM_REMRD;
-       kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+       kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 }
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -5911,6 +5942,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        unsigned long nr, a0, a1, a2, a3, ret;
        int op_64_bit, r = 1;
 
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+
        if (kvm_hv_hypercall_enabled(vcpu->kvm))
                return kvm_hv_hypercall(vcpu);
 
@@ -6164,7 +6197,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 }
 
 /*
- * Returns 1 to let __vcpu_run() continue the guest execution loop without
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
  * exiting to the userspace.  Otherwise, the value will be returned to the
  * userspace.
  */
@@ -6301,6 +6334,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[2], 2);
                set_debugreg(vcpu->arch.eff_db[3], 3);
                set_debugreg(vcpu->arch.dr6, 6);
+               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
@@ -6382,42 +6416,47 @@ out:
        return r;
 }
 
+static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+       if (!kvm_arch_vcpu_runnable(vcpu)) {
+               srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+               kvm_vcpu_block(vcpu);
+               vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+               if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+                       return 1;
+       }
+
+       kvm_apic_accept_events(vcpu);
+       switch(vcpu->arch.mp_state) {
+       case KVM_MP_STATE_HALTED:
+               vcpu->arch.pv.pv_unhalted = false;
+               vcpu->arch.mp_state =
+                       KVM_MP_STATE_RUNNABLE;
+       case KVM_MP_STATE_RUNNABLE:
+               vcpu->arch.apf.halted = false;
+               break;
+       case KVM_MP_STATE_INIT_RECEIVED:
+               break;
+       default:
+               return -EINTR;
+               break;
+       }
+       return 1;
+}
 
-static int __vcpu_run(struct kvm_vcpu *vcpu)
+static int vcpu_run(struct kvm_vcpu *vcpu)
 {
        int r;
        struct kvm *kvm = vcpu->kvm;
 
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
-       r = 1;
-       while (r > 0) {
+       for (;;) {
                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                    !vcpu->arch.apf.halted)
                        r = vcpu_enter_guest(vcpu);
-               else {
-                       srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-                       kvm_vcpu_block(vcpu);
-                       vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-                       if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
-                               kvm_apic_accept_events(vcpu);
-                               switch(vcpu->arch.mp_state) {
-                               case KVM_MP_STATE_HALTED:
-                                       vcpu->arch.pv.pv_unhalted = false;
-                                       vcpu->arch.mp_state =
-                                               KVM_MP_STATE_RUNNABLE;
-                               case KVM_MP_STATE_RUNNABLE:
-                                       vcpu->arch.apf.halted = false;
-                                       break;
-                               case KVM_MP_STATE_INIT_RECEIVED:
-                                       break;
-                               default:
-                                       r = -EINTR;
-                                       break;
-                               }
-                       }
-               }
-
+               else
+                       r = vcpu_block(kvm, vcpu);
                if (r <= 0)
                        break;
 
@@ -6429,6 +6468,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.request_irq_exits;
+                       break;
                }
 
                kvm_check_async_pf_completion(vcpu);
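
For illustration only (not part of the patch): in the new vcpu_block() above, the KVM_MP_STATE_HALTED case intentionally falls through to KVM_MP_STATE_RUNNABLE, because a vcpu that wakes up from HLT is runnable again. A stand-alone sketch of that switch with the fall-through spelled out (names shortened, pv_unhalted handling omitted):

#include <errno.h>
#include <stdbool.h>

enum mp_state { MP_RUNNABLE, MP_HALTED, MP_INIT_RECEIVED, MP_OTHER };

/* Returns 1 to keep the vcpu loop going, negative to exit to userspace. */
int after_wakeup(enum mp_state *state, bool *apf_halted)
{
        switch (*state) {
        case MP_HALTED:
                *state = MP_RUNNABLE;
                /* fall through: waking from HLT makes the vcpu runnable */
        case MP_RUNNABLE:
                *apf_halted = false;
                break;
        case MP_INIT_RECEIVED:
                break;
        default:
                return -EINTR;
        }
        return 1;
}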
@@ -6437,6 +6477,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
+                       break;
                }
                if (need_resched()) {
                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -6568,7 +6609,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-       r = __vcpu_run(vcpu);
+       r = vcpu_run(vcpu);
 
 out:
        post_kvm_run_save(vcpu);
@@ -7075,11 +7116,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
        kvm_clear_exception_queue(vcpu);
 
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+       kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = DR6_INIT;
        kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = DR7_FIXED_1;
        kvm_update_dr7(vcpu);
 
+       vcpu->arch.cr2 = 0;
+
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        vcpu->arch.apf.msr_val = 0;
        vcpu->arch.st.msr_val = 0;
@@ -7240,7 +7284,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.pv.pv_unhalted = false;
        vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -7288,6 +7332,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        vcpu->arch.guest_supported_xcr0 = 0;
        vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
+       vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
        kvm_async_pf_hash_reset(vcpu);
        kvm_pmu_init(vcpu);
 
@@ -7428,7 +7474,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
                if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-                       kvm_kvfree(free->arch.rmap[i]);
+                       kvfree(free->arch.rmap[i]);
                        free->arch.rmap[i] = NULL;
                }
                if (i == 0)
@@ -7436,7 +7482,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
                if (!dont || free->arch.lpage_info[i - 1] !=
                             dont->arch.lpage_info[i - 1]) {
-                       kvm_kvfree(free->arch.lpage_info[i - 1]);
+                       kvfree(free->arch.lpage_info[i - 1]);
                        free->arch.lpage_info[i - 1] = NULL;
                }
        }
@@ -7490,12 +7536,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 out_free:
        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvm_kvfree(slot->arch.rmap[i]);
+               kvfree(slot->arch.rmap[i]);
                slot->arch.rmap[i] = NULL;
                if (i == 0)
                        continue;
 
-               kvm_kvfree(slot->arch.lpage_info[i - 1]);
+               kvfree(slot->arch.lpage_info[i - 1]);
                slot->arch.lpage_info[i - 1] = NULL;
        }
        return -ENOMEM;
@@ -7617,6 +7663,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
        /* It's OK to get 'new' slot here as it has already been installed */
        new = id_to_memslot(kvm->memslots, mem->slot);
 
+       /*
+        * Dirty logging tracks sptes in 4k granularity, meaning that large
+        * sptes have to be split.  If live migration is successful, the guest
+        * in the source machine will be destroyed and large sptes will be
+        * created in the destination. However, if the guest continues to run
+        * in the source machine (for example if live migration fails), small
+        * sptes will remain around and cause bad performance.
+        *
+        * Scan sptes if dirty logging has been stopped, dropping those
+        * which can be collapsed into a single large-page spte.  Later
+        * page faults will create the large-page sptes.
+        */
+       if ((change != KVM_MR_DELETE) &&
+               (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+               !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+               kvm_mmu_zap_collapsible_sptes(kvm, new);
+
        /*
         * Set up write protection and/or dirty logging for the new slot.
         *
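
For illustration only (not part of the patch): the comment added above explains when kvm_mmu_zap_collapsible_sptes(), introduced in the mmu.c hunk earlier in this diff, should run: only when the slot survives the update and dirty logging was just switched off for it. A sketch of that trigger condition, with the enum names standing in for KVM_MR_* and the flag assumed to be bit 0 as in the uapi header:

#include <stdbool.h>
#include <stdint.h>

#define MEM_LOG_DIRTY_PAGES (1u << 0)   /* stands in for KVM_MEM_LOG_DIRTY_PAGES */

enum slot_change { SLOT_CREATE, SLOT_DELETE, SLOT_MOVE, SLOT_FLAGS_ONLY };

/* Collapse small sptes back into huge mappings only when dirty logging
 * was switched off for a slot that still exists afterwards. */
bool want_collapse(enum slot_change change,
                   uint32_t old_flags, uint32_t new_flags)
{
        return change != SLOT_DELETE &&
               (old_flags & MEM_LOG_DIRTY_PAGES) &&
               !(new_flags & MEM_LOG_DIRTY_PAGES);
}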
index ac4453d8520efd5e2080ef6f29cfd7da7b154d61..717908b16037d45957a0ec69b94c8c7d396bfaa6 100644 (file)
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void)
                /* Some systems map "vectors" to interrupts weirdly.  Not us! */
                __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
                if (i != SYSCALL_VECTOR)
-                       set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+                       set_intr_gate(i, irq_entries_start +
+                                       8 * (i - FIRST_EXTERNAL_VECTOR));
        }
 
        /*
@@ -1076,6 +1077,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
 {
        lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
                   THREAD_SIZE / PAGE_SIZE);
+       tss->x86_tss.sp0 = thread->sp0;
 }
 
 /* Let's just say, I wouldn't do debugging under a Guest. */
index f5cc9eb1d51bc02ef817bd664c7810f4f5b928b1..082a85167a5b68feff28d749b0e04425a0f4adf2 100644 (file)
 #include <asm/alternative-asm.h>
 #include <asm/dwarf2.h>
 
-.macro SAVE reg
-       pushl_cfi %\reg
-       CFI_REL_OFFSET \reg, 0
-.endm
-
-.macro RESTORE reg
-       popl_cfi %\reg
-       CFI_RESTORE \reg
-.endm
-
 .macro read64 reg
        movl %ebx, %eax
        movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
 .macro addsub_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
        CFI_STARTPROC
-       SAVE ebp
-       SAVE ebx
-       SAVE esi
-       SAVE edi
+       pushl_cfi_reg ebp
+       pushl_cfi_reg ebx
+       pushl_cfi_reg esi
+       pushl_cfi_reg edi
 
        movl %eax, %esi
        movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
        movl %ebx, %eax
        movl %ecx, %edx
-       RESTORE edi
-       RESTORE esi
-       RESTORE ebx
-       RESTORE ebp
+       popl_cfi_reg edi
+       popl_cfi_reg esi
+       popl_cfi_reg ebx
+       popl_cfi_reg ebp
        ret
        CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
 .macro incdec_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
        CFI_STARTPROC
-       SAVE ebx
+       pushl_cfi_reg ebx
 
        read64 %esi
 1:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
        movl %ebx, %eax
        movl %ecx, %edx
-       RESTORE ebx
+       popl_cfi_reg ebx
        ret
        CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
 
 ENTRY(atomic64_dec_if_positive_cx8)
        CFI_STARTPROC
-       SAVE ebx
+       pushl_cfi_reg ebx
 
        read64 %esi
 1:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
 2:
        movl %ebx, %eax
        movl %ecx, %edx
-       RESTORE ebx
+       popl_cfi_reg ebx
        ret
        CFI_ENDPROC
 ENDPROC(atomic64_dec_if_positive_cx8)
 
 ENTRY(atomic64_add_unless_cx8)
        CFI_STARTPROC
-       SAVE ebp
-       SAVE ebx
+       pushl_cfi_reg ebp
+       pushl_cfi_reg ebx
 /* these just push these two parameters on the stack */
-       SAVE edi
-       SAVE ecx
+       pushl_cfi_reg edi
+       pushl_cfi_reg ecx
 
        movl %eax, %ebp
        movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
 3:
        addl $8, %esp
        CFI_ADJUST_CFA_OFFSET -8
-       RESTORE ebx
-       RESTORE ebp
+       popl_cfi_reg ebx
+       popl_cfi_reg ebp
        ret
 4:
        cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
 
 ENTRY(atomic64_inc_not_zero_cx8)
        CFI_STARTPROC
-       SAVE ebx
+       pushl_cfi_reg ebx
 
        read64 %esi
 1:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
 
        movl $1, %eax
 3:
-       RESTORE ebx
+       popl_cfi_reg ebx
        ret
        CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
index e78b8eee66155df85844e8cf6dae4507fa313bc6..9bc944a9127481ead40689a73054d80e50f0bc10 100644 (file)
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
           */           
 ENTRY(csum_partial)
        CFI_STARTPROC
-       pushl_cfi %esi
-       CFI_REL_OFFSET esi, 0
-       pushl_cfi %ebx
-       CFI_REL_OFFSET ebx, 0
+       pushl_cfi_reg esi
+       pushl_cfi_reg ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: unsigned char *buff
@@ -127,14 +125,12 @@ ENTRY(csum_partial)
 6:     addl %ecx,%eax
        adcl $0, %eax 
 7:     
-       testl $1, 12(%esp)
+       testb $1, 12(%esp)
        jz 8f
        roll $8, %eax
 8:
-       popl_cfi %ebx
-       CFI_RESTORE ebx
-       popl_cfi %esi
-       CFI_RESTORE esi
+       popl_cfi_reg ebx
+       popl_cfi_reg esi
        ret
        CFI_ENDPROC
 ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
 
 ENTRY(csum_partial)
        CFI_STARTPROC
-       pushl_cfi %esi
-       CFI_REL_OFFSET esi, 0
-       pushl_cfi %ebx
-       CFI_REL_OFFSET ebx, 0
+       pushl_cfi_reg esi
+       pushl_cfi_reg ebx
        movl 20(%esp),%eax      # Function arg: unsigned int sum
        movl 16(%esp),%ecx      # Function arg: int len
        movl 12(%esp),%esi      # Function arg: const unsigned char *buf
@@ -251,14 +245,12 @@ ENTRY(csum_partial)
        addl %ebx,%eax
        adcl $0,%eax
 80: 
-       testl $1, 12(%esp)
+       testb $1, 12(%esp)
        jz 90f
        roll $8, %eax
 90: 
-       popl_cfi %ebx
-       CFI_RESTORE ebx
-       popl_cfi %esi
-       CFI_RESTORE esi
+       popl_cfi_reg ebx
+       popl_cfi_reg esi
        ret
        CFI_ENDPROC
 ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
        CFI_STARTPROC
        subl  $4,%esp   
        CFI_ADJUST_CFA_OFFSET 4
-       pushl_cfi %edi
-       CFI_REL_OFFSET edi, 0
-       pushl_cfi %esi
-       CFI_REL_OFFSET esi, 0
-       pushl_cfi %ebx
-       CFI_REL_OFFSET ebx, 0
+       pushl_cfi_reg edi
+       pushl_cfi_reg esi
+       pushl_cfi_reg ebx
        movl ARGBASE+16(%esp),%eax      # sum
        movl ARGBASE+12(%esp),%ecx      # len
        movl ARGBASE+4(%esp),%esi       # src
@@ -412,12 +401,9 @@ DST(       movb %cl, (%edi)        )
 
 .previous
 
-       popl_cfi %ebx
-       CFI_RESTORE ebx
-       popl_cfi %esi
-       CFI_RESTORE esi
-       popl_cfi %edi
-       CFI_RESTORE edi
+       popl_cfi_reg ebx
+       popl_cfi_reg esi
+       popl_cfi_reg edi
        popl_cfi %ecx                   # equivalent to addl $4,%esp
        ret     
        CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
                
 ENTRY(csum_partial_copy_generic)
        CFI_STARTPROC
-       pushl_cfi %ebx
-       CFI_REL_OFFSET ebx, 0
-       pushl_cfi %edi
-       CFI_REL_OFFSET edi, 0
-       pushl_cfi %esi
-       CFI_REL_OFFSET esi, 0
+       pushl_cfi_reg ebx
+       pushl_cfi_reg edi
+       pushl_cfi_reg esi
        movl ARGBASE+4(%esp),%esi       #src
        movl ARGBASE+8(%esp),%edi       #dst    
        movl ARGBASE+12(%esp),%ecx      #len
@@ -506,12 +489,9 @@ DST(       movb %dl, (%edi)         )
        jmp  7b                 
 .previous                              
 
-       popl_cfi %esi
-       CFI_RESTORE esi
-       popl_cfi %edi
-       CFI_RESTORE edi
-       popl_cfi %ebx
-       CFI_RESTORE ebx
+       popl_cfi_reg esi
+       popl_cfi_reg edi
+       popl_cfi_reg ebx
        ret
        CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
index f2145cfa12a66830e834718340c4cc88e64a731a..e67e579c93bdf7f3d0737565ea106edeeefb3b6d 100644 (file)
@@ -1,31 +1,35 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
 /*
- * Zero a page.        
- * rdi page
- */                    
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi        - page
+ */
+ENTRY(clear_page)
        CFI_STARTPROC
+
+       ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+                     "jmp clear_page_c_e", X86_FEATURE_ERMS
+
        movl $4096/8,%ecx
        xorl %eax,%eax
        rep stosq
        ret
        CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
        CFI_STARTPROC
-       movl $4096,%ecx
-       xorl %eax,%eax
-       rep stosb
-       ret
-       CFI_ENDPROC
-ENDPROC(clear_page_c_e)
 
-ENTRY(clear_page)
-       CFI_STARTPROC
        xorl   %eax,%eax
        movl   $4096/64,%ecx
        .p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
        nop
        ret
        CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
-       /*
-        * Some CPUs support enhanced REP MOVSB/STOSB instructions.
-        * It is recommended to use this when possible.
-        * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
-        * Otherwise, use original function.
-        *
-        */
+ENDPROC(clear_page_orig)
 
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (clear_page_c - clear_page) - (2f - 1b)   /* offset */
-2:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
-3:
-       .previous
-       .section .altinstructions,"a"
-       altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
-                            .Lclear_page_end-clear_page, 2b-1b
-       altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,   \
-                            .Lclear_page_end-clear_page,3b-2b
-       .previous
+ENTRY(clear_page_c_e)
+       CFI_STARTPROC
+       movl $4096,%ecx
+       xorl %eax,%eax
+       rep stosb
+       ret
+       CFI_ENDPROC
+ENDPROC(clear_page_c_e)
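
The ALTERNATIVE_2 line above encodes a three-way, boot-time choice between the
variants that follow. As a rough illustration only (plain C with made-up
feature flags and stand-in bodies -- the real selection is done by the
alternatives patching framework, not by run-time branches), the effective
priority is ERMS first, then REP_GOOD, then the original unrolled loop:

    #include <stdbool.h>
    #include <string.h>

    /* Stand-ins for the three variants in the hunk above. */
    static void clear_page_c_e(void *page)  { memset(page, 0, 4096); } /* rep stosb     */
    static void clear_page_rep(void *page)  { memset(page, 0, 4096); } /* rep stosq     */
    static void clear_page_orig(void *page) { memset(page, 0, 4096); } /* unrolled loop */

    /*
     * The ERMS alternative is listed last, so it wins over REP_GOOD;
     * with neither feature the default "jmp clear_page_orig" stays in place.
     */
    static void clear_page_sketch(void *page, bool has_rep_good, bool has_erms)
    {
        if (has_erms)
            clear_page_c_e(page);
        else if (has_rep_good)
            clear_page_rep(page);
        else
            clear_page_orig(page);
    }

    int main(void)
    {
        static char page[4096];

        clear_page_sketch(page, true, false);  /* REP_GOOD-only CPU: rep stosq path */
        return 0;
    }
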
index 176cca67212b7072687f42450ef56018c03ebaaa..8239dbcbf98455a99a125953392114f07a09aa74 100644 (file)
@@ -2,23 +2,26 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use it when possible. But don't use a streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. The prefetch distance
+ * could be varied based on SMP/UP.
+ */
        ALIGN
-copy_page_rep:
+ENTRY(copy_page)
        CFI_STARTPROC
+       ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
        movl    $4096/8, %ecx
        rep     movsq
        ret
        CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- *  Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- *  Could vary the prefetch distance based on SMP/UP.
-*/
+ENDPROC(copy_page)
 
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
        CFI_STARTPROC
        subq    $2*8,   %rsp
        CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
        addq    $2*8, %rsp
        CFI_ADJUST_CFA_OFFSET -2*8
        ret
-.Lcopy_page_end:
        CFI_ENDPROC
-ENDPROC(copy_page)
-
-       /* Some CPUs run faster using the string copy instructions.
-          It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-       .section .altinstr_replacement,"ax"
-1:     .byte 0xeb                                      /* jmp <disp8> */
-       .byte (copy_page_rep - copy_page) - (2f - 1b)   /* offset */
-2:
-       .previous
-       .section .altinstructions,"a"
-       altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,       \
-               .Lcopy_page_end-copy_page, 2b-1b
-       .previous
+ENDPROC(copy_page_regs)
index dee945d555941a078f40049b24b8b4731aba6f2c..fa997dfaef242fa9abdb28c20658a939caf72697 100644 (file)
@@ -8,9 +8,6 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
-       .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
-       .byte 0xe9      /* 32bit jump */
-       .long \orig-1f  /* by default jump to orig */
-1:
-       .section .altinstr_replacement,"ax"
-2:     .byte 0xe9                      /* near jump with 32bit immediate */
-       .long \alt1-1b /* offset */   /* or alternatively to alt1 */
-3:     .byte 0xe9                      /* near jump with 32bit immediate */
-       .long \alt2-1b /* offset */   /* or alternatively to alt2 */
-       .previous
-
-       .section .altinstructions,"a"
-       altinstruction_entry 0b,2b,\feature1,5,5
-       altinstruction_entry 0b,3b,\feature2,5,5
-       .previous
-       .endm
-
        .macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
        /* check for bad alignment of destination */
        movl %edi,%ecx
        andl $7,%ecx
@@ -67,7 +38,6 @@
 
        _ASM_EXTABLE(100b,103b)
        _ASM_EXTABLE(101b,103b)
-#endif
        .endm
 
 /* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
        jc bad_to_user
        cmpq TI_addr_limit(%rax),%rcx
        ja bad_to_user
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
-               copy_user_generic_unrolled,copy_user_generic_string,    \
-               copy_user_enhanced_fast_string
+       ALTERNATIVE_2 "jmp copy_user_generic_unrolled",         \
+                     "jmp copy_user_generic_string",           \
+                     X86_FEATURE_REP_GOOD,                     \
+                     "jmp copy_user_enhanced_fast_string",     \
+                     X86_FEATURE_ERMS
        CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
        jc bad_from_user
        cmpq TI_addr_limit(%rax),%rcx
        ja bad_from_user
-       ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
-               copy_user_generic_unrolled,copy_user_generic_string,    \
-               copy_user_enhanced_fast_string
+       ALTERNATIVE_2 "jmp copy_user_generic_unrolled",         \
+                     "jmp copy_user_generic_string",           \
+                     X86_FEATURE_REP_GOOD,                     \
+                     "jmp copy_user_enhanced_fast_string",     \
+                     X86_FEATURE_ERMS
        CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
index 2419d5fefae30ac2453fabf71c633ede013d2e4a..9734182966f3be925a38c0762aa71e3148156148 100644 (file)
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
 
        /* handle last odd byte */
 .Lhandle_1:
-       testl $1, %r10d
+       testb $1, %r10b
        jz    .Lende
        xorl  %ebx, %ebx
        source
index 1313ae6b478b6c439741ee032a8c33b86868ee2c..8f72b334aea03387ab7084d9998f497cf7287da2 100644 (file)
  */
 void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
 {
+       /*
+        * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
+        * even if the input buffer is long enough to hold them.
+        */
+       if (buf_len > MAX_INSN_SIZE)
+               buf_len = MAX_INSN_SIZE;
+
        memset(insn, 0, sizeof(*insn));
        insn->kaddr = kaddr;
        insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@ found:
                                /* VEX.W overrides opnd_size */
                                insn->opnd_bytes = 8;
                } else {
+                       /*
+                        * For VEX2, fake VEX3-like byte#2.
+                        * Makes it easier to decode vex.W, vex.vvvv,
+                        * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
+                        */
+                       insn->vex_prefix.bytes[2] = b2 & 0x7f;
                        insn->vex_prefix.nbytes = 2;
                        insn->next_byte += 2;
                }
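
For context, a minimal sketch of how these decoder entry points are typically
driven (the wrapper function and its arguments are hypothetical; only
insn_init(), insn_get_length() and the struct insn fields come from the
decoder itself). With the new clamp, passing an over-long buffer length is
harmless: anything beyond MAX_INSN_SIZE is ignored.

    #include <asm/insn.h>

    /* Hypothetical helper: decode one instruction at 'kaddr', return its length. */
    static int decoded_length(const void *kaddr, int bytes_available)
    {
        struct insn insn;

        /* buf_len larger than MAX_INSN_SIZE is now clamped inside insn_init(). */
        insn_init(&insn, kaddr, bytes_available, 1 /* x86-64 */);
        insn_get_length(&insn);

        return insn.length;
    }
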
index 89b53c9968e7d50c1dd129dd92272f1a47edefe5..b046664f5a1ccf37f3c94b2ed7d8d5b3298e89bf 100644 (file)
@@ -1,11 +1,19 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are changed
+ * to a jmp to memcpy_erms, which does the copy with REP MOVSB.
+ */
+
+.weak memcpy
+
 /*
  * memcpy - Copy a memory block.
  *
  * Output:
  * rax original destination
  */
+ENTRY(__memcpy)
+ENTRY(memcpy)
+       ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+                     "jmp memcpy_erms", X86_FEATURE_ERMS
 
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        movl %edx, %ecx
        rep movsb
        ret
-.Lmemcpy_e:
-       .previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
 
 /*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
  */
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
-.Lmemcpy_e_e:
-       .previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
 
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
        CFI_STARTPROC
        movq %rdi, %rax
 
@@ -183,26 +179,4 @@ ENTRY(memcpy)
 .Lend:
        retq
        CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-       /*
-        * Some CPUs are adding enhanced REP MOVSB/STOSB feature
-        * If the feature is supported, memcpy_c_e() is the first choice.
-        * If enhanced rep movsb copy is not available, use fast string copy
-        * memcpy_c() when possible. This is faster and code is simpler than
-        * original memcpy().
-        * Otherwise, original memcpy() is used.
-        * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-        *
-        * Replace only beginning, memcpy is used to apply alternatives,
-        * so it is silly to overwrite itself with nops - reboot is the
-        * only outcome...
-        */
-       .section .altinstructions, "a"
-       altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
-                            .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-       altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
-                            .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
-       .previous
+ENDPROC(memcpy_orig)
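
The ".weak memcpy" directive above (now placed before the ENTRY lines) keeps
the generic implementation overridable: a strong definition of the same symbol
elsewhere in the image wins at link time. The same mechanism in C, as a small
standalone illustration (the symbol name is made up):

    #include <stdio.h>

    /* Weak default: replaced silently if a non-weak do_copy() is linked in. */
    __attribute__((weak)) void do_copy(void)
    {
        puts("generic copy");
    }

    int main(void)
    {
        do_copy();
        return 0;
    }

Compiled on its own this prints "generic copy"; linking in another object that
defines a strong do_copy() substitutes that version without any source change.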
index 9c4b530575da6e9b70cb8505f1e4b391b6fdff02..0f8a0d0331b91715238f01e8f54ce74525fb0cc4 100644 (file)
@@ -5,7 +5,6 @@
  * This assembly file is re-written from memmove_64.c file.
  *     - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
-#define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
        jg 2f
 
 .Lmemmove_begin_forward:
+       ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
        /*
         * the movsq instruction has a high startup latency, so we handle
         * small sizes with general-purpose registers.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
 13:
        retq
        CFI_ENDPROC
-
-       .section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
-       /* Forward moving data. */
-       movq %rdx, %rcx
-       rep movsb
-       retq
-.Lmemmove_end_forward_efs:
-       .previous
-
-       .section .altinstructions,"a"
-       altinstruction_entry .Lmemmove_begin_forward,           \
-               .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,   \
-               .Lmemmove_end_forward-.Lmemmove_begin_forward,  \
-               .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
-       .previous
 ENDPROC(__memmove)
 ENDPROC(memmove)
index 6f44935c6a606a1607022e889cc60ea151e54b65..93118fb239762ba78efd754d20a756aed0221411 100644 (file)
@@ -5,19 +5,30 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+.weak memset
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
  * simpler and shorter than the original function as well.
- *     
+ *
  * rdi   destination
- * rsi   value (char) 
- * rdx   count (bytes) 
- * 
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
  * rax   original destination
- */    
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+       /*
+        * Some CPUs support the enhanced REP MOVSB/STOSB feature. It is
+        * recommended to use it when possible. If not available, use fast
+        * string instructions.
+        *
+        * Otherwise, use the original memset function.
+        */
+       ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+                     "jmp memset_erms", X86_FEATURE_ERMS
+
        movq %rdi,%r9
        movq %rdx,%rcx
        andl $7,%edx
@@ -31,8 +42,8 @@
        rep stosb
        movq %r9,%rax
        ret
-.Lmemset_e:
-       .previous
+ENDPROC(memset)
+ENDPROC(__memset)
 
 /*
  * ISO C memset - set a memory block to a byte value. This function uses
  *
  * rax   original destination
  */
-       .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
        movq %rdi,%r9
        movb %sil,%al
        movq %rdx,%rcx
        rep stosb
        movq %r9,%rax
        ret
-.Lmemset_e_e:
-       .previous
-
-.weak memset
+ENDPROC(memset_erms)
 
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
        CFI_STARTPROC
        movq %rdi,%r10
 
@@ -134,23 +140,4 @@ ENTRY(__memset)
        jmp .Lafter_bad_alignment
 .Lfinal:
        CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-       /* Some CPUs support enhanced REP MOVSB/STOSB feature.
-        * It is recommended to use this when possible.
-        *
-        * If enhanced REP MOVSB/STOSB feature is not available, use fast string
-        * instructions.
-        *
-        * Otherwise, use original memset function.
-        *
-        * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-        */
-       .section .altinstructions,"a"
-       altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-                            .Lfinal-__memset,.Lmemset_e-.Lmemset_c
-       altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-                            .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
-       .previous
+ENDPROC(memset_orig)
index f6d13eefad1063d5e525dbc0cfff1ccc12b34bc9..3ca5218fbece0c568ed6079ee2370ff1cedf71f8 100644 (file)
@@ -14,8 +14,8 @@
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
        CFI_STARTPROC
-       pushq_cfi %rbx
-       pushq_cfi %rbp
+       pushq_cfi_reg rbx
+       pushq_cfi_reg rbp
        movq    %rdi, %r10      /* Save pointer */
        xorl    %r11d, %r11d    /* Return value */
        movl    (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
        movl    %ebp, 20(%r10)
        movl    %esi, 24(%r10)
        movl    %edi, 28(%r10)
-       popq_cfi %rbp
-       popq_cfi %rbx
+       popq_cfi_reg rbp
+       popq_cfi_reg rbx
        ret
 3:
        CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
        CFI_STARTPROC
-       pushl_cfi %ebx
-       pushl_cfi %ebp
-       pushl_cfi %esi
-       pushl_cfi %edi
+       pushl_cfi_reg ebx
+       pushl_cfi_reg ebp
+       pushl_cfi_reg esi
+       pushl_cfi_reg edi
        pushl_cfi $0              /* Return value */
        pushl_cfi %eax
        movl    4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
        movl    %esi, 24(%eax)
        movl    %edi, 28(%eax)
        popl_cfi %eax
-       popl_cfi %edi
-       popl_cfi %esi
-       popl_cfi %ebp
-       popl_cfi %ebx
+       popl_cfi_reg edi
+       popl_cfi_reg esi
+       popl_cfi_reg ebp
+       popl_cfi_reg ebx
        ret
 3:
        CFI_RESTORE_STATE
index 5dff5f042468acfedf6e4058643855d59c75315d..2322abe4da3b014aef389c03e177c37eec31b802 100644 (file)
  */
 
 #define save_common_regs \
-       pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+       pushl_cfi_reg ecx
 
 #define restore_common_regs \
-       popl_cfi %ecx; CFI_RESTORE ecx
+       popl_cfi_reg ecx
 
        /* Avoid uglifying the argument copying x86-64 needs to do. */
        .macro movq src, dst
  */
 
 #define save_common_regs \
-       pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
-       pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
-       pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
-       pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
-       pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
-       pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
-       pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+       pushq_cfi_reg rdi; \
+       pushq_cfi_reg rsi; \
+       pushq_cfi_reg rcx; \
+       pushq_cfi_reg r8;  \
+       pushq_cfi_reg r9;  \
+       pushq_cfi_reg r10; \
+       pushq_cfi_reg r11
 
 #define restore_common_regs \
-       popq_cfi %r11; CFI_RESTORE r11; \
-       popq_cfi %r10; CFI_RESTORE r10; \
-       popq_cfi %r9;  CFI_RESTORE r9; \
-       popq_cfi %r8;  CFI_RESTORE r8; \
-       popq_cfi %rcx; CFI_RESTORE rcx; \
-       popq_cfi %rsi; CFI_RESTORE rsi; \
-       popq_cfi %rdi; CFI_RESTORE rdi
+       popq_cfi_reg r11; \
+       popq_cfi_reg r10; \
+       popq_cfi_reg r9; \
+       popq_cfi_reg r8; \
+       popq_cfi_reg rcx; \
+       popq_cfi_reg rsi; \
+       popq_cfi_reg rdi
 
 #endif
 
 ENTRY(call_rwsem_down_read_failed)
        CFI_STARTPROC
        save_common_regs
-       __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
-       CFI_REL_OFFSET __ASM_REG(dx), 0
+       __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
        movq %rax,%rdi
        call rwsem_down_read_failed
-       __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
-       CFI_RESTORE __ASM_REG(dx)
+       __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
        restore_common_regs
        ret
        CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
 ENTRY(call_rwsem_downgrade_wake)
        CFI_STARTPROC
        save_common_regs
-       __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
-       CFI_REL_OFFSET __ASM_REG(dx), 0
+       __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
        movq %rax,%rdi
        call rwsem_downgrade_wake
-       __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
-       CFI_RESTORE __ASM_REG(dx)
+       __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
        restore_common_regs
        ret
        CFI_ENDPROC
index e28cdaf5ac2c629cd8bca126a55a45745e403534..5eb715087b804e7d9cad520b46488c0270740d7c 100644 (file)
        .globl \name
 \name:
        CFI_STARTPROC
-       pushl_cfi %eax
-       CFI_REL_OFFSET eax, 0
-       pushl_cfi %ecx
-       CFI_REL_OFFSET ecx, 0
-       pushl_cfi %edx
-       CFI_REL_OFFSET edx, 0
+       pushl_cfi_reg eax
+       pushl_cfi_reg ecx
+       pushl_cfi_reg edx
 
        .if \put_ret_addr_in_eax
        /* Place EIP in the arg1 */
        .endif
 
        call \func
-       popl_cfi %edx
-       CFI_RESTORE edx
-       popl_cfi %ecx
-       CFI_RESTORE ecx
-       popl_cfi %eax
-       CFI_RESTORE eax
+       popl_cfi_reg edx
+       popl_cfi_reg ecx
+       popl_cfi_reg eax
        ret
        CFI_ENDPROC
        _ASM_NOKPROBE(\name)
index b30b5ebd614ada18d25b3fb7403f7a32e32a96d9..f89ba4e93025dec7c45480600a0ef567d4696e0a 100644 (file)
        CFI_STARTPROC
 
        /* this one pushes 9 elems, the next one would be %rIP */
-       SAVE_ARGS
+       pushq_cfi_reg rdi
+       pushq_cfi_reg rsi
+       pushq_cfi_reg rdx
+       pushq_cfi_reg rcx
+       pushq_cfi_reg rax
+       pushq_cfi_reg r8
+       pushq_cfi_reg r9
+       pushq_cfi_reg r10
+       pushq_cfi_reg r11
 
        .if \put_ret_addr_in_rdi
+       /* 9*8(%rsp) is return addr on stack */
        movq_cfi_restore 9*8, rdi
        .endif
 
 #endif
 #endif
 
-       /* SAVE_ARGS below is used only for the .cfi directives it contains. */
+#if defined(CONFIG_TRACE_IRQFLAGS) \
+ || defined(CONFIG_DEBUG_LOCK_ALLOC) \
+ || defined(CONFIG_PREEMPT)
        CFI_STARTPROC
-       SAVE_ARGS
+       CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-       RESTORE_ARGS
+       popq_cfi_reg r11
+       popq_cfi_reg r10
+       popq_cfi_reg r9
+       popq_cfi_reg r8
+       popq_cfi_reg rax
+       popq_cfi_reg rcx
+       popq_cfi_reg rdx
+       popq_cfi_reg rsi
+       popq_cfi_reg rdi
        ret
        CFI_ENDPROC
        _ASM_NOKPROBE(restore)
+#endif
index c905e89e19feb5ff79778a11a162dfbaa88852a9..1f33b3d1fd68239c9bb07840d287bcee5d2d26b8 100644 (file)
@@ -69,21 +69,20 @@ EXPORT_SYMBOL(copy_in_user);
  * it is not necessary to optimize tail handling.
  */
 __visible unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+copy_user_handle_tail(char *to, char *from, unsigned len)
 {
-       char c;
-       unsigned zero_len;
-
        for (; len; --len, to++) {
+               char c;
+
                if (__get_user_nocheck(c, from++, sizeof(char)))
                        break;
                if (__put_user_nocheck(c, to, sizeof(char)))
                        break;
        }
-
-       for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
-               if (__put_user_nocheck(c, to++, sizeof(char)))
-                       break;
        clac();
+
+       /* If the destination is a kernel buffer, we always clear the end */
+       if ((unsigned long)to >= TASK_SIZE_MAX)
+               memset(to, 0, len);
        return len;
 }
index 1a2be7c6895d811be12083b5dab49f92cfb8791f..816488c0b97e3540a59af5c5f4c001fed1f5ba41 100644 (file)
@@ -273,6 +273,9 @@ dd: ESC
 de: ESC
 df: ESC
 # 0xe0 - 0xef
+# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
+# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
+# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
 e0: LOOPNE/LOOPNZ Jb (f64)
 e1: LOOPE/LOOPZ Jb (f64)
 e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
 e5: IN eAX,Ib
 e6: OUT Ib,AL
 e7: OUT Ib,eAX
+# With a 0x66 prefix in 64-bit mode, the immediate offset in "near"
+# jumps and calls is 16-bit on AMD CPUs. For CALL, the push of the
+# return address is 16 bits wide and RSP is decremented by 2,
+# but RSP is not truncated to 16 bits, unlike RIP.
 e8: CALL Jz (f64)
 e9: JMP-near Jz (f64)
 ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
 # 0x0f 0x80-0x8f
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
 80: JO Jz (f64)
 81: JNO Jz (f64)
 82: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
 GrpTable: Grp5
 0: INC Ev
 1: DEC Ev
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
 2: CALLN Ev (f64)
 3: CALLF Ep
 4: JMPN Ev (f64)
index ede025fb46f137ed7576cd5a06547264356c25b0..181c53bac3a7ee8881b8844bae66b951d9beecde 100644 (file)
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
        int ret = 0;
 
        /* kprobe_running() needs smp_processor_id() */
-       if (kprobes_built_in() && !user_mode_vm(regs)) {
+       if (kprobes_built_in() && !user_mode(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
        instr = (void *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;
 
-       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
                return 0;
 
        while (instr < max_instr) {
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
        if (error_code & PF_USER)
                return false;
 
-       if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
+       if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
                return false;
 
        return true;
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet:
         */
-       if (user_mode_vm(regs)) {
+       if (user_mode(regs)) {
                local_irq_enable();
                error_code |= PF_USER;
                flags |= FAULT_FLAG_USER;
index a110efca6d068f7d881f8c1d955d8a6906457f64..1d553186c4345c02be5c0152764d988cfe365ae1 100644 (file)
 
 /*
  * Tables translating between page_cache_type_t and pte encoding.
- * Minimal supported modes are defined statically, modified if more supported
- * cache modes are available.
- * Index into __cachemode2pte_tbl is the cachemode.
- * Index into __pte2cachemode_tbl are the caching attribute bits of the pte
- * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ *
+ * Minimal supported modes are defined statically; they are modified
+ * during bootup if more supported cache modes are available.
+ *
+ *   Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
-       [_PAGE_CACHE_MODE_WB]           = 0,
-       [_PAGE_CACHE_MODE_WC]           = _PAGE_PWT,
-       [_PAGE_CACHE_MODE_UC_MINUS]     = _PAGE_PCD,
-       [_PAGE_CACHE_MODE_UC]           = _PAGE_PCD | _PAGE_PWT,
-       [_PAGE_CACHE_MODE_WT]           = _PAGE_PCD,
-       [_PAGE_CACHE_MODE_WP]           = _PAGE_PCD,
+       [_PAGE_CACHE_MODE_WB      ]     = 0         | 0        ,
+       [_PAGE_CACHE_MODE_WC      ]     = _PAGE_PWT | 0        ,
+       [_PAGE_CACHE_MODE_UC_MINUS]     = 0         | _PAGE_PCD,
+       [_PAGE_CACHE_MODE_UC      ]     = _PAGE_PWT | _PAGE_PCD,
+       [_PAGE_CACHE_MODE_WT      ]     = 0         | _PAGE_PCD,
+       [_PAGE_CACHE_MODE_WP      ]     = 0         | _PAGE_PCD,
 };
 EXPORT_SYMBOL(__cachemode2pte_tbl);
+
 uint8_t __pte2cachemode_tbl[8] = {
-       [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
-       [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
-       [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS,
-       [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC,
-       [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
-       [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
-       [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+       [__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
+       [__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_WC,
+       [__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
+       [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
+       [__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+       [__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+       [__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
 EXPORT_SYMBOL(__pte2cachemode_tbl);
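
The realigned initializers make the bit pattern behind each table index
explicit. For illustration, a standalone sketch of the index computation the
comment describes -- the PWT, PCD and PAT bits of a PTE packed into index bit
positions 0, 1 and 2 (the PTE bit numbers below are the conventional x86 ones,
assumed here only to make the example self-contained):

    #include <stdio.h>

    /* Conventional x86 PTE attribute bits (illustrative values). */
    #define _PAGE_PWT (1UL << 3)
    #define _PAGE_PCD (1UL << 4)
    #define _PAGE_PAT (1UL << 7)

    /* Pack PWT/PCD/PAT into index bits 0/1/2, as the comment above describes. */
    static unsigned int pte2cm_idx(unsigned long pte)
    {
        return (!!(pte & _PAGE_PWT) << 0) |
               (!!(pte & _PAGE_PCD) << 1) |
               (!!(pte & _PAGE_PAT) << 2);
    }

    int main(void)
    {
        /* _PAGE_PWT | _PAGE_PCD gives index 3, which the table maps to UC. */
        printf("%u\n", pte2cm_idx(_PAGE_PWT | _PAGE_PCD));
        return 0;
    }
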
@@ -131,21 +135,7 @@ void  __init early_alloc_pgt_buf(void)
 
 int after_bootmem;
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-                               = 1
-#endif
-;
-
-static void __init init_gbpages(void)
-{
-#ifdef CONFIG_X86_64
-       if (direct_gbpages && cpu_has_gbpages)
-               printk(KERN_INFO "Using GB pages for direct mapping\n");
-       else
-               direct_gbpages = 0;
-#endif
-}
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
 
 struct map_range {
        unsigned long start;
@@ -157,16 +147,12 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
-       init_gbpages();
-
 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
        /*
         * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
         * This will simplify cpa(), which otherwise needs to support splitting
         * large pages into small in interrupt context, etc.
         */
-       if (direct_gbpages)
-               page_size_mask |= 1 << PG_LEVEL_1G;
        if (cpu_has_pse)
                page_size_mask |= 1 << PG_LEVEL_2M;
 #endif
@@ -179,6 +165,15 @@ static void __init probe_page_size_mask(void)
        if (cpu_has_pge) {
                cr4_set_bits_and_update_boot(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
+       } else
+               __supported_pte_mask &= ~_PAGE_GLOBAL;
+
+       /* Enable 1 GB linear kernel mappings if available: */
+       if (direct_gbpages && cpu_has_gbpages) {
+               printk(KERN_INFO "Using GB pages for direct mapping\n");
+               page_size_mask |= 1 << PG_LEVEL_1G;
+       } else {
+               direct_gbpages = 0;
        }
 }
 
index 30eb05ae7061624f7faee0077ce41bddb4991ef7..3fba623e3ba558553d9740d3a994343a2960428f 100644 (file)
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
        return 0;
 }
 
-static int __init parse_direct_gbpages_off(char *arg)
-{
-       direct_gbpages = 0;
-       return 0;
-}
-early_param("nogbpages", parse_direct_gbpages_off);
-
-static int __init parse_direct_gbpages_on(char *arg)
-{
-       direct_gbpages = 1;
-       return 0;
-}
-early_param("gbpages", parse_direct_gbpages_on);
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
index cd4785bbacb9a86366ccd74121e5161b078e9596..4053bb58bf92e6c328936aed1c8439b528468e1d 100644 (file)
@@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void)
                                  &memblock.reserved, mb->nid);
        }
 
-       /* Mark all kernel nodes. */
+       /*
+        * Mark all kernel nodes.
+        *
+        * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
+        * may not include all the memblock.reserved memory ranges because
+        * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+        */
        for_each_memblock(reserved, r)
-               node_set(r->nid, numa_kernel_nodes);
+               if (r->nid != MAX_NUMNODES)
+                       node_set(r->nid, numa_kernel_nodes);
 
        /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
        for (i = 0; i < numa_meminfo.nr_blks; i++) {
index 536ea2fb6e335677df559390520c3312976dcb81..89af288ec6740cfd793a4c4804e939bb023e9453 100644 (file)
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m)
        seq_printf(m, "DirectMap4M:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 12);
 #endif
-#ifdef CONFIG_X86_64
        if (direct_gbpages)
                seq_printf(m, "DirectMap1G:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_1G] << 20);
-#endif
 }
 #else
 static inline void split_page_count(int level) { }
@@ -1654,13 +1652,11 @@ int set_memory_ro(unsigned long addr, int numpages)
 {
        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
 }
-EXPORT_SYMBOL_GPL(set_memory_ro);
 
 int set_memory_rw(unsigned long addr, int numpages)
 {
        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
 }
-EXPORT_SYMBOL_GPL(set_memory_rw);
 
 int set_memory_np(unsigned long addr, int numpages)
 {
index 7ac68698406c3b35e5ce0b0e98c73c5441e869a3..35af6771a95ad6c126cca1f352b896998116876e 100644 (file)
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
-/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 {
        return 1;
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 
        while (cursor < to) {
                if (!devmem_is_allowed(pfn)) {
-                       printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
-                               current->comm, from, to - 1);
+                       printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+                              current->comm, from, to - 1);
                        return 0;
                }
                cursor += PAGE_SIZE;
index 7b22adaad4f1379a1195aec23b38a28a8db83f5c..5a7e5252c878bdcd2e45a11257adc15763cdea5f 100644 (file)
@@ -275,12 +275,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
        }
 }
 
+/*
+ * Xen paravirt assumes that the pgd table occupies one whole page, and the
+ * 64-bit kernel makes the same assumption.
+ *
+ * But a kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for the pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE       (PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN      32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+       /*
+        * When a PAE kernel is running as a Xen domain, it does not use
+        * a shared kernel pmd, and that requires a whole page for the pgd.
+        */
+       if (!SHARED_KERNEL_PMD)
+               return 0;
+
+       /*
+        * When a PAE kernel is not running as a Xen domain, it uses a
+        * shared kernel pmd, which does not require a whole page for the
+        * pgd: a 32-byte allocation is enough. During boot we create a
+        * 32-byte slab cache for pgd table allocations.
+        */
+       pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+                                     SLAB_PANIC, NULL);
+       if (!pgd_cache)
+               return -ENOMEM;
+
+       return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+       /*
+        * If SHARED_KERNEL_PMD is not set, the PAE kernel is running as a
+        * Xen domain, and we allocate one whole page for the pgd.
+        */
+       if (!SHARED_KERNEL_PMD)
+               return (pgd_t *)__get_free_page(PGALLOC_GFP);
+
+       /*
+        * Otherwise the PAE kernel is not running as a Xen domain, and we
+        * can allocate the pgd from the 32-byte slab cache to save memory.
+        */
+       return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+       if (!SHARED_KERNEL_PMD)
+               free_page((unsigned long)pgd);
+       else
+               kmem_cache_free(pgd_cache, pgd);
+}
+#else
+static inline pgd_t *_pgd_alloc(void)
+{
+       return (pgd_t *)__get_free_page(PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+       free_page((unsigned long)pgd);
+}
+#endif /* CONFIG_X86_PAE */
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];
 
-       pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+       pgd = _pgd_alloc();
 
        if (pgd == NULL)
                goto out;
@@ -310,7 +385,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
        free_pmds(mm, pmds);
 out_free_pgd:
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 out:
        return NULL;
 }
@@ -320,7 +395,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
-       free_page((unsigned long)pgd);
+       _pgd_free(pgd);
 }
 
 /*
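
The 32-byte figure in PGD_SIZE/PGD_ALIGN above follows from the PAE layout:
the PAE top-level table has only four entries (one per 1 GB quarter of the
4 GB address space), each 8 bytes wide. A trivial standalone check of that
arithmetic (PTRS_PER_PGD = 4 is the PAE value, assumed here for illustration):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const unsigned int ptrs_per_pgd = 4;                /* PAE top-level entries    */
        const unsigned int entry_size   = sizeof(uint64_t); /* 64-bit entries under PAE */

        /* Matches PGD_SIZE = PGD_ALIGN = 32 in the hunk above. */
        printf("PGD_SIZE = %u bytes\n", ptrs_per_pgd * entry_size);
        return 0;
    }
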
index 5d04be5efb6401b1b7512f4ca519a24a016b20d2..4e664bdb535ad89d09633fd73ea6eca5a46d4097 100644 (file)
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
        struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
 
-       if (!user_mode_vm(regs)) {
+       if (!user_mode(regs)) {
                unsigned long stack = kernel_stack_pointer(regs);
                if (depth)
                        dump_trace(NULL, regs, (unsigned long *)stack, 0,
index d143d216d52bec69b912128d88c3283cd0122c6c..d7f997f7c26d2502a1a188583d6817c3672417d1 100644 (file)
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void)
 
        image = efi_lookup_mapped_addr(bgrt_tab->image_address);
        if (!image) {
-               image = early_memremap(bgrt_tab->image_address,
+               image = early_ioremap(bgrt_tab->image_address,
                                       sizeof(bmp_header));
                ioremapped = true;
                if (!image) {
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void)
        }
 
        if (ioremapped) {
-               image = early_memremap(bgrt_tab->image_address,
+               image = early_ioremap(bgrt_tab->image_address,
                                       bmp_header.size);
                if (!image) {
                        pr_err("Ignoring BGRT: failed to map image memory\n");
index dbc8627a5cdf6d569a5f69cbea05ebac924d2279..02744df576d52588a35308998ecc1a138435012e 100644 (file)
@@ -85,12 +85,20 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
        efi_memory_desc_t *virtual_map)
 {
        efi_status_t status;
+       unsigned long flags;
+       pgd_t *save_pgd;
 
-       efi_call_phys_prolog();
+       save_pgd = efi_call_phys_prolog();
+
+       /* Disable interrupts around EFI calls: */
+       local_irq_save(flags);
        status = efi_call_phys(efi_phys.set_virtual_address_map,
                               memory_map_size, descriptor_size,
                               descriptor_version, virtual_map);
-       efi_call_phys_epilog();
+       local_irq_restore(flags);
+
+       efi_call_phys_epilog(save_pgd);
+
        return status;
 }
 
@@ -491,7 +499,8 @@ void __init efi_init(void)
        if (efi_memmap_init())
                return;
 
-       print_efi_memmap();
+       if (efi_enabled(EFI_DBG))
+               print_efi_memmap();
 }
 
 void __init efi_late_init(void)
@@ -939,6 +948,8 @@ static int __init arch_parse_efi_cmdline(char *str)
 {
        if (parse_option_str(str, "old_map"))
                set_bit(EFI_OLD_MEMMAP, &efi.flags);
+       if (parse_option_str(str, "debug"))
+               set_bit(EFI_DBG, &efi.flags);
 
        return 0;
 }
index 40e7cda529365133b50bab8745d9995c47557d53..ed5b67338294f1325fffe5f7d9fce637731d5917 100644 (file)
 
 /*
  * To make EFI call EFI runtime service in physical addressing mode we need
- * prolog/epilog before/after the invocation to disable interrupt, to
- * claim EFI runtime service handler exclusively and to duplicate a memory in
- * low memory space say 0 - 3G.
+ * prolog/epilog before/after the invocation to claim the EFI runtime service
+ * handler exclusively and to duplicate a memory mapping in low memory space,
+ * say 0 - 3G.
  */
-static unsigned long efi_rt_eflags;
 
 void efi_sync_low_kernel_mappings(void) {}
 void __init efi_dump_pagetable(void) {}
@@ -57,21 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md)
 void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
 void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
 
-void __init efi_call_phys_prolog(void)
+pgd_t * __init efi_call_phys_prolog(void)
 {
        struct desc_ptr gdt_descr;
+       pgd_t *save_pgd;
 
-       local_irq_save(efi_rt_eflags);
-
+       /* Current pgd is swapper_pg_dir, we'll restore it later: */
+       save_pgd = swapper_pg_dir;
        load_cr3(initial_page_table);
        __flush_tlb_all();
 
        gdt_descr.address = __pa(get_cpu_gdt_table(0));
        gdt_descr.size = GDT_SIZE - 1;
        load_gdt(&gdt_descr);
+
+       return save_pgd;
 }
 
-void __init efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
 {
        struct desc_ptr gdt_descr;
 
@@ -79,10 +81,8 @@ void __init efi_call_phys_epilog(void)
        gdt_descr.size = GDT_SIZE - 1;
        load_gdt(&gdt_descr);
 
-       load_cr3(swapper_pg_dir);
+       load_cr3(save_pgd);
        __flush_tlb_all();
-
-       local_irq_restore(efi_rt_eflags);
 }
 
 void __init efi_runtime_mkexec(void)
index 17e80d829df0391536ea49379108d1383a39c46f..a0ac0f9c307f661c8b3ed08c4ca6d23507772e36 100644 (file)
@@ -41,9 +41,6 @@
 #include <asm/realmode.h>
 #include <asm/time.h>
 
-static pgd_t *save_pgd __initdata;
-static unsigned long efi_flags __initdata;
-
 /*
  * We allocate runtime services regions bottom-up, starting from -4G, i.e.
  * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
@@ -78,17 +75,18 @@ static void __init early_code_mapping_set_exec(int executable)
        }
 }
 
-void __init efi_call_phys_prolog(void)
+pgd_t * __init efi_call_phys_prolog(void)
 {
        unsigned long vaddress;
+       pgd_t *save_pgd;
+
        int pgd;
        int n_pgds;
 
        if (!efi_enabled(EFI_OLD_MEMMAP))
-               return;
+               return NULL;
 
        early_code_mapping_set_exec(1);
-       local_irq_save(efi_flags);
 
        n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
        save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
@@ -99,24 +97,29 @@ void __init efi_call_phys_prolog(void)
                set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
        }
        __flush_tlb_all();
+
+       return save_pgd;
 }
 
-void __init efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
 {
        /*
         * After the lock is released, the original page table is restored.
         */
-       int pgd;
-       int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+       int pgd_idx;
+       int nr_pgds;
 
-       if (!efi_enabled(EFI_OLD_MEMMAP))
+       if (!save_pgd)
                return;
 
-       for (pgd = 0; pgd < n_pgds; pgd++)
-               set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
+       nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+       for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+               set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+
        kfree(save_pgd);
+
        __flush_tlb_all();
-       local_irq_restore(efi_flags);
        early_code_mapping_set_exec(0);
 }
 
index c9a0838890e241692f6a03f4984485b6746c2f72..278e4da4222f79cf82aa2f8dcbe0e371243fb836 100644 (file)
@@ -11,6 +11,7 @@
  */
 
 #include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
 #include <asm/imr.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@ -101,6 +102,12 @@ static void __init imr_self_test(void)
        }
 }
 
+static const struct x86_cpu_id imr_ids[] __initconst = {
+       { X86_VENDOR_INTEL, 5, 9 },     /* Intel Quark SoC X1000. */
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, imr_ids);
+
 /**
  * imr_self_test_init - entry point for IMR driver.
  *
@@ -108,7 +115,8 @@ static void __init imr_self_test(void)
  */
 static int __init imr_self_test_init(void)
 {
-       imr_self_test();
+       if (x86_match_cpu(imr_ids))
+               imr_self_test();
        return 0;
 }
 
index 994798548b1ad57288e4b51c01422d3dad70ff7f..3b6ec42718e460717182c2762ffd7e6a005fff6f 100644 (file)
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
        struct reset_args reset_args;
 
        reset_args.sender = sender;
-       cpus_clear(*mask);
+       cpumask_clear(mask);
        /* find a single cpu for each uvhub in this distribution mask */
        maskbits = sizeof(struct pnmask) * BITSPERBYTE;
        /* each bit is a pnode relative to the partition base pnode */
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
                        continue;
                apnode = pnode + bcp->partition_base_pnode;
                cpu = pnode_to_first_cpu(apnode, smaster);
-               cpu_set(cpu, *mask);
+               cpumask_set_cpu(cpu, mask);
        }
 
        /* IPI all cpus; preemption is already disabled */
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
        /* don't actually do a shootdown of the local cpu */
        cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
-       if (cpu_isset(cpu, *cpumask))
+       if (cpumask_test_cpu(cpu, cpumask))
                stat->s_ntargself++;
 
        bau_desc = bcp->descriptor_base;
index 3e32ed5648a03894604976c4a9b2ae5c155c1750..757678fb26e1a06277687c1c90f86e75377de03a 100644 (file)
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
 static void fix_processor_context(void)
 {
        int cpu = smp_processor_id();
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
+       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 #ifdef CONFIG_X86_64
        struct desc_struct *desc = get_cpu_gdt_table(cpu);
        tss_desc tss;
index b3560ece1c9fbae0ff05c14824b76a07efdb0776..ef8187f9d28d96651d3d52e39d4441c60a408332 100644 (file)
 110    i386    iopl                    sys_iopl
 111    i386    vhangup                 sys_vhangup
 112    i386    idle
-113    i386    vm86old                 sys_vm86old                     sys32_vm86_warning
+113    i386    vm86old                 sys_vm86old                     sys_ni_syscall
 114    i386    wait4                   sys_wait4                       compat_sys_wait4
 115    i386    swapoff                 sys_swapoff
 116    i386    sysinfo                 sys_sysinfo                     compat_sys_sysinfo
 163    i386    mremap                  sys_mremap
 164    i386    setresuid               sys_setresuid16
 165    i386    getresuid               sys_getresuid16
-166    i386    vm86                    sys_vm86                        sys32_vm86_warning
+166    i386    vm86                    sys_vm86                        sys_ni_syscall
 167    i386    query_module
 168    i386    poll                    sys_poll
 169    i386    nfsservctl
index 8d656fbb57aab2606132858d48b2db7e99e8bf7c..9ef32d5f1b19e67ed10b69c67be5f53806c19ffa 100644 (file)
 169    common  reboot                  sys_reboot
 170    common  sethostname             sys_sethostname
 171    common  setdomainname           sys_setdomainname
-172    common  iopl                    stub_iopl
+172    common  iopl                    sys_iopl
 173    common  ioperm                  sys_ioperm
 174    64      create_module
 175    common  init_module             sys_init_module
index 2d7d9a1f5b531ee399b7a0e6a1f9eb4a190fd820..8ffd2146fa6a0367be6710913b5c4161f1a8859b 100644 (file)
@@ -64,8 +64,8 @@
  */
 static inline void rdtsc_barrier(void)
 {
-       alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-       alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+       alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+                         "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif
index 5cdfa9db22175ed8dc327465b4cf033a7a65d3bc..a75d8700472a4f52db0211217cf7bbcd4f2dc1f5 100644 (file)
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 /*
index 9793322751e02f63ddba0d1b8fef5f21b0a4d502..40d2473836c923acc5705018bf3aebf50cfb12b8 100644 (file)
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
        cycle_t ret;
        u64 last;
        u32 version;
+       u32 migrate_count;
        u8 flags;
        unsigned cpu, cpu1;
 
 
        /*
-        * Note: hypervisor must guarantee that:
-        * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-        * 2. that per-CPU pvclock time info is updated if the
-        *    underlying CPU changes.
-        * 3. that version is increased whenever underlying CPU
-        *    changes.
-        *
+        * When looping to get a consistent (time-info, tsc) pair, we
+        * also need to deal with the possibility that we get migrated
+        * to another vcpu, so always re-fetch the time-info for the
+        * current vcpu.
         */
        do {
                cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -102,20 +99,27 @@ static notrace cycle_t vread_pvclock(int *mode)
                 * __getcpu() calls (Gleb).
                 */
 
-               pvti = get_pvti(cpu);
+               /* Make sure migrate_count will change if we leave the VCPU. */
+               do {
+                       pvti = get_pvti(cpu);
+                       migrate_count = pvti->migrate_count;
+
+                       cpu1 = cpu;
+                       cpu = __getcpu() & VGETCPU_CPU_MASK;
+               } while (unlikely(cpu != cpu1));
 
                version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
                /*
                 * Test we're still on the cpu as well as the version.
-                * We could have been migrated just after the first
-                * vgetcpu but before fetching the version, so we
-                * wouldn't notice a version change.
+                * - We must read TSC of pvti's VCPU.
+                * - KVM doesn't follow the versioning protocol, so data could
+                *   change before version if we left the VCPU.
                 */
-               cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-       } while (unlikely(cpu != cpu1 ||
-                         (pvti->pvti.version & 1) ||
-                         pvti->pvti.version != version));
+               smp_rmb();
+       } while (unlikely((pvti->pvti.version & 1) ||
+                         pvti->pvti.version != version ||
+                         pvti->migrate_count != migrate_count));
 
        if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
                *mode = VCLOCK_NONE;
index 5240f563076de2e0e27c92af2d04ad03d213ee8f..81665c9f21327f3e7db5088a94cbcef127bb3d93 100644 (file)
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
        mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
+       tss->x86_tss.sp0 = thread->sp0;
 }
 
 static void xen_set_iopl_mask(unsigned mask)
index 9f93af56a5fc7bd4cf263406faf96f8a5fa56466..b47124d4cd67e29199fae1c48f9a94ecd93b00f7 100644 (file)
@@ -91,6 +91,12 @@ EXPORT_SYMBOL_GPL(xen_p2m_size);
 unsigned long xen_max_p2m_pfn __read_mostly;
 EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
 
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#else
+#define P2M_LIMIT 0
+#endif
+
 static DEFINE_SPINLOCK(p2m_update_lock);
 
 static unsigned long *p2m_mid_missing_mfn;
@@ -385,9 +391,11 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m)
 void __init xen_vmalloc_p2m_tree(void)
 {
        static struct vm_struct vm;
+       unsigned long p2m_limit;
 
+       p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
        vm.flags = VM_ALLOC;
-       vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
+       vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
                        PMD_SIZE * PMDS_PER_MID_PAGE);
        vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
        pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
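
The new sizing term converts the balloon hotplug limit (in GB, as the
multiplication by 1024*1024*1024 above implies) into a number of p2m entries,
so the reserved virtual area is large enough even before any memory is
hotplugged. A worked example with assumed values -- say
CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 and 4 KB pages:

    #include <stdio.h>

    int main(void)
    {
        const unsigned long long limit_gb   = 512;   /* example config value            */
        const unsigned long long page_size  = 4096;  /* assumed page size               */
        const unsigned long long entry_size = 8;     /* sizeof(unsigned long) on x86-64 */

        unsigned long long pfns  = limit_gb * 1024 * 1024 * 1024 / page_size;
        unsigned long long bytes = pfns * entry_size;

        printf("%llu pfns -> %llu MB of p2m virtual area\n", pfns, bytes >> 20);
        return 0;
    }

With those numbers the area must cover 134,217,728 pfns, i.e. 1 GB of
unsigned long entries, which is what the max() against xen_max_p2m_pfn
provides (rounded up to the PMD-sized alignment already used by this code).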
index 08e8489c47f1ba058ca57dd6353d6bf9c1776ba7..7413ee3706d02502d2dc9a94fa90fb5b97495e90 100644 (file)
@@ -445,15 +445,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
        int rc;
 
-       per_cpu(current_task, cpu) = idle;
-#ifdef CONFIG_X86_32
-       irq_ctx_init(cpu);
-#else
-       clear_tsk_thread_flag(idle, TIF_FORK);
-#endif
-       per_cpu(kernel_stack, cpu) =
-               (unsigned long)task_stack_page(idle) -
-               KERNEL_STACK_OFFSET + THREAD_SIZE;
+       common_cpu_up(cpu, idle);
 
        xen_setup_runstate_info(cpu);
        xen_setup_timer(cpu);
@@ -468,10 +460,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
        if (rc)
                return rc;
 
-       if (num_online_cpus() == 1)
-               /* Just in case we booted with a single CPU. */
-               alternatives_enable_smp();
-
        rc = xen_smp_intr_init(cpu);
        if (rc)
                return rc;
index c4df9dbd63b7c81f70c624c417d8362b85d3d34e..d9497698645a53b10ab4b62ddc98d12f4412a616 100644 (file)
@@ -1,5 +1,5 @@
 #include <linux/types.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 
 #include <xen/interface/xen.h>
 #include <xen/grant_table.h>
@@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled)
 
 static void xen_vcpu_notify_restore(void *data)
 {
-       unsigned long reason = (unsigned long)data;
-
        /* Boot processor notified via generic timekeeping_resume() */
-       if ( smp_processor_id() == 0)
+       if (smp_processor_id() == 0)
                return;
 
-       clockevents_notify(reason, NULL);
+       tick_resume_local();
 }
 
 void xen_arch_resume(void)
 {
-       on_each_cpu(xen_vcpu_notify_restore,
-                   (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
+       on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
 }
index 53adefda4275330a810b6d883b6ad8b58a72730c..985fc3ee0973c85f916c67cd9a40fc9c2c73d340 100644 (file)
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
         * We're already on the usermode stack at this point, but
         * still with the kernel gs, so we can easily switch back
         */
-       movq %rsp, PER_CPU_VAR(old_rsp)
+       movq %rsp, PER_CPU_VAR(rsp_scratch)
        movq PER_CPU_VAR(kernel_stack), %rsp
 
        pushq $__USER_DS
-       pushq PER_CPU_VAR(old_rsp)
+       pushq PER_CPU_VAR(rsp_scratch)
        pushq %r11
        pushq $__USER_CS
        pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
         * We're already on the usermode stack at this point, but
         * still with the kernel gs, so we can easily switch back
         */
-       movq %rsp, PER_CPU_VAR(old_rsp)
+       movq %rsp, PER_CPU_VAR(rsp_scratch)
        movq PER_CPU_VAR(kernel_stack), %rsp
 
        pushq $__USER32_DS
-       pushq PER_CPU_VAR(old_rsp)
+       pushq PER_CPU_VAR(rsp_scratch)
        pushq %r11
        pushq $__USER32_CS
        pushq %rcx
index b7b8933ec24188b2807229af1aa844503b46dc89..33c428530193548e38e0ec4ce3608e08be8b1d8d 100644 (file)
@@ -1457,7 +1457,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 
                do {
                        page = alloc_pages_node(set->numa_node,
-                               GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+                               GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
                                break;
@@ -1479,8 +1479,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
                        tags->rqs[i] = p;
-                       tags->rqs[i]->atomic_flags = 0;
-                       tags->rqs[i]->cmd_flags = 0;
                        if (set->ops->init_request) {
                                if (set->ops->init_request(set->driver_data,
                                                tags->rqs[i], hctx_idx, i,
index 6ed2cbe5e8c9ae340233b0491255dfed40947590..12600bfffca93f4547e2325eeda9669ff443a7a7 100644 (file)
@@ -585,7 +585,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                                     b->physical_block_size);
 
        t->io_min = max(t->io_min, b->io_min);
-       t->io_opt = lcm(t->io_opt, b->io_opt);
+       t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
        t->cluster &= b->cluster;
        t->discard_zeroes_data &= b->discard_zeroes_data;
@@ -616,7 +616,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                    b->raid_partial_stripes_expensive);
 
        /* Find lowest common alignment_offset */
-       t->alignment_offset = lcm(t->alignment_offset, alignment)
+       t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment)
                % max(t->physical_block_size, t->io_min);
 
        /* Verify that new alignment_offset is on a logical block boundary */
@@ -643,7 +643,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                                                      b->max_discard_sectors);
                t->discard_granularity = max(t->discard_granularity,
                                             b->discard_granularity);
-               t->discard_alignment = lcm(t->discard_alignment, alignment) %
+               t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) %
                        t->discard_granularity;
        }
 
index c7b105c0e1d377991f1e324fae161535e01efe27..6bc9cbc01ad6a3f20c27740ebbf1a919e7c74d0d 100644 (file)
@@ -26,7 +26,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/cpu.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/slab.h>
 #include <linux/acpi.h>
 #include <asm/mwait.h>
@@ -41,8 +41,6 @@ static unsigned long power_saving_mwait_eax;
 
 static unsigned char tsc_detected_unstable;
 static unsigned char tsc_marked_unstable;
-static unsigned char lapic_detected_unstable;
-static unsigned char lapic_marked_unstable;
 
 static void power_saving_mwait_init(void)
 {
@@ -82,13 +80,10 @@ static void power_saving_mwait_init(void)
                 */
                if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                        tsc_detected_unstable = 1;
-               if (!boot_cpu_has(X86_FEATURE_ARAT))
-                       lapic_detected_unstable = 1;
                break;
        default:
-               /* TSC & LAPIC could halt in idle */
+               /* TSC could halt in idle */
                tsc_detected_unstable = 1;
-               lapic_detected_unstable = 1;
        }
 #endif
 }
@@ -155,7 +150,6 @@ static int power_saving_thread(void *data)
        sched_setscheduler(current, SCHED_RR, &param);
 
        while (!kthread_should_stop()) {
-               int cpu;
                unsigned long expire_time;
 
                try_to_freeze();
@@ -177,28 +171,15 @@ static int power_saving_thread(void *data)
                                mark_tsc_unstable("TSC halts in idle");
                                tsc_marked_unstable = 1;
                        }
-                       if (lapic_detected_unstable && !lapic_marked_unstable) {
-                               int i;
-                               /* LAPIC could halt in idle, so notify users */
-                               for_each_online_cpu(i)
-                                       clockevents_notify(
-                                               CLOCK_EVT_NOTIFY_BROADCAST_ON,
-                                               &i);
-                               lapic_marked_unstable = 1;
-                       }
                        local_irq_disable();
-                       cpu = smp_processor_id();
-                       if (lapic_marked_unstable)
-                               clockevents_notify(
-                                       CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+                       tick_broadcast_enable();
+                       tick_broadcast_enter();
                        stop_critical_timings();
 
                        mwait_idle_with_hints(power_saving_mwait_eax, 1);
 
                        start_critical_timings();
-                       if (lapic_marked_unstable)
-                               clockevents_notify(
-                                       CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+                       tick_broadcast_exit();
                        local_irq_enable();
 
                        if (time_before(expire_time, jiffies)) {
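
Editorial note on the conversion above: on CPUs without the ARAT feature the local APIC timer stops in deep C-states, so the old code manually sent CLOCK_EVT_NOTIFY_BROADCAST_* notifications around mwait to hand wakeups to a global clockevent device. The new tick_broadcast_enable()/tick_broadcast_enter()/tick_broadcast_exit() calls move that decision into the tick core, which is why the lapic_*_unstable bookkeeping disappears. A condensed sketch of the resulting per-iteration idle sequence, not a drop-in replacement for power_saving_thread():

/* Minimal shape of one forced-idle iteration after the conversion. */
static void demo_idle_once(unsigned long mwait_eax)
{
    local_irq_disable();
    tick_broadcast_enable();   /* opt this CPU in to broadcast handling */
    tick_broadcast_enter();    /* a global device covers us while the LAPIC may stop */
    stop_critical_timings();

    mwait_idle_with_hints(mwait_eax, 1);

    start_critical_timings();
    tick_broadcast_exit();     /* back to the local tick device */
    local_irq_enable();
}
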
index c6bb9f1257c92844fa076c0585cf73125d278dec..39e0c8e36244f75aac3de4f09a0a89817e035b91 100644 (file)
@@ -32,7 +32,7 @@
 #include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/sched.h>       /* need_resched() */
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/cpuidle.h>
 #include <linux/syscore_ops.h>
 #include <acpi/processor.h>
@@ -157,12 +157,11 @@ static void lapic_timer_check_state(int state, struct acpi_processor *pr,
 static void __lapic_timer_propagate_broadcast(void *arg)
 {
        struct acpi_processor *pr = (struct acpi_processor *) arg;
-       unsigned long reason;
 
-       reason = pr->power.timer_broadcast_on_state < INT_MAX ?
-               CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
-
-       clockevents_notify(reason, &pr->id);
+       if (pr->power.timer_broadcast_on_state < INT_MAX)
+               tick_broadcast_enable();
+       else
+               tick_broadcast_disable();
 }
 
 static void lapic_timer_propagate_broadcast(struct acpi_processor *pr)
@@ -179,11 +178,10 @@ static void lapic_timer_state_broadcast(struct acpi_processor *pr,
        int state = cx - pr->power.states;
 
        if (state >= pr->power.timer_broadcast_on_state) {
-               unsigned long reason;
-
-               reason = broadcast ?  CLOCK_EVT_NOTIFY_BROADCAST_ENTER :
-                       CLOCK_EVT_NOTIFY_BROADCAST_EXIT;
-               clockevents_notify(reason, &pr->id);
+               if (broadcast)
+                       tick_broadcast_enter();
+               else
+                       tick_broadcast_exit();
        }
 }
 
@@ -922,7 +920,7 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
                return -EINVAL;
 
        drv->safe_state_index = -1;
-       for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
+       for (i = CPUIDLE_DRIVER_STATE_START; i < CPUIDLE_STATE_MAX; i++) {
                drv->states[i].name[0] = '\0';
                drv->states[i].desc[0] = '\0';
        }
index ef150ebb4c304efa1cbf5fb6e23a2a9da036ae91..23dac3babfe3afc710db73a2ad4f8fe05174b7d7 100644 (file)
@@ -4204,9 +4204,18 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
        { "PIONEER DVD-RW  DVR-216D",   NULL,   ATA_HORKAGE_NOSETXFER },
 
        /* devices that don't properly handle queued TRIM commands */
-       { "Micron_M[56]*",              NULL,   ATA_HORKAGE_NO_NCQ_TRIM |
+       { "Micron_M500*",               NULL,   ATA_HORKAGE_NO_NCQ_TRIM |
+                                               ATA_HORKAGE_ZERO_AFTER_TRIM, },
+       { "Crucial_CT*M500*",           NULL,   ATA_HORKAGE_NO_NCQ_TRIM |
+                                               ATA_HORKAGE_ZERO_AFTER_TRIM, },
+       { "Micron_M5[15]0*",            "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
+                                               ATA_HORKAGE_ZERO_AFTER_TRIM, },
+       { "Crucial_CT*M550*",           "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
+                                               ATA_HORKAGE_ZERO_AFTER_TRIM, },
+       { "Crucial_CT*MX100*",          "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
+                                               ATA_HORKAGE_ZERO_AFTER_TRIM, },
+       { "Samsung SSD 850 PRO*",       NULL,   ATA_HORKAGE_NO_NCQ_TRIM |
                                                ATA_HORKAGE_ZERO_AFTER_TRIM, },
-       { "Crucial_CT*SSD*",            NULL,   ATA_HORKAGE_NO_NCQ_TRIM, },
 
        /*
         * As defined, the DRAT (Deterministic Read After Trim) and RZAT
@@ -4226,6 +4235,8 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
         */
        { "INTEL*SSDSC2MH*",            NULL,   0, },
 
+       { "Micron*",                    NULL,   ATA_HORKAGE_ZERO_AFTER_TRIM, },
+       { "Crucial*",                   NULL,   ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "INTEL*SSD*",                 NULL,   ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "SSD*INTEL*",                 NULL,   ATA_HORKAGE_ZERO_AFTER_TRIM, },
        { "Samsung*SSD*",               NULL,   ATA_HORKAGE_ZERO_AFTER_TRIM, },
index 79524ed2a3cb0d8bb24ebf94af80bd461591f96f..8753b0f6a317790562cc1126805a11eea3a25a5d 100644 (file)
@@ -125,6 +125,7 @@ static int ipmi_powernv_recv(struct ipmi_smi_powernv *smi)
        spin_lock_irqsave(&smi->msg_lock, flags);
 
        if (!smi->cur_msg) {
+               spin_unlock_irqrestore(&smi->msg_lock, flags);
                pr_warn("no current message?\n");
                return 0;
        }
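
Editorial note: the one-line fix above closes a classic locking leak; the early return for a missing cur_msg left msg_lock held with interrupts disabled, so the next acquirer would spin forever. A minimal sketch of the rule being enforced (demo_smi and the surrounding function are hypothetical):

/* Hypothetical illustration: every exit taken after spin_lock_irqsave()
 * must go through spin_unlock_irqrestore(), error returns included. */
struct demo_smi {
    spinlock_t msg_lock;
    void *cur_msg;
};

static int demo_consume_msg(struct demo_smi *smi)
{
    unsigned long flags;

    spin_lock_irqsave(&smi->msg_lock, flags);
    if (!smi->cur_msg) {
        spin_unlock_irqrestore(&smi->msg_lock, flags);  /* the missing step */
        pr_warn("no current message?\n");
        return 0;
    }
    /* ... hand smi->cur_msg to the IPMI core ... */
    spin_unlock_irqrestore(&smi->msg_lock, flags);
    return 1;
}
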
index f6646ed3047e09a3656b089491f0afaa14982af1..518585c1ce94626b1142f83bd0697afef9c05438 100644 (file)
@@ -262,6 +262,11 @@ struct smi_info {
         */
        bool supports_event_msg_buff;
 
+       /*
+        * Can we clear the global enables receive irq bit?
+        */
+       bool cannot_clear_recv_irq_bit;
+
        /*
         * Did we get an attention that we did not handle?
         */
@@ -461,6 +466,9 @@ static void smi_mod_timer(struct smi_info *smi_info, unsigned long new_val)
  * allocate messages, we just leave them in the BMC and run the system
  * polled until we can allocate some memory.  Once we have some
  * memory, we will re-enable the interrupt.
+ *
+ * Note that we cannot just use disable_irq(), since the interrupt may
+ * be shared.
  */
 static inline bool disable_si_irq(struct smi_info *smi_info)
 {
@@ -549,20 +557,15 @@ static u8 current_global_enables(struct smi_info *smi_info, u8 base,
 
        if (smi_info->supports_event_msg_buff)
                enables |= IPMI_BMC_EVT_MSG_BUFF;
-       else
-               enables &= ~IPMI_BMC_EVT_MSG_BUFF;
 
-       if (smi_info->irq && !smi_info->interrupt_disabled)
+       if ((smi_info->irq && !smi_info->interrupt_disabled) ||
+           smi_info->cannot_clear_recv_irq_bit)
                enables |= IPMI_BMC_RCV_MSG_INTR;
-       else
-               enables &= ~IPMI_BMC_RCV_MSG_INTR;
 
        if (smi_info->supports_event_msg_buff &&
            smi_info->irq && !smi_info->interrupt_disabled)
 
                enables |= IPMI_BMC_EVT_MSG_INTR;
-       else
-               enables &= ~IPMI_BMC_EVT_MSG_INTR;
 
        *irq_on = enables & (IPMI_BMC_EVT_MSG_INTR | IPMI_BMC_RCV_MSG_INTR);
 
@@ -2900,6 +2903,96 @@ static int try_get_dev_id(struct smi_info *smi_info)
        return rv;
 }
 
+/*
+ * Some BMCs do not support clearing the receive irq bit in the global
+ * enables (even if they don't support interrupts on the BMC).  Check
+ * for this and handle it properly.
+ */
+static void check_clr_rcv_irq(struct smi_info *smi_info)
+{
+       unsigned char         msg[3];
+       unsigned char         *resp;
+       unsigned long         resp_len;
+       int                   rv;
+
+       resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
+       if (!resp) {
+               printk(KERN_WARNING PFX "Out of memory allocating response for"
+                      " global enables command, cannot check recv irq bit"
+                      " handling.\n");
+               return;
+       }
+
+       msg[0] = IPMI_NETFN_APP_REQUEST << 2;
+       msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD;
+       smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2);
+
+       rv = wait_for_msg_done(smi_info);
+       if (rv) {
+               printk(KERN_WARNING PFX "Error getting response from get"
+                      " global enables command, cannot check recv irq bit"
+                      " handling.\n");
+               goto out;
+       }
+
+       resp_len = smi_info->handlers->get_result(smi_info->si_sm,
+                                                 resp, IPMI_MAX_MSG_LENGTH);
+
+       if (resp_len < 4 ||
+                       resp[0] != (IPMI_NETFN_APP_REQUEST | 1) << 2 ||
+                       resp[1] != IPMI_GET_BMC_GLOBAL_ENABLES_CMD   ||
+                       resp[2] != 0) {
+               printk(KERN_WARNING PFX "Invalid return from get global"
+                      " enables command, cannot check recv irq bit"
+                      " handling.\n");
+               rv = -EINVAL;
+               goto out;
+       }
+
+       if ((resp[3] & IPMI_BMC_RCV_MSG_INTR) == 0)
+               /* Already clear, should work ok. */
+               goto out;
+
+       msg[0] = IPMI_NETFN_APP_REQUEST << 2;
+       msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
+       msg[2] = resp[3] & ~IPMI_BMC_RCV_MSG_INTR;
+       smi_info->handlers->start_transaction(smi_info->si_sm, msg, 3);
+
+       rv = wait_for_msg_done(smi_info);
+       if (rv) {
+               printk(KERN_WARNING PFX "Error getting response from set"
+                      " global enables command, cannot check recv irq bit"
+                      " handling.\n");
+               goto out;
+       }
+
+       resp_len = smi_info->handlers->get_result(smi_info->si_sm,
+                                                 resp, IPMI_MAX_MSG_LENGTH);
+
+       if (resp_len < 3 ||
+                       resp[0] != (IPMI_NETFN_APP_REQUEST | 1) << 2 ||
+                       resp[1] != IPMI_SET_BMC_GLOBAL_ENABLES_CMD) {
+               printk(KERN_WARNING PFX "Invalid return from get global"
+                      " enables command, cannot check recv irq bit"
+                      " handling.\n");
+               rv = -EINVAL;
+               goto out;
+       }
+
+       if (resp[2] != 0) {
+               /*
+                * An error when setting the event buffer bit means
+                * clearing the bit is not supported.
+                */
+               printk(KERN_WARNING PFX "The BMC does not support clearing"
+                      " the recv irq bit, compensating, but the BMC needs to"
+                      " be fixed.\n");
+               smi_info->cannot_clear_recv_irq_bit = true;
+       }
+ out:
+       kfree(resp);
+}
+
 static int try_enable_event_buffer(struct smi_info *smi_info)
 {
        unsigned char         msg[3];
@@ -3395,6 +3488,8 @@ static int try_smi_init(struct smi_info *new_smi)
                goto out_err;
        }
 
+       check_clr_rcv_irq(new_smi);
+
        setup_oem_data_handler(new_smi);
        setup_xaction_handlers(new_smi);
 
index f6e378dac5f5b1031530839d5967ab96607f1f7b..f40e3bd2c69c265400f1241900be8dddd26b51e8 100644 (file)
@@ -468,11 +468,13 @@ static int ipmi_ssif_thread(void *data)
                int result;
 
                /* Wait for something to do */
-               wait_for_completion(&ssif_info->wake_thread);
-               init_completion(&ssif_info->wake_thread);
-
+               result = wait_for_completion_interruptible(
+                                               &ssif_info->wake_thread);
                if (ssif_info->stopping)
                        break;
+               if (result == -ERESTARTSYS)
+                       continue;
+               init_completion(&ssif_info->wake_thread);
 
                if (ssif_info->i2c_read_write == I2C_SMBUS_WRITE) {
                        result = i2c_smbus_write_block_data(
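
Editorial note: the worker thread above now sleeps interruptibly, checks its stop flag before anything else, treats -ERESTARTSYS as a spurious wakeup, and re-arms the completion only when real work follows, so a stop request cannot be lost between the wait and the re-init. A skeleton of that loop with hypothetical names; demo_ctx stands in for ssif_info and the outer kthread_should_stop() condition is assumed from context:

struct demo_ctx {
    struct completion wake;
    bool stopping;
};

static int demo_thread(void *data)
{
    struct demo_ctx *ctx = data;

    while (!kthread_should_stop()) {
        int result;

        result = wait_for_completion_interruptible(&ctx->wake);
        if (ctx->stopping)
            break;
        if (result == -ERESTARTSYS)
            continue;               /* interrupted: just go back to sleep */
        init_completion(&ctx->wake);

        /* ... perform the queued SMBus transfer ... */
    }
    return 0;
}
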
index a3025e7ae35f1741883d610310b915040d633101..266469691e5820151ad3f70d1479863e5d820f9b 100644 (file)
@@ -661,17 +661,17 @@ static const struct of_device_id arch_timer_mem_of_match[] __initconst = {
 };
 
 static bool __init
-arch_timer_probed(int type, const struct of_device_id *matches)
+arch_timer_needs_probing(int type, const struct of_device_id *matches)
 {
        struct device_node *dn;
-       bool probed = true;
+       bool needs_probing = false;
 
        dn = of_find_matching_node(NULL, matches);
        if (dn && of_device_is_available(dn) && !(arch_timers_present & type))
-               probed = false;
+               needs_probing = true;
        of_node_put(dn);
 
-       return probed;
+       return needs_probing;
 }
 
 static void __init arch_timer_common_init(void)
@@ -680,9 +680,9 @@ static void __init arch_timer_common_init(void)
 
        /* Wait until both nodes are probed if we have two timers */
        if ((arch_timers_present & mask) != mask) {
-               if (!arch_timer_probed(ARCH_MEM_TIMER, arch_timer_mem_of_match))
+               if (arch_timer_needs_probing(ARCH_MEM_TIMER, arch_timer_mem_of_match))
                        return;
-               if (!arch_timer_probed(ARCH_CP15_TIMER, arch_timer_of_match))
+               if (arch_timer_needs_probing(ARCH_CP15_TIMER, arch_timer_of_match))
                        return;
        }
 
index d305fb089767c1b1c51d05f33c1e84e064f728a7..a19a3f619cc755d3a9a04dbdcde24b230cb30d6d 100644 (file)
@@ -108,7 +108,7 @@ static void __init add_clocksource(struct device_node *source_timer)
 
 static u64 notrace read_sched_clock(void)
 {
-       return ~__raw_readl(sched_io_base);
+       return ~readl_relaxed(sched_io_base);
 }
 
 static const struct of_device_id sptimer_ids[] __initconst = {
index d0a7bd66b8b91e0eb9aa4f7092ec42869a9067d8..dc3c6ee04aaa009e117fcb43fb065394ddfd6fae 100644 (file)
@@ -210,7 +210,7 @@ static int em_sti_clocksource_enable(struct clocksource *cs)
 
        ret = em_sti_start(p, USER_CLOCKSOURCE);
        if (!ret)
-               __clocksource_updatefreq_hz(cs, p->rate);
+               __clocksource_update_freq_hz(cs, p->rate);
        return ret;
 }
 
index 2bd13b53b727635500051746e27299c89d8d9b60..b8ff3c64cc452a16fc4108426fb6e5b1c54e91e8 100644 (file)
@@ -641,7 +641,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs)
 
        ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE);
        if (!ret) {
-               __clocksource_updatefreq_hz(cs, ch->rate);
+               __clocksource_update_freq_hz(cs, ch->rate);
                ch->cs_enabled = true;
        }
        return ret;
index f150ca82bfaf106a7ef2c5a40dd12a1e098e39f0..b6b8fa3cd211fc6b03460f678168d0f6568362f0 100644 (file)
@@ -272,7 +272,7 @@ static int sh_tmu_clocksource_enable(struct clocksource *cs)
 
        ret = sh_tmu_enable(ch);
        if (!ret) {
-               __clocksource_updatefreq_hz(cs, ch->rate);
+               __clocksource_update_freq_hz(cs, ch->rate);
                ch->cs_enabled = true;
        }
 
index f4a9c0058b4d677382863a12bf887b40202f63fe..1928a8912584b98e835251442e4313016e112aa2 100644 (file)
@@ -170,7 +170,15 @@ static void __init sun4i_timer_init(struct device_node *node)
               TIMER_CTL_CLK_SRC(TIMER_CTL_CLK_SRC_OSC24M),
               timer_base + TIMER_CTL_REG(1));
 
-       sched_clock_register(sun4i_timer_sched_read, 32, rate);
+       /*
+        * sched_clock_register does not have priorities, and on sun6i and
+        * later there is a better sched_clock registered by arm_arch_timer.c
+        */
+       if (of_machine_is_compatible("allwinner,sun4i-a10") ||
+           of_machine_is_compatible("allwinner,sun5i-a13") ||
+           of_machine_is_compatible("allwinner,sun5i-a10s"))
+               sched_clock_register(sun4i_timer_sched_read, 32, rate);
+
        clocksource_mmio_init(timer_base + TIMER_CNTVAL_REG(1), node->name,
                              rate, 350, 32, clocksource_mmio_readl_down);
 
index d2616ef167701526a13efe7ef896f3cca2c8668a..5a112d72fc2d2ec43205f5cde5720431bdc89b54 100644 (file)
 static void __iomem *timer_reg_base;
 static void __iomem *rtc_base;
 
-static struct timespec persistent_ts;
+static struct timespec64 persistent_ts;
 static u64 persistent_ms, last_persistent_ms;
 
 static struct delay_timer tegra_delay_timer;
 
 #define timer_writel(value, reg) \
-       __raw_writel(value, timer_reg_base + (reg))
+       writel_relaxed(value, timer_reg_base + (reg))
 #define timer_readl(reg) \
-       __raw_readl(timer_reg_base + (reg))
+       readl_relaxed(timer_reg_base + (reg))
 
 static int tegra_timer_set_next_event(unsigned long cycles,
                                         struct clock_event_device *evt)
@@ -120,26 +120,25 @@ static u64 tegra_rtc_read_ms(void)
 }
 
 /*
- * tegra_read_persistent_clock -  Return time from a persistent clock.
+ * tegra_read_persistent_clock64 -  Return time from a persistent clock.
  *
  * Reads the time from a source which isn't disabled during PM, the
  * 32k sync timer.  Convert the cycles elapsed since last read into
- * nsecs and adds to a monotonically increasing timespec.
+ * nsecs and adds to a monotonically increasing timespec64.
  * Care must be taken that this function is not called while the
  * tegra_rtc driver could be executing to avoid race conditions
  * on the RTC shadow register
  */
-static void tegra_read_persistent_clock(struct timespec *ts)
+static void tegra_read_persistent_clock64(struct timespec64 *ts)
 {
        u64 delta;
-       struct timespec *tsp = &persistent_ts;
 
        last_persistent_ms = persistent_ms;
        persistent_ms = tegra_rtc_read_ms();
        delta = persistent_ms - last_persistent_ms;
 
-       timespec_add_ns(tsp, delta * NSEC_PER_MSEC);
-       *ts = *tsp;
+       timespec64_add_ns(&persistent_ts, delta * NSEC_PER_MSEC);
+       *ts = persistent_ts;
 }
 
 static unsigned long tegra_delay_timer_read_counter_long(void)
@@ -252,7 +251,7 @@ static void __init tegra20_init_rtc(struct device_node *np)
        else
                clk_prepare_enable(clk);
 
-       register_persistent_clock(NULL, tegra_read_persistent_clock);
+       register_persistent_clock(NULL, tegra_read_persistent_clock64);
 }
 CLOCKSOURCE_OF_DECLARE(tegra20_rtc, "nvidia,tegra20-rtc", tegra20_init_rtc);
 
index ec57ba2bbd87ac9f2a251e350598fa52019d7815..5b6e3d5644c9519f12468369f6a25c9ab4946ff4 100644 (file)
@@ -111,7 +111,7 @@ static irqreturn_t efm32_clock_event_handler(int irq, void *dev_id)
 static struct efm32_clock_event_ddata clock_event_ddata = {
        .evtdev = {
                .name = "efm32 clockevent",
-               .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_MODE_PERIODIC,
+               .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
                .set_mode = efm32_clock_event_set_mode,
                .set_next_event = efm32_clock_event_set_next_event,
                .rating = 200,
index b5b4d4585c9aba41bc396e91cc107b710b154664..c0304ff608b064b3785af81532fc2b8c7f7f2b30 100644 (file)
@@ -61,12 +61,12 @@ static inline struct pit_data *clkevt_to_pit_data(struct clock_event_device *clk
 
 static inline unsigned int pit_read(void __iomem *base, unsigned int reg_offset)
 {
-       return __raw_readl(base + reg_offset);
+       return readl_relaxed(base + reg_offset);
 }
 
 static inline void pit_write(void __iomem *base, unsigned int reg_offset, unsigned long value)
 {
-       __raw_writel(value, base + reg_offset);
+       writel_relaxed(value, base + reg_offset);
 }
 
 /*
index 58597fbcc046f27f88238aa949730d6109a704b7..28aa4b7bb6020c416974ec52c8f80eb12366e705 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/irq.h>
 #include <linux/irqreturn.h>
 #include <linux/reset.h>
+#include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 
 #define TIMER_SYNC_TICKS       3
 
-static void __iomem *timer_base;
-static u32 ticks_per_jiffy;
+struct sun5i_timer {
+       void __iomem            *base;
+       struct clk              *clk;
+       struct notifier_block   clk_rate_cb;
+       u32                     ticks_per_jiffy;
+};
+
+#define to_sun5i_timer(x) \
+       container_of(x, struct sun5i_timer, clk_rate_cb)
+
+struct sun5i_timer_clksrc {
+       struct sun5i_timer      timer;
+       struct clocksource      clksrc;
+};
+
+#define to_sun5i_timer_clksrc(x) \
+       container_of(x, struct sun5i_timer_clksrc, clksrc)
+
+struct sun5i_timer_clkevt {
+       struct sun5i_timer              timer;
+       struct clock_event_device       clkevt;
+};
+
+#define to_sun5i_timer_clkevt(x) \
+       container_of(x, struct sun5i_timer_clkevt, clkevt)
 
 /*
  * When we disable a timer, we need to wait at least for 2 cycles of
@@ -45,30 +69,30 @@ static u32 ticks_per_jiffy;
  * that is already set up and runs at the same frequency as the other
  * timers, and will never be disabled.
  */
-static void sun5i_clkevt_sync(void)
+static void sun5i_clkevt_sync(struct sun5i_timer_clkevt *ce)
 {
-       u32 old = readl(timer_base + TIMER_CNTVAL_LO_REG(1));
+       u32 old = readl(ce->timer.base + TIMER_CNTVAL_LO_REG(1));
 
-       while ((old - readl(timer_base + TIMER_CNTVAL_LO_REG(1))) < TIMER_SYNC_TICKS)
+       while ((old - readl(ce->timer.base + TIMER_CNTVAL_LO_REG(1))) < TIMER_SYNC_TICKS)
                cpu_relax();
 }
 
-static void sun5i_clkevt_time_stop(u8 timer)
+static void sun5i_clkevt_time_stop(struct sun5i_timer_clkevt *ce, u8 timer)
 {
-       u32 val = readl(timer_base + TIMER_CTL_REG(timer));
-       writel(val & ~TIMER_CTL_ENABLE, timer_base + TIMER_CTL_REG(timer));
+       u32 val = readl(ce->timer.base + TIMER_CTL_REG(timer));
+       writel(val & ~TIMER_CTL_ENABLE, ce->timer.base + TIMER_CTL_REG(timer));
 
-       sun5i_clkevt_sync();
+       sun5i_clkevt_sync(ce);
 }
 
-static void sun5i_clkevt_time_setup(u8 timer, u32 delay)
+static void sun5i_clkevt_time_setup(struct sun5i_timer_clkevt *ce, u8 timer, u32 delay)
 {
-       writel(delay, timer_base + TIMER_INTVAL_LO_REG(timer));
+       writel(delay, ce->timer.base + TIMER_INTVAL_LO_REG(timer));
 }
 
-static void sun5i_clkevt_time_start(u8 timer, bool periodic)
+static void sun5i_clkevt_time_start(struct sun5i_timer_clkevt *ce, u8 timer, bool periodic)
 {
-       u32 val = readl(timer_base + TIMER_CTL_REG(timer));
+       u32 val = readl(ce->timer.base + TIMER_CTL_REG(timer));
 
        if (periodic)
                val &= ~TIMER_CTL_ONESHOT;
@@ -76,75 +100,230 @@ static void sun5i_clkevt_time_start(u8 timer, bool periodic)
                val |= TIMER_CTL_ONESHOT;
 
        writel(val | TIMER_CTL_ENABLE | TIMER_CTL_RELOAD,
-              timer_base + TIMER_CTL_REG(timer));
+              ce->timer.base + TIMER_CTL_REG(timer));
 }
 
 static void sun5i_clkevt_mode(enum clock_event_mode mode,
-                             struct clock_event_device *clk)
+                             struct clock_event_device *clkevt)
 {
+       struct sun5i_timer_clkevt *ce = to_sun5i_timer_clkevt(clkevt);
+
        switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
-               sun5i_clkevt_time_stop(0);
-               sun5i_clkevt_time_setup(0, ticks_per_jiffy);
-               sun5i_clkevt_time_start(0, true);
+               sun5i_clkevt_time_stop(ce, 0);
+               sun5i_clkevt_time_setup(ce, 0, ce->timer.ticks_per_jiffy);
+               sun5i_clkevt_time_start(ce, 0, true);
                break;
        case CLOCK_EVT_MODE_ONESHOT:
-               sun5i_clkevt_time_stop(0);
-               sun5i_clkevt_time_start(0, false);
+               sun5i_clkevt_time_stop(ce, 0);
+               sun5i_clkevt_time_start(ce, 0, false);
                break;
        case CLOCK_EVT_MODE_UNUSED:
        case CLOCK_EVT_MODE_SHUTDOWN:
        default:
-               sun5i_clkevt_time_stop(0);
+               sun5i_clkevt_time_stop(ce, 0);
                break;
        }
 }
 
 static int sun5i_clkevt_next_event(unsigned long evt,
-                                  struct clock_event_device *unused)
+                                  struct clock_event_device *clkevt)
 {
-       sun5i_clkevt_time_stop(0);
-       sun5i_clkevt_time_setup(0, evt - TIMER_SYNC_TICKS);
-       sun5i_clkevt_time_start(0, false);
+       struct sun5i_timer_clkevt *ce = to_sun5i_timer_clkevt(clkevt);
+
+       sun5i_clkevt_time_stop(ce, 0);
+       sun5i_clkevt_time_setup(ce, 0, evt - TIMER_SYNC_TICKS);
+       sun5i_clkevt_time_start(ce, 0, false);
 
        return 0;
 }
 
-static struct clock_event_device sun5i_clockevent = {
-       .name = "sun5i_tick",
-       .rating = 340,
-       .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .set_mode = sun5i_clkevt_mode,
-       .set_next_event = sun5i_clkevt_next_event,
-};
-
-
 static irqreturn_t sun5i_timer_interrupt(int irq, void *dev_id)
 {
-       struct clock_event_device *evt = (struct clock_event_device *)dev_id;
+       struct sun5i_timer_clkevt *ce = (struct sun5i_timer_clkevt *)dev_id;
 
-       writel(0x1, timer_base + TIMER_IRQ_ST_REG);
-       evt->event_handler(evt);
+       writel(0x1, ce->timer.base + TIMER_IRQ_ST_REG);
+       ce->clkevt.event_handler(&ce->clkevt);
 
        return IRQ_HANDLED;
 }
 
-static struct irqaction sun5i_timer_irq = {
-       .name = "sun5i_timer0",
-       .flags = IRQF_TIMER | IRQF_IRQPOLL,
-       .handler = sun5i_timer_interrupt,
-       .dev_id = &sun5i_clockevent,
-};
+static cycle_t sun5i_clksrc_read(struct clocksource *clksrc)
+{
+       struct sun5i_timer_clksrc *cs = to_sun5i_timer_clksrc(clksrc);
+
+       return ~readl(cs->timer.base + TIMER_CNTVAL_LO_REG(1));
+}
+
+static int sun5i_rate_cb_clksrc(struct notifier_block *nb,
+                               unsigned long event, void *data)
+{
+       struct clk_notifier_data *ndata = data;
+       struct sun5i_timer *timer = to_sun5i_timer(nb);
+       struct sun5i_timer_clksrc *cs = container_of(timer, struct sun5i_timer_clksrc, timer);
+
+       switch (event) {
+       case PRE_RATE_CHANGE:
+               clocksource_unregister(&cs->clksrc);
+               break;
+
+       case POST_RATE_CHANGE:
+               clocksource_register_hz(&cs->clksrc, ndata->new_rate);
+               break;
+
+       default:
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
+static int __init sun5i_setup_clocksource(struct device_node *node,
+                                         void __iomem *base,
+                                         struct clk *clk, int irq)
+{
+       struct sun5i_timer_clksrc *cs;
+       unsigned long rate;
+       int ret;
+
+       cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+       if (!cs)
+               return -ENOMEM;
+
+       ret = clk_prepare_enable(clk);
+       if (ret) {
+               pr_err("Couldn't enable parent clock\n");
+               goto err_free;
+       }
+
+       rate = clk_get_rate(clk);
+
+       cs->timer.base = base;
+       cs->timer.clk = clk;
+       cs->timer.clk_rate_cb.notifier_call = sun5i_rate_cb_clksrc;
+       cs->timer.clk_rate_cb.next = NULL;
+
+       ret = clk_notifier_register(clk, &cs->timer.clk_rate_cb);
+       if (ret) {
+               pr_err("Unable to register clock notifier.\n");
+               goto err_disable_clk;
+       }
+
+       writel(~0, base + TIMER_INTVAL_LO_REG(1));
+       writel(TIMER_CTL_ENABLE | TIMER_CTL_RELOAD,
+              base + TIMER_CTL_REG(1));
+
+       cs->clksrc.name = node->name;
+       cs->clksrc.rating = 340;
+       cs->clksrc.read = sun5i_clksrc_read;
+       cs->clksrc.mask = CLOCKSOURCE_MASK(32);
+       cs->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+
+       ret = clocksource_register_hz(&cs->clksrc, rate);
+       if (ret) {
+               pr_err("Couldn't register clock source.\n");
+               goto err_remove_notifier;
+       }
+
+       return 0;
+
+err_remove_notifier:
+       clk_notifier_unregister(clk, &cs->timer.clk_rate_cb);
+err_disable_clk:
+       clk_disable_unprepare(clk);
+err_free:
+       kfree(cs);
+       return ret;
+}
+
+static int sun5i_rate_cb_clkevt(struct notifier_block *nb,
+                               unsigned long event, void *data)
+{
+       struct clk_notifier_data *ndata = data;
+       struct sun5i_timer *timer = to_sun5i_timer(nb);
+       struct sun5i_timer_clkevt *ce = container_of(timer, struct sun5i_timer_clkevt, timer);
+
+       if (event == POST_RATE_CHANGE) {
+               clockevents_update_freq(&ce->clkevt, ndata->new_rate);
+               ce->timer.ticks_per_jiffy = DIV_ROUND_UP(ndata->new_rate, HZ);
+       }
+
+       return NOTIFY_DONE;
+}
+
+static int __init sun5i_setup_clockevent(struct device_node *node, void __iomem *base,
+                                        struct clk *clk, int irq)
+{
+       struct sun5i_timer_clkevt *ce;
+       unsigned long rate;
+       int ret;
+       u32 val;
+
+       ce = kzalloc(sizeof(*ce), GFP_KERNEL);
+       if (!ce)
+               return -ENOMEM;
+
+       ret = clk_prepare_enable(clk);
+       if (ret) {
+               pr_err("Couldn't enable parent clock\n");
+               goto err_free;
+       }
+
+       rate = clk_get_rate(clk);
+
+       ce->timer.base = base;
+       ce->timer.ticks_per_jiffy = DIV_ROUND_UP(rate, HZ);
+       ce->timer.clk = clk;
+       ce->timer.clk_rate_cb.notifier_call = sun5i_rate_cb_clkevt;
+       ce->timer.clk_rate_cb.next = NULL;
+
+       ret = clk_notifier_register(clk, &ce->timer.clk_rate_cb);
+       if (ret) {
+               pr_err("Unable to register clock notifier.\n");
+               goto err_disable_clk;
+       }
+
+       ce->clkevt.name = node->name;
+       ce->clkevt.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
+       ce->clkevt.set_next_event = sun5i_clkevt_next_event;
+       ce->clkevt.set_mode = sun5i_clkevt_mode;
+       ce->clkevt.rating = 340;
+       ce->clkevt.irq = irq;
+       ce->clkevt.cpumask = cpu_possible_mask;
+
+       /* Enable timer0 interrupt */
+       val = readl(base + TIMER_IRQ_EN_REG);
+       writel(val | TIMER_IRQ_EN(0), base + TIMER_IRQ_EN_REG);
+
+       clockevents_config_and_register(&ce->clkevt, rate,
+                                       TIMER_SYNC_TICKS, 0xffffffff);
+
+       ret = request_irq(irq, sun5i_timer_interrupt, IRQF_TIMER | IRQF_IRQPOLL,
+                         "sun5i_timer0", ce);
+       if (ret) {
+               pr_err("Unable to register interrupt\n");
+               goto err_remove_notifier;
+       }
+
+       return 0;
+
+err_remove_notifier:
+       clk_notifier_unregister(clk, &ce->timer.clk_rate_cb);
+err_disable_clk:
+       clk_disable_unprepare(clk);
+err_free:
+       kfree(ce);
+       return ret;
+}
 
 static void __init sun5i_timer_init(struct device_node *node)
 {
        struct reset_control *rstc;
-       unsigned long rate;
+       void __iomem *timer_base;
        struct clk *clk;
-       int ret, irq;
-       u32 val;
+       int irq;
 
-       timer_base = of_iomap(node, 0);
+       timer_base = of_io_request_and_map(node, 0, of_node_full_name(node));
        if (!timer_base)
                panic("Can't map registers");
 
@@ -155,35 +334,13 @@ static void __init sun5i_timer_init(struct device_node *node)
        clk = of_clk_get(node, 0);
        if (IS_ERR(clk))
                panic("Can't get timer clock");
-       clk_prepare_enable(clk);
-       rate = clk_get_rate(clk);
 
        rstc = of_reset_control_get(node, NULL);
        if (!IS_ERR(rstc))
                reset_control_deassert(rstc);
 
-       writel(~0, timer_base + TIMER_INTVAL_LO_REG(1));
-       writel(TIMER_CTL_ENABLE | TIMER_CTL_RELOAD,
-              timer_base + TIMER_CTL_REG(1));
-
-       clocksource_mmio_init(timer_base + TIMER_CNTVAL_LO_REG(1), node->name,
-                             rate, 340, 32, clocksource_mmio_readl_down);
-
-       ticks_per_jiffy = DIV_ROUND_UP(rate, HZ);
-
-       /* Enable timer0 interrupt */
-       val = readl(timer_base + TIMER_IRQ_EN_REG);
-       writel(val | TIMER_IRQ_EN(0), timer_base + TIMER_IRQ_EN_REG);
-
-       sun5i_clockevent.cpumask = cpu_possible_mask;
-       sun5i_clockevent.irq = irq;
-
-       clockevents_config_and_register(&sun5i_clockevent, rate,
-                                       TIMER_SYNC_TICKS, 0xffffffff);
-
-       ret = setup_irq(irq, &sun5i_timer_irq);
-       if (ret)
-               pr_warn("failed to setup irq %d\n", irq);
+       sun5i_setup_clocksource(node, timer_base, clk, irq);
+       sun5i_setup_clockevent(node, timer_base, clk, irq);
 }
 CLOCKSOURCE_OF_DECLARE(sun5i_a13, "allwinner,sun5i-a13-hstimer",
                       sun5i_timer_init);
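
Editorial note on the rewrite above: the file-scope timer_base/ticks_per_jiffy globals are replaced by per-instance state, with a struct sun5i_timer embedded in both the clocksource and clockevent wrappers; every callback that only receives a struct clocksource *, clock_event_device * or notifier_block * climbs back to its wrapper with container_of(), and the clk rate-change notifier keeps the registered frequency in sync with the parent clock. A small, compilable illustration of that recovery pattern (all demo_* names are hypothetical; the inner anonymous structs stand in for notifier_block and clock_event_device):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_timer {
    unsigned long ticks_per_jiffy;
    struct { int dummy; } clk_rate_cb;      /* stands in for notifier_block */
};

struct demo_timer_clkevt {
    struct demo_timer timer;                /* embedded, not pointed to */
    struct { const char *name; } clkevt;    /* stands in for clock_event_device */
};

int main(void)
{
    struct demo_timer_clkevt ce = {
        .timer  = { .ticks_per_jiffy = 1000 },
        .clkevt = { .name = "demo" },
    };
    /* A callback handed only &ce.clkevt can climb straight back up: */
    struct demo_timer_clkevt *from_evt =
        container_of(&ce.clkevt, struct demo_timer_clkevt, clkevt);
    /* One handed only the notifier goes via the inner struct first: */
    struct demo_timer *t =
        container_of(&ce.timer.clk_rate_cb, struct demo_timer, clk_rate_cb);
    struct demo_timer_clkevt *from_nb =
        container_of(t, struct demo_timer_clkevt, timer);

    printf("%s %lu same=%d\n", from_evt->clkevt.name,
           from_nb->timer.ticks_per_jiffy, from_evt == from_nb);
    return 0;
}
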
index 28e59a48b35fdb993b28f621bdb0b065c0057e73..8ae655c364f48aeeeec7e4717aab6e6bfe2d9f95 100644 (file)
@@ -1698,15 +1698,18 @@ void cpufreq_resume(void)
                    || __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS))
                        pr_err("%s: Failed to start governor for policy: %p\n",
                                __func__, policy);
-
-               /*
-                * schedule call cpufreq_update_policy() for boot CPU, i.e. last
-                * policy in list. It will verify that the current freq is in
-                * sync with what we believe it to be.
-                */
-               if (list_is_last(&policy->policy_list, &cpufreq_policy_list))
-                       schedule_work(&policy->update);
        }
+
+       /*
+        * Schedule a call to cpufreq_update_policy() for the first online CPU,
+        * as that one won't be hotplugged out on suspend. It will verify that the
+        * current freq is in sync with what we believe it to be.
+        */
+       policy = cpufreq_cpu_get_raw(cpumask_first(cpu_online_mask));
+       if (WARN_ON(!policy))
+               return;
+
+       schedule_work(&policy->update);
 }
 
 /**
index 080bd2dbde4ba5408504a451e9454b51497202e3..7a73a279e179a52b9ea209e00b3f61f797c4d4cb 100644 (file)
@@ -330,9 +330,6 @@ int cpuidle_enable_device(struct cpuidle_device *dev)
        if (!dev->registered)
                return -EINVAL;
 
-       if (!dev->state_count)
-               dev->state_count = drv->state_count;
-
        ret = cpuidle_add_device_sysfs(dev);
        if (ret)
                return ret;
index 2697e87d5b34ff9ae520f1130220afb4d0876dff..5db147859b9047db626d66e64bef2897a41822f2 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/sched.h>
 #include <linux/cpuidle.h>
 #include <linux/cpumask.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 
 #include "cpuidle.h"
 
@@ -130,21 +130,20 @@ static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)
 #endif
 
 /**
- * cpuidle_setup_broadcast_timer - enable/disable the broadcast timer
+ * cpuidle_setup_broadcast_timer - enable/disable the broadcast timer on a cpu
  * @arg: a void pointer used to match the SMP cross call API
  *
- * @arg is used as a value of type 'long' with one of the two values:
- * - CLOCK_EVT_NOTIFY_BROADCAST_ON
- * - CLOCK_EVT_NOTIFY_BROADCAST_OFF
+ * If @arg is NULL broadcast is disabled otherwise enabled
  *
- * Set the broadcast timer notification for the current CPU.  This function
- * is executed per CPU by an SMP cross call.  It not supposed to be called
- * directly.
+ * This function is executed per CPU by an SMP cross call.  It's not
+ * supposed to be called directly.
  */
 static void cpuidle_setup_broadcast_timer(void *arg)
 {
-       int cpu = smp_processor_id();
-       clockevents_notify((long)(arg), &cpu);
+       if (arg)
+               tick_broadcast_enable();
+       else
+               tick_broadcast_disable();
 }
 
 /**
@@ -239,7 +238,7 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv)
 
        if (drv->bctimer)
                on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
-                                (void *)CLOCK_EVT_NOTIFY_BROADCAST_ON, 1);
+                                (void *)1, 1);
 
        poll_idle_init(drv);
 
@@ -263,7 +262,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
        if (drv->bctimer) {
                drv->bctimer = 0;
                on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
-                                (void *)CLOCK_EVT_NOTIFY_BROADCAST_OFF, 1);
+                                NULL, 1);
        }
 
        __cpuidle_unset_driver(drv);
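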
index 97c5903b4606cf0fa02aecec826cbed605796234..832a2c3f01ffccf691842507955078299799eb86 100644 (file)
@@ -401,7 +401,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device)
        struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device);
 
        /* state statistics */
-       for (i = 0; i < device->state_count; i++) {
+       for (i = 0; i < drv->state_count; i++) {
                kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
                if (!kobj)
                        goto error_state;
@@ -433,9 +433,10 @@ error_state:
  */
 static void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
 {
+       struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device);
        int i;
 
-       for (i = 0; i < device->state_count; i++)
+       for (i = 0; i < drv->state_count; i++)
                cpuidle_free_state_kobj(device, i);
 }
 
index 0723096fb50ac125dbb471126ba396085307ff2e..c92d6a70ccf303c69cfdb127210a09a1262122bc 100644 (file)
@@ -475,6 +475,7 @@ static int bcm2835_dma_terminate_all(struct dma_chan *chan)
         * c->desc is NULL and exit.)
         */
        if (c->desc) {
+               bcm2835_dma_desc_free(&c->desc->vd);
                c->desc = NULL;
                bcm2835_dma_abort(c->chan_base);
 
index 512cb8e2805e797ef12760d2e445652186410137..ceedafbd23e01fcda6968fb04707ae141cff3a88 100644 (file)
@@ -903,6 +903,11 @@ static const struct cppi_glue_infos *get_glue_info(struct device *dev)
        return of_id->data;
 }
 
+#define CPPI41_DMA_BUSWIDTHS   (BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \
+                               BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \
+                               BIT(DMA_SLAVE_BUSWIDTH_3_BYTES) | \
+                               BIT(DMA_SLAVE_BUSWIDTH_4_BYTES))
+
 static int cppi41_dma_probe(struct platform_device *pdev)
 {
        struct cppi41_dd *cdd;
@@ -926,6 +931,10 @@ static int cppi41_dma_probe(struct platform_device *pdev)
        cdd->ddev.device_issue_pending = cppi41_dma_issue_pending;
        cdd->ddev.device_prep_slave_sg = cppi41_dma_prep_slave_sg;
        cdd->ddev.device_terminate_all = cppi41_stop_chan;
+       cdd->ddev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
+       cdd->ddev.src_addr_widths = CPPI41_DMA_BUSWIDTHS;
+       cdd->ddev.dst_addr_widths = CPPI41_DMA_BUSWIDTHS;
+       cdd->ddev.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
        cdd->ddev.dev = dev;
        INIT_LIST_HEAD(&cdd->ddev.channels);
        cpp41_dma_info.dma_cap = cdd->ddev.cap_mask;
index 4527a3ebeac446f58a4c3b3b8722d6caf16b1b63..84884418fd30fc73a700bde9c0bebd67d72ea0a5 100644 (file)
@@ -511,6 +511,9 @@ static void jz4740_dma_desc_free(struct virt_dma_desc *vdesc)
        kfree(container_of(vdesc, struct jz4740_dma_desc, vdesc));
 }
 
+#define JZ4740_DMA_BUSWIDTHS (BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \
+       BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | BIT(DMA_SLAVE_BUSWIDTH_4_BYTES))
+
 static int jz4740_dma_probe(struct platform_device *pdev)
 {
        struct jz4740_dmaengine_chan *chan;
@@ -548,6 +551,10 @@ static int jz4740_dma_probe(struct platform_device *pdev)
        dd->device_prep_dma_cyclic = jz4740_dma_prep_dma_cyclic;
        dd->device_config = jz4740_dma_slave_config;
        dd->device_terminate_all = jz4740_dma_terminate_all;
+       dd->src_addr_widths = JZ4740_DMA_BUSWIDTHS;
+       dd->dst_addr_widths = JZ4740_DMA_BUSWIDTHS;
+       dd->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
+       dd->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST;
        dd->dev = &pdev->dev;
        INIT_LIST_HEAD(&dd->channels);
 
index f15712f2fec6c06949c23f845eeb8bbb776b8907..ac336a961dea97be01894027b9f4bd16d5566bad 100644 (file)
@@ -859,9 +859,6 @@ int dma_async_device_register(struct dma_device *device)
        BUG_ON(!device->device_issue_pending);
        BUG_ON(!device->dev);
 
-       WARN(dma_has_cap(DMA_SLAVE, device->cap_mask) && !device->directions,
-            "this driver doesn't support generic slave capabilities reporting\n");
-
        /* note: this only matters in the
         * CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH=n case
         */
index 276157f22612dc18140b84294ab1ef49ecf46ebc..53dbd3b3384cfd8b00940f7a7cd752b28bcac71a 100644 (file)
@@ -260,6 +260,13 @@ static int edma_terminate_all(struct dma_chan *chan)
         */
        if (echan->edesc) {
                int cyclic = echan->edesc->cyclic;
+
+               /*
+                * free the running request descriptor
+                * since it is not in any of the vdesc lists
+                */
+               edma_desc_free(&echan->edesc->vdesc);
+
                echan->edesc = NULL;
                edma_stop(echan->ch_num);
                /* Move the cyclic channel back to default queue */
index 15cab7d79525914d862e5cd38344bc2e3046c95a..b4634109e0100905dd39d285f03d669c405cbef0 100644 (file)
@@ -193,8 +193,10 @@ static int moxart_terminate_all(struct dma_chan *chan)
 
        spin_lock_irqsave(&ch->vc.lock, flags);
 
-       if (ch->desc)
+       if (ch->desc) {
+               moxart_dma_desc_free(&ch->desc->vd);
                ch->desc = NULL;
+       }
 
        ctrl = readl(ch->base + REG_OFF_CTRL);
        ctrl &= ~(APB_DMA_ENABLE | APB_DMA_FIN_INT_EN | APB_DMA_ERR_INT_EN);
index 7dd6dd1216819543aae06f2d3dd2707d16917c24..167dbaf6574275a0fffd5590b7a169e811320b39 100644 (file)
@@ -981,6 +981,7 @@ static int omap_dma_terminate_all(struct dma_chan *chan)
         * c->desc is NULL and exit.)
         */
        if (c->desc) {
+               omap_dma_desc_free(&c->desc->vd);
                c->desc = NULL;
                /* Avoid stopping the dma twice */
                if (!c->paused)
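
Editorial note: the bcm2835, edma, moxart and omap-dma hunks above all fix the same leak. By the time terminate_all() runs, the descriptor currently executing has already been taken off the virt-dma descriptor lists (the edma comment spells this out), so the generic free-everything-on-the-lists step never sees it; it must be freed explicitly before the pointer is cleared. Condensed shape of the shared fix, with every demo_* name standing in for the driver-specific type or helper:

static void demo_terminate_in_flight(struct demo_chan *c)
{
    if (c->desc) {
        /* the running descriptor sits on no vdesc list: free it here */
        demo_desc_free(&c->desc->vd);
        c->desc = NULL;
        demo_stop_hw(c);
    }
}
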
index 69fac068669fde566f41013cefbdf48db023466c..6e45a43ffe8476686bcaee1157a7acc641fc3e6b 100644 (file)
@@ -17,7 +17,9 @@
  */
 static const char dmi_empty_string[] = "        ";
 
-static u16 __initdata dmi_ver;
+static u32 dmi_ver __initdata;
+static u32 dmi_len;
+static u16 dmi_num;
 /*
  * Catch too early calls to dmi_check_system():
  */
@@ -78,7 +80,7 @@ static const char * __init dmi_string(const struct dmi_header *dm, u8 s)
  *     We have to be cautious here. We have seen BIOSes with DMI pointers
  *     pointing to completely the wrong place for example
  */
-static void dmi_table(u8 *buf, u32 len, int num,
+static void dmi_table(u8 *buf,
                      void (*decode)(const struct dmi_header *, void *),
                      void *private_data)
 {
@@ -86,10 +88,13 @@ static void dmi_table(u8 *buf, u32 len, int num,
        int i = 0;
 
        /*
-        *      Stop when we see all the items the table claimed to have
-        *      OR we run off the end of the table (also happens)
+        * Stop when we have seen all the items the table claimed to have
+        * (SMBIOS < 3.0 only) OR we reach an end-of-table marker OR we run
+        * off the end of the table (should never happen but sometimes does
+        * on bogus implementations.)
         */
-       while ((i < num) && (data - buf + sizeof(struct dmi_header)) <= len) {
+       while ((!dmi_num || i < dmi_num) &&
+              (data - buf + sizeof(struct dmi_header)) <= dmi_len) {
                const struct dmi_header *dm = (const struct dmi_header *)data;
 
                /*
@@ -98,9 +103,9 @@ static void dmi_table(u8 *buf, u32 len, int num,
                 *  table in dmi_decode or dmi_string
                 */
                data += dm->length;
-               while ((data - buf < len - 1) && (data[0] || data[1]))
+               while ((data - buf < dmi_len - 1) && (data[0] || data[1]))
                        data++;
-               if (data - buf < len - 1)
+               if (data - buf < dmi_len - 1)
                        decode(dm, private_data);
 
                /*
@@ -115,8 +120,6 @@ static void dmi_table(u8 *buf, u32 len, int num,
 }
 
 static phys_addr_t dmi_base;
-static u32 dmi_len;
-static u16 dmi_num;
 
 static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
                void *))
@@ -127,7 +130,7 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
        if (buf == NULL)
                return -1;
 
-       dmi_table(buf, dmi_len, dmi_num, decode, NULL);
+       dmi_table(buf, decode, NULL);
 
        add_device_randomness(buf, dmi_len);
 
@@ -198,7 +201,7 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot,
         * the UUID are supposed to be little-endian encoded.  The specification
         * says that this is the de facto standard.
         */
-       if (dmi_ver >= 0x0206)
+       if (dmi_ver >= 0x020600)
                sprintf(s, "%pUL", d);
        else
                sprintf(s, "%pUB", d);
@@ -470,7 +473,7 @@ static void __init dmi_format_ids(char *buf, size_t len)
  */
 static int __init dmi_present(const u8 *buf)
 {
-       int smbios_ver;
+       u32 smbios_ver;
 
        if (memcmp(buf, "_SM_", 4) == 0 &&
            buf[5] < 32 && dmi_checksum(buf, buf[5])) {
@@ -503,14 +506,16 @@ static int __init dmi_present(const u8 *buf)
                if (dmi_walk_early(dmi_decode) == 0) {
                        if (smbios_ver) {
                                dmi_ver = smbios_ver;
-                               pr_info("SMBIOS %d.%d present.\n",
-                                      dmi_ver >> 8, dmi_ver & 0xFF);
+                               pr_info("SMBIOS %d.%d%s present.\n",
+                                       dmi_ver >> 8, dmi_ver & 0xFF,
+                                       (dmi_ver < 0x0300) ? "" : ".x");
                        } else {
                                dmi_ver = (buf[14] & 0xF0) << 4 |
                                           (buf[14] & 0x0F);
                                pr_info("Legacy DMI %d.%d present.\n",
                                       dmi_ver >> 8, dmi_ver & 0xFF);
                        }
+                       dmi_ver <<= 8;
                        dmi_format_ids(dmi_ids_string, sizeof(dmi_ids_string));
                        printk(KERN_DEBUG "DMI: %s\n", dmi_ids_string);
                        return 0;
@@ -528,25 +533,16 @@ static int __init dmi_smbios3_present(const u8 *buf)
 {
        if (memcmp(buf, "_SM3_", 5) == 0 &&
            buf[6] < 32 && dmi_checksum(buf, buf[6])) {
-               dmi_ver = get_unaligned_be16(buf + 7);
+               dmi_ver = get_unaligned_be32(buf + 6);
+               dmi_ver &= 0xFFFFFF;
+               dmi_num = 0;                    /* No longer specified */
                dmi_len = get_unaligned_le32(buf + 12);
                dmi_base = get_unaligned_le64(buf + 16);
 
-               /*
-                * The 64-bit SMBIOS 3.0 entry point no longer has a field
-                * containing the number of structures present in the table.
-                * Instead, it defines the table size as a maximum size, and
-                * relies on the end-of-table structure type (#127) to be used
-                * to signal the end of the table.
-                * So let's define dmi_num as an upper bound as well: each
-                * structure has a 4 byte header, so dmi_len / 4 is an upper
-                * bound for the number of structures in the table.
-                */
-               dmi_num = dmi_len / 4;
-
                if (dmi_walk_early(dmi_decode) == 0) {
-                       pr_info("SMBIOS %d.%d present.\n",
-                               dmi_ver >> 8, dmi_ver & 0xFF);
+                       pr_info("SMBIOS %d.%d.%d present.\n",
+                               dmi_ver >> 16, (dmi_ver >> 8) & 0xFF,
+                               dmi_ver & 0xFF);
                        dmi_format_ids(dmi_ids_string, sizeof(dmi_ids_string));
                        pr_debug("DMI: %s\n", dmi_ids_string);
                        return 0;
@@ -901,7 +897,7 @@ int dmi_walk(void (*decode)(const struct dmi_header *, void *),
        if (buf == NULL)
                return -1;
 
-       dmi_table(buf, dmi_len, dmi_num, decode, private_data);
+       dmi_table(buf, decode, private_data);
 
        dmi_unmap(buf);
        return 0;
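
Editorial note on the dmi_scan changes above: dmi_ver grows from u16 to u32 so it can hold the three-part SMBIOS 3.0 version (read big-endian from the _SM3_ entry point and masked to 24 bits); the legacy 2.x paths shift their two-part version left by 8 afterwards so later comparisons such as dmi_ver >= 0x020600 work for both encodings, and dmi_num == 0 now tells the table walker to rely on the end-of-table marker instead of a structure count. A compilable sketch of the packing and how the two banners decode it (values are examples only):

#include <stdio.h>
#include <stdint.h>

/* major.minor.rev packed as 0x00MMmmrr, matching the shifts in the hunks above. */
static uint32_t pack_smbios3(uint8_t major, uint8_t minor, uint8_t rev)
{
    return ((uint32_t)major << 16) | ((uint32_t)minor << 8) | rev;
}

int main(void)
{
    uint32_t v3 = pack_smbios3(3, 0, 0);      /* SMBIOS 3.0.0 (_SM3_) entry point */
    uint32_t v2 = ((2u << 8) | 8u) << 8;      /* legacy 2.8 after the <<= 8 shift */

    printf("SMBIOS %u.%u.%u present.\n", v3 >> 16, (v3 >> 8) & 0xFF, v3 & 0xFF);
    printf("Legacy DMI %u.%u present.\n", (v2 >> 16) & 0xFF, (v2 >> 8) & 0xFF);
    return 0;
}
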
index dcae482a9a17c51a554e5928664e1b960b297588..e29560e6b40b0e5f28a141e7c88ceba1bdfa22ff 100644 (file)
@@ -175,7 +175,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
        unsigned long initrd_addr;
        u64 initrd_size = 0;
        unsigned long fdt_addr = 0;  /* Original DTB */
-       u64 fdt_size = 0;  /* We don't get size from configuration table */
+       unsigned long fdt_size = 0;
        char *cmdline_ptr = NULL;
        int cmdline_size = 0;
        unsigned long new_fdt_addr;
@@ -239,8 +239,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
        } else {
                status = handle_cmdline_files(sys_table, image, cmdline_ptr,
                                              "dtb=",
-                                             ~0UL, (unsigned long *)&fdt_addr,
-                                             (unsigned long *)&fdt_size);
+                                             ~0UL, &fdt_addr, &fdt_size);
 
                if (status != EFI_SUCCESS) {
                        pr_efi_err(sys_table, "Failed to load device tree!\n");
@@ -252,7 +251,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
                pr_efi(sys_table, "Using DTB from command line\n");
        } else {
                /* Look for a device tree configuration table entry. */
-               fdt_addr = (uintptr_t)get_fdt(sys_table);
+               fdt_addr = (uintptr_t)get_fdt(sys_table, &fdt_size);
                if (fdt_addr)
                        pr_efi(sys_table, "Using DTB from configuration table\n");
        }
index 47437b16b18697c2d624eadd8513d9bdfa8f5299..e334a01cf92f8243392ddd66616c8563306ccf5d 100644 (file)
@@ -41,7 +41,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
                                            unsigned long fdt_addr,
                                            unsigned long fdt_size);
 
-void *get_fdt(efi_system_table_t *sys_table);
+void *get_fdt(efi_system_table_t *sys_table, unsigned long *fdt_size);
 
 void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size,
                     unsigned long desc_size, efi_memory_desc_t *runtime_map,
index 91da56c4fd540cb5d381848e582f3f4453568af3..ef5d764e2a27ea506775e7c117dc291b9749ded1 100644 (file)
@@ -323,7 +323,7 @@ fail:
        return EFI_LOAD_ERROR;
 }
 
-void *get_fdt(efi_system_table_t *sys_table)
+void *get_fdt(efi_system_table_t *sys_table, unsigned long *fdt_size)
 {
        efi_guid_t fdt_guid = DEVICE_TREE_GUID;
        efi_config_table_t *tables;
@@ -336,6 +336,11 @@ void *get_fdt(efi_system_table_t *sys_table)
        for (i = 0; i < sys_table->nr_tables; i++)
                if (efi_guidcmp(tables[i].guid, fdt_guid) == 0) {
                        fdt = (void *) tables[i].table;
+                       if (fdt_check_header(fdt) != 0) {
+                               pr_efi_err(sys_table, "Invalid header detected on UEFI supplied FDT, ignoring ...\n");
+                               return NULL;
+                       }
+                       *fdt_size = fdt_totalsize(fdt);
                        break;
         }
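
Editorial note: get_fdt() now reports the blob's size to its caller (the arm-stub hunk above stops treating the size as unknown) and rejects blobs whose header does not validate. fdt_check_header() and fdt_totalsize() are standard libfdt accessors; the wrapper below is a hypothetical condensation of the same check:

#include <libfdt.h>

/* demo_get_fdt() is hypothetical; it mirrors the validation added above. */
static void *demo_get_fdt(void *table_entry, unsigned long *fdt_size)
{
    void *fdt = table_entry;

    if (!fdt || fdt_check_header(fdt) != 0)
        return NULL;                 /* not a sane FDT: ignore it */

    *fdt_size = fdt_totalsize(fdt);  /* size taken from the blob's own header */
    return fdt;
}
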
 
index a6952ba343a89747b919b45aa4d9e07951762fcd..a65b75161aa49d16749258407db18c7cfc30f79f 100644 (file)
@@ -334,7 +334,7 @@ static struct irq_domain_ops mpc8xxx_gpio_irq_ops = {
        .xlate  = irq_domain_xlate_twocell,
 };
 
-static struct of_device_id mpc8xxx_gpio_ids[] __initdata = {
+static struct of_device_id mpc8xxx_gpio_ids[] = {
        { .compatible = "fsl,mpc8349-gpio", },
        { .compatible = "fsl,mpc8572-gpio", },
        { .compatible = "fsl,mpc8610-gpio", },
index 257e2989215c035b87fbef2f7b086c58b722e266..045a952576c708e253de29438abaec95640989f2 100644 (file)
@@ -219,7 +219,7 @@ static int syscon_gpio_probe(struct platform_device *pdev)
                ret = of_property_read_u32_index(np, "gpio,syscon-dev", 2,
                                                 &priv->dir_reg_offset);
                if (ret)
-                       dev_err(dev, "can't read the dir register offset!\n");
+                       dev_dbg(dev, "can't read the dir register offset!\n");
 
                priv->dir_reg_offset <<= 3;
        }
index c0929d938ced866e343e0230e00fc5c9cda77c0b..df990f29757a7e045fd8a42760944b2d8bd2ba84 100644 (file)
@@ -201,6 +201,10 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares,
        if (!handler)
                return AE_BAD_PARAMETER;
 
+       pin = acpi_gpiochip_pin_to_gpio_offset(chip, pin);
+       if (pin < 0)
+               return AE_BAD_PARAMETER;
+
        desc = gpiochip_request_own_desc(chip, pin, "ACPI:Event");
        if (IS_ERR(desc)) {
                dev_err(chip->dev, "Failed to request GPIO\n");
@@ -551,6 +555,12 @@ acpi_gpio_adr_space_handler(u32 function, acpi_physical_address address,
                struct gpio_desc *desc;
                bool found;
 
+               pin = acpi_gpiochip_pin_to_gpio_offset(chip, pin);
+               if (pin < 0) {
+                       status = AE_BAD_PARAMETER;
+                       goto out;
+               }
+
                mutex_lock(&achip->conn_lock);
 
                found = false;
index 679b10e34fb545f23c827f9699bfef1c9f4268bd..b6f076b213bcfde496a61824335b68d1760b5613 100644 (file)
@@ -2121,7 +2121,7 @@ int drm_mode_getconnector(struct drm_device *dev, void *data,
        connector = drm_connector_find(dev, out_resp->connector_id);
        if (!connector) {
                ret = -ENOENT;
-               goto out;
+               goto out_unlock;
        }
 
        for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++)
@@ -2201,6 +2201,8 @@ int drm_mode_getconnector(struct drm_device *dev, void *data,
 
 out:
        drm_modeset_unlock(&dev->mode_config.connection_mutex);
+
+out_unlock:
        mutex_unlock(&dev->mode_config.mutex);
 
        return ret;
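
Editorial note: the connector-not-found path used to jump to out:, which drops connection_mutex even though that lock is only taken further down, once the lookup has succeeded; the new out_unlock: label unwinds only the mutex actually held at that point. A sketch of the two-label unwind with hypothetical names:

/* Hypothetical two-lock lookup: each exit path unwinds exactly the
 * locks acquired so far, in reverse order. */
struct demo_dev {
    struct mutex config_mutex;      /* held for the whole lookup */
    struct mutex connection_mutex;  /* only taken once the object exists */
};

static int demo_get_object(struct demo_dev *dev, int id)
{
    struct demo_obj *obj;
    int ret = 0;

    mutex_lock(&dev->config_mutex);

    obj = demo_find(dev, id);       /* hypothetical lookup helper */
    if (!obj) {
        ret = -ENOENT;
        goto out_unlock;            /* connection_mutex was never taken */
    }

    mutex_lock(&dev->connection_mutex);
    /* ... fill in the reply from obj ... */
    mutex_unlock(&dev->connection_mutex);

out_unlock:
    mutex_unlock(&dev->config_mutex);
    return ret;
}
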
index 732cb6f8e653f58dee7102f0bb11b797cde2f5ad..4c0aa97aaf0399a111fe0d63be6d996bafbfd7f3 100644 (file)
@@ -287,6 +287,7 @@ int drm_load_edid_firmware(struct drm_connector *connector)
 
        drm_mode_connector_update_edid_property(connector, edid);
        ret = drm_add_edid_modes(connector, edid);
+       drm_edid_to_eld(connector, edid);
        kfree(edid);
 
        return ret;
index 6591d48c1b9d0f3bcc4a74a8da730833cbc4e839..3fee587bc284ebffc7b2050c69034cba9c1949da 100644 (file)
@@ -174,6 +174,7 @@ static int drm_helper_probe_single_connector_modes_merge_bits(struct drm_connect
                        struct edid *edid = (struct edid *) connector->edid_blob_ptr->data;
 
                        count = drm_add_edid_modes(connector, edid);
+                       drm_edid_to_eld(connector, edid);
                } else
                        count = (*connector_funcs->get_modes)(connector);
        }
index c300e22da8ac5f2f4294c27543e62291ee0a8739..33a10ce967eacdfbcbc679a8645baae4515b4f1b 100644 (file)
@@ -147,6 +147,7 @@ struct fimd_win_data {
        unsigned int            ovl_height;
        unsigned int            fb_width;
        unsigned int            fb_height;
+       unsigned int            fb_pitch;
        unsigned int            bpp;
        unsigned int            pixel_format;
        dma_addr_t              dma_addr;
@@ -532,13 +533,14 @@ static void fimd_win_mode_set(struct exynos_drm_crtc *crtc,
        win_data->offset_y = plane->crtc_y;
        win_data->ovl_width = plane->crtc_width;
        win_data->ovl_height = plane->crtc_height;
+       win_data->fb_pitch = plane->pitch;
        win_data->fb_width = plane->fb_width;
        win_data->fb_height = plane->fb_height;
        win_data->dma_addr = plane->dma_addr[0] + offset;
        win_data->bpp = plane->bpp;
        win_data->pixel_format = plane->pixel_format;
-       win_data->buf_offsize = (plane->fb_width - plane->crtc_width) *
-                               (plane->bpp >> 3);
+       win_data->buf_offsize =
+               plane->pitch - (plane->crtc_width * (plane->bpp >> 3));
        win_data->line_size = plane->crtc_width * (plane->bpp >> 3);
 
        DRM_DEBUG_KMS("offset_x = %d, offset_y = %d\n",
@@ -704,7 +706,7 @@ static void fimd_win_commit(struct exynos_drm_crtc *crtc, int zpos)
        writel(val, ctx->regs + VIDWx_BUF_START(win, 0));
 
        /* buffer end address */
-       size = win_data->fb_width * win_data->ovl_height * (win_data->bpp >> 3);
+       size = win_data->fb_pitch * win_data->ovl_height * (win_data->bpp >> 3);
        val = (unsigned long)(win_data->dma_addr + size);
        writel(val, ctx->regs + VIDWx_BUF_END(win, 0));
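
The pitch-based math above matters whenever the allocator pads scanlines beyond the visible width. A small standalone sketch with made-up numbers shows the difference between the old width-based and the new pitch-based offset calculation:

    #include <stdio.h>

    int main(void)
    {
        unsigned int crtc_width = 1366, fb_width = 1366, bpp = 32;
        unsigned int pitch = 5504;   /* bytes per scanline, padded for alignment (hypothetical) */

        /* old: assumes scanlines are exactly fb_width pixels wide */
        unsigned int old_offsize = (fb_width - crtc_width) * (bpp >> 3);   /* 0 bytes  */
        /* new: whatever gap the allocator left between visible pixels and the next line */
        unsigned int new_offsize = pitch - crtc_width * (bpp >> 3);        /* 40 bytes */
        unsigned int line_size   = crtc_width * (bpp >> 3);

        printf("line_size=%u old_offsize=%u new_offsize=%u\n",
               line_size, old_offsize, new_offsize);
        return 0;
    }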
 
index 3518bc4654c5c9381acdb8d7253395e0466c8adc..2e3bc57ea50e594dcdf739042c39169428ecbda1 100644 (file)
@@ -55,6 +55,7 @@ struct hdmi_win_data {
        unsigned int            fb_x;
        unsigned int            fb_y;
        unsigned int            fb_width;
+       unsigned int            fb_pitch;
        unsigned int            fb_height;
        unsigned int            src_width;
        unsigned int            src_height;
@@ -438,7 +439,7 @@ static void vp_video_buffer(struct mixer_context *ctx, int win)
        } else {
                luma_addr[0] = win_data->dma_addr;
                chroma_addr[0] = win_data->dma_addr
-                       + (win_data->fb_width * win_data->fb_height);
+                       + (win_data->fb_pitch * win_data->fb_height);
        }
 
        if (win_data->scan_flags & DRM_MODE_FLAG_INTERLACE) {
@@ -447,8 +448,8 @@ static void vp_video_buffer(struct mixer_context *ctx, int win)
                        luma_addr[1] = luma_addr[0] + 0x40;
                        chroma_addr[1] = chroma_addr[0] + 0x40;
                } else {
-                       luma_addr[1] = luma_addr[0] + win_data->fb_width;
-                       chroma_addr[1] = chroma_addr[0] + win_data->fb_width;
+                       luma_addr[1] = luma_addr[0] + win_data->fb_pitch;
+                       chroma_addr[1] = chroma_addr[0] + win_data->fb_pitch;
                }
        } else {
                ctx->interlace = false;
@@ -469,10 +470,10 @@ static void vp_video_buffer(struct mixer_context *ctx, int win)
        vp_reg_writemask(res, VP_MODE, val, VP_MODE_FMT_MASK);
 
        /* setting size of input image */
-       vp_reg_write(res, VP_IMG_SIZE_Y, VP_IMG_HSIZE(win_data->fb_width) |
+       vp_reg_write(res, VP_IMG_SIZE_Y, VP_IMG_HSIZE(win_data->fb_pitch) |
                VP_IMG_VSIZE(win_data->fb_height));
        /* chroma height has to be reduced by 2 to avoid chroma distortions */
-       vp_reg_write(res, VP_IMG_SIZE_C, VP_IMG_HSIZE(win_data->fb_width) |
+       vp_reg_write(res, VP_IMG_SIZE_C, VP_IMG_HSIZE(win_data->fb_pitch) |
                VP_IMG_VSIZE(win_data->fb_height / 2));
 
        vp_reg_write(res, VP_SRC_WIDTH, win_data->src_width);
@@ -559,7 +560,7 @@ static void mixer_graph_buffer(struct mixer_context *ctx, int win)
        /* converting dma address base and source offset */
        dma_addr = win_data->dma_addr
                + (win_data->fb_x * win_data->bpp >> 3)
-               + (win_data->fb_y * win_data->fb_width * win_data->bpp >> 3);
+               + (win_data->fb_y * win_data->fb_pitch);
        src_x_offset = 0;
        src_y_offset = 0;
 
@@ -576,7 +577,8 @@ static void mixer_graph_buffer(struct mixer_context *ctx, int win)
                MXR_GRP_CFG_FORMAT_VAL(fmt), MXR_GRP_CFG_FORMAT_MASK);
 
        /* setup geometry */
-       mixer_reg_write(res, MXR_GRAPHIC_SPAN(win), win_data->fb_width);
+       mixer_reg_write(res, MXR_GRAPHIC_SPAN(win),
+                       win_data->fb_pitch / (win_data->bpp >> 3));
 
        /* setup display size */
        if (ctx->mxr_ver == MXR_VER_128_0_0_184 &&
@@ -961,6 +963,7 @@ static void mixer_win_mode_set(struct exynos_drm_crtc *crtc,
        win_data->fb_y = plane->fb_y;
        win_data->fb_width = plane->fb_width;
        win_data->fb_height = plane->fb_height;
+       win_data->fb_pitch = plane->pitch;
        win_data->src_width = plane->src_width;
        win_data->src_height = plane->src_height;
 
index cc6ea53d2b81951553d4b135a1760cb127e574c2..5c66b568bb8162345046fcce0118c5bf74eddc2f 100644 (file)
@@ -1095,6 +1095,7 @@ static void vlv_save_gunit_s0ix_state(struct drm_i915_private *dev_priv)
        /* Gunit-Display CZ domain, 0x182028-0x1821CF */
        s->gu_ctl0              = I915_READ(VLV_GU_CTL0);
        s->gu_ctl1              = I915_READ(VLV_GU_CTL1);
+       s->pcbr                 = I915_READ(VLV_PCBR);
        s->clock_gate_dis2      = I915_READ(VLV_GUNIT_CLOCK_GATE2);
 
        /*
@@ -1189,6 +1190,7 @@ static void vlv_restore_gunit_s0ix_state(struct drm_i915_private *dev_priv)
        /* Gunit-Display CZ domain, 0x182028-0x1821CF */
        I915_WRITE(VLV_GU_CTL0,                 s->gu_ctl0);
        I915_WRITE(VLV_GU_CTL1,                 s->gu_ctl1);
+       I915_WRITE(VLV_PCBR,                    s->pcbr);
        I915_WRITE(VLV_GUNIT_CLOCK_GATE2,       s->clock_gate_dis2);
 }
 
@@ -1197,19 +1199,7 @@ int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool force_on)
        u32 val;
        int err;
 
-       val = I915_READ(VLV_GTLC_SURVIVABILITY_REG);
-       WARN_ON(!!(val & VLV_GFX_CLK_FORCE_ON_BIT) == force_on);
-
 #define COND (I915_READ(VLV_GTLC_SURVIVABILITY_REG) & VLV_GFX_CLK_STATUS_BIT)
-       /* Wait for a previous force-off to settle */
-       if (force_on) {
-               err = wait_for(!COND, 20);
-               if (err) {
-                       DRM_ERROR("timeout waiting for GFX clock force-off (%08x)\n",
-                                 I915_READ(VLV_GTLC_SURVIVABILITY_REG));
-                       return err;
-               }
-       }
 
        val = I915_READ(VLV_GTLC_SURVIVABILITY_REG);
        val &= ~VLV_GFX_CLK_FORCE_ON_BIT;
index 8727086cf48ccce9e6548df8cf4e1d0df59012e7..b4faa2df9d3d8151c4e7aff8853be20937414096 100644 (file)
@@ -1094,6 +1094,7 @@ struct vlv_s0ix_state {
        /* Display 2 CZ domain */
        u32 gu_ctl0;
        u32 gu_ctl1;
+       u32 pcbr;
        u32 clock_gate_dis2;
 };
 
index b773368fc62c8ac67717f6770ce20fd642a1bc8c..38a742532c4fa48ba2798d6aec910b546839090f 100644 (file)
@@ -1487,7 +1487,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
                goto err;
        }
 
-       if (i915_needs_cmd_parser(ring)) {
+       if (i915_needs_cmd_parser(ring) && args->batch_len) {
                batch_obj = i915_gem_execbuffer_parse(ring,
                                                      &shadow_exec_entry,
                                                      eb,
index 0a52c44ad03d6b21078fe7482ddc3b84813ac9c4..9c5451c97942801544bbe9a0e692810bb5a9444e 100644 (file)
@@ -1322,7 +1322,7 @@ int intel_sprite_set_colorkey(struct drm_device *dev, void *data,
        drm_modeset_lock_all(dev);
 
        plane = drm_plane_find(dev, set->plane_id);
-       if (!plane) {
+       if (!plane || plane->type != DRM_PLANE_TYPE_OVERLAY) {
                ret = -ENOENT;
                goto out_unlock;
        }
@@ -1349,7 +1349,7 @@ int intel_sprite_get_colorkey(struct drm_device *dev, void *data,
        drm_modeset_lock_all(dev);
 
        plane = drm_plane_find(dev, get->plane_id);
-       if (!plane) {
+       if (!plane || plane->type != DRM_PLANE_TYPE_OVERLAY) {
                ret = -ENOENT;
                goto out_unlock;
        }
index c648e1996dabac449dfb838e018cad85b2d3bb61..243a36c93b8f96c0f268ef9bfcae1721043c7240 100644 (file)
 #define VCE_UENC_REG_CLOCK_GATING      0x207c0
 #define VCE_SYS_INT_EN                 0x21300
 #      define VCE_SYS_INT_TRAP_INTERRUPT_EN    (1 << 3)
+#define VCE_LMI_VCPU_CACHE_40BIT_BAR   0x2145c
 #define VCE_LMI_CTRL2                  0x21474
 #define VCE_LMI_CTRL                   0x21498
 #define VCE_LMI_VM_CTRL                        0x214a0
index 5587603b4a891c1f2cfcf7873dd8aa9e173907df..33d5a4f4eebdc21118b733957a2e1f75230a55cb 100644 (file)
@@ -1565,6 +1565,7 @@ struct radeon_dpm {
        int                     new_active_crtc_count;
        u32                     current_active_crtcs;
        int                     current_active_crtc_count;
+       bool single_display;
        struct radeon_dpm_dynamic_state dyn_state;
        struct radeon_dpm_fan fan;
        u32 tdp_limit;
index 63ccb8fa799c209bc82db257d3da0c8fc60ba052..d27e4ccb848c9c60e8a14f71336b790b125d2231 100644 (file)
@@ -76,7 +76,7 @@ static bool igp_read_bios_from_vram(struct radeon_device *rdev)
 
 static bool radeon_read_bios(struct radeon_device *rdev)
 {
-       uint8_t __iomem *bios;
+       uint8_t __iomem *bios, val1, val2;
        size_t size;
 
        rdev->bios = NULL;
@@ -86,15 +86,19 @@ static bool radeon_read_bios(struct radeon_device *rdev)
                return false;
        }
 
-       if (size == 0 || bios[0] != 0x55 || bios[1] != 0xaa) {
+       val1 = readb(&bios[0]);
+       val2 = readb(&bios[1]);
+
+       if (size == 0 || val1 != 0x55 || val2 != 0xaa) {
                pci_unmap_rom(rdev->pdev, bios);
                return false;
        }
-       rdev->bios = kmemdup(bios, size, GFP_KERNEL);
+       rdev->bios = kzalloc(size, GFP_KERNEL);
        if (rdev->bios == NULL) {
                pci_unmap_rom(rdev->pdev, bios);
                return false;
        }
+       memcpy_fromio(rdev->bios, bios, size);
        pci_unmap_rom(rdev->pdev, bios);
        return true;
 }
index a69bd441dd2d0cc612b9fa4d9e7711934270e3c5..572b4dbec186a9d59e8066782773c86acdbfa46b 100644 (file)
@@ -122,7 +122,6 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
        it = interval_tree_iter_first(&rmn->objects, start, end);
        while (it) {
                struct radeon_bo *bo;
-               struct fence *fence;
                int r;
 
                bo = container_of(it, struct radeon_bo, mn_it);
@@ -134,12 +133,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
                        continue;
                }
 
-               fence = reservation_object_get_excl(bo->tbo.resv);
-               if (fence) {
-                       r = radeon_fence_wait((struct radeon_fence *)fence, false);
-                       if (r)
-                               DRM_ERROR("(%d) failed to wait for user bo\n", r);
-               }
+               r = reservation_object_wait_timeout_rcu(bo->tbo.resv, true,
+                       false, MAX_SCHEDULE_TIMEOUT);
+               if (r)
+                       DRM_ERROR("(%d) failed to wait for user bo\n", r);
 
                radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
                r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
index 33cf4108386dbba4ef70a0e372eb992d1ff7e4d3..c1ba83a8dd8c9333829aaa5f1384f65c7103d238 100644 (file)
@@ -837,12 +837,8 @@ static void radeon_dpm_thermal_work_handler(struct work_struct *work)
        radeon_pm_compute_clocks(rdev);
 }
 
-static struct radeon_ps *radeon_dpm_pick_power_state(struct radeon_device *rdev,
-                                                    enum radeon_pm_state_type dpm_state)
+static bool radeon_dpm_single_display(struct radeon_device *rdev)
 {
-       int i;
-       struct radeon_ps *ps;
-       u32 ui_class;
        bool single_display = (rdev->pm.dpm.new_active_crtc_count < 2) ?
                true : false;
 
@@ -858,6 +854,17 @@ static struct radeon_ps *radeon_dpm_pick_power_state(struct radeon_device *rdev,
        if (single_display && (r600_dpm_get_vrefresh(rdev) >= 120))
                single_display = false;
 
+       return single_display;
+}
+
+static struct radeon_ps *radeon_dpm_pick_power_state(struct radeon_device *rdev,
+                                                    enum radeon_pm_state_type dpm_state)
+{
+       int i;
+       struct radeon_ps *ps;
+       u32 ui_class;
+       bool single_display = radeon_dpm_single_display(rdev);
+
        /* certain older asics have a separate 3D performance state,
         * so try that first if the user selected performance
         */
@@ -983,6 +990,7 @@ static void radeon_dpm_change_power_state_locked(struct radeon_device *rdev)
        struct radeon_ps *ps;
        enum radeon_pm_state_type dpm_state;
        int ret;
+       bool single_display = radeon_dpm_single_display(rdev);
 
        /* if dpm init failed */
        if (!rdev->pm.dpm_enabled)
@@ -1007,6 +1015,9 @@ static void radeon_dpm_change_power_state_locked(struct radeon_device *rdev)
                /* vce just modifies an existing state so force a change */
                if (ps->vce_active != rdev->pm.dpm.vce_active)
                        goto force;
+               /* user has made a display change (such as timing) */
+               if (rdev->pm.dpm.single_display != single_display)
+                       goto force;
                if ((rdev->family < CHIP_BARTS) || (rdev->flags & RADEON_IS_IGP)) {
                        /* for pre-BTC and APUs if the num crtcs changed but state is the same,
                         * all we need to do is update the display configuration.
@@ -1069,6 +1080,7 @@ force:
 
        rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs;
        rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count;
+       rdev->pm.dpm.single_display = single_display;
 
        /* wait for the rings to drain */
        for (i = 0; i < RADEON_NUM_RINGS; i++) {
index 2456f69efd2310233fac5c75a314a9060abe1a17..8c7872339c2a6f5e94bf601c47ed421e5caee352 100644 (file)
@@ -495,7 +495,7 @@ static int radeon_debugfs_ring_info(struct seq_file *m, void *data)
        seq_printf(m, "%u free dwords in ring\n", ring->ring_free_dw);
        seq_printf(m, "%u dwords in ring\n", count);
 
-       if (!ring->ready)
+       if (!ring->ring)
                return 0;
 
        /* print 8 dw before current rptr as often it's the last executed
index d02aa1d0f5885408c877056bd4ac1ab0e1ed6f12..b292aca0f342d53856ec3eaf982b71fd8b0a7fa8 100644 (file)
@@ -598,6 +598,10 @@ static void radeon_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
        enum dma_data_direction direction = write ?
                DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
 
+       /* double check that we don't free the table twice */
+       if (!ttm->sg->sgl)
+               return;
+
        /* free the sg table and pages again */
        dma_unmap_sg(rdev->dev, ttm->sg->sgl, ttm->sg->nents, direction);
 
index 1ac7bb825a1b3bfecea601d76f8e33c2b8d581ff..fbbe78fbd087ae7c147a43b925f0ac0401c86466 100644 (file)
@@ -156,6 +156,9 @@ int vce_v2_0_resume(struct radeon_device *rdev)
        WREG32(VCE_LMI_SWAP_CNTL1, 0);
        WREG32(VCE_LMI_VM_CTRL, 0);
 
+       WREG32(VCE_LMI_VCPU_CACHE_40BIT_BAR, addr >> 8);
+
+       addr &= 0xff;
        size = RADEON_GPU_PAGE_ALIGN(rdev->vce_fw->size);
        WREG32(VCE_VCPU_CACHE_OFFSET0, addr & 0x7fffffff);
        WREG32(VCE_VCPU_CACHE_SIZE0, size);
index b0e58522780d48c49b9a1ebff182d5fa7a43d93c..5c979d0667a2210d2d73873e8766d07ea0facb22 100644 (file)
@@ -55,7 +55,7 @@
 
 #include <linux/kernel.h>
 #include <linux/cpuidle.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 #include <linux/sched.h>
 #include <linux/notifier.h>
@@ -638,12 +638,12 @@ static int intel_idle(struct cpuidle_device *dev,
                leave_mm(cpu);
 
        if (!(lapic_timer_reliable_states & (1 << (cstate))))
-               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+               tick_broadcast_enter();
 
        mwait_idle_with_hints(eax, ecx);
 
        if (!(lapic_timer_reliable_states & (1 << (cstate))))
-               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+               tick_broadcast_exit();
 
        return index;
 }
@@ -665,13 +665,12 @@ static void intel_idle_freeze(struct cpuidle_device *dev,
 
 static void __setup_broadcast_timer(void *arg)
 {
-       unsigned long reason = (unsigned long)arg;
-       int cpu = smp_processor_id();
-
-       reason = reason ?
-               CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
+       unsigned long on = (unsigned long)arg;
 
-       clockevents_notify(reason, &cpu);
+       if (on)
+               tick_broadcast_enable();
+       else
+               tick_broadcast_disable();
 }
 
 static int cpu_hotplug_notify(struct notifier_block *n,
index 1096da327130526080139e628fdd0a0130d32861..75c6d2103e07adad3b11919687e81f8dd7a8fca3 100644 (file)
@@ -659,7 +659,7 @@ static irqreturn_t bma180_trigger_handler(int irq, void *p)
 
        mutex_lock(&data->mutex);
 
-       for_each_set_bit(bit, indio_dev->buffer->scan_mask,
+       for_each_set_bit(bit, indio_dev->active_scan_mask,
                         indio_dev->masklength) {
                ret = bma180_get_data_reg(data, bit);
                if (ret < 0) {
index 066d0c04072c69943fa21313fb47f06c72fedcea..75567fd457dcc4b9bd7c147fdc4cb229cdbaf0c4 100644 (file)
@@ -168,14 +168,14 @@ static const struct {
        int val;
        int val2;
        u8 bw_bits;
-} bmc150_accel_samp_freq_table[] = { {7, 810000, 0x08},
-                                    {15, 630000, 0x09},
-                                    {31, 250000, 0x0A},
-                                    {62, 500000, 0x0B},
-                                    {125, 0, 0x0C},
-                                    {250, 0, 0x0D},
-                                    {500, 0, 0x0E},
-                                    {1000, 0, 0x0F} };
+} bmc150_accel_samp_freq_table[] = { {15, 620000, 0x08},
+                                    {31, 260000, 0x09},
+                                    {62, 500000, 0x0A},
+                                    {125, 0, 0x0B},
+                                    {250, 0, 0x0C},
+                                    {500, 0, 0x0D},
+                                    {1000, 0, 0x0E},
+                                    {2000, 0, 0x0F} };
 
 static const struct {
        int bw_bits;
@@ -840,7 +840,7 @@ static int bmc150_accel_validate_trigger(struct iio_dev *indio_dev,
 }
 
 static IIO_CONST_ATTR_SAMP_FREQ_AVAIL(
-               "7.810000 15.630000 31.250000 62.500000 125 250 500 1000");
+               "15.620000 31.260000 62.50000 125 250 500 1000 2000");
 
 static struct attribute *bmc150_accel_attributes[] = {
        &iio_const_attr_sampling_frequency_available.dev_attr.attr,
@@ -986,7 +986,7 @@ static irqreturn_t bmc150_accel_trigger_handler(int irq, void *p)
        int bit, ret, i = 0;
 
        mutex_lock(&data->mutex);
-       for_each_set_bit(bit, indio_dev->buffer->scan_mask,
+       for_each_set_bit(bit, indio_dev->active_scan_mask,
                         indio_dev->masklength) {
                ret = i2c_smbus_read_word_data(data->client,
                                               BMC150_ACCEL_AXIS_TO_REG(bit));
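
The corrected table pairs an integer part (val) with a fractional part in millionths (val2), which is how the sampling_frequency_available string above is spelled out. A quick standalone check, reusing the new table values:

    #include <stdio.h>

    struct samp { int val; int val2; unsigned char bw_bits; };

    static const struct samp freqs[] = {
        {15, 620000, 0x08}, {31, 260000, 0x09}, {62, 500000, 0x0A},
        {125, 0, 0x0B}, {250, 0, 0x0C}, {500, 0, 0x0D},
        {1000, 0, 0x0E}, {2000, 0, 0x0F},
    };

    int main(void)
    {
        /* val2 is in millionths, so {15, 620000} reads as 15.620000 Hz */
        for (unsigned i = 0; i < sizeof(freqs) / sizeof(freqs[0]); i++)
            printf("%d.%06d (bw_bits 0x%02X)\n",
                   freqs[i].val, freqs[i].val2, freqs[i].bw_bits);
        return 0;
    }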
index 567de269cc00650191541a98ac8d5818d0ea8661..1a6379525fa47e73497b17866be4276fc88c8065 100644 (file)
@@ -956,7 +956,7 @@ static irqreturn_t kxcjk1013_trigger_handler(int irq, void *p)
 
        mutex_lock(&data->mutex);
 
-       for_each_set_bit(bit, indio_dev->buffer->scan_mask,
+       for_each_set_bit(bit, indio_dev->active_scan_mask,
                         indio_dev->masklength) {
                ret = kxcjk1013_get_acc_reg(data, bit);
                if (ret < 0) {
index 202daf889be276315d24575b1301349f295fe4c8..46379b1fb25b59b10018a121b2cc8dc78082d4a6 100644 (file)
@@ -137,7 +137,8 @@ config AXP288_ADC
 
 config CC10001_ADC
        tristate "Cosmic Circuits 10001 ADC driver"
-       depends on HAS_IOMEM || HAVE_CLK || REGULATOR
+       depends on HAVE_CLK || REGULATOR
+       depends on HAS_IOMEM
        select IIO_BUFFER
        select IIO_TRIGGERED_BUFFER
        help
index ff61ae55dd3ff8ac73c0925c0b88fa6a75a1083d..8a0eb4a04fb55b9cb2436db5b16f678f654b8a26 100644 (file)
@@ -544,7 +544,6 @@ static int at91_adc_configure_trigger(struct iio_trigger *trig, bool state)
 {
        struct iio_dev *idev = iio_trigger_get_drvdata(trig);
        struct at91_adc_state *st = iio_priv(idev);
-       struct iio_buffer *buffer = idev->buffer;
        struct at91_adc_reg_desc *reg = st->registers;
        u32 status = at91_adc_readl(st, reg->trigger_register);
        int value;
@@ -564,7 +563,7 @@ static int at91_adc_configure_trigger(struct iio_trigger *trig, bool state)
                at91_adc_writel(st, reg->trigger_register,
                                status | value);
 
-               for_each_set_bit(bit, buffer->scan_mask,
+               for_each_set_bit(bit, idev->active_scan_mask,
                                 st->num_channels) {
                        struct iio_chan_spec const *chan = idev->channels + bit;
                        at91_adc_writel(st, AT91_ADC_CHER,
@@ -579,7 +578,7 @@ static int at91_adc_configure_trigger(struct iio_trigger *trig, bool state)
                at91_adc_writel(st, reg->trigger_register,
                                status & ~value);
 
-               for_each_set_bit(bit, buffer->scan_mask,
+               for_each_set_bit(bit, idev->active_scan_mask,
                                 st->num_channels) {
                        struct iio_chan_spec const *chan = idev->channels + bit;
                        at91_adc_writel(st, AT91_ADC_CHDR,
index 2e5cc4409f78884e82f309c729febabe3ff7f982..a0e7161f040c91daca74a469d27ff641be9ea915 100644 (file)
@@ -188,12 +188,11 @@ static int tiadc_buffer_preenable(struct iio_dev *indio_dev)
 static int tiadc_buffer_postenable(struct iio_dev *indio_dev)
 {
        struct tiadc_device *adc_dev = iio_priv(indio_dev);
-       struct iio_buffer *buffer = indio_dev->buffer;
        unsigned int enb = 0;
        u8 bit;
 
        tiadc_step_config(indio_dev);
-       for_each_set_bit(bit, buffer->scan_mask, adc_dev->channels)
+       for_each_set_bit(bit, indio_dev->active_scan_mask, adc_dev->channels)
                enb |= (get_adc_step_bit(adc_dev, bit) << 1);
        adc_dev->buffer_en_ch_steps = enb;
 
index 8ec353c01d98e02e7074df9d4d295ec19f772f09..e63b8e76d4c3d54edc25d23561f28a11afeb05e7 100644 (file)
@@ -141,9 +141,13 @@ struct vf610_adc {
        struct regulator *vref;
        struct vf610_adc_feature adc_feature;
 
+       u32 sample_freq_avail[5];
+
        struct completion completion;
 };
 
+static const u32 vf610_hw_avgs[] = { 1, 4, 8, 16, 32 };
+
 #define VF610_ADC_CHAN(_idx, _chan_type) {                     \
        .type = (_chan_type),                                   \
        .indexed = 1,                                           \
@@ -180,35 +184,47 @@ static const struct iio_chan_spec vf610_adc_iio_channels[] = {
        /* sentinel */
 };
 
-/*
- * ADC sample frequency, unit is ADCK cycles.
- * ADC clk source is ipg clock, which is the same as bus clock.
- *
- * ADC conversion time = SFCAdder + AverageNum x (BCT + LSTAdder)
- * SFCAdder: fixed to 6 ADCK cycles
- * AverageNum: 1, 4, 8, 16, 32 samples for hardware average.
- * BCT (Base Conversion Time): fixed to 25 ADCK cycles for 12 bit mode
- * LSTAdder(Long Sample Time): fixed to 3 ADCK cycles
- *
- * By default, enable 12 bit resolution mode, clock source
- * set to ipg clock, So get below frequency group:
- */
-static const u32 vf610_sample_freq_avail[5] =
-{1941176, 559332, 286957, 145374, 73171};
+static inline void vf610_adc_calculate_rates(struct vf610_adc *info)
+{
+       unsigned long adck_rate, ipg_rate = clk_get_rate(info->clk);
+       int i;
+
+       /*
+        * Calculate ADC sample frequencies
+        * Sample time unit is ADCK cycles. ADCK clk source is ipg clock,
+        * which is the same as bus clock.
+        *
+        * ADC conversion time = SFCAdder + AverageNum x (BCT + LSTAdder)
+        * SFCAdder: fixed to 6 ADCK cycles
+        * AverageNum: 1, 4, 8, 16, 32 samples for hardware average.
+        * BCT (Base Conversion Time): fixed to 25 ADCK cycles for 12 bit mode
+        * LSTAdder(Long Sample Time): fixed to 3 ADCK cycles
+        */
+       adck_rate = ipg_rate / info->adc_feature.clk_div;
+       for (i = 0; i < ARRAY_SIZE(vf610_hw_avgs); i++)
+               info->sample_freq_avail[i] =
+                       adck_rate / (6 + vf610_hw_avgs[i] * (25 + 3));
+}
 
 static inline void vf610_adc_cfg_init(struct vf610_adc *info)
 {
+       struct vf610_adc_feature *adc_feature = &info->adc_feature;
+
        /* set default Configuration for ADC controller */
-       info->adc_feature.clk_sel = VF610_ADCIOC_BUSCLK_SET;
-       info->adc_feature.vol_ref = VF610_ADCIOC_VR_VREF_SET;
+       adc_feature->clk_sel = VF610_ADCIOC_BUSCLK_SET;
+       adc_feature->vol_ref = VF610_ADCIOC_VR_VREF_SET;
+
+       adc_feature->calibration = true;
+       adc_feature->ovwren = true;
+
+       adc_feature->res_mode = 12;
+       adc_feature->sample_rate = 1;
+       adc_feature->lpm = true;
 
-       info->adc_feature.calibration = true;
-       info->adc_feature.ovwren = true;
+       /* Use a safe ADCK which is below 20MHz on all devices */
+       adc_feature->clk_div = 8;
 
-       info->adc_feature.clk_div = 1;
-       info->adc_feature.res_mode = 12;
-       info->adc_feature.sample_rate = 1;
-       info->adc_feature.lpm = true;
+       vf610_adc_calculate_rates(info);
 }
 
 static void vf610_adc_cfg_post_set(struct vf610_adc *info)
@@ -290,12 +306,10 @@ static void vf610_adc_cfg_set(struct vf610_adc *info)
 
        cfg_data = readl(info->regs + VF610_REG_ADC_CFG);
 
-       /* low power configuration */
        cfg_data &= ~VF610_ADC_ADLPC_EN;
        if (adc_feature->lpm)
                cfg_data |= VF610_ADC_ADLPC_EN;
 
-       /* disable high speed */
        cfg_data &= ~VF610_ADC_ADHSC_EN;
 
        writel(cfg_data, info->regs + VF610_REG_ADC_CFG);
@@ -435,10 +449,27 @@ static irqreturn_t vf610_adc_isr(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static IIO_CONST_ATTR_SAMP_FREQ_AVAIL("1941176, 559332, 286957, 145374, 73171");
+static ssize_t vf610_show_samp_freq_avail(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct vf610_adc *info = iio_priv(dev_to_iio_dev(dev));
+       size_t len = 0;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(info->sample_freq_avail); i++)
+               len += scnprintf(buf + len, PAGE_SIZE - len,
+                       "%u ", info->sample_freq_avail[i]);
+
+       /* replace trailing space by newline */
+       buf[len - 1] = '\n';
+
+       return len;
+}
+
+static IIO_DEV_ATTR_SAMP_FREQ_AVAIL(vf610_show_samp_freq_avail);
 
 static struct attribute *vf610_attributes[] = {
-       &iio_const_attr_sampling_frequency_available.dev_attr.attr,
+       &iio_dev_attr_sampling_frequency_available.dev_attr.attr,
        NULL
 };
 
@@ -502,7 +533,7 @@ static int vf610_read_raw(struct iio_dev *indio_dev,
                return IIO_VAL_FRACTIONAL_LOG2;
 
        case IIO_CHAN_INFO_SAMP_FREQ:
-               *val = vf610_sample_freq_avail[info->adc_feature.sample_rate];
+               *val = info->sample_freq_avail[info->adc_feature.sample_rate];
                *val2 = 0;
                return IIO_VAL_INT;
 
@@ -525,9 +556,9 @@ static int vf610_write_raw(struct iio_dev *indio_dev,
        switch (mask) {
                case IIO_CHAN_INFO_SAMP_FREQ:
                        for (i = 0;
-                               i < ARRAY_SIZE(vf610_sample_freq_avail);
+                               i < ARRAY_SIZE(info->sample_freq_avail);
                                i++)
-                               if (val == vf610_sample_freq_avail[i]) {
+                               if (val == info->sample_freq_avail[i]) {
                                        info->adc_feature.sample_rate = i;
                                        vf610_adc_sample_set(info);
                                        return 0;
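
A standalone sketch of the table that vf610_adc_calculate_rates() builds, using the conversion-time formula quoted in the comment above, the divider of 8 from vf610_adc_cfg_init(), and an assumed 66 MHz ipg clock (the real driver reads the rate from the clock tree):

    #include <stdio.h>

    int main(void)
    {
        const unsigned int hw_avgs[] = { 1, 4, 8, 16, 32 };
        unsigned long ipg_rate = 66000000;        /* assumed; clk_get_rate() supplies this at runtime */
        unsigned long adck_rate = ipg_rate / 8;   /* clk_div = 8 keeps ADCK below 20 MHz */

        /* conversion time = SFCAdder(6) + AverageNum * (BCT(25) + LSTAdder(3)) ADCK cycles */
        for (int i = 0; i < 5; i++)
            printf("avg %2u -> %lu Hz\n", hw_avgs[i],
                   adck_rate / (6 + hw_avgs[i] * (25 + 3)));
        return 0;
    }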
index 60451b32824212a039b8ac9358751faa1af12819..ccf3ea7e1afa8ca1848937b9a8b92e1f2cefc8be 100644 (file)
@@ -822,7 +822,7 @@ static irqreturn_t bmg160_trigger_handler(int irq, void *p)
        int bit, ret, i = 0;
 
        mutex_lock(&data->mutex);
-       for_each_set_bit(bit, indio_dev->buffer->scan_mask,
+       for_each_set_bit(bit, indio_dev->active_scan_mask,
                         indio_dev->masklength) {
                ret = i2c_smbus_read_word_data(data->client,
                                               BMG160_AXIS_TO_REG(bit));
index e0017c22bb9c6ce3b4f4f9c753a8ce1364621b19..f53e9a803a0e1ec1589a5b0ad25d7aa1f8d54e70 100644 (file)
@@ -60,7 +60,7 @@ int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev)
        iio_trigger_set_drvdata(adis->trig, adis);
        ret = iio_trigger_register(adis->trig);
 
-       indio_dev->trig = adis->trig;
+       indio_dev->trig = iio_trigger_get(adis->trig);
        if (ret)
                goto error_free_irq;
 
index d8d5bed65e072cae577968edb78e2e592c2a5bfa..ef76afe2643cb0bebe512124ca8c9326e09229c2 100644 (file)
@@ -410,42 +410,46 @@ error_read_raw:
        }
 }
 
-static int inv_mpu6050_write_fsr(struct inv_mpu6050_state *st, int fsr)
+static int inv_mpu6050_write_gyro_scale(struct inv_mpu6050_state *st, int val)
 {
-       int result;
+       int result, i;
        u8 d;
 
-       if (fsr < 0 || fsr > INV_MPU6050_MAX_GYRO_FS_PARAM)
-               return -EINVAL;
-       if (fsr == st->chip_config.fsr)
-               return 0;
+       for (i = 0; i < ARRAY_SIZE(gyro_scale_6050); ++i) {
+               if (gyro_scale_6050[i] == val) {
+                       d = (i << INV_MPU6050_GYRO_CONFIG_FSR_SHIFT);
+                       result = inv_mpu6050_write_reg(st,
+                                       st->reg->gyro_config, d);
+                       if (result)
+                               return result;
 
-       d = (fsr << INV_MPU6050_GYRO_CONFIG_FSR_SHIFT);
-       result = inv_mpu6050_write_reg(st, st->reg->gyro_config, d);
-       if (result)
-               return result;
-       st->chip_config.fsr = fsr;
+                       st->chip_config.fsr = i;
+                       return 0;
+               }
+       }
 
-       return 0;
+       return -EINVAL;
 }
 
-static int inv_mpu6050_write_accel_fs(struct inv_mpu6050_state *st, int fs)
+static int inv_mpu6050_write_accel_scale(struct inv_mpu6050_state *st, int val)
 {
-       int result;
+       int result, i;
        u8 d;
 
-       if (fs < 0 || fs > INV_MPU6050_MAX_ACCL_FS_PARAM)
-               return -EINVAL;
-       if (fs == st->chip_config.accl_fs)
-               return 0;
+       for (i = 0; i < ARRAY_SIZE(accel_scale); ++i) {
+               if (accel_scale[i] == val) {
+                       d = (i << INV_MPU6050_ACCL_CONFIG_FSR_SHIFT);
+                       result = inv_mpu6050_write_reg(st,
+                                       st->reg->accl_config, d);
+                       if (result)
+                               return result;
 
-       d = (fs << INV_MPU6050_ACCL_CONFIG_FSR_SHIFT);
-       result = inv_mpu6050_write_reg(st, st->reg->accl_config, d);
-       if (result)
-               return result;
-       st->chip_config.accl_fs = fs;
+                       st->chip_config.accl_fs = i;
+                       return 0;
+               }
+       }
 
-       return 0;
+       return -EINVAL;
 }
 
 static int inv_mpu6050_write_raw(struct iio_dev *indio_dev,
@@ -471,10 +475,10 @@ static int inv_mpu6050_write_raw(struct iio_dev *indio_dev,
        case IIO_CHAN_INFO_SCALE:
                switch (chan->type) {
                case IIO_ANGL_VEL:
-                       result = inv_mpu6050_write_fsr(st, val);
+                       result = inv_mpu6050_write_gyro_scale(st, val2);
                        break;
                case IIO_ACCEL:
-                       result = inv_mpu6050_write_accel_fs(st, val);
+                       result = inv_mpu6050_write_accel_scale(st, val2);
                        break;
                default:
                        result = -EINVAL;
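
The rewritten helpers above look the requested scale value up in a table instead of treating the incoming number as a raw full-scale index. A minimal userspace model of that lookup (the table values are placeholders, not the driver's gyro_scale_6050/accel_scale entries):

    #include <stdio.h>

    /* placeholder scale table; the driver matches the IIO val2 it was handed */
    static const int scale_table[] = { 100, 200, 400, 800 };

    static int write_scale(int val, int *fsr_out)
    {
        for (unsigned i = 0; i < sizeof(scale_table) / sizeof(scale_table[0]); i++) {
            if (scale_table[i] == val) {
                *fsr_out = i;          /* the index doubles as the FSR register field */
                return 0;
            }
        }
        return -1;                     /* like -EINVAL: value not in the table */
    }

    int main(void)
    {
        int fsr = -1;
        int ret = write_scale(400, &fsr);
        printf("400 -> ret=%d fsr=%d\n", ret, fsr);
        printf("123 -> ret=%d\n", write_scale(123, &fsr));
        return 0;
    }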
index 0cd306a72a6e347391ea97d7a02d05e54b6c64ab..ba27e277511fc52585f8fa25fd20d49bca61be96 100644 (file)
 #include <linux/poll.h>
 #include "inv_mpu_iio.h"
 
+static void inv_clear_kfifo(struct inv_mpu6050_state *st)
+{
+       unsigned long flags;
+
+       /* take the spin lock sem to avoid interrupt kick in */
+       spin_lock_irqsave(&st->time_stamp_lock, flags);
+       kfifo_reset(&st->timestamps);
+       spin_unlock_irqrestore(&st->time_stamp_lock, flags);
+}
+
 int inv_reset_fifo(struct iio_dev *indio_dev)
 {
        int result;
@@ -50,6 +60,10 @@ int inv_reset_fifo(struct iio_dev *indio_dev)
                                        INV_MPU6050_BIT_FIFO_RST);
        if (result)
                goto reset_fifo_fail;
+
+       /* clear timestamps fifo */
+       inv_clear_kfifo(st);
+
        /* enable interrupt */
        if (st->chip_config.accl_fifo_enable ||
            st->chip_config.gyro_fifo_enable) {
@@ -83,16 +97,6 @@ reset_fifo_fail:
        return result;
 }
 
-static void inv_clear_kfifo(struct inv_mpu6050_state *st)
-{
-       unsigned long flags;
-
-       /* take the spin lock sem to avoid interrupt kick in */
-       spin_lock_irqsave(&st->time_stamp_lock, flags);
-       kfifo_reset(&st->timestamps);
-       spin_unlock_irqrestore(&st->time_stamp_lock, flags);
-}
-
 /**
  * inv_mpu6050_irq_handler() - Cache a timestamp at each data ready interrupt.
  */
@@ -184,7 +188,6 @@ end_session:
 flush_fifo:
        /* Flush HW and SW FIFOs. */
        inv_reset_fifo(indio_dev);
-       inv_clear_kfifo(st);
        mutex_unlock(&indio_dev->mlock);
        iio_trigger_notify_done(indio_dev->trig);
 
index 5cc3692acf377664dbf255a698bb5e64606fe864..b3a36376c719317006cf8b9d30e1369f8b6af8fb 100644 (file)
@@ -1227,7 +1227,7 @@ static irqreturn_t kmx61_trigger_handler(int irq, void *p)
                base = KMX61_MAG_XOUT_L;
 
        mutex_lock(&data->lock);
-       for_each_set_bit(bit, indio_dev->buffer->scan_mask,
+       for_each_set_bit(bit, indio_dev->active_scan_mask,
                         indio_dev->masklength) {
                ret = kmx61_read_measurement(data, base, bit);
                if (ret < 0) {
index aaba9d3d980ee623ad6b78111a9c9708198b23e0..4df97f650e448e80053e037fed6d5e208493c687 100644 (file)
@@ -847,8 +847,7 @@ static int iio_device_add_channel_sysfs(struct iio_dev *indio_dev,
  * @attr_list: List of IIO device attributes
  *
  * This function frees the memory allocated for each of the IIO device
- * attributes in the list. Note: if you want to reuse the list after calling
- * this function you have to reinitialize it using INIT_LIST_HEAD().
+ * attributes in the list.
  */
 void iio_free_chan_devattr_list(struct list_head *attr_list)
 {
@@ -856,6 +855,7 @@ void iio_free_chan_devattr_list(struct list_head *attr_list)
 
        list_for_each_entry_safe(p, n, attr_list, l) {
                kfree(p->dev_attr.attr.name);
+               list_del(&p->l);
                kfree(p);
        }
 }
@@ -936,6 +936,7 @@ static void iio_device_unregister_sysfs(struct iio_dev *indio_dev)
 
        iio_free_chan_devattr_list(&indio_dev->channel_attr_list);
        kfree(indio_dev->chan_attr_group.attrs);
+       indio_dev->chan_attr_group.attrs = NULL;
 }
 
 static void iio_dev_release(struct device *device)
index a4b397048f71f9fe2e22be46f45fca0fb06cb8ff..a99692ba91bc75fb3186f69f2d55bfc9efd6652c 100644 (file)
@@ -500,6 +500,7 @@ int iio_device_register_eventset(struct iio_dev *indio_dev)
 error_free_setup_event_lines:
        iio_free_chan_devattr_list(&indio_dev->event_interface->dev_attr_list);
        kfree(indio_dev->event_interface);
+       indio_dev->event_interface = NULL;
        return ret;
 }
 
index 74dff4e4a11acdda1ec44ec6eaec75ef4d0543e4..89fca3a7075039b9b9a0de5514c1b96758d4593b 100644 (file)
@@ -494,7 +494,7 @@ static irqreturn_t sx9500_trigger_handler(int irq, void *private)
 
        mutex_lock(&data->mutex);
 
-       for_each_set_bit(bit, indio_dev->buffer->scan_mask,
+       for_each_set_bit(bit, indio_dev->active_scan_mask,
                         indio_dev->masklength) {
                ret = sx9500_read_proximity(data, &indio_dev->channels[bit],
                                            &val);
index aec7a6aa2951db47bc6b5be969a29d1867688b23..8c014b5dab4c82ff805a744c89655555a56094fc 100644 (file)
@@ -99,6 +99,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
        if (dmasync)
                dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
 
+       /*
+        * If the combination of the addr and size requested for this memory
+        * region causes an integer overflow, return error.
+        */
+       if ((PAGE_ALIGN(addr + size) <= size) ||
+           (PAGE_ALIGN(addr + size) <= addr))
+               return ERR_PTR(-EINVAL);
+
        if (!can_do_mlock())
                return ERR_PTR(-EPERM);
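
A self-contained demonstration of the new overflow check; PAGE_SIZE and PAGE_ALIGN are redefined locally for illustration (in the kernel they come from the page-table headers):

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    static int umem_range_ok(unsigned long addr, unsigned long size)
    {
        /* addr + size wrapped around (or aligned to 0): reject, as the added check does */
        if ((PAGE_ALIGN(addr + size) <= size) ||
            (PAGE_ALIGN(addr + size) <= addr))
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", umem_range_ok(0x1000, 0x2000));        /* 1: sane range      */
        printf("%d\n", umem_range_ok(~0UL - 0x100, 0x1000));  /* 0: addr+size wraps */
        return 0;
    }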
 
index 1bd15ebc01f2df5002eca38f7089a61701471f5d..27bcdbc950c9fc2df9067504cfe055bf379eee78 100644 (file)
@@ -1154,10 +1154,28 @@ out:
        mutex_unlock(&alps_mutex);
 }
 
-static void alps_report_bare_ps2_packet(struct input_dev *dev,
+static void alps_report_bare_ps2_packet(struct psmouse *psmouse,
                                        unsigned char packet[],
                                        bool report_buttons)
 {
+       struct alps_data *priv = psmouse->private;
+       struct input_dev *dev;
+
+       /* Figure out which device to use to report the bare packet */
+       if (priv->proto_version == ALPS_PROTO_V2 &&
+           (priv->flags & ALPS_DUALPOINT)) {
+               /* On V2 devices the DualPoint Stick reports bare packets */
+               dev = priv->dev2;
+       } else if (unlikely(IS_ERR_OR_NULL(priv->dev3))) {
+               /* Register dev3 mouse if we received PS/2 packet first time */
+               if (!IS_ERR(priv->dev3))
+                       psmouse_queue_work(psmouse, &priv->dev3_register_work,
+                                          0);
+               return;
+       } else {
+               dev = priv->dev3;
+       }
+
        if (report_buttons)
                alps_report_buttons(dev, NULL,
                                packet[0] & 1, packet[0] & 2, packet[0] & 4);
@@ -1232,8 +1250,8 @@ static psmouse_ret_t alps_handle_interleaved_ps2(struct psmouse *psmouse)
                 * de-synchronization.
                 */
 
-               alps_report_bare_ps2_packet(priv->dev2,
-                                           &psmouse->packet[3], false);
+               alps_report_bare_ps2_packet(psmouse, &psmouse->packet[3],
+                                           false);
 
                /*
                 * Continue with the standard ALPS protocol handling,
@@ -1289,18 +1307,9 @@ static psmouse_ret_t alps_process_byte(struct psmouse *psmouse)
         * properly we only do this if the device is fully synchronized.
         */
        if (!psmouse->out_of_sync_cnt && (psmouse->packet[0] & 0xc8) == 0x08) {
-
-               /* Register dev3 mouse if we received PS/2 packet first time */
-               if (unlikely(!priv->dev3))
-                       psmouse_queue_work(psmouse,
-                                          &priv->dev3_register_work, 0);
-
                if (psmouse->pktcnt == 3) {
-                       /* Once dev3 mouse device is registered report data */
-                       if (likely(!IS_ERR_OR_NULL(priv->dev3)))
-                               alps_report_bare_ps2_packet(priv->dev3,
-                                                           psmouse->packet,
-                                                           true);
+                       alps_report_bare_ps2_packet(psmouse, psmouse->packet,
+                                                   true);
                        return PSMOUSE_FULL_PACKET;
                }
                return PSMOUSE_GOOD_DATA;
@@ -2281,10 +2290,12 @@ static int alps_set_protocol(struct psmouse *psmouse,
                priv->set_abs_params = alps_set_abs_params_mt;
                priv->nibble_commands = alps_v3_nibble_commands;
                priv->addr_command = PSMOUSE_CMD_RESET_WRAP;
-               priv->x_max = 1360;
-               priv->y_max = 660;
                priv->x_bits = 23;
                priv->y_bits = 12;
+
+               if (alps_dolphin_get_device_area(psmouse, priv))
+                       return -EIO;
+
                break;
 
        case ALPS_PROTO_V6:
@@ -2303,9 +2314,8 @@ static int alps_set_protocol(struct psmouse *psmouse,
                priv->set_abs_params = alps_set_abs_params_mt;
                priv->nibble_commands = alps_v3_nibble_commands;
                priv->addr_command = PSMOUSE_CMD_RESET_WRAP;
-
-               if (alps_dolphin_get_device_area(psmouse, priv))
-                       return -EIO;
+               priv->x_max = 0xfff;
+               priv->y_max = 0x7ff;
 
                if (priv->fw_ver[1] != 0xba)
                        priv->flags |= ALPS_BUTTONPAD;
index dda605836546847afbdbdd4c77c976134a914a2e..3b06c8a360b661f02ed3aa826e5a996b93e5b358 100644 (file)
@@ -152,6 +152,11 @@ static const struct min_max_quirk min_max_pnpid_table[] = {
                {ANY_BOARD_ID, ANY_BOARD_ID},
                1024, 5022, 2508, 4832
        },
+       {
+               (const char * const []){"LEN2006", NULL},
+               {2691, 2691},
+               1024, 5045, 2457, 4832
+       },
        {
                (const char * const []){"LEN2006", NULL},
                {ANY_BOARD_ID, ANY_BOARD_ID},
@@ -189,7 +194,7 @@ static const char * const topbuttonpad_pnp_ids[] = {
        "LEN2003",
        "LEN2004", /* L440 */
        "LEN2005",
-       "LEN2006",
+       "LEN2006", /* Edge E440/E540 */
        "LEN2007",
        "LEN2008",
        "LEN2009",
index fc13dd56953e1eb2cb25107ef017336546b733a7..a3adde6519f0a24b7150f8e69ceabed534ceaa38 100644 (file)
@@ -1288,10 +1288,13 @@ static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain,
                return 0;
 
        spin_lock_irqsave(&smmu_domain->pgtbl_lock, flags);
-       if (smmu_domain->smmu->features & ARM_SMMU_FEAT_TRANS_OPS)
+       if (smmu_domain->smmu->features & ARM_SMMU_FEAT_TRANS_OPS &&
+                       smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
                ret = arm_smmu_iova_to_phys_hard(domain, iova);
-       else
+       } else {
                ret = ops->iova_to_phys(ops, iova);
+       }
+
        spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, flags);
 
        return ret;
@@ -1556,7 +1559,7 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
                return -ENODEV;
        }
 
-       if (smmu->version == 1 || (!(id & ID0_ATOSNS) && (id & ID0_S1TS))) {
+       if ((id & ID0_S1TS) && ((smmu->version == 1) || (id & ID0_ATOSNS))) {
                smmu->features |= ARM_SMMU_FEAT_TRANS_OPS;
                dev_notice(smmu->dev, "\taddress translation ops\n");
        }
index ae4c1a854e57896fc64e33668369bebc23f70945..2d1e05bdbb53f5901035294a71c65231b004a338 100644 (file)
@@ -1742,9 +1742,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 
 static void domain_exit(struct dmar_domain *domain)
 {
-       struct dmar_drhd_unit *drhd;
-       struct intel_iommu *iommu;
        struct page *freelist = NULL;
+       int i;
 
        /* Domain 0 is reserved, so dont process it */
        if (!domain)
@@ -1764,8 +1763,8 @@ static void domain_exit(struct dmar_domain *domain)
 
        /* clear attached or cached domains */
        rcu_read_lock();
-       for_each_active_iommu(iommu, drhd)
-               iommu_detach_domain(domain, iommu);
+       for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus)
+               iommu_detach_domain(domain, g_iommus[i]);
        rcu_read_unlock();
 
        dma_free_pagelist(freelist);
index 10186cac7716e246ea8b8a6e915bf07970670f5b..bc39bdf7b99bf170d793965e28777d3c7a279537 100644 (file)
@@ -851,6 +851,7 @@ static int ipmmu_remove(struct platform_device *pdev)
 
 static const struct of_device_id ipmmu_of_ids[] = {
        { .compatible = "renesas,ipmmu-vmsa", },
+       { }
 };
 
 static struct platform_driver ipmmu_driver = {
index 596b0a9eee99a9f2ea1beaac726bddc4069c4937..9687f8afebffbb865256ba6677663e6c76702aa1 100644 (file)
@@ -169,7 +169,7 @@ static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr)
 
 static void its_encode_devid(struct its_cmd_block *cmd, u32 devid)
 {
-       cmd->raw_cmd[0] &= ~(0xffffUL << 32);
+       cmd->raw_cmd[0] &= BIT_ULL(32) - 1;
        cmd->raw_cmd[0] |= ((u64)devid) << 32;
 }
 
@@ -802,6 +802,7 @@ static int its_alloc_tables(struct its_node *its)
        int i;
        int psz = SZ_64K;
        u64 shr = GITS_BASER_InnerShareable;
+       u64 cache = GITS_BASER_WaWb;
 
        for (i = 0; i < GITS_BASER_NR_REGS; i++) {
                u64 val = readq_relaxed(its->base + GITS_BASER + i * 8);
@@ -848,7 +849,7 @@ retry_baser:
                val = (virt_to_phys(base)                                |
                       (type << GITS_BASER_TYPE_SHIFT)                   |
                       ((entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) |
-                      GITS_BASER_WaWb                                   |
+                      cache                                             |
                       shr                                               |
                       GITS_BASER_VALID);
 
@@ -874,9 +875,12 @@ retry_baser:
                         * Shareability didn't stick. Just use
                         * whatever the read reported, which is likely
                         * to be the only thing this redistributor
-                        * supports.
+                        * supports. If that's zero, make it
+                        * non-cacheable as well.
                         */
                        shr = tmp & GITS_BASER_SHAREABILITY_MASK;
+                       if (!shr)
+                               cache = GITS_BASER_nC;
                        goto retry_baser;
                }
 
@@ -980,16 +984,39 @@ static void its_cpu_init_lpis(void)
        tmp = readq_relaxed(rbase + GICR_PROPBASER);
 
        if ((tmp ^ val) & GICR_PROPBASER_SHAREABILITY_MASK) {
+               if (!(tmp & GICR_PROPBASER_SHAREABILITY_MASK)) {
+                       /*
+                        * The HW reports non-shareable, we must
+                        * remove the cacheability attributes as
+                        * well.
+                        */
+                       val &= ~(GICR_PROPBASER_SHAREABILITY_MASK |
+                                GICR_PROPBASER_CACHEABILITY_MASK);
+                       val |= GICR_PROPBASER_nC;
+                       writeq_relaxed(val, rbase + GICR_PROPBASER);
+               }
                pr_info_once("GIC: using cache flushing for LPI property table\n");
                gic_rdists->flags |= RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING;
        }
 
        /* set PENDBASE */
        val = (page_to_phys(pend_page) |
-              GICR_PROPBASER_InnerShareable |
-              GICR_PROPBASER_WaWb);
+              GICR_PENDBASER_InnerShareable |
+              GICR_PENDBASER_WaWb);
 
        writeq_relaxed(val, rbase + GICR_PENDBASER);
+       tmp = readq_relaxed(rbase + GICR_PENDBASER);
+
+       if (!(tmp & GICR_PENDBASER_SHAREABILITY_MASK)) {
+               /*
+                * The HW reports non-shareable, we must remove the
+                * cacheability attributes as well.
+                */
+               val &= ~(GICR_PENDBASER_SHAREABILITY_MASK |
+                        GICR_PENDBASER_CACHEABILITY_MASK);
+               val |= GICR_PENDBASER_nC;
+               writeq_relaxed(val, rbase + GICR_PENDBASER);
+       }
 
        /* Enable LPIs */
        val = readl_relaxed(rbase + GICR_CTLR);
@@ -1026,7 +1053,7 @@ static void its_cpu_init_collection(void)
                         * This ITS wants a linear CPU number.
                         */
                        target = readq_relaxed(gic_data_rdist_rd_base() + GICR_TYPER);
-                       target = GICR_TYPER_CPU_NUMBER(target);
+                       target = GICR_TYPER_CPU_NUMBER(target) << 16;
                }
 
                /* Perform collection mapping */
@@ -1422,14 +1449,26 @@ static int its_probe(struct device_node *node, struct irq_domain *parent)
 
        writeq_relaxed(baser, its->base + GITS_CBASER);
        tmp = readq_relaxed(its->base + GITS_CBASER);
-       writeq_relaxed(0, its->base + GITS_CWRITER);
-       writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
 
-       if ((tmp ^ baser) & GITS_BASER_SHAREABILITY_MASK) {
+       if ((tmp ^ baser) & GITS_CBASER_SHAREABILITY_MASK) {
+               if (!(tmp & GITS_CBASER_SHAREABILITY_MASK)) {
+                       /*
+                        * The HW reports non-shareable, we must
+                        * remove the cacheability attributes as
+                        * well.
+                        */
+                       baser &= ~(GITS_CBASER_SHAREABILITY_MASK |
+                                  GITS_CBASER_CACHEABILITY_MASK);
+                       baser |= GITS_CBASER_nC;
+                       writeq_relaxed(baser, its->base + GITS_CBASER);
+               }
                pr_info("ITS: using cache flushing for cmd queue\n");
                its->flags |= ITS_FLAGS_CMDQ_NEEDS_FLUSHING;
        }
 
+       writeq_relaxed(0, its->base + GITS_CWRITER);
+       writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR);
+
        if (of_property_read_bool(its->msi_chip.of_node, "msi-controller")) {
                its->domain = irq_domain_add_tree(NULL, &its_domain_ops, its);
                if (!its->domain) {
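
The GITS_CBASER, GICR_PROPBASER and GICR_PENDBASER hunks above all follow one negotiation pattern: program shareable write-allocate-write-back attributes, read the register back, and if the implementation dropped the shareability field, fall back to non-cacheable attributes and cache flushing. A toy model of that pattern with a fake register (the field encodings here are illustrative, not the architected values):

    #include <stdio.h>
    #include <stdint.h>

    #define SHAREABILITY_MASK  (0x3ULL << 10)
    #define CACHEABILITY_MASK  (0x7ULL << 59)
    #define ATTR_SHAREABLE     (0x1ULL << 10)
    #define ATTR_WAWB          (0x5ULL << 59)
    #define ATTR_NC            (0x1ULL << 59)

    /* fake HW register: this implementation silently clears shareability on write */
    static uint64_t reg;
    static void reg_write(uint64_t v) { reg = v & ~SHAREABILITY_MASK; }
    static uint64_t reg_read(void)    { return reg; }

    int main(void)
    {
        uint64_t val = 0x80000000ULL | ATTR_SHAREABLE | ATTR_WAWB;

        reg_write(val);
        uint64_t tmp = reg_read();

        if (!(tmp & SHAREABILITY_MASK)) {
            /* HW reports non-shareable: drop cacheability too and use flushing */
            val &= ~(SHAREABILITY_MASK | CACHEABILITY_MASK);
            val |= ATTR_NC;
            reg_write(val);
            printf("falling back to non-cacheable, flush-based table access\n");
        }
        printf("final value 0x%llx\n", (unsigned long long)reg_read());
        return 0;
    }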
index ee035ec4526bd802a89de51c6edc099d8bbb9747..169172d2ba05c8b4187b9151975dcd1be52043c2 100644 (file)
@@ -1,6 +1,6 @@
 config LGUEST
        tristate "Linux hypervisor example code"
-       depends on X86_32 && EVENTFD && TTY
+       depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
        select HVC_DRIVER
        ---help---
          This is a very simple module which allows you to run
index 717daad71fb101b2b97efd03b1bb41d8b35ecfe7..e6178787ce3dd4a9b0e33b80b72ac39f73e25e33 100644 (file)
@@ -249,6 +249,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
        const int rw = bio_data_dir(bio);
        struct mddev *mddev = q->queuedata;
        unsigned int sectors;
+       int cpu;
 
        if (mddev == NULL || mddev->pers == NULL
            || !mddev->ready) {
@@ -284,7 +285,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
        sectors = bio_sectors(bio);
        mddev->pers->make_request(mddev, bio);
 
-       generic_start_io_acct(rw, sectors, &mddev->gendisk->part0);
+       cpu = part_stat_lock();
+       part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+       part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+       part_stat_unlock();
 
        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                wake_up(&mddev->sb_wait);
index 3ed9f42ddca65e10351a1a453f16383e63c52634..3b5d7f704aa346ad27b4806c9e36ee9ba984edc7 100644 (file)
@@ -313,7 +313,7 @@ static struct strip_zone *find_zone(struct r0conf *conf,
 
 /*
  * remaps the bio to the target device. we separate two flows.
- * power 2 flow and a general flow for the sake of perfromance
+ * power 2 flow and a general flow for the sake of performance
 */
 static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
                                sector_t sector, sector_t *sector_offset)
@@ -524,6 +524,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                        split = bio;
                }
 
+               sector = bio->bi_iter.bi_sector;
                zone = find_zone(mddev->private, &sector);
                tmp_dev = map_sector(mddev, zone, sector, &sector);
                split->bi_bdev = tmp_dev->bdev;
index 5d2d8f45b4b62a1005e261383b7c858d3c5e539e..67faa8d6950efb1e4f8b24c0aad0e99027473b43 100644 (file)
@@ -1240,7 +1240,7 @@ static int rtl2832_probe(struct i2c_client *client,
        dev->regmap_config.max_register = 5 * 0x100,
        dev->regmap_config.ranges = regmap_range_cfg,
        dev->regmap_config.num_ranges = ARRAY_SIZE(regmap_range_cfg),
-       dev->regmap_config.cache_type = REGCACHE_RBTREE,
+       dev->regmap_config.cache_type = REGCACHE_NONE,
        dev->regmap = regmap_init(&client->dev, &regmap_bus, client,
                                  &dev->regmap_config);
        if (IS_ERR(dev->regmap)) {
index e4901a503c7366dc10a8ded1c61e84683f62124e..63c0ee5d0bf5eca7511df486fe300b3a468301b5 100644 (file)
@@ -1339,14 +1339,13 @@ static int vidioc_querycap(struct file *file, void  *priv,
        strlcpy(cap->driver, dev->name, sizeof(cap->driver));
        strlcpy(cap->card, cx23885_boards[tsport->dev->board].name,
                sizeof(cap->card));
-       sprintf(cap->bus_info, "PCI:%s", pci_name(dev->pci));
-       cap->capabilities =
-               V4L2_CAP_VIDEO_CAPTURE |
-               V4L2_CAP_READWRITE     |
-               V4L2_CAP_STREAMING     |
-               0;
+       sprintf(cap->bus_info, "PCIe:%s", pci_name(dev->pci));
+       cap->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE |
+                          V4L2_CAP_STREAMING;
        if (dev->tuner_type != TUNER_ABSENT)
-               cap->capabilities |= V4L2_CAP_TUNER;
+               cap->device_caps |= V4L2_CAP_TUNER;
+       cap->capabilities = cap->device_caps | V4L2_CAP_VBI_CAPTURE |
+               V4L2_CAP_AUDIO | V4L2_CAP_DEVICE_CAPS;
 
        return 0;
 }
index 12f7452edce3708a989eb0e7f71a94def8a31665..a92ff4249d100d33e84acaf50b372eb5e2a190b0 100644 (file)
@@ -1845,6 +1845,9 @@ static void exynos4_jpeg_set_img_addr(struct s5p_jpeg_ctx *ctx)
        struct s5p_jpeg_addr jpeg_addr;
        u32 pix_size, padding_bytes = 0;
 
+       jpeg_addr.cb = 0;
+       jpeg_addr.cr = 0;
+
        pix_size = ctx->cap_q.w * ctx->cap_q.h;
 
        if (ctx->mode == S5P_JPEG_ENCODE) {
index e8c2cad9396272ed9fa6f43c0fbab2699ba0073e..0974b9a7a584fb85f6aa268257264f995b5baef3 100644 (file)
@@ -20,7 +20,7 @@
 
 void exynos3250_jpeg_reset(void __iomem *regs)
 {
-       u32 reg = 0;
+       u32 reg = 1;
        int count = 1000;
 
        writel(1, regs + EXYNOS3250_SW_RESET);
index 8e44a59d8ec20f17bcfd601fd4a8c7abdaf8a530..98374e8bad3e99ffdf5a70d26b923e6e0c265bf3 100644 (file)
@@ -833,6 +833,7 @@ static int s5p_mfc_open(struct file *file)
        q->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
        q->io_modes = VB2_MMAP;
        q->drv_priv = &ctx->fh;
+       q->lock = &dev->mfc_mutex;
        if (vdev == dev->vfd_dec) {
                q->io_modes = VB2_MMAP;
                q->ops = get_dec_queue_ops();
index 15f7663dd9f5c193ba28968b2da5326f2bf19ea4..24262bbb1a3568fe87875bedfcd4d52f42867e4e 100644 (file)
@@ -29,7 +29,7 @@
 
 /* Offset base used to differentiate between CAPTURE and OUTPUT
 *  while mmaping */
-#define DST_QUEUE_OFF_BASE      (TASK_SIZE / 2)
+#define DST_QUEUE_OFF_BASE     (1 << 30)
 
 #define MFC_BANK1_ALLOC_CTX    0
 #define MFC_BANK2_ALLOC_CTX    1
index de2b8c69daa535dd6ea906dcd476638e3185c72f..22dfb3effda8912ded31f8afabe089857b55ac78 100644 (file)
@@ -302,7 +302,7 @@ struct s5p_mfc_hw_ops {
        void (*write_info)(struct s5p_mfc_ctx *ctx, unsigned int data,
                        unsigned int ofs);
        unsigned int (*read_info)(struct s5p_mfc_ctx *ctx,
-                       unsigned int ofs);
+                       unsigned long ofs);
        int (*get_dspl_y_adr)(struct s5p_mfc_dev *dev);
        int (*get_dec_y_adr)(struct s5p_mfc_dev *dev);
        int (*get_dspl_status)(struct s5p_mfc_dev *dev);
index 0c4fcf2dfd09c68cd41e12481b44944a585a52fc..b09bcd14049133cb63c1d35150b5a465b75e694d 100644 (file)
@@ -263,15 +263,15 @@ static void s5p_mfc_release_dev_context_buffer_v5(struct s5p_mfc_dev *dev)
 static void s5p_mfc_write_info_v5(struct s5p_mfc_ctx *ctx, unsigned int data,
                        unsigned int ofs)
 {
-       writel(data, (volatile void __iomem *)(ctx->shm.virt + ofs));
+       writel(data, (void *)(ctx->shm.virt + ofs));
        wmb();
 }
 
 static unsigned int s5p_mfc_read_info_v5(struct s5p_mfc_ctx *ctx,
-                               unsigned int ofs)
+                               unsigned long ofs)
 {
        rmb();
-       return readl((volatile void __iomem *)(ctx->shm.virt + ofs));
+       return readl((void *)(ctx->shm.virt + ofs));
 }
 
 static void s5p_mfc_dec_calc_dpb_size_v5(struct s5p_mfc_ctx *ctx)
index d826c58b5d538bfcc24fc3dc0f6539d3bbd5320e..cefad184fe969296db59858103f4c6e713c6acd3 100644 (file)
@@ -1852,17 +1852,17 @@ static void s5p_mfc_write_info_v6(struct s5p_mfc_ctx *ctx, unsigned int data,
                unsigned int ofs)
 {
        s5p_mfc_clock_on();
-       writel(data, (volatile void __iomem *)((unsigned long)ofs));
+       writel(data, (void *)((unsigned long)ofs));
        s5p_mfc_clock_off();
 }
 
 static unsigned int
-s5p_mfc_read_info_v6(struct s5p_mfc_ctx *ctx, unsigned int ofs)
+s5p_mfc_read_info_v6(struct s5p_mfc_ctx *ctx, unsigned long ofs)
 {
        int ret;
 
        s5p_mfc_clock_on();
-       ret = readl((volatile void __iomem *)((unsigned long)ofs));
+       ret = readl((void *)ofs);
        s5p_mfc_clock_off();
 
        return ret;
index 5a1835dd65e858e5622f5b01fbe5fe4f4b63e5bf..697aaed42486fb71fe3195be01e2c0b61ee2bb14 100644 (file)
@@ -20,6 +20,7 @@ if VIDEO_SAMSUNG_S5P_TV
 config VIDEO_SAMSUNG_S5P_HDMI
        tristate "Samsung HDMI Driver"
        depends on VIDEO_V4L2
+       depends on I2C
        depends on VIDEO_SAMSUNG_S5P_TV
        select VIDEO_SAMSUNG_S5P_HDMIPHY
        help
index a901b62485576489d4ed4867a131d7a6b7d69386..2554f3719b9e365584fb9b105c1ffd7d65bcd1b9 100644 (file)
@@ -1158,6 +1158,7 @@ static int sh_veu_probe(struct platform_device *pdev)
        }
 
        *vdev = sh_veu_videodev;
+       vdev->v4l2_dev = &veu->v4l2_dev;
        spin_lock_init(&veu->lock);
        mutex_init(&veu->fop_lock);
        vdev->lock = &veu->fop_lock;
index 8526bf5c8429aecbb5afa8030fca537b0720b2ab..c835beb2a1a8f30e14f48a34566b842dbdce96cf 100644 (file)
@@ -843,6 +843,8 @@ static int isi_camera_set_bus_param(struct soc_camera_device *icd)
        if (isi->pdata.full_mode)
                cfg1 |= ISI_CFG1_FULL_MODE;
 
+       cfg1 |= ISI_CFG1_THMASK_BEATS_16;
+
        isi_writel(isi, ISI_CTRL, ISI_CTRL_DIS);
        isi_writel(isi, ISI_CFG1, cfg1);
 
index cee7b56f840499440142bc598fbd588bd8469f67..66634b469c9899f043f79a69c4c232599ec8ee0f 100644 (file)
@@ -1665,7 +1665,7 @@ eclkreg:
 eaddpdev:
        platform_device_put(sasc->pdev);
 eallocpdev:
-       devm_kfree(ici->v4l2_dev.dev, sasc);
+       devm_kfree(ici->v4l2_dev.dev, info);
        dev_err(ici->v4l2_dev.dev, "group probe failed: %d\n", ret);
 
        return ret;
index 77dcfdf547ac8f62e4cc2ea5a8fa086478e545b5..87fc0fe29ebd30e91ee58488cf6398df111ece63 100644 (file)
@@ -780,8 +780,6 @@ static int rtl2832u_frontend_callback(void *adapter_priv, int component,
                case TUNER_RTL2832_TUA9001:
                        return rtl2832u_tua9001_tuner_callback(d, cmd, arg);
                }
-       default:
-               return -EINVAL;
        }
 
        return 0;
index 60af3b167f3b73e02071074a8e01ce39499c8d98..3fd94fe7e1eb1652b7dfc5df4d5983f3733eb65b 100644 (file)
@@ -1,6 +1,7 @@
 menuconfig USB_GSPCA
        tristate "GSPCA based webcams"
        depends on VIDEO_V4L2
+       depends on INPUT || INPUT=n
        default m
        ---help---
          Say Y here if you want to enable selecting webcams based
index bc08a829bc132068c0b51661f9459293ef30c142..cc16e76a24933c41a9cc81ddb5b2beb0193fcbe2 100644 (file)
@@ -3230,18 +3230,13 @@ int vb2_thread_stop(struct vb2_queue *q)
 
        if (threadio == NULL)
                return 0;
-       call_void_qop(q, wait_finish, q);
        threadio->stop = true;
-       vb2_internal_streamoff(q, q->type);
-       call_void_qop(q, wait_prepare, q);
+       /* Wake up all pending sleeps in the thread */
+       vb2_queue_error(q);
        err = kthread_stop(threadio->thread);
-       q->fileio = NULL;
-       fileio->req.count = 0;
-       vb2_reqbufs(q, &fileio->req);
-       kfree(fileio);
+       __vb2_cleanup_fileio(q);
        threadio->thread = NULL;
        kfree(threadio);
-       q->fileio = NULL;
        q->threadio = NULL;
        return err;
 }
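
The vb2_thread_stop() rework above flags the thread to stop, wakes any sleepers inside it with vb2_queue_error(), and only then joins it via kthread_stop(), leaving the file-I/O teardown to __vb2_cleanup_fileio(). Below is a minimal userspace pthread sketch of that flag-then-wake-then-join shape; it is illustrative only and is not the videobuf2 code.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static bool stop;

    /* Worker that sleeps until it is told to stop. */
    static void *worker(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock);
            while (!stop)
                    pthread_cond_wait(&cond, &lock);
            pthread_mutex_unlock(&lock);
            puts("worker exiting");
            return NULL;
    }

    int main(void)
    {
            pthread_t thr;

            pthread_create(&thr, NULL, worker, NULL);

            /* Stop sequence: set the flag, wake the sleeper, then join. */
            pthread_mutex_lock(&lock);
            stop = true;
            pthread_cond_broadcast(&cond);
            pthread_mutex_unlock(&lock);
            pthread_join(thr, NULL);
            return 0;
    }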
index b481d20c83727aa46e30119380f02be200a41479..69e0483adfee02e569f89ece65a31f73c0a96f35 100644 (file)
@@ -632,8 +632,7 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        }
 
        /* extract page list from userspace mapping */
-       ret = vb2_dc_get_user_pages(start, pages, n_pages, vma,
-                                   dma_dir == DMA_FROM_DEVICE);
+       ret = vb2_dc_get_user_pages(start, pages, n_pages, vma, dma_dir);
        if (ret) {
                unsigned long pfn;
                if (vb2_dc_get_user_pfn(start, n_pages, vma, &pfn) == 0) {
index 38552a31304aff8cf2747d0f6278897bd6b87d72..65fed7146e9bac2a407df1c05710d6a6eb75e6c4 100644 (file)
@@ -202,16 +202,17 @@ static void enclosure_remove_links(struct enclosure_component *cdev)
 {
        char name[ENCLOSURE_NAME_SIZE];
 
+       enclosure_link_name(cdev, name);
+
        /*
         * In odd circumstances, like multipath devices, something else may
         * already have removed the links, so check for this condition first.
         */
-       if (!cdev->dev->kobj.sd)
-               return;
+       if (cdev->dev->kobj.sd)
+               sysfs_remove_link(&cdev->dev->kobj, name);
 
-       enclosure_link_name(cdev, name);
-       sysfs_remove_link(&cdev->dev->kobj, name);
-       sysfs_remove_link(&cdev->cdev.kobj, "device");
+       if (cdev->cdev.kobj.sd)
+               sysfs_remove_link(&cdev->cdev.kobj, "device");
 }
 
 static int enclosure_add_links(struct enclosure_component *cdev)
index 82dc5748f873b72ea355b517c606d59dc6235c31..7f327121e6d7c43416a3ca6820cd39111796d683 100644 (file)
@@ -1210,7 +1210,7 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
 
                if (((die_args->trapnr == X86_TRAP_MF) ||
                     (die_args->trapnr == X86_TRAP_XF)) &&
-                   !user_mode_vm(die_args->regs))
+                   !user_mode(die_args->regs))
                        xpc_die_deactivate();
 
                break;
index b979c265fc51d0a09c48ff4216a8b055ebcd1c26..089a4028859d121d5611d04dde0139196753fb00 100644 (file)
@@ -3850,7 +3850,8 @@ static inline int bond_slave_override(struct bonding *bond,
        /* Find out if any slaves have the same mapping as this skb. */
        bond_for_each_slave_rcu(bond, slave, iter) {
                if (slave->queue_id == skb->queue_mapping) {
-                       if (bond_slave_can_tx(slave)) {
+                       if (bond_slave_is_up(slave) &&
+                           slave->link == BOND_LINK_UP) {
                                bond_dev_queue_xmit(bond, skb, slave->dev);
                                return 0;
                        }
index 80c46ad4cee439d2015b3ee7de66d578f54afd0b..ad0a7e8c2c2bdf33626824645a8180d8e6d900ff 100644 (file)
@@ -592,13 +592,12 @@ static int flexcan_poll_state(struct net_device *dev, u32 reg_esr)
                rx_state = unlikely(reg_esr & FLEXCAN_ESR_RX_WRN) ?
                           CAN_STATE_ERROR_WARNING : CAN_STATE_ERROR_ACTIVE;
                new_state = max(tx_state, rx_state);
-       } else if (unlikely(flt == FLEXCAN_ESR_FLT_CONF_PASSIVE)) {
+       } else {
                __flexcan_get_berr_counter(dev, &bec);
-               new_state = CAN_STATE_ERROR_PASSIVE;
+               new_state = flt == FLEXCAN_ESR_FLT_CONF_PASSIVE ?
+                           CAN_STATE_ERROR_PASSIVE : CAN_STATE_BUS_OFF;
                rx_state = bec.rxerr >= bec.txerr ? new_state : 0;
                tx_state = bec.rxerr <= bec.txerr ? new_state : 0;
-       } else {
-               new_state = CAN_STATE_BUS_OFF;
        }
 
        /* state hasn't changed */
@@ -1158,12 +1157,19 @@ static int flexcan_probe(struct platform_device *pdev)
        const struct flexcan_devtype_data *devtype_data;
        struct net_device *dev;
        struct flexcan_priv *priv;
+       struct regulator *reg_xceiver;
        struct resource *mem;
        struct clk *clk_ipg = NULL, *clk_per = NULL;
        void __iomem *base;
        int err, irq;
        u32 clock_freq = 0;
 
+       reg_xceiver = devm_regulator_get(&pdev->dev, "xceiver");
+       if (PTR_ERR(reg_xceiver) == -EPROBE_DEFER)
+               return -EPROBE_DEFER;
+       else if (IS_ERR(reg_xceiver))
+               reg_xceiver = NULL;
+
        if (pdev->dev.of_node)
                of_property_read_u32(pdev->dev.of_node,
                                                "clock-frequency", &clock_freq);
@@ -1224,9 +1230,7 @@ static int flexcan_probe(struct platform_device *pdev)
        priv->pdata = dev_get_platdata(&pdev->dev);
        priv->devtype_data = devtype_data;
 
-       priv->reg_xceiver = devm_regulator_get(&pdev->dev, "xceiver");
-       if (IS_ERR(priv->reg_xceiver))
-               priv->reg_xceiver = NULL;
+       priv->reg_xceiver = reg_xceiver;
 
        netif_napi_add(dev, &priv->napi, flexcan_poll, FLEXCAN_NAPI_WEIGHT);
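
The flexcan probe change above looks up the transceiver regulator before any allocation, so a provider that is not yet available defers the whole probe; only -EPROBE_DEFER is propagated, while any other lookup error simply leaves the driver running without a transceiver supply. The sketch below shows that early-deferral pattern with a hypothetical platform driver and supply name; it is not the flexcan code itself.

    #include <linux/err.h>
    #include <linux/module.h>
    #include <linux/platform_device.h>
    #include <linux/regulator/consumer.h>

    static int demo_probe(struct platform_device *pdev)
    {
            struct regulator *xceiver;

            /* Resolve the optional supply first: if its provider is not
             * ready yet, defer the whole probe before anything is allocated.
             */
            xceiver = devm_regulator_get(&pdev->dev, "xceiver");
            if (PTR_ERR(xceiver) == -EPROBE_DEFER)
                    return -EPROBE_DEFER;
            else if (IS_ERR(xceiver))
                    xceiver = NULL; /* run without a transceiver supply */

            /* ... allocations, netdev registration, etc. would follow ... */
            return 0;
    }

    static struct platform_driver demo_driver = {
            .probe = demo_probe,
            .driver = { .name = "demo-can" },
    };
    module_platform_driver(demo_driver);
    MODULE_LICENSE("GPL");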
 
index 009acc8641fc557cb580cb688983daf041519e4b..8b4d3e6875eb17e6bca38c812132953d0c5ce2c2 100644 (file)
@@ -901,6 +901,8 @@ static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *
        }
 
        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
        init_usb_anchor(&dev->rx_submitted);
 
        atomic_set(&dev->active_channels, 0);
index e97a08ce0b90c298577fed09c96dc7a711076e5e..57611fd91229a4580987b3519d56b8913f185411 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/can/dev.h>
 #include <linux/can/error.h>
 
-#define MAX_TX_URBS                    16
 #define MAX_RX_URBS                    4
 #define START_TIMEOUT                  1000 /* msecs */
 #define STOP_TIMEOUT                   1000 /* msecs */
@@ -443,6 +442,7 @@ struct kvaser_usb_error_summary {
        };
 };
 
+/* Context for an outstanding, not yet ACKed, transmission */
 struct kvaser_usb_tx_urb_context {
        struct kvaser_usb_net_priv *priv;
        u32 echo_index;
@@ -456,8 +456,13 @@ struct kvaser_usb {
        struct usb_endpoint_descriptor *bulk_in, *bulk_out;
        struct usb_anchor rx_submitted;
 
+       /* @max_tx_urbs: Firmware-reported maximum number of outstanding,
+        * not yet ACKed, transmissions on this device. This value is
+        * also used as a sentinel for marking free tx contexts.
+        */
        u32 fw_version;
        unsigned int nchannels;
+       unsigned int max_tx_urbs;
        enum kvaser_usb_family family;
 
        bool rxinitdone;
@@ -467,19 +472,18 @@ struct kvaser_usb {
 
 struct kvaser_usb_net_priv {
        struct can_priv can;
-
-       spinlock_t tx_contexts_lock;
-       int active_tx_contexts;
-       struct kvaser_usb_tx_urb_context tx_contexts[MAX_TX_URBS];
-
-       struct usb_anchor tx_submitted;
-       struct completion start_comp, stop_comp;
+       struct can_berr_counter bec;
 
        struct kvaser_usb *dev;
        struct net_device *netdev;
        int channel;
 
-       struct can_berr_counter bec;
+       struct completion start_comp, stop_comp;
+       struct usb_anchor tx_submitted;
+
+       spinlock_t tx_contexts_lock;
+       int active_tx_contexts;
+       struct kvaser_usb_tx_urb_context tx_contexts[];
 };
 
 static const struct usb_device_id kvaser_usb_table[] = {
@@ -592,8 +596,8 @@ static int kvaser_usb_wait_msg(const struct kvaser_usb *dev, u8 id,
                         * for further details.
                         */
                        if (tmp->len == 0) {
-                               pos = round_up(pos,
-                                              dev->bulk_in->wMaxPacketSize);
+                               pos = round_up(pos, le16_to_cpu(dev->bulk_in->
+                                                               wMaxPacketSize));
                                continue;
                        }
 
@@ -657,9 +661,13 @@ static int kvaser_usb_get_software_info(struct kvaser_usb *dev)
        switch (dev->family) {
        case KVASER_LEAF:
                dev->fw_version = le32_to_cpu(msg.u.leaf.softinfo.fw_version);
+               dev->max_tx_urbs =
+                       le16_to_cpu(msg.u.leaf.softinfo.max_outstanding_tx);
                break;
        case KVASER_USBCAN:
                dev->fw_version = le32_to_cpu(msg.u.usbcan.softinfo.fw_version);
+               dev->max_tx_urbs =
+                       le16_to_cpu(msg.u.usbcan.softinfo.max_outstanding_tx);
                break;
        }
 
@@ -715,7 +723,7 @@ static void kvaser_usb_tx_acknowledge(const struct kvaser_usb *dev,
 
        stats = &priv->netdev->stats;
 
-       context = &priv->tx_contexts[tid % MAX_TX_URBS];
+       context = &priv->tx_contexts[tid % dev->max_tx_urbs];
 
        /* Sometimes the state change doesn't come after a bus-off event */
        if (priv->can.restart_ms &&
@@ -744,7 +752,7 @@ static void kvaser_usb_tx_acknowledge(const struct kvaser_usb *dev,
        spin_lock_irqsave(&priv->tx_contexts_lock, flags);
 
        can_get_echo_skb(priv->netdev, context->echo_index);
-       context->echo_index = MAX_TX_URBS;
+       context->echo_index = dev->max_tx_urbs;
        --priv->active_tx_contexts;
        netif_wake_queue(priv->netdev);
 
@@ -1329,7 +1337,8 @@ static void kvaser_usb_read_bulk_callback(struct urb *urb)
                 * number of events in case of a heavy rx load on the bus.
                 */
                if (msg->len == 0) {
-                       pos = round_up(pos, dev->bulk_in->wMaxPacketSize);
+                       pos = round_up(pos, le16_to_cpu(dev->bulk_in->
+                                                       wMaxPacketSize));
                        continue;
                }
 
@@ -1512,11 +1521,13 @@ error:
 
 static void kvaser_usb_reset_tx_urb_contexts(struct kvaser_usb_net_priv *priv)
 {
-       int i;
+       int i, max_tx_urbs;
+
+       max_tx_urbs = priv->dev->max_tx_urbs;
 
        priv->active_tx_contexts = 0;
-       for (i = 0; i < MAX_TX_URBS; i++)
-               priv->tx_contexts[i].echo_index = MAX_TX_URBS;
+       for (i = 0; i < max_tx_urbs; i++)
+               priv->tx_contexts[i].echo_index = max_tx_urbs;
 }
 
 /* This method might sleep. Do not call it in the atomic context
@@ -1702,14 +1713,14 @@ static netdev_tx_t kvaser_usb_start_xmit(struct sk_buff *skb,
                *msg_tx_can_flags |= MSG_FLAG_REMOTE_FRAME;
 
        spin_lock_irqsave(&priv->tx_contexts_lock, flags);
-       for (i = 0; i < ARRAY_SIZE(priv->tx_contexts); i++) {
-               if (priv->tx_contexts[i].echo_index == MAX_TX_URBS) {
+       for (i = 0; i < dev->max_tx_urbs; i++) {
+               if (priv->tx_contexts[i].echo_index == dev->max_tx_urbs) {
                        context = &priv->tx_contexts[i];
 
                        context->echo_index = i;
                        can_put_echo_skb(skb, netdev, context->echo_index);
                        ++priv->active_tx_contexts;
-                       if (priv->active_tx_contexts >= MAX_TX_URBS)
+                       if (priv->active_tx_contexts >= dev->max_tx_urbs)
                                netif_stop_queue(netdev);
 
                        break;
@@ -1743,7 +1754,7 @@ static netdev_tx_t kvaser_usb_start_xmit(struct sk_buff *skb,
                spin_lock_irqsave(&priv->tx_contexts_lock, flags);
 
                can_free_echo_skb(netdev, context->echo_index);
-               context->echo_index = MAX_TX_URBS;
+               context->echo_index = dev->max_tx_urbs;
                --priv->active_tx_contexts;
                netif_wake_queue(netdev);
 
@@ -1881,7 +1892,9 @@ static int kvaser_usb_init_one(struct usb_interface *intf,
        if (err)
                return err;
 
-       netdev = alloc_candev(sizeof(*priv), MAX_TX_URBS);
+       netdev = alloc_candev(sizeof(*priv) +
+                             dev->max_tx_urbs * sizeof(*priv->tx_contexts),
+                             dev->max_tx_urbs);
        if (!netdev) {
                dev_err(&intf->dev, "Cannot alloc candev\n");
                return -ENOMEM;
@@ -2009,6 +2022,13 @@ static int kvaser_usb_probe(struct usb_interface *intf,
                return err;
        }
 
+       dev_dbg(&intf->dev, "Firmware version: %d.%d.%d\n",
+               ((dev->fw_version >> 24) & 0xff),
+               ((dev->fw_version >> 16) & 0xff),
+               (dev->fw_version & 0xffff));
+
+       dev_dbg(&intf->dev, "Max oustanding tx = %d URBs\n", dev->max_tx_urbs);
+
        err = kvaser_usb_get_card_info(dev);
        if (err) {
                dev_err(&intf->dev,
@@ -2016,11 +2036,6 @@ static int kvaser_usb_probe(struct usb_interface *intf,
                return err;
        }
 
-       dev_dbg(&intf->dev, "Firmware version: %d.%d.%d\n",
-               ((dev->fw_version >> 24) & 0xff),
-               ((dev->fw_version >> 16) & 0xff),
-               (dev->fw_version & 0xffff));
-
        for (i = 0; i < dev->nchannels; i++) {
                err = kvaser_usb_init_one(intf, id, i);
                if (err) {
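
The kvaser_usb changes above drop the compile-time MAX_TX_URBS limit in favour of a firmware-reported max_tx_urbs, turning tx_contexts[] into a flexible array member that is sized when the candev is allocated, with max_tx_urbs doubling as the free-slot sentinel. A small userspace sketch of that flexible-array sizing pattern follows; the names are hypothetical and it is not the driver code.

    #include <stdio.h>
    #include <stdlib.h>

    struct tx_context {
            unsigned int echo_index;
    };

    struct chan_priv {
            unsigned int max_tx_urbs;          /* reported by the device at probe */
            struct tx_context tx_contexts[];   /* flexible array member */
    };

    static struct chan_priv *chan_priv_alloc(unsigned int max_tx_urbs)
    {
            struct chan_priv *priv;
            unsigned int i;

            /* size the trailing array from the device-reported limit */
            priv = malloc(sizeof(*priv) +
                          max_tx_urbs * sizeof(*priv->tx_contexts));
            if (!priv)
                    return NULL;

            priv->max_tx_urbs = max_tx_urbs;
            /* max_tx_urbs doubles as the "slot is free" sentinel */
            for (i = 0; i < max_tx_urbs; i++)
                    priv->tx_contexts[i].echo_index = max_tx_urbs;

            return priv;
    }

    int main(void)
    {
            struct chan_priv *priv = chan_priv_alloc(16);

            if (priv) {
                    printf("allocated %u tx contexts\n", priv->max_tx_urbs);
                    free(priv);
            }
            return 0;
    }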
index 1ba7c25002e1e27ec1a9333412c175edb0f5594a..e8fc4952c6b074f80a2af132d2bc161e7fd3d8c7 100644 (file)
@@ -26,8 +26,8 @@
 #define PUCAN_CMD_FILTER_STD           0x008
 #define PUCAN_CMD_TX_ABORT             0x009
 #define PUCAN_CMD_WR_ERR_CNT           0x00a
-#define PUCAN_CMD_RX_FRAME_ENABLE      0x00b
-#define PUCAN_CMD_RX_FRAME_DISABLE     0x00c
+#define PUCAN_CMD_SET_EN_OPTION                0x00b
+#define PUCAN_CMD_CLR_DIS_OPTION       0x00c
 #define PUCAN_CMD_END_OF_COLLECTION    0x3ff
 
 /* uCAN received messages list */
@@ -101,14 +101,15 @@ struct __packed pucan_wr_err_cnt {
        u16     unused;
 };
 
-/* uCAN RX_FRAME_ENABLE command fields */
-#define PUCAN_FLTEXT_ERROR             0x0001
-#define PUCAN_FLTEXT_BUSLOAD           0x0002
+/* uCAN SET_EN/CLR_DIS _OPTION command fields */
+#define PUCAN_OPTION_ERROR             0x0001
+#define PUCAN_OPTION_BUSLOAD           0x0002
+#define PUCAN_OPTION_CANDFDISO         0x0004
 
-struct __packed pucan_filter_ext {
+struct __packed pucan_options {
        __le16  opcode_channel;
 
-       __le16  ext_mask;
+       __le16  options;
        u32     unused;
 };
 
index 0bac0f14edc3cd73727b4a0d9f0776d857c4ba8c..a9221ad9f1a0a0387de1ebf8470c0d360e895773 100644 (file)
@@ -110,13 +110,13 @@ struct __packed pcan_ufd_led {
        u8      unused[5];
 };
 
-/* Extended usage of uCAN commands CMD_RX_FRAME_xxxABLE for PCAN-USB Pro FD */
+/* Extended usage of uCAN commands CMD_xxx_xx_OPTION for PCAN-USB Pro FD */
 #define PCAN_UFD_FLTEXT_CALIBRATION    0x8000
 
-struct __packed pcan_ufd_filter_ext {
+struct __packed pcan_ufd_options {
        __le16  opcode_channel;
 
-       __le16  ext_mask;
+       __le16  ucan_mask;
        u16     unused;
        __le16  usb_mask;
 };
@@ -251,6 +251,27 @@ static int pcan_usb_fd_build_restart_cmd(struct peak_usb_device *dev, u8 *buf)
        /* moves the pointer forward */
        pc += sizeof(struct pucan_wr_err_cnt);
 
+       /* add command to switch from ISO to non-ISO mode, if fw allows it */
+       if (dev->can.ctrlmode_supported & CAN_CTRLMODE_FD_NON_ISO) {
+               struct pucan_options *puo = (struct pucan_options *)pc;
+
+               puo->opcode_channel =
+                       (dev->can.ctrlmode & CAN_CTRLMODE_FD_NON_ISO) ?
+                       pucan_cmd_opcode_channel(dev,
+                                                PUCAN_CMD_CLR_DIS_OPTION) :
+                       pucan_cmd_opcode_channel(dev, PUCAN_CMD_SET_EN_OPTION);
+
+               puo->options = cpu_to_le16(PUCAN_OPTION_CANDFDISO);
+
+               /* to be sure that no other extended bits will be taken into
+                * account
+                */
+               puo->unused = 0;
+
+               /* moves the pointer forward */
+               pc += sizeof(struct pucan_options);
+       }
+
        /* next, go back to operational mode */
        cmd = (struct pucan_command *)pc;
        cmd->opcode_channel = pucan_cmd_opcode_channel(dev,
@@ -321,21 +342,21 @@ static int pcan_usb_fd_set_filter_std(struct peak_usb_device *dev, int idx,
        return pcan_usb_fd_send_cmd(dev, cmd);
 }
 
-/* set/unset notifications filter:
+/* set/unset options
  *
- *     onoff   sets(1)/unset(0) notifications
- *     mask    each bit defines a kind of notification to set/unset
+ *     onoff   set(1)/unset(0) options
+ *     mask    each bit defines a kind of options to set/unset
  */
-static int pcan_usb_fd_set_filter_ext(struct peak_usb_device *dev,
-                                     bool onoff, u16 ext_mask, u16 usb_mask)
+static int pcan_usb_fd_set_options(struct peak_usb_device *dev,
+                                  bool onoff, u16 ucan_mask, u16 usb_mask)
 {
-       struct pcan_ufd_filter_ext *cmd = pcan_usb_fd_cmd_buffer(dev);
+       struct pcan_ufd_options *cmd = pcan_usb_fd_cmd_buffer(dev);
 
        cmd->opcode_channel = pucan_cmd_opcode_channel(dev,
-                                       (onoff) ? PUCAN_CMD_RX_FRAME_ENABLE :
-                                                 PUCAN_CMD_RX_FRAME_DISABLE);
+                                       (onoff) ? PUCAN_CMD_SET_EN_OPTION :
+                                                 PUCAN_CMD_CLR_DIS_OPTION);
 
-       cmd->ext_mask = cpu_to_le16(ext_mask);
+       cmd->ucan_mask = cpu_to_le16(ucan_mask);
        cmd->usb_mask = cpu_to_le16(usb_mask);
 
        /* send the command */
@@ -770,9 +791,9 @@ static int pcan_usb_fd_start(struct peak_usb_device *dev)
                                       &pcan_usb_pro_fd);
 
                /* enable USB calibration messages */
-               err = pcan_usb_fd_set_filter_ext(dev, 1,
-                                                PUCAN_FLTEXT_ERROR,
-                                                PCAN_UFD_FLTEXT_CALIBRATION);
+               err = pcan_usb_fd_set_options(dev, 1,
+                                             PUCAN_OPTION_ERROR,
+                                             PCAN_UFD_FLTEXT_CALIBRATION);
        }
 
        pdev->usb_if->dev_opened_count++;
@@ -806,9 +827,9 @@ static int pcan_usb_fd_stop(struct peak_usb_device *dev)
 
        /* turn off special msgs for that interface if no other dev opened */
        if (pdev->usb_if->dev_opened_count == 1)
-               pcan_usb_fd_set_filter_ext(dev, 0,
-                                          PUCAN_FLTEXT_ERROR,
-                                          PCAN_UFD_FLTEXT_CALIBRATION);
+               pcan_usb_fd_set_options(dev, 0,
+                                       PUCAN_OPTION_ERROR,
+                                       PCAN_UFD_FLTEXT_CALIBRATION);
        pdev->usb_if->dev_opened_count--;
 
        return 0;
@@ -860,8 +881,14 @@ static int pcan_usb_fd_init(struct peak_usb_device *dev)
                         pdev->usb_if->fw_info.fw_version[2],
                         dev->adapter->ctrl_count);
 
-               /* the currently supported hw is non-ISO */
-               dev->can.ctrlmode = CAN_CTRLMODE_FD_NON_ISO;
+               /* check for ability to switch between ISO/non-ISO modes */
+               if (pdev->usb_if->fw_info.fw_version[0] >= 2) {
+                       /* firmware >= 2.x supports ISO/non-ISO switching */
+                       dev->can.ctrlmode_supported |= CAN_CTRLMODE_FD_NON_ISO;
+               } else {
+                       /* firmware < 2.x only supports fixed(!) non-ISO */
+                       dev->can.ctrlmode |= CAN_CTRLMODE_FD_NON_ISO;
+               }
 
                /* tell the hardware the can driver is running */
                err = pcan_usb_fd_drv_loaded(dev, 1);
@@ -937,9 +964,9 @@ static void pcan_usb_fd_exit(struct peak_usb_device *dev)
        if (dev->ctrl_idx == 0) {
                /* turn off calibration message if any device were opened */
                if (pdev->usb_if->dev_opened_count > 0)
-                       pcan_usb_fd_set_filter_ext(dev, 0,
-                                                  PUCAN_FLTEXT_ERROR,
-                                                  PCAN_UFD_FLTEXT_CALIBRATION);
+                       pcan_usb_fd_set_options(dev, 0,
+                                               PUCAN_OPTION_ERROR,
+                                               PCAN_UFD_FLTEXT_CALIBRATION);
 
                /* tell USB adapter that the driver is being unloaded */
                pcan_usb_fd_drv_loaded(dev, 0);
index 756053c028becff93c62cfcdbd6e1f635594cc01..4085c4b310470b6ebf718121fa423d2995dcb8ef 100644 (file)
@@ -1811,7 +1811,7 @@ struct bnx2x {
        int                     stats_state;
 
        /* used for synchronization of concurrent threads statistics handling */
-       spinlock_t              stats_lock;
+       struct mutex            stats_lock;
 
        /* used by dmae command loader */
        struct dmae_command     stats_dmae;
@@ -1935,8 +1935,6 @@ struct bnx2x {
 
        int fp_array_size;
        u32 dump_preset_idx;
-       bool                                    stats_started;
-       struct semaphore                        stats_sema;
 
        u8                                      phys_port_id[ETH_ALEN];
 
index 996e215fc3246c4fba6f1714ddceb69b88f179be..1ec635f549944f87e0c0df4eb0816dcf33d90706 100644 (file)
@@ -129,8 +129,8 @@ struct bnx2x_mac_vals {
        u32 xmac_val;
        u32 emac_addr;
        u32 emac_val;
-       u32 umac_addr;
-       u32 umac_val;
+       u32 umac_addr[2];
+       u32 umac_val[2];
        u32 bmac_addr;
        u32 bmac_val[2];
 };
@@ -7866,6 +7866,20 @@ int bnx2x_init_hw_func_cnic(struct bnx2x *bp)
        return 0;
 }
 
+/* A previous driver DMAE transaction may have occurred when the pre-boot stage
+ * ended and boot began, or when a kdump kernel was loaded. Either case would
+ * invalidate the addresses of the transaction, resulting in the was-error bit
+ * being set in the PCI, causing all hw-to-host PCIe transactions to time out.
+ * If this happened, we want to clear both the interrupt which detected this
+ * from the pglueb and the was-done bit.
+ */
+static void bnx2x_clean_pglue_errors(struct bnx2x *bp)
+{
+       if (!CHIP_IS_E1x(bp))
+               REG_WR(bp, PGLUE_B_REG_WAS_ERROR_PF_7_0_CLR,
+                      1 << BP_ABS_FUNC(bp));
+}
+
 static int bnx2x_init_hw_func(struct bnx2x *bp)
 {
        int port = BP_PORT(bp);
@@ -7958,8 +7972,7 @@ static int bnx2x_init_hw_func(struct bnx2x *bp)
 
        bnx2x_init_block(bp, BLOCK_PGLUE_B, init_phase);
 
-       if (!CHIP_IS_E1x(bp))
-               REG_WR(bp, PGLUE_B_REG_WAS_ERROR_PF_7_0_CLR, func);
+       bnx2x_clean_pglue_errors(bp);
 
        bnx2x_init_block(bp, BLOCK_ATC, init_phase);
        bnx2x_init_block(bp, BLOCK_DMAE, init_phase);
@@ -10141,6 +10154,25 @@ static u32 bnx2x_get_pretend_reg(struct bnx2x *bp)
        return base + (BP_ABS_FUNC(bp)) * stride;
 }
 
+static bool bnx2x_prev_unload_close_umac(struct bnx2x *bp,
+                                        u8 port, u32 reset_reg,
+                                        struct bnx2x_mac_vals *vals)
+{
+       u32 mask = MISC_REGISTERS_RESET_REG_2_UMAC0 << port;
+       u32 base_addr;
+
+       if (!(mask & reset_reg))
+               return false;
+
+       BNX2X_DEV_INFO("Disable umac Rx %02x\n", port);
+       base_addr = port ? GRCBASE_UMAC1 : GRCBASE_UMAC0;
+       vals->umac_addr[port] = base_addr + UMAC_REG_COMMAND_CONFIG;
+       vals->umac_val[port] = REG_RD(bp, vals->umac_addr[port]);
+       REG_WR(bp, vals->umac_addr[port], 0);
+
+       return true;
+}
+
 static void bnx2x_prev_unload_close_mac(struct bnx2x *bp,
                                        struct bnx2x_mac_vals *vals)
 {
@@ -10149,10 +10181,7 @@ static void bnx2x_prev_unload_close_mac(struct bnx2x *bp,
        u8 port = BP_PORT(bp);
 
        /* reset addresses as they also mark which values were changed */
-       vals->bmac_addr = 0;
-       vals->umac_addr = 0;
-       vals->xmac_addr = 0;
-       vals->emac_addr = 0;
+       memset(vals, 0, sizeof(*vals));
 
        reset_reg = REG_RD(bp, MISC_REG_RESET_REG_2);
 
@@ -10201,15 +10230,11 @@ static void bnx2x_prev_unload_close_mac(struct bnx2x *bp,
                        REG_WR(bp, vals->xmac_addr, 0);
                        mac_stopped = true;
                }
-               mask = MISC_REGISTERS_RESET_REG_2_UMAC0 << port;
-               if (mask & reset_reg) {
-                       BNX2X_DEV_INFO("Disable umac Rx\n");
-                       base_addr = BP_PORT(bp) ? GRCBASE_UMAC1 : GRCBASE_UMAC0;
-                       vals->umac_addr = base_addr + UMAC_REG_COMMAND_CONFIG;
-                       vals->umac_val = REG_RD(bp, vals->umac_addr);
-                       REG_WR(bp, vals->umac_addr, 0);
-                       mac_stopped = true;
-               }
+
+               mac_stopped |= bnx2x_prev_unload_close_umac(bp, 0,
+                                                           reset_reg, vals);
+               mac_stopped |= bnx2x_prev_unload_close_umac(bp, 1,
+                                                           reset_reg, vals);
        }
 
        if (mac_stopped)
@@ -10505,8 +10530,11 @@ static int bnx2x_prev_unload_common(struct bnx2x *bp)
                /* Close the MAC Rx to prevent BRB from filling up */
                bnx2x_prev_unload_close_mac(bp, &mac_vals);
 
-               /* close LLH filters towards the BRB */
+               /* close LLH filters for both ports towards the BRB */
                bnx2x_set_rx_filter(&bp->link_params, 0);
+               bp->link_params.port ^= 1;
+               bnx2x_set_rx_filter(&bp->link_params, 0);
+               bp->link_params.port ^= 1;
 
                /* Check if the UNDI driver was previously loaded */
                if (bnx2x_prev_is_after_undi(bp)) {
@@ -10553,8 +10581,10 @@ static int bnx2x_prev_unload_common(struct bnx2x *bp)
 
        if (mac_vals.xmac_addr)
                REG_WR(bp, mac_vals.xmac_addr, mac_vals.xmac_val);
-       if (mac_vals.umac_addr)
-               REG_WR(bp, mac_vals.umac_addr, mac_vals.umac_val);
+       if (mac_vals.umac_addr[0])
+               REG_WR(bp, mac_vals.umac_addr[0], mac_vals.umac_val[0]);
+       if (mac_vals.umac_addr[1])
+               REG_WR(bp, mac_vals.umac_addr[1], mac_vals.umac_val[1]);
        if (mac_vals.emac_addr)
                REG_WR(bp, mac_vals.emac_addr, mac_vals.emac_val);
        if (mac_vals.bmac_addr) {
@@ -10571,26 +10601,6 @@ static int bnx2x_prev_unload_common(struct bnx2x *bp)
        return bnx2x_prev_mcp_done(bp);
 }
 
-/* previous driver DMAE transaction may have occurred when pre-boot stage ended
- * and boot began, or when kdump kernel was loaded. Either case would invalidate
- * the addresses of the transaction, resulting in was-error bit set in the pci
- * causing all hw-to-host pcie transactions to timeout. If this happened we want
- * to clear the interrupt which detected this from the pglueb and the was done
- * bit
- */
-static void bnx2x_prev_interrupted_dmae(struct bnx2x *bp)
-{
-       if (!CHIP_IS_E1x(bp)) {
-               u32 val = REG_RD(bp, PGLUE_B_REG_PGLUE_B_INT_STS);
-               if (val & PGLUE_B_PGLUE_B_INT_STS_REG_WAS_ERROR_ATTN) {
-                       DP(BNX2X_MSG_SP,
-                          "'was error' bit was found to be set in pglueb upon startup. Clearing\n");
-                       REG_WR(bp, PGLUE_B_REG_WAS_ERROR_PF_7_0_CLR,
-                              1 << BP_FUNC(bp));
-               }
-       }
-}
-
 static int bnx2x_prev_unload(struct bnx2x *bp)
 {
        int time_counter = 10;
@@ -10600,7 +10610,7 @@ static int bnx2x_prev_unload(struct bnx2x *bp)
        /* clear hw from errors which may have resulted from an interrupted
         * dmae transaction.
         */
-       bnx2x_prev_interrupted_dmae(bp);
+       bnx2x_clean_pglue_errors(bp);
 
        /* Release previously held locks */
        hw_lock_reg = (BP_FUNC(bp) <= 5) ?
@@ -12037,9 +12047,8 @@ static int bnx2x_init_bp(struct bnx2x *bp)
        mutex_init(&bp->port.phy_mutex);
        mutex_init(&bp->fw_mb_mutex);
        mutex_init(&bp->drv_info_mutex);
+       mutex_init(&bp->stats_lock);
        bp->drv_info_mng_owner = false;
-       spin_lock_init(&bp->stats_lock);
-       sema_init(&bp->stats_sema, 1);
 
        INIT_DELAYED_WORK(&bp->sp_task, bnx2x_sp_task);
        INIT_DELAYED_WORK(&bp->sp_rtnl_task, bnx2x_sp_rtnl_task);
@@ -13668,9 +13677,9 @@ static int bnx2x_eeh_nic_unload(struct bnx2x *bp)
        cancel_delayed_work_sync(&bp->sp_task);
        cancel_delayed_work_sync(&bp->period_task);
 
-       spin_lock_bh(&bp->stats_lock);
+       mutex_lock(&bp->stats_lock);
        bp->stats_state = STATS_STATE_DISABLED;
-       spin_unlock_bh(&bp->stats_lock);
+       mutex_unlock(&bp->stats_lock);
 
        bnx2x_save_statistics(bp);
 
index e5aca2de1871350f3e47a788c4360b461c02a039..cfe3c7695455e16eab516e0f99db6ee6b7e58879 100644 (file)
@@ -2238,7 +2238,9 @@ int bnx2x_vf_close(struct bnx2x *bp, struct bnx2x_virtf *vf)
 
                cookie.vf = vf;
                cookie.state = VF_ACQUIRED;
-               bnx2x_stats_safe_exec(bp, bnx2x_set_vf_state, &cookie);
+               rc = bnx2x_stats_safe_exec(bp, bnx2x_set_vf_state, &cookie);
+               if (rc)
+                       goto op_err;
        }
 
        DP(BNX2X_MSG_IOV, "set state to acquired\n");
index d1608297c773746237a8faf3697277afcbe45918..800ab44a07cecd721c8b12f0c5b8602e1124a0dd 100644 (file)
@@ -123,36 +123,28 @@ static void bnx2x_dp_stats(struct bnx2x *bp)
  */
 static void bnx2x_storm_stats_post(struct bnx2x *bp)
 {
-       if (!bp->stats_pending) {
-               int rc;
+       int rc;
 
-               spin_lock_bh(&bp->stats_lock);
-
-               if (bp->stats_pending) {
-                       spin_unlock_bh(&bp->stats_lock);
-                       return;
-               }
-
-               bp->fw_stats_req->hdr.drv_stats_counter =
-                       cpu_to_le16(bp->stats_counter++);
+       if (bp->stats_pending)
+               return;
 
-               DP(BNX2X_MSG_STATS, "Sending statistics ramrod %d\n",
-                  le16_to_cpu(bp->fw_stats_req->hdr.drv_stats_counter));
+       bp->fw_stats_req->hdr.drv_stats_counter =
+               cpu_to_le16(bp->stats_counter++);
 
-               /* adjust the ramrod to include VF queues statistics */
-               bnx2x_iov_adjust_stats_req(bp);
-               bnx2x_dp_stats(bp);
+       DP(BNX2X_MSG_STATS, "Sending statistics ramrod %d\n",
+          le16_to_cpu(bp->fw_stats_req->hdr.drv_stats_counter));
 
-               /* send FW stats ramrod */
-               rc = bnx2x_sp_post(bp, RAMROD_CMD_ID_COMMON_STAT_QUERY, 0,
-                                  U64_HI(bp->fw_stats_req_mapping),
-                                  U64_LO(bp->fw_stats_req_mapping),
-                                  NONE_CONNECTION_TYPE);
-               if (rc == 0)
-                       bp->stats_pending = 1;
+       /* adjust the ramrod to include VF queues statistics */
+       bnx2x_iov_adjust_stats_req(bp);
+       bnx2x_dp_stats(bp);
 
-               spin_unlock_bh(&bp->stats_lock);
-       }
+       /* send FW stats ramrod */
+       rc = bnx2x_sp_post(bp, RAMROD_CMD_ID_COMMON_STAT_QUERY, 0,
+                          U64_HI(bp->fw_stats_req_mapping),
+                          U64_LO(bp->fw_stats_req_mapping),
+                          NONE_CONNECTION_TYPE);
+       if (rc == 0)
+               bp->stats_pending = 1;
 }
 
 static void bnx2x_hw_stats_post(struct bnx2x *bp)
@@ -221,7 +213,7 @@ static void bnx2x_stats_comp(struct bnx2x *bp)
  */
 
 /* should be called under stats_sema */
-static void __bnx2x_stats_pmf_update(struct bnx2x *bp)
+static void bnx2x_stats_pmf_update(struct bnx2x *bp)
 {
        struct dmae_command *dmae;
        u32 opcode;
@@ -519,7 +511,7 @@ static void bnx2x_func_stats_init(struct bnx2x *bp)
 }
 
 /* should be called under stats_sema */
-static void __bnx2x_stats_start(struct bnx2x *bp)
+static void bnx2x_stats_start(struct bnx2x *bp)
 {
        if (IS_PF(bp)) {
                if (bp->port.pmf)
@@ -531,34 +523,13 @@ static void __bnx2x_stats_start(struct bnx2x *bp)
                bnx2x_hw_stats_post(bp);
                bnx2x_storm_stats_post(bp);
        }
-
-       bp->stats_started = true;
-}
-
-static void bnx2x_stats_start(struct bnx2x *bp)
-{
-       if (down_timeout(&bp->stats_sema, HZ/10))
-               BNX2X_ERR("Unable to acquire stats lock\n");
-       __bnx2x_stats_start(bp);
-       up(&bp->stats_sema);
 }
 
 static void bnx2x_stats_pmf_start(struct bnx2x *bp)
 {
-       if (down_timeout(&bp->stats_sema, HZ/10))
-               BNX2X_ERR("Unable to acquire stats lock\n");
        bnx2x_stats_comp(bp);
-       __bnx2x_stats_pmf_update(bp);
-       __bnx2x_stats_start(bp);
-       up(&bp->stats_sema);
-}
-
-static void bnx2x_stats_pmf_update(struct bnx2x *bp)
-{
-       if (down_timeout(&bp->stats_sema, HZ/10))
-               BNX2X_ERR("Unable to acquire stats lock\n");
-       __bnx2x_stats_pmf_update(bp);
-       up(&bp->stats_sema);
+       bnx2x_stats_pmf_update(bp);
+       bnx2x_stats_start(bp);
 }
 
 static void bnx2x_stats_restart(struct bnx2x *bp)
@@ -568,11 +539,9 @@ static void bnx2x_stats_restart(struct bnx2x *bp)
         */
        if (IS_VF(bp))
                return;
-       if (down_timeout(&bp->stats_sema, HZ/10))
-               BNX2X_ERR("Unable to acquire stats lock\n");
+
        bnx2x_stats_comp(bp);
-       __bnx2x_stats_start(bp);
-       up(&bp->stats_sema);
+       bnx2x_stats_start(bp);
 }
 
 static void bnx2x_bmac_stats_update(struct bnx2x *bp)
@@ -1246,18 +1215,12 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 {
        u32 *stats_comp = bnx2x_sp(bp, stats_comp);
 
-       /* we run update from timer context, so give up
-        * if somebody is in the middle of transition
-        */
-       if (down_trylock(&bp->stats_sema))
+       if (bnx2x_edebug_stats_stopped(bp))
                return;
 
-       if (bnx2x_edebug_stats_stopped(bp) || !bp->stats_started)
-               goto out;
-
        if (IS_PF(bp)) {
                if (*stats_comp != DMAE_COMP_VAL)
-                       goto out;
+                       return;
 
                if (bp->port.pmf)
                        bnx2x_hw_stats_update(bp);
@@ -1267,7 +1230,7 @@ static void bnx2x_stats_update(struct bnx2x *bp)
                                BNX2X_ERR("storm stats were not updated for 3 times\n");
                                bnx2x_panic();
                        }
-                       goto out;
+                       return;
                }
        } else {
                /* vf doesn't collect HW statistics, and doesn't get completions
@@ -1281,7 +1244,7 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 
        /* vf is done */
        if (IS_VF(bp))
-               goto out;
+               return;
 
        if (netif_msg_timer(bp)) {
                struct bnx2x_eth_stats *estats = &bp->eth_stats;
@@ -1292,9 +1255,6 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 
        bnx2x_hw_stats_post(bp);
        bnx2x_storm_stats_post(bp);
-
-out:
-       up(&bp->stats_sema);
 }
 
 static void bnx2x_port_stats_stop(struct bnx2x *bp)
@@ -1358,12 +1318,7 @@ static void bnx2x_port_stats_stop(struct bnx2x *bp)
 
 static void bnx2x_stats_stop(struct bnx2x *bp)
 {
-       int update = 0;
-
-       if (down_timeout(&bp->stats_sema, HZ/10))
-               BNX2X_ERR("Unable to acquire stats lock\n");
-
-       bp->stats_started = false;
+       bool update = false;
 
        bnx2x_stats_comp(bp);
 
@@ -1381,8 +1336,6 @@ static void bnx2x_stats_stop(struct bnx2x *bp)
                bnx2x_hw_stats_post(bp);
                bnx2x_stats_comp(bp);
        }
-
-       up(&bp->stats_sema);
 }
 
 static void bnx2x_stats_do_nothing(struct bnx2x *bp)
@@ -1410,18 +1363,28 @@ static const struct {
 
 void bnx2x_stats_handle(struct bnx2x *bp, enum bnx2x_stats_event event)
 {
-       enum bnx2x_stats_state state;
-       void (*action)(struct bnx2x *bp);
+       enum bnx2x_stats_state state = bp->stats_state;
+
        if (unlikely(bp->panic))
                return;
 
-       spin_lock_bh(&bp->stats_lock);
-       state = bp->stats_state;
+       /* Statistics updates run from timer context, and we don't want to
+        * block that context if someone is in the middle of a transition.
+        * For other events, wait until the lock is taken.
+        */
+       if (!mutex_trylock(&bp->stats_lock)) {
+               if (event == STATS_EVENT_UPDATE)
+                       return;
+
+               DP(BNX2X_MSG_STATS,
+                  "Unlikely stats' lock contention [event %d]\n", event);
+               mutex_lock(&bp->stats_lock);
+       }
+
+       bnx2x_stats_stm[state][event].action(bp);
        bp->stats_state = bnx2x_stats_stm[state][event].next_state;
-       action = bnx2x_stats_stm[state][event].action;
-       spin_unlock_bh(&bp->stats_lock);
 
-       action(bp);
+       mutex_unlock(&bp->stats_lock);
 
        if ((event != STATS_EVENT_UPDATE) || netif_msg_timer(bp))
                DP(BNX2X_MSG_STATS, "state %d -> event %d -> state %d\n",
@@ -1998,13 +1961,34 @@ void bnx2x_afex_collect_stats(struct bnx2x *bp, void *void_afex_stats,
        }
 }
 
-void bnx2x_stats_safe_exec(struct bnx2x *bp,
-                          void (func_to_exec)(void *cookie),
-                          void *cookie){
-       if (down_timeout(&bp->stats_sema, HZ/10))
-               BNX2X_ERR("Unable to acquire stats lock\n");
+int bnx2x_stats_safe_exec(struct bnx2x *bp,
+                         void (func_to_exec)(void *cookie),
+                         void *cookie)
+{
+       int cnt = 10, rc = 0;
+
+       /* Wait for statistics to end [while blocking further requests],
+        * then run the supplied function 'safely'.
+        */
+       mutex_lock(&bp->stats_lock);
+
        bnx2x_stats_comp(bp);
+       while (bp->stats_pending && cnt--)
+               if (bnx2x_storm_stats_update(bp))
+                       usleep_range(1000, 2000);
+       if (bp->stats_pending) {
+               BNX2X_ERR("Failed to wait for stats pending to clear [possibly FW is stuck]\n");
+               rc = -EBUSY;
+               goto out;
+       }
+
        func_to_exec(cookie);
-       __bnx2x_stats_start(bp);
-       up(&bp->stats_sema);
+
+out:
+       /* No need to restart statistics - if they're enabled, the timer
+        * will restart the statistics.
+        */
+       mutex_unlock(&bp->stats_lock);
+
+       return rc;
 }
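
The bnx2x statistics rework above replaces the spinlock/semaphore pair with a single mutex, and the state-machine handler only trylocks it for the timer-driven STATS_EVENT_UPDATE, so a periodic update is skipped rather than blocked while a transition holds the lock. The following is a small pthread sketch of that trylock-or-skip idea, illustrative only and not the driver code.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Periodic update: best effort, never blocks behind a transition. */
    static void stats_update_event(void)
    {
            if (pthread_mutex_trylock(&stats_lock) != 0) {
                    /* someone is changing state; just skip this tick */
                    return;
            }
            puts("updating statistics");
            pthread_mutex_unlock(&stats_lock);
    }

    /* State transition: must run, so wait for the lock. */
    static void stats_state_event(const char *event)
    {
            pthread_mutex_lock(&stats_lock);
            printf("handling %s\n", event);
            pthread_mutex_unlock(&stats_lock);
    }

    int main(void)
    {
            stats_update_event();
            stats_state_event("stop");
            return 0;
    }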
index 2beceaefdeea7aa5ac53f6a3028cd0fbcacbc51a..965539a9dabe7e4702e1ba6b382aefc69fb26fac 100644 (file)
@@ -539,9 +539,9 @@ struct bnx2x;
 void bnx2x_memset_stats(struct bnx2x *bp);
 void bnx2x_stats_init(struct bnx2x *bp);
 void bnx2x_stats_handle(struct bnx2x *bp, enum bnx2x_stats_event event);
-void bnx2x_stats_safe_exec(struct bnx2x *bp,
-                          void (func_to_exec)(void *cookie),
-                          void *cookie);
+int bnx2x_stats_safe_exec(struct bnx2x *bp,
+                         void (func_to_exec)(void *cookie),
+                         void *cookie);
 
 /**
  * bnx2x_save_statistics - save statistics when unloading.
index 97842d03675b327d65f564b351f24d12bf72c18b..c6ff4890d171a1509c7b3ca2aa21e5b0edd6039f 100644 (file)
@@ -376,8 +376,6 @@ enum {
 enum {
        INGQ_EXTRAS = 2,        /* firmware event queue and */
                                /*   forwarded interrupts */
-       MAX_EGRQ = MAX_ETH_QSETS*2 + MAX_OFLD_QSETS*2
-                  + MAX_CTRL_QUEUES + MAX_RDMA_QUEUES + MAX_ISCSI_QUEUES,
        MAX_INGQ = MAX_ETH_QSETS + MAX_OFLD_QSETS + MAX_RDMA_QUEUES
                   + MAX_RDMA_CIQS + MAX_ISCSI_QUEUES + INGQ_EXTRAS,
 };
@@ -616,11 +614,13 @@ struct sge {
        unsigned int idma_qid[2];   /* SGE IDMA Hung Ingress Queue ID */
 
        unsigned int egr_start;
+       unsigned int egr_sz;
        unsigned int ingr_start;
-       void *egr_map[MAX_EGRQ];    /* qid->queue egress queue map */
-       struct sge_rspq *ingr_map[MAX_INGQ]; /* qid->queue ingress queue map */
-       DECLARE_BITMAP(starving_fl, MAX_EGRQ);
-       DECLARE_BITMAP(txq_maperr, MAX_EGRQ);
+       unsigned int ingr_sz;
+       void **egr_map;    /* qid->queue egress queue map */
+       struct sge_rspq **ingr_map; /* qid->queue ingress queue map */
+       unsigned long *starving_fl;
+       unsigned long *txq_maperr;
        struct timer_list rx_timer; /* refills starving FLs */
        struct timer_list tx_timer; /* checks Tx queues */
 };
@@ -1136,6 +1136,8 @@ int cxgb4_t4_bar2_sge_qregs(struct adapter *adapter,
 
 unsigned int qtimer_val(const struct adapter *adap,
                        const struct sge_rspq *q);
+
+int t4_init_devlog_params(struct adapter *adapter);
 int t4_init_sge_params(struct adapter *adapter);
 int t4_init_tp_params(struct adapter *adap);
 int t4_filter_field_shift(const struct adapter *adap, int filter_sel);
index 78854ceb0870a29004ab04261fa26b296b958d4e..dcb0479452907abd43732aa9332f927f7f0c3ef3 100644 (file)
@@ -670,9 +670,13 @@ static int cctrl_tbl_show(struct seq_file *seq, void *v)
                "0.9375" };
 
        int i;
-       u16 incr[NMTUS][NCCTRL_WIN];
+       u16 (*incr)[NCCTRL_WIN];
        struct adapter *adap = seq->private;
 
+       incr = kmalloc(sizeof(*incr) * NMTUS, GFP_KERNEL);
+       if (!incr)
+               return -ENOMEM;
+
        t4_read_cong_tbl(adap, incr);
 
        for (i = 0; i < NCCTRL_WIN; ++i) {
@@ -685,6 +689,8 @@ static int cctrl_tbl_show(struct seq_file *seq, void *v)
                           adap->params.a_wnd[i],
                           dec_fac[adap->params.b_wnd[i]]);
        }
+
+       kfree(incr);
        return 0;
 }
 
index a22cf932ca3536920798ef50241fd9b43c26857a..d92995138f7ef9253c3b2ac37efba2c491e2f528 100644 (file)
@@ -920,7 +920,7 @@ static void quiesce_rx(struct adapter *adap)
 {
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(adap->sge.ingr_map); i++) {
+       for (i = 0; i < adap->sge.ingr_sz; i++) {
                struct sge_rspq *q = adap->sge.ingr_map[i];
 
                if (q && q->handler) {
@@ -934,6 +934,21 @@ static void quiesce_rx(struct adapter *adap)
        }
 }
 
+/* Disable interrupt and napi handler */
+static void disable_interrupts(struct adapter *adap)
+{
+       if (adap->flags & FULL_INIT_DONE) {
+               t4_intr_disable(adap);
+               if (adap->flags & USING_MSIX) {
+                       free_msix_queue_irqs(adap);
+                       free_irq(adap->msix_info[0].vec, adap);
+               } else {
+                       free_irq(adap->pdev->irq, adap);
+               }
+               quiesce_rx(adap);
+       }
+}
+
 /*
  * Enable NAPI scheduling and interrupt generation for all Rx queues.
  */
@@ -941,7 +956,7 @@ static void enable_rx(struct adapter *adap)
 {
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(adap->sge.ingr_map); i++) {
+       for (i = 0; i < adap->sge.ingr_sz; i++) {
                struct sge_rspq *q = adap->sge.ingr_map[i];
 
                if (!q)
@@ -970,8 +985,8 @@ static int setup_sge_queues(struct adapter *adap)
        int err, msi_idx, i, j;
        struct sge *s = &adap->sge;
 
-       bitmap_zero(s->starving_fl, MAX_EGRQ);
-       bitmap_zero(s->txq_maperr, MAX_EGRQ);
+       bitmap_zero(s->starving_fl, s->egr_sz);
+       bitmap_zero(s->txq_maperr, s->egr_sz);
 
        if (adap->flags & USING_MSIX)
                msi_idx = 1;         /* vector 0 is for non-queue interrupts */
@@ -983,6 +998,19 @@ static int setup_sge_queues(struct adapter *adap)
                msi_idx = -((int)s->intrq.abs_id + 1);
        }
 
+       /* NOTE: If you add/delete any Ingress/Egress Queue allocations in here,
+        * don't forget to update the following, which need to be
+        * kept in sync with any changes here.
+        *
+        * 1. The calculations of MAX_INGQ in cxgb4.h.
+        *
+        * 2. Update enable_msix/name_msix_vecs/request_msix_queue_irqs
+        *    to accommodate any new/deleted Ingress Queues
+        *    which need MSI-X Vectors.
+        *
+        * 3. Update sge_qinfo_show() to include information on the
+        *    new/deleted queues.
+        */
        err = t4_sge_alloc_rxq(adap, &s->fw_evtq, true, adap->port[0],
                               msi_idx, NULL, fwevtq_handler);
        if (err) {
@@ -4244,19 +4272,12 @@ static int cxgb_up(struct adapter *adap)
 
 static void cxgb_down(struct adapter *adapter)
 {
-       t4_intr_disable(adapter);
        cancel_work_sync(&adapter->tid_release_task);
        cancel_work_sync(&adapter->db_full_task);
        cancel_work_sync(&adapter->db_drop_task);
        adapter->tid_release_task_busy = false;
        adapter->tid_release_head = NULL;
 
-       if (adapter->flags & USING_MSIX) {
-               free_msix_queue_irqs(adapter);
-               free_irq(adapter->msix_info[0].vec, adapter);
-       } else
-               free_irq(adapter->pdev->irq, adapter);
-       quiesce_rx(adapter);
        t4_sge_stop(adapter);
        t4_free_sge_resources(adapter);
        adapter->flags &= ~FULL_INIT_DONE;
@@ -4733,8 +4754,9 @@ static int adap_init1(struct adapter *adap, struct fw_caps_config_cmd *c)
        if (ret < 0)
                return ret;
 
-       ret = t4_cfg_pfvf(adap, adap->fn, adap->fn, 0, MAX_EGRQ, 64, MAX_INGQ,
-                         0, 0, 4, 0xf, 0xf, 16, FW_CMD_CAP_PF, FW_CMD_CAP_PF);
+       ret = t4_cfg_pfvf(adap, adap->fn, adap->fn, 0, adap->sge.egr_sz, 64,
+                         MAX_INGQ, 0, 0, 4, 0xf, 0xf, 16, FW_CMD_CAP_PF,
+                         FW_CMD_CAP_PF);
        if (ret < 0)
                return ret;
 
@@ -5088,10 +5110,15 @@ static int adap_init0(struct adapter *adap)
        enum dev_state state;
        u32 params[7], val[7];
        struct fw_caps_config_cmd caps_cmd;
-       struct fw_devlog_cmd devlog_cmd;
-       u32 devlog_meminfo;
        int reset = 1;
 
+       /* Grab Firmware Device Log parameters as early as possible so we have
+        * access to it for debugging, etc.
+        */
+       ret = t4_init_devlog_params(adap);
+       if (ret < 0)
+               return ret;
+
        /* Contact FW, advertising Master capability */
        ret = t4_fw_hello(adap, adap->mbox, adap->mbox, MASTER_MAY, &state);
        if (ret < 0) {
@@ -5169,30 +5196,6 @@ static int adap_init0(struct adapter *adap)
        if (ret < 0)
                goto bye;
 
-       /* Read firmware device log parameters.  We really need to find a way
-        * to get these parameters initialized with some default values (which
-        * are likely to be correct) for the case where we either don't
-        * attache to the firmware or it's crashed when we probe the adapter.
-        * That way we'll still be able to perform early firmware startup
-        * debugging ...  If the request to get the Firmware's Device Log
-        * parameters fails, we'll live so we don't make that a fatal error.
-        */
-       memset(&devlog_cmd, 0, sizeof(devlog_cmd));
-       devlog_cmd.op_to_write = htonl(FW_CMD_OP_V(FW_DEVLOG_CMD) |
-                                      FW_CMD_REQUEST_F | FW_CMD_READ_F);
-       devlog_cmd.retval_len16 = htonl(FW_LEN16(devlog_cmd));
-       ret = t4_wr_mbox(adap, adap->mbox, &devlog_cmd, sizeof(devlog_cmd),
-                        &devlog_cmd);
-       if (ret == 0) {
-               devlog_meminfo =
-                       ntohl(devlog_cmd.memtype_devlog_memaddr16_devlog);
-               adap->params.devlog.memtype =
-                       FW_DEVLOG_CMD_MEMTYPE_DEVLOG_G(devlog_meminfo);
-               adap->params.devlog.start =
-                       FW_DEVLOG_CMD_MEMADDR16_DEVLOG_G(devlog_meminfo) << 4;
-               adap->params.devlog.size = ntohl(devlog_cmd.memsize_devlog);
-       }
-
        /*
         * Find out what ports are available to us.  Note that we need to do
         * this before calling adap_init0_no_config() since it needs nports
@@ -5293,6 +5296,51 @@ static int adap_init0(struct adapter *adap)
        adap->tids.nftids = val[4] - val[3] + 1;
        adap->sge.ingr_start = val[5];
 
+       /* qids (ingress/egress) returned from firmware can be anywhere
+        * in the range from EQ(IQFLINT)_START to EQ(IQFLINT)_END.
+        * Hence driver needs to allocate memory for this range to
+        * store the queue info. Get the highest IQFLINT/EQ index returned
+        * in FW_EQ_*_CMD.alloc command.
+        */
+       params[0] = FW_PARAM_PFVF(EQ_END);
+       params[1] = FW_PARAM_PFVF(IQFLINT_END);
+       ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params, val);
+       if (ret < 0)
+               goto bye;
+       adap->sge.egr_sz = val[0] - adap->sge.egr_start + 1;
+       adap->sge.ingr_sz = val[1] - adap->sge.ingr_start + 1;
+
+       adap->sge.egr_map = kcalloc(adap->sge.egr_sz,
+                                   sizeof(*adap->sge.egr_map), GFP_KERNEL);
+       if (!adap->sge.egr_map) {
+               ret = -ENOMEM;
+               goto bye;
+       }
+
+       adap->sge.ingr_map = kcalloc(adap->sge.ingr_sz,
+                                    sizeof(*adap->sge.ingr_map), GFP_KERNEL);
+       if (!adap->sge.ingr_map) {
+               ret = -ENOMEM;
+               goto bye;
+       }
+
+       /* Allocate the memory for the various egress queue bitmaps,
+        * i.e. starving_fl and txq_maperr.
+        */
+       adap->sge.starving_fl = kcalloc(BITS_TO_LONGS(adap->sge.egr_sz),
+                                       sizeof(long), GFP_KERNEL);
+       if (!adap->sge.starving_fl) {
+               ret = -ENOMEM;
+               goto bye;
+       }
+
+       adap->sge.txq_maperr = kcalloc(BITS_TO_LONGS(adap->sge.egr_sz),
+                                      sizeof(long), GFP_KERNEL);
+       if (!adap->sge.txq_maperr) {
+               ret = -ENOMEM;
+               goto bye;
+       }
+
        params[0] = FW_PARAM_PFVF(CLIP_START);
        params[1] = FW_PARAM_PFVF(CLIP_END);
        ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params, val);
@@ -5501,6 +5549,10 @@ static int adap_init0(struct adapter *adap)
         * happened to HW/FW, stop issuing commands.
         */
 bye:
+       kfree(adap->sge.egr_map);
+       kfree(adap->sge.ingr_map);
+       kfree(adap->sge.starving_fl);
+       kfree(adap->sge.txq_maperr);
        if (ret != -ETIMEDOUT && ret != -EIO)
                t4_fw_bye(adap, adap->mbox);
        return ret;
@@ -5528,6 +5580,7 @@ static pci_ers_result_t eeh_err_detected(struct pci_dev *pdev,
                netif_carrier_off(dev);
        }
        spin_unlock(&adap->stats_lock);
+       disable_interrupts(adap);
        if (adap->flags & FULL_INIT_DONE)
                cxgb_down(adap);
        rtnl_unlock();
@@ -5912,6 +5965,10 @@ static void free_some_resources(struct adapter *adapter)
 
        t4_free_mem(adapter->l2t);
        t4_free_mem(adapter->tids.tid_tab);
+       kfree(adapter->sge.egr_map);
+       kfree(adapter->sge.ingr_map);
+       kfree(adapter->sge.starving_fl);
+       kfree(adapter->sge.txq_maperr);
        disable_msi(adapter);
 
        for_each_port(adapter, i)
@@ -6237,6 +6294,8 @@ static void remove_one(struct pci_dev *pdev)
                if (is_offload(adapter))
                        detach_ulds(adapter);
 
+               disable_interrupts(adapter);
+
                for_each_port(adapter, i)
                        if (adapter->port[i]->reg_state == NETREG_REGISTERED)
                                unregister_netdev(adapter->port[i]);
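
Rather than the fixed MAX_EGRQ arrays, the cxgb4 changes above query the firmware for the EQ/IQFLINT index ranges and allocate the qid-to-queue maps and the starving_fl/txq_maperr bitmaps to exactly that size with kcalloc() and BITS_TO_LONGS(). A minimal userspace sketch of sizing a bitmap that way follows; the BITS_TO_LONGS definition mirrors the kernel macro, and the rest is illustrative.

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define BITS_PER_LONG    (CHAR_BIT * sizeof(long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    int main(void)
    {
            /* pretend the firmware reported this many egress queue ids */
            unsigned int egr_sz = 1234;
            unsigned long *starving_fl;
            void **egr_map;

            egr_map = calloc(egr_sz, sizeof(*egr_map));
            starving_fl = calloc(BITS_TO_LONGS(egr_sz), sizeof(long));
            if (!egr_map || !starving_fl) {
                    free(egr_map);
                    free(starving_fl);
                    return 1;
            }

            printf("%u queues -> %zu longs of bitmap\n",
                   egr_sz, (size_t)BITS_TO_LONGS(egr_sz));

            free(starving_fl);
            free(egr_map);
            return 0;
    }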
index b4b9f6048fe730dc1287a648a5e61c7f2d92dd7d..b688b32c21fe530aa0cd48b61323bca3ed269b53 100644 (file)
@@ -2171,7 +2171,7 @@ static void sge_rx_timer_cb(unsigned long data)
        struct adapter *adap = (struct adapter *)data;
        struct sge *s = &adap->sge;
 
-       for (i = 0; i < ARRAY_SIZE(s->starving_fl); i++)
+       for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
                for (m = s->starving_fl[i]; m; m &= m - 1) {
                        struct sge_eth_rxq *rxq;
                        unsigned int id = __ffs(m) + i * BITS_PER_LONG;
@@ -2259,7 +2259,7 @@ static void sge_tx_timer_cb(unsigned long data)
        struct adapter *adap = (struct adapter *)data;
        struct sge *s = &adap->sge;
 
-       for (i = 0; i < ARRAY_SIZE(s->txq_maperr); i++)
+       for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++)
                for (m = s->txq_maperr[i]; m; m &= m - 1) {
                        unsigned long id = __ffs(m) + i * BITS_PER_LONG;
                        struct sge_ofld_txq *txq = s->egr_map[id];
@@ -2741,7 +2741,8 @@ void t4_free_sge_resources(struct adapter *adap)
                free_rspq_fl(adap, &adap->sge.intrq, NULL);
 
        /* clear the reverse egress queue map */
-       memset(adap->sge.egr_map, 0, sizeof(adap->sge.egr_map));
+       memset(adap->sge.egr_map, 0,
+              adap->sge.egr_sz * sizeof(*adap->sge.egr_map));
 }
 
 void t4_sge_start(struct adapter *adap)
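
The two timer callbacks above now size their scan by BITS_TO_LONGS(s->egr_sz)
rather than a fixed ARRAY_SIZE(), walking each word's set bits with the usual
"m &= m - 1" trick and __ffs().  A minimal userspace sketch of that set-bit
walk, using __builtin_ctzl() as a stand-in for the kernel's __ffs(); all names
below are illustrative, not driver code:

#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
/* Same rounding the kernel's BITS_TO_LONGS() performs. */
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Visit every set bit in a runtime-sized bitmap, word by word. */
static void for_each_set_bit_demo(const unsigned long *map, size_t nbits)
{
        size_t i;

        for (i = 0; i < BITS_TO_LONGS(nbits); i++) {
                unsigned long m;

                for (m = map[i]; m; m &= m - 1) {
                        /* __builtin_ctzl() gives the lowest set bit's index */
                        size_t id = __builtin_ctzl(m) + i * BITS_PER_LONG;

                        printf("queue %zu needs service\n", id);
                }
        }
}

int main(void)
{
        size_t egr_sz = 200;    /* runtime-sized, like adap->sge.egr_sz */
        unsigned long *starving = calloc(BITS_TO_LONGS(egr_sz), sizeof(long));

        if (!starving)
                return 1;
        starving[0] |= 1UL << 5;                        /* mark queue 5 */
        starving[130 / BITS_PER_LONG] |= 1UL << (130 % BITS_PER_LONG);
        for_each_set_bit_demo(starving, egr_sz);        /* prints 5 and 130 */
        free(starving);
        return 0;
}
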
index 1abdfa123c6cf8d42a4788400539b40b05ed84eb..ee394dc68303851153a3598fe4455cec972597d9 100644 (file)
@@ -4458,6 +4458,59 @@ int cxgb4_t4_bar2_sge_qregs(struct adapter *adapter,
        return 0;
 }
 
+/**
+ *     t4_init_devlog_params - initialize adapter->params.devlog
+ *     @adap: the adapter
+ *
+ *     Initialize various fields of the adapter's Firmware Device Log
+ *     Parameters structure.
+ */
+int t4_init_devlog_params(struct adapter *adap)
+{
+       struct devlog_params *dparams = &adap->params.devlog;
+       u32 pf_dparams;
+       unsigned int devlog_meminfo;
+       struct fw_devlog_cmd devlog_cmd;
+       int ret;
+
+       /* If we're dealing with newer firmware, the Device Log Parameters
+        * are stored in a designated register which allows us to access the
+        * Device Log even if we can't talk to the firmware.
+        */
+       pf_dparams =
+               t4_read_reg(adap, PCIE_FW_REG(PCIE_FW_PF_A, PCIE_FW_PF_DEVLOG));
+       if (pf_dparams) {
+               unsigned int nentries, nentries128;
+
+               dparams->memtype = PCIE_FW_PF_DEVLOG_MEMTYPE_G(pf_dparams);
+               dparams->start = PCIE_FW_PF_DEVLOG_ADDR16_G(pf_dparams) << 4;
+
+               nentries128 = PCIE_FW_PF_DEVLOG_NENTRIES128_G(pf_dparams);
+               nentries = (nentries128 + 1) * 128;
+               dparams->size = nentries * sizeof(struct fw_devlog_e);
+
+               return 0;
+       }
+
+       /* Otherwise, ask the firmware for its Device Log Parameters.
+        */
+       memset(&devlog_cmd, 0, sizeof(devlog_cmd));
+       devlog_cmd.op_to_write = htonl(FW_CMD_OP_V(FW_DEVLOG_CMD) |
+                                      FW_CMD_REQUEST_F | FW_CMD_READ_F);
+       devlog_cmd.retval_len16 = htonl(FW_LEN16(devlog_cmd));
+       ret = t4_wr_mbox(adap, adap->mbox, &devlog_cmd, sizeof(devlog_cmd),
+                        &devlog_cmd);
+       if (ret)
+               return ret;
+
+       devlog_meminfo = ntohl(devlog_cmd.memtype_devlog_memaddr16_devlog);
+       dparams->memtype = FW_DEVLOG_CMD_MEMTYPE_DEVLOG_G(devlog_meminfo);
+       dparams->start = FW_DEVLOG_CMD_MEMADDR16_DEVLOG_G(devlog_meminfo) << 4;
+       dparams->size = ntohl(devlog_cmd.memsize_devlog);
+
+       return 0;
+}
+
 /**
  *     t4_init_sge_params - initialize adap->params.sge
  *     @adapter: the adapter
index 231a725f6d5d1679c4d6b07ff6694ac387f0b66a..326674b19983825af5631993b4427e0132ed6ba6 100644 (file)
@@ -63,6 +63,8 @@
 #define MC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4)
 #define EDC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4)
 
+#define PCIE_FW_REG(reg_addr, idx) ((reg_addr) + (idx) * 4)
+
 #define SGE_PF_KDOORBELL_A 0x0
 
 #define QID_S    15
 #define PFNUM_V(x) ((x) << PFNUM_S)
 
 #define PCIE_FW_A 0x30b8
+#define PCIE_FW_PF_A 0x30bc
 
 #define PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS_A 0x5908
 
index 9b353a88cbdab13a7ce85456e393edf197dd1740..a4a19e0ec7f5d76590f0e73641d5e03afbcde43a 100644 (file)
@@ -101,7 +101,7 @@ enum fw_wr_opcodes {
        FW_RI_BIND_MW_WR               = 0x18,
        FW_RI_FR_NSMR_WR               = 0x19,
        FW_RI_INV_LSTAG_WR             = 0x1a,
-       FW_LASTC2E_WR                  = 0x40
+       FW_LASTC2E_WR                  = 0x70
 };
 
 struct fw_wr_hdr {
@@ -993,6 +993,7 @@ enum fw_memtype_cf {
        FW_MEMTYPE_CF_EXTMEM            = 0x2,
        FW_MEMTYPE_CF_FLASH             = 0x4,
        FW_MEMTYPE_CF_INTERNAL          = 0x5,
+       FW_MEMTYPE_CF_EXTMEM1           = 0x6,
 };
 
 struct fw_caps_config_cmd {
@@ -1035,6 +1036,7 @@ enum fw_params_mnem {
        FW_PARAMS_MNEM_PFVF             = 2,    /* function params */
        FW_PARAMS_MNEM_REG              = 3,    /* limited register access */
        FW_PARAMS_MNEM_DMAQ             = 4,    /* dma queue params */
+       FW_PARAMS_MNEM_CHNET            = 5,    /* chnet params */
        FW_PARAMS_MNEM_LAST
 };
 
@@ -3102,7 +3104,8 @@ enum fw_devlog_facility {
        FW_DEVLOG_FACILITY_FCOE         = 0x2E,
        FW_DEVLOG_FACILITY_FOISCSI      = 0x30,
        FW_DEVLOG_FACILITY_FOFCOE       = 0x32,
-       FW_DEVLOG_FACILITY_MAX          = 0x32,
+       FW_DEVLOG_FACILITY_CHNET        = 0x34,
+       FW_DEVLOG_FACILITY_MAX          = 0x34,
 };
 
 /* log message format */
@@ -3139,4 +3142,36 @@ struct fw_devlog_cmd {
        (((x) >> FW_DEVLOG_CMD_MEMADDR16_DEVLOG_S) & \
         FW_DEVLOG_CMD_MEMADDR16_DEVLOG_M)
 
+/* P C I E   F W   P F 7   R E G I S T E R */
+
+/* PF7 stores the Firmware Device Log parameters which allow Host Drivers to
+ * access the "devlog" without needing to contact firmware.  The encoding is
+ * mostly the same as that returned by the DEVLOG command except for the size,
+ * which is encoded here as the number of 128-entry blocks minus one rather
+ * than as the memory size as is done in the DEVLOG command.  Thus, 0 means 128
+ * entries and 15 means 2048.  This in turn constrains the allowed values for
+ * the devlog size ...
+ */
+#define PCIE_FW_PF_DEVLOG              7
+
+#define PCIE_FW_PF_DEVLOG_NENTRIES128_S        28
+#define PCIE_FW_PF_DEVLOG_NENTRIES128_M        0xf
+#define PCIE_FW_PF_DEVLOG_NENTRIES128_V(x) \
+       ((x) << PCIE_FW_PF_DEVLOG_NENTRIES128_S)
+#define PCIE_FW_PF_DEVLOG_NENTRIES128_G(x) \
+       (((x) >> PCIE_FW_PF_DEVLOG_NENTRIES128_S) & \
+        PCIE_FW_PF_DEVLOG_NENTRIES128_M)
+
+#define PCIE_FW_PF_DEVLOG_ADDR16_S     4
+#define PCIE_FW_PF_DEVLOG_ADDR16_M     0xffffff
+#define PCIE_FW_PF_DEVLOG_ADDR16_V(x)  ((x) << PCIE_FW_PF_DEVLOG_ADDR16_S)
+#define PCIE_FW_PF_DEVLOG_ADDR16_G(x) \
+       (((x) >> PCIE_FW_PF_DEVLOG_ADDR16_S) & PCIE_FW_PF_DEVLOG_ADDR16_M)
+
+#define PCIE_FW_PF_DEVLOG_MEMTYPE_S    0
+#define PCIE_FW_PF_DEVLOG_MEMTYPE_M    0xf
+#define PCIE_FW_PF_DEVLOG_MEMTYPE_V(x) ((x) << PCIE_FW_PF_DEVLOG_MEMTYPE_S)
+#define PCIE_FW_PF_DEVLOG_MEMTYPE_G(x) \
+       (((x) >> PCIE_FW_PF_DEVLOG_MEMTYPE_S) & PCIE_FW_PF_DEVLOG_MEMTYPE_M)
+
 #endif /* _T4FW_INTERFACE_H_ */
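
The PCIE_FW_PF_DEVLOG comment above describes a packed 32-bit layout: MEMTYPE
in bits 3:0, the start address in 16-byte units in bits 27:4, and the entry
count (in 128-entry blocks, minus one) in bits 31:28.  A hedged decode sketch
that mirrors the _G() extractors; converting the entry count to a byte size
would additionally multiply by sizeof(struct fw_devlog_e), which is omitted
here:

#include <stdint.h>
#include <stdio.h>

struct devlog_params_demo {
        unsigned int memtype;
        uint32_t start;         /* byte offset within that memory */
        unsigned int nentries;  /* number of log entries */
};

static struct devlog_params_demo decode_pf_devlog(uint32_t reg)
{
        struct devlog_params_demo p;

        p.memtype  = reg & 0xf;
        p.start    = ((reg >> 4) & 0xffffff) << 4;    /* 16-byte units -> bytes */
        p.nentries = (((reg >> 28) & 0xf) + 1) * 128; /* 0 -> 128, 15 -> 2048 */
        return p;
}

int main(void)
{
        struct devlog_params_demo p = decode_pf_devlog(0xf0001232);

        /* prints: memtype 2 start 0x1230 entries 2048 */
        printf("memtype %u start 0x%x entries %u\n",
               p.memtype, p.start, p.nentries);
        return 0;
}
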
index e2bd3f74785851d0db24c282f5a497972bcaf20f..b9d1cbac0eee3c97e76cff147df732601b83f714 100644 (file)
 #define __T4FW_VERSION_H__
 
 #define T4FW_VERSION_MAJOR 0x01
-#define T4FW_VERSION_MINOR 0x0C
-#define T4FW_VERSION_MICRO 0x19
+#define T4FW_VERSION_MINOR 0x0D
+#define T4FW_VERSION_MICRO 0x20
 #define T4FW_VERSION_BUILD 0x00
 
 #define T5FW_VERSION_MAJOR 0x01
-#define T5FW_VERSION_MINOR 0x0C
-#define T5FW_VERSION_MICRO 0x19
+#define T5FW_VERSION_MINOR 0x0D
+#define T5FW_VERSION_MICRO 0x20
 #define T5FW_VERSION_BUILD 0x00
 
 #endif
index 0545f0de1c52be282af53d6a5f03916f93ad7013..e0d711071afb7d6d80763666141477fb3dc4c479 100644 (file)
@@ -1004,7 +1004,7 @@ static inline void ring_tx_db(struct adapter *adapter, struct sge_txq *tq,
                                              ? (tq->pidx - 1)
                                              : (tq->size - 1));
                        __be64 *src = (__be64 *)&tq->desc[index];
-                       __be64 __iomem *dst = (__be64 *)(tq->bar2_addr +
+                       __be64 __iomem *dst = (__be64 __iomem *)(tq->bar2_addr +
                                                         SGE_UDB_WCDOORBELL);
                        unsigned int count = EQ_UNIT / sizeof(__be64);
 
@@ -1018,7 +1018,11 @@ static inline void ring_tx_db(struct adapter *adapter, struct sge_txq *tq,
                         * DMA.
                         */
                        while (count) {
-                               writeq(*src, dst);
+                               /* the (__force u64) is because the compiler
+                                * doesn't understand the endian swizzling
+                                * going on
+                                */
+                               writeq((__force u64)*src, dst);
                                src++;
                                dst++;
                                count--;
@@ -1252,8 +1256,8 @@ int t4vf_eth_xmit(struct sk_buff *skb, struct net_device *dev)
        BUG_ON(DIV_ROUND_UP(ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
        wr = (void *)&txq->q.desc[txq->q.pidx];
        wr->equiq_to_len16 = cpu_to_be32(wr_mid);
-       wr->r3[0] = cpu_to_be64(0);
-       wr->r3[1] = cpu_to_be64(0);
+       wr->r3[0] = cpu_to_be32(0);
+       wr->r3[1] = cpu_to_be32(0);
        skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len);
        end = (u64 *)wr + flits;
 
index 1b5506df35b15ab74eaf1ecea220da4ad6278a3f..280b4a21584934f56fbcc24399dfd0cf5feb694e 100644 (file)
@@ -210,10 +210,10 @@ int t4vf_wr_mbox_core(struct adapter *adapter, const void *cmd, int size,
 
                        if (rpl) {
                                /* request bit in high-order BE word */
-                               WARN_ON((be32_to_cpu(*(const u32 *)cmd)
+                               WARN_ON((be32_to_cpu(*(const __be32 *)cmd)
                                         & FW_CMD_REQUEST_F) == 0);
                                get_mbox_rpl(adapter, rpl, size, mbox_data);
-                               WARN_ON((be32_to_cpu(*(u32 *)rpl)
+                               WARN_ON((be32_to_cpu(*(__be32 *)rpl)
                                         & FW_CMD_REQUEST_F) != 0);
                        }
                        t4_write_reg(adapter, mbox_ctl,
@@ -484,7 +484,7 @@ int t4_bar2_sge_qregs(struct adapter *adapter,
         *  o The BAR2 Queue ID.
         *  o The BAR2 Queue ID Offset into the BAR2 page.
         */
-       bar2_page_offset = ((qid >> qpp_shift) << page_shift);
+       bar2_page_offset = ((u64)(qid >> qpp_shift) << page_shift);
        bar2_qid = qid & qpp_mask;
        bar2_qid_offset = bar2_qid * SGE_UDB_SIZE;
 
index 78e1ce09b1ab1deadad177472593e020a8f9d3c2..f6a3a7abd468e1f25fd4c33e874a38f0c85bc4dd 100644 (file)
@@ -1954,6 +1954,7 @@ static int fec_enet_mii_init(struct platform_device *pdev)
        struct fec_enet_private *fep = netdev_priv(ndev);
        struct device_node *node;
        int err = -ENXIO, i;
+       u32 mii_speed, holdtime;
 
        /*
         * The i.MX28 dual fec interfaces are not equal.
@@ -1991,10 +1992,33 @@ static int fec_enet_mii_init(struct platform_device *pdev)
         * Reference Manual has an error on this, and gets fixed on i.MX6Q
         * document.
         */
-       fep->phy_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 5000000);
+       mii_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 5000000);
        if (fep->quirks & FEC_QUIRK_ENET_MAC)
-               fep->phy_speed--;
-       fep->phy_speed <<= 1;
+               mii_speed--;
+       if (mii_speed > 63) {
+               dev_err(&pdev->dev,
+                       "fec clock (%lu) too fast to get right mii speed\n",
+                       clk_get_rate(fep->clk_ipg));
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       /*
+        * The i.MX28 and i.MX6 types have another field in the MSCR (aka
+        * MII_SPEED) register that defines the MDIO output hold time. Earlier
+        * versions are RAZ there, so just ignore the difference and write the
+        * register always.
+        * The minimal hold time according to IEEE 802.3 (clause 22) is 10 ns.
+        * HOLDTIME + 1 is the number of clk cycles the fec is holding the
+        * output.
+        * The HOLDTIME bitfield takes values between 0 and 7 (inclusive).
+        * Given that ceil(clkrate / 5000000) <= 64, the calculation for
+        * holdtime cannot result in a value greater than 3.
+        */
+       holdtime = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 100000000) - 1;
+
+       fep->phy_speed = mii_speed << 1 | holdtime << 8;
+
        writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED);
 
        fep->mii_bus = mdiobus_alloc();
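
The hunk above splits the old phy_speed computation into an MDC divider
(mii_speed, MSCR bits 6:1) and an MDIO output hold time (holdtime, MSCR bits
10:8), per the final "mii_speed << 1 | holdtime << 8".  A hedged sketch of
that arithmetic, with the FEC_QUIRK_ENET_MAC decrement left out:

#include <stdio.h>

/* MDC must stay at or below 2.5 MHz and the MDIO hold time at or above
 * 10 ns; both dividers are derived from the ipg clock rate.
 */
static unsigned int fec_mscr_demo(unsigned long clk_rate)
{
        unsigned long mii_speed = (clk_rate + 5000000 - 1) / 5000000;       /* DIV_ROUND_UP */
        unsigned long holdtime  = (clk_rate + 100000000 - 1) / 100000000 - 1;

        if (mii_speed > 63)
                return 0;       /* the driver reports -EINVAL here */

        return (unsigned int)(mii_speed << 1 | holdtime << 8);
}

int main(void)
{
        /* 66 MHz ipg clock:  mii_speed = 14, holdtime = 0  -> 0x1c  */
        printf("MSCR @  66 MHz: 0x%x\n", fec_mscr_demo(66000000));
        /* 264 MHz ipg clock: mii_speed = 53, holdtime = 2  -> 0x26a */
        printf("MSCR @ 264 MHz: 0x%x\n", fec_mscr_demo(264000000));
        return 0;
}
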
index 357e8b576905a4a61c312e6c9736c087b706b69d..56b774d3a13d4c3856ad01fff0ebe67c128e4f65 100644 (file)
@@ -3893,6 +3893,9 @@ static int ucc_geth_probe(struct platform_device* ofdev)
        ugeth->phy_interface = phy_interface;
        ugeth->max_speed = max_speed;
 
+       /* Carrier starts down, phylib will bring it up */
+       netif_carrier_off(dev);
+
        err = register_netdev(dev);
        if (err) {
                if (netif_msg_probe(ugeth))
index 96208f17bb53be6240ae9e8d0cd30cc09041163e..2db653225a0e7713b21ecf3e80996eccf1a8a7d1 100644 (file)
@@ -2658,16 +2658,11 @@ static int mvneta_stop(struct net_device *dev)
 static int mvneta_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
        struct mvneta_port *pp = netdev_priv(dev);
-       int ret;
 
        if (!pp->phy_dev)
                return -ENOTSUPP;
 
-       ret = phy_mii_ioctl(pp->phy_dev, ifr, cmd);
-       if (!ret)
-               mvneta_adjust_link(dev);
-
-       return ret;
+       return phy_mii_ioctl(pp->phy_dev, ifr, cmd);
 }
 
 /* Ethtool methods */
index a681d7c0bb9f066f8d48ed068f68f0001dad8d0f..546ca4226916dbf2261525c12dc087ba35c7f4d7 100644 (file)
@@ -724,7 +724,8 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
                 * on the host, we deprecate the error message for this
                 * specific command/input_mod/opcode_mod/fw-status to be debug.
                 */
-               if (op == MLX4_CMD_SET_PORT && in_modifier == 1 &&
+               if (op == MLX4_CMD_SET_PORT &&
+                   (in_modifier == 1 || in_modifier == 2) &&
                    op_modifier == 0 && context->fw_status == CMD_STAT_BAD_SIZE)
                        mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n",
                                 op, context->fw_status);
@@ -1993,7 +1994,6 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd,
                        goto reset_slave;
                slave_state[slave].vhcr_dma = ((u64) param) << 48;
                priv->mfunc.master.slave_state[slave].cookie = 0;
-               mutex_init(&priv->mfunc.master.gen_eqe_mutex[slave]);
                break;
        case MLX4_COMM_CMD_VHCR1:
                if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0)
@@ -2225,6 +2225,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev)
                for (i = 0; i < dev->num_slaves; ++i) {
                        s_state = &priv->mfunc.master.slave_state[i];
                        s_state->last_cmd = MLX4_COMM_CMD_RESET;
+                       mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]);
                        for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j)
                                s_state->event_eq[j].eqn = -1;
                        __raw_writel((__force u32) 0,
index ebce5bb24df98b77eec38a1fc6c720987768934b..3485acf03014c7e4df64fa94a770fce00abe5a35 100644 (file)
@@ -2805,13 +2805,6 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
        netif_carrier_off(dev);
        mlx4_en_set_default_moderation(priv);
 
-       err = register_netdev(dev);
-       if (err) {
-               en_err(priv, "Netdev registration failed for port %d\n", port);
-               goto out;
-       }
-       priv->registered = 1;
-
        en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num);
        en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
 
@@ -2853,6 +2846,14 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 
        mlx4_set_stats_bitmap(mdev->dev, &priv->stats_bitmap);
 
+       err = register_netdev(dev);
+       if (err) {
+               en_err(priv, "Netdev registration failed for port %d\n", port);
+               goto out;
+       }
+
+       priv->registered = 1;
+
        return 0;
 
 out:
index 264bc15c1ff212ad649c3547c8a8ffc0b2c22e43..6e70ffee8e87ee2fa9957adeb5994ca346bf155b 100644 (file)
@@ -153,12 +153,10 @@ void mlx4_gen_slave_eqe(struct work_struct *work)
 
                /* All active slaves need to receive the event */
                if (slave == ALL_SLAVES) {
-                       for (i = 0; i < dev->num_slaves; i++) {
-                               if (i != dev->caps.function &&
-                                   master->slave_state[i].active)
-                                       if (mlx4_GEN_EQE(dev, i, eqe))
-                                               mlx4_warn(dev, "Failed to generate event for slave %d\n",
-                                                         i);
+                       for (i = 0; i <= dev->persist->num_vfs; i++) {
+                               if (mlx4_GEN_EQE(dev, i, eqe))
+                                       mlx4_warn(dev, "Failed to generate event for slave %d\n",
+                                                 i);
                        }
                } else {
                        if (mlx4_GEN_EQE(dev, slave, eqe))
@@ -203,13 +201,11 @@ static void mlx4_slave_event(struct mlx4_dev *dev, int slave,
                             struct mlx4_eqe *eqe)
 {
        struct mlx4_priv *priv = mlx4_priv(dev);
-       struct mlx4_slave_state *s_slave =
-               &priv->mfunc.master.slave_state[slave];
 
-       if (!s_slave->active) {
-               /*mlx4_warn(dev, "Trying to pass event to inactive slave\n");*/
+       if (slave < 0 || slave > dev->persist->num_vfs ||
+           slave == dev->caps.function ||
+           !priv->mfunc.master.slave_state[slave].active)
                return;
-       }
 
        slave_event(dev, slave, eqe);
 }
index d97ca88c55b59e039af66991e4ad413f82984158..6e413ac4e94011c4ea85993b8471d5adfc84d5d3 100644 (file)
@@ -3095,6 +3095,12 @@ int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe)
        if (!priv->mfunc.master.slave_state)
                return -EINVAL;
 
+       /* check for slave valid, slave not PF, and slave active */
+       if (slave < 0 || slave > dev->persist->num_vfs ||
+           slave == dev->caps.function ||
+           !priv->mfunc.master.slave_state[slave].active)
+               return 0;
+
        event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type];
 
        /* Create the event only if the slave is registered */
index 9fb6948e14c64ef424c032811364aaefc608d4cb..5cecec282aba8b471b60b91af594ba6374edcda7 100644 (file)
@@ -4468,10 +4468,16 @@ static int rocker_port_master_changed(struct net_device *dev)
        struct net_device *master = netdev_master_upper_dev_get(dev);
        int err = 0;
 
+       /* There are currently three cases handled here:
+        * 1. Joining a bridge
+        * 2. Leaving a previously joined bridge
+        * 3. Other, e.g. being added to or removed from a bond or openvswitch,
+        *    in which case nothing is done
+        */
        if (master && master->rtnl_link_ops &&
            !strcmp(master->rtnl_link_ops->kind, "bridge"))
                err = rocker_port_bridge_join(rocker_port, master);
-       else
+       else if (rocker_port_is_bridged(rocker_port))
                err = rocker_port_bridge_leave(rocker_port);
 
        return err;
index 924ea98bd5311b1c7197a9fe4e9b475b095069ce..54549a6223dd2f47f493ae112a06f12f3337120f 100644 (file)
@@ -114,7 +114,9 @@ unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
 int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
 void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
-bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6);
+struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
+                                  const void *iaddr, bool is_v6);
+bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
 struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
                                        const void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync);
index 2a175006028b347efe1fbe6d145f311f78c80bd7..b7877a194cfe430469af8031c58efdf307eafc4f 100644 (file)
@@ -81,19 +81,20 @@ void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr)
        hash = (addr->atype == IPVL_IPV6) ?
               ipvlan_get_v6_hash(&addr->ip6addr) :
               ipvlan_get_v4_hash(&addr->ip4addr);
-       hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
+       if (hlist_unhashed(&addr->hlnode))
+               hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
 }
 
 void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync)
 {
-       hlist_del_rcu(&addr->hlnode);
+       hlist_del_init_rcu(&addr->hlnode);
        if (sync)
                synchronize_rcu();
 }
 
-bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
+struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
+                                  const void *iaddr, bool is_v6)
 {
-       struct ipvl_port *port = ipvlan->port;
        struct ipvl_addr *addr;
 
        list_for_each_entry(addr, &ipvlan->addrs, anode) {
@@ -101,12 +102,21 @@ bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
                    ipv6_addr_equal(&addr->ip6addr, iaddr)) ||
                    (!is_v6 && addr->atype == IPVL_IPV4 &&
                    addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr))
-                       return true;
+                       return addr;
        }
+       return NULL;
+}
+
+bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6)
+{
+       struct ipvl_dev *ipvlan;
 
-       if (ipvlan_ht_addr_lookup(port, iaddr, is_v6))
-               return true;
+       ASSERT_RTNL();
 
+       list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+               if (ipvlan_find_addr(ipvlan, iaddr, is_v6))
+                       return true;
+       }
        return false;
 }
 
@@ -192,7 +202,8 @@ static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
        if (skb->protocol == htons(ETH_P_PAUSE))
                return;
 
-       list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+       rcu_read_lock();
+       list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
                if (local && (ipvlan == in_dev))
                        continue;
 
@@ -219,6 +230,7 @@ static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
 mcast_acct:
                ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
        }
+       rcu_read_unlock();
 
        /* Locally generated? ...Forward a copy to the main-device as
         * well. On the RX side we'll ignore it (won't give it to any
index 4f4099d5603d0b64f2a15b66e86dbee90260cead..4fa14208d79931df1b5d8707744e93043b951424 100644 (file)
@@ -505,7 +505,7 @@ static void ipvlan_link_delete(struct net_device *dev, struct list_head *head)
        if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
                list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) {
                        ipvlan_ht_addr_del(addr, !dev->dismantle);
-                       list_del_rcu(&addr->anode);
+                       list_del(&addr->anode);
                }
        }
        list_del_rcu(&ipvlan->pnode);
@@ -607,7 +607,7 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
 {
        struct ipvl_addr *addr;
 
-       if (ipvlan_addr_busy(ipvlan, ip6_addr, true)) {
+       if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) {
                netif_err(ipvlan, ifup, ipvlan->dev,
                          "Failed to add IPv6=%pI6c addr for %s intf\n",
                          ip6_addr, ipvlan->dev->name);
@@ -620,9 +620,13 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
        addr->master = ipvlan;
        memcpy(&addr->ip6addr, ip6_addr, sizeof(struct in6_addr));
        addr->atype = IPVL_IPV6;
-       list_add_tail_rcu(&addr->anode, &ipvlan->addrs);
+       list_add_tail(&addr->anode, &ipvlan->addrs);
        ipvlan->ipv6cnt++;
-       ipvlan_ht_addr_add(ipvlan, addr);
+       /* If the interface is not up, the address will be added to the hash
+        * list by ipvlan_open.
+        */
+       if (netif_running(ipvlan->dev))
+               ipvlan_ht_addr_add(ipvlan, addr);
 
        return 0;
 }
@@ -631,12 +635,12 @@ static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
 {
        struct ipvl_addr *addr;
 
-       addr = ipvlan_ht_addr_lookup(ipvlan->port, ip6_addr, true);
+       addr = ipvlan_find_addr(ipvlan, ip6_addr, true);
        if (!addr)
                return;
 
        ipvlan_ht_addr_del(addr, true);
-       list_del_rcu(&addr->anode);
+       list_del(&addr->anode);
        ipvlan->ipv6cnt--;
        WARN_ON(ipvlan->ipv6cnt < 0);
        kfree_rcu(addr, rcu);
@@ -675,7 +679,7 @@ static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
 {
        struct ipvl_addr *addr;
 
-       if (ipvlan_addr_busy(ipvlan, ip4_addr, false)) {
+       if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) {
                netif_err(ipvlan, ifup, ipvlan->dev,
                          "Failed to add IPv4=%pI4 on %s intf.\n",
                          ip4_addr, ipvlan->dev->name);
@@ -688,9 +692,13 @@ static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
        addr->master = ipvlan;
        memcpy(&addr->ip4addr, ip4_addr, sizeof(struct in_addr));
        addr->atype = IPVL_IPV4;
-       list_add_tail_rcu(&addr->anode, &ipvlan->addrs);
+       list_add_tail(&addr->anode, &ipvlan->addrs);
        ipvlan->ipv4cnt++;
-       ipvlan_ht_addr_add(ipvlan, addr);
+       /* If the interface is not up, the address will be added to the hash
+        * list by ipvlan_open.
+        */
+       if (netif_running(ipvlan->dev))
+               ipvlan_ht_addr_add(ipvlan, addr);
        ipvlan_set_broadcast_mac_filter(ipvlan, true);
 
        return 0;
@@ -700,12 +708,12 @@ static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
 {
        struct ipvl_addr *addr;
 
-       addr = ipvlan_ht_addr_lookup(ipvlan->port, ip4_addr, false);
+       addr = ipvlan_find_addr(ipvlan, ip4_addr, false);
        if (!addr)
                return;
 
        ipvlan_ht_addr_del(addr, true);
-       list_del_rcu(&addr->anode);
+       list_del(&addr->anode);
        ipvlan->ipv4cnt--;
        WARN_ON(ipvlan->ipv4cnt < 0);
        if (!ipvlan->ipv4cnt)
index 5c55f11572baa0a3e0dd3877ae886935cb8a6eb9..75d6f26729a30e34cdaaf8334a9cc0aaa2a01c82 100644 (file)
@@ -188,6 +188,8 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
                memcpy(skb_tail_pointer(skb), &padbytes, sizeof(padbytes));
                skb_put(skb, sizeof(padbytes));
        }
+
+       usbnet_set_skb_tx_stats(skb, 1, 0);
        return skb;
 }
 
index 9311a08565bed17cf5082a21b9c6dca7ab02dfe1..4545e78840b0d9dc80ae4392b36cfcf588f17c3a 100644 (file)
@@ -522,6 +522,7 @@ static const struct driver_info wwan_info = {
 #define DELL_VENDOR_ID         0x413C
 #define REALTEK_VENDOR_ID      0x0bda
 #define SAMSUNG_VENDOR_ID      0x04e8
+#define LENOVO_VENDOR_ID       0x17ef
 
 static const struct usb_device_id      products[] = {
 /* BLACKLIST !!
@@ -702,6 +703,13 @@ static const struct usb_device_id  products[] = {
        .driver_info = 0,
 },
 
+/* Lenovo Thinkpad USB 3.0 Ethernet Adapters (based on Realtek RTL8153) */
+{
+       USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x7205, USB_CLASS_COMM,
+                       USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+       .driver_info = 0,
+},
+
 /* WHITELIST!!!
  *
  * CDC Ether uses two interfaces, not necessarily consecutive.
index 80a844e0ae0383303d8fa4a6d4147fc99337f268..c3e4da9e79ca071a06082e965a3aec5bb206a77e 100644 (file)
@@ -1172,17 +1172,17 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 
        /* return skb */
        ctx->tx_curr_skb = NULL;
-       dev->net->stats.tx_packets += ctx->tx_curr_frame_num;
 
        /* keep private stats: framing overhead and number of NTBs */
        ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload;
        ctx->tx_ntbs++;
 
-       /* usbnet has already counted all the framing overhead.
+       /* usbnet will count all the framing overhead by default.
         * Adjust the stats so that the tx_bytes counter shows real
         * payload data instead.
         */
-       dev->net->stats.tx_bytes -= skb_out->len - ctx->tx_curr_frame_payload;
+       usbnet_set_skb_tx_stats(skb_out, n,
+                               ctx->tx_curr_frame_payload - skb_out->len);
 
        return skb_out;
 
index 438fc6bcaef15538e9c151c46fce08dc67cec1a5..9f7c0ab3b3490b947d161428406612c80dc29360 100644 (file)
@@ -492,6 +492,7 @@ enum rtl8152_flags {
 /* Define these values to match your device */
 #define VENDOR_ID_REALTEK              0x0bda
 #define VENDOR_ID_SAMSUNG              0x04e8
+#define VENDOR_ID_LENOVO               0x17ef
 
 #define MCU_TYPE_PLA                   0x0100
 #define MCU_TYPE_USB                   0x0000
@@ -4037,6 +4038,7 @@ static struct usb_device_id rtl8152_table[] = {
        {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
        {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
        {REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
+       {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x7205)},
        {}
 };
 
index b94a0fbb8b3b5a74ed466c4d5a66e25f4f2e0edc..953de13267df19d6fcefe60c2008f6222ca5775c 100644 (file)
@@ -144,6 +144,7 @@ static struct sk_buff *sr_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
                skb_put(skb, sizeof(padbytes));
        }
 
+       usbnet_set_skb_tx_stats(skb, 1, 0);
        return skb;
 }
 
index 449835f4331e210daad6c8717430341c1c6996ef..777757ae19732ab10ab2283645464a87a02c7b20 100644 (file)
@@ -1188,8 +1188,7 @@ static void tx_complete (struct urb *urb)
        struct usbnet           *dev = entry->dev;
 
        if (urb->status == 0) {
-               if (!(dev->driver_info->flags & FLAG_MULTI_PACKET))
-                       dev->net->stats.tx_packets++;
+               dev->net->stats.tx_packets += entry->packets;
                dev->net->stats.tx_bytes += entry->length;
        } else {
                dev->net->stats.tx_errors++;
@@ -1347,7 +1346,19 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
                } else
                        urb->transfer_flags |= URB_ZERO_PACKET;
        }
-       entry->length = urb->transfer_buffer_length = length;
+       urb->transfer_buffer_length = length;
+
+       if (info->flags & FLAG_MULTI_PACKET) {
+               /* Driver has set number of packets and a length delta.
+                * Calculate the complete length and ensure that it's
+                * positive.
+                */
+               entry->length += length;
+               if (WARN_ON_ONCE(entry->length <= 0))
+                       entry->length = length;
+       } else {
+               usbnet_set_skb_tx_stats(skb, 1, length);
+       }
 
        spin_lock_irqsave(&dev->txq.lock, flags);
        retval = usb_autopm_get_interface_async(dev->intf);
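
With this series, tx statistics move from "one packet per URB" bookkeeping to
per-skb counts: a FLAG_MULTI_PACKET minidriver records the packet count and a
byte delta, usbnet_start_xmit() adds the final URB length, and tx_complete()
credits entry->packets/entry->length to the netdev counters.  A hedged,
self-contained sketch of that accounting; the struct and function names below
are illustrative, not the kernel's:

#include <stdio.h>

struct tx_entry_demo  { long packets; long length; };
struct net_stats_demo { unsigned long tx_packets, tx_bytes; };

/* What the minidriver records on the skb before handing it over. */
static void set_tx_stats(struct tx_entry_demo *e, long packets, long delta)
{
        e->packets = packets;
        e->length  = delta;     /* payload length minus wire length */
}

/* What the core does when it queues the URB. */
static void start_xmit(struct tx_entry_demo *e, long urb_len, int multi_packet)
{
        if (multi_packet)
                e->length += urb_len;   /* delta + wire length = payload */
        else
                set_tx_stats(e, 1, urb_len);
}

/* What completion credits to the interface statistics. */
static void tx_complete(struct net_stats_demo *s, const struct tx_entry_demo *e)
{
        s->tx_packets += e->packets;
        s->tx_bytes   += e->length;
}

int main(void)
{
        struct net_stats_demo stats = { 0, 0 };
        struct tx_entry_demo e;

        /* e.g. an aggregated frame: 3 packets, 1400 payload bytes, 1536 on the wire */
        set_tx_stats(&e, 3, 1400 - 1536);
        start_xmit(&e, 1536, 1);
        tx_complete(&stats, &e);
        printf("tx_packets=%lu tx_bytes=%lu\n", stats.tx_packets, stats.tx_bytes);
        return 0;
}
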
index cb366adc820b17a667a908d79db82da8519f3783..f50a6bc5d06ee0b2bb3c83c971cd5f1d81afd690 100644 (file)
@@ -219,12 +219,15 @@ void ath9k_beacon_remove_slot(struct ath_softc *sc, struct ieee80211_vif *vif)
        struct ath_common *common = ath9k_hw_common(sc->sc_ah);
        struct ath_vif *avp = (void *)vif->drv_priv;
        struct ath_buf *bf = avp->av_bcbuf;
+       struct ath_beacon_config *cur_conf = &sc->cur_chan->beacon;
 
        ath_dbg(common, CONFIG, "Removing interface at beacon slot: %d\n",
                avp->av_bslot);
 
        tasklet_disable(&sc->bcon_tasklet);
 
+       cur_conf->enable_beacon &= ~BIT(avp->av_bslot);
+
        if (bf && bf->bf_mpdu) {
                struct sk_buff *skb = bf->bf_mpdu;
                dma_unmap_single(sc->dev, bf->bf_buf_addr,
@@ -521,8 +524,7 @@ static bool ath9k_allow_beacon_config(struct ath_softc *sc,
        }
 
        if (sc->sc_ah->opmode == NL80211_IFTYPE_AP) {
-               if ((vif->type != NL80211_IFTYPE_AP) ||
-                   (sc->nbcnvifs > 1)) {
+               if (vif->type != NL80211_IFTYPE_AP) {
                        ath_dbg(common, CONFIG,
                                "An AP interface is already present !\n");
                        return false;
@@ -616,12 +618,14 @@ void ath9k_beacon_config(struct ath_softc *sc, struct ieee80211_vif *vif,
         * enabling/disabling SWBA.
         */
        if (changed & BSS_CHANGED_BEACON_ENABLED) {
-               if (!bss_conf->enable_beacon &&
-                   (sc->nbcnvifs <= 1)) {
-                       cur_conf->enable_beacon = false;
-               } else if (bss_conf->enable_beacon) {
-                       cur_conf->enable_beacon = true;
-                       ath9k_cache_beacon_config(sc, ctx, bss_conf);
+               bool enabled = cur_conf->enable_beacon;
+
+               if (!bss_conf->enable_beacon) {
+                       cur_conf->enable_beacon &= ~BIT(avp->av_bslot);
+               } else {
+                       cur_conf->enable_beacon |= BIT(avp->av_bslot);
+                       if (!enabled)
+                               ath9k_cache_beacon_config(sc, ctx, bss_conf);
                }
        }
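
enable_beacon is now a per-slot bitmask rather than a bool, so beaconing stays
enabled while any vif's slot bit is still set and is torn down only when the
last one clears.  A tiny sketch of that bookkeeping; the slot numbers are
illustrative:

#include <stdio.h>

int main(void)
{
        unsigned char enable_beacon = 0;

        enable_beacon |= 1u << 0;       /* AP vif in beacon slot 0 */
        enable_beacon |= 1u << 2;       /* second AP vif in slot 2 */
        enable_beacon &= ~(1u << 0);    /* slot 0 interface removed */

        /* still "yes": slot 2 keeps beaconing enabled */
        printf("beacons still needed: %s\n", enable_beacon ? "yes" : "no");
        return 0;
}
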
 
index 2b79a568e8032c1fbc33a0fb550a5da6140f2865..d23737342f4fe7b311c84a7d556cad6e94ecf1f1 100644 (file)
@@ -54,7 +54,7 @@ struct ath_beacon_config {
        u16 dtim_period;
        u16 bmiss_timeout;
        u8 dtim_count;
-       bool enable_beacon;
+       u8 enable_beacon;
        bool ibss_creator;
        u32 nexttbtt;
        u32 intval;
index 60aa8d71e753fa936909dcc98ef915c2a2208f60..8529014e1a5e1c1b4637abc1625fd177b407f602 100644 (file)
@@ -424,7 +424,7 @@ static void ath9k_hw_init_defaults(struct ath_hw *ah)
        ah->power_mode = ATH9K_PM_UNDEFINED;
        ah->htc_reset_init = true;
 
-       ah->tpc_enabled = true;
+       ah->tpc_enabled = false;
 
        ah->ani_function = ATH9K_ANI_ALL;
        if (!AR_SREV_9300_20_OR_LATER(ah))
index defb7a44e0bc1ff554195890dcccc31bfb0c9769..7748a1ccf14fdf4a6b2864441e9b041da35558a6 100644 (file)
@@ -126,7 +126,8 @@ void brcmf_feat_attach(struct brcmf_pub *drvr)
        brcmf_feat_iovar_int_get(ifp, BRCMF_FEAT_MCHAN, "mchan");
        if (drvr->bus_if->wowl_supported)
                brcmf_feat_iovar_int_get(ifp, BRCMF_FEAT_WOWL, "wowl");
-       brcmf_feat_iovar_int_set(ifp, BRCMF_FEAT_MBSS, "mbss", 0);
+       if (drvr->bus_if->chip != BRCM_CC_43362_CHIP_ID)
+               brcmf_feat_iovar_int_set(ifp, BRCMF_FEAT_MBSS, "mbss", 0);
 
        /* set chip related quirks */
        switch (drvr->bus_if->chip) {
index a6f22c32a27994000f578c2baff08a2f62042aa0..3811878ab9cd2057ad44785c7f3dad5cb5edcf07 100644 (file)
@@ -708,7 +708,6 @@ struct iwl_priv {
        unsigned long reload_jiffies;
        int reload_count;
        bool ucode_loaded;
-       bool init_ucode_run;            /* Don't run init uCode again */
 
        u8 plcp_delta_threshold;
 
index 47e64e8b9517d97dff6b96e544a731d498412ba1..cceb026e0793b45fa418e7c023855d5c0bd23a02 100644 (file)
@@ -1114,16 +1114,17 @@ static void iwlagn_mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
        scd_queues &= ~(BIT(IWL_IPAN_CMD_QUEUE_NUM) |
                        BIT(IWL_DEFAULT_CMD_QUEUE_NUM));
 
-       if (vif)
-               scd_queues &= ~BIT(vif->hw_queue[IEEE80211_AC_VO]);
-
-       IWL_DEBUG_TX_QUEUES(priv, "Flushing SCD queues: 0x%x\n", scd_queues);
-       if (iwlagn_txfifo_flush(priv, scd_queues)) {
-               IWL_ERR(priv, "flush request fail\n");
-               goto done;
+       if (drop) {
+               IWL_DEBUG_TX_QUEUES(priv, "Flushing SCD queues: 0x%x\n",
+                                   scd_queues);
+               if (iwlagn_txfifo_flush(priv, scd_queues)) {
+                       IWL_ERR(priv, "flush request fail\n");
+                       goto done;
+               }
        }
+
        IWL_DEBUG_TX_QUEUES(priv, "wait transmit/flush all frames\n");
-       iwl_trans_wait_tx_queue_empty(priv->trans, 0xffffffff);
+       iwl_trans_wait_tx_queue_empty(priv->trans, scd_queues);
 done:
        mutex_unlock(&priv->mutex);
        IWL_DEBUG_MAC80211(priv, "leave\n");
index 4dbef7e58c2e3dfba5be41c1f2f4cfe32c04727b..5244e43bfafbc4617720097660ec693f5d30742f 100644 (file)
@@ -418,9 +418,6 @@ int iwl_run_init_ucode(struct iwl_priv *priv)
        if (!priv->fw->img[IWL_UCODE_INIT].sec[0].len)
                return 0;
 
-       if (priv->init_ucode_run)
-               return 0;
-
        iwl_init_notification_wait(&priv->notif_wait, &calib_wait,
                                   calib_complete, ARRAY_SIZE(calib_complete),
                                   iwlagn_wait_calib, priv);
@@ -440,8 +437,6 @@ int iwl_run_init_ucode(struct iwl_priv *priv)
         */
        ret = iwl_wait_notification(&priv->notif_wait, &calib_wait,
                                        UCODE_CALIB_TIMEOUT);
-       if (!ret)
-               priv->init_ucode_run = true;
 
        goto out;
 
index 996e7f16adf9feafc50cb5d56596a2b80e0cafb3..c7154ac42c8c366d093cfb462a46d9ac98c01e02 100644 (file)
@@ -1257,6 +1257,7 @@ static void iwl_req_fw_callback(const struct firmware *ucode_raw, void *context)
                                op->name, err);
 #endif
        }
+       kfree(pieces);
        return;
 
  try_again:
index efa9688a4cf11150f6d1e92c71e0a367d90fea87..078f24cf4af3927c66e2540bc6cc6404d0c36fc7 100644 (file)
@@ -1278,6 +1278,9 @@ static void rs_mac80211_tx_status(void *mvm_r,
        struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode);
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
+       if (!iwl_mvm_sta_from_mac80211(sta)->vif)
+               return;
+
        if (!ieee80211_is_data(hdr->frame_control) ||
            info->flags & IEEE80211_TX_CTL_NO_ACK)
                return;
@@ -2511,6 +2514,14 @@ static void rs_get_rate(void *mvm_r, struct ieee80211_sta *sta, void *mvm_sta,
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        struct iwl_lq_sta *lq_sta = mvm_sta;
 
+       if (sta && !iwl_mvm_sta_from_mac80211(sta)->vif) {
+               /* if vif isn't initialized, mvm doesn't know about
+                * this station, so don't do anything with it
+                */
+               sta = NULL;
+               mvm_sta = NULL;
+       }
+
        /* TODO: handle rate_idx_mask and rate_idx_mcs_mask */
 
        /* Treat uninitialized rate scaling data same as non-existing. */
@@ -2827,6 +2838,9 @@ static void rs_rate_update(void *mvm_r,
                        (struct iwl_op_mode *)mvm_r;
        struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode);
 
+       if (!iwl_mvm_sta_from_mac80211(sta)->vif)
+               return;
+
        /* Stop any ongoing aggregations as rs starts off assuming no agg */
        for (tid = 0; tid < IWL_MAX_TID_COUNT; tid++)
                ieee80211_stop_tx_ba_session(sta, tid);
@@ -3587,9 +3601,15 @@ static ssize_t iwl_dbgfs_ss_force_write(struct iwl_lq_sta *lq_sta, char *buf,
 
 MVM_DEBUGFS_READ_WRITE_FILE_OPS(ss_force, 32);
 
-static void rs_add_debugfs(void *mvm, void *mvm_sta, struct dentry *dir)
+static void rs_add_debugfs(void *mvm, void *priv_sta, struct dentry *dir)
 {
-       struct iwl_lq_sta *lq_sta = mvm_sta;
+       struct iwl_lq_sta *lq_sta = priv_sta;
+       struct iwl_mvm_sta *mvmsta;
+
+       mvmsta = container_of(lq_sta, struct iwl_mvm_sta, lq_sta);
+
+       if (!mvmsta->vif)
+               return;
 
        debugfs_create_file("rate_scale_table", S_IRUSR | S_IWUSR, dir,
                            lq_sta, &rs_sta_dbgfs_scale_table_ops);
index f8d6f306dd76d276b82056c9fb1a74956e78ac3c..4b81c0bf63b0a86173afde87ef2822c0dfa8860f 100644 (file)
@@ -197,6 +197,8 @@ iwl_mvm_te_handle_notify_csa(struct iwl_mvm *mvm,
                             struct iwl_time_event_notif *notif)
 {
        if (!le32_to_cpu(notif->status)) {
+               if (te_data->vif->type == NL80211_IFTYPE_STATION)
+                       ieee80211_connection_loss(te_data->vif);
                IWL_DEBUG_TE(mvm, "CSA time event failed to start\n");
                iwl_mvm_te_clear_data(mvm, te_data);
                return;
index 07304e1fd64aa70b41d4c9a18762b48924a6f250..96a05406babf864fb0ff463e9b87dc4c27ba5299 100644 (file)
@@ -949,8 +949,10 @@ int iwl_mvm_rx_ba_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb,
        mvmsta = iwl_mvm_sta_from_mac80211(sta);
        tid_data = &mvmsta->tid_data[tid];
 
-       if (WARN_ONCE(tid_data->txq_id != scd_flow, "Q %d, tid %d, flow %d",
-                     tid_data->txq_id, tid, scd_flow)) {
+       if (tid_data->txq_id != scd_flow) {
+               IWL_ERR(mvm,
+                       "invalid BA notification: Q %d, tid %d, flow %d\n",
+                       tid_data->txq_id, tid, scd_flow);
                rcu_read_unlock();
                return 0;
        }
index dbd6bcf5220563c03727bccde83083032b977ff8..686dd301cd536b68d616ea0507a3e1742e12240c 100644 (file)
@@ -368,10 +368,12 @@ static const struct pci_device_id iwl_hw_card_ids[] = {
 /* 3165 Series */
        {IWL_PCI_DEVICE(0x3165, 0x4010, iwl3165_2ac_cfg)},
        {IWL_PCI_DEVICE(0x3165, 0x4012, iwl3165_2ac_cfg)},
-       {IWL_PCI_DEVICE(0x3165, 0x4110, iwl3165_2ac_cfg)},
-       {IWL_PCI_DEVICE(0x3165, 0x4210, iwl3165_2ac_cfg)},
        {IWL_PCI_DEVICE(0x3165, 0x4410, iwl3165_2ac_cfg)},
        {IWL_PCI_DEVICE(0x3165, 0x4510, iwl3165_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x3165, 0x4110, iwl3165_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x3166, 0x4310, iwl3165_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x3166, 0x4210, iwl3165_2ac_cfg)},
+       {IWL_PCI_DEVICE(0x3165, 0x8010, iwl3165_2ac_cfg)},
 
 /* 7265 Series */
        {IWL_PCI_DEVICE(0x095A, 0x5010, iwl7265_2ac_cfg)},
index a62170ea04818e37790eeec99fc2047e255b030f..8c45cf44ce24bac6393902ae2daf708b0ce11e6a 100644 (file)
@@ -1124,12 +1124,22 @@ static void _rtl_pci_prepare_bcn_tasklet(struct ieee80211_hw *hw)
        /*This is for new trx flow*/
        struct rtl_tx_buffer_desc *pbuffer_desc = NULL;
        u8 temp_one = 1;
+       u8 *entry;
 
        memset(&tcb_desc, 0, sizeof(struct rtl_tcb_desc));
        ring = &rtlpci->tx_ring[BEACON_QUEUE];
        pskb = __skb_dequeue(&ring->queue);
-       if (pskb)
+       if (rtlpriv->use_new_trx_flow)
+               entry = (u8 *)(&ring->buffer_desc[ring->idx]);
+       else
+               entry = (u8 *)(&ring->desc[ring->idx]);
+       if (pskb) {
+               pci_unmap_single(rtlpci->pdev,
+                                rtlpriv->cfg->ops->get_desc(
+                                (u8 *)entry, true, HW_DESC_TXBUFF_ADDR),
+                                pskb->len, PCI_DMA_TODEVICE);
                kfree_skb(pskb);
+       }
 
        /*NB: the beacon data buffer must be 32-bit aligned. */
        pskb = ieee80211_beacon_get(hw, mac->vif);
index e9b960f0ff32c8af2ff404a138780ff751bf4572..720aaf6313d296bec9b9a4826f1240b0eb4c0940 100644 (file)
@@ -1008,8 +1008,7 @@ err:
 
 static int xennet_change_mtu(struct net_device *dev, int mtu)
 {
-       int max = xennet_can_sg(dev) ?
-               XEN_NETIF_MAX_TX_SIZE - MAX_TCP_HEADER : ETH_DATA_LEN;
+       int max = xennet_can_sg(dev) ? XEN_NETIF_MAX_TX_SIZE : ETH_DATA_LEN;
 
        if (mtu > max)
                return -EINVAL;
@@ -1279,8 +1278,6 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
        netdev->ethtool_ops = &xennet_ethtool_ops;
        SET_NETDEV_DEV(netdev, &dev->dev);
 
-       netif_set_gso_max_size(netdev, XEN_NETIF_MAX_TX_SIZE - MAX_TCP_HEADER);
-
        np->netdev = netdev;
 
        netif_carrier_off(netdev);
index ad2906919d4589f4edbc0cd599a2fe7aa2938922..78a7dcbec7d8990ac37adad938a0aff3420423e2 100644 (file)
@@ -450,12 +450,17 @@ static struct of_bus *of_match_bus(struct device_node *np)
        return NULL;
 }
 
-static int of_empty_ranges_quirk(void)
+static int of_empty_ranges_quirk(struct device_node *np)
 {
        if (IS_ENABLED(CONFIG_PPC)) {
-               /* To save cycles, we cache the result */
+               /* To save cycles, we cache the result for the global "Mac" setting */
                static int quirk_state = -1;
 
+               /* PA-SEMI sdc DT bug */
+               if (of_device_is_compatible(np, "1682m-sdc"))
+                       return true;
+
+               /* Make quirk cached */
                if (quirk_state < 0)
                        quirk_state =
                                of_machine_is_compatible("Power Macintosh") ||
@@ -490,7 +495,7 @@ static int of_translate_one(struct device_node *parent, struct of_bus *bus,
         * This code is only enabled on powerpc. --gcl
         */
        ranges = of_get_property(parent, rprop, &rlen);
-       if (ranges == NULL && !of_empty_ranges_quirk()) {
+       if (ranges == NULL && !of_empty_ranges_quirk(parent)) {
                pr_debug("OF: no ranges; cannot translate\n");
                return 1;
        }
index 1f4ea6f2d91094b1af4dee898d7fd0d63765f032..2e9f84fdd9ceb3d39611c617573f4895ab36423e 100644 (file)
@@ -342,7 +342,7 @@ static const struct irq_domain_ops msi_domain_ops = {
        .map = dw_pcie_msi_map,
 };
 
-int __init dw_pcie_host_init(struct pcie_port *pp)
+int dw_pcie_host_init(struct pcie_port *pp)
 {
        struct device_node *np = pp->dev->of_node;
        struct platform_device *pdev = to_platform_device(pp->dev);
index 866465fd3dbf7e617b3085e9a175bceea82d1ad1..020d788907191fd73ac1b7058b66b9a76f6b7351 100644 (file)
@@ -269,7 +269,7 @@ static struct pcie_host_ops spear13xx_pcie_host_ops = {
        .host_init = spear13xx_pcie_host_init,
 };
 
-static int __init spear13xx_add_pcie_port(struct pcie_port *pp,
+static int spear13xx_add_pcie_port(struct pcie_port *pp,
                                         struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
@@ -299,7 +299,7 @@ static int __init spear13xx_add_pcie_port(struct pcie_port *pp,
        return 0;
 }
 
-static int __init spear13xx_pcie_probe(struct platform_device *pdev)
+static int spear13xx_pcie_probe(struct platform_device *pdev)
 {
        struct spear13xx_pcie *spear13xx_pcie;
        struct pcie_port *pp;
@@ -370,7 +370,7 @@ static const struct of_device_id spear13xx_pcie_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, spear13xx_pcie_of_match);
 
-static struct platform_driver spear13xx_pcie_driver __initdata = {
+static struct platform_driver spear13xx_pcie_driver = {
        .probe          = spear13xx_pcie_probe,
        .driver = {
                .name   = "spear-pcie",
index 7d48ecae6695581e7ded9caa59adf782d5bf0d3f..788db48dbbad9ddf6ff697cd88c4fd832de29cde 100644 (file)
@@ -286,11 +286,12 @@ int cpci_configure_slot(struct slot *slot)
        }
        parent = slot->dev->bus;
 
-       list_for_each_entry(dev, &parent->devices, bus_list)
+       list_for_each_entry(dev, &parent->devices, bus_list) {
                if (PCI_SLOT(dev->devfn) != PCI_SLOT(slot->devfn))
                        continue;
                if (pci_is_bridge(dev))
                        pci_hp_add_bridge(dev);
+       }
 
 
        pci_assign_unassigned_bridge_resources(parent->self);
index 4890639873256812b721d83f304dff634ef58da1..c93fbe76d281c4b4822ceddae79feab984cfb576 100644 (file)
@@ -248,6 +248,9 @@ int pci_get_hp_params(struct pci_dev *dev, struct hotplug_params *hpp)
        acpi_handle handle, phandle;
        struct pci_bus *pbus;
 
+       if (acpi_pci_disabled)
+               return -ENODEV;
+
        handle = NULL;
        for (pbus = dev->bus; pbus; pbus = pbus->parent) {
                handle = acpi_pci_get_bridge_handle(pbus);
index c6849d9e86ce6c33bc11ec8ea621e3e042878817..167fe411ce2e30460ba6d0c8a04383976532b4e9 100644 (file)
@@ -132,16 +132,8 @@ static const char *aer_agent_string[] = {
 static void __print_tlp_header(struct pci_dev *dev,
                               struct aer_header_log_regs *t)
 {
-       unsigned char *tlp = (unsigned char *)&t;
-
-       dev_err(&dev->dev, "  TLP Header:"
-               " %02x%02x%02x%02x %02x%02x%02x%02x"
-               " %02x%02x%02x%02x %02x%02x%02x%02x\n",
-               *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
-               *(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
-               *(tlp + 11), *(tlp + 10), *(tlp + 9),
-               *(tlp + 8), *(tlp + 15), *(tlp + 14),
-               *(tlp + 13), *(tlp + 12));
+       dev_err(&dev->dev, "  TLP Header: %08x %08x %08x %08x\n",
+               t->dw0, t->dw1, t->dw2, t->dw3);
 }
 
 static void __aer_print_error(struct pci_dev *dev,
index 472a5adc4642790d2ff1df9eab73aaaf2c4b064d..c29ba7e1430482c1ffe6c6ec92e1bd5cc7b06743 100644 (file)
@@ -55,7 +55,7 @@ static int rtc_suspend(struct device *dev)
        struct timespec64       delta, delta_delta;
        int err;
 
-       if (has_persistent_clock())
+       if (timekeeping_rtc_skipsuspend())
                return 0;
 
        if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0)
@@ -102,7 +102,7 @@ static int rtc_resume(struct device *dev)
        struct timespec64       sleep_time;
        int err;
 
-       if (has_persistent_clock())
+       if (timekeeping_rtc_skipresume())
                return 0;
 
        rtc_hctosys_ret = -ENODEV;
@@ -117,10 +117,6 @@ static int rtc_resume(struct device *dev)
                return 0;
        }
 
-       if (rtc_valid_tm(&tm) != 0) {
-               pr_debug("%s:  bogus resume time\n", dev_name(&rtc->dev));
-               return 0;
-       }
        new_rtc.tv_sec = rtc_tm_to_time64(&tm);
        new_rtc.tv_nsec = 0;
 
index 37215cf983e92926653d1f3206aa8c4c8842a55a..d43ee409a5f29c8ba344232577f56eab6121a6b2 100644 (file)
@@ -72,7 +72,11 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm)
                err = -ENODEV;
        else if (rtc->ops->set_time)
                err = rtc->ops->set_time(rtc->dev.parent, tm);
-       else if (rtc->ops->set_mmss) {
+       else if (rtc->ops->set_mmss64) {
+               time64_t secs64 = rtc_tm_to_time64(tm);
+
+               err = rtc->ops->set_mmss64(rtc->dev.parent, secs64);
+       } else if (rtc->ops->set_mmss) {
                time64_t secs64 = rtc_tm_to_time64(tm);
                err = rtc->ops->set_mmss(rtc->dev.parent, secs64);
        } else
@@ -96,6 +100,8 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
 
        if (!rtc->ops)
                err = -ENODEV;
+       else if (rtc->ops->set_mmss64)
+               err = rtc->ops->set_mmss64(rtc->dev.parent, secs);
        else if (rtc->ops->set_mmss)
                err = rtc->ops->set_mmss(rtc->dev.parent, secs);
        else if (rtc->ops->read_time && rtc->ops->set_time) {
index 1d0340fdb82021b5326182b4a517c4763b0818e8..9b725c55305859b48f5e6f2ffe45088d4b6f0659 100644 (file)
 /*
  * RTC clock functions and device struct declaration
  */
-static int ab3100_rtc_set_mmss(struct device *dev, unsigned long secs)
+static int ab3100_rtc_set_mmss(struct device *dev, time64_t secs)
 {
        u8 regs[] = {AB3100_TI0, AB3100_TI1, AB3100_TI2,
                     AB3100_TI3, AB3100_TI4, AB3100_TI5};
        unsigned char buf[6];
-       u64 fat_time = (u64) secs * AB3100_RTC_CLOCK_RATE * 2;
+       u64 hw_counter = secs * AB3100_RTC_CLOCK_RATE * 2;
        int err = 0;
        int i;
 
-       buf[0] = (fat_time) & 0xFF;
-       buf[1] = (fat_time >> 8) & 0xFF;
-       buf[2] = (fat_time >> 16) & 0xFF;
-       buf[3] = (fat_time >> 24) & 0xFF;
-       buf[4] = (fat_time >> 32) & 0xFF;
-       buf[5] = (fat_time >> 40) & 0xFF;
+       buf[0] = (hw_counter) & 0xFF;
+       buf[1] = (hw_counter >> 8) & 0xFF;
+       buf[2] = (hw_counter >> 16) & 0xFF;
+       buf[3] = (hw_counter >> 24) & 0xFF;
+       buf[4] = (hw_counter >> 32) & 0xFF;
+       buf[5] = (hw_counter >> 40) & 0xFF;
 
        for (i = 0; i < 6; i++) {
                err = abx500_set_register_interruptible(dev, 0,
@@ -75,7 +75,7 @@ static int ab3100_rtc_set_mmss(struct device *dev, unsigned long secs)
 
 static int ab3100_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
-       unsigned long time;
+       time64_t time;
        u8 rtcval;
        int err;
 
@@ -88,7 +88,7 @@ static int ab3100_rtc_read_time(struct device *dev, struct rtc_time *tm)
                dev_info(dev, "clock not set (lost power)");
                return -EINVAL;
        } else {
-               u64 fat_time;
+               u64 hw_counter;
                u8 buf[6];
 
                /* Read out time registers */
@@ -98,22 +98,21 @@ static int ab3100_rtc_read_time(struct device *dev, struct rtc_time *tm)
                if (err != 0)
                        return err;
 
-               fat_time = ((u64) buf[5] << 40) | ((u64) buf[4] << 32) |
+               hw_counter = ((u64) buf[5] << 40) | ((u64) buf[4] << 32) |
                        ((u64) buf[3] << 24) | ((u64) buf[2] << 16) |
                        ((u64) buf[1] << 8) | (u64) buf[0];
-               time = (unsigned long) (fat_time /
-                                       (u64) (AB3100_RTC_CLOCK_RATE * 2));
+               time = hw_counter / (u64) (AB3100_RTC_CLOCK_RATE * 2);
        }
 
-       rtc_time_to_tm(time, tm);
+       rtc_time64_to_tm(time, tm);
 
        return rtc_valid_tm(tm);
 }
 
 static int ab3100_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 {
-       unsigned long time;
-       u64 fat_time;
+       time64_t time;
+       u64 hw_counter;
        u8 buf[6];
        u8 rtcval;
        int err;
@@ -134,11 +133,11 @@ static int ab3100_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
                                                     AB3100_AL0, buf, 4);
        if (err)
                return err;
-       fat_time = ((u64) buf[3] << 40) | ((u64) buf[2] << 32) |
+       hw_counter = ((u64) buf[3] << 40) | ((u64) buf[2] << 32) |
                ((u64) buf[1] << 24) | ((u64) buf[0] << 16);
-       time = (unsigned long) (fat_time / (u64) (AB3100_RTC_CLOCK_RATE * 2));
+       time = hw_counter / (u64) (AB3100_RTC_CLOCK_RATE * 2);
 
-       rtc_time_to_tm(time, &alarm->time);
+       rtc_time64_to_tm(time, &alarm->time);
 
        return rtc_valid_tm(&alarm->time);
 }
@@ -147,17 +146,17 @@ static int ab3100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 {
        u8 regs[] = {AB3100_AL0, AB3100_AL1, AB3100_AL2, AB3100_AL3};
        unsigned char buf[4];
-       unsigned long secs;
-       u64 fat_time;
+       time64_t secs;
+       u64 hw_counter;
        int err;
        int i;
 
-       rtc_tm_to_time(&alarm->time, &secs);
-       fat_time = (u64) secs * AB3100_RTC_CLOCK_RATE * 2;
-       buf[0] = (fat_time >> 16) & 0xFF;
-       buf[1] = (fat_time >> 24) & 0xFF;
-       buf[2] = (fat_time >> 32) & 0xFF;
-       buf[3] = (fat_time >> 40) & 0xFF;
+       secs = rtc_tm_to_time64(&alarm->time);
+       hw_counter = secs * AB3100_RTC_CLOCK_RATE * 2;
+       buf[0] = (hw_counter >> 16) & 0xFF;
+       buf[1] = (hw_counter >> 24) & 0xFF;
+       buf[2] = (hw_counter >> 32) & 0xFF;
+       buf[3] = (hw_counter >> 40) & 0xFF;
 
        /* Set the alarm */
        for (i = 0; i < 4; i++) {
@@ -193,7 +192,7 @@ static int ab3100_rtc_irq_enable(struct device *dev, unsigned int enabled)
 
 static const struct rtc_class_ops ab3100_rtc_ops = {
        .read_time      = ab3100_rtc_read_time,
-       .set_mmss       = ab3100_rtc_set_mmss,
+       .set_mmss64     = ab3100_rtc_set_mmss,
        .read_alarm     = ab3100_rtc_read_alarm,
        .set_alarm      = ab3100_rtc_set_alarm,
        .alarm_irq_enable = ab3100_rtc_irq_enable,
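
The ab3100 counter is 48 bits of half-ticks (2 * AB3100_RTC_CLOCK_RATE per
second) spread over six byte-wide registers, which is why the driver can take
a time64_t directly through set_mmss64.  A hedged round-trip sketch of that
packing, using a post-2038 date:

#include <stdint.h>
#include <stdio.h>

#define RTC_CLOCK_RATE 32768    /* same value as AB3100_RTC_CLOCK_RATE */

int main(void)
{
        int64_t secs = 4102444800LL;    /* 2100-01-01 00:00:00 UTC */
        uint64_t hw_counter = (uint64_t)secs * RTC_CLOCK_RATE * 2;
        uint64_t back = 0;
        uint8_t buf[6];
        int i;

        for (i = 0; i < 6; i++)         /* low byte first, as the driver writes */
                buf[i] = (hw_counter >> (8 * i)) & 0xFF;

        for (i = 5; i >= 0; i--)        /* read path reassembles the counter */
                back = (back << 8) | buf[i];

        printf("round trip ok: %s\n",
               back / (RTC_CLOCK_RATE * 2) == (uint64_t)secs ? "yes" : "no");
        return 0;
}
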
index 5bce904b7ee617cb2e3991cd4893ad493a75cbbd..32df1d812367c58acf3ebcd8445597fc8bdac014 100644 (file)
@@ -83,20 +83,19 @@ static int mc13xxx_rtc_read_time(struct device *dev, struct rtc_time *tm)
                        return ret;
        } while (days1 != days2);
 
-       rtc_time_to_tm(days1 * SEC_PER_DAY + seconds, tm);
+       rtc_time64_to_tm((time64_t)days1 * SEC_PER_DAY + seconds, tm);
 
        return rtc_valid_tm(tm);
 }
 
-static int mc13xxx_rtc_set_mmss(struct device *dev, unsigned long secs)
+static int mc13xxx_rtc_set_mmss(struct device *dev, time64_t secs)
 {
        struct mc13xxx_rtc *priv = dev_get_drvdata(dev);
        unsigned int seconds, days;
        unsigned int alarmseconds;
        int ret;
 
-       seconds = secs % SEC_PER_DAY;
-       days = secs / SEC_PER_DAY;
+       days = div_s64_rem(secs, SEC_PER_DAY, &seconds);
 
        mc13xxx_lock(priv->mc13xxx);
 
@@ -159,7 +158,7 @@ static int mc13xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 {
        struct mc13xxx_rtc *priv = dev_get_drvdata(dev);
        unsigned seconds, days;
-       unsigned long s1970;
+       time64_t s1970;
        int enabled, pending;
        int ret;
 
@@ -189,10 +188,10 @@ out:
        alarm->enabled = enabled;
        alarm->pending = pending;
 
-       s1970 = days * SEC_PER_DAY + seconds;
+       s1970 = (time64_t)days * SEC_PER_DAY + seconds;
 
-       rtc_time_to_tm(s1970, &alarm->time);
-       dev_dbg(dev, "%s: %lu\n", __func__, s1970);
+       rtc_time64_to_tm(s1970, &alarm->time);
+       dev_dbg(dev, "%s: %lld\n", __func__, (long long)s1970);
 
        return 0;
 }
@@ -200,8 +199,8 @@ out:
 static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 {
        struct mc13xxx_rtc *priv = dev_get_drvdata(dev);
-       unsigned long s1970;
-       unsigned seconds, days;
+       time64_t s1970;
+       u32 seconds, days;
        int ret;
 
        mc13xxx_lock(priv->mc13xxx);
@@ -215,20 +214,17 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
        if (unlikely(ret))
                goto out;
 
-       ret = rtc_tm_to_time(&alarm->time, &s1970);
-       if (unlikely(ret))
-               goto out;
+       s1970 = rtc_tm_to_time64(&alarm->time);
 
-       dev_dbg(dev, "%s: o%2.s %lu\n", __func__, alarm->enabled ? "n" : "ff",
-                       s1970);
+       dev_dbg(dev, "%s: o%2.s %lld\n", __func__, alarm->enabled ? "n" : "ff",
+                       (long long)s1970);
 
        ret = mc13xxx_rtc_irq_enable_unlocked(dev, alarm->enabled,
                        MC13XXX_IRQ_TODA);
        if (unlikely(ret))
                goto out;
 
-       seconds = s1970 % SEC_PER_DAY;
-       days = s1970 / SEC_PER_DAY;
+       days = div_s64_rem(s1970, SEC_PER_DAY, &seconds);
 
        ret = mc13xxx_reg_write(priv->mc13xxx, MC13XXX_RTCDAYA, days);
        if (unlikely(ret))
@@ -268,7 +264,7 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev)
 
 static const struct rtc_class_ops mc13xxx_rtc_ops = {
        .read_time = mc13xxx_rtc_read_time,
-       .set_mmss = mc13xxx_rtc_set_mmss,
+       .set_mmss64 = mc13xxx_rtc_set_mmss,
        .read_alarm = mc13xxx_rtc_read_alarm,
        .set_alarm = mc13xxx_rtc_set_alarm,
        .alarm_irq_enable = mc13xxx_rtc_alarm_irq_enable,
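
The div_s64_rem() calls introduced above replace open-coded "secs / SEC_PER_DAY" and "secs % SEC_PER_DAY", which would need 64-bit division helpers on 32-bit architectures once secs becomes time64_t. A standalone sketch of the pattern, with a made-up helper name for illustration:

#include <linux/math64.h>
#include <linux/time64.h>

/* Split a 64-bit seconds value into whole days and seconds-of-day using
 * div_s64_rem(), avoiding an open-coded 64-bit divide on 32-bit builds.
 */
static void example_split_days(time64_t secs, u32 *days, u32 *sec_of_day)
{
	s32 rem;

	*days = (u32)div_s64_rem(secs, 86400, &rem);
	*sec_of_day = rem;
}
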
index 3c3f8d10ab439c83fe93ec2041e8a3e29d9e0fc4..09d422b9f7f737d6ba4a0c990212e7f33f1d4038 100644 (file)
@@ -106,7 +106,7 @@ static inline int is_imx1_rtc(struct rtc_plat_data *data)
  * This function is used to obtain the RTC time or the alarm value in
  * seconds.
  */
-static u32 get_alarm_or_time(struct device *dev, int time_alarm)
+static time64_t get_alarm_or_time(struct device *dev, int time_alarm)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
@@ -129,29 +129,28 @@ static u32 get_alarm_or_time(struct device *dev, int time_alarm)
        hr = hr_min >> 8;
        min = hr_min & 0xff;
 
-       return (((day * 24 + hr) * 60) + min) * 60 + sec;
+       return ((((time64_t)day * 24 + hr) * 60) + min) * 60 + sec;
 }
 
 /*
  * This function sets the RTC alarm value or the time value.
  */
-static void set_alarm_or_time(struct device *dev, int time_alarm, u32 time)
+static void set_alarm_or_time(struct device *dev, int time_alarm, time64_t time)
 {
-       u32 day, hr, min, sec, temp;
+       u32 tod, day, hr, min, sec, temp;
        struct platform_device *pdev = to_platform_device(dev);
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
        void __iomem *ioaddr = pdata->ioaddr;
 
-       day = time / 86400;
-       time -= day * 86400;
+       day = div_s64_rem(time, 86400, &tod);
 
        /* time is within a day now */
-       hr = time / 3600;
-       time -= hr * 3600;
+       hr = tod / 3600;
+       tod -= hr * 3600;
 
        /* time is within an hour now */
-       min = time / 60;
-       sec = time - min * 60;
+       min = tod / 60;
+       sec = tod - min * 60;
 
        temp = (hr << 8) + min;
 
@@ -173,29 +172,18 @@ static void set_alarm_or_time(struct device *dev, int time_alarm, u32 time)
  * This function updates the RTC alarm registers and then clears all the
  * interrupt status bits.
  */
-static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm)
+static void rtc_update_alarm(struct device *dev, struct rtc_time *alrm)
 {
-       struct rtc_time alarm_tm, now_tm;
-       unsigned long now, time;
+       time64_t time;
        struct platform_device *pdev = to_platform_device(dev);
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
        void __iomem *ioaddr = pdata->ioaddr;
 
-       now = get_alarm_or_time(dev, MXC_RTC_TIME);
-       rtc_time_to_tm(now, &now_tm);
-       alarm_tm.tm_year = now_tm.tm_year;
-       alarm_tm.tm_mon = now_tm.tm_mon;
-       alarm_tm.tm_mday = now_tm.tm_mday;
-       alarm_tm.tm_hour = alrm->tm_hour;
-       alarm_tm.tm_min = alrm->tm_min;
-       alarm_tm.tm_sec = alrm->tm_sec;
-       rtc_tm_to_time(&alarm_tm, &time);
+       time = rtc_tm_to_time64(alrm);
 
        /* clear all the interrupt status bits */
        writew(readw(ioaddr + RTC_RTCISR), ioaddr + RTC_RTCISR);
        set_alarm_or_time(dev, MXC_RTC_ALARM, time);
-
-       return 0;
 }
 
 static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit,
@@ -283,14 +271,14 @@ static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
  */
 static int mxc_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
-       u32 val;
+       time64_t val;
 
        /* Avoid roll-over from reading the different registers */
        do {
                val = get_alarm_or_time(dev, MXC_RTC_TIME);
        } while (val != get_alarm_or_time(dev, MXC_RTC_TIME));
 
-       rtc_time_to_tm(val, tm);
+       rtc_time64_to_tm(val, tm);
 
        return 0;
 }
@@ -298,7 +286,7 @@ static int mxc_rtc_read_time(struct device *dev, struct rtc_time *tm)
 /*
  * This function sets the internal RTC time based on tm in Gregorian date.
  */
-static int mxc_rtc_set_mmss(struct device *dev, unsigned long time)
+static int mxc_rtc_set_mmss(struct device *dev, time64_t time)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
@@ -309,9 +297,9 @@ static int mxc_rtc_set_mmss(struct device *dev, unsigned long time)
        if (is_imx1_rtc(pdata)) {
                struct rtc_time tm;
 
-               rtc_time_to_tm(time, &tm);
+               rtc_time64_to_tm(time, &tm);
                tm.tm_year = 70;
-               rtc_tm_to_time(&tm, &time);
+               time = rtc_tm_to_time64(&tm);
        }
 
        /* Avoid roll-over from reading the different registers */
@@ -333,7 +321,7 @@ static int mxc_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
        void __iomem *ioaddr = pdata->ioaddr;
 
-       rtc_time_to_tm(get_alarm_or_time(dev, MXC_RTC_ALARM), &alrm->time);
+       rtc_time64_to_tm(get_alarm_or_time(dev, MXC_RTC_ALARM), &alrm->time);
        alrm->pending = ((readw(ioaddr + RTC_RTCISR) & RTC_ALM_BIT)) ? 1 : 0;
 
        return 0;
@@ -346,11 +334,8 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-       int ret;
 
-       ret = rtc_update_alarm(dev, &alrm->time);
-       if (ret)
-               return ret;
+       rtc_update_alarm(dev, &alrm->time);
 
        memcpy(&pdata->g_rtc_alarm, &alrm->time, sizeof(struct rtc_time));
        mxc_rtc_irq_enable(dev, RTC_ALM_BIT, alrm->enabled);
@@ -362,7 +347,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 static struct rtc_class_ops mxc_rtc_ops = {
        .release                = mxc_rtc_release,
        .read_time              = mxc_rtc_read_time,
-       .set_mmss               = mxc_rtc_set_mmss,
+       .set_mmss64             = mxc_rtc_set_mmss,
        .read_alarm             = mxc_rtc_read_alarm,
        .set_alarm              = mxc_rtc_set_alarm,
        .alarm_irq_enable       = mxc_rtc_alarm_irq_enable,
index 8f86fa91de1a80e4e7d1a60a58c3acd3a27bd1ce..3a2da4c892d65f25c597f1872932d639108bd57a 100644 (file)
 #include <linux/rtc.h>
 #include <linux/platform_device.h>
 
+static int test_mmss64;
+module_param(test_mmss64, int, 0644);
+MODULE_PARM_DESC(test_mmss64, "Test struct rtc_class_ops.set_mmss64().");
+
 static struct platform_device *test0 = NULL, *test1 = NULL;
 
 static int test_rtc_read_alarm(struct device *dev,
@@ -30,7 +34,13 @@ static int test_rtc_set_alarm(struct device *dev,
 static int test_rtc_read_time(struct device *dev,
        struct rtc_time *tm)
 {
-       rtc_time_to_tm(get_seconds(), tm);
+       rtc_time64_to_tm(ktime_get_real_seconds(), tm);
+       return 0;
+}
+
+static int test_rtc_set_mmss64(struct device *dev, time64_t secs)
+{
+       dev_info(dev, "%s, secs = %lld\n", __func__, (long long)secs);
        return 0;
 }
 
@@ -55,7 +65,7 @@ static int test_rtc_alarm_irq_enable(struct device *dev, unsigned int enable)
        return 0;
 }
 
-static const struct rtc_class_ops test_rtc_ops = {
+static struct rtc_class_ops test_rtc_ops = {
        .proc = test_rtc_proc,
        .read_time = test_rtc_read_time,
        .read_alarm = test_rtc_read_alarm,
@@ -101,6 +111,11 @@ static int test_probe(struct platform_device *plat_dev)
        int err;
        struct rtc_device *rtc;
 
+       if (test_mmss64) {
+               test_rtc_ops.set_mmss64 = test_rtc_set_mmss64;
+               test_rtc_ops.set_mmss = NULL;
+       }
+
        rtc = devm_rtc_device_register(&plat_dev->dev, "test",
                                &test_rtc_ops, THIS_MODULE);
        if (IS_ERR(rtc)) {
index eb71872d0361c0dbedd8f994aae72ce6bd0d2c7e..7728d5e32bf4921d7bfd02cabe4b5c1ea124a6bd 100644 (file)
@@ -11,7 +11,7 @@
  * rtc_set_ntp_time - Save NTP synchronized time to the RTC
  * @now: Current time of day
  *
- * Replacement for the NTP platform function update_persistent_clock
+ * Replacement for the NTP platform function update_persistent_clock64
  * that stores time for later retrieval by rtc_hctosys.
  *
  * Returns 0 on successful RTC update, -ENODEV if a RTC update is not
@@ -35,7 +35,10 @@ int rtc_set_ntp_time(struct timespec64 now)
        if (rtc) {
                /* rtc_hctosys exclusively uses UTC, so we call set_time here,
                 * not set_mmss. */
-               if (rtc->ops && (rtc->ops->set_time || rtc->ops->set_mmss))
+               if (rtc->ops &&
+                   (rtc->ops->set_time ||
+                    rtc->ops->set_mmss64 ||
+                    rtc->ops->set_mmss))
                        err = rtc_set_time(rtc, &tm);
                rtc_class_close(rtc);
        }
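
rtc_set_ntp_time() now also accepts drivers that provide only set_mmss64. The actual dispatch lives in the RTC core rather than in this hunk; the sketch below is a hedged illustration of the fallback order the core is assumed to use, not code from this commit:

#include <linux/rtc.h>

/* Assumed core-side dispatch: prefer set_time, then the 64-bit
 * set_mmss64, and only then fall back to the 32-bit set_mmss.
 */
static int example_apply_time(struct rtc_device *rtc, struct rtc_time *tm)
{
	struct device *parent = rtc->dev.parent;

	if (rtc->ops->set_time)
		return rtc->ops->set_time(parent, tm);

	if (rtc->ops->set_mmss64)
		return rtc->ops->set_mmss64(parent, rtc_tm_to_time64(tm));

	if (rtc->ops->set_mmss) {
		unsigned long secs;
		int err = rtc_tm_to_time(tm, &secs);

		return err ? err : rtc->ops->set_mmss(parent, secs);
	}

	return -EINVAL;
}
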
index a7cc618378187fb7d38e504d51befb93e78b223c..923a2b5a24395547212207312588b125f19de3a2 100644 (file)
@@ -5734,9 +5734,9 @@ free_port:
 hba_free:
        if (phba->msix_enabled)
                pci_disable_msix(phba->pcidev);
-       iscsi_host_remove(phba->shost);
        pci_dev_put(phba->pcidev);
        iscsi_host_free(phba->shost);
+       pci_set_drvdata(pcidev, NULL);
 disable_pci:
        pci_disable_device(pcidev);
        return ret;
index 54d7a6cbb98a48d06b932cce40c488488760da08..b1a263137a23391a1e19c2589f35fdaf2c4f514f 100644 (file)
@@ -1311,9 +1311,11 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
                                    "rejecting I/O to dead device\n");
                        ret = BLKPREP_KILL;
                        break;
-               case SDEV_QUIESCE:
                case SDEV_BLOCK:
                case SDEV_CREATED_BLOCK:
+                       ret = BLKPREP_DEFER;
+                       break;
+               case SDEV_QUIESCE:
                        /*
                         * If the devices is blocked we defer normal commands.
                         */
index 24183028bd712b11af46cd4531f60b33b4e57338..6d5b38d6957852ee81caa09cb6e3c047c55d26b8 100644 (file)
@@ -38,6 +38,7 @@ config IIO_SIMPLE_DUMMY_EVENTS
 config IIO_SIMPLE_DUMMY_BUFFER
        bool "Buffered capture support"
        select IIO_BUFFER
+       select IIO_TRIGGER
        select IIO_KFIFO_BUF
        help
          Add buffered data capture to the simple dummy driver.
index fd171d8b38fbcc444f3e7118bb66b2d51bdf698c..90cc18b703cf67ae6c089c36d51d8ebde6f28106 100644 (file)
@@ -592,6 +592,7 @@ int hmc5843_common_probe(struct device *dev, struct regmap *regmap,
        mutex_init(&data->lock);
 
        indio_dev->dev.parent = dev;
+       indio_dev->name = dev->driver->name;
        indio_dev->info = &hmc5843_info;
        indio_dev->modes = INDIO_DIRECT_MODE;
        indio_dev->channels = data->variant->channels;
index 2accb6e47beb35c1eff10256f7a85d749fe6d590..77d64251af40451f8e897eac1813599e3ba499d0 100644 (file)
@@ -1181,7 +1181,7 @@ iscsit_handle_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
         * traditional iSCSI block I/O.
         */
        if (iscsit_allocate_iovecs(cmd) < 0) {
-               return iscsit_add_reject_cmd(cmd,
+               return iscsit_reject_cmd(cmd,
                                ISCSI_REASON_BOOKMARK_NO_RESOURCES, buf);
        }
        immed_data = cmd->immediate_data;
@@ -3468,6 +3468,7 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
                                                tpg_np_list) {
                                struct iscsi_np *np = tpg_np->tpg_np;
                                bool inaddr_any = iscsit_check_inaddr_any(np);
+                               char *fmt_str;
 
                                if (np->np_network_transport != network_transport)
                                        continue;
@@ -3495,8 +3496,12 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
                                        }
                                }
 
-                               len = sprintf(buf, "TargetAddress="
-                                       "%s:%hu,%hu",
+                               if (np->np_sockaddr.ss_family == AF_INET6)
+                                       fmt_str = "TargetAddress=[%s]:%hu,%hu";
+                               else
+                                       fmt_str = "TargetAddress=%s:%hu,%hu";
+
+                               len = sprintf(buf, fmt_str,
                                        inaddr_any ? conn->local_ip : np->np_ip,
                                        np->np_port,
                                        tpg->tpgt);
index 79b4ec3ca2db12416a692fba785a09b462e3169f..7faa6aef9a4d5429cbf1d3810ebb181f7a911beb 100644 (file)
@@ -781,8 +781,8 @@ int se_dev_set_emulate_fua_write(struct se_device *dev, int flag)
        }
        if (flag &&
            dev->transport->get_write_cache) {
-               pr_err("emulate_fua_write not supported for this device\n");
-               return -EINVAL;
+               pr_warn("emulate_fua_write not supported for this device, ignoring\n");
+               return 0;
        }
        if (dev->export_count) {
                pr_err("emulate_fua_write cannot be changed with active"
index d1ec5804c0bb94cebeb22d5038b4861393047ba0..76c515dd802b489116dd73f342520dff8326a67e 100644 (file)
@@ -25,7 +25,7 @@
  * Function to allocate regfields which are common
  * between syscfg and memory mapped based sensors
  */
-int st_thermal_alloc_regfields(struct st_thermal_sensor *sensor)
+static int st_thermal_alloc_regfields(struct st_thermal_sensor *sensor)
 {
        struct device *dev = sensor->dev;
        struct regmap *regmap = sensor->regmap;
index 067bfcdb91d678aefb4fb5ca461024dd0792b699..fc0c9e198710327bbd10983b895eac4d740bb258 100644 (file)
@@ -157,7 +157,7 @@ static const struct st_thermal_sensor_ops st_mmap_sensor_ops = {
 };
 
 /* Compatible device data stih416 mpe thermal sensor */
-const struct st_thermal_compat_data st_416mpe_cdata = {
+static const struct st_thermal_compat_data st_416mpe_cdata = {
        .reg_fields             = st_mmap_thermal_regfields,
        .ops                    = &st_mmap_sensor_ops,
        .calibration_val        = 14,
@@ -166,7 +166,7 @@ const struct st_thermal_compat_data st_416mpe_cdata = {
 };
 
 /* Compatible device data stih407 thermal sensor */
-const struct st_thermal_compat_data st_407_cdata = {
+static const struct st_thermal_compat_data st_407_cdata = {
        .reg_fields             = st_mmap_thermal_regfields,
        .ops                    = &st_mmap_sensor_ops,
        .calibration_val        = 16,
@@ -174,19 +174,19 @@ const struct st_thermal_compat_data st_407_cdata = {
        .crit_temp              = 120,
 };
 
-static struct of_device_id st_mmap_thermal_of_match[] = {
+static const struct of_device_id st_mmap_thermal_of_match[] = {
        { .compatible = "st,stih416-mpe-thermal", .data = &st_416mpe_cdata },
        { .compatible = "st,stih407-thermal",     .data = &st_407_cdata },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, st_mmap_thermal_of_match);
 
-int st_mmap_probe(struct platform_device *pdev)
+static int st_mmap_probe(struct platform_device *pdev)
 {
        return st_thermal_register(pdev,  st_mmap_thermal_of_match);
 }
 
-int st_mmap_remove(struct platform_device *pdev)
+static int st_mmap_remove(struct platform_device *pdev)
 {
        return st_thermal_unregister(pdev);
 }
index 26d36a242bb89d63dec3b110e1eaa96b4c4fad65..3df5b789070325db13361d14b33eb4817bfc0d8d 100644 (file)
@@ -104,7 +104,7 @@ static const struct st_thermal_sensor_ops st_syscfg_sensor_ops = {
 };
 
 /* Compatible device data for stih415 sas thermal sensor */
-const struct st_thermal_compat_data st_415sas_cdata = {
+static const struct st_thermal_compat_data st_415sas_cdata = {
        .sys_compat             = "st,stih415-front-syscfg",
        .reg_fields             = st_415sas_regfields,
        .ops                    = &st_syscfg_sensor_ops,
@@ -114,7 +114,7 @@ const struct st_thermal_compat_data st_415sas_cdata = {
 };
 
 /* Compatible device data for stih415 mpe thermal sensor */
-const struct st_thermal_compat_data st_415mpe_cdata = {
+static const struct st_thermal_compat_data st_415mpe_cdata = {
        .sys_compat             = "st,stih415-system-syscfg",
        .reg_fields             = st_415mpe_regfields,
        .ops                    = &st_syscfg_sensor_ops,
@@ -124,7 +124,7 @@ const struct st_thermal_compat_data st_415mpe_cdata = {
 };
 
 /* Compatible device data for stih416 sas thermal sensor */
-const struct st_thermal_compat_data st_416sas_cdata = {
+static const struct st_thermal_compat_data st_416sas_cdata = {
        .sys_compat             = "st,stih416-front-syscfg",
        .reg_fields             = st_416sas_regfields,
        .ops                    = &st_syscfg_sensor_ops,
@@ -134,7 +134,7 @@ const struct st_thermal_compat_data st_416sas_cdata = {
 };
 
 /* Compatible device data for stid127 thermal sensor */
-const struct st_thermal_compat_data st_127_cdata = {
+static const struct st_thermal_compat_data st_127_cdata = {
        .sys_compat             = "st,stid127-cpu-syscfg",
        .reg_fields             = st_127_regfields,
        .ops                    = &st_syscfg_sensor_ops,
@@ -143,7 +143,7 @@ const struct st_thermal_compat_data st_127_cdata = {
        .crit_temp              = 120,
 };
 
-static struct of_device_id st_syscfg_thermal_of_match[] = {
+static const struct of_device_id st_syscfg_thermal_of_match[] = {
        { .compatible = "st,stih415-sas-thermal", .data = &st_415sas_cdata },
        { .compatible = "st,stih415-mpe-thermal", .data = &st_415mpe_cdata },
        { .compatible = "st,stih416-sas-thermal", .data = &st_416sas_cdata },
@@ -152,12 +152,12 @@ static struct of_device_id st_syscfg_thermal_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, st_syscfg_thermal_of_match);
 
-int st_syscfg_probe(struct platform_device *pdev)
+static int st_syscfg_probe(struct platform_device *pdev)
 {
        return st_thermal_register(pdev, st_syscfg_thermal_of_match);
 }
 
-int st_syscfg_remove(struct platform_device *pdev)
+static int st_syscfg_remove(struct platform_device *pdev)
 {
        return st_thermal_unregister(pdev);
 }
index 174d3bcf8bd7a16a161cdf94c53e8d14efaa55a6..4108db7e10c1094622d4abca08017db3125cf80e 100644 (file)
@@ -458,8 +458,10 @@ static void update_temperature(struct thermal_zone_device *tz)
 
        ret = thermal_zone_get_temp(tz, &temp);
        if (ret) {
-               dev_warn(&tz->device, "failed to read out thermal zone %d\n",
-                        tz->id);
+               if (ret != -EAGAIN)
+                       dev_warn(&tz->device,
+                                "failed to read out thermal zone (%d)\n",
+                                ret);
                return;
        }
 
index b1893f3f88f1c6b6fdb2e8e42ad565ba23e18fd1..3ad1458bfeb0fc32afe790b2aa0f3e36961b7b45 100644 (file)
@@ -921,6 +921,9 @@ static void lpuart_setup_watermark(struct lpuart_port *sport)
        writeb(val | UARTPFIFO_TXFE | UARTPFIFO_RXFE,
                        sport->port.membase + UARTPFIFO);
 
+       /* explicitly clear RDRF */
+       readb(sport->port.membase + UARTSR1);
+
        /* flush Tx and Rx FIFO */
        writeb(UARTCFIFO_TXFLUSH | UARTCFIFO_RXFLUSH,
                        sport->port.membase + UARTCFIFO);
@@ -1076,6 +1079,8 @@ static int lpuart_startup(struct uart_port *port)
        sport->txfifo_size = 0x1 << (((temp >> UARTPFIFO_TXSIZE_OFF) &
                UARTPFIFO_FIFOSIZE_MASK) + 1);
 
+       sport->port.fifosize = sport->txfifo_size;
+
        sport->rxfifo_size = 0x1 << (((temp >> UARTPFIFO_RXSIZE_OFF) &
                UARTPFIFO_FIFOSIZE_MASK) + 1);
 
index af821a9087204ec654f637a617513ba06d68a0ef..cf08876922f1446e55a2d8ca79bf87e0d4e24fed 100644 (file)
@@ -963,6 +963,7 @@ static void s3c24xx_serial_shutdown(struct uart_port *port)
                        free_irq(ourport->tx_irq, ourport);
                tx_enabled(port) = 0;
                ourport->tx_claimed = 0;
+               ourport->tx_mode = 0;
        }
 
        if (ourport->rx_claimed) {
index a7865c4b04980898b49317386ad6138aab051bc5..0827d7c965276382418f0a602ec5c1412c023142 100644 (file)
@@ -387,6 +387,10 @@ static void xhci_clear_port_change_bit(struct xhci_hcd *xhci, u16 wValue,
                status = PORT_PLC;
                port_change_bit = "link state";
                break;
+       case USB_PORT_FEAT_C_PORT_CONFIG_ERROR:
+               status = PORT_CEC;
+               port_change_bit = "config error";
+               break;
        default:
                /* Should never happen */
                return;
@@ -588,6 +592,8 @@ static u32 xhci_get_port_status(struct usb_hcd *hcd,
                        status |= USB_PORT_STAT_C_LINK_STATE << 16;
                if ((raw_port_status & PORT_WRC))
                        status |= USB_PORT_STAT_C_BH_RESET << 16;
+               if ((raw_port_status & PORT_CEC))
+                       status |= USB_PORT_STAT_C_CONFIG_ERROR << 16;
        }
 
        if (hcd->speed != HCD_USB3) {
@@ -1005,6 +1011,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
                case USB_PORT_FEAT_C_OVER_CURRENT:
                case USB_PORT_FEAT_C_ENABLE:
                case USB_PORT_FEAT_C_PORT_LINK_STATE:
+               case USB_PORT_FEAT_C_PORT_CONFIG_ERROR:
                        xhci_clear_port_change_bit(xhci, wValue, wIndex,
                                        port_array[wIndex], temp);
                        break;
@@ -1069,7 +1076,7 @@ int xhci_hub_status_data(struct usb_hcd *hcd, char *buf)
         */
        status = bus_state->resuming_ports;
 
-       mask = PORT_CSC | PORT_PEC | PORT_OCC | PORT_PLC | PORT_WRC;
+       mask = PORT_CSC | PORT_PEC | PORT_OCC | PORT_PLC | PORT_WRC | PORT_CEC;
 
        spin_lock_irqsave(&xhci->lock, flags);
        /* For each port, did anything change?  If so, set that bit in buf. */
index fd53c9ebd662a5fb4593c99ce5dd7c2552ad83c5..2af32e26fafc3727279fe656fbbcaf158736371d 100644 (file)
@@ -115,6 +115,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
        if (pdev->vendor == PCI_VENDOR_ID_INTEL) {
                xhci->quirks |= XHCI_LPM_SUPPORT;
                xhci->quirks |= XHCI_INTEL_HOST;
+               xhci->quirks |= XHCI_AVOID_BEI;
        }
        if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
                        pdev->device == PCI_DEVICE_ID_INTEL_PANTHERPOINT_XHCI) {
@@ -130,7 +131,6 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
                 * PPT chipsets.
                 */
                xhci->quirks |= XHCI_SPURIOUS_REBOOT;
-               xhci->quirks |= XHCI_AVOID_BEI;
        }
        if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
                pdev->device == PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI) {
index f32c292cc8689d81bfce947f7c587603e95c09be..3fc4fe7702533b785bc22ead9fe17939e1f365f8 100644 (file)
@@ -1203,7 +1203,7 @@ static int isp1760_udc_start(struct usb_gadget *gadget,
 
        if (udc->driver) {
                dev_err(udc->isp->dev, "UDC already has a gadget driver\n");
-               spin_unlock(&udc->lock);
+               spin_unlock_irqrestore(&udc->lock, flags);
                return -EBUSY;
        }
 
index 3086dec0ef53bbd5d5d3d21087b91469983492fb..8eb68a31cab6c4021617ca555cd58b086872c112 100644 (file)
@@ -604,6 +604,7 @@ static const struct usb_device_id id_table_combined[] = {
                .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
        { USB_DEVICE(FTDI_VID, FTDI_NT_ORIONLXM_PID),
                .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
+       { USB_DEVICE(FTDI_VID, FTDI_SYNAPSE_SS200_PID) },
        /*
         * ELV devices:
         */
@@ -1883,8 +1884,12 @@ static int ftdi_8u2232c_probe(struct usb_serial *serial)
 {
        struct usb_device *udev = serial->dev;
 
-       if ((udev->manufacturer && !strcmp(udev->manufacturer, "CALAO Systems")) ||
-           (udev->product && !strcmp(udev->product, "BeagleBone/XDS100V2")))
+       if (udev->manufacturer && !strcmp(udev->manufacturer, "CALAO Systems"))
+               return ftdi_jtag_probe(serial);
+
+       if (udev->product &&
+               (!strcmp(udev->product, "BeagleBone/XDS100V2") ||
+                !strcmp(udev->product, "SNAP Connect E10")))
                return ftdi_jtag_probe(serial);
 
        return 0;
index 56b1b55c4751696b2e89633ea90bfe1c2940c436..4e4f46f3c89c025670d42860756f39b2bb62ae24 100644 (file)
  */
 #define FTDI_NT_ORIONLXM_PID   0x7c90  /* OrionLXm Substation Automation Platform */
 
+/*
+ * Synapse Wireless product ids (FTDI_VID)
+ * http://www.synapse-wireless.com
+ */
+#define FTDI_SYNAPSE_SS200_PID 0x9090 /* SS200 - SNAP Stick 200 */
+
 
 /********************************/
 /** third-party VID/PID combos **/
index dd97d8b572c336e03c7c4c282fbe7e38c44646a4..4f7e072e4e001e9f7c439114f13b52a77d3250d6 100644 (file)
@@ -61,6 +61,7 @@ struct keyspan_pda_private {
 /* For Xircom PGSDB9 and older Entrega version of the same device */
 #define XIRCOM_VENDOR_ID               0x085a
 #define XIRCOM_FAKE_ID                 0x8027
+#define XIRCOM_FAKE_ID_2               0x8025 /* "PGMFHUB" serial */
 #define ENTREGA_VENDOR_ID              0x1645
 #define ENTREGA_FAKE_ID                        0x8093
 
@@ -70,6 +71,7 @@ static const struct usb_device_id id_table_combined[] = {
 #endif
 #ifdef XIRCOM
        { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID) },
+       { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID_2) },
        { USB_DEVICE(ENTREGA_VENDOR_ID, ENTREGA_FAKE_ID) },
 #endif
        { USB_DEVICE(KEYSPAN_VENDOR_ID, KEYSPAN_PDA_ID) },
@@ -93,6 +95,7 @@ static const struct usb_device_id id_table_fake[] = {
 #ifdef XIRCOM
 static const struct usb_device_id id_table_fake_xircom[] = {
        { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID) },
+       { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID_2) },
        { USB_DEVICE(ENTREGA_VENDOR_ID, ENTREGA_FAKE_ID) },
        { }
 };
index b812462083fcaf8d32c215327d28b7c4b7e6e4da..94d96809e686b9ffd1f83c76681c8c3ff0a7cf92 100644 (file)
@@ -55,6 +55,23 @@ config XEN_BALLOON_MEMORY_HOTPLUG
 
          In that case step 3 should be omitted.
 
+config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+       int "Hotplugged memory limit (in GiB) for a PV guest"
+       default 512 if X86_64
+       default 4 if X86_32
+       range 0 64 if X86_32
+       depends on XEN_HAVE_PVMMU
+       depends on XEN_BALLOON_MEMORY_HOTPLUG
+       help
+         Maximum amount of memory (in GiB) that a PV guest can be
+         expanded to when using memory hotplug.
+
+         A PV guest can have more memory than this limit if it is
+         started with a larger maximum.
+
+         This value is used to allocate enough space in internal
+         tables needed for physical memory administration.
+
 config XEN_SCRUB_PAGES
        bool "Scrub pages before returning them to system"
        depends on XEN_BALLOON
index 0b52d92cb2e52d03899dd410d8a58e7640c8d8d5..fd933695f2328f29c2493ee751f22230ec68cbb1 100644 (file)
@@ -229,6 +229,29 @@ static enum bp_state reserve_additional_memory(long credit)
        balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION);
        nid = memory_add_physaddr_to_nid(hotplug_start_paddr);
 
+#ifdef CONFIG_XEN_HAVE_PVMMU
+        /*
+         * add_memory() will build page tables for the new memory so
+         * the p2m must contain invalid entries so the correct
+         * non-present PTEs will be written.
+         *
+         * If a failure occurs, the original (identity) p2m entries
+         * are not restored since this region is now known not to
+         * conflict with any devices.
+         */ 
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               unsigned long pfn, i;
+
+               pfn = PFN_DOWN(hotplug_start_paddr);
+               for (i = 0; i < balloon_hotplug; i++) {
+                       if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) {
+                               pr_warn("set_phys_to_machine() failed, no memory added\n");
+                               return BP_ECANCELED;
+                       }
+                }
+       }
+#endif
+
        rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT);
 
        if (rc) {
index f8e52a1854c1ab383e32383ac65a0f167e385793..a793f7023755dc15cb2b8bebe5206bc610bb428c 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -278,11 +278,11 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
 
-static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct kioctx_table *table;
-       int i;
+       int i, res = -EINVAL;
 
        spin_lock(&mm->ioctx_lock);
        rcu_read_lock();
@@ -292,13 +292,17 @@ static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
 
                ctx = table->table[i];
                if (ctx && ctx->aio_ring_file == file) {
-                       ctx->user_id = ctx->mmap_base = vma->vm_start;
+                       if (!atomic_read(&ctx->dead)) {
+                               ctx->user_id = ctx->mmap_base = vma->vm_start;
+                               res = 0;
+                       }
                        break;
                }
        }
 
        rcu_read_unlock();
        spin_unlock(&mm->ioctx_lock);
+       return res;
 }
 
 static const struct file_operations aio_ring_fops = {
@@ -727,6 +731,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 err_cleanup:
        aio_nr_sub(ctx->max_reqs);
 err_ctx:
+       atomic_set(&ctx->dead, 1);
+       if (ctx->mmap_size)
+               vm_munmap(ctx->mmap_base, ctx->mmap_size);
        aio_free_ring(ctx);
 err:
        mutex_unlock(&ctx->ring_lock);
@@ -748,11 +755,12 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 {
        struct kioctx_table *table;
 
-       if (atomic_xchg(&ctx->dead, 1))
+       spin_lock(&mm->ioctx_lock);
+       if (atomic_xchg(&ctx->dead, 1)) {
+               spin_unlock(&mm->ioctx_lock);
                return -EINVAL;
+       }
 
-
-       spin_lock(&mm->ioctx_lock);
        table = rcu_dereference_raw(mm->ioctx_table);
        WARN_ON(ctx != table->table[ctx->id]);
        table->table[ctx->id] = NULL;
index 4ac7445e6ec70516848e942f54a6846a8541113b..aa0dc2573374184597b9e449fad6467f924f0f45 100644 (file)
@@ -1,6 +1,9 @@
 /*
  *   fs/cifs/cifsencrypt.c
  *
+ *   Encryption and hashing operations relating to NTLM, NTLMv2.  See MS-NLMP
+ *   for more detailed information
+ *
  *   Copyright (C) International Business Machines  Corp., 2005,2013
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
@@ -515,7 +518,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
                                 __func__);
                        return rc;
                }
-       } else if (ses->serverName) {
+       } else {
+               /* We use ses->serverName if no domain name available */
                len = strlen(ses->serverName);
 
                server = kmalloc(2 + (len * 2), GFP_KERNEL);
index d3aa999ab78520fcd4819f99548247e231df591b..480cf9c81d505b8351dd76eee0f012110c3f2b9c 100644 (file)
@@ -1599,6 +1599,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                                pr_warn("CIFS: username too long\n");
                                goto cifs_parse_mount_err;
                        }
+
+                       kfree(vol->username);
                        vol->username = kstrdup(string, GFP_KERNEL);
                        if (!vol->username)
                                goto cifs_parse_mount_err;
@@ -1700,6 +1702,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                                goto cifs_parse_mount_err;
                        }
 
+                       kfree(vol->domainname);
                        vol->domainname = kstrdup(string, GFP_KERNEL);
                        if (!vol->domainname) {
                                pr_warn("CIFS: no memory for domainname\n");
@@ -1731,6 +1734,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                        }
 
                         if (strncasecmp(string, "default", 7) != 0) {
+                               kfree(vol->iocharset);
                                vol->iocharset = kstrdup(string,
                                                         GFP_KERNEL);
                                if (!vol->iocharset) {
@@ -2913,8 +2917,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
                 * calling name ends in null (byte 16) from old smb
                 * convention.
                 */
-               if (server->workstation_RFC1001_name &&
-                   server->workstation_RFC1001_name[0] != 0)
+               if (server->workstation_RFC1001_name[0] != 0)
                        rfc1002mangle(ses_init_buf->trailer.
                                      session_req.calling_name,
                                      server->workstation_RFC1001_name,
@@ -3692,6 +3695,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 #endif /* CIFS_WEAK_PW_HASH */
                rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
                                        bcc_ptr, nls_codepage);
+               if (rc) {
+                       cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n",
+                                __func__, rc);
+                       cifs_buf_release(smb_buffer);
+                       return rc;
+               }
 
                bcc_ptr += CIFS_AUTH_RESP_SIZE;
                if (ses->capabilities & CAP_UNICODE) {
index a94b3e67318283dd54d61fc595ecb2037ba3a515..ca30c391a894a0e9df8eac6ed1ede194c89f1885 100644 (file)
@@ -1823,6 +1823,7 @@ refind_writable:
                        cifsFileInfo_put(inv_file);
                        spin_lock(&cifs_file_list_lock);
                        ++refind;
+                       inv_file = NULL;
                        goto refind_writable;
                }
        }
index 2d4f37235ed0fa360782ae237c89fccccbf8b719..3e126d7bb2ea5bec97c9d6e02973a49886261580 100644 (file)
@@ -771,6 +771,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
                                cifs_buf_release(srchinf->ntwrk_buf_start);
                        }
                        kfree(srchinf);
+                       if (rc)
+                               goto cgii_exit;
        } else
                goto cgii_exit;
 
index 689f035915cf70f075d71fca5e281ec009c5420a..22dfdf17d06547f3d1b3abbc302bb03abf1b047b 100644 (file)
@@ -322,7 +322,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 
        /* return pointer to beginning of data area, ie offset from SMB start */
        if ((*off != 0) && (*len != 0))
-               return hdr->ProtocolId + *off;
+               return (char *)(&hdr->ProtocolId[0]) + *off;
        else
                return NULL;
 }
index 96b5d40a2ece611b27ed19668cc4b7b665605113..eab05e1aa587424863d6914eb351da9fdcf17437 100644 (file)
@@ -684,7 +684,8 @@ smb2_clone_range(const unsigned int xid,
 
                        /* No need to change MaxChunks since already set to 1 */
                        chunk_sizes_updated = true;
-               }
+               } else
+                       goto cchunk_out;
        }
 
 cchunk_out:
index 3417340bf89e677fe0c46bf98cf922dd39d29a3a..65cd7a84c8bc3206033a917fe9d98fc939cbe1af 100644 (file)
@@ -1218,7 +1218,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
        struct smb2_ioctl_req *req;
        struct smb2_ioctl_rsp *rsp;
        struct TCP_Server_Info *server;
-       struct cifs_ses *ses = tcon->ses;
+       struct cifs_ses *ses;
        struct kvec iov[2];
        int resp_buftype;
        int num_iovecs;
@@ -1233,6 +1233,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
        if (plen)
                *plen = 0;
 
+       if (tcon)
+               ses = tcon->ses;
+       else
+               return -EIO;
+
        if (ses && (ses->server))
                server = ses->server;
        else
@@ -1296,14 +1301,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
        rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
 
        if ((rc != 0) && (rc != -EINVAL)) {
-               if (tcon)
-                       cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+               cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
                goto ioctl_exit;
        } else if (rc == -EINVAL) {
                if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
                    (opcode != FSCTL_SRV_COPYCHUNK)) {
-                       if (tcon)
-                               cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+                       cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
                        goto ioctl_exit;
                }
        }
@@ -1629,7 +1632,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 
        rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
 
-       if ((rc != 0) && tcon)
+       if (rc != 0)
                cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
 
        free_rsp_buf(resp_buftype, iov[0].iov_base);
@@ -2114,7 +2117,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
        struct kvec iov[2];
        int rc = 0;
        int len;
-       int resp_buftype;
+       int resp_buftype = CIFS_NO_BUFFER;
        unsigned char *bufptr;
        struct TCP_Server_Info *server;
        struct cifs_ses *ses = tcon->ses;
index e907052eeadb69f683df3c7d8838106c6e36905b..32a8bbd7a9ad1121f9b100da08f80500736bcebf 100644 (file)
@@ -53,6 +53,18 @@ struct wb_writeback_work {
        struct completion *done;        /* set if the caller waits */
 };
 
+/*
+ * If an inode is constantly having its pages dirtied, but then the
+ * updates stop dirtytime_expire_interval seconds in the past, it's
+ * possible for the worst case time between when an inode has its
+ * timestamps updated and when they finally get written out to be two
+ * dirtytime_expire_intervals.  We set the default to 12 hours (in
+ * seconds), which means most of the time inodes will have their
+ * timestamps written to disk after 12 hours, but in the worst case a
+ * few inodes might not have their timestamps written for 24 hours.
+ */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -275,8 +287,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 
        if ((flags & EXPIRE_DIRTY_ATIME) == 0)
                older_than_this = work->older_than_this;
-       else if ((work->reason == WB_REASON_SYNC) == 0) {
-               expire_time = jiffies - (HZ * 86400);
+       else if (!work->for_sync) {
+               expire_time = jiffies - (dirtytime_expire_interval * HZ);
                older_than_this = &expire_time;
        }
        while (!list_empty(delaying_queue)) {
@@ -458,6 +470,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                 */
                redirty_tail(inode, wb);
        } else if (inode->i_state & I_DIRTY_TIME) {
+               inode->dirtied_when = jiffies;
                list_move(&inode->i_wb_list, &wb->b_dirty_time);
        } else {
                /* The inode is clean. Remove from writeback lists. */
@@ -505,12 +518,17 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
        spin_lock(&inode->i_lock);
 
        dirty = inode->i_state & I_DIRTY;
-       if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
-            (inode->i_state & I_DIRTY_TIME)) ||
-           (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
-               dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
-               trace_writeback_lazytime(inode);
-       }
+       if (inode->i_state & I_DIRTY_TIME) {
+               if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+                   unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
+                   unlikely(time_after(jiffies,
+                                       (inode->dirtied_time_when +
+                                        dirtytime_expire_interval * HZ)))) {
+                       dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+                       trace_writeback_lazytime(inode);
+               }
+       } else
+               inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
        inode->i_state &= ~dirty;
 
        /*
@@ -1131,6 +1149,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
        rcu_read_unlock();
 }
 
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes gets
+ * written back periodically.  We deliberately do *not* check the
+ * b_dirtytime list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system.  So instead we define a separate delayed work
+ * function which gets called much more rarely.  (By default, only
+ * once every 12 hours.)
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary.  But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+       struct backing_dev_info *bdi;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+               if (list_empty(&bdi->wb.b_dirty_time))
+                       continue;
+               bdi_wakeup_thread(bdi);
+       }
+       rcu_read_unlock();
+       schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+       schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+       return 0;
+}
+__initcall(start_dirtytime_writeback);
+
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+                              void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       if (ret == 0 && write)
+               mod_delayed_work(system_wq, &dirtytime_work, 0);
+       return ret;
+}
+
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
        if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1269,8 +1337,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                        }
 
                        inode->dirtied_when = jiffies;
-                       list_move(&inode->i_wb_list, dirtytime ?
-                                 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
+                       if (dirtytime)
+                               inode->dirtied_time_when = jiffies;
+                       if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+                               list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+                       else
+                               list_move(&inode->i_wb_list,
+                                         &bdi->wb.b_dirty_time);
                        spin_unlock(&bdi->wb.list_lock);
                        trace_writeback_dirty_inode_enqueue(inode);
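
dirtytime_interval_handler() above is a proc_dointvec_minmax() wrapper, so the new tunable is presumably exposed through a sysctl entry defined outside this hunk. A hedged sketch of what such an entry could look like; the "dirtytime_expire_seconds" name and table placement are assumptions, not shown in this diff:

#include <linux/sysctl.h>

static struct ctl_table example_writeback_table[] = {
	{
		.procname	= "dirtytime_expire_seconds",
		.data		= &dirtytime_expire_interval,
		.maxlen		= sizeof(dirtytime_expire_interval),
		.mode		= 0644,
		/* writes reschedule dirtytime_work immediately */
		.proc_handler	= dirtytime_interval_handler,
	},
	{ }
};
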
 
index 528fedfda15e6432bd69b80c7bd8faceb7d1351d..40bc384728c0e07b637dccfa755fe49b5751bf86 100644 (file)
@@ -1388,9 +1388,8 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
 int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
        int error = 0;
-       struct file_lock *new_fl;
        struct file_lock_context *ctx = inode->i_flctx;
-       struct file_lock *fl;
+       struct file_lock *new_fl, *fl, *tmp;
        unsigned long break_time;
        int want_write = (mode & O_ACCMODE) != O_RDONLY;
        LIST_HEAD(dispose);
@@ -1420,7 +1419,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
                        break_time++;   /* so that 0 means no break time */
        }
 
-       list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+       list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
                if (!leases_conflict(fl, new_fl))
                        continue;
                if (want_write) {
index cdbc78c7254218c80213877e90c08ec107aea223..03d647bf195d78bb3d6611553c9ad3e6fa4385a2 100644 (file)
@@ -137,7 +137,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
        seg->offset = iomap.offset;
        seg->length = iomap.length;
 
-       dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
+       dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
        return 0;
 
 out_error:
index 9da89fddab338fcebbeb51fc0ec71534e0defc5a..9aa2796da90d9169488a625d5f80e90010971ff4 100644 (file)
@@ -122,19 +122,19 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 
                p = xdr_decode_hyper(p, &bex.foff);
                if (bex.foff & (block_size - 1)) {
-                       dprintk("%s: unaligned offset %lld\n",
+                       dprintk("%s: unaligned offset 0x%llx\n",
                                __func__, bex.foff);
                        goto fail;
                }
                p = xdr_decode_hyper(p, &bex.len);
                if (bex.len & (block_size - 1)) {
-                       dprintk("%s: unaligned length %lld\n",
+                       dprintk("%s: unaligned length 0x%llx\n",
                                __func__, bex.foff);
                        goto fail;
                }
                p = xdr_decode_hyper(p, &bex.soff);
                if (bex.soff & (block_size - 1)) {
-                       dprintk("%s: unaligned disk offset %lld\n",
+                       dprintk("%s: unaligned disk offset 0x%llx\n",
                                __func__, bex.soff);
                        goto fail;
                }
index 1028a062954357c06005dc7b0d6f61a10ea6f418..6904213a436368e47628af85701a3beb68e0550b 100644 (file)
@@ -118,7 +118,7 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 {
        struct super_block *sb = exp->ex_path.mnt->mnt_sb;
 
-       if (exp->ex_flags & NFSEXP_NOPNFS)
+       if (!(exp->ex_flags & NFSEXP_PNFS))
                return;
 
        if (sb->s_export_op->get_uuid &&
@@ -440,15 +440,14 @@ nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
                        list_move_tail(&lp->lo_perstate, reaplist);
                        return;
                }
-               end = seg->offset;
+               lo->offset = layout_end(seg);
        } else {
                /* retain the whole layout segment on a split. */
                if (layout_end(seg) < end) {
                        dprintk("%s: split not supported\n", __func__);
                        return;
                }
-
-               lo->offset = layout_end(seg);
+               end = seg->offset;
        }
 
        layout_update_len(lo, end);
@@ -513,6 +512,9 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
 
        spin_lock(&clp->cl_lock);
        list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+               if (ls->ls_layout_type != lrp->lr_layout_type)
+                       continue;
+
                if (lrp->lr_return_type == RETURN_FSID &&
                    !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
                                   &cstate->current_fh.fh_handle))
@@ -587,6 +589,8 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 
        rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
 
+       trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
        printk(KERN_WARNING
                "nfsd: client %s failed to respond to layout recall. "
                "  Fencing..\n", addr_str);
index d30bea8d0277ab3bfa45e7d66c38ed410026f585..92b9d97aff4f1adfe24ed607d11661196a500ffc 100644 (file)
@@ -1237,8 +1237,8 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
                nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
 
        gdp->gd_notify_types &= ops->notify_types;
-       exp_put(exp);
 out:
+       exp_put(exp);
        return nfserr;
 }
 
index d2f2c37dc2dbd2649399fe2ddad4032a5025d337..8ba1d888f1e624d672453bd1eea20c40e054f746 100644 (file)
@@ -3221,7 +3221,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
        } else
                nfs4_free_openowner(&oo->oo_owner);
        spin_unlock(&clp->cl_lock);
-       return oo;
+       return ret;
 }
 
 static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
@@ -5062,7 +5062,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
        } else
                nfs4_free_lockowner(&lo->lo_owner);
        spin_unlock(&clp->cl_lock);
-       return lo;
+       return ret;
 }
 
 static void
index df5e66caf100ca303005018932ccc8edadab4089..5fb7e78169a6b27a1a5a4d5e9ec354c9e32f6c43 100644 (file)
@@ -1562,7 +1562,11 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
        p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
        p = xdr_decode_hyper(p, &lgp->lg_seg.length);
        p = xdr_decode_hyper(p, &lgp->lg_minlength);
-       nfsd4_decode_stateid(argp, &lgp->lg_sid);
+
+       status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+       if (status)
+               return status;
+
        READ_BUF(4);
        lgp->lg_maxcount = be32_to_cpup(p++);
 
@@ -1580,7 +1584,11 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
        p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
        p = xdr_decode_hyper(p, &lcp->lc_seg.length);
        lcp->lc_reclaim = be32_to_cpup(p++);
-       nfsd4_decode_stateid(argp, &lcp->lc_sid);
+
+       status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+       if (status)
+               return status;
+
        READ_BUF(4);
        lcp->lc_newoffset = be32_to_cpup(p++);
        if (lcp->lc_newoffset) {
@@ -1628,7 +1636,11 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
                READ_BUF(16);
                p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
                p = xdr_decode_hyper(p, &lrp->lr_seg.length);
-               nfsd4_decode_stateid(argp, &lrp->lr_sid);
+
+               status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+               if (status)
+                       return status;
+
                READ_BUF(4);
                lrp->lrf_body_len = be32_to_cpup(p++);
                if (lrp->lrf_body_len > 0) {
@@ -4123,7 +4135,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
                return nfserr_resource;
        *p++ = cpu_to_be32(lrp->lrs_present);
        if (lrp->lrs_present)
-               nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+               return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
        return nfs_ok;
 }
 #endif /* CONFIG_NFSD_PNFS */
index 83a9694ec485b0593e3de847464243640fcd7186..46ec934f5dee8c529020558bed4cd820956734f1 100644 (file)
@@ -165,13 +165,17 @@ int nfsd_reply_cache_init(void)
 {
        unsigned int hashsize;
        unsigned int i;
+       int status = 0;
 
        max_drc_entries = nfsd_cache_size_limit();
        atomic_set(&num_drc_entries, 0);
        hashsize = nfsd_hashsize(max_drc_entries);
        maskbits = ilog2(hashsize);
 
-       register_shrinker(&nfsd_reply_cache_shrinker);
+       status = register_shrinker(&nfsd_reply_cache_shrinker);
+       if (status)
+               return status;
+
        drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
                                        0, 0, NULL);
        if (!drc_slab)
index 46e0d4e857c7f493f512196603d3725ca8d3dfaa..ba1790e52ff2364bd027454650ceef6a9ba227b9 100644 (file)
@@ -2394,7 +2394,6 @@ relock:
                /*
                 * for completing the rest of the request.
                 */
-               *ppos += written;
                count -= written;
                written_buffered = generic_perform_write(file, from, *ppos);
                /*
@@ -2409,7 +2408,6 @@ relock:
                        goto out_dio;
                }
 
-               iocb->ki_pos = *ppos + written_buffered;
                /* We need to ensure that the page cache pages are written to
                 * disk and invalidated to preserve the expected O_DIRECT
                 * semantics.
@@ -2418,6 +2416,7 @@ relock:
                ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
                                endbyte);
                if (ret == 0) {
+                       iocb->ki_pos = *ppos + written_buffered;
                        written += written_buffered;
                        invalidate_mapping_pages(mapping,
                                        *ppos >> PAGE_CACHE_SHIFT,
@@ -2440,10 +2439,14 @@ out_dio:
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
+       if (unlikely(written <= 0))
+               goto no_sync;
+
        if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
            ((file->f_flags & O_DIRECT) && !direct_io)) {
-               ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
-                                              *ppos + count - 1);
+               ret = filemap_fdatawrite_range(file->f_mapping,
+                                              iocb->ki_pos - written,
+                                              iocb->ki_pos - 1);
                if (ret < 0)
                        written = ret;
 
@@ -2454,10 +2457,12 @@ out_dio:
                }
 
                if (!ret)
-                       ret = filemap_fdatawait_range(file->f_mapping, *ppos,
-                                                     *ppos + count - 1);
+                       ret = filemap_fdatawait_range(file->f_mapping,
+                                                     iocb->ki_pos - written,
+                                                     iocb->ki_pos - 1);
        }
 
+no_sync:
        /*
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
         * function pointer which is called when o_direct io completes so that
index b3f45a578344a90eee8d1c76f724636de24447b1..e5966758c093483cd6027b4e14ea678904be5978 100644 (file)
 #include <linux/workqueue.h>
 
 struct arch_timer_kvm {
-#ifdef CONFIG_KVM_ARM_TIMER
        /* Is the timer enabled */
        bool                    enabled;
 
        /* Virtual offset */
        cycle_t                 cntvoff;
-#endif
 };
 
 struct arch_timer_cpu {
-#ifdef CONFIG_KVM_ARM_TIMER
        /* Registers: control register, timer value */
        u32                             cntv_ctl;       /* Saved/restored */
        cycle_t                         cntv_cval;      /* Saved/restored */
@@ -55,10 +52,8 @@ struct arch_timer_cpu {
 
        /* Timer IRQ */
        const struct kvm_irq_level      *irq;
-#endif
 };
 
-#ifdef CONFIG_KVM_ARM_TIMER
 int kvm_timer_hyp_init(void);
 void kvm_timer_enable(struct kvm *kvm);
 void kvm_timer_init(struct kvm *kvm);
@@ -72,30 +67,6 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 
-#else
-static inline int kvm_timer_hyp_init(void)
-{
-       return 0;
-};
-
-static inline void kvm_timer_enable(struct kvm *kvm) {}
-static inline void kvm_timer_init(struct kvm *kvm) {}
-static inline void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                                       const struct kvm_irq_level *irq) {}
-static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {}
-static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {}
-static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {}
-static inline void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) {}
-
-static inline int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
-{
-       return 0;
-}
-
-static inline u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
-{
-       return 0;
-}
-#endif
+bool kvm_timer_should_fire(struct kvm_vcpu *vcpu);
 
 #endif
index 66203b268984ebedd72d5bd1b2f54440e56011bc..133ea00aa83bc8926137ca8867f28a4ab9469d27 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/irqreturn.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <kvm/iodev.h>
 
 #define VGIC_NR_IRQS_LEGACY    256
 #define VGIC_NR_SGIS           16
@@ -140,16 +141,21 @@ struct vgic_params {
 };
 
 struct vgic_vm_ops {
-       bool    (*handle_mmio)(struct kvm_vcpu *, struct kvm_run *,
-                              struct kvm_exit_mmio *);
        bool    (*queue_sgi)(struct kvm_vcpu *, int irq);
        void    (*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
        int     (*init_model)(struct kvm *);
        int     (*map_resources)(struct kvm *, const struct vgic_params *);
 };
 
+struct vgic_io_device {
+       gpa_t addr;
+       int len;
+       const struct vgic_io_range *reg_ranges;
+       struct kvm_vcpu *redist_vcpu;
+       struct kvm_io_device dev;
+};
+
 struct vgic_dist {
-#ifdef CONFIG_KVM_ARM_VGIC
        spinlock_t              lock;
        bool                    in_kernel;
        bool                    ready;
@@ -197,6 +203,9 @@ struct vgic_dist {
        /* Level-triggered interrupt queued on VCPU interface */
        struct vgic_bitmap      irq_queued;
 
+       /* Interrupt was active when unqueued from the VCPU interface */
+       struct vgic_bitmap      irq_active;
+
        /* Interrupt priority. Not used yet. */
        struct vgic_bytemap     irq_priority;
 
@@ -237,8 +246,12 @@ struct vgic_dist {
        /* Bitmap indicating which CPU has something pending */
        unsigned long           *irq_pending_on_cpu;
 
+       /* Bitmap indicating which CPU has active IRQs */
+       unsigned long           *irq_active_on_cpu;
+
        struct vgic_vm_ops      vm_ops;
-#endif
+       struct vgic_io_device   dist_iodev;
+       struct vgic_io_device   *redist_iodevs;
 };
 
 struct vgic_v2_cpu_if {
@@ -266,13 +279,18 @@ struct vgic_v3_cpu_if {
 };
 
 struct vgic_cpu {
-#ifdef CONFIG_KVM_ARM_VGIC
        /* per IRQ to LR mapping */
        u8              *vgic_irq_lr_map;
 
-       /* Pending interrupts on this VCPU */
+       /* Pending/active/both interrupts on this VCPU */
        DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
+       DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS);
+       DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
+
+       /* Pending/active/both shared interrupts, dynamically sized */
        unsigned long   *pending_shared;
+       unsigned long   *active_shared;
+       unsigned long   *pend_act_shared;
 
        /* Bitmap of used/free list registers */
        DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
@@ -285,7 +303,6 @@ struct vgic_cpu {
                struct vgic_v2_cpu_if   vgic_v2;
                struct vgic_v3_cpu_if   vgic_v3;
        };
-#endif
 };
 
 #define LR_EMPTY       0xff
@@ -295,10 +312,7 @@ struct vgic_cpu {
 
 struct kvm;
 struct kvm_vcpu;
-struct kvm_run;
-struct kvm_exit_mmio;
 
-#ifdef CONFIG_KVM_ARM_VGIC
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_vgic_hyp_init(void);
 int kvm_vgic_map_resources(struct kvm *kvm);
@@ -312,8 +326,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                     struct kvm_exit_mmio *mmio);
+int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
@@ -335,84 +348,4 @@ static inline int vgic_v3_probe(struct device_node *vgic_node,
 }
 #endif
 
-#else
-static inline int kvm_vgic_hyp_init(void)
-{
-       return 0;
-}
-
-static inline int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr)
-{
-       return 0;
-}
-
-static inline int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-       return -ENXIO;
-}
-
-static inline int kvm_vgic_map_resources(struct kvm *kvm)
-{
-       return 0;
-}
-
-static inline int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-       return 0;
-}
-
-static inline void kvm_vgic_destroy(struct kvm *kvm)
-{
-}
-
-static inline void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
-{
-       return 0;
-}
-
-static inline void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) {}
-static inline void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) {}
-
-static inline int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid,
-                                     unsigned int irq_num, bool level)
-{
-       return 0;
-}
-
-static inline int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-       return 0;
-}
-
-static inline bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                                   struct kvm_exit_mmio *mmio)
-{
-       return false;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-       return 0;
-}
-
-static inline bool vgic_initialized(struct kvm *kvm)
-{
-       return true;
-}
-
-static inline bool vgic_ready(struct kvm *kvm)
-{
-       return true;
-}
-
-static inline int kvm_vgic_get_max_vcpus(void)
-{
-       return KVM_MAX_VCPUS;
-}
-#endif
-
 #endif
diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
new file mode 100644 (file)
index 0000000..a6d208b
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+#include <linux/errno.h>
+
+struct kvm_io_device;
+struct kvm_vcpu;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+       int (*read)(struct kvm_vcpu *vcpu,
+                   struct kvm_io_device *this,
+                   gpa_t addr,
+                   int len,
+                   void *val);
+       int (*write)(struct kvm_vcpu *vcpu,
+                    struct kvm_io_device *this,
+                    gpa_t addr,
+                    int len,
+                    const void *val);
+       void (*destructor)(struct kvm_io_device *this);
+};
+
+
+struct kvm_io_device {
+       const struct kvm_io_device_ops *ops;
+};
+
+static inline void kvm_iodevice_init(struct kvm_io_device *dev,
+                                    const struct kvm_io_device_ops *ops)
+{
+       dev->ops = ops;
+}
+
+static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu,
+                                   struct kvm_io_device *dev, gpa_t addr,
+                                   int l, void *v)
+{
+       return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v)
+                               : -EOPNOTSUPP;
+}
+
+static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
+                                    struct kvm_io_device *dev, gpa_t addr,
+                                    int l, const void *v)
+{
+       return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v)
+                                : -EOPNOTSUPP;
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+       if (dev->ops->destructor)
+               dev->ops->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
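
A minimal sketch of a device plugging into the contract above (the read handler claims 4-byte accesses by returning 0 and passes everything else on with a non-zero value). The device, its single register and the setup snippet are invented for illustration; only the ops signatures come from the new header:

struct demo_mmio_dev {
	struct kvm_io_device dev;
	u32 reg;
};

static int demo_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
			  gpa_t addr, int len, void *val)
{
	struct demo_mmio_dev *d = container_of(this, struct demo_mmio_dev, dev);

	if (len != 4)
		return -EOPNOTSUPP;	/* not handled, try the next device */
	*(u32 *)val = d->reg;
	return 0;			/* transaction handled */
}

static const struct kvm_io_device_ops demo_mmio_ops = {
	.read	= demo_mmio_read,
};

	/* at setup time: */
	kvm_iodevice_init(&d->dev, &demo_mmio_ops);
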
index c294e3e25e37a50a953a4a0bb3cb1aa66ba904d9..a1b25e35ea5f9fc2978b7f62917c6b4e39c3dc75 100644 (file)
@@ -181,7 +181,9 @@ enum rq_flag_bits {
        __REQ_ELVPRIV,          /* elevator private data attached */
        __REQ_FAILED,           /* set if the request failed */
        __REQ_QUIET,            /* don't worry about errors */
-       __REQ_PREEMPT,          /* set for "ide_preempt" requests */
+       __REQ_PREEMPT,          /* set for "ide_preempt" requests and also
+                                  for requests for which the SCSI "quiesce"
+                                  state must be ignored. */
        __REQ_ALLOCED,          /* request came from our alloc pool */
        __REQ_COPY_USER,        /* contains copies of user pages */
        __REQ_FLUSH_SEQ,        /* request for flush sequence */
index 2e4cb67f6e560094aa719fe75f595dfbb562cf8e..96c280b2c263476c053bdd0c514aa16df3d04212 100644 (file)
@@ -8,64 +8,69 @@
 #ifndef _LINUX_CLOCKCHIPS_H
 #define _LINUX_CLOCKCHIPS_H
 
-/* Clock event notification values */
-enum clock_event_nofitiers {
-       CLOCK_EVT_NOTIFY_ADD,
-       CLOCK_EVT_NOTIFY_BROADCAST_ON,
-       CLOCK_EVT_NOTIFY_BROADCAST_OFF,
-       CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
-       CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
-       CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
-       CLOCK_EVT_NOTIFY_SUSPEND,
-       CLOCK_EVT_NOTIFY_RESUME,
-       CLOCK_EVT_NOTIFY_CPU_DYING,
-       CLOCK_EVT_NOTIFY_CPU_DEAD,
-};
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
 
-#include <linux/clocksource.h>
-#include <linux/cpumask.h>
-#include <linux/ktime.h>
-#include <linux/notifier.h>
+# include <linux/clocksource.h>
+# include <linux/cpumask.h>
+# include <linux/ktime.h>
+# include <linux/notifier.h>
 
 struct clock_event_device;
 struct module;
 
-/* Clock event mode commands */
+/* Clock event mode commands for legacy ->set_mode(): OBSOLETE */
 enum clock_event_mode {
-       CLOCK_EVT_MODE_UNUSED = 0,
+       CLOCK_EVT_MODE_UNUSED,
        CLOCK_EVT_MODE_SHUTDOWN,
        CLOCK_EVT_MODE_PERIODIC,
        CLOCK_EVT_MODE_ONESHOT,
        CLOCK_EVT_MODE_RESUME,
 };
 
+/*
+ * Possible states of a clock event device.
+ *
+ * DETACHED:   Device is not used by clockevents core. Initial state or can be
+ *             reached from SHUTDOWN.
+ * SHUTDOWN:   Device is powered-off. Can be reached from PERIODIC or ONESHOT.
+ * PERIODIC:   Device is programmed to generate events periodically. Can be
+ *             reached from DETACHED or SHUTDOWN.
+ * ONESHOT:    Device is programmed to generate an event only once. Can be reached
+ *             from DETACHED or SHUTDOWN.
+ */
+enum clock_event_state {
+       CLOCK_EVT_STATE_DETACHED,
+       CLOCK_EVT_STATE_SHUTDOWN,
+       CLOCK_EVT_STATE_PERIODIC,
+       CLOCK_EVT_STATE_ONESHOT,
+};
+
 /*
  * Clock event features
  */
-#define CLOCK_EVT_FEAT_PERIODIC                0x000001
-#define CLOCK_EVT_FEAT_ONESHOT         0x000002
-#define CLOCK_EVT_FEAT_KTIME           0x000004
+# define CLOCK_EVT_FEAT_PERIODIC       0x000001
+# define CLOCK_EVT_FEAT_ONESHOT                0x000002
+# define CLOCK_EVT_FEAT_KTIME          0x000004
+
 /*
- * x86(64) specific misfeatures:
+ * x86(64) specific (mis)features:
  *
  * - Clockevent source stops in C3 State and needs broadcast support.
  * - Local APIC timer is used as a dummy device.
  */
-#define CLOCK_EVT_FEAT_C3STOP          0x000008
-#define CLOCK_EVT_FEAT_DUMMY           0x000010
+# define CLOCK_EVT_FEAT_C3STOP         0x000008
+# define CLOCK_EVT_FEAT_DUMMY          0x000010
 
 /*
  * Core shall set the interrupt affinity dynamically in broadcast mode
  */
-#define CLOCK_EVT_FEAT_DYNIRQ          0x000020
-#define CLOCK_EVT_FEAT_PERCPU          0x000040
+# define CLOCK_EVT_FEAT_DYNIRQ         0x000020
+# define CLOCK_EVT_FEAT_PERCPU         0x000040
 
 /*
  * Clockevent device is based on a hrtimer for broadcast
  */
-#define CLOCK_EVT_FEAT_HRTIMER         0x000080
+# define CLOCK_EVT_FEAT_HRTIMER                0x000080
 
 /**
  * struct clock_event_device - clock event device descriptor
@@ -78,10 +83,15 @@ enum clock_event_mode {
  * @min_delta_ns:      minimum delta value in ns
  * @mult:              nanosecond to cycles multiplier
  * @shift:             nanoseconds to cycles divisor (power of two)
- * @mode:              operating mode assigned by the management code
+ * @mode:              operating mode, relevant only to ->set_mode(), OBSOLETE
+ * @state:             current state of the device, assigned by the core code
  * @features:          features
  * @retries:           number of forced programming retries
- * @set_mode:          set mode function
+ * @set_mode:          legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
+ * @set_state_periodic:        switch state to periodic, if !set_mode
+ * @set_state_oneshot: switch state to oneshot, if !set_mode
+ * @set_state_shutdown:        switch state to shutdown, if !set_mode
+ * @tick_resume:       resume clkevt device, if !set_mode
  * @broadcast:         function to broadcast events
  * @min_delta_ticks:   minimum delta value in ticks stored for reconfiguration
  * @max_delta_ticks:   maximum delta value in ticks stored for reconfiguration
@@ -95,22 +105,31 @@ enum clock_event_mode {
  */
 struct clock_event_device {
        void                    (*event_handler)(struct clock_event_device *);
-       int                     (*set_next_event)(unsigned long evt,
-                                                 struct clock_event_device *);
-       int                     (*set_next_ktime)(ktime_t expires,
-                                                 struct clock_event_device *);
+       int                     (*set_next_event)(unsigned long evt, struct clock_event_device *);
+       int                     (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
        ktime_t                 next_event;
        u64                     max_delta_ns;
        u64                     min_delta_ns;
        u32                     mult;
        u32                     shift;
        enum clock_event_mode   mode;
+       enum clock_event_state  state;
        unsigned int            features;
        unsigned long           retries;
 
+       /*
+        * State transition callback(s): Only one of the two groups should be
+        * defined:
+        * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
+        * - set_state_{shutdown|periodic|oneshot}(), tick_resume().
+        */
+       void                    (*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
+       int                     (*set_state_periodic)(struct clock_event_device *);
+       int                     (*set_state_oneshot)(struct clock_event_device *);
+       int                     (*set_state_shutdown)(struct clock_event_device *);
+       int                     (*tick_resume)(struct clock_event_device *);
+
        void                    (*broadcast)(const struct cpumask *mask);
-       void                    (*set_mode)(enum clock_event_mode mode,
-                                           struct clock_event_device *);
        void                    (*suspend)(struct clock_event_device *);
        void                    (*resume)(struct clock_event_device *);
        unsigned long           min_delta_ticks;
@@ -136,18 +155,18 @@ struct clock_event_device {
  *
  * factor = (clock_ticks << shift) / nanoseconds
  */
-static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec,
-                                  int shift)
+static inline unsigned long
+div_sc(unsigned long ticks, unsigned long nsec, int shift)
 {
-       uint64_t tmp = ((uint64_t)ticks) << shift;
+       u64 tmp = ((u64)ticks) << shift;
 
        do_div(tmp, nsec);
+
        return (unsigned long) tmp;
 }
 
 /* Clock event layer functions */
-extern u64 clockevent_delta2ns(unsigned long latch,
-                              struct clock_event_device *evt);
+extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
 extern void clockevents_register_device(struct clock_event_device *dev);
 extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
 
@@ -158,57 +177,42 @@ extern void clockevents_config_and_register(struct clock_event_device *dev,
 
 extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq);
 
-extern void clockevents_exchange_device(struct clock_event_device *old,
-                                       struct clock_event_device *new);
-extern void clockevents_set_mode(struct clock_event_device *dev,
-                                enum clock_event_mode mode);
-extern int clockevents_program_event(struct clock_event_device *dev,
-                                    ktime_t expires, bool force);
-
-extern void clockevents_handle_noop(struct clock_event_device *dev);
-
 static inline void
 clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec)
 {
-       return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC,
-                                     freq, minsec);
+       return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec);
 }
 
 extern void clockevents_suspend(void);
 extern void clockevents_resume(void);
 
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-#ifdef CONFIG_ARCH_HAS_TICK_BROADCAST
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+#  ifdef CONFIG_ARCH_HAS_TICK_BROADCAST
 extern void tick_broadcast(const struct cpumask *mask);
-#else
-#define tick_broadcast NULL
-#endif
+#  else
+#   define tick_broadcast      NULL
+#  endif
 extern int tick_receive_broadcast(void);
-#endif
+# endif
 
-#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
+# if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
 extern void tick_setup_hrtimer_broadcast(void);
 extern int tick_check_broadcast_expired(void);
-#else
+# else
 static inline int tick_check_broadcast_expired(void) { return 0; }
-static inline void tick_setup_hrtimer_broadcast(void) {};
-#endif
+static inline void tick_setup_hrtimer_broadcast(void) { }
+# endif
 
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern int clockevents_notify(unsigned long reason, void *arg);
-#else
-static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
-#endif
-
-#else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */
 
-static inline void clockevents_suspend(void) {}
-static inline void clockevents_resume(void) {}
+#else /* !CONFIG_GENERIC_CLOCKEVENTS: */
 
+static inline void clockevents_suspend(void) { }
+static inline void clockevents_resume(void) { }
 static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; }
 static inline int tick_check_broadcast_expired(void) { return 0; }
-static inline void tick_setup_hrtimer_broadcast(void) {};
+static inline void tick_setup_hrtimer_broadcast(void) { }
 
-#endif
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
-#endif
+#endif /* _LINUX_CLOCKCHIPS_H */
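
A sketch of what a driver converted from the legacy ->set_mode() to the new per-state callbacks could look like; the hardware programming is elided and all names are invented for the example:

static int demo_evt_shutdown(struct clock_event_device *evt)
{
	/* stop the hardware timer */
	return 0;
}

static int demo_evt_set_periodic(struct clock_event_device *evt)
{
	/* program the comparator for a periodic tick */
	return 0;
}

static int demo_evt_next_event(unsigned long delta, struct clock_event_device *evt)
{
	/* arm a one-shot expiry 'delta' cycles from now */
	return 0;
}

static struct clock_event_device demo_evt = {
	.name			= "demo",
	.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_state_shutdown	= demo_evt_shutdown,
	.set_state_periodic	= demo_evt_set_periodic,
	.set_state_oneshot	= demo_evt_shutdown,
	.set_next_event		= demo_evt_next_event,
};
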
index 9c78d15d33e4de979b3d70d6a569064aa5529048..135509821c3994f4083bd2f0eb8f7c05409c026c 100644 (file)
@@ -56,6 +56,7 @@ struct module;
  * @shift:             cycle to nanosecond divisor (power of two)
  * @max_idle_ns:       max idle time permitted by the clocksource (nsecs)
  * @maxadj:            maximum adjustment value to mult (~11%)
+ * @max_cycles:                maximum safe cycle value which won't overflow on multiplication
  * @flags:             flags describing special properties
  * @archdata:          arch-specific data
  * @suspend:           suspend function for the clocksource, if necessary
@@ -76,7 +77,7 @@ struct clocksource {
 #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
        struct arch_clocksource_data archdata;
 #endif
-
+       u64 max_cycles;
        const char *name;
        struct list_head list;
        int rating;
@@ -178,7 +179,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
 }
 
 
-extern int clocksource_register(struct clocksource*);
 extern int clocksource_unregister(struct clocksource*);
 extern void clocksource_touch_watchdog(void);
 extern struct clocksource* clocksource_get_next(void);
@@ -189,7 +189,7 @@ extern struct clocksource * __init clocksource_default_clock(void);
 extern void clocksource_mark_unstable(struct clocksource *cs);
 
 extern u64
-clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask);
+clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cycles);
 extern void
 clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
 
@@ -200,7 +200,16 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
 extern int
 __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq);
 extern void
-__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq);
+__clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq);
+
+/*
+ * Don't call this unless you are a default clocksource
+ * (AKA: jiffies) and absolutely have to.
+ */
+static inline int __clocksource_register(struct clocksource *cs)
+{
+       return __clocksource_register_scale(cs, 1, 0);
+}
 
 static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
 {
@@ -212,14 +221,14 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz)
        return __clocksource_register_scale(cs, 1000, khz);
 }
 
-static inline void __clocksource_updatefreq_hz(struct clocksource *cs, u32 hz)
+static inline void __clocksource_update_freq_hz(struct clocksource *cs, u32 hz)
 {
-       __clocksource_updatefreq_scale(cs, 1, hz);
+       __clocksource_update_freq_scale(cs, 1, hz);
 }
 
-static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz)
+static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz)
 {
-       __clocksource_updatefreq_scale(cs, 1000, khz);
+       __clocksource_update_freq_scale(cs, 1000, khz);
 }
 
 
index 1b45e4a0519b2c34033db91e37fd1f1f0b367f8c..0e41ca0e59275deb33b7c7220dffd1ff39cbb2f7 100644 (file)
@@ -192,29 +192,16 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 
 #include <uapi/linux/types.h>
 
-static __always_inline void data_access_exceeds_word_size(void)
-#ifdef __compiletime_warning
-__compiletime_warning("data access exceeds word size and won't be atomic")
-#endif
-;
-
-static __always_inline void data_access_exceeds_word_size(void)
-{
-}
-
 static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
 {
        switch (size) {
        case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
        case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
        case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
-#ifdef CONFIG_64BIT
        case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
-#endif
        default:
                barrier();
                __builtin_memcpy((void *)res, (const void *)p, size);
-               data_access_exceeds_word_size();
                barrier();
        }
 }
@@ -225,13 +212,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
        case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
        case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
        case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
-#ifdef CONFIG_64BIT
        case 8: *(volatile __u64 *)p = *(__u64 *)res; break;
-#endif
        default:
                barrier();
                __builtin_memcpy((void *)p, (const void *)res, size);
-               data_access_exceeds_word_size();
                barrier();
        }
 }
index 306178d7309f193cb32443f49c888297ffa9573e..9c5e892547961544eae22a53669ba600e5ad4973 100644 (file)
@@ -77,7 +77,6 @@ struct cpuidle_device {
        unsigned int            cpu;
 
        int                     last_residency;
-       int                     state_count;
        struct cpuidle_state_usage      states_usage[CPUIDLE_STATE_MAX];
        struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
        struct cpuidle_driver_kobj *kobj_driver;
index 022e34fcbd1bf6b56cb5f0dbb250a28885797f80..52456aa566a05eded5d61eac974b237e00c26ce9 100644 (file)
@@ -14,6 +14,8 @@
 #include <asm/io.h>
 #include <asm/scatterlist.h>
 
+struct device;
+
 struct dma_pool *dma_pool_create(const char *name, struct device *dev, 
                        size_t size, size_t align, size_t allocation);
 
index cf7e431cbc730dff0ef445dc48989bf5156a09ac..af5be0368dec26c934565e634c0dc803958ed2cc 100644 (file)
@@ -942,6 +942,7 @@ extern int __init efi_setup_pcdp_console(char *);
 #define EFI_64BIT              5       /* Is the firmware 64-bit? */
 #define EFI_PARAVIRT           6       /* Access is via a paravirt interface */
 #define EFI_ARCH_1             7       /* First arch-specific bit */
+#define EFI_DBG                        8       /* Print additional debug info at runtime */
 
 #ifdef CONFIG_EFI
 /*
index b4d71b5e1ff23a2d3f3ec5c481937edd571ead4d..52cc4492cb3a1bcb979124b097fefdcfbc89e390 100644 (file)
@@ -604,6 +604,7 @@ struct inode {
        struct mutex            i_mutex;
 
        unsigned long           dirtied_when;   /* jiffies of first dirtying */
+       unsigned long           dirtied_time_when;
 
        struct hlist_node       i_hash;
        struct list_head        i_wb_list;      /* backing dev IO list */
@@ -1548,7 +1549,7 @@ struct file_operations {
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
-       void (*mremap)(struct file *, struct vm_area_struct *);
+       int (*mremap)(struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
index 2df8e8dd10a483d55d723b6ac4875de807ab2044..21b6d768edd7a4e0f1aee98ed9f437000fa1b996 100644 (file)
@@ -253,21 +253,41 @@ struct obs_kernel_param {
  * obs_kernel_param "array" too far apart in .init.setup.
  */
 #define __setup_param(str, unique_id, fn, early)                       \
-       static const char __setup_str_##unique_id[] __initconst \
-               __aligned(1) = str; \
-       static struct obs_kernel_param __setup_##unique_id      \
-               __used __section(.init.setup)                   \
-               __attribute__((aligned((sizeof(long)))))        \
+       static const char __setup_str_##unique_id[] __initconst         \
+               __aligned(1) = str;                                     \
+       static struct obs_kernel_param __setup_##unique_id              \
+               __used __section(.init.setup)                           \
+               __attribute__((aligned((sizeof(long)))))                \
                = { __setup_str_##unique_id, fn, early }
 
-#define __setup(str, fn)                                       \
+#define __setup(str, fn)                                               \
        __setup_param(str, fn, fn, 0)
 
-/* NOTE: fn is as per module_param, not __setup!  Emits warning if fn
- * returns non-zero. */
-#define early_param(str, fn)                                   \
+/*
+ * NOTE: fn is as per module_param, not __setup!
+ * Emits warning if fn returns non-zero.
+ */
+#define early_param(str, fn)                                           \
        __setup_param(str, fn, fn, 1)
 
+#define early_param_on_off(str_on, str_off, var, config)               \
+                                                                       \
+       int var = IS_ENABLED(config);                                   \
+                                                                       \
+       static int __init parse_##var##_on(char *arg)                   \
+       {                                                               \
+               var = 1;                                                \
+               return 0;                                               \
+       }                                                               \
+       __setup_param(str_on, parse_##var##_on, parse_##var##_on, 1);   \
+                                                                       \
+       static int __init parse_##var##_off(char *arg)                  \
+       {                                                               \
+               var = 0;                                                \
+               return 0;                                               \
+       }                                                               \
+       __setup_param(str_off, parse_##var##_off, parse_##var##_off, 1)
+
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
 void __init parse_early_options(char *cmdline);
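
The new early_param_on_off() helper ties an "on" and an "off" command-line switch to a single variable that defaults to a Kconfig setting. A hypothetical use (the parameter names and CONFIG_ symbol are invented) would be:

/* defaults to the Kconfig value, flipped by "demo_feat" / "nodemo_feat" */
early_param_on_off("demo_feat", "nodemo_feat", demo_feat_enabled, CONFIG_DEMO_FEAT);

	if (demo_feat_enabled)
		pr_info("demo feature enabled\n");
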
index bf3fe719c7ce9d3c0efa3c2449cc0d2a7e5b43ad..47b9ebd4a74fc667601d76476645fefbdb3c36f6 100644 (file)
@@ -38,16 +38,17 @@ bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);
 #endif
 
-void irq_work_run(void);
 void irq_work_tick(void);
 void irq_work_sync(struct irq_work *work);
 
 #ifdef CONFIG_IRQ_WORK
 #include <asm/irq_work.h>
 
+void irq_work_run(void);
 bool irq_work_needs_cpu(void);
 #else
 static inline bool irq_work_needs_cpu(void) { return false; }
+static inline void irq_work_run(void) { }
 #endif
 
 #endif /* _LINUX_IRQ_WORK_H */
index 781974afff9f14e576a7912039a5fb68009cdb25..ffbc034c88104d251c3891f4513c12a62818c5dd 100644 (file)
 #define GICR_PROPBASER_WaWb            (5U << 7)
 #define GICR_PROPBASER_RaWaWt          (6U << 7)
 #define GICR_PROPBASER_RaWaWb          (7U << 7)
+#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
 #define GICR_PROPBASER_IDBITS_MASK     (0x1f)
 
+#define GICR_PENDBASER_NonShareable    (0U << 10)
+#define GICR_PENDBASER_InnerShareable  (1U << 10)
+#define GICR_PENDBASER_OuterShareable  (2U << 10)
+#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
+#define GICR_PENDBASER_nCnB            (0U << 7)
+#define GICR_PENDBASER_nC              (1U << 7)
+#define GICR_PENDBASER_RaWt            (2U << 7)
+#define GICR_PENDBASER_RaWb            (3U << 7)
+#define GICR_PENDBASER_WaWt            (4U << 7)
+#define GICR_PENDBASER_WaWb            (5U << 7)
+#define GICR_PENDBASER_RaWaWt          (6U << 7)
+#define GICR_PENDBASER_RaWaWb          (7U << 7)
+#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+
 /*
  * Re-Distributor registers, offsets from SGI_base
  */
 #define GITS_CBASER_WaWb               (5UL << 59)
 #define GITS_CBASER_RaWaWt             (6UL << 59)
 #define GITS_CBASER_RaWaWb             (7UL << 59)
+#define GITS_CBASER_CACHEABILITY_MASK  (7UL << 59)
 #define GITS_CBASER_NonShareable       (0UL << 10)
 #define GITS_CBASER_InnerShareable     (1UL << 10)
 #define GITS_CBASER_OuterShareable     (2UL << 10)
 #define GITS_BASER_WaWb                        (5UL << 59)
 #define GITS_BASER_RaWaWt              (6UL << 59)
 #define GITS_BASER_RaWaWb              (7UL << 59)
+#define GITS_BASER_CACHEABILITY_MASK   (7UL << 59)
 #define GITS_BASER_TYPE_SHIFT          (56)
 #define GITS_BASER_TYPE(r)             (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
 #define GITS_BASER_ENTRY_SIZE_SHIFT    (48)
index 98f923b6a0eaa78ee8b4ffdf57c87f146e3c03d6..f4de473f226b8ab5332dabc7083b74c80de8a664 100644 (file)
  * same as using STATIC_KEY_INIT_FALSE.
  */
 
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
+# define HAVE_JUMP_LABEL
+#endif
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/bug.h>
@@ -55,7 +61,7 @@ extern bool static_key_initialized;
                                    "%s used before call to jump_label_init", \
                                    __func__)
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
 
 struct static_key {
        atomic_t enabled;
@@ -66,13 +72,18 @@ struct static_key {
 #endif
 };
 
-# include <asm/jump_label.h>
-# define HAVE_JUMP_LABEL
 #else
 struct static_key {
        atomic_t enabled;
 };
-#endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
+#endif /* HAVE_JUMP_LABEL */
+#endif /* __ASSEMBLY__ */
+
+#ifdef HAVE_JUMP_LABEL
+#include <asm/jump_label.h>
+#endif
+
+#ifndef __ASSEMBLY__
 
 enum jump_label_type {
        JUMP_LABEL_DISABLE = 0,
@@ -203,3 +214,5 @@ static inline bool static_key_enabled(struct static_key *key)
 }
 
 #endif /* _LINUX_JUMP_LABEL_H */
+
+#endif /* __ASSEMBLY__ */
index d12b2104d19b422a9e3357f7186d80702807fa3d..82af5d0b996e7fb29edd029379f2ab9238a1b2a2 100644 (file)
@@ -165,12 +165,12 @@ enum kvm_bus {
        KVM_NR_BUSES
 };
 
-int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val);
-int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-                           int len, const void *val, long cookie);
-int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
-                   void *val);
+int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
+                           gpa_t addr, int len, const void *val, long cookie);
+int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+                   int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -658,7 +658,6 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
 void *kvm_kvzalloc(unsigned long size);
-void kvm_kvfree(const void *addr);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
@@ -700,6 +699,20 @@ static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 #endif
 }
 
+#ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED
+/*
+ * returns true if the virtual interrupt controller is initialized and
+ * ready to accept virtual IRQs. On some architectures the virtual interrupt
+ * controller is dynamically instantiated and this is not always true.
+ */
+bool kvm_arch_intc_initialized(struct kvm *kvm);
+#else
+static inline bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+       return true;
+}
+#endif
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
@@ -969,11 +982,16 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
-static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
 {
        return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
 
+static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+       return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
 
 #else
index 7bf01d779b4532a5a8d168d0ff91fd08d5228e02..1ce79a7f1daa18868adfe14598c35206382f58a5 100644 (file)
@@ -4,5 +4,6 @@
 #include <linux/compiler.h>
 
 unsigned long lcm(unsigned long a, unsigned long b) __attribute_const__;
+unsigned long lcm_not_zero(unsigned long a, unsigned long b) __attribute_const__;
 
 #endif /* _LCM_H */
index f279d9c158cd566d8e3c0bd2777094e553f81be0..2782df47101e0cd3ee6ff199ad97f75d640b9308 100644 (file)
@@ -474,16 +474,15 @@ struct zone {
        unsigned long           wait_table_bits;
 
        ZONE_PADDING(_pad1_)
-
-       /* Write-intensive fields used from the page allocator */
-       spinlock_t              lock;
-
        /* free areas of different sizes */
        struct free_area        free_area[MAX_ORDER];
 
        /* zone flags, see below */
        unsigned long           flags;
 
+       /* Write-intensive fields used from the page allocator */
+       spinlock_t              lock;
+
        ZONE_PADDING(_pad2_)
 
        /* Write-intensive fields used by page reclaim */
index dcf6ec27739b1d8bfb98c82e8902ca0b697df5f8..278738873703c119dfa322c13b2755e2176a61f5 100644 (file)
@@ -2185,6 +2185,12 @@ void netdev_freemem(struct net_device *dev);
 void synchronize_net(void);
 int init_dummy_netdev(struct net_device *dev);
 
+DECLARE_PER_CPU(int, xmit_recursion);
+static inline int dev_recursion_level(void)
+{
+       return this_cpu_read(xmit_recursion);
+}
+
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
index dcad7ee0d7466c8e7ffd7a050e814db551c95cac..8dcf6825fa88bbe25c7936797a4322bfd0e24318 100644 (file)
@@ -77,6 +77,7 @@ struct rtc_class_ops {
        int (*read_alarm)(struct device *, struct rtc_wkalrm *);
        int (*set_alarm)(struct device *, struct rtc_wkalrm *);
        int (*proc)(struct device *, struct seq_file *);
+       int (*set_mmss64)(struct device *, time64_t secs);
        int (*set_mmss)(struct device *, unsigned long secs);
        int (*read_callback)(struct device *, int data);
        int (*alarm_irq_enable)(struct device *, unsigned int enabled);
index a419b65770d669c3a51c88a86a145abbcd3db339..3f3308824fa41b473ac77e2927cfdec626bd8c0b 100644 (file)
@@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
+/* Notifier for when a task gets migrated to a new CPU */
+struct task_migration_notifier {
+       struct task_struct *task;
+       int from_cpu;
+       int to_cpu;
+};
+extern void register_task_migration_notifier(struct notifier_block *n);
+
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
@@ -1115,15 +1123,28 @@ struct load_weight {
 };
 
 struct sched_avg {
+       u64 last_runnable_update;
+       s64 decay_count;
+       /*
+        * utilization_avg_contrib describes the amount of time that a
+        * sched_entity is running on a CPU. It is based on running_avg_sum
+        * and is scaled in the range [0..SCHED_LOAD_SCALE].
+        * load_avg_contrib describes the amount of time that a sched_entity
+        * is runnable on a rq. It is based on both runnable_avg_sum and the
+        * weight of the task.
+        */
+       unsigned long load_avg_contrib, utilization_avg_contrib;
        /*
         * These sums represent an infinite geometric series and so are bound
         * above by 1024/(1-y).  Thus we only need a u32 to store them for all
         * choices of y < 1-2^(-32)*1024.
+        * running_avg_sum reflects the time that the sched_entity is
+        * effectively running on the CPU.
+        * runnable_avg_sum represents the amount of time a sched_entity is on
+        * a runqueue which includes the running time that is monitored by
+        * running_avg_sum.
         */
-       u32 runnable_avg_sum, runnable_avg_period;
-       u64 last_runnable_update;
-       s64 decay_count;
-       unsigned long load_avg_contrib;
+       u32 runnable_avg_sum, avg_period, running_avg_sum;
 };
 
 #ifdef CONFIG_SCHEDSTATS
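
A sketch of a subscriber for the new migration notifier declared above; the callback and variable names are invented for the example:

static int demo_migration_cb(struct notifier_block *nb,
			     unsigned long unused, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("task %d moved from CPU %d to CPU %d\n",
		 tmn->task->pid, tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block demo_migration_nb = {
	.notifier_call = demo_migration_cb,
};

	/* somewhere during init: */
	register_task_migration_notifier(&demo_migration_nb);
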
index f5df8f687b4d097dd7855e3670a2b790cb380d5d..5f68d0a391cee8506f8e0d94cda72d8bd357b10f 100644 (file)
@@ -108,7 +108,7 @@ static inline unsigned __read_seqcount_begin(const seqcount_t *s)
        unsigned ret;
 
 repeat:
-       ret = ACCESS_ONCE(s->sequence);
+       ret = READ_ONCE(s->sequence);
        if (unlikely(ret & 1)) {
                cpu_relax();
                goto repeat;
@@ -127,7 +127,7 @@ repeat:
  */
 static inline unsigned raw_read_seqcount(const seqcount_t *s)
 {
-       unsigned ret = ACCESS_ONCE(s->sequence);
+       unsigned ret = READ_ONCE(s->sequence);
        smp_rmb();
        return ret;
 }
@@ -179,7 +179,7 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s)
  */
 static inline unsigned raw_seqcount_begin(const seqcount_t *s)
 {
-       unsigned ret = ACCESS_ONCE(s->sequence);
+       unsigned ret = READ_ONCE(s->sequence);
        smp_rmb();
        return ret & ~1;
 }
index f4aec0e75c3a268cebeecd2dcdc43fc68126a27a..076af437284d59f9b2597e9d6f448804cf56fcb3 100644 (file)
@@ -19,3 +19,12 @@ enum {
 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
 #endif
 #endif
+
+/**
+ * offsetofend(TYPE, MEMBER)
+ *
+ * @TYPE: The type of the structure
+ * @MEMBER: The member within the structure to get the end offset of
+ */
+#define offsetofend(TYPE, MEMBER) \
+       (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
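
The macro's main use (carried over from VFIO, where it originated) is validating variable-sized structures copied from user space. A hypothetical ioctl handler, with an invented structure, might do:

struct demo_arg {
	__u32 argsz;
	__u32 flags;
	__u64 data;		/* newer fields are appended here */
};

	unsigned long minsz = offsetofend(struct demo_arg, flags);

	if (copy_from_user(&arg, (void __user *)uarg, minsz))
		return -EFAULT;
	if (arg.argsz < minsz)
		return -EINVAL;
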
index c57d8ea0716cddea1419a37481dbd7704b230d65..59a7889e15db51c3b24bc771dbe5d0b1c965212d 100644 (file)
@@ -60,17 +60,17 @@ struct rpc_xprt;
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 void           rpc_register_sysctl(void);
 void           rpc_unregister_sysctl(void);
-int            sunrpc_debugfs_init(void);
+void           sunrpc_debugfs_init(void);
 void           sunrpc_debugfs_exit(void);
-int            rpc_clnt_debugfs_register(struct rpc_clnt *);
+void           rpc_clnt_debugfs_register(struct rpc_clnt *);
 void           rpc_clnt_debugfs_unregister(struct rpc_clnt *);
-int            rpc_xprt_debugfs_register(struct rpc_xprt *);
+void           rpc_xprt_debugfs_register(struct rpc_xprt *);
 void           rpc_xprt_debugfs_unregister(struct rpc_xprt *);
 #else
-static inline int
+static inline void
 sunrpc_debugfs_init(void)
 {
-       return 0;
+       return;
 }
 
 static inline void
@@ -79,10 +79,10 @@ sunrpc_debugfs_exit(void)
        return;
 }
 
-static inline int
+static inline void
 rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 {
-       return 0;
+       return;
 }
 
 static inline void
@@ -91,10 +91,10 @@ rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt)
        return;
 }
 
-static inline int
+static inline void
 rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 {
-       return 0;
+       return;
 }
 
 static inline void
index 9c085dc12ae92626e3d6ae831df435a82f188c08..f8492da57ad32e7607e320510a24ac0165dcd8cd 100644 (file)
@@ -1,7 +1,5 @@
-/*  linux/include/linux/tick.h
- *
- *  This file contains the structure definitions for tick related functions
- *
+/*
+ * Tick related global functions
  */
 #ifndef _LINUX_TICK_H
 #define _LINUX_TICK_H
 #include <linux/clockchips.h>
 #include <linux/irqflags.h>
 #include <linux/percpu.h>
-#include <linux/hrtimer.h>
 #include <linux/context_tracking_state.h>
 #include <linux/cpumask.h>
 #include <linux/sched.h>
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-
-enum tick_device_mode {
-       TICKDEV_MODE_PERIODIC,
-       TICKDEV_MODE_ONESHOT,
-};
-
-struct tick_device {
-       struct clock_event_device *evtdev;
-       enum tick_device_mode mode;
-};
-
-enum tick_nohz_mode {
-       NOHZ_MODE_INACTIVE,
-       NOHZ_MODE_LOWRES,
-       NOHZ_MODE_HIGHRES,
-};
-
-/**
- * struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer:       hrtimer to schedule the periodic tick in high
- *                     resolution mode
- * @last_tick:         Store the last tick expiry time when the tick
- *                     timer is modified for nohz sleeps. This is necessary
- *                     to resume the tick timer operation in the timeline
- *                     when the CPU returns from nohz sleep.
- * @tick_stopped:      Indicator that the idle tick has been stopped
- * @idle_jiffies:      jiffies at the entry to idle for idle time accounting
- * @idle_calls:                Total number of idle calls
- * @idle_sleeps:       Number of idle calls, where the sched tick was stopped
- * @idle_entrytime:    Time when the idle call was entered
- * @idle_waketime:     Time when the idle was interrupted
- * @idle_exittime:     Time when the idle state was left
- * @idle_sleeptime:    Sum of the time slept in idle with sched tick stopped
- * @iowait_sleeptime:  Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length:      Duration of the current idle sleep
- * @do_timer_lst:      CPU was the last one doing do_timer before going idle
- */
-struct tick_sched {
-       struct hrtimer                  sched_timer;
-       unsigned long                   check_clocks;
-       enum tick_nohz_mode             nohz_mode;
-       ktime_t                         last_tick;
-       int                             inidle;
-       int                             tick_stopped;
-       unsigned long                   idle_jiffies;
-       unsigned long                   idle_calls;
-       unsigned long                   idle_sleeps;
-       int                             idle_active;
-       ktime_t                         idle_entrytime;
-       ktime_t                         idle_waketime;
-       ktime_t                         idle_exittime;
-       ktime_t                         idle_sleeptime;
-       ktime_t                         iowait_sleeptime;
-       ktime_t                         sleep_length;
-       unsigned long                   last_jiffies;
-       unsigned long                   next_jiffies;
-       ktime_t                         idle_expires;
-       int                             do_timer_last;
-};
-
 extern void __init tick_init(void);
-extern int tick_is_oneshot_available(void);
-extern struct tick_device *tick_get_device(int cpu);
-
 extern void tick_freeze(void);
 extern void tick_unfreeze(void);
+/* Should be core only, but ARM BL switcher requires it */
+extern void tick_suspend_local(void);
+/* Should be core only, but XEN resume magic and ARM BL switcher require it */
+extern void tick_resume_local(void);
+extern void tick_handover_do_timer(void);
+extern void tick_cleanup_dead_cpu(int cpu);
+#else /* CONFIG_GENERIC_CLOCKEVENTS */
+static inline void tick_init(void) { }
+static inline void tick_freeze(void) { }
+static inline void tick_unfreeze(void) { }
+static inline void tick_suspend_local(void) { }
+static inline void tick_resume_local(void) { }
+static inline void tick_handover_do_timer(void) { }
+static inline void tick_cleanup_dead_cpu(int cpu) { }
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
-# ifdef CONFIG_HIGH_RES_TIMERS
-extern int tick_init_highres(void);
-extern int tick_program_event(ktime_t expires, int force);
-extern void tick_setup_sched_timer(void);
-# endif
-
-# if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
-extern void tick_cancel_sched_timer(int cpu);
-# else
-static inline void tick_cancel_sched_timer(int cpu) { }
-# endif
-
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern struct tick_device *tick_get_broadcast_device(void);
-extern struct cpumask *tick_get_broadcast_mask(void);
-
-#  ifdef CONFIG_TICK_ONESHOT
-extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
-#  endif
-
-# endif /* BROADCAST */
-
-# ifdef CONFIG_TICK_ONESHOT
-extern void tick_clock_notify(void);
-extern int tick_check_oneshot_change(int allow_nohz);
-extern struct tick_sched *tick_get_tick_sched(int cpu);
+#ifdef CONFIG_TICK_ONESHOT
 extern void tick_irq_enter(void);
-extern int tick_oneshot_mode_active(void);
 #  ifndef arch_needs_cpu
 #   define arch_needs_cpu() (0)
 #  endif
 # else
-static inline void tick_clock_notify(void) { }
-static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
 static inline void tick_irq_enter(void) { }
-static inline int tick_oneshot_mode_active(void) { return 0; }
-# endif
+#endif
 
-#else /* CONFIG_GENERIC_CLOCKEVENTS */
-static inline void tick_init(void) { }
-static inline void tick_freeze(void) { }
-static inline void tick_unfreeze(void) { }
-static inline void tick_cancel_sched_timer(int cpu) { }
-static inline void tick_clock_notify(void) { }
-static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
-static inline void tick_irq_enter(void) { }
-static inline int tick_oneshot_mode_active(void) { return 0; }
-#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
+extern void hotplug_cpu__broadcast_tick_pull(int dead_cpu);
+#else
+static inline void hotplug_cpu__broadcast_tick_pull(int dead_cpu) { }
+#endif
 
-# ifdef CONFIG_NO_HZ_COMMON
-DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched);
+enum tick_broadcast_mode {
+       TICK_BROADCAST_OFF,
+       TICK_BROADCAST_ON,
+       TICK_BROADCAST_FORCE,
+};
+
+enum tick_broadcast_state {
+       TICK_BROADCAST_EXIT,
+       TICK_BROADCAST_ENTER,
+};
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern void tick_broadcast_control(enum tick_broadcast_mode mode);
+#else
+static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { }
+#endif /* BROADCAST */
+
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
+extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; }
+#endif
 
-static inline int tick_nohz_tick_stopped(void)
+static inline void tick_broadcast_enable(void)
+{
+       tick_broadcast_control(TICK_BROADCAST_ON);
+}
+static inline void tick_broadcast_disable(void)
+{
+       tick_broadcast_control(TICK_BROADCAST_OFF);
+}
+static inline void tick_broadcast_force(void)
+{
+       tick_broadcast_control(TICK_BROADCAST_FORCE);
+}
+static inline int tick_broadcast_enter(void)
 {
-       return __this_cpu_read(tick_cpu_sched.tick_stopped);
+       return tick_broadcast_oneshot_control(TICK_BROADCAST_ENTER);
+}
+static inline void tick_broadcast_exit(void)
+{
+       tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT);
 }
 
+#ifdef CONFIG_NO_HZ_COMMON
+extern int tick_nohz_tick_stopped(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
-
-# else /* !CONFIG_NO_HZ_COMMON */
-static inline int tick_nohz_tick_stopped(void)
-{
-       return 0;
-}
-
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline int tick_nohz_tick_stopped(void) { return 0; }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
@@ -163,7 +111,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
-# endif /* !CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool tick_nohz_full_running;
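
A sketch of how a cpuidle ->enter() hook for a timer-stopping C-state would use the new broadcast helpers in place of clockevents_notify(); the low-power entry call is a placeholder and the function names are invented:

static int demo_deep_idle(struct cpuidle_device *dev,
			  struct cpuidle_driver *drv, int index)
{
	if (tick_broadcast_enter())
		return -EBUSY;		/* no broadcast device, refuse the state */

	cpu_do_idle();			/* placeholder for the real low-power entry */

	tick_broadcast_exit();
	return index;
}
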
index 05af9a3348934602456ec18b9cd29df84ac5a167..fb86963859c772846dfc531fc9cc8c0825f36ac7 100644 (file)
  * @read:      Read function of @clock
  * @mask:      Bitmask for two's complement subtraction of non 64bit clocks
  * @cycle_last: @clock cycle value at last update
- * @mult:      NTP adjusted multiplier for scaled math conversion
+ * @mult:      (NTP adjusted) multiplier for scaled math conversion
  * @shift:     Shift value for scaled math conversion
  * @xtime_nsec: Shifted (fractional) nano seconds offset for readout
- * @base_mono:  ktime_t (nanoseconds) base time for readout
+ * @base:      ktime_t (nanoseconds) base time for readout
  *
  * This struct has size 56 byte on 64 bit. Together with a seqcount it
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessor to clock monotonic.
+ * for fast NMI safe accessors.
  */
 struct tk_read_base {
        struct clocksource      *clock;
@@ -35,12 +35,13 @@ struct tk_read_base {
        u32                     mult;
        u32                     shift;
        u64                     xtime_nsec;
-       ktime_t                 base_mono;
+       ktime_t                 base;
 };
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr:               The readout base structure
+ * @tkr_mono:          The readout base structure for CLOCK_MONOTONIC
+ * @tkr_raw:           The readout base structure for CLOCK_MONOTONIC_RAW
  * @xtime_sec:         Current CLOCK_REALTIME time in seconds
  * @ktime_sec:         Current CLOCK_MONOTONIC time in seconds
  * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
@@ -48,7 +49,6 @@ struct tk_read_base {
  * @offs_boot:         Offset clock monotonic -> clock boottime
  * @offs_tai:          Offset clock monotonic -> clock tai
  * @tai_offset:                The current UTC to TAI offset in seconds
- * @base_raw:          Monotonic raw base time in ktime_t format
  * @raw_time:          Monotonic raw base time in timespec64 format
  * @cycle_interval:    Number of clock cycles in one NTP interval
  * @xtime_interval:    Number of clock shifted nano seconds in one NTP
@@ -76,7 +76,8 @@ struct tk_read_base {
  * used instead.
  */
 struct timekeeper {
-       struct tk_read_base     tkr;
+       struct tk_read_base     tkr_mono;
+       struct tk_read_base     tkr_raw;
        u64                     xtime_sec;
        unsigned long           ktime_sec;
        struct timespec64       wall_to_monotonic;
@@ -84,7 +85,6 @@ struct timekeeper {
        ktime_t                 offs_boot;
        ktime_t                 offs_tai;
        s32                     tai_offset;
-       ktime_t                 base_raw;
        struct timespec64       raw_time;
 
        /* The following members are for timekeeping internal use */
index 3eaae47542751962579a3c6736f18917e4da7ad3..99176af216af449563e3a190b96edc04ea1a1f9e 100644 (file)
@@ -214,12 +214,18 @@ static inline u64 ktime_get_boot_ns(void)
        return ktime_to_ns(ktime_get_boottime());
 }
 
+static inline u64 ktime_get_tai_ns(void)
+{
+       return ktime_to_ns(ktime_get_clocktai());
+}
+
 static inline u64 ktime_get_raw_ns(void)
 {
        return ktime_to_ns(ktime_get_raw());
 }
 
 extern u64 ktime_get_mono_fast_ns(void);
+extern u64 ktime_get_raw_fast_ns(void);
 
 /*
  * Timespec interfaces utilizing the ktime based ones
@@ -242,6 +248,9 @@ static inline void timekeeping_clocktai(struct timespec *ts)
 /*
  * RTC specific
  */
+extern bool timekeeping_rtc_skipsuspend(void);
+extern bool timekeeping_rtc_skipresume(void);
+
 extern void timekeeping_inject_sleeptime64(struct timespec64 *delta);
 
 /*
@@ -253,17 +262,14 @@ extern void getnstime_raw_and_real(struct timespec *ts_raw,
 /*
  * Persistent clock related interfaces
  */
-extern bool persistent_clock_exist;
 extern int persistent_clock_is_local;
 
-static inline bool has_persistent_clock(void)
-{
-       return persistent_clock_exist;
-}
-
 extern void read_persistent_clock(struct timespec *ts);
+extern void read_persistent_clock64(struct timespec64 *ts);
 extern void read_boot_clock(struct timespec *ts);
+extern void read_boot_clock64(struct timespec64 *ts);
 extern int update_persistent_clock(struct timespec now);
+extern int update_persistent_clock64(struct timespec64 now);
 
 
 #endif
index d9a4905e01d0c98b88e89c7db5c85bbf7d5be8a1..6e0ce8c7b8cb5a9fcb985a5a5078f82267d03092 100644 (file)
@@ -227,9 +227,23 @@ struct skb_data {  /* skb->cb is one of these */
        struct urb              *urb;
        struct usbnet           *dev;
        enum skb_state          state;
-       size_t                  length;
+       long                    length;
+       unsigned long           packets;
 };
 
+/* Drivers that set FLAG_MULTI_PACKET must call this in their
+ * tx_fixup method before returning an skb.
+ */
+static inline void
+usbnet_set_skb_tx_stats(struct sk_buff *skb,
+                       unsigned long packets, long bytes_delta)
+{
+       struct skb_data *entry = (struct skb_data *) skb->cb;
+
+       entry->packets = packets;
+       entry->length = bytes_delta;
+}
+
 extern int usbnet_open(struct net_device *net);
 extern int usbnet_stop(struct net_device *net);
 extern netdev_tx_t usbnet_start_xmit(struct sk_buff *skb,
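A hedged sketch of how a FLAG_MULTI_PACKET minidriver's tx_fixup callback would use the helper introduced above; the driver name, aggregation step and byte correction are hypothetical, only the tx_fixup signature and the usbnet_set_skb_tx_stats() call come from the usbnet interface:

#include <linux/usb/usbnet.h>

static struct sk_buff *my_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
				   gfp_t flags)
{
	unsigned long n_packets = 1;	/* frames aggregated into this urb */
	long bytes_delta = 0;		/* correction to the accounted length */

	/* ... pad or aggregate the skb as the hardware requires ... */

	/* Record per-urb stats so the completion path can account
	 * multi-packet transfers correctly. */
	usbnet_set_skb_tx_stats(skb, n_packets, bytes_delta);
	return skb;
}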
index 2d67b8998fd8b49d877d65b0b94a022be47d4e28..049b2f497bc79cac035faac38e4b808d9daf9af4 100644 (file)
@@ -78,19 +78,6 @@ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
 extern void vfio_unregister_iommu_driver(
                                const struct vfio_iommu_driver_ops *ops);
 
-/**
- * offsetofend(TYPE, MEMBER)
- *
- * @TYPE: The type of the structure
- * @MEMBER: The member within the structure to get the end offset of
- *
- * Simple helper macro for dealing with variable sized structures passed
- * from user space.  This allows us to easily determine if the provided
- * structure is sized to include various fields.
- */
-#define offsetofend(TYPE, MEMBER) \
-       (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
-
 /*
  * External user API
  */
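The kerneldoc removed above describes the pattern offsetofend() supports: validating that a variable-sized, userspace-provided structure is at least large enough to contain a given member. A hedged illustration with a hypothetical argument struct, not taken from this diff:

#include <linux/errno.h>
#include <linux/types.h>

struct my_ioctl_arg {
	__u32 argsz;	/* caller-reported size of the structure */
	__u32 flags;
	__u64 data;
};

static int my_check_argsz(const struct my_ioctl_arg *arg)
{
	/* Callers built against an older, shorter layout are accepted as
	 * long as they provide everything up to and including "flags". */
	if (arg->argsz < offsetofend(struct my_ioctl_arg, flags))
		return -EINVAL;
	return 0;
}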
index 00048339c23e4f252ee6a4b15cd38b49b8032de4..b2dd371ec0ca0aa6f0e1dc71d1f79c69c6dbdb2b 100644 (file)
@@ -130,6 +130,7 @@ extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
+extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
@@ -146,6 +147,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write,
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos);
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+                              void __user *buffer, size_t *lenp, loff_t *ppos);
 
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
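dirtytime_interval_handler() is the proc handler backing the new knob; a hedged sketch of the sysctl table entry that would consume these declarations. The "dirtytime_expire_seconds" procname and 0644 mode are assumptions based on the usual vm sysctl conventions, not taken from this hunk:

#include <linux/sysctl.h>
#include <linux/writeback.h>

static struct ctl_table my_vm_table[] = {
	{
		.procname	= "dirtytime_expire_seconds",
		.data		= &dirtytime_expire_interval,
		.maxlen		= sizeof(dirtytime_expire_interval),
		.mode		= 0644,
		.proc_handler	= dirtytime_interval_handler,
	},
	{ }	/* sentinel */
};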
index c2e570336269b8e2e3c574c607afc651d8f1a6eb..6008b0985b7b18a5e15c157a487ccfb6b640b312 100644 (file)
 #define                ISI_CFG1_FRATE_DIV_MASK         (7 << 8)
 #define ISI_CFG1_DISCR                         (1 << 11)
 #define ISI_CFG1_FULL_MODE                     (1 << 12)
+/* Definition for THMASK(ISI_V2) */
+#define                ISI_CFG1_THMASK_BEATS_4         (0 << 13)
+#define                ISI_CFG1_THMASK_BEATS_8         (1 << 13)
+#define                ISI_CFG1_THMASK_BEATS_16        (2 << 13)
 
 /* Bitfields in CFG2 */
 #define ISI_CFG2_GRAYSCALE                     (1 << 13)
index 025c61c0dffbfe9ddfcc2cd6f9eb38b2af504d3f..6cc1eafb153a75c4665f375836f5592c282f9a8f 100644 (file)
@@ -453,22 +453,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk)
 
 #endif
 
-static inline int sk_mc_loop(struct sock *sk)
-{
-       if (!sk)
-               return 1;
-       switch (sk->sk_family) {
-       case AF_INET:
-               return inet_sk(sk)->mc_loop;
-#if IS_ENABLED(CONFIG_IPV6)
-       case AF_INET6:
-               return inet6_sk(sk)->mc_loop;
-#endif
-       }
-       WARN_ON(1);
-       return 1;
-}
-
 bool ip_call_ra_chain(struct sk_buff *skb);
 
 /*
index 1d09b46c1e489325b95f9987327d95ca8affed08..eda131d179d971147f5b7ac254876dbf07946861 100644 (file)
@@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
-       struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+       struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+                               inet6_sk(skb->sk) : NULL;
 
        return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
index ab186b1d31fffe7cc5b4888d85aca99c2d875da4..e4079c28e6b8588da92a8826d63e07195859a33b 100644 (file)
@@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+bool sk_mc_loop(struct sock *sk);
+
 static inline bool sk_can_gso(const struct sock *sk)
 {
        return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
index b0a81307985282005ade90ee6da96e3048c405d3..2f62ab2d7bf99bd75b16eef6db74ce8aa233e8ff 100644 (file)
@@ -973,7 +973,8 @@ struct input_keymap_entry {
  */
 #define MT_TOOL_FINGER         0
 #define MT_TOOL_PEN            1
-#define MT_TOOL_MAX            1
+#define MT_TOOL_PALM           2
+#define MT_TOOL_MAX            2
 
 /*
  * Values describing the status of a force-feedback effect
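With MT_TOOL_PALM defined (and MT_TOOL_MAX raised to match), a multitouch driver that can classify contacts may report palms per slot. A hedged sketch using the in-kernel MT slot helpers; the wrapper function and is_palm flag are illustrative:

#include <linux/input/mt.h>

static void my_report_contact(struct input_dev *input, int slot,
			      bool active, bool is_palm)
{
	input_mt_slot(input, slot);
	/* ABS_MT_TOOL_TYPE becomes MT_TOOL_PALM for palm contacts,
	 * MT_TOOL_FINGER otherwise. */
	input_mt_report_slot_state(input,
				   is_palm ? MT_TOOL_PALM : MT_TOOL_FINGER,
				   active);
}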
index 805570650062e99dfbb20edbab759e1322462170..f574d7be7631e5795f26a8934391b8b6f1473f8e 100644 (file)
@@ -147,6 +147,16 @@ struct kvm_pit_config {
 
 #define KVM_PIT_SPEAKER_DUMMY     1
 
+struct kvm_s390_skeys {
+       __u64 start_gfn;
+       __u64 count;
+       __u64 skeydata_addr;
+       __u32 flags;
+       __u32 reserved[9];
+};
+#define KVM_S390_GET_SKEYS_NONE   1
+#define KVM_S390_SKEYS_MAX        1048576
+
 #define KVM_EXIT_UNKNOWN          0
 #define KVM_EXIT_EXCEPTION        1
 #define KVM_EXIT_IO               2
@@ -172,6 +182,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_S390_TSCH        22
 #define KVM_EXIT_EPR              23
 #define KVM_EXIT_SYSTEM_EVENT     24
+#define KVM_EXIT_S390_STSI        25
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -309,6 +320,15 @@ struct kvm_run {
                        __u32 type;
                        __u64 flags;
                } system_event;
+               /* KVM_EXIT_S390_STSI */
+               struct {
+                       __u64 addr;
+                       __u8 ar;
+                       __u8 reserved;
+                       __u8 fc;
+                       __u8 sel1;
+                       __u16 sel2;
+               } s390_stsi;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -324,7 +344,7 @@ struct kvm_run {
        __u64 kvm_dirty_regs;
        union {
                struct kvm_sync_regs regs;
-               char padding[1024];
+               char padding[2048];
        } s;
 };
 
@@ -365,6 +385,24 @@ struct kvm_translation {
        __u8  pad[5];
 };
 
+/* for KVM_S390_MEM_OP */
+struct kvm_s390_mem_op {
+       /* in */
+       __u64 gaddr;            /* the guest address */
+       __u64 flags;            /* flags */
+       __u32 size;             /* amount of bytes */
+       __u32 op;               /* type of operation */
+       __u64 buf;              /* buffer in userspace */
+       __u8 ar;                /* the access register number */
+       __u8 reserved[31];      /* should be set to 0 */
+};
+/* types for kvm_s390_mem_op->op */
+#define KVM_S390_MEMOP_LOGICAL_READ    0
+#define KVM_S390_MEMOP_LOGICAL_WRITE   1
+/* flags for kvm_s390_mem_op->flags */
+#define KVM_S390_MEMOP_F_CHECK_ONLY            (1ULL << 0)
+#define KVM_S390_MEMOP_F_INJECT_EXCEPTION      (1ULL << 1)
+
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
        /* in */
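A hedged userspace sketch of driving the new KVM_S390_MEM_OP ioctl defined above; it assumes a vCPU file descriptor obtained through the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence and that KVM_CAP_S390_MEM_OP has been probed, and it only touches fields shown in struct kvm_s390_mem_op:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read "len" bytes from guest logical address "gaddr" into "buf",
 * using access register 0.  Returns the raw ioctl result. */
static int read_guest_mem(int vcpu_fd, __u64 gaddr, void *buf, __u32 len)
{
	struct kvm_s390_mem_op op;

	memset(&op, 0, sizeof(op));		/* reserved[] should be 0 */
	op.gaddr = gaddr;
	op.size  = len;
	op.op    = KVM_S390_MEMOP_LOGICAL_READ;
	op.buf   = (__u64)(unsigned long)buf;
	op.ar    = 0;

	return ioctl(vcpu_fd, KVM_S390_MEM_OP, &op);
}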
@@ -520,6 +558,13 @@ struct kvm_s390_irq {
        } u;
 };
 
+struct kvm_s390_irq_state {
+       __u64 buf;
+       __u32 flags;
+       __u32 len;
+       __u32 reserved[4];
+};
+
 /* for KVM_SET_GUEST_DEBUG */
 
 #define KVM_GUESTDBG_ENABLE            0x00000001
@@ -760,6 +805,14 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_ENABLE_HCALL 104
 #define KVM_CAP_CHECK_EXTENSION_VM 105
 #define KVM_CAP_S390_USER_SIGP 106
+#define KVM_CAP_S390_VECTOR_REGISTERS 107
+#define KVM_CAP_S390_MEM_OP 108
+#define KVM_CAP_S390_USER_STSI 109
+#define KVM_CAP_S390_SKEYS 110
+#define KVM_CAP_MIPS_FPU 111
+#define KVM_CAP_MIPS_MSA 112
+#define KVM_CAP_S390_INJECT_IRQ 113
+#define KVM_CAP_S390_IRQ_STATE 114
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1135,6 +1188,16 @@ struct kvm_s390_ucas_mapping {
 #define KVM_ARM_VCPU_INIT        _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
 #define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
 #define KVM_GET_REG_LIST         _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Available with KVM_CAP_S390_MEM_OP */
+#define KVM_S390_MEM_OP                  _IOW(KVMIO,  0xb1, struct kvm_s390_mem_op)
+/* Available with KVM_CAP_S390_SKEYS */
+#define KVM_S390_GET_SKEYS      _IOW(KVMIO, 0xb2, struct kvm_s390_skeys)
+#define KVM_S390_SET_SKEYS      _IOW(KVMIO, 0xb3, struct kvm_s390_skeys)
+/* Available with KVM_CAP_S390_INJECT_IRQ */
+#define KVM_S390_IRQ              _IOW(KVMIO,  0xb4, struct kvm_s390_irq)
+/* Available with KVM_CAP_S390_IRQ_STATE */
+#define KVM_S390_SET_IRQ_STATE   _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
+#define KVM_S390_GET_IRQ_STATE   _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
index 4742f2cb42f2bd46180421efef423f101579bd03..d3bd6ffec04101138e831076789fc11cee98f2be 100644 (file)
@@ -47,7 +47,7 @@
  * exported filesystem.
  */
 #define        NFSEXP_V4ROOT           0x10000
-#define NFSEXP_NOPNFS          0x20000
+#define NFSEXP_PNFS            0x20000
 
 /* All flags that we claim to support.  (Note we don't support NOACL.) */
 #define NFSEXP_ALLFLAGS                0x3FE7F
index 1972b161c61e98fbe3e3ce003744cf1d2e8c5b1c..82eea9c5af61c2922a221dfc59885b7409305cff 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
                return err;
 
        cpu_notify(CPU_DYING | param->mod, param->hcpu);
+       /* Give up timekeeping duties */
+       tick_handover_do_timer();
        /* Park the stopper thread */
        kthread_park(current);
        return 0;
@@ -411,10 +414,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        while (!idle_cpu(cpu))
                cpu_relax();
 
+       hotplug_cpu__broadcast_tick_pull(cpu);
        /* This actually kills the CPU. */
        __cpu_die(cpu);
 
        /* CPU is completely dead: tell everyone.  Too late to complain. */
+       tick_cleanup_dead_cpu(cpu);
        cpu_notify_nofail(CPU_DEAD | mod, hcpu);
 
        check_for_tasks(cpu);
index 2a5e3830e953b72cca4aeac4ec537e276f0f355b..2579e407ff67d039106207f78a466f824e515db6 100644 (file)
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        if (!p)
                return -ESRCH;
 
-       if (!p->mm) {
+       if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
                return -EPERM;
        }
index d1fe2ba5bac958bc85da8e8868408d8c6c809dc3..75e114bdf3f26f379c4382dce2bc5c128c06b868 100644 (file)
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
                 */
                return;
        }
-       ACCESS_ONCE(prev->next) = node;
+       WRITE_ONCE(prev->next, node);
 
        /* Wait until the lock holder passes the lock down. */
        arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 static inline
 void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 {
-       struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+       struct mcs_spinlock *next = READ_ONCE(node->next);
 
        if (likely(!next)) {
                /*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
                if (likely(cmpxchg(lock, node, NULL) == node))
                        return;
                /* Wait until the next pointer is set */
-               while (!(next = ACCESS_ONCE(node->next)))
+               while (!(next = READ_ONCE(node->next)))
                        cpu_relax_lowlatency();
        }
 
index 94674e5919cba54e339addf0c7c7cf2b90f75c27..4cccea6b8934f5697fa3dfa609ae4bd10db78b6f 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
 
 /*
  * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
 }
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-       if (lock->owner != owner)
-               return false;
-
-       /*
-        * Ensure we emit the owner->on_cpu, dereference _after_ checking
-        * lock->owner still matches owner, if that fails, owner might
-        * point to free()d memory, if it still matches, the rcu_read_lock()
-        * ensures the memory stays valid.
-        */
-       barrier();
-
-       return owner->on_cpu;
-}
-
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
  */
 static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 {
+       bool ret = true;
+
        rcu_read_lock();
-       while (owner_running(lock, owner)) {
-               if (need_resched())
+       while (lock->owner == owner) {
+               /*
+                * Ensure we emit the owner->on_cpu, dereference _after_
+                * checking lock->owner still matches owner. If that fails,
+                * owner might point to freed memory. If it still matches,
+                * the rcu_read_lock() ensures the memory stays valid.
+                */
+               barrier();
+
+               if (!owner->on_cpu || need_resched()) {
+                       ret = false;
                        break;
+               }
 
                cpu_relax_lowlatency();
        }
        rcu_read_unlock();
 
-       /*
-        * We break out the loop above on need_resched() and when the
-        * owner changed, which is a sign for heavy contention. Return
-        * success only when lock->owner is NULL.
-        */
-       return lock->owner == NULL;
+       return ret;
 }
 
 /*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
                return 0;
 
        rcu_read_lock();
-       owner = ACCESS_ONCE(lock->owner);
+       owner = READ_ONCE(lock->owner);
        if (owner)
                retval = owner->on_cpu;
        rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
                         * As such, when deadlock detection needs to be
                         * performed the optimistic spinning cannot be done.
                         */
-                       if (ACCESS_ONCE(ww->ctx))
+                       if (READ_ONCE(ww->ctx))
                                break;
                }
 
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
                 * If there's an owner, wait for it to either
                 * release the lock or go to sleep.
                 */
-               owner = ACCESS_ONCE(lock->owner);
+               owner = READ_ONCE(lock->owner);
                if (owner && !mutex_spin_on_owner(lock, owner))
                        break;
 
@@ -490,7 +481,7 @@ static inline int __sched
 __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 {
        struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+       struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
 
        if (!hold_ctx)
                return 0;
index c112d00341b05773934ecdb2977c0d2aca1d5c11..dc85ee23a26f79416a140241e3067a5a2ca24d0b 100644 (file)
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 
        prev = decode_cpu(old);
        node->prev = prev;
-       ACCESS_ONCE(prev->next) = node;
+       WRITE_ONCE(prev->next, node);
 
        /*
         * Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
         * cmpxchg in an attempt to undo our queueing.
         */
 
-       while (!ACCESS_ONCE(node->locked)) {
+       while (!READ_ONCE(node->locked)) {
                /*
                 * If we need to reschedule bail... so we can block.
                 */
@@ -148,7 +148,7 @@ unqueue:
                 * Or we race against a concurrent unqueue()'s step-B, in which
                 * case its step-C will write us a new @node->prev pointer.
                 */
-               prev = ACCESS_ONCE(node->prev);
+               prev = READ_ONCE(node->prev);
        }
 
        /*
@@ -170,8 +170,8 @@ unqueue:
         * it will wait in Step-A.
         */
 
-       ACCESS_ONCE(next->prev) = prev;
-       ACCESS_ONCE(prev->next) = next;
+       WRITE_ONCE(next->prev, prev);
+       WRITE_ONCE(prev->next, next);
 
        return false;
 }
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
        node = this_cpu_ptr(&osq_node);
        next = xchg(&node->next, NULL);
        if (next) {
-               ACCESS_ONCE(next->locked) = 1;
+               WRITE_ONCE(next->locked, 1);
                return;
        }
 
        next = osq_wait_next(lock, node, NULL);
        if (next)
-               ACCESS_ONCE(next->locked) = 1;
+               WRITE_ONCE(next->locked, 1);
 }
index 6357265a31ad1a34b881aba31abe27ab6d3921ec..b73279367087ca779072b79a784f69224929c149 100644 (file)
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
  *
  * @task:      the task owning the mutex (owner) for which a chain walk is
  *             probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk:    do we have to carry out deadlock detection?
  * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
  *             things for a task that has just got its priority adjusted, and
  *             is waiting on a mutex)
index 2555ae15ec14c78d6c8f5030fea52daa74b5a5c9..3a504857206536f68fda513fecf63841d2979a06 100644 (file)
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 
                list_del(&waiter->list);
                tsk = waiter->task;
+               /*
+                * Make sure we do not wakeup the next reader before
+                * setting the nil condition to grant the next reader;
+                * otherwise we could miss the wakeup on the other
+                * side and end up sleeping again. See the pairing
+                * in rwsem_down_read_failed().
+                */
                smp_mb();
                waiter->task = NULL;
                wake_up_process(tsk);
index 2f7cc4076f50aa0c534c22e527ab3d1f11ce9a66..3417d0172a5d2e7cd69460ed4ef96c02f6c578d0 100644 (file)
@@ -14,8 +14,9 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
 
-#include "mcs_spinlock.h"
+#include "rwsem.h"
 
 /*
  * Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
                waiter = list_entry(next, struct rwsem_waiter, list);
                next = waiter->list.next;
                tsk = waiter->task;
+               /*
+                * Make sure we do not wakeup the next reader before
+                * setting the nil condition to grant the next reader;
+                * otherwise we could miss the wakeup on the other
+                * side and end up sleeping again. See the pairing
+                * in rwsem_down_read_failed().
+                */
                smp_mb();
                waiter->task = NULL;
                wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
                    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
                if (!list_is_singular(&sem->wait_list))
                        rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+               rwsem_set_owner(sem);
                return true;
        }
 
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
  */
 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 {
-       long old, count = ACCESS_ONCE(sem->count);
+       long old, count = READ_ONCE(sem->count);
 
        while (true) {
                if (!(count == 0 || count == RWSEM_WAITING_BIAS))
                        return false;
 
                old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
-               if (old == count)
+               if (old == count) {
+                       rwsem_set_owner(sem);
                        return true;
+               }
 
                count = old;
        }
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
        struct task_struct *owner;
-       bool on_cpu = false;
+       bool ret = true;
 
        if (need_resched())
                return false;
 
        rcu_read_lock();
-       owner = ACCESS_ONCE(sem->owner);
-       if (owner)
-               on_cpu = owner->on_cpu;
-       rcu_read_unlock();
-
-       /*
-        * If sem->owner is not set, yet we have just recently entered the
-        * slowpath, then there is a possibility reader(s) may have the lock.
-        * To be safe, avoid spinning in these situations.
-        */
-       return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
-                                struct task_struct *owner)
-{
-       if (sem->owner != owner)
-               return false;
-
-       /*
-        * Ensure we emit the owner->on_cpu, dereference _after_ checking
-        * sem->owner still matches owner, if that fails, owner might
-        * point to free()d memory, if it still matches, the rcu_read_lock()
-        * ensures the memory stays valid.
-        */
-       barrier();
+       owner = READ_ONCE(sem->owner);
+       if (!owner) {
+               long count = READ_ONCE(sem->count);
+               /*
+                * If sem->owner is not set, yet we have just recently entered the
+                * slowpath with the lock being active, then there is a possibility
+                * reader(s) may have the lock. To be safe, bail spinning in these
+                * situations.
+                */
+               if (count & RWSEM_ACTIVE_MASK)
+                       ret = false;
+               goto done;
+       }
 
-       return owner->on_cpu;
+       ret = owner->on_cpu;
+done:
+       rcu_read_unlock();
+       return ret;
 }
 
 static noinline
 bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 {
+       long count;
+
        rcu_read_lock();
-       while (owner_running(sem, owner)) {
-               if (need_resched())
-                       break;
+       while (sem->owner == owner) {
+               /*
+                * Ensure we emit the owner->on_cpu, dereference _after_
+                * checking sem->owner still matches owner, if that fails,
+                * owner might point to free()d memory, if it still matches,
+                * the rcu_read_lock() ensures the memory stays valid.
+                */
+               barrier();
+
+               /* abort spinning when need_resched or owner is not running */
+               if (!owner->on_cpu || need_resched()) {
+                       rcu_read_unlock();
+                       return false;
+               }
 
                cpu_relax_lowlatency();
        }
        rcu_read_unlock();
 
+       if (READ_ONCE(sem->owner))
+               return true; /* new owner, continue spinning */
+
        /*
-        * We break out the loop above on need_resched() or when the
-        * owner changed, which is a sign for heavy contention. Return
-        * success only when sem->owner is NULL.
+        * When the owner is not set, the lock could be free or
+        * held by readers. Check the counter to verify the
+        * state.
         */
-       return sem->owner == NULL;
+       count = READ_ONCE(sem->count);
+       return (count == 0 || count == RWSEM_WAITING_BIAS);
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
                goto done;
 
        while (true) {
-               owner = ACCESS_ONCE(sem->owner);
+               owner = READ_ONCE(sem->owner);
                if (owner && !rwsem_spin_on_owner(sem, owner))
                        break;
 
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 
        /* we're now waiting on the lock, but no longer actively locking */
        if (waiting) {
-               count = ACCESS_ONCE(sem->count);
+               count = READ_ONCE(sem->count);
 
                /*
                 * If there were already threads queued before us and there are
index e2d3bc7f03b41e1c01a7c8fc548ac162cdfa151e..205be0ce34de73e8590f2cd4cdb369526b236540 100644 (file)
@@ -9,29 +9,9 @@
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/rwsem.h>
-
 #include <linux/atomic.h>
 
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-       sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-       sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
 
 /*
  * lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644 (file)
index 0000000..870ed9a
--- /dev/null
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+       sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+       sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
index 99fdf94efce80f432fc4ab203aeb2a918ad5c564..ec53f594e9c9fcf434a4ffaf48ff2405d7272b73 100644 (file)
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
        return 0;
 }
 
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+       do {
+               unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+               if (copy_from_user(dst, usrc, n) != 0)
+                       return -EFAULT;
+               cond_resched();
+               dst += n;
+               usrc += n;
+               len -= n;
+       } while (len);
+       return 0;
+}
+
 /* Sets info->hdr and info->len. */
 static int copy_module_from_user(const void __user *umod, unsigned long len,
                                  struct load_info *info)
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
        if (!info->hdr)
                return -ENOMEM;
 
-       if (copy_from_user(info->hdr, umod, info->len) != 0) {
+       if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
                vfree(info->hdr);
                return -EFAULT;
        }
index c24d5a23bf939be71f75736efdbd3874da157c2d..5235dd4e1e2f68a97fa6836d98854d2e8e46724e 100644 (file)
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
        }
 }
 
-static bool is_nosave_page(unsigned long pfn)
-{
-       struct nosave_region *region;
-
-       list_for_each_entry(region, &nosave_regions, list) {
-               if (pfn >= region->start_pfn && pfn < region->end_pfn) {
-                       pr_err("PM: %#010llx in e820 nosave region: "
-                              "[mem %#010llx-%#010llx]\n",
-                              (unsigned long long) pfn << PAGE_SHIFT,
-                              (unsigned long long) region->start_pfn << PAGE_SHIFT,
-                              ((unsigned long long) region->end_pfn << PAGE_SHIFT)
-                                       - 1);
-                       return true;
-               }
-       }
-
-       return false;
-}
-
 /**
  *     create_basic_memory_bitmaps - create bitmaps needed for marking page
  *     frames that should not be saved and free page frames.  The pointers
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
        do {
                pfn = memory_bm_next_pfn(bm);
                if (likely(pfn != BM_END_OF_MAP)) {
-                       if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
+                       if (likely(pfn_valid(pfn)))
                                swsusp_set_page_free(pfn_to_page(pfn));
                        else
                                return -EFAULT;
index 62671f53202ac7d4de8037dce950c934c7a4ddbc..261af7bfcb67d55e9ec9c19f657453f3e2607803 100644 (file)
@@ -689,6 +689,23 @@ static inline bool got_nohz_idle_kick(void)
 #ifdef CONFIG_NO_HZ_FULL
 bool sched_can_stop_tick(void)
 {
+       /*
+        * FIFO realtime policy runs the highest priority task. Other runnable
+        * tasks are of a lower priority. The scheduler tick does nothing.
+        */
+       if (current->policy == SCHED_FIFO)
+               return true;
+
+       /*
+        * Round-robin realtime tasks time slice with other tasks at the same
+        * realtime priority. Is this task the only one at this priority?
+        */
+       if (current->policy == SCHED_RR) {
+               struct sched_rt_entity *rt_se = &current->rt;
+
+               return rt_se->run_list.prev == rt_se->run_list.next;
+       }
+
        /*
         * More than one running task need preemption.
         * nr_running update is assumed to be visible
@@ -996,6 +1013,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                rq_clock_skip_update(rq, true);
 }
 
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+       atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1026,10 +1050,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        trace_sched_migrate_task(p, new_cpu);
 
        if (task_cpu(p) != new_cpu) {
+               struct task_migration_notifier tmn;
+
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+               tmn.task = p;
+               tmn.from_cpu = task_cpu(p);
+               tmn.to_cpu = new_cpu;
+
+               atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
        }
 
        __set_task_cpu(p, new_cpu);
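A hedged sketch of a hypothetical built-in consumer of the new migration notifier chain; the callback, pr_debug() and initcall are illustrative, while the task/from_cpu/to_cpu fields mirror what set_task_cpu() fills in above:

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/sched.h>

static int my_migration_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("pid %d migrating: CPU %d -> CPU %d\n",
		 task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block my_migration_nb = {
	.notifier_call = my_migration_notify,
};

static int __init my_migration_notifier_init(void)
{
	register_task_migration_notifier(&my_migration_nb);
	return 0;
}
core_initcall(my_migration_notifier_init);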
@@ -5320,36 +5352,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
 static int sched_cpu_inactive(struct notifier_block *nfb,
                                        unsigned long action, void *hcpu)
 {
-       unsigned long flags;
-       long cpu = (long)hcpu;
-       struct dl_bw *dl_b;
-
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
-               set_cpu_active(cpu, false);
-
-               /* explicitly allow suspend */
-               if (!(action & CPU_TASKS_FROZEN)) {
-                       bool overflow;
-                       int cpus;
-
-                       rcu_read_lock_sched();
-                       dl_b = dl_bw_of(cpu);
-
-                       raw_spin_lock_irqsave(&dl_b->lock, flags);
-                       cpus = dl_bw_cpus(cpu);
-                       overflow = __dl_overflow(dl_b, cpus, 0, 0);
-                       raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
-                       rcu_read_unlock_sched();
-
-                       if (overflow)
-                               return notifier_from_errno(-EBUSY);
-               }
+               set_cpu_active((long)hcpu, false);
                return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
        }
-
-       return NOTIFY_DONE;
 }
 
 static int __init migration_init(void)
@@ -5430,17 +5439,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               /*
-                * Even though we initialize ->capacity to something semi-sane,
-                * we leave capacity_orig unset. This allows us to detect if
-                * domain iteration is still funny without causing /0 traps.
-                */
-               if (!group->sgc->capacity_orig) {
-                       printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
-                       break;
-               }
-
                if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
@@ -5924,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 * die on a /0 trap.
                 */
                sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
-               sg->sgc->capacity_orig = sg->sgc->capacity;
 
                /*
                 * Make sure the first group of this domain contains the
@@ -6235,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
         */
 
        if (sd->flags & SD_SHARE_CPUCAPACITY) {
+               sd->flags |= SD_PREFER_SIBLING;
                sd->imbalance_pct = 110;
                sd->smt_gain = 1178; /* ~15% */
 
@@ -7000,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                 */
 
        case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
                cpuset_update_active_cpus(true);
                break;
        default:
@@ -7012,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                               void *hcpu)
 {
-       switch (action) {
+       unsigned long flags;
+       long cpu = (long)hcpu;
+       struct dl_bw *dl_b;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
+               /* explicitly allow suspend */
+               if (!(action & CPU_TASKS_FROZEN)) {
+                       bool overflow;
+                       int cpus;
+
+                       rcu_read_lock_sched();
+                       dl_b = dl_bw_of(cpu);
+
+                       raw_spin_lock_irqsave(&dl_b->lock, flags);
+                       cpus = dl_bw_cpus(cpu);
+                       overflow = __dl_overflow(dl_b, cpus, 0, 0);
+                       raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+                       rcu_read_unlock_sched();
+
+                       if (overflow)
+                               return notifier_from_errno(-EBUSY);
+               }
                cpuset_update_active_cpus(false);
                break;
        case CPU_DOWN_PREPARE_FROZEN:
@@ -7158,8 +7177,8 @@ void __init sched_init(void)
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs);
-               init_rt_rq(&rq->rt, rq);
-               init_dl_rq(&rq->dl, rq);
+               init_rt_rq(&rq->rt);
+               init_dl_rq(&rq->dl);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7199,7 +7218,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
-               rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+               rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
@@ -7798,7 +7817,7 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static int sched_dl_global_constraints(void)
+static int sched_dl_global_validate(void)
 {
        u64 runtime = global_rt_runtime();
        u64 period = global_rt_period();
@@ -7899,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
                if (ret)
                        goto undo;
 
-               ret = sched_rt_global_constraints();
+               ret = sched_dl_global_validate();
                if (ret)
                        goto undo;
 
-               ret = sched_dl_global_constraints();
+               ret = sched_rt_global_constraints();
                if (ret)
                        goto undo;
 
index 3fa8fa6d940300c1fbae721503aad2666f72b4e5..5e95145088fd37b3d07ccac66c3cd58f7effe10a 100644 (file)
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
        dl_b->total_bw = 0;
 }
 
-void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+void init_dl_rq(struct dl_rq *dl_rq)
 {
        dl_rq->rb_root = RB_ROOT;
 
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq)
        rq->post_schedule = has_pushable_dl_tasks(rq);
 }
 
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+       struct rq *later_rq = NULL;
+       bool fallback = false;
+
+       later_rq = find_lock_later_rq(p, rq);
+
+       if (!later_rq) {
+               int cpu;
+
+               /*
+                * If we cannot preempt any rq, fall back to pick any
+                * online cpu.
+                */
+               fallback = true;
+               cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+               if (cpu >= nr_cpu_ids) {
+                       /*
+                        * Fail to find any suitable cpu.
+                        * The task will never come back!
+                        */
+                       BUG_ON(dl_bandwidth_enabled());
+
+                       /*
+                        * If admission control is disabled we
+                        * try a little harder to let the task
+                        * run.
+                        */
+                       cpu = cpumask_any(cpu_active_mask);
+               }
+               later_rq = cpu_rq(cpu);
+               double_lock_balance(rq, later_rq);
+       }
+
+       deactivate_task(rq, p, 0);
+       set_task_cpu(p, later_rq->cpu);
+       activate_task(later_rq, p, ENQUEUE_REPLENISH);
+
+       if (!fallback)
+               resched_curr(later_rq);
+
+       double_unlock_balance(rq, later_rq);
+}
+
 #else
 
 static inline
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
        unsigned long flags;
        struct rq *rq;
 
-       rq = task_rq_lock(current, &flags);
+       rq = task_rq_lock(p, &flags);
 
        /*
         * We need to take care of several possible races here:
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
        sched_clock_tick();
        update_rq_clock(rq);
 
+#ifdef CONFIG_SMP
+       /*
+        * If we find that the rq the task was on is no longer
+        * available, we need to select a new rq.
+        */
+       if (unlikely(!rq->online)) {
+               dl_task_offline_migration(rq, p);
+               goto unlock;
+       }
+#endif
+
        /*
         * If the throttle happened during sched-out; like:
         *
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
                push_dl_task(rq);
 #endif
 unlock:
-       task_rq_unlock(rq, current, &flags);
+       task_rq_unlock(rq, p, &flags);
 
        return HRTIMER_NORESTART;
 }
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq)
        }
        update_rq_clock(rq);
        update_curr_dl(rq);
+       /*
+        * Tell update_rq_clock() that we've just updated,
+        * so we don't do microscopic update in schedule()
+        * and double the fastpath cost.
+        */
+       rq_clock_skip_update(rq, true);
 }
 
 #ifdef CONFIG_SMP
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
        int check_resched = 1;
 
-       /*
-        * If p is throttled, don't consider the possibility
-        * of preempting rq->curr, the check will be done right
-        * after its runtime will get replenished.
-        */
-       if (unlikely(p->dl.dl_throttled))
-               return;
-
        if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
                if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
index 8baaf858d25c49921eaa3d9a83235b9f0d2b8c6c..a245c1fc6f0a610f17e2d13635306d681e2ef821 100644 (file)
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        if (!se) {
                struct sched_avg *avg = &cpu_rq(cpu)->avg;
                P(avg->runnable_avg_sum);
-               P(avg->runnable_avg_period);
+               P(avg->avg_period);
                return;
        }
 
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        P(se->load.weight);
 #ifdef CONFIG_SMP
        P(se->avg.runnable_avg_sum);
-       P(se->avg.runnable_avg_period);
+       P(se->avg.running_avg_sum);
+       P(se->avg.avg_period);
        P(se->avg.load_avg_contrib);
+       P(se->avg.utilization_avg_contrib);
        P(se->avg.decay_count);
 #endif
 #undef PN
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                        cfs_rq->runnable_load_avg);
        SEQ_printf(m, "  .%-30s: %ld\n", "blocked_load_avg",
                        cfs_rq->blocked_load_avg);
+       SEQ_printf(m, "  .%-30s: %ld\n", "utilization_load_avg",
+                       cfs_rq->utilization_load_avg);
 #ifdef CONFIG_FAIR_GROUP_SCHED
        SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_contrib",
                        cfs_rq->tg_load_contrib);
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        P(se.load.weight);
 #ifdef CONFIG_SMP
        P(se.avg.runnable_avg_sum);
-       P(se.avg.runnable_avg_period);
+       P(se.avg.running_avg_sum);
+       P(se.avg.avg_period);
        P(se.avg.load_avg_contrib);
+       P(se.avg.utilization_avg_contrib);
        P(se.avg.decay_count);
 #endif
        P(policy);
index bcfe32088b3768363c2f37502a953b61a361f7ff..ffeaa4105e48a36105ecaea8967082e1e7a7af98 100644 (file)
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
 
 /* Give new task start runnable values to heavy its load in infant time */
 void init_task_runnable_average(struct task_struct *p)
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p)
        u32 slice;
 
        slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
-       p->se.avg.runnable_avg_sum = slice;
-       p->se.avg.runnable_avg_period = slice;
+       p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+       p->se.avg.avg_period = slice;
        __update_task_entity_contrib(&p->se);
+       __update_task_entity_utilization(&p->se);
 }
 #else
 void init_task_runnable_average(struct task_struct *p)
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
 static bool load_too_imbalanced(long src_load, long dst_load,
                                struct task_numa_env *env)
 {
-       long imb, old_imb;
-       long orig_src_load, orig_dst_load;
        long src_capacity, dst_capacity;
+       long orig_src_load;
+       long load_a, load_b;
+       long moved_load;
+       long imb;
 
        /*
         * The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
        dst_capacity = env->dst_stats.compute_capacity;
 
        /* We care about the slope of the imbalance, not the direction. */
-       if (dst_load < src_load)
-               swap(dst_load, src_load);
+       load_a = dst_load;
+       load_b = src_load;
+       if (load_a < load_b)
+               swap(load_a, load_b);
 
        /* Is the difference below the threshold? */
-       imb = dst_load * src_capacity * 100 -
-             src_load * dst_capacity * env->imbalance_pct;
+       imb = load_a * src_capacity * 100 -
+               load_b * dst_capacity * env->imbalance_pct;
        if (imb <= 0)
                return false;
 
        /*
         * The imbalance is above the allowed threshold.
-        * Compare it with the old imbalance.
+        * Allow a move that brings us closer to a balanced situation,
+        * without moving things past the point of balance.
         */
        orig_src_load = env->src_stats.load;
-       orig_dst_load = env->dst_stats.load;
 
-       if (orig_dst_load < orig_src_load)
-               swap(orig_dst_load, orig_src_load);
-
-       old_imb = orig_dst_load * src_capacity * 100 -
-                 orig_src_load * dst_capacity * env->imbalance_pct;
+       /*
+        * In a task swap, there will be one load moving from src to dst,
+        * and another moving back. This is the net sum of both moves.
+        * A simple task move will always have a positive value.
+        * Allow the move if it brings the system closer to a balanced
+        * situation, without crossing over the balance point.
+        */
+       moved_load = orig_src_load - src_load;
 
-       /* Would this change make things worse? */
-       return (imb > old_imb);
+       if (moved_load > 0)
+               /* Moving src -> dst. Did we overshoot balance? */
+               return src_load * dst_capacity < dst_load * src_capacity;
+       else
+               /* Moving dst -> src. Did we overshoot balance? */
+               return dst_load * src_capacity < src_load * dst_capacity;
 }
 
 /*
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
                *period = now - p->last_task_numa_placement;
        } else {
                delta = p->se.avg.runnable_avg_sum;
-               *period = p->se.avg.runnable_avg_period;
+               *period = p->se.avg.avg_period;
        }
 
        p->last_sum_exec_runtime = runtime;
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
                        }
                }
                /* Next round, evaluate the nodes within max_group. */
+               if (!max_faults)
+                       break;
                nodes = max_group;
        }
        return nid;
@@ -2165,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma) || !vma_policy_mof(vma))
+               if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+                       is_vm_hugetlb_page(vma)) {
                        continue;
+               }
 
                /*
                 * Shared library pages mapped by multiple processes are not
@@ -2501,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
                                                        struct sched_avg *sa,
-                                                       int runnable)
+                                                       int runnable,
+                                                       int running)
 {
        u64 delta, periods;
        u32 runnable_contrib;
        int delta_w, decayed = 0;
+       unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
 
        delta = now - sa->last_runnable_update;
        /*
@@ -2529,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
        sa->last_runnable_update = now;
 
        /* delta_w is the amount already accumulated against our next period */
-       delta_w = sa->runnable_avg_period % 1024;
+       delta_w = sa->avg_period % 1024;
        if (delta + delta_w >= 1024) {
                /* period roll-over */
                decayed = 1;
@@ -2542,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                delta_w = 1024 - delta_w;
                if (runnable)
                        sa->runnable_avg_sum += delta_w;
-               sa->runnable_avg_period += delta_w;
+               if (running)
+                       sa->running_avg_sum += delta_w * scale_freq
+                               >> SCHED_CAPACITY_SHIFT;
+               sa->avg_period += delta_w;
 
                delta -= delta_w;
 
@@ -2552,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 
                sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
                                                  periods + 1);
-               sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+               sa->running_avg_sum = decay_load(sa->running_avg_sum,
+                                                 periods + 1);
+               sa->avg_period = decay_load(sa->avg_period,
                                                     periods + 1);
 
                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                runnable_contrib = __compute_runnable_contrib(periods);
                if (runnable)
                        sa->runnable_avg_sum += runnable_contrib;
-               sa->runnable_avg_period += runnable_contrib;
+               if (running)
+                       sa->running_avg_sum += runnable_contrib * scale_freq
+                               >> SCHED_CAPACITY_SHIFT;
+               sa->avg_period += runnable_contrib;
        }
 
        /* Remainder of delta accrued against u_0` */
        if (runnable)
                sa->runnable_avg_sum += delta;
-       sa->runnable_avg_period += delta;
+       if (running)
+               sa->running_avg_sum += delta * scale_freq
+                       >> SCHED_CAPACITY_SHIFT;
+       sa->avg_period += delta;
 
        return decayed;
 }
@@ -2582,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
                return 0;
 
        se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+       se->avg.utilization_avg_contrib =
+               decay_load(se->avg.utilization_avg_contrib, decays);
 
        return decays;
 }
@@ -2617,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 
        /* The fraction of a cpu used by this cfs_rq */
        contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
-                         sa->runnable_avg_period + 1);
+                         sa->avg_period + 1);
        contrib -= cfs_rq->tg_runnable_contrib;
 
        if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2670,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-       __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+       __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+                       runnable, runnable);
        __update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2688,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 
        /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
        contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
-       contrib /= (se->avg.runnable_avg_period + 1);
+       contrib /= (se->avg.avg_period + 1);
        se->avg.load_avg_contrib = scale_load(contrib);
 }
 
@@ -2707,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
        return se->avg.load_avg_contrib - old_contrib;
 }
 
+
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+       u32 contrib;
+
+       /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+       contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+       contrib /= (se->avg.avg_period + 1);
+       se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+       long old_contrib = se->avg.utilization_avg_contrib;
+
+       if (entity_is_task(se))
+               __update_task_entity_utilization(se);
+       else
+               se->avg.utilization_avg_contrib =
+                                       group_cfs_rq(se)->utilization_load_avg;
+
+       return se->avg.utilization_avg_contrib - old_contrib;
+}
+
 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
                                                 long load_contrib)
 {
@@ -2723,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
                                          int update_cfs_rq)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       long contrib_delta;
+       long contrib_delta, utilization_delta;
+       int cpu = cpu_of(rq_of(cfs_rq));
        u64 now;
 
        /*
@@ -2735,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
        else
                now = cfs_rq_clock_task(group_cfs_rq(se));
 
-       if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+       if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+                                       cfs_rq->curr == se))
                return;
 
        contrib_delta = __update_entity_load_avg_contrib(se);
+       utilization_delta = __update_entity_utilization_avg_contrib(se);
 
        if (!update_cfs_rq)
                return;
 
-       if (se->on_rq)
+       if (se->on_rq) {
                cfs_rq->runnable_load_avg += contrib_delta;
-       else
+               cfs_rq->utilization_load_avg += utilization_delta;
+       } else {
                subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+       }
 }
 
 /*
@@ -2821,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
        }
 
        cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+       cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
        /* we force update consideration on load-balancer moves */
        update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
@@ -2839,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
        update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
        cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+       cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
        if (sleep) {
                cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
                se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3176,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
+               update_entity_load_avg(se, 1);
        }
 
        update_stats_curr_start(cfs_rq, se);
@@ -4302,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
        return cpu_rq(cpu)->cpu_capacity;
 }
 
+static unsigned long capacity_orig_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -4715,6 +4785,33 @@ next:
 done:
        return target;
 }
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the same as the one of capacity,
+ * so we can compare the usage with the capacity of the CPU that is available
+ * for CFS tasks (i.e. cpu_capacity).
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE].  The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_load_avg or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays within the range
+ * [0..cpu_capacity_orig] and cap it if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 still has 20% of capacity available.
+ */
+static int get_cpu_usage(int cpu)
+{
+       unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+       if (usage >= SCHED_LOAD_SCALE)
+               return capacity;
+
+       return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
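/*
 * A minimal, standalone userspace sketch of the capping done by get_cpu_usage()
 * above. SCHED_LOAD_SCALE = 1024 and the capacity values below are assumptions
 * chosen for illustration, not taken from any particular machine.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE  1024UL
#define SCHED_LOAD_SHIFT  10

/* usage: raw cfs.utilization_load_avg; capacity: hypothetical cpu_capacity_orig */
static unsigned long cpu_usage(unsigned long usage, unsigned long capacity)
{
        if (usage >= SCHED_LOAD_SCALE)          /* rounding/migration overshoot */
                return capacity;                /* cap at the original capacity */
        return (usage * capacity) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
        printf("%lu\n", cpu_usage(1239, 1024)); /* 121% raw usage -> capped to 1024 */
        printf("%lu\n", cpu_usage(819, 430));   /* 80% usage on a small core -> 343 */
        return 0;
}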
 
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -5841,12 +5938,12 @@ struct sg_lb_stats {
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long load_per_task;
        unsigned long group_capacity;
+       unsigned long group_usage; /* Total usage of the group */
        unsigned int sum_nr_running; /* Nr tasks running in the group */
-       unsigned int group_capacity_factor;
        unsigned int idle_cpus;
        unsigned int group_weight;
        enum group_type group_type;
-       int group_has_free_capacity;
+       int group_no_capacity;
 #ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running;
        unsigned int nr_preferred_running;
@@ -5917,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
        return load_idx;
 }
 
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
-{
-       return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
-       return default_scale_capacity(sd, cpu);
-}
-
 static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
        if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5943,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 static unsigned long scale_rt_capacity(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       u64 total, available, age_stamp, avg;
+       u64 total, used, age_stamp, avg;
        s64 delta;
 
        /*
@@ -5959,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu)
 
        total = sched_avg_period() + delta;
 
-       if (unlikely(total < avg)) {
-               /* Ensures that capacity won't end up being negative */
-               available = 0;
-       } else {
-               available = total - avg;
-       }
-
-       if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
-               total = SCHED_CAPACITY_SCALE;
+       used = div_u64(avg, total);
 
-       total >>= SCHED_CAPACITY_SHIFT;
+       if (likely(used < SCHED_CAPACITY_SCALE))
+               return SCHED_CAPACITY_SCALE - used;
 
-       return div_u64(available, total);
+       return 1;
 }
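/*
 * Simplified, standalone model of the rewritten scale_rt_capacity() math above.
 * 'avg' stands for rq->rt_avg (RT/IRQ time already scaled into capacity units)
 * and 'total' for the averaging period; both values below are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024ULL

static unsigned long remaining_capacity(uint64_t avg, uint64_t total)
{
        uint64_t used = avg / total;    /* capacity consumed by RT/IRQ, in [0..1024] */

        if (used < SCHED_CAPACITY_SCALE)
                return SCHED_CAPACITY_SCALE - used;
        return 1;                       /* at least 1, so cpu_capacity never ends up 0 */
}

int main(void)
{
        /* RT/IRQ activity ate ~25% of the last period: 768/1024 is left for CFS. */
        printf("%lu\n", remaining_capacity(256ULL * 1000000, 1000000));
        return 0;
}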
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -5986,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 
        capacity >>= SCHED_CAPACITY_SHIFT;
 
-       sdg->sgc->capacity_orig = capacity;
-
-       if (sched_feat(ARCH_CAPACITY))
-               capacity *= arch_scale_freq_capacity(sd, cpu);
-       else
-               capacity *= default_scale_capacity(sd, cpu);
-
-       capacity >>= SCHED_CAPACITY_SHIFT;
+       cpu_rq(cpu)->cpu_capacity_orig = capacity;
 
        capacity *= scale_rt_capacity(cpu);
        capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6009,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long capacity, capacity_orig;
+       unsigned long capacity;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -6021,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                return;
        }
 
-       capacity_orig = capacity = 0;
+       capacity = 0;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -6041,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                         * Use capacity_of(), which is set irrespective of domains
                         * in update_cpu_capacity().
                         *
-                        * This avoids capacity/capacity_orig from being 0 and
+                        * This avoids capacity from being 0 and
                         * causing divide-by-zero issues on boot.
-                        *
-                        * Runtime updates will correct capacity_orig.
                         */
                        if (unlikely(!rq->sd)) {
-                               capacity_orig += capacity_of(cpu);
                                capacity += capacity_of(cpu);
                                continue;
                        }
 
                        sgc = rq->sd->groups->sgc;
-                       capacity_orig += sgc->capacity_orig;
                        capacity += sgc->capacity;
                }
        } else  {
@@ -6064,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
                group = child->groups;
                do {
-                       capacity_orig += group->sgc->capacity_orig;
                        capacity += group->sgc->capacity;
                        group = group->next;
                } while (group != child->groups);
        }
 
-       sdg->sgc->capacity_orig = capacity_orig;
        sdg->sgc->capacity = capacity;
 }
 
 /*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true if the capacity is reduced.
  */
 static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 {
-       /*
-        * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
-        */
-       if (!(sd->flags & SD_SHARE_CPUCAPACITY))
-               return 0;
-
-       /*
-        * If ~90% of the cpu_capacity is still there, we're good.
-        */
-       if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
-               return 1;
-
-       return 0;
+       return ((rq->cpu_capacity * sd->imbalance_pct) <
+                               (rq->cpu_capacity_orig * 100));
 }
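/*
 * A standalone illustration of the check_cpu_capacity() threshold above.
 * imbalance_pct = 117 and the capacity numbers are assumptions picked only to
 * show where the cutoff lands (roughly 85% of the original capacity here).
 * The same helper is reused further down by need_active_balance() and
 * nohz_kick_needed().
 */
#include <stdio.h>
#include <stdbool.h>

static bool capacity_reduced(unsigned long cap, unsigned long cap_orig,
                             unsigned int imbalance_pct)
{
        return (cap * imbalance_pct) < (cap_orig * 100);
}

int main(void)
{
        printf("%d\n", capacity_reduced(900, 1024, 117));  /* 105300 >= 102400 -> 0 */
        printf("%d\n", capacity_reduced(800, 1024, 117));  /*  93600 <  102400 -> 1 */
        return 0;
}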
 
 /*
@@ -6134,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
 }
 
 /*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can show up but it is of no
+ * benefit to the load balancer.
  */
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
 {
-       unsigned int capacity_factor, smt, cpus;
-       unsigned int capacity, capacity_orig;
+       if (sgs->sum_nr_running < sgs->group_weight)
+               return true;
 
-       capacity = group->sgc->capacity;
-       capacity_orig = group->sgc->capacity_orig;
-       cpus = group->group_weight;
+       if ((sgs->group_capacity * 100) >
+                       (sgs->group_usage * env->sd->imbalance_pct))
+               return true;
+
+       return false;
+}
 
-       /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
-       smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
-       capacity_factor = cpus / smt; /* cores */
+/*
+ *  group_is_overloaded returns true if the group has more tasks than it can
+ *  handle.
+ *  group_is_overloaded is not equal to !group_has_capacity because a group
+ *  with exactly the right number of tasks has no spare capacity left but is
+ *  not overloaded, so both group_has_capacity and group_is_overloaded return
+ *  false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+       if (sgs->sum_nr_running <= sgs->group_weight)
+               return false;
 
-       capacity_factor = min_t(unsigned,
-               capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
-       if (!capacity_factor)
-               capacity_factor = fix_small_capacity(env->sd, group);
+       if ((sgs->group_capacity * 100) <
+                       (sgs->group_usage * env->sd->imbalance_pct))
+               return true;
 
-       return capacity_factor;
+       return false;
 }
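/*
 * A toy, standalone model of the two helpers above, showing the case the
 * comment describes: a group with exactly as many busy tasks as CPUs and no
 * usage headroom is neither "has capacity" nor "overloaded". All numbers are
 * hypothetical; 'pct' plays the role of sd->imbalance_pct.
 */
#include <stdio.h>
#include <stdbool.h>

struct stats { unsigned int nr_running, weight; unsigned long capacity, usage; };

static bool has_capacity(const struct stats *s, unsigned int pct)
{
        if (s->nr_running < s->weight)
                return true;
        return (s->capacity * 100) > (s->usage * pct);
}

static bool is_overloaded(const struct stats *s, unsigned int pct)
{
        if (s->nr_running <= s->weight)
                return false;
        return (s->capacity * 100) < (s->usage * pct);
}

int main(void)
{
        struct stats s = { .nr_running = 4, .weight = 4, .capacity = 4096, .usage = 4096 };

        printf("has_capacity=%d overloaded=%d\n",
               has_capacity(&s, 117), is_overloaded(&s, 117));  /* both 0 */
        return 0;
}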
 
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+               struct sched_group *group,
+               struct sg_lb_stats *sgs)
 {
-       if (sgs->sum_nr_running > sgs->group_capacity_factor)
+       if (sgs->group_no_capacity)
                return group_overloaded;
 
        if (sg_imbalanced(group))
@@ -6202,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        load = source_load(i, load_idx);
 
                sgs->group_load += load;
+               sgs->group_usage += get_cpu_usage(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
 
                if (rq->nr_running > 1)
@@ -6224,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
        sgs->group_weight = group->group_weight;
-       sgs->group_capacity_factor = sg_capacity_factor(env, group);
-       sgs->group_type = group_classify(group, sgs);
 
-       if (sgs->group_capacity_factor > sgs->sum_nr_running)
-               sgs->group_has_free_capacity = 1;
+       sgs->group_no_capacity = group_is_overloaded(env, sgs);
+       sgs->group_type = group_classify(env, group, sgs);
 }
 
 /**
@@ -6350,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
                /*
                 * In case the child domain prefers tasks go to siblings
-                * first, lower the sg capacity factor to one so that we'll try
+                * first, lower the sg capacity so that we'll try
                 * and move all the excess tasks away. We lower the capacity
                 * of a group only if the local group has the capacity to fit
-                * these excess tasks, i.e. nr_running < group_capacity_factor. The
-                * extra check prevents the case where you always pull from the
-                * heaviest group when it is already under-utilized (possible
-                * with a large weight task outweighs the tasks on the system).
+                * these excess tasks. The extra check prevents the case where
+                * you always pull from the heaviest group when it is already
+                * under-utilized (possible when a large weight task outweighs
+                * the tasks on the system).
                 */
                if (prefer_sibling && sds->local &&
-                   sds->local_stat.group_has_free_capacity) {
-                       sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
-                       sgs->group_type = group_classify(sg, sgs);
+                   group_has_capacity(env, &sds->local_stat) &&
+                   (sgs->sum_nr_running > 1)) {
+                       sgs->group_no_capacity = 1;
+                       sgs->group_type = group_overloaded;
                }
 
                if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6541,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         */
        if (busiest->group_type == group_overloaded &&
            local->group_type   == group_overloaded) {
-               load_above_capacity =
-                       (busiest->sum_nr_running - busiest->group_capacity_factor);
-
-               load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
-               load_above_capacity /= busiest->group_capacity;
+               load_above_capacity = busiest->sum_nr_running *
+                                       SCHED_LOAD_SCALE;
+               if (load_above_capacity > busiest->group_capacity)
+                       load_above_capacity -= busiest->group_capacity;
+               else
+                       load_above_capacity = ~0UL;
        }
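/*
 * A quick numeric check of the new load_above_capacity expression above, with
 * a hypothetical busiest group: 6 runnable tasks on a 2-CPU group whose
 * capacity sums to 2048.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
        unsigned long sum_nr_running = 6, group_capacity = 2048;
        unsigned long load_above_capacity = sum_nr_running * SCHED_LOAD_SCALE;

        if (load_above_capacity > group_capacity)
                load_above_capacity -= group_capacity;  /* 6144 - 2048 = 4096 */
        else
                load_above_capacity = ~0UL;             /* as in the hunk above */

        printf("%lu\n", load_above_capacity);           /* prints 4096 */
        return 0;
}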
 
        /*
@@ -6608,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        local = &sds.local_stat;
        busiest = &sds.busiest_stat;
 
+       /* ASYM feature bypasses nice load balance check */
        if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
            check_asym_packing(env, &sds))
                return sds.busiest;
@@ -6628,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                goto force_balance;
 
        /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-       if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
-           !busiest->group_has_free_capacity)
+       if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+           busiest->group_no_capacity)
                goto force_balance;
 
        /*
@@ -6688,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        int i;
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-               unsigned long capacity, capacity_factor, wl;
+               unsigned long capacity, wl;
                enum fbq_type rt;
 
                rq = cpu_rq(i);
@@ -6717,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                        continue;
 
                capacity = capacity_of(i);
-               capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
-               if (!capacity_factor)
-                       capacity_factor = fix_small_capacity(env->sd, group);
 
                wl = weighted_cpuload(i);
 
@@ -6727,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                 * When comparing with imbalance, use weighted_cpuload()
                 * which is not scaled with the cpu capacity.
                 */
-               if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+               if (rq->nr_running == 1 && wl > env->imbalance &&
+                   !check_cpu_capacity(rq, env->sd))
                        continue;
 
                /*
@@ -6775,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
                        return 1;
        }
 
+       /*
+        * The dst_cpu is idle and the src_cpu has only one CFS task.
+        * It's worth migrating the task if the src_cpu's capacity is reduced
+        * because of other sched_classes or IRQs and more capacity stays
+        * available on the dst_cpu.
+        */
+       if ((env->idle != CPU_NOT_IDLE) &&
+           (env->src_rq->cfs.h_nr_running == 1)) {
+               if ((check_cpu_capacity(env->src_rq, sd)) &&
+                   (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+                       return 1;
+       }
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -6874,6 +6961,9 @@ redo:
 
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
+       env.src_cpu = busiest->cpu;
+       env.src_rq = busiest;
+
        ld_moved = 0;
        if (busiest->nr_running > 1) {
                /*
@@ -6883,8 +6973,6 @@ redo:
                 * correctly treated as an imbalance.
                 */
                env.flags |= LBF_ALL_PINNED;
-               env.src_cpu   = busiest->cpu;
-               env.src_rq    = busiest;
                env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
@@ -7584,22 +7672,25 @@ end:
 
 /*
  * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
  *   - This rq has more than one task.
- *   - At any scheduler domain level, this cpu's scheduler group has multiple
- *     busy cpu's exceeding the group's capacity.
+ *   - This rq has at least one CFS task and the capacity of the CPU is
+ *     significantly reduced because of RT tasks or IRQs.
+ *   - At the parent of the LLC scheduler domain level, this cpu's scheduler
+ *     group has multiple busy cpus.
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
 {
        unsigned long now = jiffies;
        struct sched_domain *sd;
        struct sched_group_capacity *sgc;
        int nr_busy, cpu = rq->cpu;
+       bool kick = false;
 
        if (unlikely(rq->idle_balance))
-               return 0;
+               return false;
 
        /*
        * We may be recently in ticked or tickless idle mode. At the first
@@ -7613,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
         * balancing.
         */
        if (likely(!atomic_read(&nohz.nr_cpus)))
-               return 0;
+               return false;
 
        if (time_before(now, nohz.next_balance))
-               return 0;
+               return false;
 
        if (rq->nr_running >= 2)
-               goto need_kick;
+               return true;
 
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
        if (sd) {
                sgc = sd->groups->sgc;
                nr_busy = atomic_read(&sgc->nr_busy_cpus);
 
-               if (nr_busy > 1)
-                       goto need_kick_unlock;
+               if (nr_busy > 1) {
+                       kick = true;
+                       goto unlock;
+               }
+
        }
 
-       sd = rcu_dereference(per_cpu(sd_asym, cpu));
+       sd = rcu_dereference(rq->sd);
+       if (sd) {
+               if ((rq->cfs.h_nr_running >= 1) &&
+                               check_cpu_capacity(rq, sd)) {
+                       kick = true;
+                       goto unlock;
+               }
+       }
 
+       sd = rcu_dereference(per_cpu(sd_asym, cpu));
        if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
-                                 sched_domain_span(sd)) < cpu))
-               goto need_kick_unlock;
-
-       rcu_read_unlock();
-       return 0;
+                                 sched_domain_span(sd)) < cpu)) {
+               kick = true;
+               goto unlock;
+       }
 
-need_kick_unlock:
+unlock:
        rcu_read_unlock();
-need_kick:
-       return 1;
+       return kick;
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7660,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
        enum cpu_idle_type idle = this_rq->idle_balance ?
                                                CPU_IDLE : CPU_NOT_IDLE;
 
-       rebalance_domains(this_rq, idle);
-
        /*
         * If this cpu has a pending nohz_balance_kick, then do the
         * balancing on behalf of the other idle cpus whose ticks are
-        * stopped.
+        * stopped. Do nohz_idle_balance *before* rebalance_domains to
+        * give the idle cpus a chance to load balance. Else we may
+        * load balance only within the local sched_domain hierarchy
+        * and abort nohz_idle_balance altogether if we pull some load.
         */
        nohz_idle_balance(this_rq, idle);
+       rebalance_domains(this_rq, idle);
 }
 
 /*
index 90284d117fe65ffc7ee1de7127995a750c84df92..91e33cd485f6577050672432c02354393887774e 100644 (file)
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
  */
 SCHED_FEAT(TTWU_QUEUE, true)
 
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * When many CPUs lower their priorities at the same time while a single
+ * CPU has a migratable RT task waiting to run, those CPUs would all try
+ * to take that CPU's rq lock and could create heavy contention. Instead,
+ * sending an IPI to that CPU and letting it push the RT task to where
+ * it should go can be the better approach.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
index 80014a17834214fcad51add08b2b171463e84128..4d207d2abcbd9d3d613c1035d5c436fcd08b3a14 100644 (file)
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void)
         * is used from another cpu as a broadcast timer, this call may
         * fail if it is not available
         */
-       if (broadcast &&
-           clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+       if (broadcast && tick_broadcast_enter())
                goto use_default;
 
        /* Take note of the planned idle state. */
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void)
        idle_set_state(this_rq(), NULL);
 
        if (broadcast)
-               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+               tick_broadcast_exit();
 
        /*
         * Give the governor an opportunity to reflect on the outcome
index f4d4b077eba0a67a5c55e6a04dee8f6ce78f322c..575da76a3874a8c1b2ddd0f518e5ecea7a805262 100644 (file)
@@ -6,6 +6,7 @@
 #include "sched.h"
 
 #include <linux/slab.h>
+#include <linux/irq_work.h>
 
 int sched_rr_timeslice = RR_TIMESLICE;
 
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
-void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
 {
        struct rt_prio_array *array;
        int i;
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
        plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+       rt_rq->push_flags = 0;
+       rt_rq->push_cpu = nr_cpu_ids;
+       raw_spin_lock_init(&rt_rq->push_lock);
+       init_irq_work(&rt_rq->push_work, push_irq_work_func);
 #endif
+#endif /* CONFIG_SMP */
        /* We start in dequeued state, because no RT tasks are queued */
        rt_rq->rt_queued = 0;
 
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
-               init_rt_rq(rt_rq, cpu_rq(i));
+               init_rt_rq(rt_rq);
                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
                ;
 }
 
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+       int prev_cpu = rq->rt.push_cpu;
+       int cpu;
+
+       cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+       /*
+        * If the previous cpu is less than the rq's CPU, then it already
+        * passed the end of the mask, and has started from the beginning.
+        * We end if the next CPU is greater or equal to rq's CPU.
+        */
+       if (prev_cpu < rq->cpu) {
+               if (cpu >= rq->cpu)
+                       return nr_cpu_ids;
+
+       } else if (cpu >= nr_cpu_ids) {
+               /*
+                * We passed the end of the mask, start at the beginning.
+                * If the result is greater or equal to the rq's CPU, then
+                * the loop is finished.
+                */
+               cpu = cpumask_first(rq->rd->rto_mask);
+               if (cpu >= rq->cpu)
+                       return nr_cpu_ids;
+       }
+       rq->rt.push_cpu = cpu;
+
+       /* Return cpu to let the caller know if the loop is finished or not */
+       return cpu;
+}
+
+static int find_next_push_cpu(struct rq *rq)
+{
+       struct rq *next_rq;
+       int cpu;
+
+       while (1) {
+               cpu = rto_next_cpu(rq);
+               if (cpu >= nr_cpu_ids)
+                       break;
+               next_rq = cpu_rq(cpu);
+
+               /* Make sure the next rq can push to this rq */
+               if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+                       break;
+       }
+
+       return cpu;
+}
+
+#define RT_PUSH_IPI_EXECUTING          1
+#define RT_PUSH_IPI_RESTART            2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+       int cpu;
+
+       if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+               raw_spin_lock(&rq->rt.push_lock);
+               /* Make sure it's still executing */
+               if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+                       /*
+                        * Tell the IPI to restart the loop as things have
+                        * changed since it started.
+                        */
+                       rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+                       raw_spin_unlock(&rq->rt.push_lock);
+                       return;
+               }
+               raw_spin_unlock(&rq->rt.push_lock);
+       }
+
+       /* When here, there's no IPI going around */
+
+       rq->rt.push_cpu = rq->cpu;
+       cpu = find_next_push_cpu(rq);
+       if (cpu >= nr_cpu_ids)
+               return;
+
+       rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+       irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
+{
+       struct rt_rq *rt_rq = arg;
+       struct rq *rq, *src_rq;
+       int this_cpu;
+       int cpu;
+
+       this_cpu = rt_rq->push_cpu;
+
+       /* Paranoid check */
+       BUG_ON(this_cpu != smp_processor_id());
+
+       rq = cpu_rq(this_cpu);
+       src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+       if (has_pushable_tasks(rq)) {
+               raw_spin_lock(&rq->lock);
+               push_rt_task(rq);
+               raw_spin_unlock(&rq->lock);
+       }
+
+       /* Pass the IPI to the next rt overloaded queue */
+       raw_spin_lock(&rt_rq->push_lock);
+       /*
+        * If the source queue changed since the IPI went out,
+        * we need to restart the search from that CPU again.
+        */
+       if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+               rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+               rt_rq->push_cpu = src_rq->cpu;
+       }
+
+       cpu = find_next_push_cpu(src_rq);
+
+       if (cpu >= nr_cpu_ids)
+               rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+       raw_spin_unlock(&rt_rq->push_lock);
+
+       if (cpu >= nr_cpu_ids)
+               return;
+
+       /*
+        * It is possible that a restart caused this CPU to be
+        * chosen again. Don't bother with an IPI, just see if we
+        * have more to push.
+        */
+       if (unlikely(cpu == rq->cpu))
+               goto again;
+
+       /* Try the next RT overloaded CPU */
+       irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+       struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+       try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
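/*
 * A userspace model of the circular walk implemented by rto_next_cpu() above.
 * The bitmask, CPU count and overloaded CPUs are made up; the point is only to
 * show that every RT-overloaded CPU except rq->cpu is visited exactly once, in
 * wrap-around order starting after rq->cpu.
 */
#include <stdio.h>

#define NR_CPUS 8

/* Next set bit strictly after 'prev', or NR_CPUS if there is none. */
static int mask_next(unsigned int mask, int prev)
{
        for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
                if (mask & (1u << cpu))
                        return cpu;
        return NR_CPUS;
}

static int rto_next(unsigned int rto_mask, int this_cpu, int prev_cpu)
{
        int cpu = mask_next(rto_mask, prev_cpu);

        if (prev_cpu < this_cpu) {              /* we already wrapped around once */
                if (cpu >= this_cpu)
                        return NR_CPUS;         /* back at the start: loop done */
        } else if (cpu >= NR_CPUS) {            /* fell off the end, wrap around */
                cpu = mask_next(rto_mask, -1);
                if (cpu >= this_cpu)
                        return NR_CPUS;
        }
        return cpu;
}

int main(void)
{
        unsigned int rto_mask = 0xa6;           /* CPUs 1, 2, 5 and 7 RT-overloaded */
        int this_cpu = 5, cpu = this_cpu;

        while ((cpu = rto_next(rto_mask, this_cpu, cpu)) < NR_CPUS)
                printf("visit cpu %d\n", cpu);  /* prints 7, 1, 2 */
        return 0;
}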
+
 static int pull_rt_task(struct rq *this_rq)
 {
        int this_cpu = this_rq->cpu, ret = 0, cpu;
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq)
         */
        smp_rmb();
 
+#ifdef HAVE_RT_PUSH_IPI
+       if (sched_feat(RT_PUSH_IPI)) {
+               tell_cpu_to_push(this_rq);
+               return 0;
+       }
+#endif
+
        for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
index dc0f435a27794657258623ac8a7f53f7326ff7ac..e0e1299939588ac47f08b13b45f1a6e2e9cf4d7f 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
+#include <linux/irq_work.h>
 #include <linux/tick.h>
 #include <linux/slab.h>
 
@@ -362,8 +363,14 @@ struct cfs_rq {
         * Under CFS, load is tracked on a per-entity basis and aggregated up.
         * This allows for the description of both thread and group usage (in
         * the FAIR_GROUP_SCHED case).
+        * runnable_load_avg is the sum of the load_avg_contrib of the
+        * sched_entities on the rq.
+        * blocked_load_avg is similar to runnable_load_avg except that it is
+        * computed over the blocked sched_entities on the rq.
+        * utilization_load_avg is the sum of the average running time of the
+        * sched_entities on the rq.
         */
-       unsigned long runnable_load_avg, blocked_load_avg;
+       unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
        atomic64_t decay_counter;
        u64 last_decay;
        atomic_long_t removed_load;
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void)
        return sysctl_sched_rt_runtime >= 0;
 }
 
+/* RT IPI pull logic requires IRQ_WORK */
+#ifdef CONFIG_IRQ_WORK
+# define HAVE_RT_PUSH_IPI
+#endif
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
        struct rt_prio_array active;
@@ -435,7 +447,13 @@ struct rt_rq {
        unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+       int push_flags;
+       int push_cpu;
+       struct irq_work push_work;
+       raw_spinlock_t push_lock;
 #endif
+#endif /* CONFIG_SMP */
        int rt_queued;
 
        int rt_throttled;
@@ -597,6 +615,7 @@ struct rq {
        struct sched_domain *sd;
 
        unsigned long cpu_capacity;
+       unsigned long cpu_capacity_orig;
 
        unsigned char idle_balance;
        /* For active balancing */
@@ -807,7 +826,7 @@ struct sched_group_capacity {
         * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
         * for a single CPU.
         */
-       unsigned int capacity, capacity_orig;
+       unsigned int capacity;
        unsigned long next_update;
        int imbalance; /* XXX unrelated to capacity but shared group state */
        /*
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq)
 
 #ifdef CONFIG_SMP
 extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+       return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
-       rq->rt_avg += rt_delta;
+       rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
        sched_avg_update(rq);
 }
 #else
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
-extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
 
 extern void cfs_bandwidth_usage_inc(void);
 extern void cfs_bandwidth_usage_dec(void);
index 88ea2d6e00314059b96adb0505ffc5f9c98fcf73..ce410bb9f2e103e0fcfda7d7b844948a0a28fbce 100644 (file)
@@ -1227,6 +1227,14 @@ static struct ctl_table vm_table[] = {
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
        },
+       {
+               .procname       = "dirtytime_expire_seconds",
+               .data           = &dirtytime_expire_interval,
+               .maxlen         = sizeof(dirty_expire_interval),
+               .mode           = 0644,
+               .proc_handler   = dirtytime_interval_handler,
+               .extra1         = &zero,
+       },
        {
                .procname       = "nr_pdflush_threads",
                .mode           = 0444 /* read-only */,
index d626dc98e8df952eff1df8ced84b2557475fe6c6..579ce1b929afde343a29fc77e4e7c4997ea18852 100644 (file)
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
 config GENERIC_CLOCKEVENTS
        bool
 
-# Migration helper. Builds, but does not invoke
-config GENERIC_CLOCKEVENTS_BUILD
-       bool
-       default y
-       depends on GENERIC_CLOCKEVENTS
-
 # Architecture can handle broadcast in a driver-agnostic way
 config ARCH_HAS_TICK_BROADCAST
        bool
index c09c07817d7a7c854a1b88b12b894f60c547cda9..01f0312419b3cb44d8fa455d8cfaa2ad14d5ef0d 100644 (file)
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
 obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
 obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
 
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)                += clockevents.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS)              += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS)              += clockevents.o tick-common.o
 ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
  obj-y                                         += tick-broadcast.o
  obj-$(CONFIG_TICK_ONESHOT)                    += tick-broadcast-hrtimer.o
 endif
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)              += sched_clock.o
-obj-$(CONFIG_TICK_ONESHOT)                     += tick-oneshot.o
-obj-$(CONFIG_TICK_ONESHOT)                     += tick-sched.o
+obj-$(CONFIG_TICK_ONESHOT)                     += tick-oneshot.o tick-sched.o
 obj-$(CONFIG_TIMER_STATS)                      += timer_stats.o
 obj-$(CONFIG_DEBUG_FS)                         += timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)                      += test_udelay.o
index 55449909f11475372135ac61b33e65114eb151ba..25d942d1da27095e6366d720a0f8de58009cb5f3 100644 (file)
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
+static int __clockevents_set_state(struct clock_event_device *dev,
+                                  enum clock_event_state state)
+{
+       /* Transition with legacy set_mode() callback */
+       if (dev->set_mode) {
+               /* Legacy callback doesn't support new modes */
+               if (state > CLOCK_EVT_STATE_ONESHOT)
+                       return -ENOSYS;
+               /*
+                * 'clock_event_state' and 'clock_event_mode' have 1-to-1
+                * mapping until *_ONESHOT, and so a simple cast will work.
+                */
+               dev->set_mode((enum clock_event_mode)state, dev);
+               dev->mode = (enum clock_event_mode)state;
+               return 0;
+       }
+
+       if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+               return 0;
+
+       /* Transition with new state-specific callbacks */
+       switch (state) {
+       case CLOCK_EVT_STATE_DETACHED:
+               /*
+                * This is an internal state, which is guaranteed to go from
+                * SHUTDOWN to DETACHED. No driver interaction required.
+                */
+               return 0;
+
+       case CLOCK_EVT_STATE_SHUTDOWN:
+               return dev->set_state_shutdown(dev);
+
+       case CLOCK_EVT_STATE_PERIODIC:
+               /* Core internal bug */
+               if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+                       return -ENOSYS;
+               return dev->set_state_periodic(dev);
+
+       case CLOCK_EVT_STATE_ONESHOT:
+               /* Core internal bug */
+               if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+                       return -ENOSYS;
+               return dev->set_state_oneshot(dev);
+
+       default:
+               return -ENOSYS;
+       }
+}
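/*
 * A self-contained illustration of why the legacy-path cast above is valid for
 * states up to ONESHOT: the two enumerations are assumed to line up value for
 * value over their first four entries, which is what the "1-to-1 mapping"
 * comment relies on. The enum bodies below are a sketch, not copied headers.
 */
#include <stdio.h>
#include <assert.h>

enum clock_event_mode  { MODE_UNUSED, MODE_SHUTDOWN, MODE_PERIODIC, MODE_ONESHOT, MODE_RESUME };
enum clock_event_state { STATE_DETACHED, STATE_SHUTDOWN, STATE_PERIODIC, STATE_ONESHOT };

int main(void)
{
        assert((enum clock_event_mode)STATE_DETACHED == MODE_UNUSED);
        assert((enum clock_event_mode)STATE_SHUTDOWN == MODE_SHUTDOWN);
        assert((enum clock_event_mode)STATE_PERIODIC == MODE_PERIODIC);
        assert((enum clock_event_mode)STATE_ONESHOT  == MODE_ONESHOT);
        /* Anything past ONESHOT has no legacy counterpart, hence the -ENOSYS above. */
        printf("legacy cast holds for states up to ONESHOT\n");
        return 0;
}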
+
 /**
- * clockevents_set_mode - set the operating mode of a clock event device
+ * clockevents_set_state - set the operating state of a clock event device
  * @dev:       device to modify
- * @mode:      new mode
+ * @state:     new state
  *
  * Must be called with interrupts disabled !
  */
-void clockevents_set_mode(struct clock_event_device *dev,
-                                enum clock_event_mode mode)
+void clockevents_set_state(struct clock_event_device *dev,
+                          enum clock_event_state state)
 {
-       if (dev->mode != mode) {
-               dev->set_mode(mode, dev);
-               dev->mode = mode;
+       if (dev->state != state) {
+               if (__clockevents_set_state(dev, state))
+                       return;
+
+               dev->state = state;
 
                /*
                 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
                 * on it, so fix it up and emit a warning:
                 */
-               if (mode == CLOCK_EVT_MODE_ONESHOT) {
+               if (state == CLOCK_EVT_STATE_ONESHOT) {
                        if (unlikely(!dev->mult)) {
                                dev->mult = 1;
                                WARN_ON(1);
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
  */
 void clockevents_shutdown(struct clock_event_device *dev)
 {
-       clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+       clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
        dev->next_event.tv64 = KTIME_MAX;
 }
 
+/**
+ * clockevents_tick_resume -   Resume the tick device before using it again
+ * @dev:                       device to resume
+ */
+int clockevents_tick_resume(struct clock_event_device *dev)
+{
+       int ret = 0;
+
+       if (dev->set_mode) {
+               dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
+               dev->mode = CLOCK_EVT_MODE_RESUME;
+       } else if (dev->tick_resume) {
+               ret = dev->tick_resume(dev);
+       }
+
+       return ret;
+}
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
 
 /* Limit min_delta to a jiffie */
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
                delta = dev->min_delta_ns;
                dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-               if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+               if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
                        return 0;
 
                dev->retries++;
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
        delta = dev->min_delta_ns;
        dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-       if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+       if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
                return 0;
 
        dev->retries++;
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 
        dev->next_event = expires;
 
-       if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+       if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
                return 0;
 
        /* Shortcut for clockevent devices that can deal with ktime. */
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
        struct clock_event_device *dev, *newdev = NULL;
 
        list_for_each_entry(dev, &clockevent_devices, list) {
-               if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
+               if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
                        continue;
 
                if (!tick_check_replacement(newdev, dev))
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
 {
        /* Fast track. Device is unused */
-       if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
+       if (ced->state == CLOCK_EVT_STATE_DETACHED) {
                list_del_init(&ced->list);
                return 0;
        }
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
 }
 EXPORT_SYMBOL_GPL(clockevents_unbind);
 
+/* Sanity check of state transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+       /* Legacy set_mode() callback */
+       if (dev->set_mode) {
+               /* We shouldn't be supporting new modes now */
+               WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
+                       dev->set_state_shutdown || dev->tick_resume);
+
+               BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+               return 0;
+       }
+
+       if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+               return 0;
+
+       /* New state-specific callbacks */
+       if (!dev->set_state_shutdown)
+               return -EINVAL;
+
+       if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+           !dev->set_state_periodic)
+               return -EINVAL;
+
+       if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+           !dev->set_state_oneshot)
+               return -EINVAL;
+
+       return 0;
+}
+
 /**
  * clockevents_register_device - register a clock event device
  * @dev:       device to register
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev)
 {
        unsigned long flags;
 
-       BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+       BUG_ON(clockevents_sanity_check(dev));
+
+       /* Initialize state to DETACHED */
+       dev->state = CLOCK_EVT_STATE_DETACHED;
+
        if (!dev->cpumask) {
                WARN_ON(num_possible_cpus() > 1);
                dev->cpumask = cpumask_of(smp_processor_id());
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 {
        clockevents_config(dev, freq);
 
-       if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+       if (dev->state == CLOCK_EVT_STATE_ONESHOT)
                return clockevents_program_event(dev, dev->next_event, false);
 
-       if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
-               dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+       if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+               return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
 
        return 0;
 }
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
  * @old:       device to release (can be NULL)
  * @new:       device to request (can be NULL)
  *
- * Called from the notifier chain. clockevents_lock is held already
+ * Called from various tick functions with clockevents_lock held and
+ * interrupts disabled.
  */
 void clockevents_exchange_device(struct clock_event_device *old,
                                 struct clock_event_device *new)
 {
-       unsigned long flags;
-
-       local_irq_save(flags);
        /*
         * Caller releases a clock event device. We queue it into the
         * released list and do a notify add later.
         */
        if (old) {
                module_put(old->owner);
-               clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+               clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
                list_del(&old->list);
                list_add(&old->list, &clockevents_released);
        }
 
        if (new) {
-               BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+               BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
                clockevents_shutdown(new);
        }
-       local_irq_restore(flags);
 }
 
 /**
@@ -541,74 +642,40 @@ void clockevents_resume(void)
                        dev->resume(dev);
 }
 
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
+#ifdef CONFIG_HOTPLUG_CPU
 /**
- * clockevents_notify - notification about relevant events
- * Returns 0 on success, any other value on error
+ * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
  */
-int clockevents_notify(unsigned long reason, void *arg)
+void tick_cleanup_dead_cpu(int cpu)
 {
        struct clock_event_device *dev, *tmp;
        unsigned long flags;
-       int cpu, ret = 0;
 
        raw_spin_lock_irqsave(&clockevents_lock, flags);
 
-       switch (reason) {
-       case CLOCK_EVT_NOTIFY_BROADCAST_ON:
-       case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-       case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
-               tick_broadcast_on_off(reason, arg);
-               break;
-
-       case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
-       case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
-               ret = tick_broadcast_oneshot_control(reason);
-               break;
-
-       case CLOCK_EVT_NOTIFY_CPU_DYING:
-               tick_handover_do_timer(arg);
-               break;
-
-       case CLOCK_EVT_NOTIFY_SUSPEND:
-               tick_suspend();
-               tick_suspend_broadcast();
-               break;
-
-       case CLOCK_EVT_NOTIFY_RESUME:
-               tick_resume();
-               break;
-
-       case CLOCK_EVT_NOTIFY_CPU_DEAD:
-               tick_shutdown_broadcast_oneshot(arg);
-               tick_shutdown_broadcast(arg);
-               tick_shutdown(arg);
-               /*
-                * Unregister the clock event devices which were
-                * released from the users in the notify chain.
-                */
-               list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+       tick_shutdown_broadcast_oneshot(cpu);
+       tick_shutdown_broadcast(cpu);
+       tick_shutdown(cpu);
+       /*
+        * Unregister the clock event devices which were
+        * released from the users in the notify chain.
+        */
+       list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+               list_del(&dev->list);
+       /*
+        * Now check whether the CPU has left unused per cpu devices
+        */
+       list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+               if (cpumask_test_cpu(cpu, dev->cpumask) &&
+                   cpumask_weight(dev->cpumask) == 1 &&
+                   !tick_is_broadcast_device(dev)) {
+                       BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
                        list_del(&dev->list);
-               /*
-                * Now check whether the CPU has left unused per cpu devices
-                */
-               cpu = *((int *)arg);
-               list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
-                       if (cpumask_test_cpu(cpu, dev->cpumask) &&
-                           cpumask_weight(dev->cpumask) == 1 &&
-                           !tick_is_broadcast_device(dev)) {
-                               BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
-                               list_del(&dev->list);
-                       }
                }
-               break;
-       default:
-               break;
        }
        raw_spin_unlock_irqrestore(&clockevents_lock, flags);
-       return ret;
 }
-EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
 
 #ifdef CONFIG_SYSFS
 struct bus_type clockevents_subsys = {
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void)
 }
 device_initcall(clockevents_init_sysfs);
 #endif /* SYSFS */
-
-#endif /* GENERIC_CLOCK_EVENTS */
index 4892352f0e4989c561c5d16ba3b27c063082a8f6..15facb1b9c606c7a5fa5ea3500ea7dd4bf523477 100644 (file)
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
                schedule_work(&watchdog_work);
 }
 
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
-{
-       printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
-              cs->name, delta);
-       __clocksource_unstable(cs);
-}
-
 /**
  * clocksource_mark_unstable - mark clocksource unstable via watchdog
  * @cs:                clocksource to be marked unstable
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 static void clocksource_watchdog(unsigned long data)
 {
        struct clocksource *cs;
-       cycle_t csnow, wdnow, delta;
+       cycle_t csnow, wdnow, cslast, wdlast, delta;
        int64_t wd_nsec, cs_nsec;
        int next_cpu, reset_pending;
 
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
 
                delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
                cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+               wdlast = cs->wd_last; /* save these in case we print them */
+               cslast = cs->cs_last;
                cs->cs_last = csnow;
                cs->wd_last = wdnow;
 
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
 
                /* Check the deviation from the watchdog clocksource. */
                if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
-                       clocksource_unstable(cs, cs_nsec - wd_nsec);
+                       pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
+                       pr_warn("       '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+                               watchdog->name, wdnow, wdlast, watchdog->mask);
+                       pr_warn("       '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+                               cs->name, csnow, cslast, cs->mask);
+                       __clocksource_unstable(cs);
                        continue;
                }
 
@@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
  * @shift:     cycle to nanosecond divisor (power of two)
  * @maxadj:    maximum adjustment value to mult (~11%)
  * @mask:      bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc:   maximum cycle value before potential overflow (does not include
+ *             any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, in other words, we
+ * return half the number of nanoseconds the hardware counter can technically
+ * cover. This is done so that we can potentially detect problems caused by
+ * delayed timers or bad hardware, which might result in time intervals that
+ * are larger than what the math used can handle without overflows.
  */
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
 {
        u64 max_nsecs, max_cycles;
 
        /*
         * Calculate the maximum number of cycles that we can pass to the
-        * cyc2ns function without overflowing a 64-bit signed result. The
-        * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
-        * which is equivalent to the below.
-        * max_cycles < (2^63)/(mult + maxadj)
-        * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
-        * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
-        * max_cycles < 2^(63 - log2(mult + maxadj))
-        * max_cycles < 1 << (63 - log2(mult + maxadj))
-        * Please note that we add 1 to the result of the log2 to account for
-        * any rounding errors, ensure the above inequality is satisfied and
-        * no overflow will occur.
+        * cyc2ns() function without overflowing a 64-bit result.
         */
-       max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+       max_cycles = ULLONG_MAX;
+       do_div(max_cycles, mult+maxadj);
 
        /*
         * The actual maximum number of cycles we can defer the clocksource is
@@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
        max_cycles = min(max_cycles, mask);
        max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
 
+       /* return the max_cycles value as well if requested */
+       if (max_cyc)
+               *max_cyc = max_cycles;
+
+       /* Return 50% of the actual maximum, so we can detect bad values */
+       max_nsecs >>= 1;
+
        return max_nsecs;
 }
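/*
 * A worked, userspace-only example of the flow above: bound the cycle count so
 * cyc2ns() cannot overflow 64 bits, clamp it to the counter mask, convert with
 * the reduced mult, then keep only 50%. The mult/shift/mask values are
 * hypothetical, and maxadj is approximated here as ~11% of mult.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        uint32_t mult = 4194304, shift = 22;
        uint32_t maxadj = mult * 11 / 100;              /* ~11% adjustment headroom */
        uint64_t mask = 0xffffffffULL;                  /* 32-bit free-running counter */
        uint64_t max_cycles, max_nsecs;

        max_cycles = UINT64_MAX / (mult + maxadj);      /* keeps cyc2ns() within 64 bits */
        if (max_cycles > mask)
                max_cycles = mask;                      /* the counter wraps first */

        max_nsecs = cyc2ns(max_cycles, mult - maxadj, shift);
        max_nsecs >>= 1;                                /* 50% margin for late timers */

        printf("max_cycles=%llu max_idle_ns=%llu\n",
               (unsigned long long)max_cycles, (unsigned long long)max_nsecs);
        return 0;
}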
 
 /**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs:         Pointer to clocksource
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs:         Pointer to clocksource to be updated
  *
  */
-static u64 clocksource_max_deferment(struct clocksource *cs)
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
 {
-       u64 max_nsecs;
-
-       max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
-                                         cs->mask);
-       /*
-        * To ensure that the clocksource does not wrap whilst we are idle,
-        * limit the time the clocksource can be deferred by 12.5%. Please
-        * note a margin of 12.5% is used because this can be computed with
-        * a shift, versus say 10% which would require division.
-        */
-       return max_nsecs - (max_nsecs >> 3);
+       cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+                                               cs->maxadj, cs->mask,
+                                               &cs->max_cycles);
 }
 
 #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs)
 }
 
 /**
- * __clocksource_updatefreq_scale - Used update clocksource with new freq
+ * __clocksource_update_freq_scale - Used to update clocksource with new freq
  * @cs:                clocksource to be registered
  * @scale:     Scale factor multiplied against freq to get clocksource hz
  * @freq:      clocksource frequency (cycles per second) divided by scale
@@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs)
  * This should only be called from the clocksource->enable() method.
  *
  * This *SHOULD NOT* be called directly! Please use the
- * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
  */
-void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
        u64 sec;
+
        /*
-        * Calc the maximum number of seconds which we can run before
-        * wrapping around. For clocksources which have a mask > 32bit
-        * we need to limit the max sleep time to have a good
-        * conversion precision. 10 minutes is still a reasonable
-        * amount. That results in a shift value of 24 for a
-        * clocksource with mask >= 40bit and f >= 4GHz. That maps to
-        * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
-        * margin as we do in clocksource_max_deferment()
+        * Default clocksources are *special* and self-define their mult/shift.
+        * But, you're not special, so you should specify a freq value.
         */
-       sec = (cs->mask - (cs->mask >> 3));
-       do_div(sec, freq);
-       do_div(sec, scale);
-       if (!sec)
-               sec = 1;
-       else if (sec > 600 && cs->mask > UINT_MAX)
-               sec = 600;
-
-       clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
-                              NSEC_PER_SEC / scale, sec * scale);
-
+       if (freq) {
+               /*
+                * Calc the maximum number of seconds which we can run before
+                * wrapping around. For clocksources which have a mask > 32-bit
+                * we need to limit the max sleep time to have a good
+                * conversion precision. 10 minutes is still a reasonable
+                * amount. That results in a shift value of 24 for a
+                * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+                * ~ 0.06ppm granularity for NTP.
+                */
+               sec = cs->mask;
+               do_div(sec, freq);
+               do_div(sec, scale);
+               if (!sec)
+                       sec = 1;
+               else if (sec > 600 && cs->mask > UINT_MAX)
+                       sec = 600;
+
+               clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+                                      NSEC_PER_SEC / scale, sec * scale);
+       }
        /*
-        * for clocksources that have large mults, to avoid overflow.
-        * Since mult may be adjusted by ntp, add an safety extra margin
-        *
+        * Ensure clocksources that have large 'mult' values don't overflow
+        * when adjusted.
         */
        cs->maxadj = clocksource_max_adjustment(cs);
-       while ((cs->mult + cs->maxadj < cs->mult)
-               || (cs->mult - cs->maxadj > cs->mult)) {
+       while (freq && ((cs->mult + cs->maxadj < cs->mult)
+               || (cs->mult - cs->maxadj > cs->mult))) {
                cs->mult >>= 1;
                cs->shift--;
                cs->maxadj = clocksource_max_adjustment(cs);
        }
 
-       cs->max_idle_ns = clocksource_max_deferment(cs);
+       /*
+        * Only warn for *special* clocksources that self-define
+        * their mult/shift values and don't specify a freq.
+        */
+       WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+               "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+               cs->name);
+
+       clocksource_update_max_deferment(cs);
+
+       pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+                       cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
 }
-EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
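
A hedged sketch of how a driver might sit on top of the renamed interface, assuming the __clocksource_update_freq_hz() wrapper named in the kerneldoc above keeps a (cs, hz) shape; my_clocksource_enable() and my_clk_get_rate() are made-up names for illustration only.

#include <linux/clocksource.h>

extern unsigned long my_clk_get_rate(void);	/* hypothetical rate query */

static int my_clocksource_enable(struct clocksource *cs)
{
	/* Recompute mult/shift, maxadj, max_idle_ns and max_cycles for the new rate */
	__clocksource_update_freq_hz(cs, my_clk_get_rate());
	return 0;
}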
 
 /**
  * __clocksource_register_scale - Used to install new clocksources
@@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 
        /* Initialize mult/shift and max_idle_ns */
-       __clocksource_updatefreq_scale(cs, scale, freq);
+       __clocksource_update_freq_scale(cs, scale, freq);
 
        /* Add clocksource to the clocksource list */
        mutex_lock(&clocksource_mutex);
@@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 }
 EXPORT_SYMBOL_GPL(__clocksource_register_scale);
 
-
-/**
- * clocksource_register - Used to install new clocksources
- * @cs:                clocksource to be registered
- *
- * Returns -EBUSY if registration fails, zero otherwise.
- */
-int clocksource_register(struct clocksource *cs)
-{
-       /* calculate max adjustment for given mult/shift */
-       cs->maxadj = clocksource_max_adjustment(cs);
-       WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
-               "Clocksource %s might overflow on 11%% adjustment\n",
-               cs->name);
-
-       /* calculate max idle time permitted for this clocksource */
-       cs->max_idle_ns = clocksource_max_deferment(cs);
-
-       mutex_lock(&clocksource_mutex);
-       clocksource_enqueue(cs);
-       clocksource_enqueue_watchdog(cs);
-       clocksource_select();
-       mutex_unlock(&clocksource_mutex);
-       return 0;
-}
-EXPORT_SYMBOL(clocksource_register);
-
 static void __clocksource_change_rating(struct clocksource *cs, int rating)
 {
        list_del(&cs->list);
index bee0c1f780911a97a4598b81089c9bc4d807d037..76d4bd962b19b3bab345460676954ef6f7c14568 100644 (file)
@@ -54,7 +54,7 @@
 
 #include <trace/events/timer.h>
 
-#include "timekeeping.h"
+#include "tick-internal.h"
 
 /*
  * The timer bases:
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
                break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DYING:
-       case CPU_DYING_FROZEN:
-               clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
-               break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-       {
-               clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
                migrate_hrtimers(scpu);
                break;
-       }
 #endif
 
        default:
index a6a5bf53e86d25575f90518399407a4fb65a85ed..347fecf86a3fb2242e0a88b63975424f0293dde7 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
-#include "tick-internal.h"
+#include "timekeeping.h"
 
 /* The Jiffies based clocksource is the lowest common
  * denominator clock source which should function on
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
        .mask           = 0xffffffff, /*32bits*/
        .mult           = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
        .shift          = JIFFIES_SHIFT,
+       .max_cycles     = 10,
 };
 
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
 
 static int __init init_jiffies_clocksource(void)
 {
-       return clocksource_register(&clocksource_jiffies);
+       return __clocksource_register(&clocksource_jiffies);
 }
 
 core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
 
        refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
 
-       clocksource_register(&refined_jiffies);
+       __clocksource_register(&refined_jiffies);
        return 0;
 }
index 0f60b08a4f073e9246ced1dc3b5de5f50efd7cf4..7a681003001c0ee75631e2c5c56e528ec0ea98df 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/module.h>
 #include <linux/rtc.h>
 
-#include "tick-internal.h"
 #include "ntp_internal.h"
 
 /*
@@ -459,6 +458,16 @@ out:
        return leap;
 }
 
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock64(struct timespec64 now64)
+{
+       struct timespec now;
+
+       now = timespec64_to_timespec(now64);
+       return update_persistent_clock(now);
+}
+#endif
+
 #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
 static void sync_cmos_clock(struct work_struct *work);
 
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work)
                if (persistent_clock_is_local)
                        adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
-               fail = update_persistent_clock(timespec64_to_timespec(adjust));
+               fail = update_persistent_clock64(adjust);
 #endif
+
 #ifdef CONFIG_RTC_SYSTOHC
                if (fail == -ENODEV)
                        fail = rtc_set_ntp_time(adjust);
index 01d2d15aa66233dc62db43f8e988a0f5519a729b..a26036d37a3895f163a20abdde5c6361d0110cf1 100644 (file)
@@ -1,5 +1,6 @@
 /*
- * sched_clock.c: support for extending counters to full 64-bit ns counter
+ * sched_clock.c: Generic sched_clock() support, to extend low level
+ *                hardware time counters to full 64-bit ns values.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
 #include <linux/seqlock.h>
 #include <linux/bitops.h>
 
-struct clock_data {
-       ktime_t wrap_kt;
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns:          sched_clock() value at last update
+ * @epoch_cyc:         Clock cycle value at last update.
+ * @sched_clock_mask:   Bitmask for two's complement subtraction of non-64-bit
+ *                     clocks.
+ * @read_sched_clock:  Current clock source (or dummy source when suspended).
+ * @mult:              Multiplier for scaled math conversion.
+ * @shift:             Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
        u64 epoch_ns;
        u64 epoch_cyc;
-       seqcount_t seq;
-       unsigned long rate;
+       u64 sched_clock_mask;
+       u64 (*read_sched_clock)(void);
        u32 mult;
        u32 shift;
-       bool suspended;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ *                     registration of a new clock source)
+ *
+ * @seq:               Sequence counter for protecting updates. The lowest
+ *                     bit is the index for @read_data.
+ * @read_data:         Data required to read from sched_clock.
+ * @wrap_kt:           Duration for which clock can run before wrapping.
+ * @rate:              Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+       seqcount_t              seq;
+       struct clock_read_data  read_data[2];
+       ktime_t                 wrap_kt;
+       unsigned long           rate;
+
+       u64 (*actual_read_sched_clock)(void);
 };
 
 static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
 
 core_param(irqtime, irqtime, int, 0400);
 
-static struct clock_data cd = {
-       .mult   = NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
 static u64 notrace jiffy_sched_clock_read(void)
 {
        /*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
        return (u64)(jiffies - INITIAL_JIFFIES);
 }
 
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd ____cacheline_aligned = {
+       .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+                         .read_sched_clock = jiffy_sched_clock_read, },
+       .actual_read_sched_clock = jiffy_sched_clock_read,
+};
 
 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 unsigned long long notrace sched_clock(void)
 {
-       u64 epoch_ns;
-       u64 epoch_cyc;
-       u64 cyc;
+       u64 cyc, res;
        unsigned long seq;
-
-       if (cd.suspended)
-               return cd.epoch_ns;
+       struct clock_read_data *rd;
 
        do {
-               seq = raw_read_seqcount_begin(&cd.seq);
-               epoch_cyc = cd.epoch_cyc;
-               epoch_ns = cd.epoch_ns;
+               seq = raw_read_seqcount(&cd.seq);
+               rd = cd.read_data + (seq & 1);
+
+               cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+                     rd->sched_clock_mask;
+               res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
        } while (read_seqcount_retry(&cd.seq, seq));
 
-       cyc = read_sched_clock();
-       cyc = (cyc - epoch_cyc) & sched_clock_mask;
-       return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+       return res;
+}
+
+/*
+ * Updating the data required to read the clock.
+ *
+ * sched_clock() will never observe mismatched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+       /* update the backup (odd) copy with the new data */
+       cd.read_data[1] = *rd;
+
+       /* steer readers towards the odd copy */
+       raw_write_seqcount_latch(&cd.seq);
+
+       /* now it's safe for us to update the normal (even) copy */
+       cd.read_data[0] = *rd;
+
+       /* switch readers back to the even copy */
+       raw_write_seqcount_latch(&cd.seq);
 }
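
A minimal single-threaded illustration (not kernel code) of the odd/even latch indexing that update_clock_read_data() and sched_clock() rely on: readers always pick data[seq & 1], while the writer fills the other slot before bumping the sequence count. The seqcount memory barriers and retry loop of the real code are deliberately omitted here.

#include <stdio.h>

struct snapshot { unsigned long long epoch_ns, epoch_cyc; };

static struct snapshot data[2];
static unsigned int seq;	/* even: readers use data[0]; odd: data[1] */

static void publish(struct snapshot s)
{
	data[1] = s;	/* update the backup (odd) copy first */
	seq++;		/* steer readers to data[1] */
	data[0] = s;	/* then update the normal (even) copy */
	seq++;		/* and switch readers back */
}

static struct snapshot read_snapshot(void)
{
	return data[seq & 1];
}

int main(void)
{
	publish((struct snapshot){ .epoch_ns = 1000, .epoch_cyc = 42 });
	printf("ns=%llu cyc=%llu\n", read_snapshot().epoch_ns,
	       read_snapshot().epoch_cyc);
	return 0;
}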
 
 /*
- * Atomically update the sched_clock epoch.
+ * Atomically update the sched_clock() epoch.
  */
-static void notrace update_sched_clock(void)
+static void update_sched_clock(void)
 {
-       unsigned long flags;
        u64 cyc;
        u64 ns;
+       struct clock_read_data rd;
+
+       rd = cd.read_data[0];
+
+       cyc = cd.actual_read_sched_clock();
+       ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+       rd.epoch_ns = ns;
+       rd.epoch_cyc = cyc;
 
-       cyc = read_sched_clock();
-       ns = cd.epoch_ns +
-               cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-                         cd.mult, cd.shift);
-
-       raw_local_irq_save(flags);
-       raw_write_seqcount_begin(&cd.seq);
-       cd.epoch_ns = ns;
-       cd.epoch_cyc = cyc;
-       raw_write_seqcount_end(&cd.seq);
-       raw_local_irq_restore(flags);
+       update_clock_read_data(&rd);
 }
 
 static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
        update_sched_clock();
        hrtimer_forward_now(hrt, cd.wrap_kt);
+
        return HRTIMER_RESTART;
 }
 
-void __init sched_clock_register(u64 (*read)(void), int bits,
-                                unsigned long rate)
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 {
        u64 res, wrap, new_mask, new_epoch, cyc, ns;
        u32 new_mult, new_shift;
-       ktime_t new_wrap_kt;
        unsigned long r;
        char r_unit;
+       struct clock_read_data rd;
 
        if (cd.rate > rate)
                return;
 
        WARN_ON(!irqs_disabled());
 
-       /* calculate the mult/shift to convert counter ticks to ns. */
+       /* Calculate the mult/shift to convert counter ticks to ns. */
        clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
 
        new_mask = CLOCKSOURCE_MASK(bits);
+       cd.rate = rate;
+
+       /* Calculate how many nanosecs until we risk wrapping */
+       wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+       cd.wrap_kt = ns_to_ktime(wrap);
 
-       /* calculate how many ns until we wrap */
-       wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
-       new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+       rd = cd.read_data[0];
 
-       /* update epoch for new counter and update epoch_ns from old counter*/
+       /* Update epoch for new counter and update 'epoch_ns' from old counter */
        new_epoch = read();
-       cyc = read_sched_clock();
-       ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-                         cd.mult, cd.shift);
+       cyc = cd.actual_read_sched_clock();
+       ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+       cd.actual_read_sched_clock = read;
 
-       raw_write_seqcount_begin(&cd.seq);
-       read_sched_clock = read;
-       sched_clock_mask = new_mask;
-       cd.rate = rate;
-       cd.wrap_kt = new_wrap_kt;
-       cd.mult = new_mult;
-       cd.shift = new_shift;
-       cd.epoch_cyc = new_epoch;
-       cd.epoch_ns = ns;
-       raw_write_seqcount_end(&cd.seq);
+       rd.read_sched_clock     = read;
+       rd.sched_clock_mask     = new_mask;
+       rd.mult                 = new_mult;
+       rd.shift                = new_shift;
+       rd.epoch_cyc            = new_epoch;
+       rd.epoch_ns             = ns;
+
+       update_clock_read_data(&rd);
 
        r = rate;
        if (r >= 4000000) {
                r /= 1000000;
                r_unit = 'M';
-       } else if (r >= 1000) {
-               r /= 1000;
-               r_unit = 'k';
-       } else
-               r_unit = ' ';
-
-       /* calculate the ns resolution of this counter */
+       } else {
+               if (r >= 1000) {
+                       r /= 1000;
+                       r_unit = 'k';
+               } else {
+                       r_unit = ' ';
+               }
+       }
+
+       /* Calculate the ns resolution of this counter */
        res = cyc_to_ns(1ULL, new_mult, new_shift);
 
        pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
                bits, r, r_unit, res, wrap);
 
-       /* Enable IRQ time accounting if we have a fast enough sched_clock */
+       /* Enable IRQ time accounting if we have a fast enough sched_clock() */
        if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
                enable_sched_clock_irqtime();
 
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 void __init sched_clock_postinit(void)
 {
        /*
-        * If no sched_clock function has been provided at that point,
+        * If no sched_clock() function has been provided at that point,
         * make it the final one.
         */
-       if (read_sched_clock == jiffy_sched_clock_read)
+       if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
                sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
 
        update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
        hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
 }
 
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+       unsigned long seq = raw_read_seqcount(&cd.seq);
+
+       return cd.read_data[seq & 1].epoch_cyc;
+}
+
 static int sched_clock_suspend(void)
 {
+       struct clock_read_data *rd = &cd.read_data[0];
+
        update_sched_clock();
        hrtimer_cancel(&sched_clock_timer);
-       cd.suspended = true;
+       rd->read_sched_clock = suspended_sched_clock_read;
+
        return 0;
 }
 
 static void sched_clock_resume(void)
 {
-       cd.epoch_cyc = read_sched_clock();
+       struct clock_read_data *rd = &cd.read_data[0];
+
+       rd->epoch_cyc = cd.actual_read_sched_clock();
        hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
-       cd.suspended = false;
+       rd->read_sched_clock = cd.actual_read_sched_clock;
 }
 
 static struct syscore_ops sched_clock_ops = {
-       .suspend = sched_clock_suspend,
-       .resume = sched_clock_resume,
+       .suspend        = sched_clock_suspend,
+       .resume         = sched_clock_resume,
 };
 
 static int __init sched_clock_syscore_init(void)
 {
        register_syscore_ops(&sched_clock_ops);
+
        return 0;
 }
 device_initcall(sched_clock_syscore_init);
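
A hedged sketch of a platform timer driver feeding the reworked core; sched_clock_register() is the interface shown above, while my_timer_base, MY_COUNTER_OFF and MY_TIMER_RATE are hypothetical placeholders.

#include <linux/init.h>
#include <linux/io.h>
#include <linux/sched_clock.h>

#define MY_COUNTER_OFF	0x08		/* hypothetical counter register offset */
#define MY_TIMER_RATE	24000000	/* 24 MHz, illustrative only */

static void __iomem *my_timer_base;	/* assumed to be mapped by the driver */

static u64 notrace my_counter_read(void)
{
	return readl_relaxed(my_timer_base + MY_COUNTER_OFF);
}

static void __init my_timer_init(void)
{
	/* 32-bit counter at 24 MHz: sched_clock() wraps roughly every 179 s */
	sched_clock_register(my_counter_read, 32, MY_TIMER_RATE);
}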
index 066f0ec05e487396315356df0ea04c8563ffa390..7e8ca4f448a88c5ad5708106bbd889e22715b3ad 100644 (file)
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
 static cpumask_var_t tick_broadcast_on;
 static cpumask_var_t tmpmask;
 static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
-static int tick_broadcast_force;
+static int tick_broadcast_forced;
 
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
 #else
 static inline void tick_broadcast_clear_oneshot(int cpu) { }
+static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
 #endif
 
 /*
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
        /*
         * The device is in periodic mode. No reprogramming necessary:
         */
-       if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+       if (dev->state == CLOCK_EVT_STATE_PERIODIC)
                goto unlock;
 
        /*
@@ -324,49 +326,54 @@ unlock:
        raw_spin_unlock(&tick_broadcast_lock);
 }
 
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_control - Enable/disable or force broadcast mode
+ * @mode:      The selected broadcast mode
+ *
+ * Called when the system enters a state where affected tick devices
+ * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
  */
-static void tick_do_broadcast_on_off(unsigned long *reason)
+void tick_broadcast_control(enum tick_broadcast_mode mode)
 {
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
-       unsigned long flags;
        int cpu, bc_stopped;
 
-       raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
-       cpu = smp_processor_id();
-       td = &per_cpu(tick_cpu_device, cpu);
+       td = this_cpu_ptr(&tick_cpu_device);
        dev = td->evtdev;
-       bc = tick_broadcast_device.evtdev;
 
        /*
         * Is the device not affected by the powerstate ?
         */
        if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
-               goto out;
+               return;
 
        if (!tick_device_is_functional(dev))
-               goto out;
+               return;
 
+       raw_spin_lock(&tick_broadcast_lock);
+       cpu = smp_processor_id();
+       bc = tick_broadcast_device.evtdev;
        bc_stopped = cpumask_empty(tick_broadcast_mask);
 
-       switch (*reason) {
-       case CLOCK_EVT_NOTIFY_BROADCAST_ON:
-       case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+       switch (mode) {
+       case TICK_BROADCAST_FORCE:
+               tick_broadcast_forced = 1;
+       case TICK_BROADCAST_ON:
                cpumask_set_cpu(cpu, tick_broadcast_on);
                if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                                clockevents_shutdown(dev);
                }
-               if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
-                       tick_broadcast_force = 1;
                break;
-       case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-               if (tick_broadcast_force)
+
+       case TICK_BROADCAST_OFF:
+               if (tick_broadcast_forced)
                        break;
                cpumask_clear_cpu(cpu, tick_broadcast_on);
                if (!tick_device_is_functional(dev))
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
                else
                        tick_broadcast_setup_oneshot(bc);
        }
-out:
-       raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop.
- */
-void tick_broadcast_on_off(unsigned long reason, int *oncpu)
-{
-       if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
-               printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
-                      "offline CPU #%d\n", *oncpu);
-       else
-               tick_do_broadcast_on_off(&reason);
+       raw_spin_unlock(&tick_broadcast_lock);
 }
+EXPORT_SYMBOL_GPL(tick_broadcast_control);
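
A hedged sketch of a direct caller replacing the old clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ON/OFF, ...) pattern with the new function, assuming its declaration is exposed via linux/tick.h; per the kerneldoc above it must run with interrupts disabled, and my_local_timer_may_stop() is a made-up predicate.

#include <linux/irqflags.h>
#include <linux/tick.h>

extern bool my_local_timer_may_stop(void);	/* hypothetical C3STOP-style query */

static void my_update_broadcast_mode(void)
{
	unsigned long flags;

	local_irq_save(flags);
	if (my_local_timer_may_stop())
		tick_broadcast_control(TICK_BROADCAST_ON);
	else
		tick_broadcast_control(TICK_BROADCAST_OFF);
	local_irq_restore(flags);
}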
 
 /*
  * Set the periodic handler depending on broadcast on/off
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
                dev->event_handler = tick_handle_periodic_broadcast;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * Remove a CPU from broadcasting
  */
-void tick_shutdown_broadcast(unsigned int *cpup)
+void tick_shutdown_broadcast(unsigned int cpu)
 {
        struct clock_event_device *bc;
        unsigned long flags;
-       unsigned int cpu = *cpup;
 
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
 
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
+#endif
 
 void tick_suspend_broadcast(void)
 {
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void)
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
-int tick_resume_broadcast(void)
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+       if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+               return false;
+       else
+               return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
 {
        struct clock_event_device *bc;
        unsigned long flags;
-       int broadcast = 0;
 
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
        bc = tick_broadcast_device.evtdev;
 
        if (bc) {
-               clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+               clockevents_tick_resume(bc);
 
                switch (tick_broadcast_device.mode) {
                case TICKDEV_MODE_PERIODIC:
                        if (!cpumask_empty(tick_broadcast_mask))
                                tick_broadcast_start_periodic(bc);
-                       broadcast = cpumask_test_cpu(smp_processor_id(),
-                                                    tick_broadcast_mask);
                        break;
                case TICKDEV_MODE_ONESHOT:
                        if (!cpumask_empty(tick_broadcast_mask))
-                               broadcast = tick_resume_broadcast_oneshot(bc);
+                               tick_resume_broadcast_oneshot(bc);
                        break;
                }
        }
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-
-       return broadcast;
 }
 
-
 #ifdef CONFIG_TICK_ONESHOT
 
 static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
 {
        int ret;
 
-       if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
-               clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+       if (bc->state != CLOCK_EVT_STATE_ONESHOT)
+               clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
 
        ret = clockevents_program_event(bc, expires, force);
        if (!ret)
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
        return ret;
 }
 
-int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
-       clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
-       return 0;
+       clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
 }
 
 /*
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
                 * switched over, leave the device alone.
                 */
                if (td->mode == TICKDEV_MODE_ONESHOT) {
-                       clockevents_set_mode(td->evtdev,
-                                            CLOCK_EVT_MODE_ONESHOT);
+                       clockevents_set_state(td->evtdev,
+                                             CLOCK_EVT_STATE_ONESHOT);
                }
        }
 }
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
                if (dev->next_event.tv64 < bc->next_event.tv64)
                        return;
        }
-       clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+       clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 }
 
-static void broadcast_move_bc(int deadcpu)
-{
-       struct clock_event_device *bc = tick_broadcast_device.evtdev;
-
-       if (!bc || !broadcast_needs_cpu(bc, deadcpu))
-               return;
-       /* This moves the broadcast assignment to this cpu */
-       clockevents_program_event(bc, bc->next_event, 1);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state:     The target state (enter/exit)
+ *
+ * The system enters/leaves a state, where affected devices might stop
  * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
  */
-int tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 {
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
-       unsigned long flags;
-       ktime_t now;
        int cpu, ret = 0;
+       ktime_t now;
 
        /*
         * Periodic mode does not care about the enter/exit of power
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason)
         * We are called with preemption disabled from the depth of the
         * idle code, so we can't be moved away.
         */
-       cpu = smp_processor_id();
-       td = &per_cpu(tick_cpu_device, cpu);
+       td = this_cpu_ptr(&tick_cpu_device);
        dev = td->evtdev;
 
        if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
                return 0;
 
+       raw_spin_lock(&tick_broadcast_lock);
        bc = tick_broadcast_device.evtdev;
+       cpu = smp_processor_id();
 
-       raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-       if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+       if (state == TICK_BROADCAST_ENTER) {
                if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
                        WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
                        broadcast_shutdown_local(bc, dev);
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
                        cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
        } else {
                if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
-                       clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+                       clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
                        /*
                         * The cpu which was handling the broadcast
                         * timer marked this cpu in the broadcast
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason)
                }
        }
 out:
-       raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+       raw_spin_unlock(&tick_broadcast_lock);
        return ret;
 }
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
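
A hedged sketch of the per-idle-entry usage implied by the rework: an idle path hands its next event to the broadcast device before entering a state that stops the local timer, and takes it back afterwards. TICK_BROADCAST_ENTER appears in the code above; TICK_BROADCAST_EXIT is assumed to be its counterpart, and enter_deep_idle_state() is a made-up placeholder.

extern void enter_deep_idle_state(void);	/* hypothetical low-power entry */

static void my_idle_with_broadcast(void)
{
	/* Interrupts are already disabled on the idle path */
	if (tick_broadcast_oneshot_control(TICK_BROADCAST_ENTER)) {
		/* -EBUSY: this CPU owns the broadcast timer, pick a shallower state */
		return;
	}

	enter_deep_idle_state();

	tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT);
}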
 
 /*
  * Reset the one shot broadcast for a cpu
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 
        /* Set it up only once ! */
        if (bc->event_handler != tick_handle_oneshot_broadcast) {
-               int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+               int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
 
                bc->event_handler = tick_handle_oneshot_broadcast;
 
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
                           tick_broadcast_oneshot_mask, tmpmask);
 
                if (was_periodic && !cpumask_empty(tmpmask)) {
-                       clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+                       clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
                        tick_broadcast_init_next_event(tmpmask,
                                                       tick_next_period);
                        tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void)
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+void hotplug_cpu__broadcast_tick_pull(int deadcpu)
+{
+       struct clock_event_device *bc;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+       bc = tick_broadcast_device.evtdev;
+
+       if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+               /* This moves the broadcast assignment to this CPU: */
+               clockevents_program_event(bc, bc->next_event, 1);
+       }
+       raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
 
 /*
  * Remove a dead CPU from broadcasting
  */
-void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+void tick_shutdown_broadcast_oneshot(unsigned int cpu)
 {
        unsigned long flags;
-       unsigned int cpu = *cpup;
 
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
        cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
        cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
 
-       broadcast_move_bc(cpu);
-
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
+#endif
 
 /*
  * Check, whether the broadcast device is in one shot mode
index f7c515595b42b2bf9794a8f3f4ee1f9a2c17df89..3ae6afa1eb98e71cc82272cd0a79a25101eff429 100644 (file)
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
 
        tick_periodic(cpu);
 
-       if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+       if (dev->state != CLOCK_EVT_STATE_ONESHOT)
                return;
        for (;;) {
                /*
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 
        if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
            !tick_broadcast_oneshot_active()) {
-               clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+               clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
        } else {
                unsigned long seq;
                ktime_t next;
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
                        next = tick_next_period;
                } while (read_seqretry(&jiffies_lock, seq));
 
-               clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+               clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
 
                for (;;) {
                        if (!clockevents_program_event(dev, next, false))
@@ -332,14 +332,16 @@ out_bc:
        tick_install_broadcast_device(newdev);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * Transfer the do_timer job away from a dying cpu.
  *
- * Called with interrupts disabled.
+ * Called with interrupts disabled. No locking required. If
+ * tick_do_timer_cpu is owned by this CPU, nothing can change it.
  */
-void tick_handover_do_timer(int *cpup)
+void tick_handover_do_timer(void)
 {
-       if (*cpup == tick_do_timer_cpu) {
+       if (tick_do_timer_cpu == smp_processor_id()) {
                int cpu = cpumask_first(cpu_online_mask);
 
                tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup)
  * access the hardware device itself.
  * We just set the mode and remove it from the lists.
  */
-void tick_shutdown(unsigned int *cpup)
+void tick_shutdown(unsigned int cpu)
 {
-       struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+       struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
        struct clock_event_device *dev = td->evtdev;
 
        td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup)
                 * Prevent that the clock events layer tries to call
                 * the set mode function!
                 */
+               dev->state = CLOCK_EVT_STATE_DETACHED;
                dev->mode = CLOCK_EVT_MODE_UNUSED;
                clockevents_exchange_device(dev, NULL);
                dev->event_handler = clockevents_handle_noop;
                td->evtdev = NULL;
        }
 }
+#endif
 
-void tick_suspend(void)
+/**
+ * tick_suspend_local - Suspend the local tick device
+ *
+ * Called from the local CPU for freeze with interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend_local(void)
 {
        struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 
        clockevents_shutdown(td->evtdev);
 }
 
-void tick_resume(void)
+/**
+ * tick_resume_local - Resume the local tick device
+ *
+ * Called from the local CPU for unfreeze or XEN resume magic.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume_local(void)
 {
        struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
-       int broadcast = tick_resume_broadcast();
-
-       clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+       bool broadcast = tick_resume_check_broadcast();
 
+       clockevents_tick_resume(td->evtdev);
        if (!broadcast) {
                if (td->mode == TICKDEV_MODE_PERIODIC)
                        tick_setup_periodic(td->evtdev, 0);
@@ -394,6 +411,35 @@ void tick_resume(void)
        }
 }
 
+/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+       tick_suspend_local();
+       tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+       tick_resume_broadcast();
+       tick_resume_local();
+}
+
 static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
 static unsigned int tick_freeze_depth;
 
@@ -411,12 +457,10 @@ void tick_freeze(void)
        raw_spin_lock(&tick_freeze_lock);
 
        tick_freeze_depth++;
-       if (tick_freeze_depth == num_online_cpus()) {
+       if (tick_freeze_depth == num_online_cpus())
                timekeeping_suspend();
-       } else {
-               tick_suspend();
-               tick_suspend_broadcast();
-       }
+       else
+               tick_suspend_local();
 
        raw_spin_unlock(&tick_freeze_lock);
 }
@@ -437,7 +481,7 @@ void tick_unfreeze(void)
        if (tick_freeze_depth == num_online_cpus())
                timekeeping_resume();
        else
-               tick_resume();
+               tick_resume_local();
 
        tick_freeze_depth--;
 
index 366aeb4f2c6696ee6239e501ea4904f3812cd44c..b64fdd8054c56b042784fdce988ebad64f2ea803 100644 (file)
@@ -5,15 +5,12 @@
 #include <linux/tick.h>
 
 #include "timekeeping.h"
+#include "tick-sched.h"
 
-extern seqlock_t jiffies_lock;
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
 
-#define CS_NAME_LEN    32
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
-
-#define TICK_DO_TIMER_NONE     -1
-#define TICK_DO_TIMER_BOOT     -2
+# define TICK_DO_TIMER_NONE    -1
+# define TICK_DO_TIMER_BOOT    -2
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 extern ktime_t tick_next_period;
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly;
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_handover_do_timer(int *cpup);
-extern void tick_shutdown(unsigned int *cpup);
+extern void tick_shutdown(unsigned int cpu);
 extern void tick_suspend(void);
 extern void tick_resume(void);
 extern bool tick_check_replacement(struct clock_event_device *curdev,
                                   struct clock_event_device *newdev);
 extern void tick_install_replacement(struct clock_event_device *dev);
+extern int tick_is_oneshot_available(void);
+extern struct tick_device *tick_get_device(int cpu);
 
-extern void clockevents_shutdown(struct clock_event_device *dev);
+extern int clockevents_tick_resume(struct clock_event_device *dev);
+/* Check, if the device is functional or a dummy for broadcast */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+       return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
 
+extern void clockevents_shutdown(struct clock_event_device *dev);
+extern void clockevents_exchange_device(struct clock_event_device *old,
+                                       struct clock_event_device *new);
+extern void clockevents_set_state(struct clock_event_device *dev,
+                                enum clock_event_state state);
+extern int clockevents_program_event(struct clock_event_device *dev,
+                                    ktime_t expires, bool force);
+extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
 
-/*
- * NO_HZ / high resolution timer shared code
- */
+/* Broadcasting support */
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_suspend_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
+extern void tick_broadcast_init(void);
+extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
+extern struct tick_device *tick_get_broadcast_device(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
+# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
+static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
+
+/* Set the periodic handler in non broadcast mode */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+       dev->event_handler = tick_handle_periodic;
+}
+# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+
+#else /* !GENERIC_CLOCKEVENTS: */
+static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
+#endif /* !GENERIC_CLOCKEVENTS */
+
+/* Oneshot related functions */
 #ifdef CONFIG_TICK_ONESHOT
 extern void tick_setup_oneshot(struct clock_event_device *newdev,
                               void (*handler)(struct clock_event_device *),
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force);
 extern void tick_oneshot_notify(void);
 extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
 extern void tick_resume_oneshot(void);
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static inline bool tick_oneshot_possible(void) { return true; }
+extern int tick_oneshot_mode_active(void);
+extern void tick_clock_notify(void);
+extern int tick_check_oneshot_change(int allow_nohz);
+extern int tick_init_highres(void);
+#else /* !CONFIG_TICK_ONESHOT: */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+                       void (*handler)(struct clock_event_device *),
+                       ktime_t nextevt) { BUG(); }
+static inline void tick_resume_oneshot(void) { BUG(); }
+static inline int tick_program_event(ktime_t expires, int force) { return 0; }
+static inline void tick_oneshot_notify(void) { }
+static inline bool tick_oneshot_possible(void) { return false; }
+static inline int tick_oneshot_mode_active(void) { return 0; }
+static inline void tick_clock_notify(void) { }
+static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+#endif /* !CONFIG_TICK_ONESHOT */
+
+/* Functions related to oneshot broadcasting */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
 extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern int tick_broadcast_oneshot_control(unsigned long reason);
 extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
-extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
 extern int tick_broadcast_oneshot_active(void);
 extern void tick_check_oneshot_broadcast_this_cpu(void);
 bool tick_broadcast_oneshot_available(void);
-# else /* BROADCAST */
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
-       BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+#else /* !(BROADCAST && ONESHOT): */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
 static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
 static inline int tick_broadcast_oneshot_active(void) { return 0; }
 static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
-static inline bool tick_broadcast_oneshot_available(void) { return true; }
-# endif /* !BROADCAST */
-
-#else /* !ONESHOT */
-static inline
-void tick_setup_oneshot(struct clock_event_device *newdev,
-                       void (*handler)(struct clock_event_device *),
-                       ktime_t nextevt)
-{
-       BUG();
-}
-static inline void tick_resume_oneshot(void)
-{
-       BUG();
-}
-static inline int tick_program_event(ktime_t expires, int force)
-{
-       return 0;
-}
-static inline void tick_oneshot_notify(void) { }
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
-       BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
-static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
-{
-       return 0;
-}
-static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline bool tick_broadcast_oneshot_available(void) { return false; }
-#endif /* !TICK_ONESHOT */
+static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
+#endif /* !(BROADCAST && ONESHOT) */
 
 /* NO_HZ_FULL internal */
 #ifdef CONFIG_NO_HZ_FULL
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void);
 # else
 static inline void tick_nohz_init(void) { }
 #endif
-
-/*
- * Broadcasting support
- */
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
-extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
-extern void tick_shutdown_broadcast(unsigned int *cpup);
-extern void tick_suspend_broadcast(void);
-extern int tick_resume_broadcast(void);
-extern void tick_broadcast_init(void);
-extern void
-tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
-int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
-
-#else /* !BROADCAST */
-
-static inline void tick_install_broadcast_device(struct clock_event_device *dev)
-{
-}
-
-static inline int tick_is_broadcast_device(struct clock_event_device *dev)
-{
-       return 0;
-}
-static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
-                                            int cpu)
-{
-       return 0;
-}
-static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
-static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
-static inline void tick_suspend_broadcast(void) { }
-static inline int tick_resume_broadcast(void) { return 0; }
-static inline void tick_broadcast_init(void) { }
-static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
-                                            u32 freq) { return -ENODEV; }
-
-/*
- * Set the periodic handler in non broadcast mode
- */
-static inline void tick_set_periodic_handler(struct clock_event_device *dev,
-                                            int broadcast)
-{
-       dev->event_handler = tick_handle_periodic;
-}
-#endif /* !BROADCAST */
-
-/*
- * Check, if the device is functional or a dummy for broadcast
- */
-static inline int tick_device_is_functional(struct clock_event_device *dev)
-{
-       return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
-}
-
-int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-
-#endif
-
-extern void do_timer(unsigned long ticks);
-extern void update_wall_time(void);
index 7ce740e78e1b506b155c07e3ac50a9a96e6b262d..67a64b1670bfdb984c7d9edec34f7eadd04800ec 100644 (file)
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void)
 {
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-       clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+       clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
        clockevents_program_event(dev, ktime_get(), true);
 }
 
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
                        ktime_t next_event)
 {
        newdev->event_handler = handler;
-       clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+       clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
        clockevents_program_event(newdev, next_event, true);
 }
 
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 
        td->mode = TICKDEV_MODE_ONESHOT;
        dev->event_handler = handler;
-       clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+       clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
        tick_broadcast_switch_to_oneshot();
        return 0;
 }
index a4c4edac45281b5c25080efd61e19cc10e9d8636..914259128145e2394e65bd36f18aaf9a81f78843 100644 (file)
@@ -34,7 +34,7 @@
 /*
  * Per cpu nohz control structure
  */
-DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
 
 /*
  * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str)
 
 __setup("nohz=", setup_tick_nohz);
 
+int tick_nohz_tick_stopped(void)
+{
+       return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
 /**
  * tick_nohz_update_jiffies - update jiffies when idle was interrupted
  *
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644 (file)
index 0000000..28b5da3
--- /dev/null
@@ -0,0 +1,74 @@
+#ifndef _TICK_SCHED_H
+#define _TICK_SCHED_H
+
+#include <linux/hrtimer.h>
+
+enum tick_device_mode {
+       TICKDEV_MODE_PERIODIC,
+       TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+       struct clock_event_device *evtdev;
+       enum tick_device_mode mode;
+};
+
+enum tick_nohz_mode {
+       NOHZ_MODE_INACTIVE,
+       NOHZ_MODE_LOWRES,
+       NOHZ_MODE_HIGHRES,
+};
+
+/**
+ * struct tick_sched - sched tick emulation and no idle tick control/stats
+ * @sched_timer:       hrtimer to schedule the periodic tick in high
+ *                     resolution mode
+ * @last_tick:         Store the last tick expiry time when the tick
+ *                     timer is modified for nohz sleeps. This is necessary
+ *                     to resume the tick timer operation in the timeline
+ *                     when the CPU returns from nohz sleep.
+ * @tick_stopped:      Indicator that the idle tick has been stopped
+ * @idle_jiffies:      jiffies at the entry to idle for idle time accounting
+ * @idle_calls:                Total number of idle calls
+ * @idle_sleeps:       Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime:    Time when the idle call was entered
+ * @idle_waketime:     Time when the idle was interrupted
+ * @idle_exittime:     Time when the idle state was left
+ * @idle_sleeptime:    Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime:  Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length:      Duration of the current idle sleep
+ * @do_timer_last:     CPU was the last one doing do_timer before going idle
+ */
+struct tick_sched {
+       struct hrtimer                  sched_timer;
+       unsigned long                   check_clocks;
+       enum tick_nohz_mode             nohz_mode;
+       ktime_t                         last_tick;
+       int                             inidle;
+       int                             tick_stopped;
+       unsigned long                   idle_jiffies;
+       unsigned long                   idle_calls;
+       unsigned long                   idle_sleeps;
+       int                             idle_active;
+       ktime_t                         idle_entrytime;
+       ktime_t                         idle_waketime;
+       ktime_t                         idle_exittime;
+       ktime_t                         idle_sleeptime;
+       ktime_t                         iowait_sleeptime;
+       ktime_t                         sleep_length;
+       unsigned long                   last_jiffies;
+       unsigned long                   next_jiffies;
+       ktime_t                         idle_expires;
+       int                             do_timer_last;
+};
+
+extern struct tick_sched *tick_get_tick_sched(int cpu);
+
+extern void tick_setup_sched_timer(void);
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+extern void tick_cancel_sched_timer(int cpu);
+#else
+static inline void tick_cancel_sched_timer(int cpu) { }
+#endif
+
+#endif
index 91db94136c1062571ba0d0f1bfd1ed687770af3e..946acb72179facb1c173e54592b3c1c3637f8abd 100644 (file)
@@ -59,17 +59,15 @@ struct tk_fast {
 };
 
 static struct tk_fast tk_fast_mono ____cacheline_aligned;
+static struct tk_fast tk_fast_raw  ____cacheline_aligned;
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
-/* Flag for if there is a persistent clock on this platform */
-bool __read_mostly persistent_clock_exist = false;
-
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
-       while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
-               tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
+       while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+               tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec++;
        }
 }
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
        struct timespec64 ts;
 
        ts.tv_sec = tk->xtime_sec;
-       ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+       ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        return ts;
 }
 
 static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
 {
        tk->xtime_sec = ts->tv_sec;
-       tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
+       tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
 }
 
 static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
 {
        tk->xtime_sec += ts->tv_sec;
-       tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
+       tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_normalize_xtime(tk);
 }
 
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
        tk->offs_boot = ktime_add(tk->offs_boot, delta);
 }
 
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
+/*
+ * These simple flag variables are managed
+ * without locks, which is racy, but ok since
+ * we don't really care about being super
+ * precise about how many events were seen,
+ * just that a problem was observed.
+ */
+static int timekeeping_underflow_seen;
+static int timekeeping_overflow_seen;
+
+/* last_warning is only modified under the timekeeping lock */
+static long timekeeping_last_warning;
+
+static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+
+       cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+       const char *name = tk->tkr_mono.clock->name;
+
+       if (offset > max_cycles) {
+               printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
+                               offset, name, max_cycles);
+               printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
+       } else {
+               if (offset > (max_cycles >> 1)) {
+                       printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
+                                       offset, name, max_cycles >> 1);
+                       printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
+               }
+       }
+
+       if (timekeeping_underflow_seen) {
+               if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+                       printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
+                       printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
+                       printk_deferred("         Your kernel is probably still fine.\n");
+                       timekeeping_last_warning = jiffies;
+               }
+               timekeeping_underflow_seen = 0;
+       }
+
+       if (timekeeping_overflow_seen) {
+               if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+                       printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
+                       printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
+                       printk_deferred("         Your kernel is probably still fine.\n");
+                       timekeeping_last_warning = jiffies;
+               }
+               timekeeping_overflow_seen = 0;
+       }
+}
+
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+       cycle_t now, last, mask, max, delta;
+       unsigned int seq;
+
+       /*
+        * Since we're called holding a seqlock, the data may shift
+        * under us while we're doing the calculation. This can cause
+        * false positives, since we'd note a problem but throw the
+        * results away. So nest another seqcount read here to grab a
+        * consistent snapshot of the values we are checking.
+        */
+       do {
+               seq = read_seqcount_begin(&tk_core.seq);
+               now = tkr->read(tkr->clock);
+               last = tkr->cycle_last;
+               mask = tkr->mask;
+               max = tkr->clock->max_cycles;
+       } while (read_seqcount_retry(&tk_core.seq, seq));
+
+       delta = clocksource_delta(now, last, mask);
+
+       /*
+        * Try to catch underflows by checking if we are seeing small
+        * mask-relative negative values.
+        */
+       if (unlikely((~delta & mask) < (mask >> 3))) {
+               timekeeping_underflow_seen = 1;
+               delta = 0;
+       }
+
+       /* Cap delta value to the max_cycles values to avoid mult overflows */
+       if (unlikely(delta > max)) {
+               timekeeping_overflow_seen = 1;
+               delta = tkr->clock->max_cycles;
+       }
+
+       return delta;
+}
+#else
+static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+}
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+       cycle_t cycle_now, delta;
+
+       /* read clocksource */
+       cycle_now = tkr->read(tkr->clock);
+
+       /* calculate the delta since the last update_wall_time */
+       delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+
+       return delta;
+}
+#endif
+
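
As a worked example of the mask-relative underflow check above (illustrative only, not part of the patch): with a hypothetical 32-bit clocksource, a reader that races with an update can see 'now' just below 'cycle_last', and the wrapped delta is caught like this:

	static void underflow_check_example(void)
	{
		u64 mask = 0xffffffffULL;		/* assumed 32-bit counter mask */
		u64 last = 0x20, now = 0x10;		/* racing reader saw now < last */
		u64 delta = (now - last) & mask;	/* wraps to 0xfffffff0 */

		/* ~delta & mask == 0xf, far below mask >> 3 (0x1fffffff), so the
		 * value is treated as an underflow and the update is ignored. */
		if ((~delta & mask) < (mask >> 3))
			pr_info("underflow detected, delta ignored\n");
	}
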
 /**
  * tk_setup_internals - Set up internals to use clocksource clock.
  *
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;
 
-       old_clock = tk->tkr.clock;
-       tk->tkr.clock = clock;
-       tk->tkr.read = clock->read;
-       tk->tkr.mask = clock->mask;
-       tk->tkr.cycle_last = tk->tkr.read(clock);
+       old_clock = tk->tkr_mono.clock;
+       tk->tkr_mono.clock = clock;
+       tk->tkr_mono.read = clock->read;
+       tk->tkr_mono.mask = clock->mask;
+       tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+
+       tk->tkr_raw.clock = clock;
+       tk->tkr_raw.read = clock->read;
+       tk->tkr_raw.mask = clock->mask;
+       tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
 
        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
        if (old_clock) {
                int shift_change = clock->shift - old_clock->shift;
                if (shift_change < 0)
-                       tk->tkr.xtime_nsec >>= -shift_change;
+                       tk->tkr_mono.xtime_nsec >>= -shift_change;
                else
-                       tk->tkr.xtime_nsec <<= shift_change;
+                       tk->tkr_mono.xtime_nsec <<= shift_change;
        }
-       tk->tkr.shift = clock->shift;
+       tk->tkr_raw.xtime_nsec = 0;
+
+       tk->tkr_mono.shift = clock->shift;
+       tk->tkr_raw.shift = clock->shift;
 
        tk->ntp_error = 0;
        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
         * active clocksource. These value will be adjusted via NTP
         * to counteract clock drifting.
         */
-       tk->tkr.mult = clock->mult;
+       tk->tkr_mono.mult = clock->mult;
+       tk->tkr_raw.mult = clock->mult;
        tk->ntp_err_mult = 0;
 }
 
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
 
 static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
 {
-       cycle_t cycle_now, delta;
+       cycle_t delta;
        s64 nsec;
 
-       /* read clocksource: */
-       cycle_now = tkr->read(tkr->clock);
-
-       /* calculate the delta since the last update_wall_time: */
-       delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+       delta = timekeeping_get_delta(tkr);
 
        nsec = delta * tkr->mult + tkr->xtime_nsec;
        nsec >>= tkr->shift;
@@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
        return nsec + arch_gettimeoffset();
 }
 
-static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
-{
-       struct clocksource *clock = tk->tkr.clock;
-       cycle_t cycle_now, delta;
-       s64 nsec;
-
-       /* read clocksource: */
-       cycle_now = tk->tkr.read(clock);
-
-       /* calculate the delta since the last update_wall_time: */
-       delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
-
-       /* convert delta to nanoseconds. */
-       nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
-
-       /* If arch requires, add in get_arch_timeoffset() */
-       return nsec + arch_gettimeoffset();
-}
-
 /**
  * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
  * @tkr: Timekeeping readout base from which we take the update
@@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
  * slightly wrong timestamp (a few nanoseconds). See
  * @ktime_get_mono_fast_ns.
  */
-static void update_fast_timekeeper(struct tk_read_base *tkr)
+static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
 {
-       struct tk_read_base *base = tk_fast_mono.base;
+       struct tk_read_base *base = tkf->base;
 
        /* Force readers off to base[1] */
-       raw_write_seqcount_latch(&tk_fast_mono.seq);
+       raw_write_seqcount_latch(&tkf->seq);
 
        /* Update base[0] */
        memcpy(base, tkr, sizeof(*base));
 
        /* Force readers back to base[0] */
-       raw_write_seqcount_latch(&tk_fast_mono.seq);
+       raw_write_seqcount_latch(&tkf->seq);
 
        /* Update base[1] */
        memcpy(base + 1, base, sizeof(*base));
@@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr)
  * of the following timestamps. Callers need to be aware of that and
  * deal with it.
  */
-u64 notrace ktime_get_mono_fast_ns(void)
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 {
        struct tk_read_base *tkr;
        unsigned int seq;
        u64 now;
 
        do {
-               seq = raw_read_seqcount(&tk_fast_mono.seq);
-               tkr = tk_fast_mono.base + (seq & 0x01);
-               now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+               seq = raw_read_seqcount(&tkf->seq);
+               tkr = tkf->base + (seq & 0x01);
+               now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+       } while (read_seqcount_retry(&tkf->seq, seq));
 
-       } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
        return now;
 }
+
+u64 ktime_get_mono_fast_ns(void)
+{
+       return __ktime_get_fast_ns(&tk_fast_mono);
+}
 EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
 
+u64 ktime_get_raw_fast_ns(void)
+{
+       return __ktime_get_fast_ns(&tk_fast_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+
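
A minimal usage sketch (assumed, not taken from this patch): because both helpers above read the latched tk_read_base copies instead of taking the timekeeper sequence lock, a tracing or NMI path can timestamp events safely, at the cost of an occasional slightly stale value across an update:

	static void trace_sample_event(const char *what)
	{
		u64 mono = ktime_get_mono_fast_ns();	/* NMI-safe monotonic ns */
		u64 raw  = ktime_get_raw_fast_ns();	/* NMI-safe raw ns, added above */

		trace_printk("%s: mono=%llu raw=%llu\n", what, mono, raw);
	}
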
 /* Suspend-time cycles value for halted fast timekeeper. */
 static cycle_t cycles_at_suspend;
 
@@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
 static void halt_fast_timekeeper(struct timekeeper *tk)
 {
        static struct tk_read_base tkr_dummy;
-       struct tk_read_base *tkr = &tk->tkr;
+       struct tk_read_base *tkr = &tk->tkr_mono;
 
        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        cycles_at_suspend = tkr->read(tkr->clock);
        tkr_dummy.read = dummy_clock_read;
-       update_fast_timekeeper(&tkr_dummy);
+       update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
+
+       tkr = &tk->tkr_raw;
+       memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+       tkr_dummy.read = dummy_clock_read;
+       update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
 }
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
@@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
 
        xt = timespec64_to_timespec(tk_xtime(tk));
        wm = timespec64_to_timespec(tk->wall_to_monotonic);
-       update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
-                           tk->tkr.cycle_last);
+       update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+                           tk->tkr_mono.cycle_last);
 }
 
 static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
        * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
        * users are removed, this can be killed.
        */
-       remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
-       tk->tkr.xtime_nsec -= remainder;
-       tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
+       remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+       tk->tkr_mono.xtime_nsec -= remainder;
+       tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
        tk->ntp_error += remainder << tk->ntp_error_shift;
-       tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
+       tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
 }
 #else
 #define old_vsyscall_fixup(tk)
@@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
         */
        seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
        nsec = (u32) tk->wall_to_monotonic.tv_nsec;
-       tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+       tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
 
        /* Update the monotonic raw base */
-       tk->base_raw = timespec64_to_ktime(tk->raw_time);
+       tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
 
        /*
         * The sum of the nanoseconds portions of xtime and
         * wall_to_monotonic can be greater/equal one second. Take
         * this into account before updating tk->ktime_sec.
         */
-       nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+       nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        if (nsec >= NSEC_PER_SEC)
                seconds++;
        tk->ktime_sec = seconds;
@@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
                memcpy(&shadow_timekeeper, &tk_core.timekeeper,
                       sizeof(tk_core.timekeeper));
 
-       update_fast_timekeeper(&tk->tkr);
+       update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+       update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
 }
 
 /**
@@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
  */
 static void timekeeping_forward_now(struct timekeeper *tk)
 {
-       struct clocksource *clock = tk->tkr.clock;
+       struct clocksource *clock = tk->tkr_mono.clock;
        cycle_t cycle_now, delta;
        s64 nsec;
 
-       cycle_now = tk->tkr.read(clock);
-       delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
-       tk->tkr.cycle_last = cycle_now;
+       cycle_now = tk->tkr_mono.read(clock);
+       delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+       tk->tkr_mono.cycle_last = cycle_now;
+       tk->tkr_raw.cycle_last  = cycle_now;
 
-       tk->tkr.xtime_nsec += delta * tk->tkr.mult;
+       tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
 
        /* If arch requires, add in get_arch_timeoffset() */
-       tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
+       tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
 
        tk_normalize_xtime(tk);
 
-       nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
+       nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
        timespec64_add_ns(&tk->raw_time, nsec);
 }
 
@@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts)
                seq = read_seqcount_begin(&tk_core.seq);
 
                ts->tv_sec = tk->xtime_sec;
-               nsecs = timekeeping_get_ns(&tk->tkr);
+               nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -577,8 +690,8 @@ ktime_t ktime_get(void)
 
        do {
                seq = read_seqcount_begin(&tk_core.seq);
-               base = tk->tkr.base_mono;
-               nsecs = timekeeping_get_ns(&tk->tkr);
+               base = tk->tkr_mono.base;
+               nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
 
        do {
                seq = read_seqcount_begin(&tk_core.seq);
-               base = ktime_add(tk->tkr.base_mono, *offset);
-               nsecs = timekeeping_get_ns(&tk->tkr);
+               base = ktime_add(tk->tkr_mono.base, *offset);
+               nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void)
 
        do {
                seq = read_seqcount_begin(&tk_core.seq);
-               base = tk->base_raw;
-               nsecs = timekeeping_get_ns_raw(tk);
+               base = tk->tkr_raw.base;
+               nsecs = timekeeping_get_ns(&tk->tkr_raw);
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts)
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->xtime_sec;
-               nsec = timekeeping_get_ns(&tk->tkr);
+               nsec = timekeeping_get_ns(&tk->tkr_mono);
                tomono = tk->wall_to_monotonic;
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
                ts_real->tv_sec = tk->xtime_sec;
                ts_real->tv_nsec = 0;
 
-               nsecs_raw = timekeeping_get_ns_raw(tk);
-               nsecs_real = timekeeping_get_ns(&tk->tkr);
+               nsecs_raw  = timekeeping_get_ns(&tk->tkr_raw);
+               nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -943,7 +1056,7 @@ static int change_clocksource(void *data)
         */
        if (try_module_get(new->owner)) {
                if (!new->enable || new->enable(new) == 0) {
-                       old = tk->tkr.clock;
+                       old = tk->tkr_mono.clock;
                        tk_setup_internals(tk, new);
                        if (old->disable)
                                old->disable(old);
@@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock)
 {
        struct timekeeper *tk = &tk_core.timekeeper;
 
-       if (tk->tkr.clock == clock)
+       if (tk->tkr_mono.clock == clock)
                return 0;
        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();
-       return tk->tkr.clock == clock ? 0 : -1;
+       return tk->tkr_mono.clock == clock ? 0 : -1;
 }
 
 /**
@@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts)
 
        do {
                seq = read_seqcount_begin(&tk_core.seq);
-               nsecs = timekeeping_get_ns_raw(tk);
+               nsecs = timekeeping_get_ns(&tk->tkr_raw);
                ts64 = tk->raw_time;
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void)
        do {
                seq = read_seqcount_begin(&tk_core.seq);
 
-               ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+               ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void)
        do {
                seq = read_seqcount_begin(&tk_core.seq);
 
-               ret = tk->tkr.clock->max_idle_ns;
+               ret = tk->tkr_mono.clock->max_idle_ns;
 
        } while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts)
        ts->tv_nsec = 0;
 }
 
+void __weak read_persistent_clock64(struct timespec64 *ts64)
+{
+       struct timespec ts;
+
+       read_persistent_clock(&ts);
+       *ts64 = timespec_to_timespec64(ts);
+}
+
 /**
  * read_boot_clock -  Return time of the system start.
  *
@@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts)
        ts->tv_nsec = 0;
 }
 
+void __weak read_boot_clock64(struct timespec64 *ts64)
+{
+       struct timespec ts;
+
+       read_boot_clock(&ts);
+       *ts64 = timespec_to_timespec64(ts);
+}
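
Since read_persistent_clock64() and read_boot_clock64() are declared __weak, an architecture with a 64-bit capable persistent clock can provide them directly instead of going through the 32-bit timespec wrapper. A hypothetical override (my_rtc_read_seconds() is an assumed platform helper, not a real API) might look like:

	void read_persistent_clock64(struct timespec64 *ts)
	{
		ts->tv_sec  = my_rtc_read_seconds();	/* assumed platform helper */
		ts->tv_nsec = 0;
	}
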
+
+/* Flag for if timekeeping_resume() has injected sleeptime */
+static bool sleeptime_injected;
+
+/* Flag for if there is a persistent clock on this platform */
+static bool persistent_clock_exists;
+
 /*
  * timekeeping_init - Initializes the clocksource and common timekeeping values
  */
@@ -1081,20 +1216,17 @@ void __init timekeeping_init(void)
        struct clocksource *clock;
        unsigned long flags;
        struct timespec64 now, boot, tmp;
-       struct timespec ts;
 
-       read_persistent_clock(&ts);
-       now = timespec_to_timespec64(ts);
+       read_persistent_clock64(&now);
        if (!timespec64_valid_strict(&now)) {
                pr_warn("WARNING: Persistent clock returned invalid value!\n"
                        "         Check your CMOS/BIOS settings.\n");
                now.tv_sec = 0;
                now.tv_nsec = 0;
        } else if (now.tv_sec || now.tv_nsec)
-               persistent_clock_exist = true;
+               persistent_clock_exists = true;
 
-       read_boot_clock(&ts);
-       boot = timespec_to_timespec64(ts);
+       read_boot_clock64(&boot);
        if (!timespec64_valid_strict(&boot)) {
                pr_warn("WARNING: Boot clock returned invalid value!\n"
                        "         Check your CMOS/BIOS settings.\n");
@@ -1114,7 +1246,6 @@ void __init timekeeping_init(void)
        tk_set_xtime(tk, &now);
        tk->raw_time.tv_sec = 0;
        tk->raw_time.tv_nsec = 0;
-       tk->base_raw.tv64 = 0;
        if (boot.tv_sec == 0 && boot.tv_nsec == 0)
                boot = tk_xtime(tk);
 
@@ -1127,7 +1258,7 @@ void __init timekeeping_init(void)
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 }
 
-/* time in seconds when suspend began */
+/* time in seconds when suspend began for persistent clock */
 static struct timespec64 timekeeping_suspend_time;
 
 /**
@@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
        tk_debug_account_sleep_time(delta);
 }
 
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
+/**
+ * We have three kinds of time sources to use for sleep time
+ * injection; the preference order is:
+ * 1) non-stop clocksource
+ * 2) persistent clock (ie: RTC accessible when irqs are off)
+ * 3) RTC
+ *
+ * 1) and 2) are used by timekeeping, 3) by the RTC subsystem.
+ * If the system has neither 1) nor 2), 3) is used as a last resort.
+ *
+ * If timekeeping has already injected sleeptime via either 1) or 2),
+ * 3) is no longer needed, so rtc_resume() does not have to inject it
+ * again; that is what timekeeping_rtc_skipresume() reports.
+ */
+bool timekeeping_rtc_skipresume(void)
+{
+       return sleeptime_injected;
+}
+
+/**
+ * Whether 1) can be used is only known in timekeeping_resume(),
+ * which runs after rtc_suspend(), so we cannot be sure in advance
+ * that rtc_suspend() may be skipped just because the system has 1).
+ *
+ * But if the system has 2), 2) will definitely be used, so in that
+ * case rtc_suspend() is not needed; that is what
+ * timekeeping_rtc_skipsuspend() reports.
+ */
+bool timekeeping_rtc_skipsuspend(void)
+{
+       return persistent_clock_exists;
+}
+
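
Sketch of the consumer side (simplified and assumed, not taken from this patch): the RTC resume path would only fall back to its own sleep-time estimate when timekeeping has not already injected one via a nonstop clocksource or persistent clock:

	static void rtc_resume_sketch(struct timespec64 *rtc_delta)
	{
		if (timekeeping_rtc_skipresume())
			return;				/* 1) or 2) already injected */

		timekeeping_inject_sleeptime64(rtc_delta);	/* fall back to 3) RTC */
	}
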
 /**
  * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
  * @delta: pointer to a timespec64 delta value
  *
- * This hook is for architectures that cannot support read_persistent_clock
+ * This hook is for architectures that cannot support read_persistent_clock64
  * because their RTC/persistent clock is only accessible when irqs are enabled.
+ * Such systems also lack an effective nonstop clocksource.
  *
  * This function should only be called by rtc_resume(), and allows
  * a suspend offset to be injected into the timekeeping values.
@@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
 
-       /*
-        * Make sure we don't set the clock twice, as timekeeping_resume()
-        * already did it
-        */
-       if (has_persistent_clock())
-               return;
-
        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
 
@@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
        /* signal hrtimers about time change */
        clock_was_set();
 }
+#endif
 
 /**
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
  */
 void timekeeping_resume(void)
 {
        struct timekeeper *tk = &tk_core.timekeeper;
-       struct clocksource *clock = tk->tkr.clock;
+       struct clocksource *clock = tk->tkr_mono.clock;
        unsigned long flags;
        struct timespec64 ts_new, ts_delta;
-       struct timespec tmp;
        cycle_t cycle_now, cycle_delta;
-       bool suspendtime_found = false;
 
-       read_persistent_clock(&tmp);
-       ts_new = timespec_to_timespec64(tmp);
+       sleeptime_injected = false;
+       read_persistent_clock64(&ts_new);
 
        clockevents_resume();
        clocksource_resume();
@@ -1228,16 +1384,16 @@ void timekeeping_resume(void)
         * The less preferred source will only be tried if there is no better
         * usable source. The rtc part is handled separately in rtc core code.
         */
-       cycle_now = tk->tkr.read(clock);
+       cycle_now = tk->tkr_mono.read(clock);
        if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
-               cycle_now > tk->tkr.cycle_last) {
+               cycle_now > tk->tkr_mono.cycle_last) {
                u64 num, max = ULLONG_MAX;
                u32 mult = clock->mult;
                u32 shift = clock->shift;
                s64 nsec = 0;
 
-               cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
-                                               tk->tkr.mask);
+               cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+                                               tk->tkr_mono.mask);
 
                /*
                 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1253,17 +1409,19 @@ void timekeeping_resume(void)
                nsec += ((u64) cycle_delta * mult) >> shift;
 
                ts_delta = ns_to_timespec64(nsec);
-               suspendtime_found = true;
+               sleeptime_injected = true;
        } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
                ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
-               suspendtime_found = true;
+               sleeptime_injected = true;
        }
 
-       if (suspendtime_found)
+       if (sleeptime_injected)
                __timekeeping_inject_sleeptime(tk, &ts_delta);
 
        /* Re-base the last cycle value */
-       tk->tkr.cycle_last = cycle_now;
+       tk->tkr_mono.cycle_last = cycle_now;
+       tk->tkr_raw.cycle_last  = cycle_now;
+
        tk->ntp_error = 0;
        timekeeping_suspended = 0;
        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1272,9 +1430,7 @@ void timekeeping_resume(void)
 
        touch_softlockup_watchdog();
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
-       /* Resume hrtimers */
+       tick_resume();
        hrtimers_resume();
 }
 
@@ -1284,10 +1440,8 @@ int timekeeping_suspend(void)
        unsigned long flags;
        struct timespec64               delta, delta_delta;
        static struct timespec64        old_delta;
-       struct timespec tmp;
 
-       read_persistent_clock(&tmp);
-       timekeeping_suspend_time = timespec_to_timespec64(tmp);
+       read_persistent_clock64(&timekeeping_suspend_time);
 
        /*
         * On some systems the persistent_clock can not be detected at
@@ -1295,31 +1449,33 @@ int timekeeping_suspend(void)
         * value returned, update the persistent_clock_exists flag.
         */
        if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
-               persistent_clock_exist = true;
+               persistent_clock_exists = true;
 
        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        timekeeping_forward_now(tk);
        timekeeping_suspended = 1;
 
-       /*
-        * To avoid drift caused by repeated suspend/resumes,
-        * which each can add ~1 second drift error,
-        * try to compensate so the difference in system time
-        * and persistent_clock time stays close to constant.
-        */
-       delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
-       delta_delta = timespec64_sub(delta, old_delta);
-       if (abs(delta_delta.tv_sec)  >= 2) {
+       if (persistent_clock_exists) {
                /*
-                * if delta_delta is too large, assume time correction
-                * has occured and set old_delta to the current delta.
+                * To avoid drift caused by repeated suspend/resumes,
+                * which each can add ~1 second drift error,
+                * try to compensate so the difference in system time
+                * and persistent_clock time stays close to constant.
                 */
-               old_delta = delta;
-       } else {
-               /* Otherwise try to adjust old_system to compensate */
-               timekeeping_suspend_time =
-                       timespec64_add(timekeeping_suspend_time, delta_delta);
+               delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+               delta_delta = timespec64_sub(delta, old_delta);
+               if (abs(delta_delta.tv_sec) >= 2) {
+                       /*
+                        * if delta_delta is too large, assume time correction
+                        * has occurred and set old_delta to the current delta.
+                        */
+                       old_delta = delta;
+               } else {
+                       /* Otherwise try to adjust old_system to compensate */
+                       timekeeping_suspend_time =
+                               timespec64_add(timekeeping_suspend_time, delta_delta);
+               }
        }
 
        timekeeping_update(tk, TK_MIRROR);
@@ -1327,7 +1483,7 @@ int timekeeping_suspend(void)
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
-       clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+       tick_suspend();
        clocksource_suspend();
        clockevents_suspend();
 
@@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
         *
         * XXX - TODO: Doc ntp_error calculation.
         */
-       if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+       if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
                /* NTP adjustment caused clocksource mult overflow */
                WARN_ON_ONCE(1);
                return;
        }
 
-       tk->tkr.mult += mult_adj;
+       tk->tkr_mono.mult += mult_adj;
        tk->xtime_interval += interval;
-       tk->tkr.xtime_nsec -= offset;
+       tk->tkr_mono.xtime_nsec -= offset;
        tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
 }
 
@@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
                tk->ntp_err_mult = 0;
        }
 
-       if (unlikely(tk->tkr.clock->maxadj &&
-               (abs(tk->tkr.mult - tk->tkr.clock->mult)
-                       > tk->tkr.clock->maxadj))) {
+       if (unlikely(tk->tkr_mono.clock->maxadj &&
+               (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+                       > tk->tkr_mono.clock->maxadj))) {
                printk_once(KERN_WARNING
                        "Adjusting %s more than 11%% (%ld vs %ld)\n",
-                       tk->tkr.clock->name, (long)tk->tkr.mult,
-                       (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+                       tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+                       (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
        }
 
        /*
@@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
         * We'll correct this error next time through this function, when
         * xtime_nsec is not as small.
         */
-       if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
-               s64 neg = -(s64)tk->tkr.xtime_nsec;
-               tk->tkr.xtime_nsec = 0;
+       if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+               s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+               tk->tkr_mono.xtime_nsec = 0;
                tk->ntp_error += neg << tk->ntp_error_shift;
        }
 }
@@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
  */
 static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
 {
-       u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
+       u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
        unsigned int clock_set = 0;
 
-       while (tk->tkr.xtime_nsec >= nsecps) {
+       while (tk->tkr_mono.xtime_nsec >= nsecps) {
                int leap;
 
-               tk->tkr.xtime_nsec -= nsecps;
+               tk->tkr_mono.xtime_nsec -= nsecps;
                tk->xtime_sec++;
 
                /* Figure out if its a leap sec and apply if needed */
@@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
 
        /* Accumulate one shifted interval */
        offset -= interval;
-       tk->tkr.cycle_last += interval;
+       tk->tkr_mono.cycle_last += interval;
+       tk->tkr_raw.cycle_last  += interval;
 
-       tk->tkr.xtime_nsec += tk->xtime_interval << shift;
+       tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
        *clock_set |= accumulate_nsecs_to_secs(tk);
 
        /* Accumulate raw time */
@@ -1622,14 +1779,17 @@ void update_wall_time(void)
 #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
        offset = real_tk->cycle_interval;
 #else
-       offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
-                                  tk->tkr.cycle_last, tk->tkr.mask);
+       offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+                                  tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
 #endif
 
        /* Check if there's really nothing to do */
        if (offset < real_tk->cycle_interval)
                goto out;
 
+       /* Do some additional sanity checking */
+       timekeeping_check_update(real_tk, offset);
+
        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
         * (think "ticks") worth of time at once. To do this efficiently,
@@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
        do {
                seq = read_seqcount_begin(&tk_core.seq);
 
-               base = tk->tkr.base_mono;
-               nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
+               base = tk->tkr_mono.base;
+               nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
 
                *offs_real = tk->offs_real;
                *offs_boot = tk->offs_boot;
@@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
        do {
                seq = read_seqcount_begin(&tk_core.seq);
 
-               base = tk->tkr.base_mono;
-               nsecs = timekeeping_get_ns(&tk->tkr);
+               base = tk->tkr_mono.base;
+               nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
                *offs_real = tk->offs_real;
                *offs_boot = tk->offs_boot;
index 1d91416055d5e9f05e5b6d482bc8ab490acdeeb2..ead8794b9a4e470242d37684fd04079ffbd70dec 100644 (file)
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts);
 extern int timekeeping_suspend(void);
 extern void timekeeping_resume(void);
 
+extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
+
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN    32
+
 #endif
index 2d3f5c5049394615912b09ad16a4e639cb6cba9b..2ece3aa5069cade64b8c4982e920a45bea5ba232 100644 (file)
@@ -90,8 +90,18 @@ struct tvec_base {
        struct tvec tv5;
 } ____cacheline_aligned;
 
+/*
+ * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
+ * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
+ * pointer to per-cpu entries because we don't know where we'll map the section,
+ * even for the boot cpu.
+ *
+ * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
+ * rest of them.
+ */
 struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
+
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
 /* Functions below help us manage 'deferrable' flag */
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
 #ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
+
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 }
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
-static int init_timers_cpu(int cpu)
-{
-       int j;
-       struct tvec_base *base;
-       static char tvec_base_done[NR_CPUS];
-
-       if (!tvec_base_done[cpu]) {
-               static char boot_done;
-
-               if (boot_done) {
-                       /*
-                        * The APs use this path later in boot
-                        */
-                       base = kzalloc_node(sizeof(*base), GFP_KERNEL,
-                                           cpu_to_node(cpu));
-                       if (!base)
-                               return -ENOMEM;
-
-                       /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
-                       if (WARN_ON(base != tbase_get_base(base))) {
-                               kfree(base);
-                               return -ENOMEM;
-                       }
-                       per_cpu(tvec_bases, cpu) = base;
-               } else {
-                       /*
-                        * This is for the boot CPU - we use compile-time
-                        * static initialisation because per-cpu memory isn't
-                        * ready yet and because the memory allocators are not
-                        * initialised either.
-                        */
-                       boot_done = 1;
-                       base = &boot_tvec_bases;
-               }
-               spin_lock_init(&base->lock);
-               tvec_base_done[cpu] = 1;
-               base->cpu = cpu;
-       } else {
-               base = per_cpu(tvec_bases, cpu);
-       }
-
-
-       for (j = 0; j < TVN_SIZE; j++) {
-               INIT_LIST_HEAD(base->tv5.vec + j);
-               INIT_LIST_HEAD(base->tv4.vec + j);
-               INIT_LIST_HEAD(base->tv3.vec + j);
-               INIT_LIST_HEAD(base->tv2.vec + j);
-       }
-       for (j = 0; j < TVR_SIZE; j++)
-               INIT_LIST_HEAD(base->tv1.vec + j);
-
-       base->timer_jiffies = jiffies;
-       base->next_timer = base->timer_jiffies;
-       base->active_timers = 0;
-       base->all_timers = 0;
-       return 0;
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
 {
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu)
                migrate_timer_list(new_base, old_base->tv5.vec + i);
        }
 
+       old_base->active_timers = 0;
+       old_base->all_timers = 0;
+
        spin_unlock(&old_base->lock);
        spin_unlock_irq(&new_base->lock);
        put_cpu_var(tvec_bases);
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 static int timer_cpu_notify(struct notifier_block *self,
                                unsigned long action, void *hcpu)
 {
-       long cpu = (long)hcpu;
-       int err;
-
-       switch(action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               err = init_timers_cpu(cpu);
-               if (err < 0)
-                       return notifier_from_errno(err);
-               break;
-#ifdef CONFIG_HOTPLUG_CPU
+       switch (action) {
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-               migrate_timers(cpu);
+               migrate_timers((long)hcpu);
                break;
-#endif
        default:
                break;
        }
+
        return NOTIFY_OK;
 }
 
-static struct notifier_block timers_nb = {
-       .notifier_call  = timer_cpu_notify,
-};
+static inline void timer_register_cpu_notifier(void)
+{
+       cpu_notifier(timer_cpu_notify, 0);
+}
+#else
+static inline void timer_register_cpu_notifier(void) { }
+#endif /* CONFIG_HOTPLUG_CPU */
 
+static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+{
+       int j;
 
-void __init init_timers(void)
+       BUG_ON(base != tbase_get_base(base));
+
+       base->cpu = cpu;
+       per_cpu(tvec_bases, cpu) = base;
+       spin_lock_init(&base->lock);
+
+       for (j = 0; j < TVN_SIZE; j++) {
+               INIT_LIST_HEAD(base->tv5.vec + j);
+               INIT_LIST_HEAD(base->tv4.vec + j);
+               INIT_LIST_HEAD(base->tv3.vec + j);
+               INIT_LIST_HEAD(base->tv2.vec + j);
+       }
+       for (j = 0; j < TVR_SIZE; j++)
+               INIT_LIST_HEAD(base->tv1.vec + j);
+
+       base->timer_jiffies = jiffies;
+       base->next_timer = base->timer_jiffies;
+}
+
+static void __init init_timer_cpus(void)
 {
-       int err;
+       struct tvec_base *base;
+       int local_cpu = smp_processor_id();
+       int cpu;
 
+       for_each_possible_cpu(cpu) {
+               if (cpu == local_cpu)
+                       base = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+               else
+                       base = per_cpu_ptr(&__tvec_bases, cpu);
+#endif
+
+               init_timer_cpu(base, cpu);
+       }
+}
+
+void __init init_timers(void)
+{
        /* ensure there are enough low bits for flags in timer->base pointer */
        BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
 
-       err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
-                              (void *)(long)smp_processor_id());
-       BUG_ON(err != NOTIFY_OK);
-
+       init_timer_cpus();
        init_timer_stats();
-       register_cpu_notifier(&timers_nb);
+       timer_register_cpu_notifier();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
index 61ed862cdd376222dedfa317301f3d6c3dbd3404..e878c2e0ba45e06c4690646a8853406e11dd1a15 100644 (file)
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/kallsyms.h>
-#include <linux/tick.h>
 
 #include <asm/uaccess.h>
 
+#include "tick-internal.h"
 
 struct timer_list_iter {
        int cpu;
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
        print_name_offset(m, dev->set_next_event);
        SEQ_printf(m, "\n");
 
-       SEQ_printf(m, " set_mode:       ");
-       print_name_offset(m, dev->set_mode);
-       SEQ_printf(m, "\n");
+       if (dev->set_mode) {
+               SEQ_printf(m, " set_mode:       ");
+               print_name_offset(m, dev->set_mode);
+               SEQ_printf(m, "\n");
+       } else {
+               if (dev->set_state_shutdown) {
+                       SEQ_printf(m, " shutdown: ");
+                       print_name_offset(m, dev->set_state_shutdown);
+                       SEQ_printf(m, "\n");
+               }
+
+               if (dev->set_state_periodic) {
+                       SEQ_printf(m, " periodic: ");
+                       print_name_offset(m, dev->set_state_periodic);
+                       SEQ_printf(m, "\n");
+               }
+
+               if (dev->set_state_oneshot) {
+                       SEQ_printf(m, " oneshot:  ");
+                       print_name_offset(m, dev->set_state_oneshot);
+                       SEQ_printf(m, "\n");
+               }
+
+               if (dev->tick_resume) {
+                       SEQ_printf(m, " resume:   ");
+                       print_name_offset(m, dev->tick_resume);
+                       SEQ_printf(m, "\n");
+               }
+       }
 
        SEQ_printf(m, " event_handler:  ");
        print_name_offset(m, dev->event_handler);
index c5cefb3c009ce9cd51199dc5fef683d7bc9b1bdc..36b6fa88ce5b412f92b15da530772c5a058e5d12 100644 (file)
@@ -865,6 +865,19 @@ config SCHED_STACK_END_CHECK
          data corruption or a sporadic crash at a later stage once the region
          is examined. The runtime overhead introduced is minimal.
 
+config DEBUG_TIMEKEEPING
+       bool "Enable extra timekeeping sanity checking"
+       help
+         This option will enable additional timekeeping sanity checks
+         which may be helpful when diagnosing issues where timekeeping
+         problems are suspected.
+
+         This may include checks in the timekeeping hotpaths, so this
+         option may have a (very small) performance impact on some
+         workloads.
+
+         If unsure, say N.
+
 config TIMER_STATS
        bool "Collect kernel timers statistics"
        depends on DEBUG_KERNEL && PROC_FS
index e97dbd51e7569f6a7ba273227752f2cdbcaebb49..03d7fcb420b5d60c564ad10935011ed8a6556b69 100644 (file)
--- a/lib/lcm.c
+++ b/lib/lcm.c
@@ -12,3 +12,14 @@ unsigned long lcm(unsigned long a, unsigned long b)
                return 0;
 }
 EXPORT_SYMBOL_GPL(lcm);
+
+unsigned long lcm_not_zero(unsigned long a, unsigned long b)
+{
+       unsigned long l = lcm(a, b);
+
+       if (l)
+               return l;
+
+       return (b ? : a);
+}
+EXPORT_SYMBOL_GPL(lcm_not_zero);
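
A short usage note (assumed example): lcm_not_zero() is convenient when merging two limits where either may be unset (zero); plain lcm() returns 0 in that case, whereas lcm_not_zero() falls back to the non-zero argument:

	static unsigned long merge_granularity(unsigned long a, unsigned long b)
	{
		/* e.g. merge_granularity(512, 0) == 512, merge_granularity(4, 6) == 12 */
		return lcm_not_zero(a, b);
	}
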
index ecb9a665ec19b5c8b6e062568dffb0c4c0b12d90..494994bf17c8ec9764cbb784cbd88840fa0919c9 100644 (file)
@@ -18,7 +18,7 @@
 #define CMPXCHG_LOOP(CODE, SUCCESS) do {                                       \
        struct lockref old;                                                     \
        BUILD_BUG_ON(sizeof(old) != 8);                                         \
-       old.lock_count = ACCESS_ONCE(lockref->lock_count);                      \
+       old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {     \
                struct lockref new = old, prev = old;                           \
                CODE                                                            \
index 76a1b59523ab05907403f4f9bd5dc547fca1bcab..f5907d23272d48562c69b911e5a0a619e3f4c180 100644 (file)
@@ -279,6 +279,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count)
        int minlen = min_t(int, count, nla_len(src));
 
        memcpy(dest, nla_data(src), minlen);
+       if (count > minlen)
+               memset(dest + minlen, 0, count - minlen);
 
        return minlen;
 }
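
Assumed caller sketch (struct my_cfg is hypothetical): with the added memset, copying a possibly-short attribute into a fixed-size structure now zeroes the tail instead of leaving stale bytes behind:

	static void read_cfg(struct my_cfg *cfg, const struct nlattr *attr)
	{
		/* If nla_len(attr) < sizeof(*cfg), the remainder of *cfg is zeroed. */
		nla_memcpy(cfg, attr, sizeof(*cfg));
	}
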
index 57dadc025c6444dd1038691d160803745db124f8..2dc44b1cb1dfc2f6d644d2d655155e6cfce2bdeb 100644 (file)
@@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                old_len = new_len;
                old_addr = new_addr;
                new_addr = -ENOMEM;
-       } else if (vma->vm_file && vma->vm_file->f_op->mremap)
-               vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+       } else if (vma->vm_file && vma->vm_file->f_op->mremap) {
+               err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+               if (err < 0) {
+                       move_page_tables(new_vma, new_addr, vma, old_addr,
+                                        moved_len, true);
+                       return err;
+               }
+       }
 
        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
index 6b3f54ed65ba6fc4ff392877f662ef5dddeb8939..a9f4ae45b7fb856295a30a87a6c3701933d2f3f2 100644 (file)
@@ -484,7 +484,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
                               IPPROTO_TCP, &sock);
        if (ret)
                return ret;
-       sock->sk->sk_allocation = GFP_NOFS | __GFP_MEMALLOC;
+       sock->sk->sk_allocation = GFP_NOFS;
 
 #ifdef CONFIG_LOCKDEP
        lockdep_set_class(&sock->sk->sk_lock, &socket_class);
@@ -520,8 +520,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
                               ret);
        }
 
-       sk_set_memalloc(sock->sk);
-
        con->sock = sock;
        return 0;
 }
@@ -2808,11 +2806,8 @@ static void con_work(struct work_struct *work)
 {
        struct ceph_connection *con = container_of(work, struct ceph_connection,
                                                   work.work);
-       unsigned long pflags = current->flags;
        bool fault;
 
-       current->flags |= PF_MEMALLOC;
-
        mutex_lock(&con->mutex);
        while (true) {
                int ret;
@@ -2866,8 +2861,6 @@ static void con_work(struct work_struct *work)
                con_fault_finish(con);
 
        con->ops->put(con);
-
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 /*
index 962ee9d719641291853715f366717bf1626e115c..45109b70664e8abb1b7957aabd619c4e0d711db4 100644 (file)
@@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb)
 #define skb_update_prio(skb)
 #endif
 
-static DEFINE_PER_CPU(int, xmit_recursion);
+DEFINE_PER_CPU(int, xmit_recursion);
+EXPORT_SYMBOL(xmit_recursion);
+
 #define RECURSION_LIMIT 10
 
 /**
index 44706e81b2e03df3e9d39c1cd76879a4ede48d1e..e4fdc9dfb2c73fb371280464484ebcae87218be6 100644 (file)
@@ -175,9 +175,9 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
 
        spin_lock(&net->rules_mod_lock);
        list_del_rcu(&ops->list);
-       fib_rules_cleanup_ops(ops);
        spin_unlock(&net->rules_mod_lock);
 
+       fib_rules_cleanup_ops(ops);
        call_rcu(&ops->rcu, fib_rules_put_rcu);
 }
 EXPORT_SYMBOL_GPL(fib_rules_unregister);
index cb5290b8c428c5c348b25d842ab9cf3797b70eba..70d3450588b2c73cbb4a3027f7f92e0c285b90d7 100644 (file)
@@ -198,8 +198,10 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc)
  */
 int peernet2id(struct net *net, struct net *peer)
 {
-       int id = __peernet2id(net, peer, true);
+       bool alloc = atomic_read(&peer->count) == 0 ? false : true;
+       int id;
 
+       id = __peernet2id(net, peer, alloc);
        return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
 }
 EXPORT_SYMBOL(peernet2id);
index ee0608bb3bc087212d12bac729677bf454053560..7ebed55b5f7d1b2d1faacea1a49e7f9e947f43d1 100644 (file)
@@ -1932,10 +1932,10 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
                struct ifinfomsg *ifm,
                struct nlattr **tb)
 {
-       struct net_device *dev;
+       struct net_device *dev, *aux;
        int err;
 
-       for_each_netdev(net, dev) {
+       for_each_netdev_safe(net, dev, aux) {
                if (dev->group == group) {
                        err = do_setlink(skb, dev, ifm, tb, NULL, 0);
                        if (err < 0)
index 78e89eb7eb705624d3ff63324f5002ae10b51145..71e3e5f1eaa04816b8bdd1d34b4fd575f19793b9 100644 (file)
@@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
                sock_reset_flag(sk, bit);
 }
 
+bool sk_mc_loop(struct sock *sk)
+{
+       if (dev_recursion_level())
+               return false;
+       if (!sk)
+               return true;
+       switch (sk->sk_family) {
+       case AF_INET:
+               return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+       case AF_INET6:
+               return inet6_sk(sk)->mc_loop;
+#endif
+       }
+       WARN_ON(1);
+       return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
 /*
  *     This is meant for all protocols to use and covers goings on
  *     at the socket level. Everything here is generic.
index faf7cc3483fe0822c26be6b915061ee8fdd8be9a..9d66a0f72f906733878de68e7f2e6bd80932c1b9 100644 (file)
@@ -248,7 +248,9 @@ void __init dn_fib_rules_init(void)
 
 void __exit dn_fib_rules_cleanup(void)
 {
+       rtnl_lock();
        fib_rules_unregister(dn_fib_rules_ops);
+       rtnl_unlock();
        rcu_barrier();
 }
 
index 2173402d87e0f56f255d0b378f06fc51aa2d3fed..4dea2e0681d16409016ae09616127b22bbab3a1e 100644 (file)
@@ -501,12 +501,10 @@ static struct net_device *dev_to_net_device(struct device *dev)
 #ifdef CONFIG_OF
 static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
                                        struct dsa_chip_data *cd,
-                                       int chip_index,
+                                       int chip_index, int port_index,
                                        struct device_node *link)
 {
-       int ret;
        const __be32 *reg;
-       int link_port_addr;
        int link_sw_addr;
        struct device_node *parent_sw;
        int len;
@@ -519,6 +517,10 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
        if (!reg || (len != sizeof(*reg) * 2))
                return -EINVAL;
 
+       /*
+        * Get the destination switch number from the second field of its 'reg'
+        * property, i.e. for "reg = <0x19 1>" sw_addr is '1'.
+        */
        link_sw_addr = be32_to_cpup(reg + 1);
 
        if (link_sw_addr >= pd->nr_chips)
@@ -535,20 +537,9 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
                memset(cd->rtable, -1, pd->nr_chips * sizeof(s8));
        }
 
-       reg = of_get_property(link, "reg", NULL);
-       if (!reg) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       link_port_addr = be32_to_cpup(reg);
-
-       cd->rtable[link_sw_addr] = link_port_addr;
+       cd->rtable[link_sw_addr] = port_index;
 
        return 0;
-out:
-       kfree(cd->rtable);
-       return ret;
 }
 
 static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
@@ -658,7 +649,7 @@ static int dsa_of_probe(struct platform_device *pdev)
                        if (!strcmp(port_name, "dsa") && link &&
                                        pd->nr_chips > 1) {
                                ret = dsa_of_setup_routing_table(pd, cd,
-                                               chip_index, link);
+                                               chip_index, port_index, link);
                                if (ret)
                                        goto out_free_chip;
                        }
index 57be71dd6a9e0163dceefd564bf71036c12dc9ba..23b9b3e86f4cd78987790f3456470318e3b82ece 100644 (file)
@@ -1111,11 +1111,10 @@ static void ip_fib_net_exit(struct net *net)
 {
        unsigned int i;
 
+       rtnl_lock();
 #ifdef CONFIG_IP_MULTIPLE_TABLES
        fib4_rules_exit(net);
 #endif
-
-       rtnl_lock();
        for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
                struct fib_table *tb;
                struct hlist_head *head;
index 9d78427652d23e33a46ab7ce2d4b6dbac1660781..fe54eba6d00d3462c287b095370e3a938e73b178 100644 (file)
@@ -268,7 +268,7 @@ static int __net_init ipmr_rules_init(struct net *net)
        return 0;
 
 err2:
-       kfree(mrt);
+       ipmr_free_table(mrt);
 err1:
        fib_rules_unregister(ops);
        return err;
@@ -278,11 +278,13 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 {
        struct mr_table *mrt, *next;
 
+       rtnl_lock();
        list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
                list_del(&mrt->list);
                ipmr_free_table(mrt);
        }
        fib_rules_unregister(net->ipv4.mr_rules_ops);
+       rtnl_unlock();
 }
 #else
 #define ipmr_for_each_table(mrt, net) \
@@ -308,7 +310,10 @@ static int __net_init ipmr_rules_init(struct net *net)
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
+       rtnl_lock();
        ipmr_free_table(net->ipv4.mrt);
+       net->ipv4.mrt = NULL;
+       rtnl_unlock();
 }
 #endif
 
index fb4cf8b8e121acd4bffcf2fdfbd7e03c76bad7cc..f501ac0483665aeb1100e2f2776d3f38610e9b30 100644 (file)
@@ -3105,10 +3105,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                        if (!first_ackt.v64)
                                first_ackt = last_ackt;
 
-                       if (!(sacked & TCPCB_SACKED_ACKED))
+                       if (!(sacked & TCPCB_SACKED_ACKED)) {
                                reord = min(pkts_acked, reord);
-                       if (!after(scb->end_seq, tp->high_seq))
-                               flag |= FLAG_ORIG_SACK_ACKED;
+                               if (!after(scb->end_seq, tp->high_seq))
+                                       flag |= FLAG_ORIG_SACK_ACKED;
+                       }
                }
 
                if (sacked & TCPCB_SACKED_ACKED)
index 5a2dfed4783b6ed0185dccded960972b4d6e13b0..f1756ee022078d12e74be5d245fc1a6a0c1c4a34 100644 (file)
@@ -1518,7 +1518,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
                skb->sk = sk;
                skb->destructor = sock_edemux;
                if (sk->sk_state != TCP_TIME_WAIT) {
-                       struct dst_entry *dst = sk->sk_rx_dst;
+                       struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
 
                        if (dst)
                                dst = dst_check(dst, 0);
index 27ca79682efbf681a0ab6073f50f8fa73214028e..70bc6abc0639cf88d0b23b295c20d941b456d98b 100644 (file)
@@ -322,7 +322,9 @@ out_fib6_rules_ops:
 
 static void __net_exit fib6_rules_net_exit(struct net *net)
 {
+       rtnl_lock();
        fib_rules_unregister(net->ipv6.fib6_rules_ops);
+       rtnl_unlock();
 }
 
 static struct pernet_operations fib6_rules_net_ops = {
index 7e80b61b51ff474db6c188218b70f12709209256..36cf0ab685a00d57fbdc6787ba77116dae638f21 100644 (file)
@@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-       struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+       struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+                               inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
index 34b682617f504359cecff4447c6015f90623e949..312e0ff47339be3a3ffa8981c9e1b1cb7f5a551b 100644 (file)
@@ -252,7 +252,7 @@ static int __net_init ip6mr_rules_init(struct net *net)
        return 0;
 
 err2:
-       kfree(mrt);
+       ip6mr_free_table(mrt);
 err1:
        fib_rules_unregister(ops);
        return err;
@@ -267,8 +267,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
                list_del(&mrt->list);
                ip6mr_free_table(mrt);
        }
-       rtnl_unlock();
        fib_rules_unregister(net->ipv6.mr6_rules_ops);
+       rtnl_unlock();
 }
 #else
 #define ip6mr_for_each_table(mrt, net) \
@@ -336,7 +336,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 
 static void ip6mr_free_table(struct mr6_table *mrt)
 {
-       del_timer(&mrt->ipmr_expire_timer);
+       del_timer_sync(&mrt->ipmr_expire_timer);
        mroute_clean_tables(mrt);
        kfree(mrt);
 }
index 471ed24aabaec4b1d8736438696ca3490d4e0f58..14ecdaf06bf7497dc71199fc5638b49592a24655 100644 (file)
@@ -1218,7 +1218,14 @@ static void ndisc_router_discovery(struct sk_buff *skb)
        if (rt)
                rt6_set_expires(rt, jiffies + (HZ * lifetime));
        if (ra_msg->icmph.icmp6_hop_limit) {
-               in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+               /* Only set hop_limit on the interface if it is higher than
+                * the current hop_limit.
+                */
+               if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+                       in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+               } else {
+                       ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+               }
                if (rt)
                        dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
                                       ra_msg->icmph.icmp6_hop_limit);
index 5d46832c6f72b89a278a3326918a3c8bff9afed4..1f5e62229aaa8b4d5822f82c3af3a6c8b1382ba6 100644 (file)
@@ -1411,6 +1411,15 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
        TCP_SKB_CB(skb)->sacked = 0;
 }
 
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+       /* We need to move header back to the beginning if xfrm6_policy_check()
+        * and tcp_v6_fill_cb() are going to be called again.
+        */
+       memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+               sizeof(struct inet6_skb_parm));
+}
+
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
        const struct tcphdr *th;
@@ -1543,6 +1552,7 @@ do_time_wait:
                        inet_twsk_deschedule(tw, &tcp_death_row);
                        inet_twsk_put(tw);
                        sk = sk2;
+                       tcp_v6_restore_cb(skb);
                        goto process;
                }
                /* Fall through to ACK */
@@ -1551,6 +1561,7 @@ do_time_wait:
                tcp_v6_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
+               tcp_v6_restore_cb(skb);
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:
                ;
@@ -1585,7 +1596,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
                skb->sk = sk;
                skb->destructor = sock_edemux;
                if (sk->sk_state != TCP_TIME_WAIT) {
-                       struct dst_entry *dst = sk->sk_rx_dst;
+                       struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
 
                        if (dst)
                                dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
index 2e9953b2db8402dd71c88691f263db00f2cba3e2..53d931172088b15b2890c42dd649308771b6e7d3 100644 (file)
@@ -1114,10 +1114,8 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
                        noblock, &err);
        else
                skb = sock_alloc_send_skb(sk, len, noblock, &err);
-       if (!skb) {
-               err = -ENOMEM;
+       if (!skb)
                goto out;
-       }
        if (iucv->transport == AF_IUCV_TRANS_HIPER)
                skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN);
        if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
index 895348e44c7d22c9e6d4828195e7099a74154531..a29a504492af6f2c38607f2c15e123a297d565cd 100644 (file)
@@ -1871,6 +1871,7 @@ static int __init l2tp_init(void)
        l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0);
        if (!l2tp_wq) {
                pr_err("alloc_workqueue failed\n");
+               unregister_pernet_device(&l2tp_net_ops);
                rc = -ENOMEM;
                goto out;
        }
index a48bad468880aa60c80b45a1cc97bef616845ffa..7702978a4c999dfd10d9b49792b8b25899e23966 100644 (file)
@@ -49,8 +49,6 @@ static void ieee80211_free_tid_rx(struct rcu_head *h)
                container_of(h, struct tid_ampdu_rx, rcu_head);
        int i;
 
-       del_timer_sync(&tid_rx->reorder_timer);
-
        for (i = 0; i < tid_rx->buf_size; i++)
                __skb_queue_purge(&tid_rx->reorder_buf[i]);
        kfree(tid_rx->reorder_buf);
@@ -93,6 +91,12 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 
        del_timer_sync(&tid_rx->session_timer);
 
+       /* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */
+       spin_lock_bh(&tid_rx->reorder_lock);
+       tid_rx->removed = true;
+       spin_unlock_bh(&tid_rx->reorder_lock);
+       del_timer_sync(&tid_rx->reorder_timer);
+
        call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx);
 }
 
index 944bdc04e913d2f599b6c5845ee2549abba20a1c..1eb730bf875272831d44ac62c6e5a18e0a1de977 100644 (file)
@@ -873,9 +873,10 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
 
  set_release_timer:
 
-               mod_timer(&tid_agg_rx->reorder_timer,
-                         tid_agg_rx->reorder_time[j] + 1 +
-                         HT_RX_REORDER_BUF_TIMEOUT);
+               if (!tid_agg_rx->removed)
+                       mod_timer(&tid_agg_rx->reorder_timer,
+                                 tid_agg_rx->reorder_time[j] + 1 +
+                                 HT_RX_REORDER_BUF_TIMEOUT);
        } else {
                del_timer(&tid_agg_rx->reorder_timer);
        }
index 925e68fe64c755766c2ecc30d047455f62c11362..fb0fc1302a588480cae6649e2e671ffa719de36b 100644 (file)
@@ -175,6 +175,7 @@ struct tid_ampdu_tx {
  * @reorder_lock: serializes access to reorder buffer, see below.
  * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_num
  *     and ssn.
+ * @removed: this session is removed (but might have been found due to RCU)
  *
  * This structure's lifetime is managed by RCU, assignments to
  * the array holding it must hold the aggregation mutex.
@@ -199,6 +200,7 @@ struct tid_ampdu_rx {
        u16 timeout;
        u8 dialog_token;
        bool auto_seq;
+       bool removed;
 };
 
 /**
index ec2954ffc690c612eb1b04b018134ba0f52ba5c8..067a3fff1d2cb0c629c1dc2d75d0353b9269ba71 100644 (file)
@@ -274,10 +274,8 @@ void ovs_vport_del(struct vport *vport)
        ASSERT_OVSL();
 
        hlist_del_rcu(&vport->hash_node);
-
-       vport->ops->destroy(vport);
-
        module_put(vport->ops->owner);
+       vport->ops->destroy(vport);
 }
 
 /**
index 612aa73bbc60c990a320054e4bd7d2846575428e..e6ce1517367f884608640b2532080ab6566b9379 100644 (file)
@@ -303,9 +303,7 @@ static int rpc_client_register(struct rpc_clnt *clnt,
        struct super_block *pipefs_sb;
        int err;
 
-       err = rpc_clnt_debugfs_register(clnt);
-       if (err)
-               return err;
+       rpc_clnt_debugfs_register(clnt);
 
        pipefs_sb = rpc_get_sb_net(net);
        if (pipefs_sb) {
index e811f390f9f67ceb2e897ee8da79189417eacc75..82962f7e6e888f619ad79754f038732d5d5b6333 100644 (file)
@@ -129,48 +129,52 @@ static const struct file_operations tasks_fops = {
        .release        = tasks_release,
 };
 
-int
+void
 rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 {
-       int len, err;
+       int len;
        char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
+       struct rpc_xprt *xprt;
 
        /* Already registered? */
-       if (clnt->cl_debugfs)
-               return 0;
+       if (clnt->cl_debugfs || !rpc_clnt_dir)
+               return;
 
        len = snprintf(name, sizeof(name), "%x", clnt->cl_clid);
        if (len >= sizeof(name))
-               return -EINVAL;
+               return;
 
        /* make the per-client dir */
        clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir);
        if (!clnt->cl_debugfs)
-               return -ENOMEM;
+               return;
 
        /* make tasks file */
-       err = -ENOMEM;
        if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs,
                                 clnt, &tasks_fops))
                goto out_err;
 
-       err = -EINVAL;
        rcu_read_lock();
+       xprt = rcu_dereference(clnt->cl_xprt);
+       /* no "debugfs" dentry? Don't bother with the symlink. */
+       if (!xprt->debugfs) {
+               rcu_read_unlock();
+               return;
+       }
        len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
-                       rcu_dereference(clnt->cl_xprt)->debugfs->d_name.name);
+                       xprt->debugfs->d_name.name);
        rcu_read_unlock();
+
        if (len >= sizeof(name))
                goto out_err;
 
-       err = -ENOMEM;
        if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name))
                goto out_err;
 
-       return 0;
+       return;
 out_err:
        debugfs_remove_recursive(clnt->cl_debugfs);
        clnt->cl_debugfs = NULL;
-       return err;
 }
 
 void
@@ -226,33 +230,33 @@ static const struct file_operations xprt_info_fops = {
        .release        = xprt_info_release,
 };
 
-int
+void
 rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 {
        int len, id;
        static atomic_t cur_id;
        char            name[9]; /* 8 hex digits + NULL term */
 
+       if (!rpc_xprt_dir)
+               return;
+
        id = (unsigned int)atomic_inc_return(&cur_id);
 
        len = snprintf(name, sizeof(name), "%x", id);
        if (len >= sizeof(name))
-               return -EINVAL;
+               return;
 
        /* make the per-client dir */
        xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir);
        if (!xprt->debugfs)
-               return -ENOMEM;
+               return;
 
        /* make tasks file */
        if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs,
                                 xprt, &xprt_info_fops)) {
                debugfs_remove_recursive(xprt->debugfs);
                xprt->debugfs = NULL;
-               return -ENOMEM;
        }
-
-       return 0;
 }
 
 void
@@ -266,14 +270,17 @@ void __exit
 sunrpc_debugfs_exit(void)
 {
        debugfs_remove_recursive(topdir);
+       topdir = NULL;
+       rpc_clnt_dir = NULL;
+       rpc_xprt_dir = NULL;
 }
 
-int __init
+void __init
 sunrpc_debugfs_init(void)
 {
        topdir = debugfs_create_dir("sunrpc", NULL);
        if (!topdir)
-               goto out;
+               return;
 
        rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
        if (!rpc_clnt_dir)
@@ -283,10 +290,9 @@ sunrpc_debugfs_init(void)
        if (!rpc_xprt_dir)
                goto out_remove;
 
-       return 0;
+       return;
 out_remove:
        debugfs_remove_recursive(topdir);
        topdir = NULL;
-out:
-       return -ENOMEM;
+       rpc_clnt_dir = NULL;
 }
index e37fbed879568da535aa540656e7b7ace508e2cb..ee5d3d253102bf5d81a39f953248a6a6ca7a38d6 100644 (file)
@@ -98,10 +98,7 @@ init_sunrpc(void)
        if (err)
                goto out4;
 
-       err = sunrpc_debugfs_init();
-       if (err)
-               goto out5;
-
+       sunrpc_debugfs_init();
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
        rpc_register_sysctl();
 #endif
@@ -109,8 +106,6 @@ init_sunrpc(void)
        init_socket_xprt();     /* clnt sock transport */
        return 0;
 
-out5:
-       unregister_rpc_pipefs();
 out4:
        unregister_pernet_subsys(&sunrpc_net_ops);
 out3:
index e3015aede0d9443d99eba6b820aed104ab7515a6..9949722d99cebf6afa15953d8a9ac6a5c0bc2824 100644 (file)
@@ -1331,7 +1331,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
  */
 struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
-       int err;
        struct rpc_xprt *xprt;
        struct xprt_class *t;
 
@@ -1372,11 +1371,7 @@ found:
                return ERR_PTR(-ENOMEM);
        }
 
-       err = rpc_xprt_debugfs_register(xprt);
-       if (err) {
-               xprt_destroy(xprt);
-               return ERR_PTR(err);
-       }
+       rpc_xprt_debugfs_register(xprt);
 
        dprintk("RPC:       created transport %p with %u slots\n", xprt,
                        xprt->max_reqs);
index 935205e6bcfe6da614fcc6c67e77754dc3a484fa..be1c9fa60b09dc713155c94e7bf6bcc6366fc7aa 100644 (file)
@@ -152,11 +152,11 @@ out_netlink:
 static void __exit tipc_exit(void)
 {
        tipc_bearer_cleanup();
+       unregister_pernet_subsys(&tipc_net_ops);
        tipc_netlink_stop();
        tipc_netlink_compat_stop();
        tipc_socket_stop();
        tipc_unregister_sysctl();
-       unregister_pernet_subsys(&tipc_net_ops);
 
        pr_info("Deactivated\n");
 }
index a422aaa3bb0cb6a7ea2f1e0de3b86380a52bf012..9ee25a63f684269c8fc600b1293b9b4421a7f020 100644 (file)
@@ -96,10 +96,10 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit)
        struct fw_device *device = fw_parent_device(unit);
        int err, rcode;
        u64 date;
-       __be32 cues[3] = {
-               MAUDIO_BOOTLOADER_CUE1,
-               MAUDIO_BOOTLOADER_CUE2,
-               MAUDIO_BOOTLOADER_CUE3
+       __le32 cues[3] = {
+               cpu_to_le32(MAUDIO_BOOTLOADER_CUE1),
+               cpu_to_le32(MAUDIO_BOOTLOADER_CUE2),
+               cpu_to_le32(MAUDIO_BOOTLOADER_CUE3)
        };
 
        /* check date of software used to build */
index 74382137b9f5abcd67b9c4c44581c9a80f26d17d..f9d12c0a7e5a342f81f6e47c720a2187fd940367 100644 (file)
@@ -2912,6 +2912,8 @@ static void alc283_init(struct hda_codec *codec)
 
        if (!hp_pin)
                return;
+
+       msleep(30);
        hp_pin_sense = snd_hda_jack_detect(codec, hp_pin);
 
        /* Index 0x43 Direct Drive HP AMP LPM Control 1 */
@@ -3607,6 +3609,7 @@ static void alc_headset_mode_unplugged(struct hda_codec *codec)
 
        switch (codec->vendor_id) {
        case 0x10ec0255:
+       case 0x10ec0256:
                alc_process_coef_fw(codec, coef0255);
                break;
        case 0x10ec0233:
@@ -3662,6 +3665,7 @@ static void alc_headset_mode_mic_in(struct hda_codec *codec, hda_nid_t hp_pin,
 
        switch (codec->vendor_id) {
        case 0x10ec0255:
+       case 0x10ec0256:
                alc_write_coef_idx(codec, 0x45, 0xc489);
                snd_hda_set_pin_ctl_cache(codec, hp_pin, 0);
                alc_process_coef_fw(codec, coef0255);
@@ -3731,6 +3735,7 @@ static void alc_headset_mode_default(struct hda_codec *codec)
 
        switch (codec->vendor_id) {
        case 0x10ec0255:
+       case 0x10ec0256:
                alc_process_coef_fw(codec, coef0255);
                break;
        case 0x10ec0233:
@@ -3785,6 +3790,7 @@ static void alc_headset_mode_ctia(struct hda_codec *codec)
 
        switch (codec->vendor_id) {
        case 0x10ec0255:
+       case 0x10ec0256:
                alc_process_coef_fw(codec, coef0255);
                break;
        case 0x10ec0233:
@@ -3839,6 +3845,7 @@ static void alc_headset_mode_omtp(struct hda_codec *codec)
 
        switch (codec->vendor_id) {
        case 0x10ec0255:
+       case 0x10ec0256:
                alc_process_coef_fw(codec, coef0255);
                break;
        case 0x10ec0233:
@@ -3884,6 +3891,7 @@ static void alc_determine_headset_type(struct hda_codec *codec)
 
        switch (codec->vendor_id) {
        case 0x10ec0255:
+       case 0x10ec0256:
                alc_process_coef_fw(codec, coef0255);
                msleep(300);
                val = alc_read_coef_idx(codec, 0x46);
@@ -4364,6 +4372,7 @@ enum {
        ALC269_FIXUP_QUANTA_MUTE,
        ALC269_FIXUP_LIFEBOOK,
        ALC269_FIXUP_LIFEBOOK_EXTMIC,
+       ALC269_FIXUP_LIFEBOOK_HP_PIN,
        ALC269_FIXUP_AMIC,
        ALC269_FIXUP_DMIC,
        ALC269VB_FIXUP_AMIC,
@@ -4517,6 +4526,13 @@ static const struct hda_fixup alc269_fixups[] = {
                        { }
                },
        },
+       [ALC269_FIXUP_LIFEBOOK_HP_PIN] = {
+               .type = HDA_FIXUP_PINS,
+               .v.pins = (const struct hda_pintbl[]) {
+                       { 0x21, 0x0221102f }, /* HP out */
+                       { }
+               },
+       },
        [ALC269_FIXUP_AMIC] = {
                .type = HDA_FIXUP_PINS,
                .v.pins = (const struct hda_pintbl[]) {
@@ -5010,6 +5026,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x104d, 0x9084, "Sony VAIO", ALC275_FIXUP_SONY_HWEQ),
        SND_PCI_QUIRK(0x104d, 0x9099, "Sony VAIO S13", ALC275_FIXUP_SONY_DISABLE_AAMIX),
        SND_PCI_QUIRK(0x10cf, 0x1475, "Lifebook", ALC269_FIXUP_LIFEBOOK),
+       SND_PCI_QUIRK(0x10cf, 0x15dc, "Lifebook T731", ALC269_FIXUP_LIFEBOOK_HP_PIN),
        SND_PCI_QUIRK(0x10cf, 0x1845, "Lifebook U904", ALC269_FIXUP_LIFEBOOK_EXTMIC),
        SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC),
        SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_BXBT2807_MIC),
@@ -5217,6 +5234,16 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
                {0x17, 0x40000000},
                {0x1d, 0x40700001},
                {0x21, 0x02211050}),
+       SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+               {0x12, 0x90a60140},
+               {0x13, 0x40000000},
+               {0x14, 0x90170110},
+               {0x19, 0x411111f0},
+               {0x1a, 0x411111f0},
+               {0x1b, 0x411111f0},
+               {0x1d, 0x40700001},
+               {0x1e, 0x411111f0},
+               {0x21, 0x02211020}),
        SND_HDA_PIN_QUIRK(0x10ec0280, 0x103c, "HP", ALC280_FIXUP_HP_GPIO4,
                {0x12, 0x90a60130},
                {0x13, 0x40000000},
index 9974f201a08f44ee25b109c7ff78e13440146c4b..474cae82a8742704af9875ef7734710ac05d71cf 100644 (file)
@@ -1156,25 +1156,6 @@ static int pcm512x_hw_params(struct snd_pcm_substream *substream,
                                ret, pcm512x->pll_out);
                        return ret;
                }
-
-               gpio = PCM512x_G1OE << (4 - 1);
-               ret = regmap_update_bits(pcm512x->regmap, PCM512x_GPIO_EN,
-                                        gpio, gpio);
-               if (ret != 0) {
-                       dev_err(codec->dev, "Failed to enable gpio %d: %d\n",
-                               4, ret);
-                       return ret;
-               }
-
-               gpio = PCM512x_GPIO_OUTPUT_1 + 4 - 1;
-               ret = regmap_update_bits(pcm512x->regmap, gpio,
-                                        PCM512x_GxSL, PCM512x_GxSL_PLLLK);
-               if (ret != 0) {
-                       dev_err(codec->dev,
-                               "Failed to output pll lock on %d: %d\n",
-                               ret, 4);
-                       return ret;
-               }
        }
 
        ret = regmap_update_bits(pcm512x->regmap, PCM512x_SYNCHRONIZE,
index dc9df007d3e33358a51b73bb6446d81f6d427ab1..337c317ead6fbc2fa7e3ae80bcfb7cb64119cf55 100644 (file)
@@ -192,6 +192,7 @@ static const struct rc_config {
        { USB_ID(0x041e, 0x3040), 2, 2, 6, 6,  2,  0x6e91 }, /* Live! 24-bit */
        { USB_ID(0x041e, 0x3042), 0, 1, 1, 1,  1,  0x000d }, /* Usb X-Fi S51 */
        { USB_ID(0x041e, 0x30df), 0, 1, 1, 1,  1,  0x000d }, /* Usb X-Fi S51 Pro */
+       { USB_ID(0x041e, 0x3237), 0, 1, 1, 1,  1,  0x000d }, /* Usb X-Fi S51 Pro */
        { USB_ID(0x041e, 0x3048), 2, 2, 6, 6,  2,  0x6e91 }, /* Toshiba SB0500 */
 };
 
index 753a47de8459b7a0b505e72d2f660793d9ede885..9a28365126f9ab170454e46249348a06df8a2f32 100644 (file)
@@ -1113,8 +1113,13 @@ void snd_usb_set_format_quirk(struct snd_usb_substream *subs,
 
 bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
 {
-       /* MS Lifecam HD-5000 doesn't support reading the sample rate. */
-       return chip->usb_id == USB_ID(0x045E, 0x076D);
+       /* devices which do not support reading the sample rate. */
+       switch (chip->usb_id) {
+       case USB_ID(0x045E, 0x076D): /* MS Lifecam HD-5000 */
+       case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */
+               return true;
+       }
+       return false;
 }
 
 /* Marantz/Denon USB DACs need a vendor cmd to switch
index d66ab799b35fd5cab83e5486368e40c39c2927ee..8c0c1a2770c8fe01bb479c97847540befe164011 100644 (file)
@@ -1,12 +1,12 @@
 
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
        "x86-64-unrolled",
        "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
        "x86-64-movsq",
        "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c_e,
+MEMCPY_FN(memcpy_erms,
        "x86-64-movsb",
        "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
index fcd9cf00600a970677dd331cfc7f71de0d67d00e..e4c2c30143b95133913a2bc3569b2871b79aa75c 100644 (file)
@@ -1,8 +1,6 @@
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
 #include "../../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
index db1d3a29d97fec67c47a4dd84eca2f7779ec1865..d3dfb7936dcdfd688d20ae680b3b0c0009cdeb55 100644 (file)
@@ -36,7 +36,7 @@ static const struct option options[] = {
                    "Specify length of memory to copy. "
                    "Available units: B, KB, MB, GB and TB (upper and lower)"),
        OPT_STRING('r', "routine", &routine, "default",
-                   "Specify routine to copy"),
+                   "Specify routine to copy, \"all\" runs all available routines"),
        OPT_INTEGER('i', "iterations", &iterations,
                    "repeat memcpy() invocation this number of times"),
        OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
        const char *const *usage;
 };
 
-static int bench_mem_common(int argc, const char **argv,
-                    const char *prefix __maybe_unused,
-                    struct bench_mem_info *info)
+static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
 {
-       int i;
-       size_t len;
-       double totallen;
+       const struct routine *r = &info->routines[r_idx];
        double result_bps[2];
        u64 result_cycle[2];
 
-       argc = parse_options(argc, argv, options,
-                            info->usage, 0);
-
-       if (no_prefault && only_prefault) {
-               fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
-               return 1;
-       }
-
-       if (use_cycle)
-               init_cycle();
-
-       len = (size_t)perf_atoll((char *)length_str);
-       totallen = (double)len * iterations;
-
        result_cycle[0] = result_cycle[1] = 0ULL;
        result_bps[0] = result_bps[1] = 0.0;
 
-       if ((s64)len <= 0) {
-               fprintf(stderr, "Invalid length:%s\n", length_str);
-               return 1;
-       }
-
-       /* same to without specifying either of prefault and no-prefault */
-       if (only_prefault && no_prefault)
-               only_prefault = no_prefault = false;
-
-       for (i = 0; info->routines[i].name; i++) {
-               if (!strcmp(info->routines[i].name, routine))
-                       break;
-       }
-       if (!info->routines[i].name) {
-               printf("Unknown routine:%s\n", routine);
-               printf("Available routines...\n");
-               for (i = 0; info->routines[i].name; i++) {
-                       printf("\t%s ... %s\n",
-                              info->routines[i].name, info->routines[i].desc);
-               }
-               return 1;
-       }
+       printf("Routine %s (%s)\n", r->name, r->desc);
 
        if (bench_format == BENCH_FORMAT_DEFAULT)
                printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
        if (!only_prefault && !no_prefault) {
                /* show both of results */
                if (use_cycle) {
-                       result_cycle[0] =
-                               info->do_cycle(&info->routines[i], len, false);
-                       result_cycle[1] =
-                               info->do_cycle(&info->routines[i], len, true);
+                       result_cycle[0] = info->do_cycle(r, len, false);
+                       result_cycle[1] = info->do_cycle(r, len, true);
                } else {
-                       result_bps[0] =
-                               info->do_gettimeofday(&info->routines[i],
-                                               len, false);
-                       result_bps[1] =
-                               info->do_gettimeofday(&info->routines[i],
-                                               len, true);
+                       result_bps[0]   = info->do_gettimeofday(r, len, false);
+                       result_bps[1]   = info->do_gettimeofday(r, len, true);
                }
        } else {
-               if (use_cycle) {
-                       result_cycle[pf] =
-                               info->do_cycle(&info->routines[i],
-                                               len, only_prefault);
-               } else {
-                       result_bps[pf] =
-                               info->do_gettimeofday(&info->routines[i],
-                                               len, only_prefault);
-               }
+               if (use_cycle)
+                       result_cycle[pf] = info->do_cycle(r, len, only_prefault);
+               else
+                       result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
        }
 
        switch (bench_format) {
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
                die("unknown format: %d\n", bench_format);
                break;
        }
+}
+
+static int bench_mem_common(int argc, const char **argv,
+                    const char *prefix __maybe_unused,
+                    struct bench_mem_info *info)
+{
+       int i;
+       size_t len;
+       double totallen;
+
+       argc = parse_options(argc, argv, options,
+                            info->usage, 0);
+
+       if (no_prefault && only_prefault) {
+               fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
+               return 1;
+       }
+
+       if (use_cycle)
+               init_cycle();
+
+       len = (size_t)perf_atoll((char *)length_str);
+       totallen = (double)len * iterations;
+
+       if ((s64)len <= 0) {
+               fprintf(stderr, "Invalid length:%s\n", length_str);
+               return 1;
+       }
+
+       /* same as not specifying either prefault or no-prefault */
+       if (only_prefault && no_prefault)
+               only_prefault = no_prefault = false;
+
+       if (!strncmp(routine, "all", 3)) {
+               for (i = 0; info->routines[i].name; i++)
+                       __bench_mem_routine(info, i, len, totallen);
+               return 0;
+       }
+
+       for (i = 0; info->routines[i].name; i++) {
+               if (!strcmp(info->routines[i].name, routine))
+                       break;
+       }
+       if (!info->routines[i].name) {
+               printf("Unknown routine:%s\n", routine);
+               printf("Available routines...\n");
+               for (i = 0; info->routines[i].name; i++) {
+                       printf("\t%s ... %s\n",
+                              info->routines[i].name, info->routines[i].desc);
+               }
+               return 1;
+       }
+
+       __bench_mem_routine(info, i, len, totallen);
 
        return 0;
 }
index a71dff97c1f54e87aea5034398534bacc77d6a56..f02d028771d970da5d611c31d1b406179b450d45 100644 (file)
@@ -1,12 +1,12 @@
 
-MEMSET_FN(__memset,
+MEMSET_FN(memset_orig,
        "x86-64-unrolled",
        "unrolled memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c,
+MEMSET_FN(__memset,
        "x86-64-stosq",
        "movsq-based memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c_e,
+MEMSET_FN(memset_erms,
        "x86-64-stosb",
        "movsb-based memset() in arch/x86/lib/memset_64.S")
index 9e5af89ed13af64bed89e46e7a8df1e1b6a52298..de278784c866a3804040408454f38c4a4f7ca44e 100644 (file)
@@ -1,8 +1,6 @@
 #define memset MEMSET /* don't hide glibc's memset() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemset_c globl memset_c; memset_c
-#define Lmemset_c_e globl memset_c_e; memset_c_e
 #include "../../../arch/x86/lib/memset_64.S"
 
 /*
index 6789d788d4947d31f890ca56660d8ef968901b82..3a3a0f16456ae3369cd73faff1d3af9e4e8d4180 100644 (file)
@@ -4,5 +4,6 @@
 /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
 
 #define altinstruction_entry #
+#define ALTERNATIVE_2 #
 
 #endif
index 0db571340edbd94a7042a63f517f410b42c127d7..95abddcd78397fc3b64884f6bec31a5f6005b9e8 100644 (file)
@@ -17,6 +17,7 @@ TARGETS += sysctl
 TARGETS += timers
 TARGETS += user
 TARGETS += vm
+TARGETS += x86
 #Please keep the TARGETS list alphabetically sorted
 
 TARGETS_HOTPLUG = cpu-hotplug
@@ -55,7 +56,40 @@ clean_hotplug:
                make -C $$TARGET clean; \
        done;
 
+INSTALL_PATH ?= install
+INSTALL_PATH := $(abspath $(INSTALL_PATH))
+ALL_SCRIPT := $(INSTALL_PATH)/run_kselftest.sh
+
+install:
+ifdef INSTALL_PATH
+       @# Ask all targets to install their files
+       mkdir -p $(INSTALL_PATH)
+       for TARGET in $(TARGETS); do \
+               mkdir -p $(INSTALL_PATH)/$$TARGET ; \
+               make -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \
+       done;
+
+       @# Ask all targets to emit their test scripts
+       echo "#!/bin/bash" > $(ALL_SCRIPT)
+       echo "cd \$$(dirname \$$0)" >> $(ALL_SCRIPT)
+       echo "ROOT=\$$PWD" >> $(ALL_SCRIPT)
+
+       for TARGET in $(TARGETS); do \
+               echo "echo ; echo Running tests in $$TARGET" >> $(ALL_SCRIPT); \
+               echo "echo ========================================" >> $(ALL_SCRIPT); \
+               echo "cd $$TARGET" >> $(ALL_SCRIPT); \
+               make -s --no-print-directory -C $$TARGET emit_tests >> $(ALL_SCRIPT); \
+               echo "cd \$$ROOT" >> $(ALL_SCRIPT); \
+       done;
+
+       chmod u+x $(ALL_SCRIPT)
+else
+       $(error Error: set INSTALL_PATH to use install)
+endif
+
 clean:
        for TARGET in $(TARGETS); do \
                make -C $$TARGET clean; \
        done;
+
+.PHONY: install
index e18b42b254af814d7eb03f9459a2d24f67397077..1822356402090df03ee6311cb99dd7e2c7db94d7 100644 (file)
@@ -16,8 +16,9 @@ else
        echo "Not an x86 target, can't build breakpoints selftests"
 endif
 
-run_tests:
-       @./breakpoint_test || echo "breakpoints selftests: [FAIL]"
+TEST_PROGS := breakpoint_test
+
+include ../lib.mk
 
 clean:
        rm -fr breakpoint_test
index e9c28d8dc84bf2426e931b063477dd700a95ffab..fe1f99101c5d5023d4687893ed0bf7a5e876f1a9 100644 (file)
@@ -1,9 +1,10 @@
 all:
 
-run_tests:
-       @/bin/bash ./on-off-test.sh || echo "cpu-hotplug selftests: [FAIL]"
+TEST_PROGS := cpu-on-off-test.sh
+
+include ../lib.mk
 
 run_full_test:
-       @/bin/bash ./on-off-test.sh -a || echo "cpu-hotplug selftests: [FAIL]"
+       @/bin/bash ./cpu-on-off-test.sh -a || echo "cpu-hotplug selftests: [FAIL]"
 
 clean:
diff --git a/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
new file mode 100755 (executable)
index 0000000..98b1d65
--- /dev/null
@@ -0,0 +1,269 @@
+#!/bin/bash
+
+SYSFS=
+
+prerequisite()
+{
+       msg="skip all tests:"
+
+       if [ $UID != 0 ]; then
+               echo $msg must be run as root >&2
+               exit 0
+       fi
+
+       taskset -p 01 $$
+
+       SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
+
+       if [ ! -d "$SYSFS" ]; then
+               echo $msg sysfs is not mounted >&2
+               exit 0
+       fi
+
+       if ! ls $SYSFS/devices/system/cpu/cpu* > /dev/null 2>&1; then
+               echo $msg cpu hotplug is not supported >&2
+               exit 0
+       fi
+
+       echo "CPU online/offline summary:"
+       online_cpus=`cat $SYSFS/devices/system/cpu/online`
+       online_max=${online_cpus##*-}
+       echo -e "\t Cpus in online state: $online_cpus"
+
+       offline_cpus=`cat $SYSFS/devices/system/cpu/offline`
+       if [[ "a$offline_cpus" = "a" ]]; then
+               offline_cpus=0
+       else
+               offline_max=${offline_cpus##*-}
+       fi
+       echo -e "\t Cpus in offline state: $offline_cpus"
+}
+
+#
+# list all hot-pluggable CPUs
+#
+hotpluggable_cpus()
+{
+       local state=${1:-.\*}
+
+       for cpu in $SYSFS/devices/system/cpu/cpu*; do
+               if [ -f $cpu/online ] && grep -q $state $cpu/online; then
+                       echo ${cpu##/*/cpu}
+               fi
+       done
+}
+
+hotplaggable_offline_cpus()
+{
+       hotpluggable_cpus 0
+}
+
+hotpluggable_online_cpus()
+{
+       hotpluggable_cpus 1
+}
+
+cpu_is_online()
+{
+       grep -q 1 $SYSFS/devices/system/cpu/cpu$1/online
+}
+
+cpu_is_offline()
+{
+       grep -q 0 $SYSFS/devices/system/cpu/cpu$1/online
+}
+
+online_cpu()
+{
+       echo 1 > $SYSFS/devices/system/cpu/cpu$1/online
+}
+
+offline_cpu()
+{
+       echo 0 > $SYSFS/devices/system/cpu/cpu$1/online
+}
+
+online_cpu_expect_success()
+{
+       local cpu=$1
+
+       if ! online_cpu $cpu; then
+               echo $FUNCNAME $cpu: unexpected fail >&2
+       elif ! cpu_is_online $cpu; then
+               echo $FUNCNAME $cpu: unexpected offline >&2
+       fi
+}
+
+online_cpu_expect_fail()
+{
+       local cpu=$1
+
+       if online_cpu $cpu 2> /dev/null; then
+               echo $FUNCNAME $cpu: unexpected success >&2
+       elif ! cpu_is_offline $cpu; then
+               echo $FUNCNAME $cpu: unexpected online >&2
+       fi
+}
+
+offline_cpu_expect_success()
+{
+       local cpu=$1
+
+       if ! offline_cpu $cpu; then
+               echo $FUNCNAME $cpu: unexpected fail >&2
+       elif ! cpu_is_offline $cpu; then
+               echo $FUNCNAME $cpu: unexpected online >&2
+       fi
+}
+
+offline_cpu_expect_fail()
+{
+       local cpu=$1
+
+       if offline_cpu $cpu 2> /dev/null; then
+               echo $FUNCNAME $cpu: unexpected success >&2
+       elif ! cpu_is_online $cpu; then
+               echo $FUNCNAME $cpu: unexpected offline >&2
+       fi
+}
+
+error=-12
+allcpus=0
+priority=0
+online_cpus=0
+online_max=0
+offline_cpus=0
+offline_max=0
+
+while getopts e:ahp: opt; do
+       case $opt in
+       e)
+               error=$OPTARG
+               ;;
+       a)
+               allcpus=1
+               ;;
+       h)
+               echo "Usage $0 [ -a ] [ -e errno ] [ -p notifier-priority ]"
+               echo -e "\t default offline one cpu"
+               echo -e "\t run with -a option to offline all cpus"
+               exit
+               ;;
+       p)
+               priority=$OPTARG
+               ;;
+       esac
+done
+
+if ! [ "$error" -ge -4095 -a "$error" -lt 0 ]; then
+       echo "error code must be -4095 <= errno < 0" >&2
+       exit 1
+fi
+
+prerequisite
+
+#
+# Safe test (default) - offline and online one cpu
+#
+if [ $allcpus -eq 0 ]; then
+       echo "Limited scope test: one hotplug cpu"
+       echo -e "\t (leaves cpu in the original state):"
+       echo -e "\t online to offline to online: cpu $online_max"
+       offline_cpu_expect_success $online_max
+       online_cpu_expect_success $online_max
+
+       if [[ $offline_cpus -gt 0 ]]; then
+               echo -e "\t offline to online to offline: cpu $offline_max"
+               online_cpu_expect_success $offline_max
+               offline_cpu_expect_success $offline_max
+       fi
+       exit 0
+else
+       echo "Full scope test: all hotplug cpus"
+       echo -e "\t online all offline cpus"
+       echo -e "\t offline all online cpus"
+       echo -e "\t online all offline cpus"
+fi
+
+#
+# Online all hot-pluggable CPUs
+#
+for cpu in `hotplaggable_offline_cpus`; do
+       online_cpu_expect_success $cpu
+done
+
+#
+# Offline all hot-pluggable CPUs
+#
+for cpu in `hotpluggable_online_cpus`; do
+       offline_cpu_expect_success $cpu
+done
+
+#
+# Online all hot-pluggable CPUs again
+#
+for cpu in `hotplaggable_offline_cpus`; do
+       online_cpu_expect_success $cpu
+done
+
+#
+# Test with cpu notifier error injection
+#
+
+DEBUGFS=`mount -t debugfs | head -1 | awk '{ print $3 }'`
+NOTIFIER_ERR_INJECT_DIR=$DEBUGFS/notifier-error-inject/cpu
+
+prerequisite_extra()
+{
+       msg="skip extra tests:"
+
+       /sbin/modprobe -q -r cpu-notifier-error-inject
+       /sbin/modprobe -q cpu-notifier-error-inject priority=$priority
+
+       if [ ! -d "$DEBUGFS" ]; then
+               echo $msg debugfs is not mounted >&2
+               exit 0
+       fi
+
+       if [ ! -d $NOTIFIER_ERR_INJECT_DIR ]; then
+               echo $msg cpu-notifier-error-inject module is not available >&2
+               exit 0
+       fi
+}
+
+prerequisite_extra
+
+#
+# Offline all hot-pluggable CPUs
+#
+echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
+for cpu in `hotpluggable_online_cpus`; do
+       offline_cpu_expect_success $cpu
+done
+
+#
+# Test CPU hot-add error handling (offline => online)
+#
+echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_UP_PREPARE/error
+for cpu in `hotplaggable_offline_cpus`; do
+       online_cpu_expect_fail $cpu
+done
+
+#
+# Online all hot-pluggable CPUs
+#
+echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_UP_PREPARE/error
+for cpu in `hotplaggable_offline_cpus`; do
+       online_cpu_expect_success $cpu
+done
+
+#
+# Test CPU hot-remove error handling (online => offline)
+#
+echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
+for cpu in `hotpluggable_online_cpus`; do
+       offline_cpu_expect_fail $cpu
+done
+
+echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
+/sbin/modprobe -q -r cpu-notifier-error-inject
diff --git a/tools/testing/selftests/cpu-hotplug/on-off-test.sh b/tools/testing/selftests/cpu-hotplug/on-off-test.sh
deleted file mode 100644 (file)
index 98b1d65..0000000
+++ /dev/null
@@ -1,269 +0,0 @@
-#!/bin/bash
-
-SYSFS=
-
-prerequisite()
-{
-       msg="skip all tests:"
-
-       if [ $UID != 0 ]; then
-               echo $msg must be run as root >&2
-               exit 0
-       fi
-
-       taskset -p 01 $$
-
-       SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
-
-       if [ ! -d "$SYSFS" ]; then
-               echo $msg sysfs is not mounted >&2
-               exit 0
-       fi
-
-       if ! ls $SYSFS/devices/system/cpu/cpu* > /dev/null 2>&1; then
-               echo $msg cpu hotplug is not supported >&2
-               exit 0
-       fi
-
-       echo "CPU online/offline summary:"
-       online_cpus=`cat $SYSFS/devices/system/cpu/online`
-       online_max=${online_cpus##*-}
-       echo -e "\t Cpus in online state: $online_cpus"
-
-       offline_cpus=`cat $SYSFS/devices/system/cpu/offline`
-       if [[ "a$offline_cpus" = "a" ]]; then
-               offline_cpus=0
-       else
-               offline_max=${offline_cpus##*-}
-       fi
-       echo -e "\t Cpus in offline state: $offline_cpus"
-}
-
-#
-# list all hot-pluggable CPUs
-#
-hotpluggable_cpus()
-{
-       local state=${1:-.\*}
-
-       for cpu in $SYSFS/devices/system/cpu/cpu*; do
-               if [ -f $cpu/online ] && grep -q $state $cpu/online; then
-                       echo ${cpu##/*/cpu}
-               fi
-       done
-}
-
-hotplaggable_offline_cpus()
-{
-       hotpluggable_cpus 0
-}
-
-hotpluggable_online_cpus()
-{
-       hotpluggable_cpus 1
-}
-
-cpu_is_online()
-{
-       grep -q 1 $SYSFS/devices/system/cpu/cpu$1/online
-}
-
-cpu_is_offline()
-{
-       grep -q 0 $SYSFS/devices/system/cpu/cpu$1/online
-}
-
-online_cpu()
-{
-       echo 1 > $SYSFS/devices/system/cpu/cpu$1/online
-}
-
-offline_cpu()
-{
-       echo 0 > $SYSFS/devices/system/cpu/cpu$1/online
-}
-
-online_cpu_expect_success()
-{
-       local cpu=$1
-
-       if ! online_cpu $cpu; then
-               echo $FUNCNAME $cpu: unexpected fail >&2
-       elif ! cpu_is_online $cpu; then
-               echo $FUNCNAME $cpu: unexpected offline >&2
-       fi
-}
-
-online_cpu_expect_fail()
-{
-       local cpu=$1
-
-       if online_cpu $cpu 2> /dev/null; then
-               echo $FUNCNAME $cpu: unexpected success >&2
-       elif ! cpu_is_offline $cpu; then
-               echo $FUNCNAME $cpu: unexpected online >&2
-       fi
-}
-
-offline_cpu_expect_success()
-{
-       local cpu=$1
-
-       if ! offline_cpu $cpu; then
-               echo $FUNCNAME $cpu: unexpected fail >&2
-       elif ! cpu_is_offline $cpu; then
-               echo $FUNCNAME $cpu: unexpected offline >&2
-       fi
-}
-
-offline_cpu_expect_fail()
-{
-       local cpu=$1
-
-       if offline_cpu $cpu 2> /dev/null; then
-               echo $FUNCNAME $cpu: unexpected success >&2
-       elif ! cpu_is_online $cpu; then
-               echo $FUNCNAME $cpu: unexpected offline >&2
-       fi
-}
-
-error=-12
-allcpus=0
-priority=0
-online_cpus=0
-online_max=0
-offline_cpus=0
-offline_max=0
-
-while getopts e:ahp: opt; do
-       case $opt in
-       e)
-               error=$OPTARG
-               ;;
-       a)
-               allcpus=1
-               ;;
-       h)
-               echo "Usage $0 [ -a ] [ -e errno ] [ -p notifier-priority ]"
-               echo -e "\t default offline one cpu"
-               echo -e "\t run with -a option to offline all cpus"
-               exit
-               ;;
-       p)
-               priority=$OPTARG
-               ;;
-       esac
-done
-
-if ! [ "$error" -ge -4095 -a "$error" -lt 0 ]; then
-       echo "error code must be -4095 <= errno < 0" >&2
-       exit 1
-fi
-
-prerequisite
-
-#
-# Safe test (default) - offline and online one cpu
-#
-if [ $allcpus -eq 0 ]; then
-       echo "Limited scope test: one hotplug cpu"
-       echo -e "\t (leaves cpu in the original state):"
-       echo -e "\t online to offline to online: cpu $online_max"
-       offline_cpu_expect_success $online_max
-       online_cpu_expect_success $online_max
-
-       if [[ $offline_cpus -gt 0 ]]; then
-               echo -e "\t offline to online to offline: cpu $offline_max"
-               online_cpu_expect_success $offline_max
-               offline_cpu_expect_success $offline_max
-       fi
-       exit 0
-else
-       echo "Full scope test: all hotplug cpus"
-       echo -e "\t online all offline cpus"
-       echo -e "\t offline all online cpus"
-       echo -e "\t online all offline cpus"
-fi
-
-#
-# Online all hot-pluggable CPUs
-#
-for cpu in `hotplaggable_offline_cpus`; do
-       online_cpu_expect_success $cpu
-done
-
-#
-# Offline all hot-pluggable CPUs
-#
-for cpu in `hotpluggable_online_cpus`; do
-       offline_cpu_expect_success $cpu
-done
-
-#
-# Online all hot-pluggable CPUs again
-#
-for cpu in `hotplaggable_offline_cpus`; do
-       online_cpu_expect_success $cpu
-done
-
-#
-# Test with cpu notifier error injection
-#
-
-DEBUGFS=`mount -t debugfs | head -1 | awk '{ print $3 }'`
-NOTIFIER_ERR_INJECT_DIR=$DEBUGFS/notifier-error-inject/cpu
-
-prerequisite_extra()
-{
-       msg="skip extra tests:"
-
-       /sbin/modprobe -q -r cpu-notifier-error-inject
-       /sbin/modprobe -q cpu-notifier-error-inject priority=$priority
-
-       if [ ! -d "$DEBUGFS" ]; then
-               echo $msg debugfs is not mounted >&2
-               exit 0
-       fi
-
-       if [ ! -d $NOTIFIER_ERR_INJECT_DIR ]; then
-               echo $msg cpu-notifier-error-inject module is not available >&2
-               exit 0
-       fi
-}
-
-prerequisite_extra
-
-#
-# Offline all hot-pluggable CPUs
-#
-echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
-for cpu in `hotpluggable_online_cpus`; do
-       offline_cpu_expect_success $cpu
-done
-
-#
-# Test CPU hot-add error handling (offline => online)
-#
-echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_UP_PREPARE/error
-for cpu in `hotplaggable_offline_cpus`; do
-       online_cpu_expect_fail $cpu
-done
-
-#
-# Online all hot-pluggable CPUs
-#
-echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_UP_PREPARE/error
-for cpu in `hotplaggable_offline_cpus`; do
-       online_cpu_expect_success $cpu
-done
-
-#
-# Test CPU hot-remove error handling (online => offline)
-#
-echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
-for cpu in `hotpluggable_online_cpus`; do
-       offline_cpu_expect_fail $cpu
-done
-
-echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
-/sbin/modprobe -q -r cpu-notifier-error-inject
index 29e8c6bc81b04330ee5a360218f0a8fb787ef3c3..736c3ddfc787499a3a114ab87ad173054e5468c2 100644 (file)
@@ -1,12 +1,13 @@
-CC = $(CROSS_COMPILE)gcc
 CFLAGS = -Wall
 
 test_objs = open-unlink create-read
 
 all: $(test_objs)
 
-run_tests: all
-       @/bin/bash ./efivarfs.sh || echo "efivarfs selftests: [FAIL]"
+TEST_PROGS := efivarfs.sh
+TEST_FILES := $(test_objs)
+
+include ../lib.mk
 
 clean:
        rm -f $(test_objs)
old mode 100644 (file)
new mode 100755 (executable)
index 66dfc2ce178896ffe0938b5dc09f1f339d93076c..4edb7d0da29b81aa1b0240ed600a42c42d63df9f 100644 (file)
@@ -1,4 +1,3 @@
-CC = $(CROSS_COMPILE)gcc
 CFLAGS = -Wall
 BINARIES = execveat
 DEPS = execveat.symlink execveat.denatured script subdir
@@ -18,8 +17,12 @@ execveat.denatured: execveat
 %: %.c
        $(CC) $(CFLAGS) -o $@ $^
 
-run_tests: all
-       ./execveat
+TEST_PROGS := execveat
+TEST_FILES := $(DEPS)
+
+include ../lib.mk
+
+override EMIT_TESTS := echo "mkdir -p subdir; (./execveat && echo \"selftests: execveat [PASS]\") || echo \"selftests: execveat [FAIL]\""
 
 clean:
        rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx*
index e23cce0bbc3a5ee90db9ef01c1a7b44c7d3e41aa..9bf82234855b8f5b0a831ecc97385a5dc5ef4728 100644 (file)
@@ -3,25 +3,9 @@
 # No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
 all:
 
-fw_filesystem:
-       @if /bin/sh ./fw_filesystem.sh ; then \
-                echo "fw_filesystem: ok"; \
-        else \
-                echo "fw_filesystem: [FAIL]"; \
-                exit 1; \
-        fi
+TEST_PROGS := fw_filesystem.sh fw_userhelper.sh
 
-fw_userhelper:
-       @if /bin/sh ./fw_userhelper.sh ; then \
-                echo "fw_userhelper: ok"; \
-        else \
-                echo "fw_userhelper: [FAIL]"; \
-                exit 1; \
-        fi
-
-run_tests: all fw_filesystem fw_userhelper
+include ../lib.mk
 
 # Nothing to clean up.
 clean:
-
-.PHONY: all clean run_tests fw_filesystem fw_userhelper
index 76cc9f1562679dcd48ecd3ab5f4b15228cb92f24..346720639d1d6c6d126379738969e679455ae630 100644 (file)
@@ -1,7 +1,8 @@
 all:
 
-run_tests:
-       @/bin/sh ./ftracetest || echo "ftrace selftests: [FAIL]"
+TEST_PROGS := ftracetest
+
+include ../lib.mk
 
 clean:
        rm -rf logs/*
index fd9c49a13612b2a3334a7f2eab6467e6844c861d..aa51f6c17359e18de0823f125ed7239b5f29c841 100644 (file)
@@ -2,4 +2,4 @@
 # description: Basic event tracing check
 test -f available_events -a -f set_event -a -d events
 # check scheduler events are available
-grep -q sched available_events && exit 0 || exit -1
\ No newline at end of file
+grep -q sched available_events && exit 0 || exit $FAIL
index 668616d9bb0364ce7be40f02a550602cda826b4f..87eb9d6dd4ca0e376faaa4eea161026417b2971f 100644 (file)
@@ -9,7 +9,11 @@ do_reset() {
 fail() { #msg
     do_reset
     echo $1
-    exit -1
+    exit $FAIL
+}
+
+yield() {
+    ping localhost -c 1 || sleep .001 || usleep 1 || sleep 1
 }
 
 if [ ! -f set_event -o ! -d events/sched ]; then
@@ -21,7 +25,8 @@ reset_tracer
 do_reset
 
 echo 'sched:sched_switch' > set_event
-usleep 1
+
+yield
 
 count=`cat trace | grep sched_switch | wc -l`
 if [ $count -eq 0 ]; then
@@ -31,7 +36,8 @@ fi
 do_reset
 
 echo 1 > events/sched/sched_switch/enable
-usleep 1
+
+yield
 
 count=`cat trace | grep sched_switch | wc -l`
 if [ $count -eq 0 ]; then
@@ -41,7 +47,8 @@ fi
 do_reset
 
 echo 0 > events/sched/sched_switch/enable
-usleep 1
+
+yield
 
 count=`cat trace | grep sched_switch | wc -l`
 if [ $count -ne 0 ]; then
index 655c415b6e7ff9b350474951aaff90cbd2727d19..ced27ef0638f2d3f21cd3a4fc436ebdd7bc8b259 100644 (file)
@@ -9,7 +9,11 @@ do_reset() {
 fail() { #msg
     do_reset
     echo $1
-    exit -1
+    exit $FAIL
+}
+
+yield() {
+    ping localhost -c 1 || sleep .001 || usleep 1 || sleep 1
 }
 
 if [ ! -f set_event -o ! -d events/sched ]; then
@@ -21,7 +25,8 @@ reset_tracer
 do_reset
 
 echo 'sched:*' > set_event
-usleep 1
+
+yield
 
 count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
 if [ $count -lt 3 ]; then
@@ -31,7 +36,8 @@ fi
 do_reset
 
 echo 1 > events/sched/enable
-usleep 1
+
+yield
 
 count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
 if [ $count -lt 3 ]; then
@@ -41,7 +47,8 @@ fi
 do_reset
 
 echo 0 > events/sched/enable
-usleep 1
+
+yield
 
 count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
 if [ $count -ne 0 ]; then
index 480845774007f7d9ba9ccf43fa272b9858f5ef53..0bb5df3c00d41701f7392c73c31c95b10867f1d8 100644 (file)
@@ -9,7 +9,11 @@ do_reset() {
 fail() { #msg
     do_reset
     echo $1
-    exit -1
+    exit $FAIL
+}
+
+yield() {
+    ping localhost -c 1 || sleep .001 || usleep 1 || sleep 1
 }
 
 if [ ! -f available_events -o ! -f set_event -o ! -d events ]; then
@@ -21,6 +25,9 @@ reset_tracer
 do_reset
 
 echo '*:*' > set_event
+
+yield
+
 count=`cat trace | grep -v ^# | wc -l`
 if [ $count -eq 0 ]; then
     fail "none of events are recorded"
@@ -29,6 +36,9 @@ fi
 do_reset
 
 echo 1 > events/enable
+
+yield
+
 count=`cat trace | grep -v ^# | wc -l`
 if [ $count -eq 0 ]; then
     fail "none of events are recorded"
@@ -37,6 +47,9 @@ fi
 do_reset
 
 echo 0 > events/enable
+
+yield
+
 count=`cat trace | grep -v ^# | wc -l`
 if [ $count -ne 0 ]; then
     fail "any of events should not be recorded"
index c15e018e022085c7e8e1ed619d5a9de7aaff7470..15c2dba06ea288a9d73652bfdcf001bd52fda936 100644 (file)
@@ -16,7 +16,9 @@ fi
 
 do_reset() {
     reset_tracer
-    echo 0 > /proc/sys/kernel/stack_tracer_enabled
+    if [ -e /proc/sys/kernel/stack_tracer_enabled ]; then
+           echo 0 > /proc/sys/kernel/stack_tracer_enabled
+    fi
     enable_tracing
     clear_trace
     echo > set_ftrace_filter
@@ -25,7 +27,7 @@ do_reset() {
 fail() { # msg
     do_reset
     echo $1
-    exit -1
+    exit $FAIL
 }
 
 disable_tracing
index 6af5f6360b184c32bdee9f8c4d40327909e839c7..0ab2189613efe534b6e443e2ee01d3ecbecd6c2b 100644 (file)
@@ -17,7 +17,7 @@ do_reset() {
 fail() { # msg
     do_reset
     echo $1
-    exit -1
+    exit $FAIL
 }
 
 disable_tracing
index 2e719cb1fc4d91492df00c37562898e68022a4a1..7808336d6f50832a456eca8061c6a88c3ba831b5 100644 (file)
@@ -31,7 +31,7 @@ fail() { # mesg
     reset_tracer
     echo > set_ftrace_filter
     echo $1
-    exit -1
+    exit $FAIL
 }
 
 echo "Testing function tracer with profiler:"
diff --git a/tools/testing/selftests/gen_kselftest_tar.sh b/tools/testing/selftests/gen_kselftest_tar.sh
new file mode 100755 (executable)
index 0000000..17d5bd0
--- /dev/null
@@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# gen_kselftest_tar
+# Generate kselftest tarball
+# Author: Shuah Khan <shuahkh@osg.samsung.com>
+# Copyright (C) 2015 Samsung Electronics Co., Ltd.
+
+# This software may be freely redistributed under the terms of the GNU
+# General Public License (GPLv2).
+
+# main
+main()
+{
+       if [ "$#" -eq 0 ]; then
+               echo "$0: Generating default compression gzip"
+               copts="cvzf"
+               ext=".tar.gz"
+       else
+               case "$1" in
+                       tar)
+                               copts="cvf"
+                               ext=".tar"
+                               ;;
+                       targz)
+                               copts="cvzf"
+                               ext=".tar.gz"
+                               ;;
+                       tarbz2)
+                               copts="cvjf"
+                               ext=".tar.bz2"
+                               ;;
+                       tarxz)
+                               copts="cvJf"
+                               ext=".tar.xz"
+                               ;;
+                       *)
+                               echo "Unknown tarball format $1"
+                               exit 1
+                               ;;
+                       esac
+       fi
+
+       install_dir=./kselftest
+
+# Run the kselftest install script to populate the default install
+# directory
+./kselftest_install.sh
+tar $copts kselftest${ext} $install_dir
+echo "Kselftest archive kselftest${ext} created!"
+
+# clean up install directory
+rm -rf kselftest
+}
+
+main "$@"
index 74bbefdeaf4c187b07c02e67fedd74619f6215d6..25d2e702c68a53f7021d47404fd59d69241f430c 100644 (file)
@@ -12,14 +12,11 @@ endif
 CFLAGS += -I../../../../usr/include/
 
 all:
-ifeq ($(ARCH),x86)
-       gcc $(CFLAGS) msgque.c -o msgque_test
-else
-       echo "Not an x86 target, can't build msgque selftest"
-endif
+       $(CC) $(CFLAGS) msgque.c -o msgque_test
+
+TEST_PROGS := msgque_test
 
-run_tests: all
-       ./msgque_test
+include ../lib.mk
 
 clean:
        rm -fr ./msgque_test
index ff0eefdc6ceb92168b298831bf046f01b1ec1bf1..2ae7450a9a8984e2f0ba86a96842010375076757 100644 (file)
@@ -1,10 +1,10 @@
-CC := $(CROSS_COMPILE)$(CC)
 CFLAGS += -I../../../../usr/include/
 
 all: kcmp_test
 
-run_tests: all
-       @./kcmp_test || echo "kcmp_test: [FAIL]"
+TEST_PROGS := kcmp_test
+
+include ../lib.mk
 
 clean:
        $(RM) kcmp_test kcmp-test-file
diff --git a/tools/testing/selftests/kselftest_install.sh b/tools/testing/selftests/kselftest_install.sh
new file mode 100755 (executable)
index 0000000..1555fbd
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/bash
+#
+# Kselftest Install
+# Install kselftest tests
+# Author: Shuah Khan <shuahkh@osg.samsung.com>
+# Copyright (C) 2015 Samsung Electronics Co., Ltd.
+
+# This software may be freely redistributed under the terms of the GNU
+# General Public License (GPLv2).
+
+install_loc=`pwd`
+
+main()
+{
+       if [ $(basename $install_loc) !=  "selftests" ]; then
+               echo "$0: Please run it in selftests directory ..."
+               exit 1;
+       fi
+       if [ "$#" -eq 0 ]; then
+               echo "$0: Installing in default location - $install_loc ..."
+       elif [ ! -d "$1" ]; then
+               echo "$0: $1 doesn't exist!!"
+               exit 1;
+       else
+               install_loc=$1
+               echo "$0: Installing in specified location - $install_loc ..."
+       fi
+
+       install_dir=$install_loc/kselftest
+
+       # Create install directory
+       mkdir -p $install_dir
+       # Build tests
+       INSTALL_PATH=$install_dir make install
+}
+
+main "$@"
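A usage sketch for kselftest_install.sh, assuming it is run from tools/testing/selftests of a built tree; the optional argument must be an existing directory, otherwise the tests land in ./kselftest:

    cd tools/testing/selftests
    ./kselftest_install.sh            # installs into ./kselftest
    mkdir -p /tmp/ksft                # hypothetical destination
    ./kselftest_install.sh /tmp/ksft  # installs into /tmp/ksft/kselftest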
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
new file mode 100644 (file)
index 0000000..2194155
--- /dev/null
@@ -0,0 +1,35 @@
+# This mimics the top-level Makefile. We do it explicitly here so that this
+# Makefile can operate with or without the kbuild infrastructure.
+CC := $(CROSS_COMPILE)gcc
+
+define RUN_TESTS
+       @for TEST in $(TEST_PROGS); do \
+               (./$$TEST && echo "selftests: $$TEST [PASS]") || echo "selftests: $$TEST [FAIL]"; \
+       done;
+endef
+
+run_tests: all
+       $(RUN_TESTS)
+
+define INSTALL_RULE
+       mkdir -p $(INSTALL_PATH)
+       install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
+endef
+
+install: all
+ifdef INSTALL_PATH
+       $(INSTALL_RULE)
+else
+       $(error Error: set INSTALL_PATH to use install)
+endif
+
+define EMIT_TESTS
+       @for TEST in $(TEST_PROGS); do \
+               echo "(./$$TEST && echo \"selftests: $$TEST [PASS]\") || echo \"selftests: $$TEST [FAIL]\""; \
+       done;
+endef
+
+emit_tests:
+       $(EMIT_TESTS)
+
+.PHONY: run_tests all clean install emit_tests
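With lib.mk in place, a per-test Makefile only sets TEST_PROGS (plus TEST_PROGS_EXTENDED and TEST_FILES where needed) and includes ../lib.mk to pick up run_tests, install and emit_tests. A sketch of the resulting invocations, using the kcmp directory converted above as the example:

    make -C tools/testing/selftests/kcmp run_tests
    make -C tools/testing/selftests/kcmp install INSTALL_PATH=/tmp/ksft   # lib.mk errors out without INSTALL_PATH
    make -s -C tools/testing/selftests/kcmp emit_tests                    # prints the run commands instead of executing them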
index b80cd10d53bac058926758b7df013d1190d6e67f..3e7eb7972511c657af9ca54e6468542a42a5bd33 100644 (file)
@@ -1,17 +1,19 @@
+CC = $(CROSS_COMPILE)gcc
 CFLAGS += -D_FILE_OFFSET_BITS=64
 CFLAGS += -I../../../../include/uapi/
 CFLAGS += -I../../../../include/
+CFLAGS += -I../../../../usr/include/
 
 all:
-       gcc $(CFLAGS) memfd_test.c -o memfd_test
+       $(CC) $(CFLAGS) memfd_test.c -o memfd_test
 
-run_tests: all
-       gcc $(CFLAGS) memfd_test.c -o memfd_test
-       @./memfd_test || echo "memfd_test: [FAIL]"
+TEST_PROGS := memfd_test
+
+include ../lib.mk
 
 build_fuse:
-       gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt
-       gcc $(CFLAGS) fuse_test.c -o fuse_test
+       $(CC) $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt
+       $(CC) $(CFLAGS) fuse_test.c -o fuse_test
 
 run_fuse: build_fuse
        @./run_fuse_test.sh || echo "fuse_test: [FAIL]"
index d46b8d489cd252f18450b06dad87bb87f672e117..afb2624c704894fc280f553fbad9ac1f9fb07363 100644 (file)
@@ -1,9 +1,12 @@
 all:
 
-run_tests:
-       @/bin/bash ./on-off-test.sh -r 2 || echo "memory-hotplug selftests: [FAIL]"
+include ../lib.mk
+
+TEST_PROGS := mem-on-off-test.sh
+override RUN_TESTS := ./mem-on-off-test.sh -r 2 || echo "selftests: memory-hotplug [FAIL]"
+override EMIT_TESTS := echo "$(RUN_TESTS)"
 
 run_full_test:
-       @/bin/bash ./on-off-test.sh || echo "memory-hotplug selftests: [FAIL]"
+       @/bin/bash ./mem-on-off-test.sh || echo "memory-hotplug selftests: [FAIL]"
 
 clean:
diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh
new file mode 100755 (executable)
index 0000000..6cddde0
--- /dev/null
@@ -0,0 +1,238 @@
+#!/bin/bash
+
+SYSFS=
+
+prerequisite()
+{
+       msg="skip all tests:"
+
+       if [ $UID != 0 ]; then
+               echo $msg must be run as root >&2
+               exit 0
+       fi
+
+       SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
+
+       if [ ! -d "$SYSFS" ]; then
+               echo $msg sysfs is not mounted >&2
+               exit 0
+       fi
+
+       if ! ls $SYSFS/devices/system/memory/memory* > /dev/null 2>&1; then
+               echo $msg memory hotplug is not supported >&2
+               exit 0
+       fi
+}
+
+#
+# list all hot-pluggable memory
+#
+hotpluggable_memory()
+{
+       local state=${1:-.\*}
+
+       for memory in $SYSFS/devices/system/memory/memory*; do
+               if grep -q 1 $memory/removable &&
+                  grep -q $state $memory/state; then
+                       echo ${memory##/*/memory}
+               fi
+       done
+}
+
+hotplaggable_offline_memory()
+{
+       hotpluggable_memory offline
+}
+
+hotpluggable_online_memory()
+{
+       hotpluggable_memory online
+}
+
+memory_is_online()
+{
+       grep -q online $SYSFS/devices/system/memory/memory$1/state
+}
+
+memory_is_offline()
+{
+       grep -q offline $SYSFS/devices/system/memory/memory$1/state
+}
+
+online_memory()
+{
+       echo online > $SYSFS/devices/system/memory/memory$1/state
+}
+
+offline_memory()
+{
+       echo offline > $SYSFS/devices/system/memory/memory$1/state
+}
+
+online_memory_expect_success()
+{
+       local memory=$1
+
+       if ! online_memory $memory; then
+               echo $FUNCNAME $memory: unexpected fail >&2
+       elif ! memory_is_online $memory; then
+               echo $FUNCNAME $memory: unexpected offline >&2
+       fi
+}
+
+online_memory_expect_fail()
+{
+       local memory=$1
+
+       if online_memory $memory 2> /dev/null; then
+               echo $FUNCNAME $memory: unexpected success >&2
+       elif ! memory_is_offline $memory; then
+               echo $FUNCNAME $memory: unexpected online >&2
+       fi
+}
+
+offline_memory_expect_success()
+{
+       local memory=$1
+
+       if ! offline_memory $memory; then
+               echo $FUNCNAME $memory: unexpected fail >&2
+       elif ! memory_is_offline $memory; then
+               echo $FUNCNAME $memory: unexpected online >&2
+       fi
+}
+
+offline_memory_expect_fail()
+{
+       local memory=$1
+
+       if offline_memory $memory 2> /dev/null; then
+               echo $FUNCNAME $memory: unexpected success >&2
+       elif ! memory_is_online $memory; then
+               echo $FUNCNAME $memory: unexpected offline >&2
+       fi
+}
+
+error=-12
+priority=0
+ratio=10
+
+while getopts e:hp:r: opt; do
+       case $opt in
+       e)
+               error=$OPTARG
+               ;;
+       h)
+               echo "Usage $0 [ -e errno ] [ -p notifier-priority ] [ -r percent-of-memory-to-offline ]"
+               exit
+               ;;
+       p)
+               priority=$OPTARG
+               ;;
+       r)
+               ratio=$OPTARG
+               ;;
+       esac
+done
+
+if ! [ "$error" -ge -4095 -a "$error" -lt 0 ]; then
+       echo "error code must be -4095 <= errno < 0" >&2
+       exit 1
+fi
+
+prerequisite
+
+echo "Test scope: $ratio% hotplug memory"
+echo -e "\t online all hotplug memory in offline state"
+echo -e "\t offline $ratio% hotplug memory in online state"
+echo -e "\t online all hotplug memory in offline state"
+
+#
+# Online all hot-pluggable memory
+#
+for memory in `hotplaggable_offline_memory`; do
+       echo offline-online $memory
+       online_memory_expect_success $memory
+done
+
+#
+# Offline $ratio percent of hot-pluggable memory
+#
+for memory in `hotpluggable_online_memory`; do
+       if [ $((RANDOM % 100)) -lt $ratio ]; then
+               echo online-offline $memory
+               offline_memory_expect_success $memory
+       fi
+done
+
+#
+# Online all hot-pluggable memory again
+#
+for memory in `hotplaggable_offline_memory`; do
+       echo offline-online $memory
+       online_memory_expect_success $memory
+done
+
+#
+# Test with memory notifier error injection
+#
+
+DEBUGFS=`mount -t debugfs | head -1 | awk '{ print $3 }'`
+NOTIFIER_ERR_INJECT_DIR=$DEBUGFS/notifier-error-inject/memory
+
+prerequisite_extra()
+{
+       msg="skip extra tests:"
+
+       /sbin/modprobe -q -r memory-notifier-error-inject
+       /sbin/modprobe -q memory-notifier-error-inject priority=$priority
+
+       if [ ! -d "$DEBUGFS" ]; then
+               echo $msg debugfs is not mounted >&2
+               exit 0
+       fi
+
+       if [ ! -d $NOTIFIER_ERR_INJECT_DIR ]; then
+               echo $msg memory-notifier-error-inject module is not available >&2
+               exit 0
+       fi
+}
+
+prerequisite_extra
+
+#
+# Offline $ratio percent of hot-pluggable memory
+#
+echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error
+for memory in `hotpluggable_online_memory`; do
+       if [ $((RANDOM % 100)) -lt $ratio ]; then
+               offline_memory_expect_success $memory
+       fi
+done
+
+#
+# Test memory hot-add error handling (offline => online)
+#
+echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error
+for memory in `hotplaggable_offline_memory`; do
+       online_memory_expect_fail $memory
+done
+
+#
+# Online all hot-pluggable memory
+#
+echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error
+for memory in `hotplaggable_offline_memory`; do
+       online_memory_expect_success $memory
+done
+
+#
+# Test memory hot-remove error handling (online => offline)
+#
+echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error
+for memory in `hotpluggable_online_memory`; do
+       offline_memory_expect_fail $memory
+done
+
+echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error
+/sbin/modprobe -q -r memory-notifier-error-inject
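A usage sketch for the renamed mem-on-off-test.sh (run as root; the defaults below match the script's built-in values, and the -e value here is assumed to be -EBUSY):

    ./mem-on-off-test.sh              # offline/online 10% of hotplug memory
    ./mem-on-off-test.sh -r 50        # exercise 50% of hotplug memory instead
    ./mem-on-off-test.sh -e -16 -p 0  # inject -EBUSY via memory-notifier-error-inject at priority 0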
diff --git a/tools/testing/selftests/memory-hotplug/on-off-test.sh b/tools/testing/selftests/memory-hotplug/on-off-test.sh
deleted file mode 100644 (file)
index 6cddde0..0000000
+++ /dev/null
@@ -1,238 +0,0 @@
-#!/bin/bash
-
-SYSFS=
-
-prerequisite()
-{
-       msg="skip all tests:"
-
-       if [ $UID != 0 ]; then
-               echo $msg must be run as root >&2
-               exit 0
-       fi
-
-       SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
-
-       if [ ! -d "$SYSFS" ]; then
-               echo $msg sysfs is not mounted >&2
-               exit 0
-       fi
-
-       if ! ls $SYSFS/devices/system/memory/memory* > /dev/null 2>&1; then
-               echo $msg memory hotplug is not supported >&2
-               exit 0
-       fi
-}
-
-#
-# list all hot-pluggable memory
-#
-hotpluggable_memory()
-{
-       local state=${1:-.\*}
-
-       for memory in $SYSFS/devices/system/memory/memory*; do
-               if grep -q 1 $memory/removable &&
-                  grep -q $state $memory/state; then
-                       echo ${memory##/*/memory}
-               fi
-       done
-}
-
-hotplaggable_offline_memory()
-{
-       hotpluggable_memory offline
-}
-
-hotpluggable_online_memory()
-{
-       hotpluggable_memory online
-}
-
-memory_is_online()
-{
-       grep -q online $SYSFS/devices/system/memory/memory$1/state
-}
-
-memory_is_offline()
-{
-       grep -q offline $SYSFS/devices/system/memory/memory$1/state
-}
-
-online_memory()
-{
-       echo online > $SYSFS/devices/system/memory/memory$1/state
-}
-
-offline_memory()
-{
-       echo offline > $SYSFS/devices/system/memory/memory$1/state
-}
-
-online_memory_expect_success()
-{
-       local memory=$1
-
-       if ! online_memory $memory; then
-               echo $FUNCNAME $memory: unexpected fail >&2
-       elif ! memory_is_online $memory; then
-               echo $FUNCNAME $memory: unexpected offline >&2
-       fi
-}
-
-online_memory_expect_fail()
-{
-       local memory=$1
-
-       if online_memory $memory 2> /dev/null; then
-               echo $FUNCNAME $memory: unexpected success >&2
-       elif ! memory_is_offline $memory; then
-               echo $FUNCNAME $memory: unexpected online >&2
-       fi
-}
-
-offline_memory_expect_success()
-{
-       local memory=$1
-
-       if ! offline_memory $memory; then
-               echo $FUNCNAME $memory: unexpected fail >&2
-       elif ! memory_is_offline $memory; then
-               echo $FUNCNAME $memory: unexpected offline >&2
-       fi
-}
-
-offline_memory_expect_fail()
-{
-       local memory=$1
-
-       if offline_memory $memory 2> /dev/null; then
-               echo $FUNCNAME $memory: unexpected success >&2
-       elif ! memory_is_online $memory; then
-               echo $FUNCNAME $memory: unexpected offline >&2
-       fi
-}
-
-error=-12
-priority=0
-ratio=10
-
-while getopts e:hp:r: opt; do
-       case $opt in
-       e)
-               error=$OPTARG
-               ;;
-       h)
-               echo "Usage $0 [ -e errno ] [ -p notifier-priority ] [ -r percent-of-memory-to-offline ]"
-               exit
-               ;;
-       p)
-               priority=$OPTARG
-               ;;
-       r)
-               ratio=$OPTARG
-               ;;
-       esac
-done
-
-if ! [ "$error" -ge -4095 -a "$error" -lt 0 ]; then
-       echo "error code must be -4095 <= errno < 0" >&2
-       exit 1
-fi
-
-prerequisite
-
-echo "Test scope: $ratio% hotplug memory"
-echo -e "\t online all hotplug memory in offline state"
-echo -e "\t offline $ratio% hotplug memory in online state"
-echo -e "\t online all hotplug memory in offline state"
-
-#
-# Online all hot-pluggable memory
-#
-for memory in `hotplaggable_offline_memory`; do
-       echo offline-online $memory
-       online_memory_expect_success $memory
-done
-
-#
-# Offline $ratio percent of hot-pluggable memory
-#
-for memory in `hotpluggable_online_memory`; do
-       if [ $((RANDOM % 100)) -lt $ratio ]; then
-               echo online-offline $memory
-               offline_memory_expect_success $memory
-       fi
-done
-
-#
-# Online all hot-pluggable memory again
-#
-for memory in `hotplaggable_offline_memory`; do
-       echo offline-online $memory
-       online_memory_expect_success $memory
-done
-
-#
-# Test with memory notifier error injection
-#
-
-DEBUGFS=`mount -t debugfs | head -1 | awk '{ print $3 }'`
-NOTIFIER_ERR_INJECT_DIR=$DEBUGFS/notifier-error-inject/memory
-
-prerequisite_extra()
-{
-       msg="skip extra tests:"
-
-       /sbin/modprobe -q -r memory-notifier-error-inject
-       /sbin/modprobe -q memory-notifier-error-inject priority=$priority
-
-       if [ ! -d "$DEBUGFS" ]; then
-               echo $msg debugfs is not mounted >&2
-               exit 0
-       fi
-
-       if [ ! -d $NOTIFIER_ERR_INJECT_DIR ]; then
-               echo $msg memory-notifier-error-inject module is not available >&2
-               exit 0
-       fi
-}
-
-prerequisite_extra
-
-#
-# Offline $ratio percent of hot-pluggable memory
-#
-echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error
-for memory in `hotpluggable_online_memory`; do
-       if [ $((RANDOM % 100)) -lt $ratio ]; then
-               offline_memory_expect_success $memory
-       fi
-done
-
-#
-# Test memory hot-add error handling (offline => online)
-#
-echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error
-for memory in `hotplaggable_offline_memory`; do
-       online_memory_expect_fail $memory
-done
-
-#
-# Online all hot-pluggable memory
-#
-echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error
-for memory in `hotplaggable_offline_memory`; do
-       online_memory_expect_success $memory
-done
-
-#
-# Test memory hot-remove error handling (online => offline)
-#
-echo $error > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error
-for memory in `hotpluggable_online_memory`; do
-       offline_memory_expect_fail $memory
-done
-
-echo 0 > $NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error
-/sbin/modprobe -q -r memory-notifier-error-inject
diff --git a/tools/testing/selftests/mount/.gitignore b/tools/testing/selftests/mount/.gitignore
new file mode 100644 (file)
index 0000000..856ad41
--- /dev/null
@@ -0,0 +1 @@
+unprivileged-remount-test
index 337d853c2b72e8f434e1ed94e4e0f53fac49bbde..95580a97326e166ce2cd9f267055bbc2cdbc2355 100644 (file)
@@ -1,17 +1,16 @@
 # Makefile for mount selftests.
-
+CFLAGS = -Wall \
+         -O2
 all: unprivileged-remount-test
 
 unprivileged-remount-test: unprivileged-remount-test.c
-       gcc -Wall -O2 unprivileged-remount-test.c -o unprivileged-remount-test
+       $(CC) $(CFLAGS) unprivileged-remount-test.c -o unprivileged-remount-test
 
-# Allow specific tests to be selected.
-test_unprivileged_remount: unprivileged-remount-test
-       @if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi
+include ../lib.mk
 
-run_tests: all test_unprivileged_remount
+TEST_PROGS := unprivileged-remount-test
+override RUN_TESTS := if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi
+override EMIT_TESTS := echo "$(RUN_TESTS)"
 
 clean:
        rm -f unprivileged-remount-test
-
-.PHONY: all test_unprivileged_remount
index 8056e2e68fa4cd56d27e61ba1cab7cc54cf514f9..0e3b41eb85cde2cd553bda36ac0b09f43553ceb8 100644 (file)
@@ -1,10 +1,22 @@
+CFLAGS = -O2
+
 all:
-       gcc -O2 mq_open_tests.c -o mq_open_tests -lrt
-       gcc -O2 -o mq_perf_tests mq_perf_tests.c -lrt -lpthread -lpopt
+       $(CC) $(CFLAGS) mq_open_tests.c -o mq_open_tests -lrt
+       $(CC) $(CFLAGS) -o mq_perf_tests mq_perf_tests.c -lrt -lpthread -lpopt
+
+include ../lib.mk
+
+override define RUN_TESTS
+       @./mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]"
+       @./mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]"
+endef
+
+TEST_PROGS := mq_open_tests mq_perf_tests
 
-run_tests:
-       @./mq_open_tests /test1 || echo "mq_open_tests: [FAIL]"
-       @./mq_perf_tests || echo "mq_perf_tests: [FAIL]"
+override define EMIT_TESTS
+       echo "./mq_open_tests /test1 || echo \"selftests: mq_open_tests [FAIL]\""
+       echo "./mq_perf_tests || echo \"selftests: mq_perf_tests [FAIL]\""
+endef
 
 clean:
        rm -f mq_open_tests mq_perf_tests
index 62f22cc9941ce7595d8ff7a085b339adc044eb0e..fac4782c51d8439e524a3675bd6acc5d2df79c21 100644 (file)
@@ -1,6 +1,5 @@
 # Makefile for net selftests
 
-CC = $(CROSS_COMPILE)gcc
 CFLAGS = -Wall -O2 -g
 
 CFLAGS += -I../../../../usr/include/
@@ -11,9 +10,10 @@ all: $(NET_PROGS)
 %: %.c
        $(CC) $(CFLAGS) -o $@ $^
 
-run_tests: all
-       @/bin/sh ./run_netsocktests || echo "sockettests: [FAIL]"
-       @/bin/sh ./run_afpackettests || echo "afpackettests: [FAIL]"
-       ./test_bpf.sh
+TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh
+TEST_FILES := $(NET_PROGS)
+
+include ../lib.mk
+
 clean:
        $(RM) $(NET_PROGS)
old mode 100644 (file)
new mode 100755 (executable)
old mode 100644 (file)
new mode 100755 (executable)
index 1d5e7ad2c46008590f9b18cd54ca330cf8a81745..2958fe9a74e97b2c86ce1c8e6072f49291cca257 100644 (file)
@@ -8,10 +8,9 @@ ifeq ($(ARCH),powerpc)
 
 GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown")
 
-CC := $(CROSS_COMPILE)$(CC)
 CFLAGS := -Wall -O2 -flto -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) $(CFLAGS)
 
-export CC CFLAGS
+export CFLAGS
 
 TARGETS = pmu copyloops mm tm primitives stringloops
 
@@ -22,10 +21,25 @@ all: $(TARGETS)
 $(TARGETS):
        $(MAKE) -k -C $@ all
 
-run_tests: all
+include ../lib.mk
+
+override define RUN_TESTS
        @for TARGET in $(TARGETS); do \
                $(MAKE) -C $$TARGET run_tests; \
        done;
+endef
+
+override define INSTALL_RULE
+       @for TARGET in $(TARGETS); do \
+               $(MAKE) -C $$TARGET install; \
+       done;
+endef
+
+override define EMIT_TESTS
+       @for TARGET in $(TARGETS); do \
+               $(MAKE) -s -C $$TARGET emit_tests; \
+       done;
+endef
 
 clean:
        @for TARGET in $(TARGETS); do \
@@ -36,4 +50,4 @@ clean:
 tags:
        find . -name '*.c' -o -name '*.h' | xargs ctags
 
-.PHONY: all run_tests clean tags $(TARGETS)
+.PHONY: tags $(TARGETS)
index 6f2d3be227f9909622876c6215bcf534b88a455c..c05023514ce8a019a33c8af97516ad1d2be015ad 100644 (file)
@@ -6,24 +6,19 @@ CFLAGS += -D SELFTEST
 # Use our CFLAGS for the implicit .S rule
 ASFLAGS = $(CFLAGS)
 
-PROGS := copyuser_64 copyuser_power7 memcpy_64 memcpy_power7
+TEST_PROGS := copyuser_64 copyuser_power7 memcpy_64 memcpy_power7
 EXTRA_SOURCES := validate.c ../harness.c
 
-all: $(PROGS)
+all: $(TEST_PROGS)
 
 copyuser_64:     CPPFLAGS += -D COPY_LOOP=test___copy_tofrom_user_base
 copyuser_power7: CPPFLAGS += -D COPY_LOOP=test___copy_tofrom_user_power7
 memcpy_64:       CPPFLAGS += -D COPY_LOOP=test_memcpy
 memcpy_power7:   CPPFLAGS += -D COPY_LOOP=test_memcpy_power7
 
-$(PROGS): $(EXTRA_SOURCES)
+$(TEST_PROGS): $(EXTRA_SOURCES)
 
-run_tests: all
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../lib.mk
 
 clean:
-       rm -f $(PROGS) *.o
-
-.PHONY: all run_tests clean
+       rm -f $(TEST_PROGS) *.o
index a14c538dd7f8d6068fe80bc4427a8e72e0cb94e6..41cc3ed66818bffa72d3bfffcbdecb44123c6ffc 100644 (file)
@@ -1,21 +1,16 @@
 noarg:
        $(MAKE) -C ../
 
-PROGS := hugetlb_vs_thp_test subpage_prot
+TEST_PROGS := hugetlb_vs_thp_test subpage_prot
 
-all: $(PROGS) tempfile
+all: $(TEST_PROGS) tempfile
 
-$(PROGS): ../harness.c
+$(TEST_PROGS): ../harness.c
 
-run_tests: all
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../lib.mk
 
 tempfile:
        dd if=/dev/zero of=tempfile bs=64k count=1
 
 clean:
-       rm -f $(PROGS) tempfile
-
-.PHONY: all run_tests clean
+       rm -f $(TEST_PROGS) tempfile
index c9f4263906a5e67200284006fcae44bec8628065..5a161175bbd4197e907dbeb68f7c5f7003ecdf76 100644 (file)
@@ -1,38 +1,42 @@
 noarg:
        $(MAKE) -C ../
 
-PROGS := count_instructions l3_bank_test per_event_excludes
+TEST_PROGS := count_instructions l3_bank_test per_event_excludes
 EXTRA_SOURCES := ../harness.c event.c lib.c
 
-SUB_TARGETS = ebb
+all: $(TEST_PROGS) ebb
 
-all: $(PROGS) $(SUB_TARGETS)
-
-$(PROGS): $(EXTRA_SOURCES)
+$(TEST_PROGS): $(EXTRA_SOURCES)
 
 # loop.S can only be built 64-bit
 count_instructions: loop.S count_instructions.c $(EXTRA_SOURCES)
        $(CC) $(CFLAGS) -m64 -o $@ $^
 
-run_tests: all sub_run_tests
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../lib.mk
 
-clean: sub_clean
-       rm -f $(PROGS) loop.o
+DEFAULT_RUN_TESTS := $(RUN_TESTS)
+override define RUN_TESTS
+       $(DEFAULT_RUN_TESTS)
+       $(MAKE) -C ebb run_tests
+endef
 
-$(SUB_TARGETS):
-       $(MAKE) -k -C $@ all
+DEFAULT_EMIT_TESTS := $(EMIT_TESTS)
+override define EMIT_TESTS
+       $(DEFAULT_EMIT_TESTS)
+       $(MAKE) -s -C ebb emit_tests
+endef
 
-sub_run_tests: all
-       @for TARGET in $(SUB_TARGETS); do \
-               $(MAKE) -C $$TARGET run_tests; \
-       done;
+DEFAULT_INSTALL_RULE := $(INSTALL_RULE)
+override define INSTALL_RULE
+       $(DEFAULT_INSTALL_RULE)
+       $(MAKE) -C ebb install
+endef
 
-sub_clean:
-       @for TARGET in $(SUB_TARGETS); do \
-               $(MAKE) -C $$TARGET clean; \
-       done;
+clean:
+       rm -f $(TEST_PROGS) loop.o
+       $(MAKE) -C ebb clean
+
+ebb:
+       $(MAKE) -k -C $@ all
 
-.PHONY: all run_tests clean sub_run_tests sub_clean $(SUB_TARGETS)
+.PHONY: all run_tests clean ebb
index 3dc4332698cb4f57c7160b22fe3c5fa740be6e79..5cdc9dbf2b279c95cd3f9603759b63bc5c0cfee5 100644 (file)
@@ -4,7 +4,7 @@ noarg:
 # The EBB handler is 64-bit code and everything links against it
 CFLAGS += -m64
 
-PROGS := reg_access_test event_attributes_test cycles_test     \
+TEST_PROGS := reg_access_test event_attributes_test cycles_test        \
         cycles_with_freeze_test pmc56_overflow_test            \
         ebb_vs_cpu_event_test cpu_event_vs_ebb_test            \
         cpu_event_pinned_vs_ebb_test task_event_vs_ebb_test    \
@@ -16,18 +16,15 @@ PROGS := reg_access_test event_attributes_test cycles_test  \
         lost_exception_test no_handler_test                    \
         cycles_with_mmcr2_test
 
-all: $(PROGS)
+all: $(TEST_PROGS)
 
-$(PROGS): ../../harness.c ../event.c ../lib.c ebb.c ebb_handler.S trace.c busy_loop.S
+$(TEST_PROGS): ../../harness.c ../event.c ../lib.c ebb.c ebb_handler.S trace.c busy_loop.S
 
 instruction_count_test: ../loop.S
 
 lost_exception_test: ../lib.c
 
-run_tests: all
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../../lib.mk
 
 clean:
-       rm -f $(PROGS)
+       rm -f $(TEST_PROGS)
index ea737ca01732b2726fa7d149ce40d85c28fb7ccb..b68c6221d3d1bdad82a50356fa5c73083611d948 100644 (file)
@@ -1,17 +1,12 @@
 CFLAGS += -I$(CURDIR)
 
-PROGS := load_unaligned_zeropad
+TEST_PROGS := load_unaligned_zeropad
 
-all: $(PROGS)
+all: $(TEST_PROGS)
 
-$(PROGS): ../harness.c
+$(TEST_PROGS): ../harness.c
 
-run_tests: all
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../lib.mk
 
 clean:
-       rm -f $(PROGS) *.o
-
-.PHONY: all run_tests clean
+       rm -f $(TEST_PROGS) *.o
index 506d7734647764cd077a669678db0eb9626ae23f..2a728f4d2873de636379c277feae38f004c60934 100644 (file)
@@ -2,19 +2,14 @@
 CFLAGS += -m64
 CFLAGS += -I$(CURDIR)
 
-PROGS := memcmp
+TEST_PROGS := memcmp
 EXTRA_SOURCES := memcmp_64.S ../harness.c
 
-all: $(PROGS)
+all: $(TEST_PROGS)
 
-$(PROGS): $(EXTRA_SOURCES)
+$(TEST_PROGS): $(EXTRA_SOURCES)
 
-run_tests: all
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../lib.mk
 
 clean:
-       rm -f $(PROGS) *.o
-
-.PHONY: all run_tests clean
+       rm -f $(TEST_PROGS) *.o
index 2cede239a074dd110aa9ff3a6119b55f9c9d4ff6..34f2ec634b40ac2c8c3a1d2fac06c0257c8e7181 100644 (file)
@@ -1,15 +1,10 @@
-PROGS := tm-resched-dscr
+TEST_PROGS := tm-resched-dscr
 
-all: $(PROGS)
+all: $(TEST_PROGS)
 
-$(PROGS): ../harness.c
+$(TEST_PROGS): ../harness.c
 
-run_tests: all
-       @-for PROG in $(PROGS); do \
-               ./$$PROG; \
-       done;
+include ../../lib.mk
 
 clean:
-       rm -f $(PROGS) *.o
-
-.PHONY: all run_tests clean
+       rm -f $(TEST_PROGS) *.o
index 47ae2d385ce864ad85a7a070f29bd22c34fe3e5b..453927fea90cae7b65005fb661bf7b8254686d32 100644 (file)
@@ -6,5 +6,6 @@ all: peeksiginfo
 clean:
        rm -f peeksiginfo
 
-run_tests: all
-       @./peeksiginfo || echo "peeksiginfo selftests: [FAIL]"
+TEST_PROGS := peeksiginfo
+
+include ../lib.mk
index 04dc25e4fa92456597dc9a14862ab4192d84410d..bbd0b5398b613c08ee77eb4fbf9740383d874c0d 100644 (file)
@@ -1,12 +1,11 @@
-CC = $(CROSS_COMPILE)gcc
-
 all: get_size
 
 get_size: get_size.c
        $(CC) -static -ffreestanding -nostartfiles -s $< -o $@
 
-run_tests: all
-       ./get_size
+TEST_PROGS := get_size
+
+include ../lib.mk
 
 clean:
        $(RM) get_size
index 0a92adaf0865510f68d07914bd299c777370fddb..b3c33e071f10069ad01d0029c5d7c0350e413a18 100644 (file)
@@ -4,16 +4,10 @@
 # No binaries, but make sure arg-less "make" doesn't trigger "run_tests".
 all:
 
-# Allow specific tests to be selected.
-test_num:
-       @/bin/sh ./run_numerictests
+TEST_PROGS := run_numerictests run_stringtests
+TEST_FILES := common_tests
 
-test_string:
-       @/bin/sh ./run_stringtests
-
-run_tests: all test_num test_string
+include ../lib.mk
 
 # Nothing to clean up.
 clean:
-
-.PHONY: all run_tests clean test_num test_string
old mode 100644 (file)
new mode 100755 (executable)
old mode 100644 (file)
new mode 100755 (executable)
index eb2859f4ad2113576831e5f8ee420cf514f5194d..89a3f44bf355d65f72ad279c298920d4c0dec260 100644 (file)
@@ -1,8 +1,36 @@
-all:
-       gcc posix_timers.c -o posix_timers -lrt
+CC = $(CROSS_COMPILE)gcc
+BUILD_FLAGS = -DKTEST
+CFLAGS += -O3 -Wl,-no-as-needed -Wall $(BUILD_FLAGS)
+LDFLAGS += -lrt -lpthread
 
-run_tests: all
-       ./posix_timers
+# these are all "safe" tests that don't modify
+# system time or require escalated privileges
+TEST_PROGS = posix_timers nanosleep nsleep-lat set-timer-lat mqueue-lat \
+            inconsistency-check raw_skew threadtest rtctest
+
+TEST_PROGS_EXTENDED = alarmtimer-suspend valid-adjtimex change_skew \
+                     skew_consistency clocksource-switch leap-a-day \
+                     leapcrash set-tai set-2038
+
+bins = $(TEST_PROGS) $(TEST_PROGS_EXTENDED)
+
+all: ${bins}
+
+include ../lib.mk
+
+# these tests require escalated privileges
+# and may modify the system time or trigger
+# other behavior like suspend
+run_destructive_tests: run_tests
+       ./alarmtimer-suspend
+       ./valid-adjtimex
+       ./change_skew
+       ./skew_consistency
+       ./clocksource-switch
+       ./leap-a-day -s -i 10
+       ./leapcrash
+       ./set-tai
+       ./set-2038
 
 clean:
-       rm -f ./posix_timers
+       rm -f ${bins}
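A sketch of how the reworked timers Makefile is meant to be driven (the destructive targets may set the system time, change clock skew or suspend the machine, so run them only on a disposable box):

    cd tools/testing/selftests/timers
    make                        # builds both TEST_PROGS and TEST_PROGS_EXTENDED
    make run_tests              # safe tests only
    make run_destructive_tests  # safe tests first, then the privileged/destructive ones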
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
new file mode 100644 (file)
index 0000000..aaffbde
--- /dev/null
@@ -0,0 +1,185 @@
+/* alarmtimer suspend test
+ *             John Stultz (john.stultz@linaro.org)
+ *              (C) Copyright Linaro 2013
+ *              Licensed under the GPLv2
+ *
+ *   This test makes sure the alarmtimer & RTC wakeup code is
+ *   functioning.
+ *
+ *  To build:
+ *     $ gcc alarmtimer-suspend.c -o alarmtimer-suspend -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <pthread.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define CLOCK_REALTIME                 0
+#define CLOCK_MONOTONIC                        1
+#define CLOCK_PROCESS_CPUTIME_ID       2
+#define CLOCK_THREAD_CPUTIME_ID                3
+#define CLOCK_MONOTONIC_RAW            4
+#define CLOCK_REALTIME_COARSE          5
+#define CLOCK_MONOTONIC_COARSE         6
+#define CLOCK_BOOTTIME                 7
+#define CLOCK_REALTIME_ALARM           8
+#define CLOCK_BOOTTIME_ALARM           9
+#define CLOCK_HWSPECIFIC               10
+#define CLOCK_TAI                      11
+#define NR_CLOCKIDS                    12
+
+
+#define NSEC_PER_SEC 1000000000ULL
+#define UNREASONABLE_LAT (NSEC_PER_SEC * 4) /* hopefully we resume in 4secs */
+
+#define SUSPEND_SECS 15
+int alarmcount;
+int alarm_clock_id;
+struct timespec start_time;
+
+
+char *clockstring(int clockid)
+{
+       switch (clockid) {
+       case CLOCK_REALTIME:
+               return "CLOCK_REALTIME";
+       case CLOCK_MONOTONIC:
+               return "CLOCK_MONOTONIC";
+       case CLOCK_PROCESS_CPUTIME_ID:
+               return "CLOCK_PROCESS_CPUTIME_ID";
+       case CLOCK_THREAD_CPUTIME_ID:
+               return "CLOCK_THREAD_CPUTIME_ID";
+       case CLOCK_MONOTONIC_RAW:
+               return "CLOCK_MONOTONIC_RAW";
+       case CLOCK_REALTIME_COARSE:
+               return "CLOCK_REALTIME_COARSE";
+       case CLOCK_MONOTONIC_COARSE:
+               return "CLOCK_MONOTONIC_COARSE";
+       case CLOCK_BOOTTIME:
+               return "CLOCK_BOOTTIME";
+       case CLOCK_REALTIME_ALARM:
+               return "CLOCK_REALTIME_ALARM";
+       case CLOCK_BOOTTIME_ALARM:
+               return "CLOCK_BOOTTIME_ALARM";
+       case CLOCK_TAI:
+               return "CLOCK_TAI";
+       };
+       return "UNKNOWN_CLOCKID";
+}
+
+
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+       long long ret = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+
+       ret -= NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+       return ret;
+}
+
+int final_ret = 0;
+
+void sigalarm(int signo)
+{
+       long long delta_ns;
+       struct timespec ts;
+
+       clock_gettime(alarm_clock_id, &ts);
+       alarmcount++;
+
+       delta_ns = timespec_sub(start_time, ts);
+       delta_ns -= NSEC_PER_SEC * SUSPEND_SECS * alarmcount;
+
+       printf("ALARM(%i): %ld:%ld latency: %lld ns ", alarmcount, ts.tv_sec,
+                                                       ts.tv_nsec, delta_ns);
+
+       if (delta_ns > UNREASONABLE_LAT) {
+               printf("[FAIL]\n");
+               final_ret = -1;
+       } else
+               printf("[OK]\n");
+
+}
+
+int main(void)
+{
+       timer_t tm1;
+       struct itimerspec its1, its2;
+       struct sigevent se;
+       struct sigaction act;
+       int signum = SIGRTMAX;
+
+       /* Set up signal handler: */
+       sigfillset(&act.sa_mask);
+       act.sa_flags = 0;
+       act.sa_handler = sigalarm;
+       sigaction(signum, &act, NULL);
+
+       /* Set up timer: */
+       memset(&se, 0, sizeof(se));
+       se.sigev_notify = SIGEV_SIGNAL;
+       se.sigev_signo = signum;
+       se.sigev_value.sival_int = 0;
+
+       for (alarm_clock_id = CLOCK_REALTIME_ALARM;
+                       alarm_clock_id <= CLOCK_BOOTTIME_ALARM;
+                       alarm_clock_id++) {
+
+               alarmcount = 0;
+               timer_create(alarm_clock_id, &se, &tm1);
+
+               clock_gettime(alarm_clock_id, &start_time);
+               printf("Start time (%s): %ld:%ld\n", clockstring(alarm_clock_id),
+                               start_time.tv_sec, start_time.tv_nsec);
+               printf("Setting alarm for every %i seconds\n", SUSPEND_SECS);
+               its1.it_value = start_time;
+               its1.it_value.tv_sec += SUSPEND_SECS;
+               its1.it_interval.tv_sec = SUSPEND_SECS;
+               its1.it_interval.tv_nsec = 0;
+
+               timer_settime(tm1, TIMER_ABSTIME, &its1, &its2);
+
+               while (alarmcount < 5)
+                       sleep(1); /* First 5 alarms, do nothing */
+
+               printf("Starting suspend loops\n");
+               while (alarmcount < 10) {
+                       int ret;
+
+                       sleep(1);
+                       ret = system("echo mem > /sys/power/state");
+                       if (ret)
+                               break;
+               }
+               timer_delete(tm1);
+       }
+       if (final_ret)
+               return ksft_exit_fail();
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c
new file mode 100644 (file)
index 0000000..cb19689
--- /dev/null
@@ -0,0 +1,107 @@
+/* ADJ_FREQ Skew change test
+ *             by: john stultz (johnstul@us.ibm.com)
+ *             (C) Copyright IBM 2012
+ *             Licensed under the GPLv2
+ *
+ *  NOTE: This is a meta-test which cranks the ADJ_FREQ knob and
+ *  then uses other tests to detect problems. Thus this test requires
+ *  that the raw_skew, inconsistency-check and nanosleep tests be
+ *  present in the same directory it is run from.
+ *
+ *  To build:
+ *     $ gcc change_skew.c -o change_skew -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000LL
+
+
+int change_skew_test(int ppm)
+{
+       struct timex tx;
+       int ret;
+
+       tx.modes = ADJ_FREQUENCY;
+       tx.freq = ppm << 16;
+
+       ret = adjtimex(&tx);
+       if (ret < 0) {
+               printf("Error adjusting freq\n");
+               return ret;
+       }
+
+       ret = system("./raw_skew");
+       ret |= system("./inconsistency-check");
+       ret |= system("./nanosleep");
+
+       return ret;
+}
+
+
+int main(int argv, char **argc)
+{
+       struct timex tx;
+       int i, ret;
+
+       int ppm[5] = {0, 250, 500, -250, -500};
+
+       /* Kill ntpd */
+       ret = system("killall -9 ntpd");
+
+       /* Make sure there's no offset adjustment going on */
+       tx.modes = ADJ_OFFSET;
+       tx.offset = 0;
+       ret = adjtimex(&tx);
+
+       if (ret < 0) {
+               printf("Maybe you're not running as root?\n");
+               return -1;
+       }
+
+       for (i = 0; i < 5; i++) {
+               printf("Using %i ppm adjustment\n", ppm[i]);
+               ret = change_skew_test(ppm[i]);
+               if (ret)
+                       break;
+       }
+
+       /* Set things back */
+       tx.modes = ADJ_FREQUENCY;
+       tx.offset = 0;
+       adjtimex(&tx);
+
+       if (ret) {
+               printf("[FAIL]");
+               return ksft_exit_fail();
+       }
+       printf("[OK]");
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c
new file mode 100644 (file)
index 0000000..627ec74
--- /dev/null
@@ -0,0 +1,179 @@
+/* Clocksource change test
+ *             by: john stultz (johnstul@us.ibm.com)
+ *             (C) Copyright IBM 2012
+ *             Licensed under the GPLv2
+ *
+ *  NOTE: This is a meta-test which quickly changes the clocksource and
+ *  then uses other tests to detect problems. Thus this test requires
+ *  that the inconsistency-check and nanosleep tests be present in the
+ *  same directory it is run from.
+ *
+ *  To build:
+ *     $ gcc clocksource-switch.c -o clocksource-switch -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/wait.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+
+int get_clocksources(char list[][30])
+{
+       int fd, i;
+       size_t size;
+       char buf[512];
+       char *head, *tmp;
+
+       fd = open("/sys/devices/system/clocksource/clocksource0/available_clocksource", O_RDONLY);
+
+       size = read(fd, buf, 512);
+
+       close(fd);
+
+       for (i = 0; i < 30; i++)
+               list[i][0] = '\0';
+
+       head = buf;
+       i = 0;
+       while (head - buf < size) {
+               /* Find the next space */
+               for (tmp = head; *tmp != ' '; tmp++) {
+                       if (*tmp == '\n')
+                               break;
+                       if (*tmp == '\0')
+                               break;
+               }
+               *tmp = '\0';
+               strcpy(list[i], head);
+               head = tmp + 1;
+               i++;
+       }
+
+       return i-1;
+}
+
+int get_cur_clocksource(char *buf, size_t size)
+{
+       int fd;
+
+       fd = open("/sys/devices/system/clocksource/clocksource0/current_clocksource", O_RDONLY);
+
+       size = read(fd, buf, size);
+
+       return 0;
+}
+
+int change_clocksource(char *clocksource)
+{
+       int fd;
+       size_t size;
+
+       fd = open("/sys/devices/system/clocksource/clocksource0/current_clocksource", O_WRONLY);
+
+       if (fd < 0)
+               return -1;
+
+       size = write(fd, clocksource, strlen(clocksource));
+
+       if (size < 0)
+               return -1;
+
+       close(fd);
+       return 0;
+}
+
+
+int run_tests(int secs)
+{
+       int ret;
+       char buf[255];
+
+       sprintf(buf, "./inconsistency-check -t %i", secs);
+       ret = system(buf);
+       if (ret)
+               return ret;
+       ret = system("./nanosleep");
+       return ret;
+}
+
+
+char clocksource_list[10][30];
+
+int main(int argv, char **argc)
+{
+       char orig_clk[512];
+       int count, i, status;
+       pid_t pid;
+
+       get_cur_clocksource(orig_clk, 512);
+
+       count = get_clocksources(clocksource_list);
+
+       if (change_clocksource(clocksource_list[0])) {
+               printf("Error: You probably need to run this as root\n");
+               return -1;
+       }
+
+       /* Check everything is sane before we start switching asynchronously */
+       for (i = 0; i < count; i++) {
+               printf("Validating clocksource %s\n", clocksource_list[i]);
+               if (change_clocksource(clocksource_list[i])) {
+                       status = -1;
+                       goto out;
+               }
+               if (run_tests(5)) {
+                       status = -1;
+                       goto out;
+               }
+       }
+
+
+       printf("Running Asynchronous Switching Tests...\n");
+       pid = fork();
+       if (!pid)
+               return run_tests(60);
+
+       while (pid != waitpid(pid, &status, WNOHANG))
+               for (i = 0; i < count; i++)
+                       if (change_clocksource(clocksource_list[i])) {
+                               status = -1;
+                               goto out;
+                       }
+out:
+       change_clocksource(orig_clk);
+
+       if (status)
+               return ksft_exit_fail();
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c
new file mode 100644 (file)
index 0000000..caf1bc9
--- /dev/null
@@ -0,0 +1,204 @@
+/* Time inconsistency check test
+ *             by: john stultz (johnstul@us.ibm.com)
+ *             (C) Copyright IBM 2003, 2004, 2005, 2012
+ *             (C) Copyright Linaro Limited 2015
+ *             Licensed under the GPLv2
+ *
+ *  To build:
+ *     $ gcc inconsistency-check.c -o inconsistency-check -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define CALLS_PER_LOOP 64
+#define NSEC_PER_SEC 1000000000ULL
+
+#define CLOCK_REALTIME                 0
+#define CLOCK_MONOTONIC                        1
+#define CLOCK_PROCESS_CPUTIME_ID       2
+#define CLOCK_THREAD_CPUTIME_ID                3
+#define CLOCK_MONOTONIC_RAW            4
+#define CLOCK_REALTIME_COARSE          5
+#define CLOCK_MONOTONIC_COARSE         6
+#define CLOCK_BOOTTIME                 7
+#define CLOCK_REALTIME_ALARM           8
+#define CLOCK_BOOTTIME_ALARM           9
+#define CLOCK_HWSPECIFIC               10
+#define CLOCK_TAI                      11
+#define NR_CLOCKIDS                    12
+
+char *clockstring(int clockid)
+{
+       switch (clockid) {
+       case CLOCK_REALTIME:
+               return "CLOCK_REALTIME";
+       case CLOCK_MONOTONIC:
+               return "CLOCK_MONOTONIC";
+       case CLOCK_PROCESS_CPUTIME_ID:
+               return "CLOCK_PROCESS_CPUTIME_ID";
+       case CLOCK_THREAD_CPUTIME_ID:
+               return "CLOCK_THREAD_CPUTIME_ID";
+       case CLOCK_MONOTONIC_RAW:
+               return "CLOCK_MONOTONIC_RAW";
+       case CLOCK_REALTIME_COARSE:
+               return "CLOCK_REALTIME_COARSE";
+       case CLOCK_MONOTONIC_COARSE:
+               return "CLOCK_MONOTONIC_COARSE";
+       case CLOCK_BOOTTIME:
+               return "CLOCK_BOOTTIME";
+       case CLOCK_REALTIME_ALARM:
+               return "CLOCK_REALTIME_ALARM";
+       case CLOCK_BOOTTIME_ALARM:
+               return "CLOCK_BOOTTIME_ALARM";
+       case CLOCK_TAI:
+               return "CLOCK_TAI";
+       };
+       return "UNKNOWN_CLOCKID";
+}
+
+/* returns 1 if a <= b, 0 otherwise */
+static inline int in_order(struct timespec a, struct timespec b)
+{
+       /* use unsigned to avoid false positives on 2038 rollover */
+       if ((unsigned long)a.tv_sec < (unsigned long)b.tv_sec)
+               return 1;
+       if ((unsigned long)a.tv_sec > (unsigned long)b.tv_sec)
+               return 0;
+       if (a.tv_nsec > b.tv_nsec)
+               return 0;
+       return 1;
+}
+
+
+
+int consistency_test(int clock_type, unsigned long seconds)
+{
+       struct timespec list[CALLS_PER_LOOP];
+       int i, inconsistent;
+       long now, then;
+       time_t t;
+       char *start_str;
+
+       clock_gettime(clock_type, &list[0]);
+       now = then = list[0].tv_sec;
+
+       /* timestamp start of test */
+       t = time(0);
+       start_str = ctime(&t);
+
+       while (seconds == -1 || now - then < seconds) {
+               inconsistent = 0;
+
+               /* Fill list */
+               for (i = 0; i < CALLS_PER_LOOP; i++)
+                       clock_gettime(clock_type, &list[i]);
+
+               /* Check for inconsistencies */
+               for (i = 0; i < CALLS_PER_LOOP - 1; i++)
+                       if (!in_order(list[i], list[i+1]))
+                               inconsistent = i;
+
+               /* display inconsistency */
+               if (inconsistent) {
+                       unsigned long long delta;
+
+                       printf("%s\n", start_str);
+                       for (i = 0; i < CALLS_PER_LOOP; i++) {
+                               if (i == inconsistent)
+                                       printf("--------------------\n");
+                               printf("%lu:%lu\n", list[i].tv_sec,
+                                                       list[i].tv_nsec);
+                               if (i == inconsistent + 1)
+                                       printf("--------------------\n");
+                       }
+                       delta = list[inconsistent].tv_sec * NSEC_PER_SEC;
+                       delta += list[inconsistent].tv_nsec;
+                       delta -= list[inconsistent+1].tv_sec * NSEC_PER_SEC;
+                       delta -= list[inconsistent+1].tv_nsec;
+                       printf("Delta: %llu ns\n", delta);
+                       fflush(0);
+                       /* timestamp inconsistency*/
+                       t = time(0);
+                       printf("%s\n", ctime(&t));
+                       printf("[FAILED]\n");
+                       return -1;
+               }
+               now = list[0].tv_sec;
+       }
+       printf("[OK]\n");
+       return 0;
+}
+
+
+int main(int argc, char *argv[])
+{
+       int clockid, opt;
+       int userclock = CLOCK_REALTIME;
+       int maxclocks = NR_CLOCKIDS;
+       int runtime = 10;
+       struct timespec ts;
+
+       /* Process arguments */
+       while ((opt = getopt(argc, argv, "t:c:")) != -1) {
+               switch (opt) {
+               case 't':
+                       runtime = atoi(optarg);
+                       break;
+               case 'c':
+                       userclock = atoi(optarg);
+                       maxclocks = userclock + 1;
+                       break;
+               default:
+                       printf("Usage: %s [-t <secs>] [-c <clockid>]\n", argv[0]);
+                       printf("        -t: Number of seconds to run\n");
+                       printf("        -c: clockid to use (default, all clockids)\n");
+                       exit(-1);
+               }
+       }
+
+       setbuf(stdout, NULL);
+
+       for (clockid = userclock; clockid < maxclocks; clockid++) {
+
+               if (clockid == CLOCK_HWSPECIFIC)
+                       continue;
+
+               if (!clock_gettime(clockid, &ts)) {
+                       printf("Consistent %-30s ", clockstring(clockid));
+                       if (consistency_test(clockid, runtime))
+                               return ksft_exit_fail();
+               }
+       }
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c
new file mode 100644 (file)
index 0000000..b8272e6
--- /dev/null
@@ -0,0 +1,319 @@
+/* Leap second stress test
+ *              by: John Stultz (john.stultz@linaro.org)
+ *              (C) Copyright IBM 2012
+ *              (C) Copyright 2013, 2015 Linaro Limited
+ *              Licensed under the GPLv2
+ *
+ *  This test signals the kernel to insert a leap second
+ *  every day at midnight GMT. This allows for stressing the
+ *  kernel's leap-second behavior, as well as how well applications
+ *  handle the leap-second discontinuity.
+ *
+ *  Usage: leap-a-day [-s] [-i <num>]
+ *
+ *  Options:
+ *     -s:     Each iteration, set the date to 10 seconds before midnight GMT.
+ *             This speeds up the number of leapsecond transitions tested,
+ *             but because it calls settimeofday frequently, advancing the
+ *             time by 24 hours every ~16 seconds, it may cause application
+ *             disruption.
+ *
+ *     -i:     Number of iterations to run (default: infinite)
+ *
+ *  Other notes: Disabling NTP prior to running this is advised, as the two
+ *              may conflict in their commands to the kernel.
+ *
+ *  To build:
+ *     $ gcc leap-a-day.c -o leap-a-day -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000ULL
+#define CLOCK_TAI 11
+
+/* returns 1 if a <= b, 0 otherwise */
+static inline int in_order(struct timespec a, struct timespec b)
+{
+       if (a.tv_sec < b.tv_sec)
+               return 1;
+       if (a.tv_sec > b.tv_sec)
+               return 0;
+       if (a.tv_nsec > b.tv_nsec)
+               return 0;
+       return 1;
+}
+
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+       ts.tv_nsec += ns;
+       while (ts.tv_nsec >= NSEC_PER_SEC) {
+               ts.tv_nsec -= NSEC_PER_SEC;
+               ts.tv_sec++;
+       }
+       return ts;
+}
+
+char *time_state_str(int state)
+{
+       switch (state) {
+       case TIME_OK:   return "TIME_OK";
+       case TIME_INS:  return "TIME_INS";
+       case TIME_DEL:  return "TIME_DEL";
+       case TIME_OOP:  return "TIME_OOP";
+       case TIME_WAIT: return "TIME_WAIT";
+       case TIME_BAD:  return "TIME_BAD";
+       }
+       return "ERROR";
+}
+
+/* clear NTP time_status & time_state */
+int clear_time_state(void)
+{
+       struct timex tx;
+       int ret;
+
+       /*
+        * We have to call adjtime twice here, as kernels
+        * prior to 6b1859dba01c7 (included in 3.5 and
+        * -stable), had an issue with the state machine
+        * and wouldn't clear the STA_INS/DEL flag directly.
+        */
+       tx.modes = ADJ_STATUS;
+       tx.status = STA_PLL;
+       ret = adjtimex(&tx);
+
+       /* Clear maxerror, as it can cause UNSYNC to be set */
+       tx.modes = ADJ_MAXERROR;
+       tx.maxerror = 0;
+       ret = adjtimex(&tx);
+
+       /* Clear the status */
+       tx.modes = ADJ_STATUS;
+       tx.status = 0;
+       ret = adjtimex(&tx);
+
+       return ret;
+}
+
+/* Make sure we cleanup on ctrl-c */
+void handler(int unused)
+{
+       clear_time_state();
+       exit(0);
+}
+
+/* Test for known hrtimer failure */
+void test_hrtimer_failure(void)
+{
+       struct timespec now, target;
+
+       clock_gettime(CLOCK_REALTIME, &now);
+       target = timespec_add(now, NSEC_PER_SEC/2);
+       clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &target, NULL);
+       clock_gettime(CLOCK_REALTIME, &now);
+
+       if (!in_order(target, now))
+               printf("ERROR: hrtimer early expiration failure observed.\n");
+}
+
+int main(int argc, char **argv)
+{
+       int settime = 0;
+       int tai_time = 0;
+       int insert = 1;
+       int iterations = -1;
+       int opt;
+
+       /* Process arguments */
+       while ((opt = getopt(argc, argv, "sti:")) != -1) {
+               switch (opt) {
+               case 's':
+                       printf("Setting time to speed up testing\n");
+                       settime = 1;
+                       break;
+               case 'i':
+                       iterations = atoi(optarg);
+                       break;
+               case 't':
+                       tai_time = 1;
+                       break;
+               default:
+                       printf("Usage: %s [-s] [-i <iterations>]\n", argv[0]);
+                       printf("        -s: Set time to right before leap second each iteration\n");
+                       printf("        -i: Number of iterations\n");
+                       printf("        -t: Print TAI time\n");
+                       exit(-1);
+               }
+       }
+
+       /* Make sure TAI support is present if -t was used */
+       if (tai_time) {
+               struct timespec ts;
+
+               if (clock_gettime(CLOCK_TAI, &ts)) {
+                       printf("System doesn't support CLOCK_TAI\n");
+                       ksft_exit_fail();
+               }
+       }
+
+       signal(SIGINT, handler);
+       signal(SIGKILL, handler);
+
+       if (iterations < 0)
+               printf("This runs continuously. Press ctrl-c to stop\n");
+       else
+               printf("Running for %i iterations. Press ctrl-c to stop\n", iterations);
+
+       printf("\n");
+       while (1) {
+               int ret;
+               struct timespec ts;
+               struct timex tx;
+               time_t now, next_leap;
+
+               /* Get the current time */
+               clock_gettime(CLOCK_REALTIME, &ts);
+
+               /* Calculate the next possible leap second 23:59:60 GMT */
+               next_leap = ts.tv_sec;
+               next_leap += 86400 - (next_leap % 86400);
+
+               if (settime) {
+                       struct timeval tv;
+
+                       tv.tv_sec = next_leap - 10;
+                       tv.tv_usec = 0;
+                       settimeofday(&tv, NULL);
+                       printf("Setting time to %s", ctime(&tv.tv_sec));
+               }
+
+               /* Reset NTP time state */
+               clear_time_state();
+
+               /* Set the leap second insert flag */
+               tx.modes = ADJ_STATUS;
+               if (insert)
+                       tx.status = STA_INS;
+               else
+                       tx.status = STA_DEL;
+               ret = adjtimex(&tx);
+               if (ret < 0) {
+                       printf("Error: Problem setting STA_INS/STA_DEL!: %s\n",
+                                                       time_state_str(ret));
+                       return ksft_exit_fail();
+               }
+
+               /* Validate STA_INS was set */
+               tx.modes = 0;
+               ret = adjtimex(&tx);
+               if (tx.status != STA_INS && tx.status != STA_DEL) {
+                       printf("Error: STA_INS/STA_DEL not set!: %s\n",
+                                                       time_state_str(ret));
+                       return ksft_exit_fail();
+               }
+
+               if (tai_time) {
+                       printf("Using TAI time,"
+                               " no inconsistencies should be seen!\n");
+               }
+
+               printf("Scheduling leap second for %s", ctime(&next_leap));
+
+               /* Wake up 3 seconds before leap */
+               ts.tv_sec = next_leap - 3;
+               ts.tv_nsec = 0;
+
+               while (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, NULL))
+                       printf("Something woke us up, returning to sleep\n");
+
+               /* Validate STA_INS/STA_DEL is still set */
+               tx.modes = 0;
+               ret = adjtimex(&tx);
+               if (tx.status != STA_INS && tx.status != STA_DEL) {
+                       printf("Something cleared STA_INS/STA_DEL, setting it again.\n");
+                       tx.modes = ADJ_STATUS;
+                       if (insert)
+                               tx.status = STA_INS;
+                       else
+                               tx.status = STA_DEL;
+                       ret = adjtimex(&tx);
+               }
+
+               /* Check adjtimex output every half second */
+               now = tx.time.tv_sec;
+               while (now < next_leap + 2) {
+                       char buf[26];
+                       struct timespec tai;
+
+                       tx.modes = 0;
+                       ret = adjtimex(&tx);
+
+                       if (tai_time) {
+                               clock_gettime(CLOCK_TAI, &tai);
+                               printf("%ld sec, %9ld ns\t%s\n",
+                                               tai.tv_sec,
+                                               tai.tv_nsec,
+                                               time_state_str(ret));
+                       } else {
+                               ctime_r(&tx.time.tv_sec, buf);
+                               buf[strlen(buf)-1] = 0; /*remove trailing\n */
+
+                               printf("%s + %6ld us (%i)\t%s\n",
+                                               buf,
+                                               tx.time.tv_usec,
+                                               tx.tai,
+                                               time_state_str(ret));
+                       }
+                       now = tx.time.tv_sec;
+                       /* Sleep for another half second */
+                       ts.tv_sec = 0;
+                       ts.tv_nsec = NSEC_PER_SEC / 2;
+                       clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL);
+               }
+               /* Switch to using other mode */
+               insert = !insert;
+
+               /* Note if kernel has known hrtimer failure */
+               test_hrtimer_failure();
+
+               printf("Leap complete\n\n");
+
+               if ((iterations != -1) && !(--iterations))
+                       break;
+       }
+
+       clear_time_state();
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c
new file mode 100644 (file)
index 0000000..a1071bd
--- /dev/null
@@ -0,0 +1,120 @@
+/* Demo leapsecond deadlock
+ *              by: John Stultz (john.stultz@linaro.org)
+ *              (C) Copyright IBM 2012
+ *              (C) Copyright 2013, 2015 Linaro Limited
+ *              Licensed under the GPL
+ *
+ * This test demonstrates leapsecond deadlock that is possible
+ * on kernels from 2.6.26 to 3.3.
+ *
+ * WARNING: THIS WILL LIKELY HARDHANG SYSTEMS AND MAY LOSE DATA
+ * RUN AT YOUR OWN RISK!
+ *  To build:
+ *     $ gcc leapcrash.c -o leapcrash -lrt
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+
+
+/* clear NTP time_status & time_state */
+int clear_time_state(void)
+{
+       struct timex tx;
+       int ret;
+
+       /*
+        * We have to call adjtimex twice here, as kernels
+        * prior to 6b1859dba01c7 (included in 3.5 and
+        * -stable), had an issue with the state machine
+        * and wouldn't clear the STA_INS/DEL flag directly.
+        */
+       tx.modes = ADJ_STATUS;
+       tx.status = STA_PLL;
+       ret = adjtimex(&tx);
+
+       tx.modes = ADJ_STATUS;
+       tx.status = 0;
+       ret = adjtimex(&tx);
+
+       return ret;
+}
+
+/* Make sure we cleanup on ctrl-c */
+void handler(int unused)
+{
+       clear_time_state();
+       exit(0);
+}
+
+
+int main(void)
+{
+       struct timex tx;
+       struct timespec ts;
+       time_t next_leap;
+       int count = 0;
+
+       setbuf(stdout, NULL);
+
+       signal(SIGINT, handler);
+       signal(SIGKILL, handler);
+       printf("This runs for a few minutes. Press ctrl-c to stop\n");
+
+       clear_time_state();
+
+
+       /* Get the current time */
+       clock_gettime(CLOCK_REALTIME, &ts);
+
+       /* Calculate the next possible leap second 23:59:60 GMT */
+       next_leap = ts.tv_sec;
+       next_leap += 86400 - (next_leap % 86400);
+
+       for (count = 0; count < 20; count++) {
+               struct timeval tv;
+
+
+               /* set the time to 2 seconds before the leap */
+               tv.tv_sec = next_leap - 2;
+               tv.tv_usec = 0;
+               if (settimeofday(&tv, NULL)) {
+                       printf("Error: You're likely not running with proper (ie: root) permissions\n");
+                       return ksft_exit_fail();
+               }
+               tx.modes = 0;
+               adjtimex(&tx);
+
+               /* hammer on adjtime w/ STA_INS */
+               while (tx.time.tv_sec < next_leap + 1) {
+                       /* Set the leap second insert flag */
+                       tx.modes = ADJ_STATUS;
+                       tx.status = STA_INS;
+                       adjtimex(&tx);
+               }
+               clear_time_state();
+               printf(".");
+       }
+       printf("[OK]\n");
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c
new file mode 100644 (file)
index 0000000..a2a3924
--- /dev/null
@@ -0,0 +1,124 @@
+/* Measure mqueue timeout latency
+ *              by: john stultz (john.stultz@linaro.org)
+ *             (C) Copyright Linaro 2013
+ *
+ *             Inspired with permission from example test by:
+ *                     Romain Francoise <romain@orebokech.com>
+ *              Licensed under the GPLv2
+ *
+ *  To build:
+ *     $ gcc mqueue-lat.c -o mqueue-lat -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
+#include <mqueue.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000ULL
+
+#define TARGET_TIMEOUT         100000000       /* 100ms in nanoseconds */
+#define UNRESONABLE_LATENCY    40000000        /* 40ms in nanosecs */
+
+
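+/* Return the interval from a to b (b - a) in nanoseconds */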
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+       long long ret = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+
+       ret -= NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+       return ret;
+}
+
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+       ts.tv_nsec += ns;
+       while (ts.tv_nsec >= NSEC_PER_SEC) {
+               ts.tv_nsec -= NSEC_PER_SEC;
+               ts.tv_sec++;
+       }
+       return ts;
+}
+
+int mqueue_lat_test(void)
+{
+
+       mqd_t q;
+       struct mq_attr attr;
+       struct timespec start, end, now, target;
+       int i, count, ret;
+
+       q = mq_open("/foo", O_CREAT | O_RDONLY, 0666, NULL);
+       if (q < 0) {
+               perror("mq_open");
+               return -1;
+       }
+       mq_getattr(q, &attr);
+
+
+       count = 100;
+       clock_gettime(CLOCK_MONOTONIC, &start);
+
+       for (i = 0; i < count; i++) {
+               char buf[attr.mq_msgsize];
+
+               clock_gettime(CLOCK_REALTIME, &now);
+               target = now;
+               target = timespec_add(now, TARGET_TIMEOUT); /* 100ms */
+
+               ret = mq_timedreceive(q, buf, sizeof(buf), NULL, &target);
+               if (ret < 0 && errno != ETIMEDOUT) {
+                       perror("mq_timedreceive");
+                       return -1;
+               }
+       }
+       clock_gettime(CLOCK_MONOTONIC, &end);
+
+       mq_close(q);
+
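+       /* Fail if the average per-receive time exceeds the 100ms timeout plus 40ms of slack */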
+       if ((timespec_sub(start, end)/count) > TARGET_TIMEOUT + UNRESONABLE_LATENCY)
+               return -1;
+
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       int ret;
+
+       printf("Mqueue latency :                          ");
+
+       ret = mqueue_lat_test();
+       if (ret < 0) {
+               printf("[FAILED]\n");
+               return ksft_exit_fail();
+       }
+       printf("[OK]\n");
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c
new file mode 100644 (file)
index 0000000..8a3c29d
--- /dev/null
@@ -0,0 +1,174 @@
+/* Make sure timers don't return early
+ *              by: john stultz (johnstul@us.ibm.com)
+ *                 John Stultz (john.stultz@linaro.org)
+ *              (C) Copyright IBM 2012
+ *              (C) Copyright Linaro 2013 2015
+ *              Licensed under the GPLv2
+ *
+ *  To build:
+ *     $ gcc nanosleep.c -o nanosleep -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000ULL
+
+#define CLOCK_REALTIME                 0
+#define CLOCK_MONOTONIC                        1
+#define CLOCK_PROCESS_CPUTIME_ID       2
+#define CLOCK_THREAD_CPUTIME_ID                3
+#define CLOCK_MONOTONIC_RAW            4
+#define CLOCK_REALTIME_COARSE          5
+#define CLOCK_MONOTONIC_COARSE         6
+#define CLOCK_BOOTTIME                 7
+#define CLOCK_REALTIME_ALARM           8
+#define CLOCK_BOOTTIME_ALARM           9
+#define CLOCK_HWSPECIFIC               10
+#define CLOCK_TAI                      11
+#define NR_CLOCKIDS                    12
+
+#define UNSUPPORTED 0xf00f
+
+char *clockstring(int clockid)
+{
+       switch (clockid) {
+       case CLOCK_REALTIME:
+               return "CLOCK_REALTIME";
+       case CLOCK_MONOTONIC:
+               return "CLOCK_MONOTONIC";
+       case CLOCK_PROCESS_CPUTIME_ID:
+               return "CLOCK_PROCESS_CPUTIME_ID";
+       case CLOCK_THREAD_CPUTIME_ID:
+               return "CLOCK_THREAD_CPUTIME_ID";
+       case CLOCK_MONOTONIC_RAW:
+               return "CLOCK_MONOTONIC_RAW";
+       case CLOCK_REALTIME_COARSE:
+               return "CLOCK_REALTIME_COARSE";
+       case CLOCK_MONOTONIC_COARSE:
+               return "CLOCK_MONOTONIC_COARSE";
+       case CLOCK_BOOTTIME:
+               return "CLOCK_BOOTTIME";
+       case CLOCK_REALTIME_ALARM:
+               return "CLOCK_REALTIME_ALARM";
+       case CLOCK_BOOTTIME_ALARM:
+               return "CLOCK_BOOTTIME_ALARM";
+       case CLOCK_TAI:
+               return "CLOCK_TAI";
+       };
+       return "UNKNOWN_CLOCKID";
+}
+
+/* returns 1 if a <= b, 0 otherwise */
+static inline int in_order(struct timespec a, struct timespec b)
+{
+       if (a.tv_sec < b.tv_sec)
+               return 1;
+       if (a.tv_sec > b.tv_sec)
+               return 0;
+       if (a.tv_nsec > b.tv_nsec)
+               return 0;
+       return 1;
+}
+
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+       ts.tv_nsec += ns;
+       while (ts.tv_nsec >= NSEC_PER_SEC) {
+               ts.tv_nsec -= NSEC_PER_SEC;
+               ts.tv_sec++;
+       }
+       return ts;
+}
+
+int nanosleep_test(int clockid, long long ns)
+{
+       struct timespec now, target, rel;
+
+       /* First check abs time */
+       if (clock_gettime(clockid, &now))
+               return UNSUPPORTED;
+       target = timespec_add(now, ns);
+
+       if (clock_nanosleep(clockid, TIMER_ABSTIME, &target, NULL))
+               return UNSUPPORTED;
+       clock_gettime(clockid, &now);
+
+       if (!in_order(target, now))
+               return -1;
+
+       /* Second check reltime */
+       clock_gettime(clockid, &now);
+       rel.tv_sec = 0;
+       rel.tv_nsec = 0;
+       rel = timespec_add(rel, ns);
+       target = timespec_add(now, ns);
+       clock_nanosleep(clockid, 0, &rel, NULL);
+       clock_gettime(clockid, &now);
+
+       if (!in_order(target, now))
+               return -1;
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       long long length;
+       int clockid, ret;
+
+       for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) {
+
+               /* Skip cputime clockids since nanosleep won't increment cputime */
+               if (clockid == CLOCK_PROCESS_CPUTIME_ID ||
+                               clockid == CLOCK_THREAD_CPUTIME_ID ||
+                               clockid == CLOCK_HWSPECIFIC)
+                       continue;
+
+               printf("Nanosleep %-31s ", clockstring(clockid));
+
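+               /* Sweep sleep lengths from 10ns upward, multiplying by 100 each pass (up to 10s) */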
+               length = 10;
+               while (length <= (NSEC_PER_SEC * 10)) {
+                       ret = nanosleep_test(clockid, length);
+                       if (ret == UNSUPPORTED) {
+                               printf("[UNSUPPORTED]\n");
+                               goto next;
+                       }
+                       if (ret < 0) {
+                               printf("[FAILED]\n");
+                               return ksft_exit_fail();
+                       }
+                       length *= 100;
+               }
+               printf("[OK]\n");
+next:
+               ret = 0;
+       }
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/nsleep-lat.c b/tools/testing/selftests/timers/nsleep-lat.c
new file mode 100644 (file)
index 0000000..2d7898f
--- /dev/null
@@ -0,0 +1,190 @@
+/* Measure nanosleep timer latency
+ *              by: john stultz (john.stultz@linaro.org)
+ *             (C) Copyright Linaro 2013
+ *              Licensed under the GPLv2
+ *
+ *  To build:
+ *     $ gcc nsleep-lat.c -o nsleep-lat -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000ULL
+
+#define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
+
+
+#define CLOCK_REALTIME                 0
+#define CLOCK_MONOTONIC                        1
+#define CLOCK_PROCESS_CPUTIME_ID       2
+#define CLOCK_THREAD_CPUTIME_ID                3
+#define CLOCK_MONOTONIC_RAW            4
+#define CLOCK_REALTIME_COARSE          5
+#define CLOCK_MONOTONIC_COARSE         6
+#define CLOCK_BOOTTIME                 7
+#define CLOCK_REALTIME_ALARM           8
+#define CLOCK_BOOTTIME_ALARM           9
+#define CLOCK_HWSPECIFIC               10
+#define CLOCK_TAI                      11
+#define NR_CLOCKIDS                    12
+
+#define UNSUPPORTED 0xf00f
+
+char *clockstring(int clockid)
+{
+       switch (clockid) {
+       case CLOCK_REALTIME:
+               return "CLOCK_REALTIME";
+       case CLOCK_MONOTONIC:
+               return "CLOCK_MONOTONIC";
+       case CLOCK_PROCESS_CPUTIME_ID:
+               return "CLOCK_PROCESS_CPUTIME_ID";
+       case CLOCK_THREAD_CPUTIME_ID:
+               return "CLOCK_THREAD_CPUTIME_ID";
+       case CLOCK_MONOTONIC_RAW:
+               return "CLOCK_MONOTONIC_RAW";
+       case CLOCK_REALTIME_COARSE:
+               return "CLOCK_REALTIME_COARSE";
+       case CLOCK_MONOTONIC_COARSE:
+               return "CLOCK_MONOTONIC_COARSE";
+       case CLOCK_BOOTTIME:
+               return "CLOCK_BOOTTIME";
+       case CLOCK_REALTIME_ALARM:
+               return "CLOCK_REALTIME_ALARM";
+       case CLOCK_BOOTTIME_ALARM:
+               return "CLOCK_BOOTTIME_ALARM";
+       case CLOCK_TAI:
+               return "CLOCK_TAI";
+       };
+       return "UNKNOWN_CLOCKID";
+}
+
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+       ts.tv_nsec += ns;
+       while (ts.tv_nsec >= NSEC_PER_SEC) {
+               ts.tv_nsec -= NSEC_PER_SEC;
+               ts.tv_sec++;
+       }
+       return ts;
+}
+
+
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+       long long ret = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+
+       ret -= NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+       return ret;
+}
+
+int nanosleep_lat_test(int clockid, long long ns)
+{
+       struct timespec start, end, target;
+       long long latency = 0;
+       int i, count;
+
+       target.tv_sec = ns/NSEC_PER_SEC;
+       target.tv_nsec = ns%NSEC_PER_SEC;
+
+       if (clock_gettime(clockid, &start))
+               return UNSUPPORTED;
+       if (clock_nanosleep(clockid, 0, &target, NULL))
+               return UNSUPPORTED;
+
+       count = 10;
+
+       /* First check relative latency */
+       clock_gettime(clockid, &start);
+       for (i = 0; i < count; i++)
+               clock_nanosleep(clockid, 0, &target, NULL);
+       clock_gettime(clockid, &end);
+
+       if (((timespec_sub(start, end)/count)-ns) > UNRESONABLE_LATENCY) {
+               printf("Large rel latency: %lld ns :", (timespec_sub(start, end)/count)-ns);
+               return -1;
+       }
+
+       /* Next check absolute latency */
+       for (i = 0; i < count; i++) {
+               clock_gettime(clockid, &start);
+               target = timespec_add(start, ns);
+               clock_nanosleep(clockid, TIMER_ABSTIME, &target, NULL);
+               clock_gettime(clockid, &end);
+               latency += timespec_sub(target, end);
+       }
+
+       if (latency/count > UNRESONABLE_LATENCY) {
+               printf("Large abs latency: %lld ns :", latency/count);
+               return -1;
+       }
+
+       return 0;
+}
+
+
+
+int main(int argc, char **argv)
+{
+       long long length;
+       int clockid, ret;
+
+       for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) {
+
+               /* Skip cputime clockids since nanosleep won't increment cputime */
+               if (clockid == CLOCK_PROCESS_CPUTIME_ID ||
+                               clockid == CLOCK_THREAD_CPUTIME_ID ||
+                               clockid == CLOCK_HWSPECIFIC)
+                       continue;
+
+               printf("nsleep latency %-26s ", clockstring(clockid));
+
+               length = 10;
+               while (length <= (NSEC_PER_SEC * 10)) {
+                       ret = nanosleep_lat_test(clockid, length);
+                       if (ret)
+                               break;
+                       length *= 100;
+
+               }
+
+               if (ret == UNSUPPORTED) {
+                       printf("[UNSUPPORTED]\n");
+                       continue;
+               }
+               if (ret < 0) {
+                       printf("[FAILED]\n");
+                       return ksft_exit_fail();
+               }
+               printf("[OK]\n");
+       }
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index f87d970a485c48b7d10e304e911fd3699c41bb35..5a246a02dff3c6986a1bd06496473ec5be1fd252 100644 (file)
@@ -35,10 +35,11 @@ static void user_loop(void)
 static void kernel_loop(void)
 {
        void *addr = sbrk(0);
+       int err = 0;
 
-       while (!done) {
-               brk(addr + 4096);
-               brk(addr);
+       while (!done && !err) {
+               err = brk(addr + 4096);
+               err |= brk(addr);
        }
 }
 
@@ -190,8 +191,6 @@ static int check_timer_create(int which)
 
 int main(int argc, char **argv)
 {
-       int err;
-
        printf("Testing posix timers. False negative may happen on CPU execution \n");
        printf("based timers if other threads run on the CPU...\n");
 
diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c
new file mode 100644 (file)
index 0000000..30906bf
--- /dev/null
@@ -0,0 +1,154 @@
+/* CLOCK_MONOTONIC vs CLOCK_MONOTONIC_RAW skew test
+ *             by: john stultz (johnstul@us.ibm.com)
+ *                 John Stultz <john.stultz@linaro.org>
+ *             (C) Copyright IBM 2012
+ *             (C) Copyright Linaro Limited 2015
+ *             Licensed under the GPLv2
+ *
+ *  To build:
+ *     $ gcc raw_skew.c -o raw_skew -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+
+#define CLOCK_MONOTONIC_RAW            4
+#define NSEC_PER_SEC 1000000000LL
+
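+/* Arithmetic right shift that rounds toward zero for negative values */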
+#define shift_right(x, s) ({           \
+       __typeof__(x) __x = (x);        \
+       __typeof__(s) __s = (s);        \
+       __x < 0 ? -(-__x >> __s) : __x >> __s; \
+})
+
+long long llabs(long long val)
+{
+       if (val < 0)
+               val = -val;
+       return val;
+}
+
+unsigned long long ts_to_nsec(struct timespec ts)
+{
+       return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
+}
+
+struct timespec nsec_to_ts(long long ns)
+{
+       struct timespec ts;
+
+       ts.tv_sec = ns/NSEC_PER_SEC;
+       ts.tv_nsec = ns%NSEC_PER_SEC;
+       return ts;
+}
+
+long long diff_timespec(struct timespec start, struct timespec end)
+{
+       long long start_ns, end_ns;
+
+       start_ns = ts_to_nsec(start);
+       end_ns = ts_to_nsec(end);
+       return end_ns - start_ns;
+}
+
+void get_monotonic_and_raw(struct timespec *mon, struct timespec *raw)
+{
+       struct timespec start, mid, end;
+       long long diff = 0, tmp;
+       int i;
+
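+       /* Sample three times and keep the reading with the tightest CLOCK_MONOTONIC window */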
+       for (i = 0; i < 3; i++) {
+               long long newdiff;
+
+               clock_gettime(CLOCK_MONOTONIC, &start);
+               clock_gettime(CLOCK_MONOTONIC_RAW, &mid);
+               clock_gettime(CLOCK_MONOTONIC, &end);
+
+               newdiff = diff_timespec(start, end);
+               if (diff == 0 || newdiff < diff) {
+                       diff = newdiff;
+                       *raw = mid;
+                       tmp = (ts_to_nsec(start) + ts_to_nsec(end))/2;
+                       *mon = nsec_to_ts(tmp);
+               }
+       }
+}
+
+int main(int argc, char **argv)
+{
+       struct timespec mon, raw, start, end;
+       long long delta1, delta2, interval, eppm, ppm;
+       struct timex tx1, tx2;
+
+       setbuf(stdout, NULL);
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &raw)) {
+               printf("ERR: NO CLOCK_MONOTONIC_RAW\n");
+               return -1;
+       }
+
+       tx1.modes = 0;
+       adjtimex(&tx1);
+       get_monotonic_and_raw(&mon, &raw);
+       start = mon;
+       delta1 = diff_timespec(mon, raw);
+
+       if (tx1.offset)
+               printf("WARNING: ADJ_OFFSET in progress, this will cause inaccurate results\n");
+
+       printf("Estimating clock drift: ");
+       sleep(120);
+
+       get_monotonic_and_raw(&mon, &raw);
+       end = mon;
+       tx2.modes = 0;
+       adjtimex(&tx2);
+       delta2 = diff_timespec(mon, raw);
+
+       interval = diff_timespec(start, end);
+
+       /* calculate measured ppm between MONOTONIC and MONOTONIC_RAW */
+       eppm = ((delta2-delta1)*NSEC_PER_SEC)/interval;
+       eppm = -eppm;
+       printf("%lld.%i(est)", eppm/1000, abs((int)(eppm%1000)));
+
+       /* Avg the two actual freq samples adjtimex gave us */
+       ppm = (tx1.freq + tx2.freq) * 1000 / 2;
+       ppm = (long long)tx1.freq * 1000;
+       ppm = shift_right(ppm, 16);
+       printf(" %lld.%i(act)", ppm/1000, abs((int)(ppm%1000)));
+
+       if (llabs(eppm - ppm) > 1000) {
+               printf("        [FAILED]\n");
+               return ksft_exit_fail();
+       }
+       printf("        [OK]\n");
+       return  ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/rtctest.c b/tools/testing/selftests/timers/rtctest.c
new file mode 100644 (file)
index 0000000..d80ae85
--- /dev/null
@@ -0,0 +1,271 @@
+/*
+ *      Real Time Clock Driver Test/Example Program
+ *
+ *      Compile with:
+ *                  gcc -s -Wall -Wstrict-prototypes rtctest.c -o rtctest
+ *
+ *      Copyright (C) 1996, Paul Gortmaker.
+ *
+ *      Released under the GNU General Public License, version 2,
+ *      included herein by reference.
+ *
+ */
+
+#include <stdio.h>
+#include <linux/rtc.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+
+
+/*
+ * This expects the new RTC class driver framework, working with
+ * clocks that will often not be clones of what the PC-AT had.
+ * Use the command line to specify another RTC if you need one.
+ */
+static const char default_rtc[] = "/dev/rtc0";
+
+
+int main(int argc, char **argv)
+{
+       int i, fd, retval, irqcount = 0;
+       unsigned long tmp, data;
+       struct rtc_time rtc_tm;
+       const char *rtc = default_rtc;
+       struct timeval start, end, diff;
+
+       switch (argc) {
+       case 2:
+               rtc = argv[1];
+               /* FALLTHROUGH */
+       case 1:
+               break;
+       default:
+               fprintf(stderr, "usage:  rtctest [rtcdev]\n");
+               return 1;
+       }
+
+       fd = open(rtc, O_RDONLY);
+
+       if (fd ==  -1) {
+               perror(rtc);
+               exit(errno);
+       }
+
+       fprintf(stderr, "\n\t\t\tRTC Driver Test Example.\n\n");
+
+       /* Turn on update interrupts (one per second) */
+       retval = ioctl(fd, RTC_UIE_ON, 0);
+       if (retval == -1) {
+               if (errno == ENOTTY) {
+                       fprintf(stderr,
+                               "\n...Update IRQs not supported.\n");
+                       goto test_READ;
+               }
+               perror("RTC_UIE_ON ioctl");
+               exit(errno);
+       }
+
+       fprintf(stderr, "Counting 5 update (1/sec) interrupts from reading %s:",
+                       rtc);
+       fflush(stderr);
+       for (i=1; i<6; i++) {
+               /* This read will block */
+               retval = read(fd, &data, sizeof(unsigned long));
+               if (retval == -1) {
+                       perror("read");
+                       exit(errno);
+               }
+               fprintf(stderr, " %d",i);
+               fflush(stderr);
+               irqcount++;
+       }
+
+       fprintf(stderr, "\nAgain, from using select(2) on /dev/rtc:");
+       fflush(stderr);
+       for (i=1; i<6; i++) {
+               struct timeval tv = {5, 0};     /* 5 second timeout on select */
+               fd_set readfds;
+
+               FD_ZERO(&readfds);
+               FD_SET(fd, &readfds);
+               /* The select will wait until an RTC interrupt happens. */
+               retval = select(fd+1, &readfds, NULL, NULL, &tv);
+               if (retval == -1) {
+                       perror("select");
+                       exit(errno);
+               }
+               /* This read won't block unlike the select-less case above. */
+               retval = read(fd, &data, sizeof(unsigned long));
+               if (retval == -1) {
+                       perror("read");
+                       exit(errno);
+               }
+               fprintf(stderr, " %d",i);
+               fflush(stderr);
+               irqcount++;
+       }
+
+       /* Turn off update interrupts */
+       retval = ioctl(fd, RTC_UIE_OFF, 0);
+       if (retval == -1) {
+               perror("RTC_UIE_OFF ioctl");
+               exit(errno);
+       }
+
+test_READ:
+       /* Read the RTC time/date */
+       retval = ioctl(fd, RTC_RD_TIME, &rtc_tm);
+       if (retval == -1) {
+               perror("RTC_RD_TIME ioctl");
+               exit(errno);
+       }
+
+       fprintf(stderr, "\n\nCurrent RTC date/time is %d-%d-%d, %02d:%02d:%02d.\n",
+               rtc_tm.tm_mday, rtc_tm.tm_mon + 1, rtc_tm.tm_year + 1900,
+               rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
+
+       /* Set the alarm to 5 sec in the future, and check for rollover */
+       rtc_tm.tm_sec += 5;
+       if (rtc_tm.tm_sec >= 60) {
+               rtc_tm.tm_sec %= 60;
+               rtc_tm.tm_min++;
+       }
+       if (rtc_tm.tm_min == 60) {
+               rtc_tm.tm_min = 0;
+               rtc_tm.tm_hour++;
+       }
+       if (rtc_tm.tm_hour == 24)
+               rtc_tm.tm_hour = 0;
+
+       retval = ioctl(fd, RTC_ALM_SET, &rtc_tm);
+       if (retval == -1) {
+               if (errno == ENOTTY) {
+                       fprintf(stderr,
+                               "\n...Alarm IRQs not supported.\n");
+                       goto test_PIE;
+               }
+               perror("RTC_ALM_SET ioctl");
+               exit(errno);
+       }
+
+       /* Read the current alarm settings */
+       retval = ioctl(fd, RTC_ALM_READ, &rtc_tm);
+       if (retval == -1) {
+               perror("RTC_ALM_READ ioctl");
+               exit(errno);
+       }
+
+       fprintf(stderr, "Alarm time now set to %02d:%02d:%02d.\n",
+               rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
+
+       /* Enable alarm interrupts */
+       retval = ioctl(fd, RTC_AIE_ON, 0);
+       if (retval == -1) {
+               perror("RTC_AIE_ON ioctl");
+               exit(errno);
+       }
+
+       fprintf(stderr, "Waiting 5 seconds for alarm...");
+       fflush(stderr);
+       /* This blocks until the alarm ring causes an interrupt */
+       retval = read(fd, &data, sizeof(unsigned long));
+       if (retval == -1) {
+               perror("read");
+               exit(errno);
+       }
+       irqcount++;
+       fprintf(stderr, " okay. Alarm rang.\n");
+
+       /* Disable alarm interrupts */
+       retval = ioctl(fd, RTC_AIE_OFF, 0);
+       if (retval == -1) {
+               perror("RTC_AIE_OFF ioctl");
+               exit(errno);
+       }
+
+test_PIE:
+       /* Read periodic IRQ rate */
+       retval = ioctl(fd, RTC_IRQP_READ, &tmp);
+       if (retval == -1) {
+               /* not all RTCs support periodic IRQs */
+               if (errno == ENOTTY) {
+                       fprintf(stderr, "\nNo periodic IRQ support\n");
+                       goto done;
+               }
+               perror("RTC_IRQP_READ ioctl");
+               exit(errno);
+       }
+       fprintf(stderr, "\nPeriodic IRQ rate is %ldHz.\n", tmp);
+
+       fprintf(stderr, "Counting 20 interrupts at:");
+       fflush(stderr);
+
+       /* The frequencies 128Hz, 256Hz, ... 8192Hz are only allowed for root. */
+       for (tmp=2; tmp<=64; tmp*=2) {
+
+               retval = ioctl(fd, RTC_IRQP_SET, tmp);
+               if (retval == -1) {
+                       /* not all RTCs can change their periodic IRQ rate */
+                       if (errno == ENOTTY) {
+                               fprintf(stderr,
+                                       "\n...Periodic IRQ rate is fixed\n");
+                               goto done;
+                       }
+                       perror("RTC_IRQP_SET ioctl");
+                       exit(errno);
+               }
+
+               fprintf(stderr, "\n%ldHz:\t", tmp);
+               fflush(stderr);
+
+               /* Enable periodic interrupts */
+               retval = ioctl(fd, RTC_PIE_ON, 0);
+               if (retval == -1) {
+                       perror("RTC_PIE_ON ioctl");
+                       exit(errno);
+               }
+
+               for (i=1; i<21; i++) {
+                       gettimeofday(&start, NULL);
+                       /* This blocks */
+                       retval = read(fd, &data, sizeof(unsigned long));
+                       if (retval == -1) {
+                               perror("read");
+                               exit(errno);
+                       }
+                       gettimeofday(&end, NULL);
+                       timersub(&end, &start, &diff);
+                       if (diff.tv_sec > 0 ||
+                           diff.tv_usec > ((1000000L / tmp) * 1.10)) {
+                               fprintf(stderr, "\nPIE delta error: %ld.%06ld should be close to 0.%06ld\n",
+                                      diff.tv_sec, diff.tv_usec,
+                                      (1000000L / tmp));
+                               fflush(stdout);
+                               exit(-1);
+                       }
+
+                       fprintf(stderr, " %d",i);
+                       fflush(stderr);
+                       irqcount++;
+               }
+
+               /* Disable periodic interrupts */
+               retval = ioctl(fd, RTC_PIE_OFF, 0);
+               if (retval == -1) {
+                       perror("RTC_PIE_OFF ioctl");
+                       exit(errno);
+               }
+       }
+
+done:
+       fprintf(stderr, "\n\n\t\t\t *** Test complete ***\n");
+
+       close(fd);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c
new file mode 100644 (file)
index 0000000..c8a7e14
--- /dev/null
@@ -0,0 +1,144 @@
+/* Time bounds setting test
+ *             by: john stultz (johnstul@us.ibm.com)
+ *             (C) Copyright IBM 2012
+ *             Licensed under the GPLv2
+ *
+ *  NOTE: This is a meta-test which sets the time to edge cases then
+ *  uses other tests to detect problems. Thus this test requires that
+ *  the inconsistency-check and nanosleep tests be present in the same
+ *  directory it is run from.
+ *
+ *  To build:
+ *     $ gcc set-2038.c -o set-2038 -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000LL
+
+#define KTIME_MAX      ((long long)~((unsigned long long)1 << 63))
+#define KTIME_SEC_MAX  (KTIME_MAX / NSEC_PER_SEC)
+
+#define YEAR_1901 (-0x7fffffffL)
+#define YEAR_1970 1
+#define YEAR_2038 0x7fffffffL                  /*overflows 32bit time_t */
+#define YEAR_2262 KTIME_SEC_MAX                        /*overflows 64bit ktime_t */
+#define YEAR_MAX  ((long long)((1ULL<<63)-1))  /*overflows 64bit time_t */
+
+int is32bits(void)
+{
+       return (sizeof(long) == 4);
+}
+
+int settime(long long time)
+{
+       struct timeval now;
+       int ret;
+
+       now.tv_sec = (time_t)time;
+       now.tv_usec  = 0;
+
+       ret = settimeofday(&now, NULL);
+
+       printf("Setting time to 0x%lx: %d\n", (long)time, ret);
+       return ret;
+}
+
+int do_tests(void)
+{
+       int ret;
+
+       ret = system("date");
+       ret = system("./inconsistency-check -c 0 -t 20");
+       ret |= system("./nanosleep");
+       ret |= system("./nsleep-lat");
+       return ret;
+
+}
+
+int main(int argc, char *argv[])
+{
+       int ret = 0;
+       int opt, dangerous = 0;
+       time_t start;
+
+       /* Process arguments */
+       while ((opt = getopt(argc, argv, "d")) != -1) {
+               switch (opt) {
+               case 'd':
+                       dangerous = 1;
+               }
+       }
+
+       start = time(0);
+
+       /* First test that crazy values don't work */
+       if (!settime(YEAR_1901)) {
+               ret = -1;
+               goto out;
+       }
+       if (!settime(YEAR_MAX)) {
+               ret = -1;
+               goto out;
+       }
+       if (!is32bits() && !settime(YEAR_2262)) {
+               ret = -1;
+               goto out;
+       }
+
+       /* Now test behavior near edges */
+       settime(YEAR_1970);
+       ret = do_tests();
+       if (ret)
+               goto out;
+
+       settime(YEAR_2038 - 600);
+       ret = do_tests();
+       if (ret)
+               goto out;
+
+       /* The rest of the tests can blowup on 32bit systems */
+       if (is32bits() && !dangerous)
+               goto out;
+       /* Test rollover behavior 32bit edge */
+       settime(YEAR_2038 - 10);
+       ret = do_tests();
+       if (ret)
+               goto out;
+
+       settime(YEAR_2262 - 600);
+       ret = do_tests();
+
+out:
+       /* restore clock */
+       settime(start);
+       if (ret)
+               return ksft_exit_fail();
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c
new file mode 100644 (file)
index 0000000..dc88dbc
--- /dev/null
@@ -0,0 +1,79 @@
+/* Set tai offset
+ *              by: John Stultz <john.stultz@linaro.org>
+ *              (C) Copyright Linaro 2013
+ *              Licensed under the GPLv2
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
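+/* ADJ_TAI sets the kernel's TAI offset; the new offset is passed in tx.constant */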
+int set_tai(int offset)
+{
+       struct timex tx;
+
+       memset(&tx, 0, sizeof(tx));
+
+       tx.modes = ADJ_TAI;
+       tx.constant = offset;
+
+       return adjtimex(&tx);
+}
+
+int get_tai(void)
+{
+       struct timex tx;
+
+       memset(&tx, 0, sizeof(tx));
+
+       adjtimex(&tx);
+       return tx.tai;
+}
+
+int main(int argc, char **argv)
+{
+       int i, ret;
+
+       ret = get_tai();
+       printf("tai offset started at %i\n", ret);
+
+       printf("Checking tai offsets can be properly set: ");
+       for (i = 1; i <= 60; i++) {
+               ret = set_tai(i);
+               ret = get_tai();
+               if (ret != i) {
+                       printf("[FAILED] expected: %i got %i\n", i, ret);
+                       return ksft_exit_fail();
+               }
+       }
+       printf("[OK]\n");
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c
new file mode 100644 (file)
index 0000000..4fc98c5
--- /dev/null
@@ -0,0 +1,216 @@
+/* set_timer latency test
+ *             John Stultz (john.stultz@linaro.org)
+ *              (C) Copyright Linaro 2014
+ *              Licensed under the GPLv2
+ *
+ *   This test makes sure the set_timer api is correct
+ *
+ *  To build:
+ *     $ gcc set-timer-lat.c -o set-timer-lat -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <pthread.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define CLOCK_REALTIME                 0
+#define CLOCK_MONOTONIC                        1
+#define CLOCK_PROCESS_CPUTIME_ID       2
+#define CLOCK_THREAD_CPUTIME_ID                3
+#define CLOCK_MONOTONIC_RAW            4
+#define CLOCK_REALTIME_COARSE          5
+#define CLOCK_MONOTONIC_COARSE         6
+#define CLOCK_BOOTTIME                 7
+#define CLOCK_REALTIME_ALARM           8
+#define CLOCK_BOOTTIME_ALARM           9
+#define CLOCK_HWSPECIFIC               10
+#define CLOCK_TAI                      11
+#define NR_CLOCKIDS                    12
+
+
+#define NSEC_PER_SEC 1000000000ULL
+#define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
+
+#define TIMER_SECS 1
+int alarmcount;
+int clock_id;
+struct timespec start_time;
+long long max_latency_ns;
+
+char *clockstring(int clockid)
+{
+       switch (clockid) {
+       case CLOCK_REALTIME:
+               return "CLOCK_REALTIME";
+       case CLOCK_MONOTONIC:
+               return "CLOCK_MONOTONIC";
+       case CLOCK_PROCESS_CPUTIME_ID:
+               return "CLOCK_PROCESS_CPUTIME_ID";
+       case CLOCK_THREAD_CPUTIME_ID:
+               return "CLOCK_THREAD_CPUTIME_ID";
+       case CLOCK_MONOTONIC_RAW:
+               return "CLOCK_MONOTONIC_RAW";
+       case CLOCK_REALTIME_COARSE:
+               return "CLOCK_REALTIME_COARSE";
+       case CLOCK_MONOTONIC_COARSE:
+               return "CLOCK_MONOTONIC_COARSE";
+       case CLOCK_BOOTTIME:
+               return "CLOCK_BOOTTIME";
+       case CLOCK_REALTIME_ALARM:
+               return "CLOCK_REALTIME_ALARM";
+       case CLOCK_BOOTTIME_ALARM:
+               return "CLOCK_BOOTTIME_ALARM";
+       case CLOCK_TAI:
+               return "CLOCK_TAI";
+       };
+       return "UNKNOWN_CLOCKID";
+}
+
+
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+       long long ret = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+
+       ret -= NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+       return ret;
+}
+
+
+void sigalarm(int signo)
+{
+       long long delta_ns;
+       struct timespec ts;
+
+       clock_gettime(clock_id, &ts);
+       alarmcount++;
+
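+       /* Latency is the elapsed time since start minus the expected time of this expiry */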
+       delta_ns = timespec_sub(start_time, ts);
+       delta_ns -= NSEC_PER_SEC * TIMER_SECS * alarmcount;
+
+       if (delta_ns < 0)
+               printf("%s timer fired early: FAIL\n", clockstring(clock_id));
+
+       if (delta_ns > max_latency_ns)
+               max_latency_ns = delta_ns;
+}
+
+int do_timer(int clock_id, int flags)
+{
+       struct sigevent se;
+       timer_t tm1;
+       struct itimerspec its1, its2;
+       int err;
+
+       /* Set up timer: */
+       memset(&se, 0, sizeof(se));
+       se.sigev_notify = SIGEV_SIGNAL;
+       se.sigev_signo = SIGRTMAX;
+       se.sigev_value.sival_int = 0;
+
+       max_latency_ns = 0;
+       alarmcount = 0;
+
+       err = timer_create(clock_id, &se, &tm1);
+       if (err) {
+               if ((clock_id == CLOCK_REALTIME_ALARM) ||
+                   (clock_id == CLOCK_BOOTTIME_ALARM)) {
+                       printf("%-22s %s missing CAP_WAKE_ALARM?    : [UNSUPPORTED]\n",
+                                       clockstring(clock_id),
+                                       flags ? "ABSTIME":"RELTIME");
+                       return 0;
+               }
+               printf("%s - timer_create() failed\n", clockstring(clock_id));
+               return -1;
+       }
+
+       clock_gettime(clock_id, &start_time);
+       if (flags) {
+               its1.it_value = start_time;
+               its1.it_value.tv_sec += TIMER_SECS;
+       } else {
+               its1.it_value.tv_sec = TIMER_SECS;
+               its1.it_value.tv_nsec = 0;
+       }
+       its1.it_interval.tv_sec = TIMER_SECS;
+       its1.it_interval.tv_nsec = 0;
+
+       err = timer_settime(tm1, flags, &its1, &its2);
+       if (err) {
+               printf("%s - timer_settime() failed\n", clockstring(clock_id));
+               return -1;
+       }
+
+       while (alarmcount < 5)
+               sleep(1);
+
+       printf("%-22s %s max latency: %10lld ns : ",
+                       clockstring(clock_id),
+                       flags ? "ABSTIME":"RELTIME",
+                       max_latency_ns);
+
+       timer_delete(tm1);
+       if (max_latency_ns < UNRESONABLE_LATENCY) {
+               printf("[OK]\n");
+               return 0;
+       }
+       printf("[FAILED]\n");
+       return -1;
+}
+
+int main(void)
+{
+       struct sigaction act;
+       int signum = SIGRTMAX;
+       int ret = 0;
+
+       /* Set up signal handler: */
+       sigfillset(&act.sa_mask);
+       act.sa_flags = 0;
+       act.sa_handler = sigalarm;
+       sigaction(signum, &act, NULL);
+
+       printf("Setting timers for every %i seconds\n", TIMER_SECS);
+       for (clock_id = 0; clock_id < NR_CLOCKIDS; clock_id++) {
+
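+               /* Skip cputime clocks (they don't advance while we sleep) and clocks that can't back a timer */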
+               if ((clock_id == CLOCK_PROCESS_CPUTIME_ID) ||
+                               (clock_id == CLOCK_THREAD_CPUTIME_ID) ||
+                               (clock_id == CLOCK_MONOTONIC_RAW) ||
+                               (clock_id == CLOCK_REALTIME_COARSE) ||
+                               (clock_id == CLOCK_MONOTONIC_COARSE) ||
+                               (clock_id == CLOCK_HWSPECIFIC))
+                       continue;
+
+               ret |= do_timer(clock_id, TIMER_ABSTIME);
+               ret |= do_timer(clock_id, 0);
+       }
+       if (ret)
+               return ksft_exit_fail();
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c
new file mode 100644 (file)
index 0000000..5562f84
--- /dev/null
@@ -0,0 +1,89 @@
+/* ADJ_FREQ Skew consistency test
+ *             by: john stultz (johnstul@us.ibm.com)
+ *             (C) Copyright IBM 2012
+ *             Licensed under the GPLv2
+ *
+ *  NOTE: This is a meta-test which cranks the ADJ_FREQ knob back
+ *  and forth and watches for consistency problems. Thus this test requires
+ *  that the inconsistency-check tests be present in the same directory it
+ *  is run from.
+ *
+ *  To build:
+ *     $ gcc skew_consistency.c -o skew_consistency -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000LL
+
+int main(int argc, char **argv)
+{
+       struct timex tx;
+       int ret, ppm;
+       pid_t pid;
+
+
+       printf("Running Asynchronous Frequency Changing Tests...\n");
+
+       pid = fork();
+       if (!pid)
+               return system("./inconsistency-check -c 1 -t 600");
+
+       ppm = 500;
+       ret = 0;
+
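+       /* While the child runs inconsistency-check, flip the frequency offset between +500 and -500 ppm every 0.5s */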
+       while (pid != waitpid(pid, &ret, WNOHANG)) {
+               ppm = -ppm;
+               tx.modes = ADJ_FREQUENCY;
+               tx.freq = ppm << 16;
+               adjtimex(&tx);
+               usleep(500000);
+       }
+
+       /* Set things back */
+       tx.modes = ADJ_FREQUENCY;
+       tx.offset = 0;
+       adjtimex(&tx);
+
+
+       if (ret) {
+               printf("[FAILED]\n");
+               return ksft_exit_fail();
+       }
+       printf("[OK]\n");
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c
new file mode 100644 (file)
index 0000000..e632e11
--- /dev/null
@@ -0,0 +1,204 @@
+/* threadtest.c
+ *             by: john stultz (johnstul@us.ibm.com)
+ *             (C) Copyright IBM 2004, 2005, 2006, 2012
+ *             Licensed under the GPLv2
+ *
+ *  To build:
+ *     $ gcc threadtest.c -o threadtest -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+
+/* serializes shared list access */
+pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
+/* serializes console output */
+pthread_mutex_t print_lock = PTHREAD_MUTEX_INITIALIZER;
+
+
+#define MAX_THREADS 128
+#define LISTSIZE 128
+
+int done = 0;
+
+struct timespec global_list[LISTSIZE];
+int listcount = 0;
+
+
+void checklist(struct timespec *list, int size)
+{
+       int i, j;
+       struct timespec *a, *b;
+
+       /* scan the list */
+       for (i = 0; i < size-1; i++) {
+               a = &list[i];
+               b = &list[i+1];
+
+               /* look for any time inconsistencies */
+               if ((b->tv_sec <= a->tv_sec) &&
+                       (b->tv_nsec < a->tv_nsec)) {
+
+                       /* flag other threads */
+                       done = 1;
+
+                       /*serialize printing to avoid junky output*/
+                       pthread_mutex_lock(&print_lock);
+
+                       /* dump the list */
+                       printf("\n");
+                       for (j = 0; j < size; j++) {
+                               if (j == i)
+                                       printf("---------------\n");
+                               printf("%lu:%lu\n", list[j].tv_sec, list[j].tv_nsec);
+                               if (j == i+1)
+                                       printf("---------------\n");
+                       }
+                       printf("[FAILED]\n");
+
+                       pthread_mutex_unlock(&print_lock);
+               }
+       }
+}
+
+/* The shared threads all fill one global list,
+ * each appending an entry while holding the lock.
+ * This stresses clock synchronization across CPUs.
+ */
+void *shared_thread(void *arg)
+{
+       while (!done) {
+               /* protect the list */
+               pthread_mutex_lock(&list_lock);
+
+               /* see if we're ready to check the list */
+               if (listcount >= LISTSIZE) {
+                       checklist(global_list, LISTSIZE);
+                       listcount = 0;
+               }
+               clock_gettime(CLOCK_MONOTONIC, &global_list[listcount++]);
+
+               pthread_mutex_unlock(&list_lock);
+       }
+       return NULL;
+}
+
+
+/* Each independent thread fills in its own
+ * list. This stresses clock_gettime() lock contention.
+ */
+void *independent_thread(void *arg)
+{
+       struct timespec my_list[LISTSIZE];
+       int count;
+
+       while (!done) {
+               /* fill the list */
+               for (count = 0; count < LISTSIZE; count++)
+                       clock_gettime(CLOCK_MONOTONIC, &my_list[count]);
+               checklist(my_list, LISTSIZE);
+       }
+       return NULL;
+}
+
+#define DEFAULT_THREAD_COUNT 8
+#define DEFAULT_RUNTIME 30
+
+int main(int argc, char **argv)
+{
+       int thread_count, i;
+       time_t start, now, runtime;
+       char buf[255];
+       pthread_t pth[MAX_THREADS];
+       int opt;
+       void *tret;
+       int ret = 0;
+       void *(*thread)(void *) = shared_thread;
+
+       thread_count = DEFAULT_THREAD_COUNT;
+       runtime = DEFAULT_RUNTIME;
+
+       /* Process arguments */
+       while ((opt = getopt(argc, argv, "t:n:i")) != -1) {
+               switch (opt) {
+               case 't':
+                       runtime = atoi(optarg);
+                       break;
+               case 'n':
+                       thread_count = atoi(optarg);
+                       break;
+               case 'i':
+                       thread = independent_thread;
+                       printf("using independent threads\n");
+                       break;
+               default:
+                       printf("Usage: %s [-t <secs>] [-n <numthreads>] [-i]\n", argv[0]);
+                       printf("        -t: time to run\n");
+                       printf("        -n: number of threads\n");
+                       printf("        -i: use independent threads\n");
+                       return -1;
+               }
+       }
+
+       if (thread_count > MAX_THREADS)
+               thread_count = MAX_THREADS;
+
+
+       setbuf(stdout, NULL);
+
+       start = time(0);
+       strftime(buf, 255, "%a, %d %b %Y %T %z", localtime(&start));
+       printf("%s\n", buf);
+       printf("Testing consistency with %i threads for %ld seconds: ", thread_count, runtime);
+
+       /* spawn */
+       for (i = 0; i < thread_count; i++)
+               pthread_create(&pth[i], 0, thread, 0);
+
+       while (time(&now) < start + runtime) {
+               sleep(1);
+               if (done) {
+                       ret = 1;
+                       strftime(buf, 255, "%a, %d %b %Y %T %z", localtime(&now));
+                       printf("%s\n", buf);
+                       goto out;
+               }
+       }
+       printf("[OK]\n");
+       done = 1;
+
+out:
+       /* wait */
+       for (i = 0; i < thread_count; i++)
+               pthread_join(pth[i], &tret);
+
+       /* die */
+       if (ret)
+               ksft_exit_fail();
+       return ksft_exit_pass();
+}
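
The consistency check exercised above boils down to one invariant: successive CLOCK_MONOTONIC readings must never go backwards, whether the samples come from one CPU or many. A minimal single-threaded sketch of that invariant (illustrative only, not part of the patch) looks like this:

/* Sketch: repeatedly sample CLOCK_MONOTONIC and fail if any reading is
 * earlier than the previous one.
 */
#include <stdio.h>
#include <time.h>

static int ts_before(const struct timespec *a, const struct timespec *b)
{
	/* nonzero if *a is strictly earlier than *b */
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec;
	return a->tv_nsec < b->tv_nsec;
}

int main(void)
{
	struct timespec prev, now;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &prev);
	for (i = 0; i < 1000000; i++) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		if (ts_before(&now, &prev)) {
			printf("[FAILED] at sample %d\n", i);
			return 1;
		}
		prev = now;
	}
	printf("[OK]\n");
	return 0;
}
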
diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c
new file mode 100644 (file)
index 0000000..e86d937
--- /dev/null
@@ -0,0 +1,202 @@
+/* valid adjtimex test
+ *              by: John Stultz <john.stultz@linaro.org>
+ *              (C) Copyright Linaro 2015
+ *              Licensed under the GPLv2
+ *
+ *  This test validates adjtimex interface with valid
+ *  and invalid test data.
+ *
+ *  Usage: valid-adjtimex
+ *
+ *  To build:
+ *     $ gcc valid-adjtimex.c -o valid-adjtimex -lrt
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#ifdef KTEST
+#include "../kselftest.h"
+#else
+static inline int ksft_exit_pass(void)
+{
+       exit(0);
+}
+static inline int ksft_exit_fail(void)
+{
+       exit(1);
+}
+#endif
+
+#define NSEC_PER_SEC 1000000000L
+
+/* clear NTP time_status & time_state */
+int clear_time_state(void)
+{
+       struct timex tx;
+       int ret;
+
+       tx.modes = ADJ_STATUS;
+       tx.status = 0;
+       ret = adjtimex(&tx);
+       return ret;
+}
+
+#define NUM_FREQ_VALID 32
+#define NUM_FREQ_OUTOFRANGE 4
+#define NUM_FREQ_INVALID 2
+
+long valid_freq[NUM_FREQ_VALID] = {
+       -499<<16,
+       -450<<16,
+       -400<<16,
+       -350<<16,
+       -300<<16,
+       -250<<16,
+       -200<<16,
+       -150<<16,
+       -100<<16,
+       -75<<16,
+       -50<<16,
+       -25<<16,
+       -10<<16,
+       -5<<16,
+       -1<<16,
+       -1000,
+       1<<16,
+       5<<16,
+       10<<16,
+       25<<16,
+       50<<16,
+       75<<16,
+       100<<16,
+       150<<16,
+       200<<16,
+       250<<16,
+       300<<16,
+       350<<16,
+       400<<16,
+       450<<16,
+       499<<16,
+};
+
+long outofrange_freq[NUM_FREQ_OUTOFRANGE] = {
+       -1000<<16,
+       -550<<16,
+       550<<16,
+       1000<<16,
+};
+
+#define LONG_MAX (~0UL>>1)
+#define LONG_MIN (-LONG_MAX - 1)
+
+long invalid_freq[NUM_FREQ_INVALID] = {
+       LONG_MAX,
+       LONG_MIN,
+};
+
+int validate_freq(void)
+{
+       struct timex tx;
+       int ret, pass = 0;
+       int i;
+
+       clear_time_state();
+
+       memset(&tx, 0, sizeof(struct timex));
+       /* Test valid, out-of-range and invalid ADJ_FREQUENCY values */
+
+       printf("Testing ADJ_FREQ... ");
+       for (i = 0; i < NUM_FREQ_VALID; i++) {
+               tx.modes = ADJ_FREQUENCY;
+               tx.freq = valid_freq[i];
+
+               ret = adjtimex(&tx);
+               if (ret < 0) {
+                       printf("[FAIL]\n");
+                       printf("Error: adjtimex(ADJ_FREQ, %ld - %ld ppm) failed\n",
+                               valid_freq[i], valid_freq[i]>>16);
+                       pass = -1;
+                       goto out;
+               }
+               tx.modes = 0;
+               ret = adjtimex(&tx);
+               if (tx.freq != valid_freq[i]) {
+                       printf("Warning: freq value %ld is not what we set it to (%ld)!\n",
+                                       tx.freq, valid_freq[i]);
+               }
+       }
+       for (i = 0; i < NUM_FREQ_OUTOFRANGE; i++) {
+               tx.modes = ADJ_FREQUENCY;
+               tx.freq = outofrange_freq[i];
+
+               ret = adjtimex(&tx);
+               if (ret < 0) {
+                       printf("[FAIL]\n");
+                       printf("Error: adjtimex(ADJ_FREQ, %ld - %ld ppm) failed\n",
+                               outofrange_freq[i], outofrange_freq[i]>>16);
+                       pass = -1;
+                       goto out;
+               }
+               tx.modes = 0;
+               ret = adjtimex(&tx);
+               if (tx.freq == outofrange_freq[i]) {
+                       printf("[FAIL]\n");
+                       printf("ERROR: out of range value %ld actually set!\n",
+                                       tx.freq);
+                       pass = -1;
+                       goto out;
+               }
+       }
+
+
+       if (sizeof(long) == 8) { /* this case only applies to 64-bit systems */
+               for (i = 0; i < NUM_FREQ_INVALID; i++) {
+                       tx.modes = ADJ_FREQUENCY;
+                       tx.freq = invalid_freq[i];
+                       ret = adjtimex(&tx);
+                       if (ret >= 0) {
+                               printf("[FAIL]\n");
+                               printf("Error: No failure on invalid ADJ_FREQUENCY %ld\n",
+                                       invalid_freq[i]);
+                               pass = -1;
+                               goto out;
+                       }
+               }
+       }
+
+       printf("[OK]\n");
+out:
+       /* reset freq to zero */
+       tx.modes = ADJ_FREQUENCY;
+       tx.freq = 0;
+       ret = adjtimex(&tx);
+
+       return pass;
+}
+
+
+int main(int argc, char **argv)
+{
+       if (validate_freq())
+               return ksft_exit_fail();
+
+       return ksft_exit_pass();
+}
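
The freq tables above are expressed with "<<16" because the timex freq field carries the frequency offset in parts per million with a 16-bit fractional part (so 1 ppm is 65536 units), and values beyond roughly +/-500 ppm are clamped rather than stored verbatim, which is what the out-of-range cases rely on. A small sketch of that conversion, using an illustrative helper name:

/* Sketch: convert a whole-ppm frequency offset into the fixed-point
 * representation used by struct timex.freq (16 fractional bits).
 */
#include <stdio.h>

static long ppm_to_timex_freq(long ppm)
{
	return ppm << 16;
}

int main(void)
{
	/* 100 ppm -> 6553600; offsets beyond about +/-500 ppm get clamped */
	printf("%ld\n", ppm_to_timex_freq(100));
	return 0;
}
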
index 12c9d15bab075a6eaaa5f171a0833841c829b470..d401b63c5b1ad09273d2228add75d8635694172f 100644 (file)
@@ -3,5 +3,6 @@
 # No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
 all:
 
-run_tests: all
-       ./test_user_copy.sh
+TEST_PROGS := test_user_copy.sh
+
+include ../lib.mk
index 077828c889f1377886b98c93349919d55ccb57a2..a5ce9534eb15f35335b389ec89f0379bfc45c224 100644 (file)
@@ -1,6 +1,5 @@
 # Makefile for vm selftests
 
-CC = $(CROSS_COMPILE)gcc
 CFLAGS = -Wall
 BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest
 BINARIES += transhuge-stress
@@ -9,8 +8,10 @@ all: $(BINARIES)
 %: %.c
        $(CC) $(CFLAGS) -o $@ $^ -lrt
 
-run_tests: all
-       @/bin/sh ./run_vmtests || (echo "vmtests: [FAIL]"; exit 1)
+TEST_PROGS := run_vmtests
+TEST_FILES := $(BINARIES)
+
+include ../lib.mk
 
 clean:
        $(RM) $(BINARIES)
old mode 100644 (file)
new mode 100755 (executable)
diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore
new file mode 100644 (file)
index 0000000..15034fe
--- /dev/null
@@ -0,0 +1,2 @@
+*_32
+*_64
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
new file mode 100644 (file)
index 0000000..f0a7918
--- /dev/null
@@ -0,0 +1,48 @@
+.PHONY: all all_32 all_64 check_build32 clean run_tests
+
+TARGETS_C_BOTHBITS := sigreturn
+
+BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32)
+BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
+
+CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
+
+UNAME_P := $(shell uname -p)
+
+# Always build 32-bit tests
+all: all_32
+
+# If we're on a 64-bit host, build 64-bit tests as well
+ifeq ($(shell uname -p),x86_64)
+all: all_64
+endif
+
+all_32: check_build32 $(BINARIES_32)
+
+all_64: $(BINARIES_64)
+
+clean:
+       $(RM) $(BINARIES_32) $(BINARIES_64)
+
+run_tests:
+       ./run_x86_tests.sh
+
+$(TARGETS_C_BOTHBITS:%=%_32): %_32: %.c
+       $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
+
+$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
+       $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
+
+check_build32:
+       @if ! $(CC) -m32 -o /dev/null trivial_32bit_program.c; then     \
+         echo "Warning: you seem to have a broken 32-bit build" >&2;   \
+         echo "environment.  If you are using a Debian-like";          \
+         echo " distribution, try:";                                   \
+         echo "";                                                      \
+         echo "  apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \
+         echo "";                                                      \
+         echo "If you are using a Fedora-like distribution, try:";     \
+         echo "";                                                      \
+         echo "  yum install glibc-devel.*i686";                       \
+         exit 1;                                                       \
+       fi
diff --git a/tools/testing/selftests/x86/run_x86_tests.sh b/tools/testing/selftests/x86/run_x86_tests.sh
new file mode 100644 (file)
index 0000000..3d3ec65
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# This is deliberately minimal.  IMO kselftests should provide a standard
+# script here.
+./sigreturn_32 || exit 1
+
+if [[ "$(uname -p)" == "x86_64" ]]; then
+    ./sigreturn_64 || exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c
new file mode 100644 (file)
index 0000000..b5aa1ba
--- /dev/null
@@ -0,0 +1,684 @@
+/*
+ * sigreturn.c - tests for x86 sigreturn(2) and exit-to-userspace
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This is a series of tests that exercises the sigreturn(2) syscall and
+ * the IRET / SYSRET paths in the kernel.
+ *
+ * For now, this focuses on the effects of unusual CS and SS values,
+ * and it has a bunch of tests to make sure that ESP/RSP is restored
+ * properly.
+ *
+ * The basic idea behind these tests is to raise(SIGUSR1) to create a
+ * sigcontext frame, plug in the values to be tested, and then return,
+ * which implicitly invokes sigreturn(2) and programs the user context
+ * as desired.
+ *
+ * For tests for which we expect sigreturn and the subsequent return to
+ * user mode to succeed, we return to a short trampoline that generates
+ * SIGTRAP so that the meat of the tests can be ordinary C code in a
+ * SIGTRAP handler.
+ *
+ * The inner workings of each test are documented below.
+ *
+ * Do not run this on outdated, unpatched kernels: you risk nasty crashes.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <asm/ldt.h>
+#include <err.h>
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+
+/*
+ * In principle, this test can run on Linux emulation layers (e.g.
+ * Illumos "LX branded zones").  Solaris-based kernels reserve LDT
+ * entries 0-5 for their own internal purposes, so start our LDT
+ * allocations above that reservation.  (The tests don't pass on LX
+ * branded zones, but at least this lets them run.)
+ */
+#define LDT_OFFSET 6
+
+/* An aligned stack accessible through some of our segments. */
+static unsigned char stack16[65536] __attribute__((aligned(4096)));
+
+/*
+ * An aligned int3 instruction used as a trampoline.  Some of the tests
+ * want to fish out their ss values, so this trampoline copies ss to eax
+ * before the int3.
+ */
+asm (".pushsection .text\n\t"
+     ".type int3, @function\n\t"
+     ".align 4096\n\t"
+     "int3:\n\t"
+     "mov %ss,%eax\n\t"
+     "int3\n\t"
+     ".size int3, . - int3\n\t"
+     ".align 4096, 0xcc\n\t"
+     ".popsection");
+extern char int3[4096];
+
+/*
+ * At startup, we prepare:
+ *
+ * - ldt_nonexistent_sel: An LDT entry that doesn't exist (all-zero
+ *   descriptor or out of bounds).
+ * - code16_sel: A 16-bit LDT code segment pointing to int3.
+ * - data16_sel: A 16-bit LDT data segment pointing to stack16.
+ * - npcode32_sel: A 32-bit not-present LDT code segment pointing to int3.
+ * - npdata32_sel: A 32-bit not-present LDT data segment pointing to stack16.
+ * - gdt_data16_idx: A 16-bit GDT data segment pointing to stack16.
+ * - gdt_npdata32_idx: A 32-bit not-present GDT data segment pointing to
+ *   stack16.
+ *
+ * For no particularly good reason, xyz_sel is a selector value with the
+ * RPL and LDT bits filled in, whereas xyz_idx is just an index into the
+ * descriptor table.  These variables will be zero if their respective
+ * segments could not be allocated.
+ */
+static unsigned short ldt_nonexistent_sel;
+static unsigned short code16_sel, data16_sel, npcode32_sel, npdata32_sel;
+
+static unsigned short gdt_data16_idx, gdt_npdata32_idx;
+
+static unsigned short GDT3(int idx)
+{
+       return (idx << 3) | 3;
+}
+
+static unsigned short LDT3(int idx)
+{
+       return (idx << 3) | 7;
+}
+
+/* Our sigaltstack scratch space. */
+static char altstack_data[SIGSTKSZ];
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+                      int flags)
+{
+       struct sigaction sa;
+       memset(&sa, 0, sizeof(sa));
+       sa.sa_sigaction = handler;
+       sa.sa_flags = SA_SIGINFO | flags;
+       sigemptyset(&sa.sa_mask);
+       if (sigaction(sig, &sa, 0))
+               err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+       struct sigaction sa;
+       memset(&sa, 0, sizeof(sa));
+       sa.sa_handler = SIG_DFL;
+       sigemptyset(&sa.sa_mask);
+       if (sigaction(sig, &sa, 0))
+               err(1, "sigaction");
+}
+
+static void add_ldt(const struct user_desc *desc, unsigned short *var,
+                   const char *name)
+{
+       if (syscall(SYS_modify_ldt, 1, desc, sizeof(*desc)) == 0) {
+               *var = LDT3(desc->entry_number);
+       } else {
+               printf("[NOTE]\tFailed to create %s segment\n", name);
+               *var = 0;
+       }
+}
+
+static void setup_ldt(void)
+{
+       if ((unsigned long)stack16 > (1ULL << 32) - sizeof(stack16))
+               errx(1, "stack16 is too high\n");
+       if ((unsigned long)int3 > (1ULL << 32) - sizeof(int3))
+               errx(1, "int3 is too high\n");
+
+       ldt_nonexistent_sel = LDT3(LDT_OFFSET + 2);
+
+       const struct user_desc code16_desc = {
+               .entry_number    = LDT_OFFSET + 0,
+               .base_addr       = (unsigned long)int3,
+               .limit           = 4095,
+               .seg_32bit       = 0,
+               .contents        = 2, /* Code, not conforming */
+               .read_exec_only  = 0,
+               .limit_in_pages  = 0,
+               .seg_not_present = 0,
+               .useable         = 0
+       };
+       add_ldt(&code16_desc, &code16_sel, "code16");
+
+       const struct user_desc data16_desc = {
+               .entry_number    = LDT_OFFSET + 1,
+               .base_addr       = (unsigned long)stack16,
+               .limit           = 0xffff,
+               .seg_32bit       = 0,
+               .contents        = 0, /* Data, grow-up */
+               .read_exec_only  = 0,
+               .limit_in_pages  = 0,
+               .seg_not_present = 0,
+               .useable         = 0
+       };
+       add_ldt(&data16_desc, &data16_sel, "data16");
+
+       const struct user_desc npcode32_desc = {
+               .entry_number    = LDT_OFFSET + 3,
+               .base_addr       = (unsigned long)int3,
+               .limit           = 4095,
+               .seg_32bit       = 1,
+               .contents        = 2, /* Code, not conforming */
+               .read_exec_only  = 0,
+               .limit_in_pages  = 0,
+               .seg_not_present = 1,
+               .useable         = 0
+       };
+       add_ldt(&npcode32_desc, &npcode32_sel, "npcode32");
+
+       const struct user_desc npdata32_desc = {
+               .entry_number    = LDT_OFFSET + 4,
+               .base_addr       = (unsigned long)stack16,
+               .limit           = 0xffff,
+               .seg_32bit       = 1,
+               .contents        = 0, /* Data, grow-up */
+               .read_exec_only  = 0,
+               .limit_in_pages  = 0,
+               .seg_not_present = 1,
+               .useable         = 0
+       };
+       add_ldt(&npdata32_desc, &npdata32_sel, "npdata32");
+
+       struct user_desc gdt_data16_desc = {
+               .entry_number    = -1,
+               .base_addr       = (unsigned long)stack16,
+               .limit           = 0xffff,
+               .seg_32bit       = 0,
+               .contents        = 0, /* Data, grow-up */
+               .read_exec_only  = 0,
+               .limit_in_pages  = 0,
+               .seg_not_present = 0,
+               .useable         = 0
+       };
+
+       if (syscall(SYS_set_thread_area, &gdt_data16_desc) == 0) {
+               /*
+                * This probably indicates vulnerability to CVE-2014-8133.
+                * Merely getting here isn't definitive, though, and we'll
+                * diagnose the problem for real later on.
+                */
+               printf("[WARN]\tset_thread_area allocated data16 at index %d\n",
+                      gdt_data16_desc.entry_number);
+               gdt_data16_idx = gdt_data16_desc.entry_number;
+       } else {
+               printf("[OK]\tset_thread_area refused 16-bit data\n");
+       }
+
+       struct user_desc gdt_npdata32_desc = {
+               .entry_number    = -1,
+               .base_addr       = (unsigned long)stack16,
+               .limit           = 0xffff,
+               .seg_32bit       = 1,
+               .contents        = 0, /* Data, grow-up */
+               .read_exec_only  = 0,
+               .limit_in_pages  = 0,
+               .seg_not_present = 1,
+               .useable         = 0
+       };
+
+       if (syscall(SYS_set_thread_area, &gdt_npdata32_desc) == 0) {
+               /*
+                * As a hardening measure, newer kernels don't allow this.
+                */
+               printf("[WARN]\tset_thread_area allocated npdata32 at index %d\n",
+                      gdt_npdata32_desc.entry_number);
+               gdt_npdata32_idx = gdt_npdata32_desc.entry_number;
+       } else {
+               printf("[OK]\tset_thread_area refused npdata32\n");
+       }
+}
+
+/* State used by our signal handlers. */
+static gregset_t initial_regs, requested_regs, resulting_regs;
+
+/* Instructions for the SIGUSR1 handler. */
+static volatile unsigned short sig_cs, sig_ss;
+static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
+
+/* Abstractions for some 32-bit vs 64-bit differences. */
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+# define REG_SP REG_RSP
+# define REG_AX REG_RAX
+
+struct selectors {
+       unsigned short cs, gs, fs, ss;
+};
+
+static unsigned short *ssptr(ucontext_t *ctx)
+{
+       struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
+       return &sels->ss;
+}
+
+static unsigned short *csptr(ucontext_t *ctx)
+{
+       struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
+       return &sels->cs;
+}
+#else
+# define REG_IP REG_EIP
+# define REG_SP REG_ESP
+# define REG_AX REG_EAX
+
+static greg_t *ssptr(ucontext_t *ctx)
+{
+       return &ctx->uc_mcontext.gregs[REG_SS];
+}
+
+static greg_t *csptr(ucontext_t *ctx)
+{
+       return &ctx->uc_mcontext.gregs[REG_CS];
+}
+#endif
+
+/* Number of errors in the current test case. */
+static volatile sig_atomic_t nerrs;
+
+/*
+ * SIGUSR1 handler.  Sets CS and SS as requested and points IP to the
+ * int3 trampoline.  Sets SP to a large known value so that we can see
+ * whether the value round-trips back to user mode correctly.
+ */
+static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
+{
+       ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+       memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+
+       *csptr(ctx) = sig_cs;
+       *ssptr(ctx) = sig_ss;
+
+       ctx->uc_mcontext.gregs[REG_IP] =
+               sig_cs == code16_sel ? 0 : (unsigned long)&int3;
+       ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
+       ctx->uc_mcontext.gregs[REG_AX] = 0;
+
+       memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+       requested_regs[REG_AX] = *ssptr(ctx);   /* The asm code does this. */
+
+       return;
+}
+
+/*
+ * Called after a successful sigreturn.  Restores our state so that
+ * the original raise(SIGUSR1) returns.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+       ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+       sig_err = ctx->uc_mcontext.gregs[REG_ERR];
+       sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
+
+       unsigned short ss;
+       asm ("mov %%ss,%0" : "=r" (ss));
+
+       greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX];
+       if (asm_ss != sig_ss && sig == SIGTRAP) {
+               /* Sanity check failure. */
+               printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n",
+                      ss, *ssptr(ctx), (unsigned long long)asm_ss);
+               nerrs++;
+       }
+
+       memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+       memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
+
+       sig_trapped = sig;
+}
+
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ */
+int cs_bitness(unsigned short cs)
+{
+       uint32_t valid = 0, ar;
+       asm ("lar %[cs], %[ar]\n\t"
+            "jnz 1f\n\t"
+            "mov $1, %[valid]\n\t"
+            "1:"
+            : [ar] "=r" (ar), [valid] "+rm" (valid)
+            : [cs] "r" (cs));
+
+       if (!valid)
+               return -1;
+
+       bool db = (ar & (1 << 22));
+       bool l = (ar & (1 << 21));
+
+       if (!(ar & (1<<11)))
+           return -1;  /* Not code. */
+
+       if (l && !db)
+               return 64;
+       else if (!l && db)
+               return 32;
+       else if (!l && !db)
+               return 16;
+       else
+               return -1;      /* Unknown bitness. */
+}
+
+/* Finds a usable code segment of the requested bitness. */
+int find_cs(int bitness)
+{
+       unsigned short my_cs;
+
+       asm ("mov %%cs,%0" :  "=r" (my_cs));
+
+       if (cs_bitness(my_cs) == bitness)
+               return my_cs;
+       if (cs_bitness(my_cs + (2 << 3)) == bitness)
+               return my_cs + (2 << 3);
+       if (my_cs > (2<<3) && cs_bitness(my_cs - (2 << 3)) == bitness)
+           return my_cs - (2 << 3);
+       if (cs_bitness(code16_sel) == bitness)
+               return code16_sel;
+
+       printf("[WARN]\tCould not find %d-bit CS\n", bitness);
+       return -1;
+}
+
+static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss)
+{
+       int cs = find_cs(cs_bits);
+       if (cs == -1) {
+               printf("[SKIP]\tCode segment unavailable for %d-bit CS, %d-bit SS\n",
+                      cs_bits, use_16bit_ss ? 16 : 32);
+               return 0;
+       }
+
+       if (force_ss != -1) {
+               sig_ss = force_ss;
+       } else {
+               if (use_16bit_ss) {
+                       if (!data16_sel) {
+                               printf("[SKIP]\tData segment unavailable for %d-bit CS, 16-bit SS\n",
+                                      cs_bits);
+                               return 0;
+                       }
+                       sig_ss = data16_sel;
+               } else {
+                       asm volatile ("mov %%ss,%0" : "=r" (sig_ss));
+               }
+       }
+
+       sig_cs = cs;
+
+       printf("[RUN]\tValid sigreturn: %d-bit CS (%hx), %d-bit SS (%hx%s)\n",
+              cs_bits, sig_cs, use_16bit_ss ? 16 : 32, sig_ss,
+              (sig_ss & 4) ? "" : ", GDT");
+
+       raise(SIGUSR1);
+
+       nerrs = 0;
+
+       /*
+        * Check that each register had an acceptable value when the
+        * int3 trampoline was invoked.
+        */
+       for (int i = 0; i < NGREG; i++) {
+               greg_t req = requested_regs[i], res = resulting_regs[i];
+               if (i == REG_TRAPNO || i == REG_IP)
+                       continue;       /* don't care */
+               if (i == REG_SP) {
+                       printf("\tSP: %llx -> %llx\n", (unsigned long long)req,
+                              (unsigned long long)res);
+
+                       /*
+                        * In many circumstances, the high 32 bits of rsp
+                        * are zeroed.  For example, we could be a real
+                        * 32-bit program, or we could hit any of a number
+                        * of poorly-documented IRET or segmented ESP
+                        * oddities.  If this happens, it's okay.
+                        */
+                       if (res == (req & 0xFFFFFFFF))
+                               continue;  /* OK; not expected to work */
+               }
+
+               bool ignore_reg = false;
+#if __i386__
+               if (i == REG_UESP)
+                       ignore_reg = true;
+#else
+               if (i == REG_CSGSFS) {
+                       struct selectors *req_sels =
+                               (void *)&requested_regs[REG_CSGSFS];
+                       struct selectors *res_sels =
+                               (void *)&resulting_regs[REG_CSGSFS];
+                       if (req_sels->cs != res_sels->cs) {
+                               printf("[FAIL]\tCS mismatch: requested 0x%hx; got 0x%hx\n",
+                                      req_sels->cs, res_sels->cs);
+                               nerrs++;
+                       }
+
+                       if (req_sels->ss != res_sels->ss) {
+                               printf("[FAIL]\tSS mismatch: requested 0x%hx; got 0x%hx\n",
+                                      req_sels->ss, res_sels->ss);
+                               nerrs++;
+                       }
+
+                       continue;
+               }
+#endif
+
+               /* Sanity check on the kernel */
+               if (i == REG_AX && requested_regs[i] != resulting_regs[i]) {
+                       printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n",
+                              (unsigned long long)requested_regs[i],
+                              (unsigned long long)resulting_regs[i]);
+                       nerrs++;
+                       continue;
+               }
+
+               if (requested_regs[i] != resulting_regs[i] && !ignore_reg) {
+                       /*
+                        * SP is particularly interesting here.  The
+                        * usual cause of failures is that we hit the
+                        * nasty IRET case of returning to a 16-bit SS,
+                        * in which case bits 16:31 of the *kernel*
+                        * stack pointer persist in ESP.
+                        */
+                       printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n",
+                              i, (unsigned long long)requested_regs[i],
+                              (unsigned long long)resulting_regs[i]);
+                       nerrs++;
+               }
+       }
+
+       if (nerrs == 0)
+               printf("[OK]\tall registers okay\n");
+
+       return nerrs;
+}
+
+static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
+{
+       int cs = force_cs == -1 ? find_cs(cs_bits) : force_cs;
+       if (cs == -1)
+               return 0;
+
+       sig_cs = cs;
+       sig_ss = ss;
+
+       printf("[RUN]\t%d-bit CS (%hx), bogus SS (%hx)\n",
+              cs_bits, sig_cs, sig_ss);
+
+       sig_trapped = 0;
+       raise(SIGUSR1);
+       if (sig_trapped) {
+               char errdesc[32] = "";
+               if (sig_err) {
+                       const char *src = (sig_err & 1) ? " EXT" : "";
+                       const char *table;
+                       if ((sig_err & 0x6) == 0x0)
+                               table = "GDT";
+                       else if ((sig_err & 0x6) == 0x4)
+                               table = "LDT";
+                       else if ((sig_err & 0x6) == 0x2)
+                               table = "IDT";
+                       else
+                               table = "???";
+
+                       sprintf(errdesc, "%s%s index %d, ",
+                               table, src, sig_err >> 3);
+               }
+
+               char trapname[32];
+               if (sig_trapno == 13)
+                       strcpy(trapname, "GP");
+               else if (sig_trapno == 11)
+                       strcpy(trapname, "NP");
+               else if (sig_trapno == 12)
+                       strcpy(trapname, "SS");
+               else if (sig_trapno == 32)
+                       strcpy(trapname, "IRET");  /* X86_TRAP_IRET */
+               else
+                       sprintf(trapname, "%d", sig_trapno);
+
+               printf("[OK]\tGot #%s(0x%lx) (i.e. %s%s)\n",
+                      trapname, (unsigned long)sig_err,
+                      errdesc, strsignal(sig_trapped));
+               return 0;
+       } else {
+               printf("[FAIL]\tDid not get SIGSEGV\n");
+               return 1;
+       }
+}
+
+int main()
+{
+       int total_nerrs = 0;
+       unsigned short my_cs, my_ss;
+
+       asm volatile ("mov %%cs,%0" : "=r" (my_cs));
+       asm volatile ("mov %%ss,%0" : "=r" (my_ss));
+       setup_ldt();
+
+       stack_t stack = {
+               .ss_sp = altstack_data,
+               .ss_size = SIGSTKSZ,
+       };
+       if (sigaltstack(&stack, NULL) != 0)
+               err(1, "sigaltstack");
+
+       sethandler(SIGUSR1, sigusr1, 0);
+       sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+       /* Easy cases: return to a 32-bit SS in each possible CS bitness. */
+       total_nerrs += test_valid_sigreturn(64, false, -1);
+       total_nerrs += test_valid_sigreturn(32, false, -1);
+       total_nerrs += test_valid_sigreturn(16, false, -1);
+
+       /*
+        * Test easy espfix cases: return to a 16-bit LDT SS in each possible
+        * CS bitness.  NB: with a long mode CS, the SS bitness is irrelevant.
+        *
+        * This catches the original missing-espfix-on-64-bit-kernels issue
+        * as well as CVE-2014-8134.
+        */
+       total_nerrs += test_valid_sigreturn(64, true, -1);
+       total_nerrs += test_valid_sigreturn(32, true, -1);
+       total_nerrs += test_valid_sigreturn(16, true, -1);
+
+       if (gdt_data16_idx) {
+               /*
+                * For performance reasons, Linux skips espfix if SS points
+                * to the GDT.  If we were able to allocate a 16-bit SS in
+                * the GDT, see if it leaks parts of the kernel stack pointer.
+                *
+                * This tests for CVE-2014-8133.
+                */
+               total_nerrs += test_valid_sigreturn(64, true,
+                                                   GDT3(gdt_data16_idx));
+               total_nerrs += test_valid_sigreturn(32, true,
+                                                   GDT3(gdt_data16_idx));
+               total_nerrs += test_valid_sigreturn(16, true,
+                                                   GDT3(gdt_data16_idx));
+       }
+
+       /*
+        * We're done testing valid sigreturn cases.  Now we test states
+        * for which sigreturn itself will succeed but the subsequent
+        * entry to user mode will fail.
+        *
+        * Depending on the failure mode and the kernel bitness, these
+        * entry failures can generate SIGSEGV, SIGBUS, or SIGILL.
+        */
+       clearhandler(SIGTRAP);
+       sethandler(SIGSEGV, sigtrap, SA_ONSTACK);
+       sethandler(SIGBUS, sigtrap, SA_ONSTACK);
+       sethandler(SIGILL, sigtrap, SA_ONSTACK);  /* 32-bit kernels do this */
+
+       /* Easy failures: invalid SS, resulting in #GP(0) */
+       test_bad_iret(64, ldt_nonexistent_sel, -1);
+       test_bad_iret(32, ldt_nonexistent_sel, -1);
+       test_bad_iret(16, ldt_nonexistent_sel, -1);
+
+       /* These fail because SS isn't a data segment, resulting in #GP(SS) */
+       test_bad_iret(64, my_cs, -1);
+       test_bad_iret(32, my_cs, -1);
+       test_bad_iret(16, my_cs, -1);
+
+       /* Try to return to a not-present code segment, triggering #NP(SS). */
+       test_bad_iret(32, my_ss, npcode32_sel);
+
+       /*
+        * Try to return to a not-present but otherwise valid data segment.
+        * This will cause IRET to fail with #SS on the espfix stack.  This
+        * exercises CVE-2014-9322.
+        *
+        * Note that, if espfix is enabled, 64-bit Linux will lose track
+        * of the actual cause of failure and report #GP(0) instead.
+        * This would be very difficult for Linux to avoid, because
+        * espfix64 causes IRET failures to be promoted to #DF, so the
+        * original exception frame is never pushed onto the stack.
+        */
+       test_bad_iret(32, npdata32_sel, -1);
+
+       /*
+        * Try to return to a not-present but otherwise valid data
+        * segment without invoking espfix.  Newer kernels don't allow
+        * this to happen in the first place.  On older kernels, though,
+        * this can trigger CVE-2014-9322.
+        */
+       if (gdt_npdata32_idx)
+               test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
+
+       return total_nerrs ? 1 : 0;
+}
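
The mechanism described in the header comment above (raise a signal, edit the saved ucontext, and let the implicit sigreturn(2) install the new register state) can be seen in isolation in the following sketch. It is x86_64-only, it only reads the saved CS/SS rather than rewriting them, and it uses the same REG_CSGSFS packing as the test; treat it as an illustration, not part of the patch:

/* Sketch (x86_64 Linux): inspect the saved segment selectors inside a
 * signal handler.  sigreturn.c goes further and overwrites these fields
 * (and REG_RIP) before returning, so that sigreturn(2) installs them.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

static void handler(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *ctx = ctx_void;
	/* CS, GS, FS and SS are packed into the REG_CSGSFS greg. */
	unsigned short *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];

	printf("saved CS=0x%hx SS=0x%hx\n", sels[0], sels[3]);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}
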
diff --git a/tools/testing/selftests/x86/trivial_32bit_program.c b/tools/testing/selftests/x86/trivial_32bit_program.c
new file mode 100644 (file)
index 0000000..2e231be
--- /dev/null
@@ -0,0 +1,14 @@
+/*
+ * Trivial program to check that we have a valid 32-bit build environment.
+ * Copyright (c) 2015 Andy Lutomirski
+ * GPL v2
+ */
+
+#include <stdio.h>
+
+int main()
+{
+       printf("\n");
+
+       return 0;
+}
index 6e54f3542126b189be45d2bce32b009721a9d3ab..98c95f2fcba4a63912fb81fbafd3854b08835e00 100644 (file)
@@ -85,13 +85,22 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
+/*
+ * Work function for handling the backup timer that we schedule when a vcpu is
+ * no longer running, but had a timer programmed to fire in the future.
+ */
 static void kvm_timer_inject_irq_work(struct work_struct *work)
 {
        struct kvm_vcpu *vcpu;
 
        vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
        vcpu->arch.timer_cpu.armed = false;
-       kvm_timer_inject_irq(vcpu);
+
+       /*
+        * If the vcpu is blocked we want to wake it up so that it will see
+        * the timer has expired when entering the guest.
+        */
+       kvm_vcpu_kick(vcpu);
 }
 
 static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
@@ -102,6 +111,21 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
        return HRTIMER_NORESTART;
 }
 
+bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
+{
+       struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       cycle_t cval, now;
+
+       if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
+               !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
+               return false;
+
+       cval = timer->cntv_cval;
+       now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
+
+       return cval <= now;
+}
+
 /**
  * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
  * @vcpu: The vcpu pointer
@@ -119,6 +143,13 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
         * populate the CPU timer again.
         */
        timer_disarm(timer);
+
+       /*
+        * If the timer expired while we were not scheduled, now is the time
+        * to inject it.
+        */
+       if (kvm_timer_should_fire(vcpu))
+               kvm_timer_inject_irq(vcpu);
 }
 
 /**
@@ -134,16 +165,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
        cycle_t cval, now;
        u64 ns;
 
-       if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-               !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
-               return;
-
-       cval = timer->cntv_cval;
-       now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
-
        BUG_ON(timer_is_armed(timer));
 
-       if (cval <= now) {
+       if (kvm_timer_should_fire(vcpu)) {
                /*
                 * Timer has already expired while we were not
                 * looking. Inject the interrupt and carry on.
@@ -152,6 +176,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
                return;
        }
 
+       cval = timer->cntv_cval;
+       now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
+
        ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
                                 &timecounter->frac);
        timer_arm(timer, ns);
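
The hunk above factors the "has the guest's virtual timer already fired" test into kvm_timer_should_fire() and uses it both when arming the backup hrtimer and when entering the guest. The predicate itself is simple: the timer must be enabled, its output must not be masked, and the compare value must have been reached. A standalone sketch with illustrative names and bit definitions:

/* Sketch: the fire condition for an ARM generic-timer-style comparator.
 * Bit layout mirrors CNTV_CTL (ENABLE in bit 0, IMASK in bit 1), but the
 * names below are stand-ins, not the kernel's.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VT_CTRL_ENABLE  (1u << 0)
#define VT_CTRL_IT_MASK (1u << 1)

struct vtimer {
	uint32_t ctl;	/* control: enable + interrupt-mask bits */
	uint64_t cval;	/* compare value */
};

static bool vtimer_should_fire(const struct vtimer *t, uint64_t now)
{
	if ((t->ctl & VT_CTRL_IT_MASK) || !(t->ctl & VT_CTRL_ENABLE))
		return false;
	return t->cval <= now;
}

int main(void)
{
	struct vtimer t = { .ctl = VT_CTRL_ENABLE, .cval = 1000 };

	/* prints "0 1": not yet expired at 999, expired at 1000 */
	printf("%d %d\n", vtimer_should_fire(&t, 999), vtimer_should_fire(&t, 1000));
	return 0;
}
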
index 19c6210f02cf5c2003b96a3ecbb198321114a2a6..13907970d11c3a94b8dc0a5b1848973035cc41cf 100644 (file)
@@ -107,6 +107,22 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
                                             vcpu->vcpu_id);
 }
 
+static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
+                                      struct kvm_exit_mmio *mmio,
+                                      phys_addr_t offset)
+{
+       return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
+                                         vcpu->vcpu_id);
+}
+
+static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
+                                        struct kvm_exit_mmio *mmio,
+                                        phys_addr_t offset)
+{
+       return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
+                                           vcpu->vcpu_id);
+}
+
 static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
                                     struct kvm_exit_mmio *mmio,
                                     phys_addr_t offset)
@@ -303,7 +319,7 @@ static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
                return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
 }
 
-static const struct kvm_mmio_range vgic_dist_ranges[] = {
+static const struct vgic_io_range vgic_dist_ranges[] = {
        {
                .base           = GIC_DIST_CTRL,
                .len            = 12,
@@ -344,13 +360,13 @@ static const struct kvm_mmio_range vgic_dist_ranges[] = {
                .base           = GIC_DIST_ACTIVE_SET,
                .len            = VGIC_MAX_IRQS / 8,
                .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
+               .handle_mmio    = handle_mmio_set_active_reg,
        },
        {
                .base           = GIC_DIST_ACTIVE_CLEAR,
                .len            = VGIC_MAX_IRQS / 8,
                .bits_per_irq   = 1,
-               .handle_mmio    = handle_mmio_raz_wi,
+               .handle_mmio    = handle_mmio_clear_active_reg,
        },
        {
                .base           = GIC_DIST_PRI,
@@ -388,24 +404,6 @@ static const struct kvm_mmio_range vgic_dist_ranges[] = {
        {}
 };
 
-static bool vgic_v2_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                               struct kvm_exit_mmio *mmio)
-{
-       unsigned long base = vcpu->kvm->arch.vgic.vgic_dist_base;
-
-       if (!is_in_range(mmio->phys_addr, mmio->len, base,
-                        KVM_VGIC_V2_DIST_SIZE))
-               return false;
-
-       /* GICv2 does not support accesses wider than 32 bits */
-       if (mmio->len > 4) {
-               kvm_inject_dabt(vcpu, mmio->phys_addr);
-               return true;
-       }
-
-       return vgic_handle_mmio_range(vcpu, run, mmio, vgic_dist_ranges, base);
-}
-
 static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
 {
        struct kvm *kvm = vcpu->kvm;
@@ -490,6 +488,7 @@ static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
 static int vgic_v2_map_resources(struct kvm *kvm,
                                 const struct vgic_params *params)
 {
+       struct vgic_dist *dist = &kvm->arch.vgic;
        int ret = 0;
 
        if (!irqchip_in_kernel(kvm))
@@ -500,13 +499,17 @@ static int vgic_v2_map_resources(struct kvm *kvm,
        if (vgic_ready(kvm))
                goto out;
 
-       if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) ||
-           IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) {
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
                kvm_err("Need to set vgic cpu and dist addresses first\n");
                ret = -ENXIO;
                goto out;
        }
 
+       vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
+                                KVM_VGIC_V2_DIST_SIZE,
+                                vgic_dist_ranges, -1, &dist->dist_iodev);
+
        /*
         * Initialize the vgic if this hasn't already been done on demand by
         * accessing the vgic state from userspace.
@@ -514,18 +517,23 @@ static int vgic_v2_map_resources(struct kvm *kvm,
        ret = vgic_init(kvm);
        if (ret) {
                kvm_err("Unable to allocate maps\n");
-               goto out;
+               goto out_unregister;
        }
 
-       ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base,
+       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
                                    params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
                                    true);
        if (ret) {
                kvm_err("Unable to remap VGIC CPU to VCPU\n");
-               goto out;
+               goto out_unregister;
        }
 
-       kvm->arch.vgic.ready = true;
+       dist->ready = true;
+       goto out;
+
+out_unregister:
+       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
+
 out:
        if (ret)
                kvm_vgic_destroy(kvm);
@@ -554,7 +562,6 @@ void vgic_v2_init_emulation(struct kvm *kvm)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
 
-       dist->vm_ops.handle_mmio = vgic_v2_handle_mmio;
        dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
        dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
        dist->vm_ops.init_model = vgic_v2_init_model;
@@ -631,7 +638,7 @@ static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
  * CPU Interface Register accesses - these are not accessed by the VM, but by
  * user space for saving and restoring VGIC state.
  */
-static const struct kvm_mmio_range vgic_cpu_ranges[] = {
+static const struct vgic_io_range vgic_cpu_ranges[] = {
        {
                .base           = GIC_CPU_CTRL,
                .len            = 12,
@@ -658,12 +665,13 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
                                 struct kvm_device_attr *attr,
                                 u32 *reg, bool is_write)
 {
-       const struct kvm_mmio_range *r = NULL, *ranges;
+       const struct vgic_io_range *r = NULL, *ranges;
        phys_addr_t offset;
        int ret, cpuid, c;
        struct kvm_vcpu *vcpu, *tmp_vcpu;
        struct vgic_dist *vgic;
        struct kvm_exit_mmio mmio;
+       u32 data;
 
        offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
        cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
@@ -685,6 +693,7 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
 
        mmio.len = 4;
        mmio.is_write = is_write;
+       mmio.data = &data;
        if (is_write)
                mmio_data_write(&mmio, ~0, *reg);
        switch (attr->group) {
@@ -699,7 +708,7 @@ static int vgic_attr_regs_access(struct kvm_device *dev,
        default:
                BUG();
        }
-       r = vgic_find_range(ranges, &mmio, offset);
+       r = vgic_find_range(ranges, 4, offset);
 
        if (unlikely(!r || !r->handle_mmio)) {
                ret = -ENXIO;
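
Both the distributor tables above and the userspace save/restore path now go through vgic_find_range(), which resolves an access offset against a table of { base, len, handler } entries terminated by an empty element. A generic sketch of that lookup shape, with illustrative names and contents:

/* Sketch: find the register range covering an MMIO access of the given
 * length at the given offset.  The table layout mirrors vgic_io_range,
 * but the entries here are made up for illustration.
 */
#include <stdio.h>

struct io_range {
	unsigned long base;
	unsigned long len;
	const char *name;	/* stands in for the handle_mmio callback */
};

static const struct io_range ranges[] = {
	{ 0x000, 0x004, "CTRL" },
	{ 0x100, 0x080, "ENABLE_SET" },
	{ 0x400, 0x400, "PRIORITY" },
	{ 0, 0, NULL },		/* empty entry terminates the table */
};

static const struct io_range *find_range(const struct io_range *r,
					 unsigned long len,
					 unsigned long offset)
{
	for (; r->len; r++)
		if (offset >= r->base && offset + len <= r->base + r->len)
			return r;
	return NULL;
}

int main(void)
{
	const struct io_range *r = find_range(ranges, 4, 0x104);

	printf("%s\n", r ? r->name : "unhandled");	/* prints ENABLE_SET */
	return 0;
}
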
index b3f154631515eda6bccef2a6094757cf0f0b135f..e9c3a7a83833bf2ef058cfd407b92bafe20da073 100644 (file)
@@ -340,7 +340,7 @@ static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
        return false;
 }
 
-static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
+static const struct vgic_io_range vgic_v3_dist_ranges[] = {
        {
                .base           = GICD_CTLR,
                .len            = 0x04,
@@ -502,6 +502,43 @@ static const struct kvm_mmio_range vgic_v3_dist_ranges[] = {
        {},
 };
 
+static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
+                                   struct kvm_exit_mmio *mmio,
+                                   phys_addr_t offset)
+{
+       /* since we don't support LPIs, this register is zero for now */
+       vgic_reg_access(mmio, NULL, offset,
+                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
+       return false;
+}
+
+static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
+                                    struct kvm_exit_mmio *mmio,
+                                    phys_addr_t offset)
+{
+       u32 reg;
+       u64 mpidr;
+       struct kvm_vcpu *redist_vcpu = mmio->private;
+       int target_vcpu_id = redist_vcpu->vcpu_id;
+
+       /* the upper 32 bits contain the affinity value */
+       if ((offset & ~3) == 4) {
+               mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
+               reg = compress_mpidr(mpidr);
+
+               vgic_reg_access(mmio, &reg, offset,
+                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+               return false;
+       }
+
+       reg = redist_vcpu->vcpu_id << 8;
+       if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
+               reg |= GICR_TYPER_LAST;
+       vgic_reg_access(mmio, &reg, offset,
+                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
+       return false;
+}
+
 static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
                                              struct kvm_exit_mmio *mmio,
                                              phys_addr_t offset)
@@ -570,186 +607,107 @@ static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
        return vgic_handle_cfg_reg(reg, mmio, offset);
 }
 
-static const struct kvm_mmio_range vgic_redist_sgi_ranges[] = {
+#define SGI_base(x) ((x) + SZ_64K)
+
+static const struct vgic_io_range vgic_redist_ranges[] = {
+       {
+               .base           = GICR_CTLR,
+               .len            = 0x04,
+               .bits_per_irq   = 0,
+               .handle_mmio    = handle_mmio_ctlr_redist,
+       },
+       {
+               .base           = GICR_TYPER,
+               .len            = 0x08,
+               .bits_per_irq   = 0,
+               .handle_mmio    = handle_mmio_typer_redist,
+       },
+       {
+               .base           = GICR_IIDR,
+               .len            = 0x04,
+               .bits_per_irq   = 0,
+               .handle_mmio    = handle_mmio_iidr,
+       },
+       {
+               .base           = GICR_WAKER,
+               .len            = 0x04,
+               .bits_per_irq   = 0,
+               .handle_mmio    = handle_mmio_raz_wi,
+       },
        {
-               .base           = GICR_IGROUPR0,
+               .base           = GICR_IDREGS,
+               .len            = 0x30,
+               .bits_per_irq   = 0,
+               .handle_mmio    = handle_mmio_idregs,
+       },
+       {
+               .base           = SGI_base(GICR_IGROUPR0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_rao_wi,
        },
        {
-               .base           = GICR_ISENABLER0,
+               .base           = SGI_base(GICR_ISENABLER0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_set_enable_reg_redist,
        },
        {
-               .base           = GICR_ICENABLER0,
+               .base           = SGI_base(GICR_ICENABLER0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_clear_enable_reg_redist,
        },
        {
-               .base           = GICR_ISPENDR0,
+               .base           = SGI_base(GICR_ISPENDR0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_set_pending_reg_redist,
        },
        {
-               .base           = GICR_ICPENDR0,
+               .base           = SGI_base(GICR_ICPENDR0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_clear_pending_reg_redist,
        },
        {
-               .base           = GICR_ISACTIVER0,
+               .base           = SGI_base(GICR_ISACTIVER0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_raz_wi,
        },
        {
-               .base           = GICR_ICACTIVER0,
+               .base           = SGI_base(GICR_ICACTIVER0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_raz_wi,
        },
        {
-               .base           = GICR_IPRIORITYR0,
+               .base           = SGI_base(GICR_IPRIORITYR0),
                .len            = 0x20,
                .bits_per_irq   = 8,
                .handle_mmio    = handle_mmio_priority_reg_redist,
        },
        {
-               .base           = GICR_ICFGR0,
+               .base           = SGI_base(GICR_ICFGR0),
                .len            = 0x08,
                .bits_per_irq   = 2,
                .handle_mmio    = handle_mmio_cfg_reg_redist,
        },
        {
-               .base           = GICR_IGRPMODR0,
+               .base           = SGI_base(GICR_IGRPMODR0),
                .len            = 0x04,
                .bits_per_irq   = 1,
                .handle_mmio    = handle_mmio_raz_wi,
        },
        {
-               .base           = GICR_NSACR,
+               .base           = SGI_base(GICR_NSACR),
                .len            = 0x04,
                .handle_mmio    = handle_mmio_raz_wi,
        },
        {},
 };
 
-static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
-                                   struct kvm_exit_mmio *mmio,
-                                   phys_addr_t offset)
-{
-       /* since we don't support LPIs, this register is zero for now */
-       vgic_reg_access(mmio, NULL, offset,
-                       ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
-                                    struct kvm_exit_mmio *mmio,
-                                    phys_addr_t offset)
-{
-       u32 reg;
-       u64 mpidr;
-       struct kvm_vcpu *redist_vcpu = mmio->private;
-       int target_vcpu_id = redist_vcpu->vcpu_id;
-
-       /* the upper 32 bits contain the affinity value */
-       if ((offset & ~3) == 4) {
-               mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
-               reg = compress_mpidr(mpidr);
-
-               vgic_reg_access(mmio, &reg, offset,
-                               ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-               return false;
-       }
-
-       reg = redist_vcpu->vcpu_id << 8;
-       if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
-               reg |= GICR_TYPER_LAST;
-       vgic_reg_access(mmio, &reg, offset,
-                       ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-       return false;
-}
-
-static const struct kvm_mmio_range vgic_redist_ranges[] = {
-       {
-               .base           = GICR_CTLR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_ctlr_redist,
-       },
-       {
-               .base           = GICR_TYPER,
-               .len            = 0x08,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_typer_redist,
-       },
-       {
-               .base           = GICR_IIDR,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_iidr,
-       },
-       {
-               .base           = GICR_WAKER,
-               .len            = 0x04,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_raz_wi,
-       },
-       {
-               .base           = GICR_IDREGS,
-               .len            = 0x30,
-               .bits_per_irq   = 0,
-               .handle_mmio    = handle_mmio_idregs,
-       },
-       {},
-};
-
-/*
- * This function splits accesses between the distributor and the two
- * redistributor parts (private/SPI). As each redistributor is accessible
- * from any CPU, we have to determine the affected VCPU by taking the faulting
- * address into account. We then pass this VCPU to the handler function via
- * the private parameter.
- */
-#define SGI_BASE_OFFSET SZ_64K
-static bool vgic_v3_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                               struct kvm_exit_mmio *mmio)
-{
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-       unsigned long dbase = dist->vgic_dist_base;
-       unsigned long rdbase = dist->vgic_redist_base;
-       int nrcpus = atomic_read(&vcpu->kvm->online_vcpus);
-       int vcpu_id;
-       const struct kvm_mmio_range *mmio_range;
-
-       if (is_in_range(mmio->phys_addr, mmio->len, dbase, GIC_V3_DIST_SIZE)) {
-               return vgic_handle_mmio_range(vcpu, run, mmio,
-                                             vgic_v3_dist_ranges, dbase);
-       }
-
-       if (!is_in_range(mmio->phys_addr, mmio->len, rdbase,
-           GIC_V3_REDIST_SIZE * nrcpus))
-               return false;
-
-       vcpu_id = (mmio->phys_addr - rdbase) / GIC_V3_REDIST_SIZE;
-       rdbase += (vcpu_id * GIC_V3_REDIST_SIZE);
-       mmio->private = kvm_get_vcpu(vcpu->kvm, vcpu_id);
-
-       if (mmio->phys_addr >= rdbase + SGI_BASE_OFFSET) {
-               rdbase += SGI_BASE_OFFSET;
-               mmio_range = vgic_redist_sgi_ranges;
-       } else {
-               mmio_range = vgic_redist_ranges;
-       }
-       return vgic_handle_mmio_range(vcpu, run, mmio, mmio_range, rdbase);
-}
-
 static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
 {
        if (vgic_queue_irq(vcpu, 0, irq)) {
@@ -766,6 +724,9 @@ static int vgic_v3_map_resources(struct kvm *kvm,
 {
        int ret = 0;
        struct vgic_dist *dist = &kvm->arch.vgic;
+       gpa_t rdbase = dist->vgic_redist_base;
+       struct vgic_io_device *iodevs = NULL;
+       int i;
 
        if (!irqchip_in_kernel(kvm))
                return 0;
@@ -791,7 +752,41 @@ static int vgic_v3_map_resources(struct kvm *kvm,
                goto out;
        }
 
-       kvm->arch.vgic.ready = true;
+       ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
+                                      GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
+                                      -1, &dist->dist_iodev);
+       if (ret)
+               goto out;
+
+       iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
+       if (!iodevs) {
+               ret = -ENOMEM;
+               goto out_unregister;
+       }
+
+       for (i = 0; i < dist->nr_cpus; i++) {
+               ret = vgic_register_kvm_io_dev(kvm, rdbase,
+                                              SZ_128K, vgic_redist_ranges,
+                                              i, &iodevs[i]);
+               if (ret)
+                       goto out_unregister;
+               rdbase += GIC_V3_REDIST_SIZE;
+       }
+
+       dist->redist_iodevs = iodevs;
+       dist->ready = true;
+       goto out;
+
+out_unregister:
+       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
+       if (iodevs) {
+               for (i = 0; i < dist->nr_cpus; i++) {
+                       if (iodevs[i].dev.ops)
+                               kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                         &iodevs[i].dev);
+               }
+       }
+
 out:
        if (ret)
                kvm_vgic_destroy(kvm);
@@ -832,7 +827,6 @@ void vgic_v3_init_emulation(struct kvm *kvm)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
 
-       dist->vm_ops.handle_mmio = vgic_v3_handle_mmio;
        dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
        dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
        dist->vm_ops.init_model = vgic_v3_init_model;
index c9f60f52458802f4a66a3912732d8665bc4a3e32..8d550ff14700c8a628b9c2c6ab55301f6bd0e92b 100644
@@ -31,6 +31,9 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_mmu.h>
+#include <trace/events/kvm.h>
+#include <asm/kvm.h>
+#include <kvm/iodev.h>
 
 /*
  * How the whole thing works (courtesy of Christoffer Dall):
@@ -263,6 +266,13 @@ static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
        return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
 }
 
+static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
+}
+
 static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -277,6 +287,20 @@ static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
        vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
 }
 
+static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
+}
+
+static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
+}
+
 static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -520,6 +544,44 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm,
        return false;
 }
 
+bool vgic_handle_set_active_reg(struct kvm *kvm,
+                               struct kvm_exit_mmio *mmio,
+                               phys_addr_t offset, int vcpu_id)
+{
+       u32 *reg;
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
+       vgic_reg_access(mmio, reg, offset,
+                       ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
+
+       if (mmio->is_write) {
+               vgic_update_state(kvm);
+               return true;
+       }
+
+       return false;
+}
+
+bool vgic_handle_clear_active_reg(struct kvm *kvm,
+                                 struct kvm_exit_mmio *mmio,
+                                 phys_addr_t offset, int vcpu_id)
+{
+       u32 *reg;
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
+       vgic_reg_access(mmio, reg, offset,
+                       ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
+
+       if (mmio->is_write) {
+               vgic_update_state(kvm);
+               return true;
+       }
+
+       return false;
+}
+
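vgic_handle_set_active_reg() and vgic_handle_clear_active_reg() give the GICD_ISACTIVERn/ICACTIVERn registers the usual write-1-to-set / write-1-to-clear behaviour on the new irq_active bitmap. A small self-contained model of that register semantics (irq_active, write_isactiver() and write_icactiver() are illustrative names only):

#include <stdint.h>
#include <stdio.h>

static uint32_t irq_active;     /* active bits for IRQs 0..31 */

/* A write of 1 to a bit sets (ISACTIVER) or clears (ICACTIVER) the
 * corresponding IRQ's active state; written zero bits are ignored. */
static void write_isactiver(uint32_t val) { irq_active |= val;  }
static void write_icactiver(uint32_t val) { irq_active &= ~val; }

int main(void)
{
        write_isactiver((1u << 5) | (1u << 9));   /* mark IRQs 5 and 9 active */
        write_icactiver(1u << 5);                 /* deactivate IRQ 5 */
        printf("active bitmap: 0x%08x\n", (unsigned)irq_active);  /* 0x00000200 */
        return 0;
}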
 static u32 vgic_cfg_expand(u16 val)
 {
        u32 res = 0;
@@ -588,16 +650,12 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
 }
 
 /**
- * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor
+ * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
  * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
  *
- * Move any pending IRQs that have already been assigned to LRs back to the
+ * Move any IRQs that have already been assigned to LRs back to the
  * emulated distributor state so that the complete emulated state can be read
  * from the main emulation structures without investigating the LRs.
- *
- * Note that IRQs in the active state in the LRs get their pending state moved
- * to the distributor but the active state stays in the LRs, because we don't
- * track the active state on the distributor side.
  */
 void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 {
@@ -613,12 +671,22 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
                 * 01: pending
                 * 10: active
                 * 11: pending and active
-                *
-                * If the LR holds only an active interrupt (not pending) then
-                * just leave it alone.
                 */
-               if ((lr.state & LR_STATE_MASK) == LR_STATE_ACTIVE)
-                       continue;
+               BUG_ON(!(lr.state & LR_STATE_MASK));
+
+               /* Reestablish SGI source for pending and active IRQs */
+               if (lr.irq < VGIC_NR_SGIS)
+                       add_sgi_source(vcpu, lr.irq, lr.source);
+
+               /*
+                * If the LR holds an active (10) or a pending and active (11)
+                * interrupt then move the active state to the
+                * distributor tracking bit.
+                */
+               if (lr.state & LR_STATE_ACTIVE) {
+                       vgic_irq_set_active(vcpu, lr.irq);
+                       lr.state &= ~LR_STATE_ACTIVE;
+               }
 
                /*
                 * Reestablish the pending state on the distributor and the
@@ -626,21 +694,19 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
                 * is fine, then we are only setting a few bits that were
                 * already set.
                 */
-               vgic_dist_irq_set_pending(vcpu, lr.irq);
-               if (lr.irq < VGIC_NR_SGIS)
-                       add_sgi_source(vcpu, lr.irq, lr.source);
-               lr.state &= ~LR_STATE_PENDING;
+               if (lr.state & LR_STATE_PENDING) {
+                       vgic_dist_irq_set_pending(vcpu, lr.irq);
+                       lr.state &= ~LR_STATE_PENDING;
+               }
+
                vgic_set_lr(vcpu, i, lr);
 
                /*
-                * If there's no state left on the LR (it could still be
-                * active), then the LR does not hold any useful info and can
-                * be marked as free for other use.
+                * Mark the LR as free for other use.
                 */
-               if (!(lr.state & LR_STATE_MASK)) {
-                       vgic_retire_lr(i, lr.irq, vcpu);
-                       vgic_irq_clear_queued(vcpu, lr.irq);
-               }
+               BUG_ON(lr.state & LR_STATE_MASK);
+               vgic_retire_lr(i, lr.irq, vcpu);
+               vgic_irq_clear_queued(vcpu, lr.irq);
 
                /* Finally update the VGIC state. */
                vgic_update_state(vcpu->kvm);
@@ -648,24 +714,21 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 }
 
 const
-struct kvm_mmio_range *vgic_find_range(const struct kvm_mmio_range *ranges,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset)
-{
-       const struct kvm_mmio_range *r = ranges;
-
-       while (r->len) {
-               if (offset >= r->base &&
-                   (offset + mmio->len) <= (r->base + r->len))
-                       return r;
-               r++;
+struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
+                                     int len, gpa_t offset)
+{
+       while (ranges->len) {
+               if (offset >= ranges->base &&
+                   (offset + len) <= (ranges->base + ranges->len))
+                       return ranges;
+               ranges++;
        }
 
        return NULL;
 }
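vgic_find_range() now takes the access length and offset directly instead of a kvm_exit_mmio, but the lookup itself is unchanged: walk a sentinel-terminated table and return the first window that fully contains the access. A stand-alone sketch of that lookup (demo_range and the example table are illustrative):

#include <stdio.h>

struct demo_range { unsigned long base, len; };

static const struct demo_range *find_range(const struct demo_range *r,
                                           int len, unsigned long offset)
{
        while (r->len) {                 /* table ends with a zero-length entry */
                if (offset >= r->base && offset + len <= r->base + r->len)
                        return r;
                r++;
        }
        return NULL;                     /* unhandled offset */
}

int main(void)
{
        static const struct demo_range table[] = {
                { 0x000, 0x10 }, { 0x100, 0x80 }, { 0, 0 } /* sentinel */
        };
        const struct demo_range *hit = find_range(table, 4, 0x104);

        printf("hit base: 0x%lx\n", hit ? hit->base : 0UL);   /* 0x100 */
        return 0;
}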
 
 static bool vgic_validate_access(const struct vgic_dist *dist,
-                                const struct kvm_mmio_range *range,
+                                const struct vgic_io_range *range,
                                 unsigned long offset)
 {
        int irq;
@@ -693,9 +756,8 @@ static bool vgic_validate_access(const struct vgic_dist *dist,
 static bool call_range_handler(struct kvm_vcpu *vcpu,
                               struct kvm_exit_mmio *mmio,
                               unsigned long offset,
-                              const struct kvm_mmio_range *range)
+                              const struct vgic_io_range *range)
 {
-       u32 *data32 = (void *)mmio->data;
        struct kvm_exit_mmio mmio32;
        bool ret;
 
@@ -712,91 +774,142 @@ static bool call_range_handler(struct kvm_vcpu *vcpu,
        mmio32.private = mmio->private;
 
        mmio32.phys_addr = mmio->phys_addr + 4;
-       if (mmio->is_write)
-               *(u32 *)mmio32.data = data32[1];
+       mmio32.data = &((u32 *)mmio->data)[1];
        ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
-       if (!mmio->is_write)
-               data32[1] = *(u32 *)mmio32.data;
 
        mmio32.phys_addr = mmio->phys_addr;
-       if (mmio->is_write)
-               *(u32 *)mmio32.data = data32[0];
+       mmio32.data = &((u32 *)mmio->data)[0];
        ret |= range->handle_mmio(vcpu, &mmio32, offset);
-       if (!mmio->is_write)
-               data32[0] = *(u32 *)mmio32.data;
 
        return ret;
 }
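call_range_handler() no longer copies data in and out for 64-bit accesses; it simply points each 32-bit sub-access at the upper and lower word of the caller's buffer. A user-space model of that split (handle32() and the sample payload are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for a 32-bit range handler that works directly on the
 * word it is given, for both reads and writes. */
static void handle32(uint32_t *data, unsigned offset)
{
        printf("32-bit access at offset %u, data %p\n", offset, (void *)data);
}

int main(void)
{
        /* a 64-bit MMIO payload viewed as two 32-bit words */
        uint32_t buf[2] = { 0x55667788u, 0x11223344u };

        handle32(&buf[1], 4);   /* upper word, original offset + 4 */
        handle32(&buf[0], 0);   /* lower word, original offset */
        return 0;
}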
 
 /**
- * vgic_handle_mmio_range - handle an in-kernel MMIO access
+ * vgic_handle_mmio_access - handle an in-kernel MMIO access
+ * This is called by the read/write KVM IO device wrappers below.
  * @vcpu:      pointer to the vcpu performing the access
- * @run:       pointer to the kvm_run structure
- * @mmio:      pointer to the data describing the access
- * @ranges:    array of MMIO ranges in a given region
- * @mmio_base: base address of that region
+ * @this:      pointer to the KVM IO device in charge
+ * @addr:      guest physical address of the access
+ * @len:       size of the access
+ * @val:       pointer to the data region
+ * @is_write:  read or write access
  *
 * returns 0 if the MMIO access could be performed, a negative error code otherwise
  */
-bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                           struct kvm_exit_mmio *mmio,
-                           const struct kvm_mmio_range *ranges,
-                           unsigned long mmio_base)
+static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
+                                  struct kvm_io_device *this, gpa_t addr,
+                                  int len, void *val, bool is_write)
 {
-       const struct kvm_mmio_range *range;
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct vgic_io_device *iodev = container_of(this,
+                                                   struct vgic_io_device, dev);
+       struct kvm_run *run = vcpu->run;
+       const struct vgic_io_range *range;
+       struct kvm_exit_mmio mmio;
        bool updated_state;
-       unsigned long offset;
+       gpa_t offset;
 
-       offset = mmio->phys_addr - mmio_base;
-       range = vgic_find_range(ranges, mmio, offset);
+       offset = addr - iodev->addr;
+       range = vgic_find_range(iodev->reg_ranges, len, offset);
        if (unlikely(!range || !range->handle_mmio)) {
-               pr_warn("Unhandled access %d %08llx %d\n",
-                       mmio->is_write, mmio->phys_addr, mmio->len);
-               return false;
+               pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
+               return -ENXIO;
        }
 
-       spin_lock(&vcpu->kvm->arch.vgic.lock);
+       mmio.phys_addr = addr;
+       mmio.len = len;
+       mmio.is_write = is_write;
+       mmio.data = val;
+       mmio.private = iodev->redist_vcpu;
+
+       spin_lock(&dist->lock);
        offset -= range->base;
        if (vgic_validate_access(dist, range, offset)) {
-               updated_state = call_range_handler(vcpu, mmio, offset, range);
+               updated_state = call_range_handler(vcpu, &mmio, offset, range);
        } else {
-               if (!mmio->is_write)
-                       memset(mmio->data, 0, mmio->len);
+               if (!is_write)
+                       memset(val, 0, len);
                updated_state = false;
        }
-       spin_unlock(&vcpu->kvm->arch.vgic.lock);
-       kvm_prepare_mmio(run, mmio);
+       spin_unlock(&dist->lock);
+       run->mmio.is_write      = is_write;
+       run->mmio.len           = len;
+       run->mmio.phys_addr     = addr;
+       memcpy(run->mmio.data, val, len);
+
        kvm_handle_mmio_return(vcpu, run);
 
        if (updated_state)
                vgic_kick_vcpus(vcpu->kvm);
 
-       return true;
+       return 0;
+}
+
+static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
+                                struct kvm_io_device *this,
+                                gpa_t addr, int len, void *val)
+{
+       return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
 }
 
+static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
+                                 struct kvm_io_device *this,
+                                 gpa_t addr, int len, const void *val)
+{
+       return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
+                                      true);
+}
+
+struct kvm_io_device_ops vgic_io_ops = {
+       .read   = vgic_handle_mmio_read,
+       .write  = vgic_handle_mmio_write,
+};
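Because kvm_io_device_ops callbacks only receive the embedded kvm_io_device pointer, vgic_handle_mmio_access() recovers the surrounding vgic_io_device with container_of() to get at the frame's base address, ranges and redistributor VCPU. A minimal user-space model of that dispatch (demo_dev, demo_io_device and handle() are illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_dev { int dummy; };

struct demo_io_device {
        unsigned long base;          /* guest base address of the frame */
        struct demo_dev dev;         /* what the bus actually stores */
};

static void handle(struct demo_dev *this, unsigned long addr)
{
        struct demo_io_device *iodev =
                container_of(this, struct demo_io_device, dev);

        printf("offset in frame: %lu\n", addr - iodev->base);
}

int main(void)
{
        struct demo_io_device d = { .base = 0x8000 };

        handle(&d.dev, 0x8010);      /* prints 16 */
        return 0;
}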
+
 /**
- * vgic_handle_mmio - handle an in-kernel MMIO access for the GIC emulation
- * @vcpu:      pointer to the vcpu performing the access
- * @run:       pointer to the kvm_run structure
- * @mmio:      pointer to the data describing the access
+ * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
+ * @kvm:            The VM structure pointer
+ * @base:           The (guest) base address for the register frame
+ * @len:            Length of the register frame window
+ * @ranges:         Array describing the handler functions for each register
+ * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
+ * @iodev:          Points to memory to be passed on to the handler
  *
- * returns true if the MMIO access has been performed in kernel space,
- * and false if it needs to be emulated in user space.
- * Calls the actual handling routine for the selected VGIC model.
+ * @iodev stores the parameters of this function so that the handler and the
+ * dispatcher function can use them (the KVM I/O bus framework lacks an
+ * opaque parameter). Initialization is done in this function, but the
+ * reference must stay valid and unique for the whole VGIC lifetime.
+ * If the register frame is not mapped for a specific VCPU, pass -1 to
+ * @redist_vcpu_id.
  */
-bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                     struct kvm_exit_mmio *mmio)
+int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
+                            const struct vgic_io_range *ranges,
+                            int redist_vcpu_id,
+                            struct vgic_io_device *iodev)
 {
-       if (!irqchip_in_kernel(vcpu->kvm))
-               return false;
+       struct kvm_vcpu *vcpu = NULL;
+       int ret;
 
-       /*
-        * This will currently call either vgic_v2_handle_mmio() or
-        * vgic_v3_handle_mmio(), which in turn will call
-        * vgic_handle_mmio_range() defined above.
-        */
-       return vcpu->kvm->arch.vgic.vm_ops.handle_mmio(vcpu, run, mmio);
+       if (redist_vcpu_id >= 0)
+               vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
+
+       iodev->addr             = base;
+       iodev->len              = len;
+       iodev->reg_ranges       = ranges;
+       iodev->redist_vcpu      = vcpu;
+
+       kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
+
+       mutex_lock(&kvm->slots_lock);
+
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
+                                     &iodev->dev);
+       mutex_unlock(&kvm->slots_lock);
+
+       /* Mark the iodev as invalid if registration fails. */
+       if (ret)
+               iodev->dev.ops = NULL;
+
+       return ret;
 }
 
 static int vgic_nr_shared_irqs(struct vgic_dist *dist)
@@ -804,6 +917,36 @@ static int vgic_nr_shared_irqs(struct vgic_dist *dist)
        return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
 }
 
+static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       unsigned long *active, *enabled, *act_percpu, *act_shared;
+       unsigned long active_private, active_shared;
+       int nr_shared = vgic_nr_shared_irqs(dist);
+       int vcpu_id;
+
+       vcpu_id = vcpu->vcpu_id;
+       act_percpu = vcpu->arch.vgic_cpu.active_percpu;
+       act_shared = vcpu->arch.vgic_cpu.active_shared;
+
+       active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
+       enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
+       bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
+
+       active = vgic_bitmap_get_shared_map(&dist->irq_active);
+       enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
+       bitmap_and(act_shared, active, enabled, nr_shared);
+       bitmap_and(act_shared, act_shared,
+                  vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
+                  nr_shared);
+
+       active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
+       active_shared = find_first_bit(act_shared, nr_shared);
+
+       return (active_private < VGIC_NR_PRIVATE_IRQS ||
+               active_shared < nr_shared);
+}
+
 static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -835,7 +978,7 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
 
 /*
  * Update the interrupt state and determine which CPUs have pending
- * interrupts. Must be called with distributor lock held.
+ * or active interrupts. Must be called with distributor lock held.
  */
 void vgic_update_state(struct kvm *kvm)
 {
@@ -849,10 +992,13 @@ void vgic_update_state(struct kvm *kvm)
        }
 
        kvm_for_each_vcpu(c, vcpu, kvm) {
-               if (compute_pending_for_cpu(vcpu)) {
-                       pr_debug("CPU%d has pending interrupts\n", c);
+               if (compute_pending_for_cpu(vcpu))
                        set_bit(c, dist->irq_pending_on_cpu);
-               }
+
+               if (compute_active_for_cpu(vcpu))
+                       set_bit(c, dist->irq_active_on_cpu);
+               else
+                       clear_bit(c, dist->irq_active_on_cpu);
        }
 }
 
@@ -955,6 +1101,26 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
        }
 }
 
+static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
+                                int lr_nr, struct vgic_lr vlr)
+{
+       if (vgic_irq_is_active(vcpu, irq)) {
+               vlr.state |= LR_STATE_ACTIVE;
+               kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
+               vgic_irq_clear_active(vcpu, irq);
+               vgic_update_state(vcpu->kvm);
+       } else if (vgic_dist_irq_is_pending(vcpu, irq)) {
+               vlr.state |= LR_STATE_PENDING;
+               kvm_debug("Set pending: 0x%x\n", vlr.state);
+       }
+
+       if (!vgic_irq_is_edge(vcpu, irq))
+               vlr.state |= LR_EOI_INT;
+
+       vgic_set_lr(vcpu, lr_nr, vlr);
+       vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
+}
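vgic_queue_irq_to_lr() centralises how a list register is populated: a previously active interrupt has its active bit restored first, otherwise a pending interrupt gets the pending bit, and level-triggered interrupts additionally request an EOI maintenance interrupt. A tiny model of the active-vs-pending decision (the bit values follow the 2-bit encoding quoted in the comments above; queue_to_lr() is an illustrative name):

#include <stdint.h>
#include <stdio.h>

#define LR_STATE_PENDING  (1u << 0)   /* 01: pending */
#define LR_STATE_ACTIVE   (1u << 1)   /* 10: active  */

static uint32_t queue_to_lr(int is_active, int is_pending)
{
        uint32_t state = 0;

        if (is_active)                 /* restoring saved active state wins */
                state |= LR_STATE_ACTIVE;
        else if (is_pending)
                state |= LR_STATE_PENDING;
        return state;
}

int main(void)
{
        printf("active IRQ  -> LR state %u\n", (unsigned)queue_to_lr(1, 0));  /* 2 */
        printf("pending IRQ -> LR state %u\n", (unsigned)queue_to_lr(0, 1));  /* 1 */
        return 0;
}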
+
 /*
  * Queue an interrupt to a CPU virtual interface. Return true on success,
  * or false if it wasn't possible to queue it.
@@ -982,9 +1148,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
                if (vlr.source == sgi_source_id) {
                        kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
                        BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
-                       vlr.state |= LR_STATE_PENDING;
-                       vgic_set_lr(vcpu, lr, vlr);
-                       vgic_sync_lr_elrsr(vcpu, lr, vlr);
+                       vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
                        return true;
                }
        }
@@ -1001,12 +1165,8 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
 
        vlr.irq = irq;
        vlr.source = sgi_source_id;
-       vlr.state = LR_STATE_PENDING;
-       if (!vgic_irq_is_edge(vcpu, irq))
-               vlr.state |= LR_EOI_INT;
-
-       vgic_set_lr(vcpu, lr, vlr);
-       vgic_sync_lr_elrsr(vcpu, lr, vlr);
+       vlr.state = 0;
+       vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
 
        return true;
 }
@@ -1038,39 +1198,49 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       unsigned long *pa_percpu, *pa_shared;
        int i, vcpu_id;
        int overflow = 0;
+       int nr_shared = vgic_nr_shared_irqs(dist);
 
        vcpu_id = vcpu->vcpu_id;
 
+       pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
+       pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
+
+       bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
+                 VGIC_NR_PRIVATE_IRQS);
+       bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
+                 nr_shared);
        /*
         * We may not have any pending interrupt, or the interrupts
         * may have been serviced from another vcpu. In all cases,
         * move along.
         */
-       if (!kvm_vgic_vcpu_pending_irq(vcpu)) {
-               pr_debug("CPU%d has no pending interrupt\n", vcpu_id);
+       if (!kvm_vgic_vcpu_pending_irq(vcpu) && !kvm_vgic_vcpu_active_irq(vcpu))
                goto epilog;
-       }
 
        /* SGIs */
-       for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) {
+       for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
                if (!queue_sgi(vcpu, i))
                        overflow = 1;
        }
 
        /* PPIs */
-       for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) {
+       for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
                if (!vgic_queue_hwirq(vcpu, i))
                        overflow = 1;
        }
 
        /* SPIs */
-       for_each_set_bit(i, vgic_cpu->pending_shared, vgic_nr_shared_irqs(dist)) {
+       for_each_set_bit(i, pa_shared, nr_shared) {
                if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
                        overflow = 1;
        }
 
 epilog:
        if (overflow) {
                vgic_enable_underflow(vcpu);
@@ -1089,7 +1259,9 @@ epilog:
 static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 {
        u32 status = vgic_get_interrupt_status(vcpu);
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        bool level_pending = false;
+       struct kvm *kvm = vcpu->kvm;
 
        kvm_debug("STATUS = %08x\n", status);
 
@@ -1106,6 +1278,7 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
                        struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
                        WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
 
+                       spin_lock(&dist->lock);
                        vgic_irq_clear_queued(vcpu, vlr.irq);
                        WARN_ON(vlr.state & LR_STATE_MASK);
                        vlr.state = 0;
@@ -1124,6 +1297,17 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
                         */
                        vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
 
+                       /*
+                        * kvm_notify_acked_irq calls kvm_set_irq()
+                        * to reset the IRQ level. Need to release the
+                        * lock for kvm_set_irq to grab it.
+                        */
+                       spin_unlock(&dist->lock);
+
+                       kvm_notify_acked_irq(kvm, 0,
+                                            vlr.irq - VGIC_NR_PRIVATE_IRQS);
+                       spin_lock(&dist->lock);
+
                        /* Any additional pending interrupt? */
                        if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
                                vgic_cpu_irq_set(vcpu, vlr.irq);
@@ -1133,6 +1317,8 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
                                vgic_cpu_irq_clear(vcpu, vlr.irq);
                        }
 
+                       spin_unlock(&dist->lock);
+
                        /*
                         * Despite being EOIed, the LR may not have
                         * been marked as empty.
@@ -1155,10 +1341,7 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
        return level_pending;
 }
 
-/*
- * Sync back the VGIC state after a guest run. The distributor lock is
- * needed so we don't get preempted in the middle of the state processing.
- */
+/* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1205,14 +1388,10 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
        if (!irqchip_in_kernel(vcpu->kvm))
                return;
 
-       spin_lock(&dist->lock);
        __kvm_vgic_sync_hwstate(vcpu);
-       spin_unlock(&dist->lock);
 }
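With the distributor lock no longer held across the whole of kvm_vgic_sync_hwstate(), the maintenance path now takes dist->lock per EOIed interrupt and, as the added comment explains, drops it around kvm_notify_acked_irq() because the resulting kvm_set_irq() wants the same lock. A small pthread model of that drop-the-lock-before-calling-back pattern (all names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dist_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the notification path, which internally takes the
 * same lock to reset the interrupt level. */
static void notify_acked(void)
{
        pthread_mutex_lock(&dist_lock);
        printf("level reset under lock\n");
        pthread_mutex_unlock(&dist_lock);
}

int main(void)
{
        pthread_mutex_lock(&dist_lock);
        /* ... per-IRQ bookkeeping under the lock ... */
        pthread_mutex_unlock(&dist_lock);   /* release before notifying */

        notify_acked();                     /* would self-deadlock if still held */

        pthread_mutex_lock(&dist_lock);     /* re-take to finish bookkeeping */
        pthread_mutex_unlock(&dist_lock);
        return 0;
}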
 
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
@@ -1225,6 +1404,17 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
        return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
 }
 
+int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+       if (!irqchip_in_kernel(vcpu->kvm))
+               return 0;
+
+       return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
+}
+
 void vgic_kick_vcpus(struct kvm *kvm)
 {
        struct kvm_vcpu *vcpu;
@@ -1397,8 +1587,12 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 
        kfree(vgic_cpu->pending_shared);
+       kfree(vgic_cpu->active_shared);
+       kfree(vgic_cpu->pend_act_shared);
        kfree(vgic_cpu->vgic_irq_lr_map);
        vgic_cpu->pending_shared = NULL;
+       vgic_cpu->active_shared = NULL;
+       vgic_cpu->pend_act_shared = NULL;
        vgic_cpu->vgic_irq_lr_map = NULL;
 }
 
@@ -1408,9 +1602,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
 
        int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8;
        vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
+       vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
+       vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
        vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);
 
-       if (!vgic_cpu->pending_shared || !vgic_cpu->vgic_irq_lr_map) {
+       if (!vgic_cpu->pending_shared
+               || !vgic_cpu->active_shared
+               || !vgic_cpu->pend_act_shared
+               || !vgic_cpu->vgic_irq_lr_map) {
                kvm_vgic_vcpu_destroy(vcpu);
                return -ENOMEM;
        }
@@ -1463,10 +1662,12 @@ void kvm_vgic_destroy(struct kvm *kvm)
        kfree(dist->irq_spi_mpidr);
        kfree(dist->irq_spi_target);
        kfree(dist->irq_pending_on_cpu);
+       kfree(dist->irq_active_on_cpu);
        dist->irq_sgi_sources = NULL;
        dist->irq_spi_cpu = NULL;
        dist->irq_spi_target = NULL;
        dist->irq_pending_on_cpu = NULL;
+       dist->irq_active_on_cpu = NULL;
        dist->nr_cpus = 0;
 }
 
@@ -1502,6 +1703,7 @@ int vgic_init(struct kvm *kvm)
        ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
        ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
        ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
+       ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
        ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
        ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
 
@@ -1514,10 +1716,13 @@ int vgic_init(struct kvm *kvm)
                                       GFP_KERNEL);
        dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
                                           GFP_KERNEL);
+       dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
+                                          GFP_KERNEL);
        if (!dist->irq_sgi_sources ||
            !dist->irq_spi_cpu ||
            !dist->irq_spi_target ||
-           !dist->irq_pending_on_cpu) {
+           !dist->irq_pending_on_cpu ||
+           !dist->irq_active_on_cpu) {
                ret = -ENOMEM;
                goto out;
        }
@@ -1845,12 +2050,9 @@ int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
        return r;
 }
 
-int vgic_has_attr_regs(const struct kvm_mmio_range *ranges, phys_addr_t offset)
+int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
 {
-       struct kvm_exit_mmio dev_attr_mmio;
-
-       dev_attr_mmio.len = 4;
-       if (vgic_find_range(ranges, &dev_attr_mmio, offset))
+       if (vgic_find_range(ranges, 4, offset))
                return 0;
        else
                return -ENXIO;
@@ -1883,8 +2085,10 @@ static struct notifier_block vgic_cpu_nb = {
 };
 
 static const struct of_device_id vgic_ids[] = {
-       { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, },
-       { .compatible = "arm,gic-v3", .data = vgic_v3_probe, },
+       { .compatible = "arm,cortex-a15-gic",   .data = vgic_v2_probe, },
+       { .compatible = "arm,cortex-a7-gic",    .data = vgic_v2_probe, },
+       { .compatible = "arm,gic-400",          .data = vgic_v2_probe, },
+       { .compatible = "arm,gic-v3",           .data = vgic_v3_probe, },
        {},
 };
 
@@ -1932,3 +2136,38 @@ out_free_irq:
        free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
        return ret;
 }
+
+int kvm_irq_map_gsi(struct kvm *kvm,
+                   struct kvm_kernel_irq_routing_entry *entries,
+                   int gsi)
+{
+       return gsi;
+}
+
+int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+       return pin;
+}
+
+int kvm_set_irq(struct kvm *kvm, int irq_source_id,
+               u32 irq, int level, bool line_status)
+{
+       unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
+
+       trace_kvm_set_irq(irq, level, irq_source_id);
+
+       BUG_ON(!vgic_initialized(kvm));
+
+       if (spi > kvm->arch.vgic.nr_irqs)
+               return -EINVAL;
+       return kvm_vgic_inject_irq(kvm, 0, spi, level);
+}
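The new kvm_set_irq() maps the flat interrupt number that userspace (or an irqfd) passes in onto an SPI by offsetting it past the 32 private interrupts before injection. A trivial sketch of that mapping (NR_IRQS here is an illustrative limit, not a value from the patch):

#include <stdio.h>

#define VGIC_NR_PRIVATE_IRQS 32
#define NR_IRQS              256   /* illustrative distributor size */

int main(void)
{
        unsigned int gsi = 5;                           /* from userspace/irqfd */
        unsigned int spi = gsi + VGIC_NR_PRIVATE_IRQS;  /* SPI 37 on the distributor */

        if (spi > NR_IRQS) {
                fprintf(stderr, "out of range\n");
                return 1;
        }
        printf("GSI %u -> SPI %u\n", gsi, spi);
        return 0;
}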
+
+/* MSI not implemented yet */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+               struct kvm *kvm, int irq_source_id,
+               int level, bool line_status)
+{
+       return 0;
+}
index 1e83bdf5f499b24dd8ea19d5f456fcc740d87bad..0df74cbb6200686ab8cfbc853b11f27cc954b678 100644
@@ -20,6 +20,8 @@
 #ifndef __KVM_VGIC_H__
 #define __KVM_VGIC_H__
 
+#include <kvm/iodev.h>
+
 #define VGIC_ADDR_UNDEF                (-1)
 #define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
 
@@ -57,6 +59,14 @@ void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
 bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
 void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
 
+struct kvm_exit_mmio {
+       phys_addr_t     phys_addr;
+       void            *data;
+       u32             len;
+       bool            is_write;
+       void            *private;
+};
+
 void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
                     phys_addr_t offset, int mode);
 bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
@@ -74,7 +84,7 @@ void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
        *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
 }
 
-struct kvm_mmio_range {
+struct vgic_io_range {
        phys_addr_t base;
        unsigned long len;
        int bits_per_irq;
@@ -82,6 +92,11 @@ struct kvm_mmio_range {
                            phys_addr_t offset);
 };
 
+int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
+                            const struct vgic_io_range *ranges,
+                            int redist_id,
+                            struct vgic_io_device *iodev);
+
 static inline bool is_in_range(phys_addr_t addr, unsigned long len,
                               phys_addr_t baseaddr, unsigned long size)
 {
@@ -89,14 +104,8 @@ static inline bool is_in_range(phys_addr_t addr, unsigned long len,
 }
 
 const
-struct kvm_mmio_range *vgic_find_range(const struct kvm_mmio_range *ranges,
-                                      struct kvm_exit_mmio *mmio,
-                                      phys_addr_t offset);
-
-bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                           struct kvm_exit_mmio *mmio,
-                           const struct kvm_mmio_range *ranges,
-                           unsigned long mmio_base);
+struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
+                                     int len, gpa_t offset);
 
 bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
                            phys_addr_t offset, int vcpu_id, int access);
@@ -107,12 +116,20 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
 bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
                                   phys_addr_t offset, int vcpu_id);
 
+bool vgic_handle_set_active_reg(struct kvm *kvm,
+                               struct kvm_exit_mmio *mmio,
+                               phys_addr_t offset, int vcpu_id);
+
+bool vgic_handle_clear_active_reg(struct kvm *kvm,
+                                 struct kvm_exit_mmio *mmio,
+                                 phys_addr_t offset, int vcpu_id);
+
 bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
                         phys_addr_t offset);
 
 void vgic_kick_vcpus(struct kvm *kvm);
 
-int vgic_has_attr_regs(const struct kvm_mmio_range *ranges, phys_addr_t offset);
+int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
 int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
 int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
 
index 00d86427af0f8bae911c2e41a33303c2d02a3428..571c1ce37d152f86c3690d9e2427f5eeca97cd2c 100644
@@ -8,7 +8,7 @@
  *
  */
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
@@ -60,8 +60,9 @@ static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
        return 1;
 }
 
-static int coalesced_mmio_write(struct kvm_io_device *this,
-                               gpa_t addr, int len, const void *val)
+static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
+                               struct kvm_io_device *this, gpa_t addr,
+                               int len, const void *val)
 {
        struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
        struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
index 148b2392c762ba763a6ad09b314699c451b463d0..9ff4193dfa493c3e226c3fd554061b171ca7b9c5 100644
@@ -36,7 +36,7 @@
 #include <linux/seqlock.h>
 #include <trace/events/kvm.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
 /*
@@ -311,6 +311,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
        unsigned int events;
        int idx;
 
+       if (!kvm_arch_intc_initialized(kvm))
+               return -EAGAIN;
+
        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
        if (!irqfd)
                return -ENOMEM;
@@ -712,8 +715,8 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 
 /* MMIO/PIO writes trigger an event if the addr/val match */
 static int
-ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
-               const void *val)
+ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
+               int len, const void *val)
 {
        struct _ioeventfd *p = to_ioeventfd(this);
 
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
deleted file mode 100644
index 12fd3ca..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#ifndef __KVM_IODEV_H__
-#define __KVM_IODEV_H__
-
-#include <linux/kvm_types.h>
-#include <asm/errno.h>
-
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-       int (*read)(struct kvm_io_device *this,
-                   gpa_t addr,
-                   int len,
-                   void *val);
-       int (*write)(struct kvm_io_device *this,
-                    gpa_t addr,
-                    int len,
-                    const void *val);
-       void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-       const struct kvm_io_device_ops *ops;
-};
-
-static inline void kvm_iodevice_init(struct kvm_io_device *dev,
-                                    const struct kvm_io_device_ops *ops)
-{
-       dev->ops = ops;
-}
-
-static inline int kvm_iodevice_read(struct kvm_io_device *dev,
-                                   gpa_t addr, int l, void *v)
-{
-       return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
-}
-
-static inline int kvm_iodevice_write(struct kvm_io_device *dev,
-                                    gpa_t addr, int l, const void *v)
-{
-       return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
-}
-
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
-       if (dev->ops->destructor)
-               dev->ops->destructor(dev);
-}
-
-#endif /* __KVM_IODEV_H__ */
index 7f256f31df102e36da59a8ebed636f1c9615cb00..1d56a901e791788d9f2c855dcf3e96a9b650df77 100644
@@ -105,7 +105,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
        i = kvm_irq_map_gsi(kvm, irq_set, irq);
        srcu_read_unlock(&kvm->irq_srcu, idx);
 
-       while(i--) {
+       while (i--) {
                int r;
                r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
                                   line_status);
index cc6a25d95fbff532bf5b00b0c339bec91ddc5bcf..d3fc9399062a5034b99eaa3d12c855699fdbf608 100644
@@ -16,7 +16,7 @@
  *
  */
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-unsigned int halt_poll_ns = 0;
+static unsigned int halt_poll_ns;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
 /*
  * Ordering of locks:
  *
- *             kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+ *     kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
 DEFINE_SPINLOCK(kvm_lock);
@@ -80,7 +80,7 @@ static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 LIST_HEAD(vm_list);
 
 static cpumask_var_t cpus_hardware_enabled;
-static int kvm_usage_count = 0;
+static int kvm_usage_count;
 static atomic_t hardware_enable_failed;
 
 struct kmem_cache *kvm_vcpu_cache;
@@ -539,20 +539,12 @@ void *kvm_kvzalloc(unsigned long size)
                return kzalloc(size, GFP_KERNEL);
 }
 
-void kvm_kvfree(const void *addr)
-{
-       if (is_vmalloc_addr(addr))
-               vfree(addr);
-       else
-               kfree(addr);
-}
-
 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
        if (!memslot->dirty_bitmap)
                return;
 
-       kvm_kvfree(memslot->dirty_bitmap);
+       kvfree(memslot->dirty_bitmap);
        memslot->dirty_bitmap = NULL;
 }
 
@@ -888,8 +880,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
                 * or moved, memslot will be created.
                 *
                 * validation of sp->gfn happens in:
-                *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                *      - kvm_is_visible_gfn (mmu_check_roots)
+                *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+                *      - kvm_is_visible_gfn (mmu_check_roots)
                 */
                kvm_arch_flush_shadow_memslot(kvm, slot);
 
@@ -1061,9 +1053,11 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
                mask = xchg(&dirty_bitmap[i], 0);
                dirty_bitmap_buffer[i] = mask;
 
-               offset = i * BITS_PER_LONG;
-               kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset,
-                                                               mask);
+               if (mask) {
+                       offset = i * BITS_PER_LONG;
+                       kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+                                                               offset, mask);
+               }
        }
 
        spin_unlock(&kvm->mmu_lock);
@@ -1193,16 +1187,6 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
        return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
-static int kvm_read_hva(void *data, void __user *hva, int len)
-{
-       return __copy_from_user(data, hva, len);
-}
-
-static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
-{
-       return __copy_from_user_inatomic(data, hva, len);
-}
-
 static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, int write, struct page **page)
 {
@@ -1481,7 +1465,6 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 
        return kvm_pfn_to_page(pfn);
 }
-
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
@@ -1517,6 +1500,7 @@ void kvm_set_pfn_dirty(pfn_t pfn)
 {
        if (!kvm_is_reserved_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
+
                if (!PageReserved(page))
                        SetPageDirty(page);
        }
@@ -1554,7 +1538,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
        addr = gfn_to_hva_prot(kvm, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
-       r = kvm_read_hva(data, (void __user *)addr + offset, len);
+       r = __copy_from_user(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
@@ -1593,7 +1577,7 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        pagefault_disable();
-       r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
+       r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
        pagefault_enable();
        if (r)
                return -EFAULT;
@@ -1653,8 +1637,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
        ghc->generation = slots->generation;
        ghc->len = len;
        ghc->memslot = gfn_to_memslot(kvm, start_gfn);
-       ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, &nr_pages_avail);
-       if (!kvm_is_error_hva(ghc->hva) && nr_pages_avail >= nr_pages_needed) {
+       ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
+       if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
                ghc->hva += offset;
        } else {
                /*
@@ -1742,7 +1726,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
        int offset = offset_in_page(gpa);
        int ret;
 
-        while ((seg = next_segment(len, offset)) != 0) {
+       while ((seg = next_segment(len, offset)) != 0) {
                ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
                if (ret < 0)
                        return ret;
@@ -1800,6 +1784,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        start = cur = ktime_get();
        if (halt_poll_ns) {
                ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+
                do {
                        /*
                         * This sets KVM_REQ_UNHALT if an interrupt
@@ -2118,7 +2103,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
         * so vcpu_load() would break it.
         */
-       if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+       if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
                return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 #endif
 
@@ -2135,6 +2120,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
                        /* The thread running this VCPU changed. */
                        struct pid *oldpid = vcpu->pid;
                        struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+
                        rcu_assign_pointer(vcpu->pid, newpid);
                        if (oldpid)
                                synchronize_rcu();
@@ -2205,7 +2191,7 @@ out_free1:
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &mp_state, sizeof mp_state))
+               if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
                        goto out;
                r = 0;
                break;
@@ -2214,7 +2200,7 @@ out_free1:
                struct kvm_mp_state mp_state;
 
                r = -EFAULT;
-               if (copy_from_user(&mp_state, argp, sizeof mp_state))
+               if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
                break;
@@ -2223,13 +2209,13 @@ out_free1:
                struct kvm_translation tr;
 
                r = -EFAULT;
-               if (copy_from_user(&tr, argp, sizeof tr))
+               if (copy_from_user(&tr, argp, sizeof(tr)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &tr, sizeof tr))
+               if (copy_to_user(argp, &tr, sizeof(tr)))
                        goto out;
                r = 0;
                break;
@@ -2238,7 +2224,7 @@ out_free1:
                struct kvm_guest_debug dbg;
 
                r = -EFAULT;
-               if (copy_from_user(&dbg, argp, sizeof dbg))
+               if (copy_from_user(&dbg, argp, sizeof(dbg)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
                break;
@@ -2252,14 +2238,14 @@ out_free1:
                if (argp) {
                        r = -EFAULT;
                        if (copy_from_user(&kvm_sigmask, argp,
-                                          sizeof kvm_sigmask))
+                                          sizeof(kvm_sigmask)))
                                goto out;
                        r = -EINVAL;
-                       if (kvm_sigmask.len != sizeof sigset)
+                       if (kvm_sigmask.len != sizeof(sigset))
                                goto out;
                        r = -EFAULT;
                        if (copy_from_user(&sigset, sigmask_arg->sigset,
-                                          sizeof sigset))
+                                          sizeof(sigset)))
                                goto out;
                        p = &sigset;
                }
@@ -2321,14 +2307,14 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
                if (argp) {
                        r = -EFAULT;
                        if (copy_from_user(&kvm_sigmask, argp,
-                                          sizeof kvm_sigmask))
+                                          sizeof(kvm_sigmask)))
                                goto out;
                        r = -EINVAL;
-                       if (kvm_sigmask.len != sizeof csigset)
+                       if (kvm_sigmask.len != sizeof(csigset))
                                goto out;
                        r = -EFAULT;
                        if (copy_from_user(&csigset, sigmask_arg->sigset,
-                                          sizeof csigset))
+                                          sizeof(csigset)))
                                goto out;
                        sigset_from_compat(&sigset, &csigset);
                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
@@ -2525,7 +2511,7 @@ static long kvm_vm_ioctl(struct file *filp,
 
                r = -EFAULT;
                if (copy_from_user(&kvm_userspace_mem, argp,
-                                               sizeof kvm_userspace_mem))
+                                               sizeof(kvm_userspace_mem)))
                        goto out;
 
                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
@@ -2535,7 +2521,7 @@ static long kvm_vm_ioctl(struct file *filp,
                struct kvm_dirty_log log;
 
                r = -EFAULT;
-               if (copy_from_user(&log, argp, sizeof log))
+               if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
@@ -2543,16 +2529,18 @@ static long kvm_vm_ioctl(struct file *filp,
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;
+
                r = -EFAULT;
-               if (copy_from_user(&zone, argp, sizeof zone))
+               if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
                break;
        }
        case KVM_UNREGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;
+
                r = -EFAULT;
-               if (copy_from_user(&zone, argp, sizeof zone))
+               if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
                break;
@@ -2562,7 +2550,7 @@ static long kvm_vm_ioctl(struct file *filp,
                struct kvm_irqfd data;
 
                r = -EFAULT;
-               if (copy_from_user(&data, argp, sizeof data))
+               if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_irqfd(kvm, &data);
                break;
@@ -2571,7 +2559,7 @@ static long kvm_vm_ioctl(struct file *filp,
                struct kvm_ioeventfd data;
 
                r = -EFAULT;
-               if (copy_from_user(&data, argp, sizeof data))
+               if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_ioeventfd(kvm, &data);
                break;
@@ -2592,7 +2580,7 @@ static long kvm_vm_ioctl(struct file *filp,
                struct kvm_msi msi;
 
                r = -EFAULT;
-               if (copy_from_user(&msi, argp, sizeof msi))
+               if (copy_from_user(&msi, argp, sizeof(msi)))
                        goto out;
                r = kvm_send_userspace_msi(kvm, &msi);
                break;
@@ -2604,7 +2592,7 @@ static long kvm_vm_ioctl(struct file *filp,
                struct kvm_irq_level irq_event;
 
                r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+               if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
                        goto out;
 
                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
@@ -2614,7 +2602,7 @@ static long kvm_vm_ioctl(struct file *filp,
 
                r = -EFAULT;
                if (ioctl == KVM_IRQ_LINE_STATUS) {
-                       if (copy_to_user(argp, &irq_event, sizeof irq_event))
+                       if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
                                goto out;
                }
 
@@ -2647,7 +2635,7 @@ static long kvm_vm_ioctl(struct file *filp,
                        goto out_free_irq_routing;
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
-       out_free_irq_routing:
+out_free_irq_routing:
                vfree(entries);
                break;
        }
@@ -2822,8 +2810,7 @@ static void hardware_enable_nolock(void *junk)
        if (r) {
                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
                atomic_inc(&hardware_enable_failed);
-               printk(KERN_INFO "kvm: enabling virtualization on "
-                                "CPU%d failed\n", cpu);
+               pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
        }
 }
 
@@ -2899,12 +2886,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
-               printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+               pr_info("kvm: disabling virtualization on CPU%d\n",
                       cpu);
                hardware_disable();
                break;
        case CPU_STARTING:
-               printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+               pr_info("kvm: enabling virtualization on CPU%d\n",
                       cpu);
                hardware_enable();
                break;
@@ -2921,7 +2908,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
         *
         * And Intel TXT required VMX off for all cpu when system shutdown.
         */
-       printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+       pr_info("kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        return NOTIFY_OK;
@@ -2945,7 +2932,7 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 }
 
 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
-                                 const struct kvm_io_range *r2)
+                                const struct kvm_io_range *r2)
 {
        if (r1->addr < r2->addr)
                return -1;
@@ -2998,7 +2985,7 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
        return off;
 }
 
-static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                              struct kvm_io_range *range, const void *val)
 {
        int idx;
@@ -3009,7 +2996,7 @@ static int __kvm_io_bus_write(struct kvm_io_bus *bus,
 
        while (idx < bus->dev_count &&
                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-               if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+               if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
                                        range->len, val))
                        return idx;
                idx++;
@@ -3019,7 +3006,7 @@ static int __kvm_io_bus_write(struct kvm_io_bus *bus,
 }
 
 /* kvm_io_bus_write - called under kvm->slots_lock */
-int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val)
 {
        struct kvm_io_bus *bus;
@@ -3031,14 +3018,14 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                .len = len,
        };
 
-       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       r = __kvm_io_bus_write(bus, &range, val);
+       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       r = __kvm_io_bus_write(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
 }
 
 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
-int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-                           int len, const void *val, long cookie)
+int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
+                           gpa_t addr, int len, const void *val, long cookie)
 {
        struct kvm_io_bus *bus;
        struct kvm_io_range range;
@@ -3048,12 +3035,12 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                .len = len,
        };
 
-       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
 
        /* First try the device referenced by cookie. */
        if ((cookie >= 0) && (cookie < bus->dev_count) &&
            (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
-               if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+               if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
                                        val))
                        return cookie;
 
@@ -3061,11 +3048,11 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
         * cookie contained garbage; fall back to search and return the
         * correct cookie value.
         */
-       return __kvm_io_bus_write(bus, &range, val);
+       return __kvm_io_bus_write(vcpu, bus, &range, val);
 }
 
-static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
-                            void *val)
+static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
+                            struct kvm_io_range *range, void *val)
 {
        int idx;
 
@@ -3075,7 +3062,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
 
        while (idx < bus->dev_count &&
                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-               if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+               if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
                                       range->len, val))
                        return idx;
                idx++;
@@ -3086,7 +3073,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
-int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
 {
        struct kvm_io_bus *bus;
@@ -3098,8 +3085,8 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                .len = len,
        };
 
-       bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       r = __kvm_io_bus_read(bus, &range, val);
+       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       r = __kvm_io_bus_read(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
 }
 
@@ -3269,6 +3256,7 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 {
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
        if (vcpu->preempted)
                vcpu->preempted = false;
 
@@ -3350,7 +3338,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
        r = misc_register(&kvm_dev);
        if (r) {
-               printk(KERN_ERR "kvm: misc device register failed\n");
+               pr_err("kvm: misc device register failed\n");
                goto out_unreg;
        }
 
@@ -3361,7 +3349,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
        r = kvm_init_debug();
        if (r) {
-               printk(KERN_ERR "kvm: create debugfs files failed\n");
+               pr_err("kvm: create debugfs files failed\n");
                goto out_undebugfs;
        }