From 2ca8c1a1269630473840835d1585adf2582b7c0b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 001/231] [PATCH] x86-64: Update defconfig Signed-off-by: Andi Kleen --- arch/x86_64/defconfig | 183 +++++++++++------------------------------- 1 file changed, 49 insertions(+), 134 deletions(-) diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index b26378815b91..941a7e3aa5fb 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.21-rc3 -# Wed Mar 7 15:29:47 2007 +# Linux kernel version: 2.6.21-git3 +# Tue May 1 07:30:48 2007 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -118,11 +118,11 @@ CONFIG_X86_PC=y # CONFIG_X86_VSMP is not set # CONFIG_MK8 is not set # CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_L1_CACHE_SHIFT=7 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_MCORE2=y +# CONFIG_GENERIC_CPU is not set +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -174,6 +174,7 @@ CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y # CONFIG_CC_STACKPROTECTOR is not set @@ -182,7 +183,6 @@ CONFIG_HZ_250=y # CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=250 -# CONFIG_REORDER is not set CONFIG_K8_NB=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y @@ -218,7 +218,6 @@ CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y CONFIG_ACPI_NUMA=y # CONFIG_ACPI_ASUS is not set -# CONFIG_ACPI_IBM is not set # CONFIG_ACPI_TOSHIBA is not set CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set @@ -243,7 +242,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -299,7 +298,6 @@ CONFIG_NET=y # # Networking options # -# CONFIG_NETDEBUG is not set CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -334,6 +332,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set +# CONFIG_IPV6_OPTIMISTIC_DAD is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set @@ -389,6 +388,13 @@ CONFIG_IPV6_SIT=y # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set +# CONFIG_AF_RXRPC is not set + +# +# Wireless +# +# CONFIG_CFG80211 is not set +# CONFIG_WIRELESS_EXT is not set # CONFIG_IEEE80211 is not set # @@ -409,10 +415,6 @@ CONFIG_FW_LOADER=y # Connector - unified userspace <-> kernelspace linker # # CONFIG_CONNECTOR is not set - -# -# Memory Technology Devices (MTD) -# # CONFIG_MTD is not set # @@ -459,6 +461,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set +# CONFIG_THINKPAD_ACPI is not set # # ATA/ATAPI/MFM/RLL support @@ -494,7 +497,6 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_BLK_DEV_RZ1000 is not set CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_BLK_DEV_ALI15X3 is not set @@ -525,7 +527,6 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set -CONFIG_IDEDMA_AUTO=y # CONFIG_BLK_DEV_HD is not set # @@ -584,11 +585,9 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set # CONFIG_SCSI_AIC94XX is not set # CONFIG_SCSI_ARCMSR is not set -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=y -CONFIG_MEGARAID_MAILBOX=y +# CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set -CONFIG_MEGARAID_SAS=y +# CONFIG_MEGARAID_SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set @@ -608,6 +607,7 @@ CONFIG_MEGARAID_SAS=y # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set # @@ -636,6 +636,7 @@ CONFIG_SATA_ACPI=y # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set # CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set # CONFIG_PATA_CMD64X is not set # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set @@ -687,7 +688,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_FUSION=y CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set -CONFIG_FUSION_SAS=y +# CONFIG_FUSION_SAS is not set CONFIG_FUSION_MAX_SGE=128 # CONFIG_FUSION_CTL is not set @@ -700,19 +701,22 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set # -# Device Drivers +# Controllers +# + +# +# Texas Instruments PCILynx requires I2C # -# CONFIG_IEEE1394_PCILYNX is not set CONFIG_IEEE1394_OHCI1394=y # -# Protocol Drivers +# Protocols # # CONFIG_IEEE1394_VIDEO1394 is not set # CONFIG_IEEE1394_SBP2 is not set +# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y @@ -775,7 +779,8 @@ CONFIG_TULIP=y # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set -# CONFIG_AMD8111_ETH is not set +CONFIG_AMD8111_ETH=y +# CONFIG_AMD8111E_NAPI is not set # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y CONFIG_FORCEDETH=y @@ -837,9 +842,10 @@ CONFIG_S2IO=m # CONFIG_TR is not set # -# Wireless LAN (non-hamradio) +# Wireless LAN # -# CONFIG_NET_RADIO is not set +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set # # Wan interfaces @@ -853,7 +859,6 @@ CONFIG_S2IO=m # CONFIG_SHAPER is not set CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y @@ -987,57 +992,7 @@ CONFIG_HPET_MMAP=y # # I2C support # -CONFIG_I2C=m -CONFIG_I2C_CHARDEV=m - -# -# I2C Algorithms -# -# CONFIG_I2C_ALGOBIT is not set -# CONFIG_I2C_ALGOPCF is not set -# CONFIG_I2C_ALGOPCA is not set - -# -# I2C Hardware Bus support -# -# CONFIG_I2C_ALI1535 is not set -# CONFIG_I2C_ALI1563 is not set -# CONFIG_I2C_ALI15X3 is not set -# CONFIG_I2C_AMD756 is not set -# CONFIG_I2C_AMD8111 is not set -# CONFIG_I2C_I801 is not set -# CONFIG_I2C_I810 is not set -# CONFIG_I2C_PIIX4 is not set -CONFIG_I2C_ISA=m -# CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PASEMI is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIS5595 is not set -# CONFIG_I2C_SIS630 is not set -# CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_VIA is not set -# CONFIG_I2C_VIAPRO is not set -# CONFIG_I2C_VOODOO3 is not set -# CONFIG_I2C_PCA_ISA is not set - -# -# Miscellaneous I2C Chip support -# -# CONFIG_SENSORS_DS1337 is not set -# CONFIG_SENSORS_DS1374 is not set -# CONFIG_SENSORS_EEPROM is not set -# CONFIG_SENSORS_PCF8574 is not set -# CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_SENSORS_MAX6875 is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_I2C is not set # # SPI support @@ -1053,54 +1008,8 @@ CONFIG_I2C_ISA=m # # Hardware Monitoring support # -CONFIG_HWMON=y +# CONFIG_HWMON is not set # CONFIG_HWMON_VID is not set -# CONFIG_SENSORS_ABITUGURU is not set -# CONFIG_SENSORS_ADM1021 is not set -# CONFIG_SENSORS_ADM1025 is not set -# CONFIG_SENSORS_ADM1026 is not set -# CONFIG_SENSORS_ADM1029 is not set -# CONFIG_SENSORS_ADM1031 is not set -# CONFIG_SENSORS_ADM9240 is not set -# CONFIG_SENSORS_K8TEMP is not set -# CONFIG_SENSORS_ASB100 is not set -# CONFIG_SENSORS_ATXP1 is not set -# CONFIG_SENSORS_DS1621 is not set -# CONFIG_SENSORS_F71805F is not set -# CONFIG_SENSORS_FSCHER is not set -# CONFIG_SENSORS_FSCPOS is not set -# CONFIG_SENSORS_GL518SM is not set -# CONFIG_SENSORS_GL520SM is not set -# CONFIG_SENSORS_IT87 is not set -# CONFIG_SENSORS_LM63 is not set -# CONFIG_SENSORS_LM75 is not set -# CONFIG_SENSORS_LM77 is not set -# CONFIG_SENSORS_LM78 is not set -# CONFIG_SENSORS_LM80 is not set -# CONFIG_SENSORS_LM83 is not set -# CONFIG_SENSORS_LM85 is not set -# CONFIG_SENSORS_LM87 is not set -# CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set -# CONFIG_SENSORS_MAX1619 is not set -# CONFIG_SENSORS_PC87360 is not set -# CONFIG_SENSORS_PC87427 is not set -# CONFIG_SENSORS_SIS5595 is not set -# CONFIG_SENSORS_SMSC47M1 is not set -# CONFIG_SENSORS_SMSC47M192 is not set -CONFIG_SENSORS_SMSC47B397=m -# CONFIG_SENSORS_VIA686A is not set -# CONFIG_SENSORS_VT1211 is not set -# CONFIG_SENSORS_VT8231 is not set -# CONFIG_SENSORS_W83781D is not set -# CONFIG_SENSORS_W83791D is not set -# CONFIG_SENSORS_W83792D is not set -# CONFIG_SENSORS_W83793 is not set -# CONFIG_SENSORS_W83L785TS is not set -# CONFIG_SENSORS_W83627HF is not set -# CONFIG_SENSORS_W83627EHF is not set -# CONFIG_SENSORS_HDAPS is not set -# CONFIG_HWMON_DEBUG_CHIP is not set # # Multifunction device drivers @@ -1147,8 +1056,9 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OBSOLETE_OSS is not set +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set +# CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set @@ -1162,6 +1072,14 @@ CONFIG_SOUND_ICH=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set +# +# USB Input Devices +# +CONFIG_USB_HID=y +# CONFIG_USB_HIDINPUT_POWERBOOK is not set +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set + # # USB support # @@ -1175,6 +1093,7 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y +# CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set # CONFIG_USB_OTG is not set @@ -1225,10 +1144,6 @@ CONFIG_USB_STORAGE=y # # USB Input Devices # -CONFIG_USB_HID=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set -# CONFIG_HID_FF is not set -# CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set # CONFIG_USB_ACECAD is not set @@ -1556,7 +1471,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set +CONFIG_TIMER_STATS=y # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set From bdd0dc5210b5bb35dbddd5b0bc742e65a812a067 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 002/231] [PATCH] i386: Update defconfig Signed-off-by: Andi Kleen --- arch/i386/defconfig | 73 ++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index c96911c37aea..9da84412a831 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.21-rc3 -# Wed Mar 7 15:29:47 2007 +# Linux kernel version: 2.6.21-git3 +# Tue May 1 07:30:51 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -108,9 +108,9 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features # -# CONFIG_TICK_ONESHOT is not set -# CONFIG_NO_HZ is not set -# CONFIG_HIGH_RES_TIMERS is not set +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set @@ -146,9 +146,11 @@ CONFIG_MPENTIUMIII=y # CONFIG_MGEODE_LX is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set +# CONFIG_MVIAC7 is not set CONFIG_X86_GENERIC=y CONFIG_X86_CMPXCHG=y CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_XADD=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set @@ -162,6 +164,8 @@ CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y CONFIG_X86_TSC=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_MODEL=4 CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_NR_CPUS=32 @@ -248,7 +252,6 @@ CONFIG_ACPI_FAN=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_ASUS is not set -# CONFIG_ACPI_IBM is not set # CONFIG_ACPI_TOSHIBA is not set CONFIG_ACPI_BLACKLIST_YEAR=2001 CONFIG_ACPI_DEBUG=y @@ -257,10 +260,7 @@ CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y # CONFIG_ACPI_CONTAINER is not set - -# -# APM (Advanced Power Management) BIOS Support -# +# CONFIG_ACPI_SBS is not set # CONFIG_APM is not set # @@ -277,7 +277,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -349,7 +349,6 @@ CONFIG_NET=y # # Networking options # -# CONFIG_NETDEBUG is not set CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -388,6 +387,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set +# CONFIG_IPV6_OPTIMISTIC_DAD is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set @@ -443,6 +443,13 @@ CONFIG_IPV6_SIT=y # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set +# CONFIG_AF_RXRPC is not set + +# +# Wireless +# +# CONFIG_CFG80211 is not set +# CONFIG_WIRELESS_EXT is not set # CONFIG_IEEE80211 is not set # @@ -463,10 +470,6 @@ CONFIG_FW_LOADER=y # Connector - unified userspace <-> kernelspace linker # # CONFIG_CONNECTOR is not set - -# -# Memory Technology Devices (MTD) -# # CONFIG_MTD is not set # @@ -513,6 +516,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set +# CONFIG_THINKPAD_ACPI is not set # # ATA/ATAPI/MFM/RLL support @@ -548,7 +552,6 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_BLK_DEV_RZ1000 is not set CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_BLK_DEV_ALI15X3 is not set @@ -580,7 +583,6 @@ CONFIG_BLK_DEV_PIIX=y # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set -CONFIG_IDEDMA_AUTO=y # CONFIG_BLK_DEV_HD is not set # @@ -669,6 +671,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set # @@ -697,6 +700,7 @@ CONFIG_SATA_ACPI=y # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set # CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set # CONFIG_PATA_CMD64X is not set # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set @@ -762,10 +766,9 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set # -# Device Drivers +# Controllers # # @@ -774,10 +777,11 @@ CONFIG_IEEE1394=y CONFIG_IEEE1394_OHCI1394=y # -# Protocol Drivers +# Protocols # # CONFIG_IEEE1394_VIDEO1394 is not set # CONFIG_IEEE1394_SBP2 is not set +# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y @@ -820,7 +824,9 @@ CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set -# CONFIG_NET_VENDOR_3COM is not set +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set # # Tulip family network device support @@ -901,9 +907,10 @@ CONFIG_BNX2=y # CONFIG_TR is not set # -# Wireless LAN (non-hamradio) +# Wireless LAN # -# CONFIG_NET_RADIO is not set +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set # # Wan interfaces @@ -917,7 +924,6 @@ CONFIG_BNX2=y # CONFIG_SHAPER is not set CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y @@ -1050,7 +1056,7 @@ CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y -CONFIG_HANGCHECK_TIMER=y +# CONFIG_HANGCHECK_TIMER is not set # # TPM devices @@ -1141,6 +1147,14 @@ CONFIG_SOUND_ICH=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set +# +# USB Input Devices +# +CONFIG_USB_HID=y +# CONFIG_USB_HIDINPUT_POWERBOOK is not set +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set + # # USB support # @@ -1154,6 +1168,7 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y +# CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set # CONFIG_USB_OTG is not set @@ -1204,10 +1219,6 @@ CONFIG_USB_STORAGE=y # # USB Input Devices # -CONFIG_USB_HID=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set -# CONFIG_HID_FF is not set -# CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set # CONFIG_USB_ACECAD is not set @@ -1528,7 +1539,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set +CONFIG_TIMER_STATS=y # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set From fb27145d6ad2548d2d770f33ffa0a268c0afba85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 003/231] [PATCH] i386: revert i386-fix-the-verify_quirk_intel_irqbalance This is unneeded with Ingo's genapic rework. Cc: Suresh Siddha Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/quirks.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 34874c398b44..a01320a7b636 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -10,38 +10,13 @@ #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) { - u8 config, rev; - u32 word; - - /* BIOS may enable hardware IRQ balancing for - * E7520/E7320/E7525(revision ID 0x9 and below) - * based platforms. - * For those platforms, make sure that the genapic is set to 'flat' - */ - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); - if (rev > 0x9) - return; - - /* enable access to config space*/ - pci_read_config_byte(dev, 0xf4, &config); - pci_write_config_byte(dev, 0xf4, config|0x2); - - /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); - - if (!(word & (1 << 13))) { #ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); + if (genapic != &apic_flat) + panic("APIC mode must be flat on this system\n"); #elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); + if (genapic != &apic_default) + panic("APIC mode must be default(flat) on this system. Use apic=default\n"); #endif - } - - /* put back the original value for config space*/ - if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); } void __init quirk_intel_irqbalance(void) From 3dc68d9b58ae644cee8e218e3dcde0dceb5c47a3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 004/231] [PATCH] x86-64: revert x86_64-mm-add-genapic_force This is obsoleted by new Ingo genapic patches. Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic.c | 9 +-------- include/asm-x86_64/genapic.h | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 0b3603adf56d..7312ddb84fb4 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -33,7 +33,7 @@ extern struct genapic apic_flat; extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; -struct genapic *genapic_force; + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. @@ -46,13 +46,6 @@ void __init clustered_apic_check(void) u8 cluster_cnt[NUM_APIC_CLUSTERS]; int max_apic = 0; - /* genapic selection can be forced because of certain quirks. - */ - if (genapic_force) { - genapic = genapic_force; - goto print; - } - #if defined(CONFIG_ACPI) /* * Some x86_64 machines use physical APIC mode regardless of how many diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h index b80f4bb5f273..a0e9a4b93484 100644 --- a/include/asm-x86_64/genapic.h +++ b/include/asm-x86_64/genapic.h @@ -30,6 +30,6 @@ struct genapic { }; -extern struct genapic *genapic, *genapic_force, apic_flat; +extern struct genapic *genapic; #endif From a86f34b49f32b238d16b2e3bf6c9a5391a3f683f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 005/231] [PATCH] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Obsoleted by Ingo's genapic stuff. Cc: Ingo Molnar Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/earlyquirk.c | 21 -------------- arch/i386/kernel/quirks.c | 46 +++++++----------------------- arch/i386/kernel/smpboot.c | 6 ---- arch/x86_64/kernel/early-quirks.c | 13 --------- arch/x86_64/kernel/smpboot.c | 8 ------ include/asm-i386/genapic.h | 2 +- include/asm-i386/irq.h | 2 -- include/asm-x86_64/proto.h | 1 - 8 files changed, 12 insertions(+), 87 deletions(-) diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 8f7efd38254d..23f78efc577d 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,7 +10,6 @@ #include #include #include -#include #ifdef CONFIG_ACPI @@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device) return 0; } -static void check_intel(void) -{ - u16 vendor, device; - - vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); - - if (vendor != PCI_VENDOR_ID_INTEL) - return; - - device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - void __init check_acpi_pci(void) { int num, slot, func; @@ -77,8 +58,6 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; - check_intel(); - /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index a01320a7b636..9f6ab1789bb0 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,23 +3,10 @@ */ #include #include -#include -#include -#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) -{ -#ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); -#elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); -#endif -} -void __init quirk_intel_irqbalance(void) +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; u32 word; @@ -29,18 +16,18 @@ void __init quirk_intel_irqbalance(void) * based platforms. * Disable SW irqbalance/affinity on those platforms. */ - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev > 0x9) return; printk(KERN_INFO "Intel E7520/7320/7525 detected."); - /* enable access to config space */ - config = read_pci_config_byte(0, 0, 0, 0xf4); - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); /* read xTPR register */ - word = read_pci_config_16(0, 0, 0x40, 0x4c); + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); @@ -50,25 +37,14 @@ void __init quirk_intel_irqbalance(void) noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; -#endif -#ifdef CONFIG_HOTPLUG_CPU - printk(KERN_INFO "Disabling cpu hotplug control\n"); - enable_cpu_hotplug = 0; -#endif -#ifdef CONFIG_X86_64 - /* force the genapic selection to flat mode so that - * interrupts can be redirected to more than one CPU. - */ - genapic_force = &apic_flat; #endif } - /* put back the original value for config space */ + /* put back the original value for config space*/ if (!(config & 0x2)) - write_pci_config_byte(0, 0, 0, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); - +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4ff55e675576..7b14e88b555f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -1319,11 +1318,6 @@ int __cpuinit __cpu_up(unsigned int cpu) touch_nmi_watchdog(); } -#ifdef CONFIG_X86_GENERICARCH - if (num_online_cpus() > 8 && genapic == &apic_default) - panic("Default flat APIC routing can't be used with > 8 cpus\n"); -#endif - return 0; } diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index fede55a53995..990d9c218a5d 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -71,18 +71,6 @@ static void __init ati_bugs(void) } } -static void intel_bugs(void) -{ - u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); - -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - struct chipset { u16 vendor; void (*f)(void); @@ -92,7 +80,6 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, - { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index cd4643a37022..14724be48bec 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -60,7 +60,6 @@ #include #include #include -#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -965,13 +964,6 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - - if (num_online_cpus() > 8 && genapic == &apic_flat) { - printk(KERN_WARNING - "flat APIC routing can't be used with > 8 cpus\n"); - BUG(); - } - err = 0; return err; diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index fd2be593b06e..8ffbb0f07457 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -122,6 +122,6 @@ struct genapic { APICFUNC(phys_pkg_id) \ } -extern struct genapic *genapic, apic_default; +extern struct genapic *genapic; #endif diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 11761cdaae19..9e15ce0006eb 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -37,8 +37,6 @@ static __inline__ int irq_canonicalize(int irq) extern int irqbalance_disable(char *str); #endif -extern void quirk_intel_irqbalance(void); - #ifdef CONFIG_HOTPLUG_CPU extern void fixup_irqs(cpumask_t map); #endif diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index b6e65a699f2a..6688cf9eb27b 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -82,7 +82,6 @@ extern void syscall32_cpu_init(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); extern void early_quirks(void); -extern void quirk_intel_irqbalance(void); extern void check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); From f18d397e6aa5cde638d164b1d519c3ee903f4867 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 006/231] [PATCH] x86-64: optimize & fix APIC mode setup Fix a couple of inconsistencies/problems I found while reviewing the x86_64 genapic code (when I was chasing mysterious eth0 timeouts that would only trigger if CPU_HOTPLUG is enabled): - AMD systems defaulted to the slower flat-physical mode instead of the flat-logical mode. The only restriction on AMD systems is that they should not use clustered APIC mode. - removed the CPU hotplug hacks, switching the default for small systems back from phys-flat to logical-flat. The switching to logical flat mode on small systems fixed sporadic ethernet driver timeouts i was getting on a dual-core Athlon64 system: NETDEV WATCHDOG: eth0: transmit timed out eth0: Transmit timeout, status 0c 0005 c07f media 80. eth0: Tx queue start entry 32 dirty entry 28. eth0: Tx descriptor 0 is 0008a04a. (queue head) eth0: Tx descriptor 1 is 0008a04a. eth0: Tx descriptor 2 is 0008a04a. eth0: Tx descriptor 3 is 0008a04a. eth0: link up, 100Mbps, full-duplex, lpa 0xC5E1 - The use of '<= 8' was a bug by itself (the valid APIC ids for logical flat mode go from 0 to 7, not 0 to 8). The new logic is to use logical flat mode on both AMD and Intel systems, and to only switch to physical mode when logical mode cannot be used. If CPU hotplug is racy wrt. APIC shutdown then CPU hotplug needs fixing, not the whole IRQ system be made inconsistent and slowed down. - minor cleanups: simplified some code constructs build & booted on a couple of AMD and Intel SMP systems. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/genapic.c | 39 ++++++++++++++---------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 7312ddb84fb4..2f2b8fc6e2f3 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -32,21 +32,20 @@ extern struct genapic apic_cluster; extern struct genapic apic_flat; extern struct genapic apic_physflat; -struct genapic *genapic = &apic_flat; - +struct genapic __read_mostly *genapic = &apic_flat; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ void __init clustered_apic_check(void) { - long i; + int i; u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; int max_apic = 0; -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI /* * Some x86_64 machines use physical APIC mode regardless of how many * procs/clusters are present (x86_64 ES7000 is an example). @@ -68,20 +67,17 @@ void __init clustered_apic_check(void) cluster_cnt[APIC_CLUSTERID(id)]++; } - /* Don't use clustered mode on AMD platforms. */ + /* + * Don't use clustered mode on AMD platforms, default + * to flat logical mode. + */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - genapic = &apic_physflat; -#ifndef CONFIG_HOTPLUG_CPU - /* In the CPU hotplug case we cannot use broadcast mode - because that opens a race when a CPU is removed. - Stay at physflat mode in this case. - It is bad to do this unconditionally though. Once - we have ACPI platform support for CPU hotplug - we should detect hotplug capablity from ACPI tables and - only do this when really needed. -AK */ - if (max_apic <= 8) - genapic = &apic_flat; -#endif + /* + * Switch to physical flat mode if more than 8 APICs + * (In the case of 8 CPUs APIC ID goes from 0 to 7): + */ + if (max_apic >= 8) + genapic = &apic_physflat; goto print; } @@ -103,14 +99,9 @@ void __init clustered_apic_check(void) * (We don't use lowest priority delivery + HW APIC IRQ steering, so * can ignore the clustered logical case and go straight to physical.) */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { -#ifdef CONFIG_HOTPLUG_CPU - /* Don't use APIC shortcuts in CPU hotplug to avoid races */ - genapic = &apic_physflat; -#else + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) genapic = &apic_flat; -#endif - } else + else genapic = &apic_cluster; print: From 07c7c4744400f93a7c52b32159c31d823e1747a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 007/231] [PATCH] x86-64: always use physical delivery mode on > 8 CPUs Remove clustered APIC mode. There's little point in the use of clustered APIC mode, broadcasting is limited to within the cluster only, and chipsets have bugs in this area as well. So default to physical APIC mode when the CPU count is large, and default to logical APIC mode when the CPU count is 8 or smaller. (this patch only removes the use of genapic_cluster and cleans up the resulting genapic.c file - removal of all remaining traces of clustered mode will be done by another patch.) Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/genapic.c | 71 ++++++++---------------------------- include/asm-x86_64/genapic.h | 4 +- 2 files changed, 18 insertions(+), 57 deletions(-) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 2f2b8fc6e2f3..025f26ebb8d7 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -11,26 +11,24 @@ #include #include #include +#include #include #include #include -#include #include #include -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI #include #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly + = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -extern struct genapic apic_cluster; -extern struct genapic apic_flat; -extern struct genapic apic_physflat; +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; struct genapic __read_mostly *genapic = &apic_flat; @@ -39,76 +37,37 @@ struct genapic __read_mostly *genapic = &apic_flat; */ void __init clustered_apic_check(void) { - int i; - u8 clusters, max_cluster; + unsigned int i, max_apic = 0; u8 id; - u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int max_apic = 0; #ifdef CONFIG_ACPI /* - * Some x86_64 machines use physical APIC mode regardless of how many - * procs/clusters are present (x86_64 ES7000 is an example). + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { - genapic = &apic_cluster; - goto print; - } + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + genapic = &apic_physflat; #endif - memset(cluster_cnt, 0, sizeof(cluster_cnt)); for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; if (id == BAD_APICID) continue; if (id > max_apic) max_apic = id; - cluster_cnt[APIC_CLUSTERID(id)]++; } - /* - * Don't use clustered mode on AMD platforms, default - * to flat logical mode. - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - /* - * Switch to physical flat mode if more than 8 APICs - * (In the case of 8 CPUs APIC ID goes from 0 to 7): - */ - if (max_apic >= 8) - genapic = &apic_physflat; - goto print; - } - - clusters = 0; - max_cluster = 0; - - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (cluster_cnt[i] > 0) { - ++clusters; - if (cluster_cnt[i] > max_cluster) - max_cluster = cluster_cnt[i]; - } - } - - /* - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical - * else physical mode. - * (We don't use lowest priority delivery + HW APIC IRQ steering, so - * can ignore the clustered logical case and go straight to physical.) - */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + if (max_apic < 8) genapic = &apic_flat; else - genapic = &apic_cluster; + genapic = &apic_physflat; -print: printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } -/* Same for both flat and clustered. */ +/* Same for both flat and physical. */ void send_IPI_self(int vector) { diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h index a0e9a4b93484..d7e516ccbaa4 100644 --- a/include/asm-x86_64/genapic.h +++ b/include/asm-x86_64/genapic.h @@ -29,7 +29,9 @@ struct genapic { unsigned int (*phys_pkg_id)(int index_msb); }; - extern struct genapic *genapic; +extern struct genapic apic_flat; +extern struct genapic apic_physflat; + #endif From 424df390101f9dfe015f0633aaec696c159b37a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 008/231] [PATCH] x86-64: remove clustered APIC mode Remove now unused clustered APIC mode code. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/Makefile | 3 +- arch/x86_64/kernel/genapic_cluster.c | 137 --------------------------- 2 files changed, 1 insertion(+), 139 deletions(-) delete mode 100644 arch/x86_64/kernel/genapic_cluster.c diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index bb47e86f3d02..6879b4f01e88 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -21,8 +21,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o obj-y += apic.o nmi.o -obj-y += io_apic.o mpparse.o \ - genapic.o genapic_cluster.o genapic_flat.o +obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_PM) += suspend.o diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c deleted file mode 100644 index 73d76308b955..000000000000 --- a/arch/x86_64/kernel/genapic_cluster.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. - * (A more realistic maximum is around 230 CPUs.) - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void cluster_init_apic_ldr(void) -{ - unsigned long val, id; - long i, count; - u8 lid; - u8 my_id = hard_smp_processor_id(); - u8 my_cluster = APIC_CLUSTER(my_id); - - /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = NR_CPUS; --i >= 0; ) { - lid = x86_cpu_to_log_apicid[i]; - if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) - ++count; - } - /* - * We only have a 4 wide bitmap in cluster mode. There's no way - * to get above 60 CPUs and still give each one it's own bit. - * But, we're using physical IRQ delivery, so we don't care. - * Use bit 3 for the 4th through Nth CPU in each cluster. - */ - if (count >= XAPIC_DEST_CPUS_SHIFT) - count = 3; - id = my_cluster | (1UL << count); - x86_cpu_to_log_apicid[smp_processor_id()] = id; - apic_write(APIC_DFR, APIC_DFR_CLUSTER); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - -static cpumask_t cluster_target_cpus(void) -{ - return cpumask_of_cpu(0); -} - -static cpumask_t cluster_vector_allocation_domain(int cpu) -{ - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; -} - -static void cluster_send_IPI_mask(cpumask_t mask, int vector) -{ - send_IPI_mask_sequence(mask, vector); -} - -static void cluster_send_IPI_allbutself(int vector) -{ - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - cluster_send_IPI_mask(mask, vector); -} - -static void cluster_send_IPI_all(int vector) -{ - cluster_send_IPI_mask(cpu_online_map, vector); -} - -static int cluster_apic_id_registered(void) -{ - return 1; -} - -static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; - else - return BAD_APICID; -} - -/* cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value. For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. - */ -static unsigned int phys_pkg_id(int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -struct genapic apic_cluster = { - .name = "clustered", - .int_delivery_mode = dest_Fixed, - .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .target_cpus = cluster_target_cpus, - .vector_allocation_domain = cluster_vector_allocation_domain, - .apic_id_registered = cluster_apic_id_registered, - .init_apic_ldr = cluster_init_apic_ldr, - .send_IPI_all = cluster_send_IPI_all, - .send_IPI_allbutself = cluster_send_IPI_allbutself, - .send_IPI_mask = cluster_send_IPI_mask, - .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; From 3c43f03908de98fa8f7a9e8fc9411ebf4c2de298 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 009/231] [PATCH] x86: default to physical mode on hotplug CPU kernels Default to physical mode on hotplug CPU kernels. Furher simplify and clean up the APIC initialization code. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/boot.c | 2 +- arch/i386/kernel/mpparse.c | 2 +- arch/x86_64/kernel/genapic.c | 16 +++------------- arch/x86_64/kernel/mpparse.c | 2 +- include/asm-i386/genapic.h | 4 ++-- include/asm-i386/mach-bigsmp/mach_apic.h | 2 +- include/asm-i386/mach-default/mach_apic.h | 2 +- include/asm-i386/mach-es7000/mach_apic.h | 2 +- include/asm-i386/mach-generic/mach_apic.h | 2 +- include/asm-i386/mach-numaq/mach_apic.h | 2 +- include/asm-i386/mach-summit/mach_apic.h | 2 +- include/asm-i386/mach-visws/mach_apic.h | 2 +- include/asm-x86_64/apic.h | 2 +- 13 files changed, 16 insertions(+), 26 deletions(-) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 9ea5b8ecc7e1..280898b045b2 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -874,7 +874,7 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - clustered_apic_check(); + setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 4f5983c98669..0952eccd8f28 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } ++mpc_record; } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 025f26ebb8d7..c08650a427e2 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -35,11 +35,8 @@ struct genapic __read_mostly *genapic = &apic_flat; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ -void __init clustered_apic_check(void) +void __init setup_apic_routing(void) { - unsigned int i, max_apic = 0; - u8 id; - #ifdef CONFIG_ACPI /* * Quirk: some x86_64 machines can only use physical APIC mode @@ -49,17 +46,10 @@ void __init clustered_apic_check(void) if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) genapic = &apic_physflat; + else #endif - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id == BAD_APICID) - continue; - if (id > max_apic) - max_apic = id; - } - - if (max_apic < 8) + if (cpus_weight(cpu_possible_map) <= 8) genapic = &apic_flat; else genapic = &apic_physflat; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 455aa0b932f0..d0dc4891599b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } } } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index 8ffbb0f07457..33e3ffe1766c 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -36,7 +36,7 @@ struct genapic { void (*init_apic_ldr)(void); physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); - void (*clustered_apic_check)(void); + void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); int (*apicid_to_node)(int logical_apicid); int (*cpu_to_logical_apicid)(int cpu); @@ -99,7 +99,7 @@ struct genapic { APICFUNC(check_apicid_present) \ APICFUNC(init_apic_ldr) \ APICFUNC(ioapic_phys_id_map) \ - APICFUNC(clustered_apic_check) \ + APICFUNC(setup_apic_routing) \ APICFUNC(multi_timer_check) \ APICFUNC(apicid_to_node) \ APICFUNC(cpu_to_logical_apicid) \ diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h index 18b19a773440..ebd319f838ab 100644 --- a/include/asm-i386/mach-bigsmp/mach_apic.h +++ b/include/asm-i386/mach-bigsmp/mach_apic.h @@ -71,7 +71,7 @@ static inline void init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "Physflat", nr_ioapics); diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h index 3ef6292db780..6db1c3babe9a 100644 --- a/include/asm-i386/mach-default/mach_apic.h +++ b/include/asm-i386/mach-default/mach_apic.h @@ -54,7 +54,7 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) return phys_map; } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "Flat", nr_ioapics); diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h index 26333685a7fb..8e8b3949173a 100644 --- a/include/asm-i386/mach-es7000/mach_apic.h +++ b/include/asm-i386/mach-es7000/mach_apic.h @@ -81,7 +81,7 @@ static inline void enable_apic_mode(void) } extern int apic_version [MAX_APICS]; -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { int apic = bios_cpu_apicid[smp_processor_id()]; printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h index d9dc039da94a..a236e7021528 100644 --- a/include/asm-i386/mach-generic/mach_apic.h +++ b/include/asm-i386/mach-generic/mach_apic.h @@ -13,7 +13,7 @@ #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) #define ioapic_phys_id_map (genapic->ioapic_phys_id_map) -#define clustered_apic_check (genapic->clustered_apic_check) +#define setup_apic_routing (genapic->setup_apic_routing) #define multi_timer_check (genapic->multi_timer_check) #define apicid_to_node (genapic->apicid_to_node) #define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h index 9d158095da82..5e5e7dd2692e 100644 --- a/include/asm-i386/mach-numaq/mach_apic.h +++ b/include/asm-i386/mach-numaq/mach_apic.h @@ -34,7 +34,7 @@ static inline void init_apic_ldr(void) /* Already done in NUMA-Q firmware */ } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "NUMA-Q", nr_ioapics); diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h index 43e5bd8f4a19..732f776aab8e 100644 --- a/include/asm-i386/mach-summit/mach_apic.h +++ b/include/asm-i386/mach-summit/mach_apic.h @@ -80,7 +80,7 @@ static inline int apic_id_registered(void) return 1; } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", nr_ioapics); diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h index 18afe6b6fc4d..efac6f0d139f 100644 --- a/include/asm-i386/mach-visws/mach_apic.h +++ b/include/asm-i386/mach-visws/mach_apic.h @@ -47,7 +47,7 @@ static inline void summit_check(char *oem, char *productid) { } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { } diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 7cfb39cbd918..2f3b013595ab 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -83,7 +83,7 @@ extern void setup_secondary_APIC_clock (void); extern int APIC_init_uniprocessor (void); extern void disable_APIC_timer(void); extern void enable_APIC_timer(void); -extern void clustered_apic_check(void); +extern void setup_apic_routing(void); extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, unsigned char msg_type, unsigned char mask); From 28609f6e4921ebb75ceaf5317454b8047b07cef4 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 010/231] [PATCH] i386: adjustments to page table dump during oops (v4) - make the page table contents printing PAE capable - make sure the address stored in current->thread.cr2 is unmodified from what was read from CR2 - don't call oops_may_print() multiple times, when one time suffices - print pte even in highpte case, as long as the pte page isn't in actually in high memory (which is specifically the case for all page tables covering kernel space) (Changes to v3: Use sizeof()*2 rather than the suggested sizeof()*4 for printing width, use fixed 16-nibble width for PAE, and also apply the max_low_pfn range check to the middle level lookup on PAE.) Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/fault.c | 55 ++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index b8c4e259fc8b..c6a0a06258e6 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -20,6 +20,7 @@ #include #include /* For unblank_screen() */ #include +#include /* for max_low_pfn */ #include #include #include @@ -301,7 +302,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; - unsigned long page; int write, si_code; /* get the address */ @@ -510,7 +510,9 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, bust_spinlocks(1); if (oops_may_print()) { - #ifdef CONFIG_X86_PAE + __typeof__(pte_val(__pte(0))) page; + +#ifdef CONFIG_X86_PAE if (error_code & 16) { pte_t *pte = lookup_address(address); @@ -519,7 +521,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, "NX-protected page - exploit attempt? " "(uid: %d)\n", current->uid); } - #endif +#endif if (address < PAGE_SIZE) printk(KERN_ALERT "BUG: unable to handle kernel NULL " "pointer dereference"); @@ -529,25 +531,38 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); - } - page = read_cr3(); - page = ((unsigned long *) __va(page))[address >> 22]; - if (oops_may_print()) + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk(KERN_ALERT "*pdpt = %016Lx\n", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_ALERT "*pde = %016Lx\n", page); + page &= ~_PAGE_NX; + } +#else printk(KERN_ALERT "*pde = %08lx\n", page); - /* - * We must not directly access the pte in the highpte - * case, the page table might be allocated in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ -#ifndef CONFIG_HIGHPTE - if ((page & 1) && oops_may_print()) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); - } #endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT)) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page); + } + } + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; From 00f1ea696702163b7411d2316264525996c66ed3 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 011/231] [PATCH] x86: adjust inclusion of asm/fixmap.h Move inclusion of asm/fixmap.h to where it is really used rather than where it may have been used long ago (requires a few other adjustments to includes due to previous implicit dependencies). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic.c | 1 + arch/x86_64/kernel/genapic_flat.c | 1 + include/asm-i386/hpet.h | 2 -- include/asm-i386/kexec.h | 5 ----- include/asm-i386/pgalloc.h | 1 - include/asm-i386/smp.h | 7 ++----- include/asm-x86_64/ipi.h | 4 +--- include/asm-x86_64/pgalloc.h | 1 - include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/smp.h | 3 +-- 10 files changed, 6 insertions(+), 20 deletions(-) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index c08650a427e2..47496a40e84f 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -18,6 +18,7 @@ #include #include +#include #ifdef CONFIG_ACPI #include diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 7c01db8fa9d1..9e0a552f0e4a 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -16,6 +16,7 @@ #include #include #include +#include static cpumask_t flat_target_cpus(void) { diff --git a/include/asm-i386/hpet.h b/include/asm-i386/hpet.h index fc03cf9de5c4..dddeedf504b7 100644 --- a/include/asm-i386/hpet.h +++ b/include/asm-i386/hpet.h @@ -28,8 +28,6 @@ #include -#include - /* * Documentation on HPET can be found at: * http://www.intel.com/ial/home/sp/pcmmspec.htm diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h index 4dfc9f5ed031..c5b4ab95bdc2 100644 --- a/include/asm-i386/kexec.h +++ b/include/asm-i386/kexec.h @@ -21,7 +21,6 @@ #ifndef __ASSEMBLY__ -#include #include #include @@ -29,10 +28,6 @@ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, * and kmap is not required. - * - * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct - * calculation for the amount of memory directly mappable into the - * kernel memory space. */ /* Maximum physical address we can use pages from */ diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h index c8dc2d0141a7..47430175b75f 100644 --- a/include/asm-i386/pgalloc.h +++ b/include/asm-i386/pgalloc.h @@ -1,7 +1,6 @@ #ifndef _I386_PGALLOC_H #define _I386_PGALLOC_H -#include #include #include /* for struct page */ diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 6bf0033a301c..9cab1531c613 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -11,16 +11,13 @@ #include #endif -#ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ -#include +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) #include #include +#include #ifdef CONFIG_X86_IO_APIC #include #endif -#include -#endif #endif #define BAD_APICID 0xFFu diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h index 2a5c162b7d92..ffa6f1517f1a 100644 --- a/include/asm-x86_64/ipi.h +++ b/include/asm-x86_64/ipi.h @@ -18,10 +18,8 @@ * Subject to the GNU Public License, v.2 */ -#include #include -#include -#include +#include /* * the following functions deal with sending IPIs between CPUs. diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h index 4e28b6060a5e..31d497171969 100644 --- a/include/asm-x86_64/pgalloc.h +++ b/include/asm-x86_64/pgalloc.h @@ -1,7 +1,6 @@ #ifndef _X86_64_PGALLOC_H #define _X86_64_PGALLOC_H -#include #include #include #include diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 730bd6028416..5957361782fe 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -6,7 +6,6 @@ * the x86-64 page table tree. */ #include -#include #include #include #include diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index de592a408c07..f4236d7789aa 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -10,10 +10,9 @@ #include extern int disable_apic; -#include #include -#include #include +#include #include #ifdef CONFIG_SMP From b0354795c9c8fef2fadf8f867586c78efd9a1dc9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH 012/231] [PATCH] x86-64: adjust inclusion of asm/vsyscall32.h Avoid including asm/vsyscall32.h in virtually every source file. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/ia32/syscall32.c | 1 + include/asm-x86_64/fixmap.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c index 568ff0df89e7..fc4419ff0355 100644 --- a/arch/x86_64/ia32/syscall32.c +++ b/arch/x86_64/ia32/syscall32.c @@ -13,6 +13,7 @@ #include #include #include +#include extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h index 1b620db5b9e3..e90e1677531b 100644 --- a/include/asm-x86_64/fixmap.h +++ b/include/asm-x86_64/fixmap.h @@ -15,7 +15,6 @@ #include #include #include -#include /* * Here we define all the compile-time 'special' virtual From 9964cf7d776600724ef5f1b33303ceadc588b8ba Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 013/231] [PATCH] x86: consolidate smp_send_stop() Synchronize i386's smp_send_stop() with x86-64's in only try-locking the call lock to prevent deadlocks when called from panic(). In both version, disable interrupts before clearing the CPU off the online map to eliminate races with IRQ handlers inspecting this map. Also in both versions, save/restore interrupts rather than disabling/ enabling them. On x86-64, eliminate one function used here by folding it into its single caller, convert to static, and rename for consistency with i386 (lkcd may like this). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/smp.c | 68 ++++++++++++++++++++++------------------ arch/x86_64/kernel/smp.c | 28 ++++++----------- include/asm-x86_64/smp.h | 1 - 3 files changed, 48 insertions(+), 49 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 0e8977871b1f..0cd459baad68 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -515,35 +515,14 @@ void unlock_ipi_call_lock(void) static struct call_data_struct *call_data; -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <> or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) { struct call_data_struct data; - int cpus; + int cpus = num_online_cpus() - 1; - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - cpus = num_online_cpus() - 1; - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + if (!cpus) + return; data.func = func; data.info = info; @@ -565,6 +544,30 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, if (wait) while (atomic_read(&data.finished) != cpus) cpu_relax(); +} + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <> or are or have executed. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + __smp_call_function(func, info, nonatomic, wait); spin_unlock(&call_lock); return 0; @@ -573,11 +576,11 @@ EXPORT_SYMBOL(smp_call_function); static void stop_this_cpu (void * dummy) { + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) halt(); @@ -590,11 +593,16 @@ static void stop_this_cpu (void * dummy) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + /* Don't deadlock on the call lock in panic */ + int nolock = !spin_trylock(&call_lock); + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index af1ec4d23cf8..bd1d123947ce 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -452,42 +452,34 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, } EXPORT_SYMBOL(smp_call_function); -void smp_stop_cpu(void) +static void stop_this_cpu(void *dummy) { - unsigned long flags; + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); -} - -static void smp_really_stop_cpu(void *dummy) -{ - smp_stop_cpu(); for (;;) halt(); } void smp_send_stop(void) { - int nolock = 0; + int nolock; + unsigned long flags; + if (reboot_force) return; + /* Don't deadlock on the call lock in panic */ - if (!spin_trylock(&call_lock)) { - /* ignore locking because we have panicked anyways */ - nolock = 1; - } - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); + nolock = !spin_trylock(&call_lock); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); if (!nolock) spin_unlock(&call_lock); - - local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index f4236d7789aa..d5704421456b 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -37,7 +37,6 @@ extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_send_reschedule(int cpu); -void smp_stop_cpu(void); extern cpumask_t cpu_sibling_map[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; From f76c392380a40008ee6ecaea4e5a51a3a10282c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 014/231] [PATCH] i386: No need to use -traditional for processing asm in i386/kernel/ No need to use -traditional for processing asm in i386/kernel/ Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 2 -- arch/i386/kernel/entry.S | 2 +- include/asm-i386/percpu.h | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 4ae3dcf1d2f0..bd7753cb9e69 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -43,8 +43,6 @@ obj-$(CONFIG_VMI) += vmi.o vmitime.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-y += pcspeaker.o -EXTRA_AFLAGS := -traditional - obj-$(CONFIG_SCx200) += scx200.o # vsyscall.o contains the vsyscall DSO images as __initdata. diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 18bddcb8e9e8..922cc38dc405 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -635,7 +635,7 @@ ENTRY(name) \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ - call smp_/**/name; \ + call smp_##name; \ jmp ret_from_intr; \ CFI_ENDPROC; \ ENDPROC(name) diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h index 510ae1d3486c..a10e7c68ae9d 100644 --- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -20,10 +20,10 @@ #ifdef CONFIG_SMP #define PER_CPU(var, cpu) \ movl __per_cpu_offset(,cpu,4), cpu; \ - addl $per_cpu__/**/var, cpu; + addl $per_cpu__##var, cpu; #else /* ! SMP */ #define PER_CPU(var, cpu) \ - movl $per_cpu__/**/var, cpu; + movl $per_cpu__##var, cpu; #endif /* SMP */ #endif /* !__ASSEMBLY__ */ From 9215da33209b861b01c51382254b178a3fe92a30 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 015/231] [PATCH] i386: mtrr range check correction Whether a region is below 1Mb is determined by its start rather than its end. This hunk got erroneously dropped from a previous patch. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f77fc53db654..68b383788882 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -428,7 +428,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base + size < 0x100) { + if (base < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; From f5e8861583a591020176c90c10c6a130fed4f3ec Mon Sep 17 00:00:00 2001 From: takada Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 016/231] [PATCH] i386: pit_latch_buggy has no effect Eliminated the arch/i386/kernel/timers in 2.6.18, use clocksoures instead. pit_latch_buggy was referred in timers/timer_tsc.c, and currently removed. Therefore nobody refer it. Until 2.6.17, MediaGX's TSC works correctly. after 2.6.18, warned "TSC appears to be running slowly. Marking it as unstable". So marked unstable TSC when CS55x0. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 2 +- arch/i386/kernel/time.c | 2 -- include/asm-i386/timer.h | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index de27bd07bc9c..f0badfdd4e45 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if (vendor == PCI_VENDOR_ID_CYRIX && (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) - pit_latch_buggy = 1; + mark_tsc_unstable(); } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 94e5cb091104..a665df61f08c 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -70,8 +70,6 @@ #include -int pit_latch_buggy; /* extern */ - #include "do_timer.h" unsigned int cpu_khz; /* Detected as we calibrate the TSC */ diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index 12dd67bf760f..153770e25faa 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -9,8 +9,6 @@ void setup_pit_timer(void); unsigned long long native_sched_clock(void); unsigned long native_calculate_cpu_khz(void); -/* Modifiers for buggy PIT handling */ -extern int pit_latch_buggy; extern int timer_ack; extern int no_timer_check; extern int no_sync_cmos_clock; From 0949be35095b53dbaa72db700cb5074c5c249629 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 017/231] [PATCH] i386: Add an option for the VIA C7 which sets appropriate L1 cache The VIA C7 is a 686 (with TSC) that supports MMX, SSE and SSE2, it also has a cache line length of 64 according to http://www.digit-life.com/articles2/cpu/rmma-via-c7.html. This patch sets gcc to -march=686 and select s the correct cache shift. Signed-off-by: Simon Arlott Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Dave Jones Cc: Alan Cox Signed-off-by: Andrew Morton --- arch/i386/Kconfig.cpu | 13 ++++++++++--- arch/i386/Makefile.cpu | 1 + arch/um/defconfig | 1 + include/asm-i386/module.h | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index b99c0e2a4e63..b1af9f50a143 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -43,6 +43,7 @@ config M386 - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). + - "VIA C7" for VIA C7. If you don't know what to do, choose "386". @@ -203,6 +204,12 @@ config MVIAC3_2 of SSE and tells gcc to treat the CPU as a 686. Note, this kernel will not boot on older (pre model 9) C3s. +config MVIAC7 + bool "VIA C7" + help + Select this for a VIA C7. Selecting this uses the correct cache + shift and tells gcc to treat the CPU as a 686. + endchoice config X86_GENERIC @@ -231,7 +238,7 @@ config X86_L1_CACHE_SHIFT default "7" if MPENTIUM4 || X86_GENERIC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 config RWSEM_GENERIC_SPINLOCK bool @@ -297,7 +304,7 @@ config X86_ALIGNMENT_16 config X86_GOOD_APIC bool - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 + depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 default y config X86_INTEL_USERCOPY @@ -322,5 +329,5 @@ config X86_OOSTORE config X86_TSC bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ default y diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index a32c031c90d7..e0ada4e9e137 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) +cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686)) # AMD Elan support diff --git a/arch/um/defconfig b/arch/um/defconfig index 780cc0a4a128..f938fa822146 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -41,6 +41,7 @@ CONFIG_M686=y # CONFIG_MGEODE_LX is not set # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set +# CONFIG_MVIAC7 is not set # CONFIG_X86_GENERIC is not set CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h index 02f8f541cbe0..7e5fda6c3976 100644 --- a/include/asm-i386/module.h +++ b/include/asm-i386/module.h @@ -54,6 +54,8 @@ struct mod_arch_specific #define MODULE_PROC_FAMILY "CYRIXIII " #elif defined CONFIG_MVIAC3_2 #define MODULE_PROC_FAMILY "VIAC3-2 " +#elif defined CONFIG_MVIAC7 +#define MODULE_PROC_FAMILY "VIAC7 " #elif defined CONFIG_MGEODEGX1 #define MODULE_PROC_FAMILY "GEODEGX1 " #elif defined CONFIG_MGEODE_LX From 0adad171c2334d43fc2fa208f6b45e88f0679427 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 018/231] [PATCH] i386: probe_roms() cleanup Remove the assumption that if the first page of a legacy ROM is mapped, it'll all be mapped. This'll also stop people reading this code from wondering if they're looking at a bug... Signed-off-by: Rene Herman Signed-off-by: Martin Murray Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Zachary Amsden Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/e820.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 70f39560846a..31f4670ef740 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { { static int __init romsignature(const unsigned char *rom) { + const unsigned short * const ptr = (const unsigned short *)rom; unsigned short sig; - return probe_kernel_address((const unsigned short *)rom, sig) == 0 && - sig == ROMSIGNATURE; + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; } -static int __init romchecksum(unsigned char *rom, unsigned long length) +static int __init romchecksum(const unsigned char *rom, unsigned long length) { - unsigned char sum; + unsigned char sum, c; - for (sum = 0; length; length--) - sum += *rom++; - return sum == 0; + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; } static void __init probe_roms(void) { + const unsigned char *rom; unsigned long start, length, upper; - unsigned char *rom; - int i; + unsigned char c; + int i; /* video rom */ upper = adapter_rom_resources[0].start; @@ -191,8 +192,11 @@ static void __init probe_roms(void) video_rom_resource.start = start; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* if checksum okay, trust length byte */ if (length && romchecksum(rom, length)) @@ -226,8 +230,11 @@ static void __init probe_roms(void) if (!romsignature(rom)) continue; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* but accept any length that fits if checksum okay */ if (!length || start + length > upper || !romchecksum(rom, length)) From 3755090722e5a9c44780e5461ed9e49ab542bc73 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 019/231] [PATCH] x86-64: a few missing entry.S annotations Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed4350ced3d0..fa984b53e7e6 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -701,6 +701,7 @@ END(spurious_interrupt) CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* push real oldrax to the rdi slot */ CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC @@ -710,6 +711,7 @@ END(spurious_interrupt) XCPT_FRAME pushq %rax CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC @@ -817,6 +819,7 @@ paranoid_schedule\trace: */ KPROBE_ENTRY(error_entry) _frame RDI + CFI_REL_OFFSET rax,0 /* rdi slot contains rax, oldrax contains error code */ cld subq $14*8,%rsp @@ -824,6 +827,7 @@ KPROBE_ENTRY(error_entry) movq %rsi,13*8(%rsp) CFI_REL_OFFSET rsi,RSI movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + CFI_REGISTER rax,rsi movq %rdx,12*8(%rsp) CFI_REL_OFFSET rdx,RDX movq %rcx,11*8(%rsp) @@ -857,6 +861,7 @@ error_swapgs: swapgs error_sti: movq %rdi,RDI(%rsp) + CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) From 00e065ea587363e538d9624eea8cacad12cb7397 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 020/231] [PATCH] i386: Add dwarf2 annotations to *_user and checksum functions Signed-off-by: Andi Kleen --- arch/i386/lib/checksum.S | 69 ++++++++++++++++++++++++++++++++++------ arch/i386/lib/getuser.S | 26 +++++++++------ arch/i386/lib/putuser.S | 39 +++++++++++++++-------- 3 files changed, 102 insertions(+), 32 deletions(-) diff --git a/arch/i386/lib/checksum.S b/arch/i386/lib/checksum.S index 75ffd02654fc..adbccd0bbb78 100644 --- a/arch/i386/lib/checksum.S +++ b/arch/i386/lib/checksum.S @@ -25,6 +25,8 @@ * 2 of the License, or (at your option) any later version. */ +#include +#include #include /* @@ -36,8 +38,6 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ .text -.align 4 -.globl csum_partial #ifndef CONFIG_X86_USE_PPRO_CHECKSUM @@ -48,9 +48,14 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * Fortunately, it is easy to convert 2-byte alignment to 4-byte * alignment for the unrolled loop. */ -csum_partial: +ENTRY(csum_partial) + CFI_STARTPROC pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -128,16 +133,27 @@ csum_partial: roll $8, %eax 8: popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi ret + CFI_ENDPROC +ENDPROC(csum_partial) #else /* Version for PentiumII/PPro */ -csum_partial: +ENTRY(csum_partial) + CFI_STARTPROC pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -245,8 +261,14 @@ csum_partial: roll $8, %eax 90: popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi ret + CFI_ENDPROC +ENDPROC(csum_partial) #endif @@ -278,19 +300,24 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, .long 9999b, 6002f ; \ .previous -.align 4 -.globl csum_partial_copy_generic - #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC subl $4,%esp + CFI_ADJUST_CFA_OFFSET 4 pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -400,10 +427,19 @@ DST( movb %cl, (%edi) ) .previous popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi popl %ecx # equivalent to addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 ret + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) #else @@ -421,10 +457,17 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -485,9 +528,17 @@ DST( movb %dl, (%edi) ) .previous popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx ret + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) #undef ROUND #undef ROUND1 diff --git a/arch/i386/lib/getuser.S b/arch/i386/lib/getuser.S index 62d7f178a326..6d84b53f12a2 100644 --- a/arch/i386/lib/getuser.S +++ b/arch/i386/lib/getuser.S @@ -8,6 +8,8 @@ * return an error value in addition to the "real" * return value. */ +#include +#include #include @@ -24,19 +26,19 @@ */ .text -.align 4 -.globl __get_user_1 -__get_user_1: +ENTRY(__get_user_1) + CFI_STARTPROC GET_THREAD_INFO(%edx) cmpl TI_addr_limit(%edx),%eax jae bad_get_user 1: movzbl (%eax),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_1) -.align 4 -.globl __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + CFI_STARTPROC addl $1,%eax jc bad_get_user GET_THREAD_INFO(%edx) @@ -45,10 +47,11 @@ __get_user_2: 2: movzwl -1(%eax),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_2) -.align 4 -.globl __get_user_4 -__get_user_4: +ENTRY(__get_user_4) + CFI_STARTPROC addl $3,%eax jc bad_get_user GET_THREAD_INFO(%edx) @@ -57,11 +60,16 @@ __get_user_4: 3: movl -3(%eax),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_4) bad_get_user: + CFI_STARTPROC xorl %edx,%edx movl $-14,%eax ret + CFI_ENDPROC +END(bad_get_user) .section __ex_table,"a" .long 1b,bad_get_user diff --git a/arch/i386/lib/putuser.S b/arch/i386/lib/putuser.S index a32d9f570f48..f58fba109d18 100644 --- a/arch/i386/lib/putuser.S +++ b/arch/i386/lib/putuser.S @@ -8,6 +8,8 @@ * return an error value in addition to the "real" * return value. */ +#include +#include #include @@ -23,23 +25,28 @@ * as they get called from within inline assembly. */ -#define ENTER pushl %ebx ; GET_THREAD_INFO(%ebx) -#define EXIT popl %ebx ; ret +#define ENTER CFI_STARTPROC ; \ + pushl %ebx ; \ + CFI_ADJUST_CFA_OFFSET 4 ; \ + CFI_REL_OFFSET ebx, 0 ; \ + GET_THREAD_INFO(%ebx) +#define EXIT popl %ebx ; \ + CFI_ADJUST_CFA_OFFSET -4 ; \ + CFI_RESTORE ebx ; \ + ret ; \ + CFI_ENDPROC .text -.align 4 -.globl __put_user_1 -__put_user_1: +ENTRY(__put_user_1) ENTER cmpl TI_addr_limit(%ebx),%ecx jae bad_put_user 1: movb %al,(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_1) -.align 4 -.globl __put_user_2 -__put_user_2: +ENTRY(__put_user_2) ENTER movl TI_addr_limit(%ebx),%ebx subl $1,%ebx @@ -48,10 +55,9 @@ __put_user_2: 2: movw %ax,(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_2) -.align 4 -.globl __put_user_4 -__put_user_4: +ENTRY(__put_user_4) ENTER movl TI_addr_limit(%ebx),%ebx subl $3,%ebx @@ -60,10 +66,9 @@ __put_user_4: 3: movl %eax,(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_4) -.align 4 -.globl __put_user_8 -__put_user_8: +ENTRY(__put_user_8) ENTER movl TI_addr_limit(%ebx),%ebx subl $7,%ebx @@ -73,10 +78,16 @@ __put_user_8: 5: movl %edx,4(%ecx) xorl %eax,%eax EXIT +ENDPROC(__put_user_8) bad_put_user: + CFI_STARTPROC simple + CFI_DEF_CFA esp, 2*4 + CFI_OFFSET eip, -1*4 + CFI_OFFSET ebx, -2*4 movl $-14,%eax EXIT +END(bad_put_user) .section __ex_table,"a" .long 1b,bad_put_user From d18951834216eae82e2f9112416111b4f55f1849 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 021/231] [PATCH] x86: Fix i386 and x86_64 fault information pollution a userspace fault or a kernelspace fault which will result in the immediate death of the process. They should not be filled in as a result of a kernelspace fault which can be fixed up. Otherwise, if the process is handling SIGSEGV and examining the fault information, this can result in the kernel space fault trashing the previously stored fault information if it arrives between the userspace fault happening and the SIGSEGV being delivered to the process. Signed-off-by: Jeff Dike Signed-off-by: Andi Kleen Acked-by: Jan Beulich -- arch/i386/kernel/traps.c | 24 ++++++++++++++++++------ arch/x86_64/kernel/traps.c | 30 +++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 13 deletions(-) --- arch/i386/kernel/traps.c | 24 ++++++++++++++++++------ arch/x86_64/kernel/traps.c | 30 +++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index af0d3f70a817..58dfecc8e36c 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, siginfo_t *info) { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (regs->eflags & VM_MASK) { if (vm86) @@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, goto kernel_trap; trap_signal: { + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) force_sig_info(signr, info, tsk); else @@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } kernel_trap: { - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } @@ -603,9 +616,6 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, } put_cpu(); - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (regs->eflags & VM_MASK) goto gp_in_vm86; @@ -624,6 +634,8 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, gp_in_kernel: if (!fixup_exception(regs)) { + current->thread.error_code = error_code; + current->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 09d2e8a10a49..cceef5bd7302 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -581,10 +581,20 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (user_mode(regs)) { + /* + * We want error_code and trap_no set for userspace + * faults and kernelspace faults which result in + * die(), but not kernelspace faults which are fixed + * up. die() gives the process no chance to handle + * the signal and notice the kernel fault information, + * so that won't result in polluting the information + * about previously queued, but not yet delivered, + * faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (exception_trace && unhandled_signal(tsk, signr)) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", @@ -605,8 +615,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, fixup = search_exception_tables(regs->rip); if (fixup) regs->rip = fixup->fixup; - else + else { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } } @@ -682,10 +695,10 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, conditional_sti(regs); - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (user_mode(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", @@ -704,6 +717,9 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, regs->rip = fixup->fixup; return; } + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; From 803d80f65038f77c4681a0d7708e9d693e68aaa8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 022/231] [PATCH] x86-64: Some cleanup in time.c Move prototypes into header files Remove unneeded includes. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 5 +---- include/asm-x86_64/proto.h | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 75d73a9aa9ff..811b8f987b50 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -39,13 +39,10 @@ #include #include #include -#include #include #include #include - -extern void i8254_timer_resume(void); -extern int using_apic_timer; +#include static char *timename = NULL; diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 6688cf9eb27b..f64949fae61a 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -122,6 +122,8 @@ extern void smp_local_timer_interrupt(void); long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); +void i8254_timer_resume(void); + #define round_up(x,y) (((x) + (y) - 1) & ~((y)-1)) #define round_down(x,y) ((x) & ~((y)-1)) From bf8696ed6dfa561198b4736deaf11ab68dcc4845 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 023/231] [PATCH] i386: i386 make NMI use PERFCTR1 for architectural perfmon (take 2) Hello, This patch against 2.6.20-git14 makes the NMI watchdog use PERFSEL1/PERFCTR1 instead of PERFSEL0/PERFCTR0 on processors supporting Intel architectural perfmon, such as Intel Core 2. Although all PMU events can work on both counters, the Precise Event-Based Sampling (PEBS) requires that the event be in PERFCTR0 to work correctly (see section 18.14.4.1 in the IA32 SDM Vol 3b). A similar patch for x86-64 is to follow. Changelog: - make the i386 NMI watchdog use PERFSEL1/PERFCTR1 instead of PERFSEL0/PERFCTR0 on processors supporting the Intel architectural perfmon (e.g. Core 2 Duo). This allows PEBS to work when the NMI watchdog is active. signed-off-by: stephane eranian Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 84c3497efb60..aef22c881a0d 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -365,7 +365,7 @@ static int __init check_nmi_watchdog(void) nmi_hz = 1; if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { nmi_hz = adjust_for_32bit_ctr(nmi_hz); } } @@ -799,8 +799,8 @@ static int setup_intel_arch_watchdog(void) (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) goto fail; - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; if (!__reserve_perfctr_nmi(-1, perfctr_msr)) goto fail; @@ -1080,7 +1080,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) write_watchdog_counter(wd->perfctr_msr, NULL); } else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { /* P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt * other P6 variant. From 405e494d91bac85cc992f55ad434b0f325e399a5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 024/231] [PATCH] x86-64: x86_64 make NMI use PERFCTR1 for architectural perfmon (take 2) Hello, This patch against 2.6.20-git14 makes the NMI watchdog use PERFSEL1/PERFCTR1 instead of PERFSEL0/PERFCTR0 on processors supporting Intel architectural perfmon, such as Intel Core 2. Although all PMU events can work on both counters, the Precise Event-Based Sampling (PEBS) requires that the event be in PERFCTR0 to work correctly (see section 18.14.4.1 in the IA32 SDM Vol 3b). This versions has 3 chunks compared to previous where we had missed on check. Changelog: - make the x86-64 NMI watchdog use PERFSEL1/PERFCTR1 instead of PERFSEL0/PERFCTR0 on processors supporting the Intel architectural perfmon (e.g. Core 2 Duo). This allows PEBS to work when the NMI watchdog is active. signed-off-by: stephane eranian Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index dfab9f167366..010d3d9bd56a 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -348,7 +348,7 @@ int __init check_nmi_watchdog (void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) nmi_hz = adjust_for_32bit_ctr(nmi_hz); } @@ -713,8 +713,8 @@ static int setup_intel_arch_watchdog(void) (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) goto fail; - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; if (!__reserve_perfctr_nmi(-1, perfctr_msr)) goto fail; @@ -958,7 +958,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { /* * ArchPerfom/Core Duo needs to re-unmask * the apic vector From e319af1d8722bc3fb61ed6968c88bb66fbb3f58e Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 025/231] [PATCH] i386: Add __init to probe_bigsmp Add __init to probe_bigsmp. All callers are __init and data being examined is __initdata. Resolves MODPOST warning similar to: WARNING: vmlinux - Section mismatch: reference to .init.data: from .text between 'probe_bigsmp' (at offset 0xc0401e56) and 'init_apic_ldr' Signed-off-by: Prarit Bhargava Signed-off-by: Andi Kleen --- arch/i386/mach-generic/bigsmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c index 8a210fa915b5..e932d3485ae2 100644 --- a/arch/i386/mach-generic/bigsmp.c +++ b/arch/i386/mach-generic/bigsmp.c @@ -45,7 +45,7 @@ static struct dmi_system_id __initdata bigsmp_dmi_table[] = { }; -static int probe_bigsmp(void) +static int __init probe_bigsmp(void) { if (def_to_bigsmp) dmi_bigsmp = 1; From 86c0baf123e474b6eb404798926ecf62b426bf3a Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 026/231] [PATCH] i386: Change sysenter_setup to __cpuinit & improve __INIT, __INITDATA Change sysenter_setup to __cpuinit. Change __INIT & __INITDATA to be cpu hotplug aware. Resolve MODPOST warnings similar to: WARNING: vmlinux - Section mismatch: reference to .init.text:sysenter_setup from .text between 'identify_cpu' (at offset 0xc040a380) and 'detect_ht' and WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_int80_end from .text between 'sysenter_setup' (at offset 0xc041a269) and 'enable_sep_cpu' WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_int80_start from .text between 'sysenter_setup' (at offset 0xc041a26e) and 'enable_sep_cpu' WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_sysenter_end from .text between 'sysenter_setup' (at offset 0xc041a275) and 'enable_sep_cpu' WARNING: vmlinux - Section mismatch: reference to .init.data:vsyscall_sysenter_start from .text between 'sysenter_setup' (at offset 0xc041a27a) and 'enable_sep_cpu' Signed-off-by: Prarit Bhargava Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 2 +- include/linux/init.h | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 13ca54a85a1c..168f8147d3b4 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -72,7 +72,7 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; -int __init sysenter_setup(void) +int __cpuinit sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); syscall_pages[0] = virt_to_page(syscall_page); diff --git a/include/linux/init.h b/include/linux/init.h index e290a010e3f2..9abf120ec9f8 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -52,9 +52,14 @@ #endif /* For assembly routines */ +#ifdef CONFIG_HOTPLUG_CPU +#define __INIT .section ".text","ax" +#define __INITDATA .section ".data","aw" +#else #define __INIT .section ".init.text","ax" -#define __FINIT .previous #define __INITDATA .section ".init.data","aw" +#endif +#define __FINIT .previous #ifndef __ASSEMBLY__ /* From d9c93813ac17b34ee6eb7a424578acae6f90d759 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH 027/231] [PATCH] x86-64: Correct max number of CPUs in Kconfig Pointed out by Adrian Bunk Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 56eb14c98475..b3dbf11eb82c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -415,13 +415,13 @@ config OUT_OF_LINE_PFN_TO_PAGE depends on DISCONTIGMEM config NR_CPUS - int "Maximum number of CPUs (2-256)" + int "Maximum number of CPUs (2-255)" range 2 255 depends on SMP default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. Current maximum is 256 CPUs due to + kernel will support. Current maximum is 255 CPUs due to APIC addressing limits. Less depending on the hardware. This is purely to save memory - each supported CPU requires From 2a12652c0335ec90747d3402a82b6699ae883b58 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 028/231] [PATCH] i386: Support Oprofile for AMD Family 10 CPUs Signed-off-by: Andi Kleen --- arch/i386/oprofile/nmi_int.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 8fda7be9dd4d..695f737516ae 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -414,6 +414,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) user space an consistent name. */ cpu_type = "x86-64/hammer"; break; + case 0x10: + model = &op_athlon_spec; + cpu_type = "x86-64/family10"; + break; } break; From c8fdd247255a3a027cd9f66dcf93e6847d1d2f85 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 029/231] [PATCH] x86: Drop cc-options call for all options supported in gcc 3.2+ The kernel only supports gcc 3.2+ now so it doesn't make sense anymore to explicitely check for options this compiler version already has. This actually fixes a bug. The -mprefered-stack-boundary check never worked because gcc rightly complains CC arch/i386/kernel/asm-offsets.s cc1: -mpreferred-stack-boundary=2 is not between 4 and 12 We just never saw the error because of cc-options. I changed it to 4 to actually work. Tested by compiling i386 and x86-64 defconfig with gcc 3.2. Should speed up the build time a tiny bit and improve stack usage on i386 slightly. Signed-off-by: Andi Kleen --- Makefile | 2 +- arch/i386/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d970cb16545a..387526b69d4f 100644 --- a/Makefile +++ b/Makefile @@ -491,7 +491,7 @@ endif include $(srctree)/arch/$(ARCH)/Makefile ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else CFLAGS += -fomit-frame-pointer endif diff --git a/arch/i386/Makefile b/arch/i386/Makefile index bd28f9f9b4b7..6dc5e5d90fec 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -34,7 +34,7 @@ CHECKFLAGS += -D__i386__ CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return # prevent gcc from keeping the stack 16 byte aligned -CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) +CFLAGS += -mpreferred-stack-boundary=4 # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/i386/Makefile.cpu From 6d1c426158131b11d05d66e7dd6bf91e5b1b4fc7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 030/231] [PATCH] i386: Update __copy_to_user_inatomic linuxdoc description Explicity specify that the caller should pin the user memory otherwise the function will sleep Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andi Kleen --- include/asm-i386/uaccess.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h index 70829ae3ad52..e2aa5e0d0cc7 100644 --- a/include/asm-i386/uaccess.h +++ b/include/asm-i386/uaccess.h @@ -397,7 +397,19 @@ unsigned long __must_check __copy_from_user_ll_nocache(void *to, unsigned long __must_check __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n); -/* +/** + * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. + * + * Copy data from kernel space to user space. Caller must check + * the specified block with access_ok() before calling this function. + * The caller should also make sure he pins the user space address + * so that the we don't result in page fault and sleep. + * * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault * we return the initial request size (1, 2 or 4), as copy_*_user should do. * If a store crosses a page boundary and gets a fault, the x86 will not write From 973efae21beb2feda138f152ed06d4204774d93c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 031/231] [PATCH] i386: clean up mach_reboot_fixups The reboot_fixups stuff seems to be a bit of a mess, specifically the header is in linux/ when its a purely i386-specific piece of code. I'm not sure why it has its config option; its only currently needed for "geode-gx1/cs5530a", so perhaps whatever config option controls that hardware should enable this? Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/reboot.c | 6 +++++- arch/i386/kernel/reboot_fixups.c | 2 +- include/{linux => asm-i386}/reboot_fixups.h | 4 ---- 3 files changed, 6 insertions(+), 6 deletions(-) rename include/{linux => asm-i386}/reboot_fixups.h (61%) diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 3514b4153f7f..8b5ff6e15412 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -17,7 +17,7 @@ #include #include #include "mach_reboot.h" -#include +#include /* * Power off function, if any @@ -316,6 +316,10 @@ void machine_shutdown(void) #endif } +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + void machine_emergency_restart(void) { if (!reboot_thru_bios) { diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c index 99aab41a05b0..2d78d918340f 100644 --- a/arch/i386/kernel/reboot_fixups.c +++ b/arch/i386/kernel/reboot_fixups.c @@ -10,7 +10,7 @@ #include #include -#include +#include static void cs5530a_warm_reset(struct pci_dev *dev) { diff --git a/include/linux/reboot_fixups.h b/include/asm-i386/reboot_fixups.h similarity index 61% rename from include/linux/reboot_fixups.h rename to include/asm-i386/reboot_fixups.h index 480ea2d489d8..0cb7d87c2b68 100644 --- a/include/linux/reboot_fixups.h +++ b/include/asm-i386/reboot_fixups.h @@ -1,10 +1,6 @@ #ifndef _LINUX_REBOOT_FIXUPS_H #define _LINUX_REBOOT_FIXUPS_H -#ifdef CONFIG_X86_REBOOTFIXUPS extern void mach_reboot_fixups(void); -#else -#define mach_reboot_fixups() ((void)(0)) -#endif #endif /* _LINUX_REBOOT_FIXUPS_H */ From b0b1ff653a0ce88dc9d5cec4e67c9c2be0ba03ef Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 032/231] [PATCH] i386: Fix usage of -mtune when X86_GENERIC=y or CONFIG_MCORE2=y Hi! I sent this simple patch to lkml about two weeks ago and also cc'ed to Linus, but seems that the patch got ignored. I decided to write to you, because you have modified the relevant file most recently. Below is a copy of the mail that is also available at . Signed-off-by: Andi Kleen --- arch/i386/Makefile.cpu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index e0ada4e9e137..e372b584e919 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -4,9 +4,9 @@ #-mtune exists since gcc 3.4 HAS_MTUNE := $(call cc-option-yn, -mtune=i386) ifeq ($(HAS_MTUNE),y) -tune = $(call cc-option,-mtune=$(1),) +tune = $(call cc-option,-mtune=$(1),$(2)) else -tune = $(call cc-option,-mcpu=$(1),) +tune = $(call cc-option,-mcpu=$(1),$(2)) endif align := $(cc-option-align) @@ -33,7 +33,7 @@ cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 -cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686)) +cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 @@ -43,5 +43,5 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx # add at the end to overwrite eventual tuning options from earlier # cpu entries -cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic) +cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) From f9d09645d6157fefa18ff75930737060c8092ddb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 033/231] [PATCH] x86-64: Remove unused set_seg_base The set_seg_base function isn't used anywhere (2.6.21-rc3-git1) Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- include/asm-x86_64/desc.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h index 913d6ac00033..7726e74db536 100644 --- a/include/asm-x86_64/desc.h +++ b/include/asm-x86_64/desc.h @@ -107,16 +107,6 @@ static inline void set_ldt_desc(unsigned cpu, void *addr, int size) DESC_LDT, size * 8 - 1); } -static inline void set_seg_base(unsigned cpu, int entry, void *base) -{ - struct desc_struct *d = &cpu_gdt(cpu)[entry]; - u32 addr = (u32)(u64)base; - BUG_ON((u64)base >> 32); - d->base0 = addr & 0xffff; - d->base1 = (addr >> 16) & 0xff; - d->base2 = (addr >> 24) & 0xff; -} - #define LDT_entry_a(info) \ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) /* Don't allow setting of the lm bit. It is useless anyways because From fbc16f2c2a0e16dbd75ac85d3b6db97f92b642ba Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 034/231] [PATCH] x86-64: Remove duplicated code for reading control registers On Tue, Mar 13, 2007 at 05:33:09AM -0700, Randy.Dunlap wrote: > On Tue, 13 Mar 2007, Glauber de Oliveira Costa wrote: > > > Tiny cleanup: > > > > In x86_64, the same functions for reading cr3 and writing cr{3,4} are > > defined in tlbflush.h and system.h, whith just a name change. > > The only difference is the clobbering of memory, which seems a safe, and > > even needed change for the write_cr4. This patch removes the duplicate. > > write_cr3() is moved to system.h for consistency. > > missing patch..... > thanks. Attached now -- Glauber de Oliveira Costa Red Hat Inc. "Free as in Freedom" Signed-off-by: Andi Kleen --- include/asm-x86_64/system.h | 7 ++++++- include/asm-x86_64/tlbflush.h | 33 +++++---------------------------- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h index bd376bc8c4ab..213b7fe5d998 100644 --- a/include/asm-x86_64/system.h +++ b/include/asm-x86_64/system.h @@ -89,6 +89,11 @@ static inline unsigned long read_cr3(void) return cr3; } +static inline void write_cr3(unsigned long val) +{ + asm volatile("movq %0,%%cr3" :: "r" (val) : "memory"); +} + static inline unsigned long read_cr4(void) { unsigned long cr4; @@ -98,7 +103,7 @@ static inline unsigned long read_cr4(void) static inline void write_cr4(unsigned long val) { - asm volatile("movq %0,%%cr4" :: "r" (val)); + asm volatile("movq %0,%%cr4" :: "r" (val) : "memory"); } #define stts() write_cr0(8 | read_cr0()) diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h index 983bd296c81a..512401b8725f 100644 --- a/include/asm-x86_64/tlbflush.h +++ b/include/asm-x86_64/tlbflush.h @@ -3,41 +3,18 @@ #include #include - -static inline unsigned long get_cr3(void) -{ - unsigned long cr3; - asm volatile("mov %%cr3,%0" : "=r" (cr3)); - return cr3; -} - -static inline void set_cr3(unsigned long cr3) -{ - asm volatile("mov %0,%%cr3" :: "r" (cr3) : "memory"); -} +#include static inline void __flush_tlb(void) { - set_cr3(get_cr3()); -} - -static inline unsigned long get_cr4(void) -{ - unsigned long cr4; - asm volatile("mov %%cr4,%0" : "=r" (cr4)); - return cr4; -} - -static inline void set_cr4(unsigned long cr4) -{ - asm volatile("mov %0,%%cr4" :: "r" (cr4) : "memory"); + write_cr3(read_cr3()); } static inline void __flush_tlb_all(void) { - unsigned long cr4 = get_cr4(); - set_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */ - set_cr4(cr4); /* write old PGE again and flush TLBs */ + unsigned long cr4 = read_cr4(); + write_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */ + write_cr4(cr4); /* write old PGE again and flush TLBs */ } #define __flush_tlb_one(addr) \ From 6b37f5a20c0e5c334c010a587058354215433e92 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 035/231] [PATCH] x86-64: fix cpu MHz reporting on constant_tsc cpus This patch fixes the reporting of cpu_mhz in /proc/cpuinfo on CPUs with a constant TSC rate and a kernel with disabled cpufreq. Signed-off-by: Mark Langsdorf Signed-off-by: Joerg Roedel Signed-off-by: Andi Kleen arch/x86_64/kernel/apic.c | 2 - arch/x86_64/kernel/time.c | 58 +++++++++++++++++++++++++++++++++++++++--- arch/x86_64/kernel/tsc.c | 12 +++++--- arch/x86_64/kernel/tsc_sync.c | 2 - include/asm-x86_64/proto.h | 1 5 files changed, 65 insertions(+), 10 deletions(-) --- arch/x86_64/kernel/apic.c | 2 +- arch/x86_64/kernel/time.c | 58 +++++++++++++++++++++++++++++++++-- arch/x86_64/kernel/tsc.c | 12 +++++--- arch/x86_64/kernel/tsc_sync.c | 2 +- include/asm-x86_64/proto.h | 1 + 5 files changed, 65 insertions(+), 10 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index bd3e45d47c37..3421f21b6c70 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -843,7 +843,7 @@ static int __init calibrate_APIC_clock(void) } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); - result = (apic_start - apic) * 1000L * cpu_khz / + result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); } printk("result %d\n", result); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 811b8f987b50..5f862e216a42 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -43,6 +43,7 @@ #include #include #include +#include static char *timename = NULL; @@ -249,6 +250,51 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } +/* calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency */ +#define TICK_COUNT 100000000 +static unsigned int __init tsc_calibrate_cpu_khz(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start meauring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles_sync(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -336,14 +382,20 @@ void __init time_init(void) if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; - cpu_khz = hpet_calibrate_tsc(); + tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { pit_init(); - cpu_khz = pit_calibrate_tsc(); + tsc_khz = pit_calibrate_tsc(); timename = "PIT"; } + cpu_khz = tsc_khz; + if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 16) + cpu_khz = tsc_calibrate_cpu_khz(); + if (unsynchronized_tsc()) mark_tsc_unstable(); @@ -352,7 +404,7 @@ void __init time_init(void) else vgetcpu_mode = VGETCPU_LSL; - set_cyc2ns_scale(cpu_khz); + set_cyc2ns_scale(tsc_khz); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 1a0edbbffaa0..5c84992c676d 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -13,6 +13,8 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); +unsigned int tsc_khz; +EXPORT_SYMBOL(tsc_khz); static unsigned int cyc2ns_scale __read_mostly; @@ -77,7 +79,7 @@ static void handle_cpufreq_delayed_get(struct work_struct *v) static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; -static unsigned long cpu_khz_ref = 0; +static unsigned long tsc_khz_ref = 0; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -99,7 +101,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; + tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || @@ -107,12 +109,12 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable(); } - set_cyc2ns_scale(cpu_khz_ref); + set_cyc2ns_scale(tsc_khz_ref); return 0; } @@ -213,7 +215,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); void __init init_tsc_clocksource(void) { if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); if (check_tsc_unstable()) clocksource_tsc.rating = 0; diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c index 014f0db45dfa..72d444dede9b 100644 --- a/arch/x86_64/kernel/tsc_sync.c +++ b/arch/x86_64/kernel/tsc_sync.c @@ -50,7 +50,7 @@ static __cpuinit void check_tsc_warp(void) /* * The measurement runs for 20 msecs: */ - end = start + cpu_khz * 20ULL; + end = start + tsc_khz * 20ULL; now = start; for (i = 0; ; i++) { diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index f64949fae61a..78427021d94a 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -92,6 +92,7 @@ extern unsigned long table_start, table_end; extern int exception_trace; extern unsigned cpu_khz; +extern unsigned tsc_khz; extern void no_iommu_init(void); extern int force_iommu, no_iommu; From 19d1743315099665db4ce02c9942507a5ee1deea Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 036/231] [PATCH] i386: Simplify smp_call_function*() by using common implementation smp_call_function and smp_call_function_single are almost complete duplicates of the same logic. This patch combines them by implementing them in terms of the more general smp_call_function_mask(). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Stephane Eranian Cc: Andrew Morton Cc: Andi Kleen Cc: "Randy.Dunlap" Cc: Ingo Molnar --- arch/i386/kernel/smp.c | 183 +++++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 82 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 0cd459baad68..b90bebeb1c79 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -546,6 +546,73 @@ static void __smp_call_function(void (*func) (void *info), void *info, cpu_relax(); } + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <> or are or have finished. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) +{ + struct call_data_struct data; + cpumask_t allbutself; + int cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + spin_unlock(&call_lock); + + return 0; +} + /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. @@ -559,21 +626,44 @@ static void __smp_call_function(void (*func) (void *info), void *info, * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) { - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - __smp_call_function(func, info, nonatomic, wait); - spin_unlock(&call_lock); - - return 0; + return smp_call_function_mask(cpu_online_map, func, info, wait); } EXPORT_SYMBOL(smp_call_function); +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute + * or is or has executed. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int ret; + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); + + put_cpu(); + return ret; +} +EXPORT_SYMBOL(smp_call_function_single); + static void stop_this_cpu (void * dummy) { local_irq_disable(); @@ -641,77 +731,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) } } -/* - * this function sends a 'generic call function' IPI to one other CPU - * in the system. - * - * cpu is a standard Linux logical CPU number. - */ -static void -__smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = 1; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function_single - Run a function on another CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Retrurns 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute - * or is or has executed. - */ - -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int me = get_cpu(); - if (cpu == me) { - WARN_ON(1); - put_cpu(); - return -EBUSY; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock_bh(&call_lock); - __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); - put_cpu(); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - static int convert_apicid_to_cpu(int apic_id) { int i; From e65845045588806fa5c8df8a4f4253516515a5e3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 037/231] [PATCH] x86-64: dma_ops as const The dma_ops structure can be const since it never changes after boot. Signed-off-by: Stephen Hemminger Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 2 +- arch/x86_64/kernel/pci-gart.c | 2 +- arch/x86_64/kernel/pci-nommu.c | 2 +- arch/x86_64/kernel/pci-swiotlb.c | 2 +- arch/x86_64/mm/init.c | 2 +- include/asm-x86_64/dma-mapping.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 04480c3b68f5..5bd20b542c1e 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -507,7 +507,7 @@ void* calgary_alloc_coherent(struct device *dev, size_t size, return ret; } -static struct dma_mapping_ops calgary_dma_ops = { +static const struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 0bae862e9a55..0a762e10f2be 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -556,7 +556,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static struct dma_mapping_ops gart_dma_ops = { +static const struct dma_mapping_ops gart_dma_ops = { .mapping_error = NULL, .map_single = gart_map_single, .map_simple = gart_map_simple, diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index df09ab05a1bd..6dade0c867cc 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -79,7 +79,7 @@ void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, { } -struct dma_mapping_ops nommu_dma_ops = { +const struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .unmap_single = nommu_unmap_single, .map_sg = nommu_map_sg, diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index eb18be5a6569..4b4569abc60c 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -12,7 +12,7 @@ int swiotlb __read_mostly; EXPORT_SYMBOL(swiotlb); -struct dma_mapping_ops swiotlb_dma_ops = { +const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index ec31534eb104..5ca61731f550 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -46,7 +46,7 @@ #define Dprintk(x...) #endif -struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops* dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h index d2af227f06d0..6897e2a436e5 100644 --- a/include/asm-x86_64/dma-mapping.h +++ b/include/asm-x86_64/dma-mapping.h @@ -52,7 +52,7 @@ struct dma_mapping_ops { }; extern dma_addr_t bad_dma_address; -extern struct dma_mapping_ops* dma_ops; +extern const struct dma_mapping_ops* dma_ops; extern int iommu_merge; static inline int dma_mapping_error(dma_addr_t dma_addr) From 9d291e787b2b71d1b57e5fbb24ba9c70e748ed84 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 038/231] [PATCH] x86-64: Assembly safe page.h and pgtable.h This patch makes pgtable.h and page.h safe to include in assembly files like head.S. Allowing us to use symbolic constants instead of hard coded numbers when refering to the page tables. This patch copies asm-sparc64/const.h to asm-x86_64 to get a definition of _AC() a very convinient macro that allows us to force the type when we are compiling the code in C and to drop all of the type information when we are using the constant in assembly. Previously this was done with multiple definition of the same constant. const.h was modified slightly so that it works when given CONFIG options as arguments. This patch adds #ifndef __ASSEMBLY__ ... #endif and _AC(1,UL) where appropriate so the assembler won't choke on the header files. Otherwise nothing should have changed. AK: added const.h to exported headers to fix headers_check Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- include/asm-x86_64/Kbuild | 1 + include/asm-x86_64/const.h | 20 ++++++++++++++++++++ include/asm-x86_64/page.h | 28 ++++++++++------------------ include/asm-x86_64/pgtable.h | 33 +++++++++++++++++++++------------ 4 files changed, 52 insertions(+), 30 deletions(-) create mode 100644 include/asm-x86_64/const.h diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild index ebd7117782a6..242296ede3dd 100644 --- a/include/asm-x86_64/Kbuild +++ b/include/asm-x86_64/Kbuild @@ -18,3 +18,4 @@ header-y += vsyscall32.h unifdef-y += mce.h unifdef-y += mtrr.h unifdef-y += vsyscall.h +unifdef-y += const.h diff --git a/include/asm-x86_64/const.h b/include/asm-x86_64/const.h new file mode 100644 index 000000000000..54fb08f3db9b --- /dev/null +++ b/include/asm-x86_64/const.h @@ -0,0 +1,20 @@ +/* const.h: Macros for dealing with constants. */ + +#ifndef _X86_64_CONST_H +#define _X86_64_CONST_H + +/* Some constant macros are used in both assembler and + * C code. Therefore we cannot annotate them always with + * 'UL' and other type specificers unilaterally. We + * use the following macros to deal with this. + */ + +#ifdef __ASSEMBLY__ +#define _AC(X,Y) X +#else +#define __AC(X,Y) (X##Y) +#define _AC(X,Y) __AC(X,Y) +#endif + + +#endif /* !(_X86_64_CONST_H) */ diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 10f346165cab..d554b94485df 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -1,14 +1,11 @@ #ifndef _X86_64_PAGE_H #define _X86_64_PAGE_H +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#ifdef __ASSEMBLY__ -#define PAGE_SIZE (0x1 << PAGE_SHIFT) -#else -#define PAGE_SIZE (1UL << PAGE_SHIFT) -#endif +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) @@ -33,10 +30,10 @@ #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) +#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT) #define HPAGE_SHIFT PMD_SHIFT -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) @@ -76,29 +73,24 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) -#define __START_KERNEL_map 0xffffffff80000000UL -#define __PAGE_OFFSET 0xffff810000000000UL +#endif /* !__ASSEMBLY__ */ -#else #define __PHYSICAL_START CONFIG_PHYSICAL_START #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map 0xffffffff80000000 #define __PAGE_OFFSET 0xffff810000000000 -#endif /* !__ASSEMBLY__ */ /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) /* See Documentation/x86_64/mm.txt for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 46 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) +#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1) #define __VIRTUAL_MASK_SHIFT 48 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1) -#define KERNEL_TEXT_SIZE (40UL*1024*1024) -#define KERNEL_TEXT_START 0xffffffff80000000UL +#define KERNEL_TEXT_SIZE (40*1024*1024) +#define KERNEL_TEXT_START 0xffffffff80000000 #ifndef __ASSEMBLY__ @@ -106,7 +98,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* __ASSEMBLY__ */ -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) +#define PAGE_OFFSET __PAGE_OFFSET /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. Otherwise you risk miscompilation. */ diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 5957361782fe..c514deb658a3 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -1,6 +1,9 @@ #ifndef _X86_64_PGTABLE_H #define _X86_64_PGTABLE_H +#include +#ifndef __ASSEMBLY__ + /* * This file contains the functions and defines necessary to modify and use * the x86-64 page table tree. @@ -30,6 +33,8 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size); extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#endif /* !__ASSEMBLY__ */ + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ @@ -54,6 +59,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; */ #define PTRS_PER_PTE 512 +#ifndef __ASSEMBLY__ + #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e)) #define pmd_ERROR(e) \ @@ -117,22 +124,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) -#define PMD_SIZE (1UL << PMD_SHIFT) +#endif /* !__ASSEMBLY__ */ + +#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) #define FIRST_USER_ADDRESS 0 -#ifndef __ASSEMBLY__ -#define MAXMEM 0x3fffffffffffUL -#define VMALLOC_START 0xffffc20000000000UL -#define VMALLOC_END 0xffffe1ffffffffffUL -#define MODULES_VADDR 0xffffffff88000000UL -#define MODULES_END 0xfffffffffff00000UL +#define MAXMEM 0x3fffffffffff +#define VMALLOC_START 0xffffc20000000000 +#define VMALLOC_END 0xffffe1ffffffffff +#define MODULES_VADDR 0xffffffff88000000 +#define MODULES_END 0xfffffffffff00000 #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define _PAGE_BIT_PRESENT 0 @@ -158,7 +166,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define _PAGE_GLOBAL 0x100 /* Global TLB entry */ #define _PAGE_PROTNONE 0x080 /* If not present */ -#define _PAGE_NX (1UL<<_PAGE_BIT_NX) +#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -220,6 +228,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC +#ifndef __ASSEMBLY__ + static inline unsigned long pgd_bad(pgd_t pgd) { return pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); @@ -405,8 +415,6 @@ extern spinlock_t pgd_lock; extern struct page *pgd_list; void vmalloc_sync_all(void); -#endif /* !__ASSEMBLY__ */ - extern int kern_addr_valid(unsigned long addr); #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ @@ -436,5 +444,6 @@ extern int kern_addr_valid(unsigned long addr); #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME #include +#endif /* !__ASSEMBLY__ */ #endif /* _X86_64_PGTABLE_H */ From dafe41ee3a9389c08c91cdfd8670295f20f89e04 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 039/231] [PATCH] x86-64: Kill temp boot pmds Early in the boot process we need the ability to set up temporary mappings, before our normal mechanisms are initialized. Currently this is used to map pages that are part of the page tables we are building and pages during the dmi scan. The core problem is that we are using the user portion of the page tables to implement this. Which means that while this mechanism is active we cannot catch NULL pointer dereferences and we deviate from the normal ways of handling things. In this patch I modify early_ioremap to map pages into the kernel portion of address space, roughly where we will later put modules, and I make the discovery of which addresses we can use dynamic which removes all kinds of static limits and remove the dependencies on implementation details between different parts of the code. Now alloc_low_page() and unmap_low_page() use early_iomap() and early_iounmap() to allocate/map and unmap a page. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 3 -- arch/x86_64/mm/init.c | 100 +++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 58 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 598a4d0351fc..118c6088198a 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -288,9 +288,6 @@ NEXT_PAGE(level2_ident_pgt) .quad i << 21 | 0x083 i = i + 1 .endr - /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ - .globl temp_boot_pmds -temp_boot_pmds: .fill 492,8,0 NEXT_PAGE(level2_kernel_pgt) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 5ca61731f550..4ab3d40aac90 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -167,23 +167,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) unsigned long __initdata table_start, table_end; -extern pmd_t temp_boot_pmds[]; - -static struct temp_map { - pmd_t *pmd; - void *address; - int allocated; -} temp_mappings[] __initdata = { - { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, - { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, - {} -}; - -static __meminit void *alloc_low_page(int *index, unsigned long *phys) +static __meminit void *alloc_low_page(unsigned long *phys) { - struct temp_map *ti; - int i; - unsigned long pfn = table_end++, paddr; + unsigned long pfn = table_end++; void *adr; if (after_bootmem) { @@ -194,57 +180,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys) if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); - for (i = 0; temp_mappings[i].allocated; i++) { - if (!temp_mappings[i].pmd) - panic("alloc_low_page: ran out of temp mappings"); - } - ti = &temp_mappings[i]; - paddr = (pfn << PAGE_SHIFT) & PMD_MASK; - set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); - ti->allocated = 1; - __flush_tlb(); - adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); - memset(adr, 0, PAGE_SIZE); - *index = i; - *phys = pfn * PAGE_SIZE; - return adr; -} -static __meminit void unmap_low_page(int i) + adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); + memset(adr, 0, PAGE_SIZE); + *phys = pfn * PAGE_SIZE; + return adr; +} + +static __meminit void unmap_low_page(void *adr) { - struct temp_map *ti; if (after_bootmem) return; - ti = &temp_mappings[i]; - set_pmd(ti->pmd, __pmd(0)); - ti->allocated = 0; + early_iounmap(adr, PAGE_SIZE); } /* Must run before zap_low_mappings */ __init void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long map = round_down(addr, LARGE_PAGE_SIZE); + unsigned long vaddr; + pmd_t *pmd, *last_pmd; + int i, pmds; - /* actually usually some more */ - if (size >= LARGE_PAGE_SIZE) { - return NULL; + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + vaddr = __START_KERNEL_map; + pmd = level2_kernel_pgt; + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { + for (i = 0; i < pmds; i++) { + if (pmd_present(pmd[i])) + goto next; + } + vaddr += addr & ~PMD_MASK; + addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return (void *)vaddr; + next: + ; } - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - map += LARGE_PAGE_SIZE; - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } /* To avoid virtual aliases later */ __init void early_iounmap(void *addr, unsigned long size) { - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) - printk("early_iounmap: bad address %p\n", addr); - set_pmd(temp_mappings[0].pmd, __pmd(0)); - set_pmd(temp_mappings[1].pmd, __pmd(0)); + unsigned long vaddr; + pmd_t *pmd; + int i, pmds; + + vaddr = (unsigned long)addr; + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) + pmd_clear(pmd + i); __flush_tlb(); } @@ -289,7 +281,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { - int map; unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -307,12 +298,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne continue; } - pmd = alloc_low_page(&map, &pmd_phys); + pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); - unmap_low_page(map); + unmap_low_page(pmd); } __flush_tlb(); } @@ -364,7 +355,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - int map; unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); pud_t *pud; @@ -372,7 +362,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) if (after_bootmem) pud = pud_offset(pgd, start & PGDIR_MASK); else - pud = alloc_low_page(&map, &pud_phys); + pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; if (next > end) @@ -380,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(map); + unmap_low_page(pud); } if (!after_bootmem) From 67dcbb6bc6537aea92a2466bfc75f015b00e465e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 040/231] [PATCH] x86-64: Clean up the early boot page table - Merge physmem_pgt and ident_pgt, removing physmem_pgt. The merge is broken as soon as mm/init.c:init_memory_mapping is run. - As physmem_pgt is gone don't export it in pgtable.h. - Use defines from pgtable.h for page permissions. - Fix the physical memory identity mapping so it is at the correct address. - Remove the physical memory mapping from wakeup_level4_pgt it is at the wrong address so we can't possibly be usinging it. - Simply NEXT_PAGE the work to calculate the phys_ alias of the labels was very cool. Unfortuantely it was a brittle special purpose hack that makes maitenance more difficult. Instead just use label - __START_KERNEL_map like we do everywhere else in assembly. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 61 +++++++++++++++++------------------- include/asm-x86_64/pgtable.h | 1 - 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 118c6088198a..23b45e79a27c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -260,52 +261,48 @@ ljumpvector: ENTRY(stext) ENTRY(_stext) - $page = 0 #define NEXT_PAGE(name) \ - $page = $page + 1; \ - .org $page * 0x1000; \ - phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \ + .balign PAGE_SIZE; \ ENTRY(name) +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << 21) + (PERM) ; \ + i = i + 1 ; \ + .endr + NEXT_PAGE(init_level4_pgt) /* This gets initialized in x86_64_start_kernel */ .fill 512,8,0 NEXT_PAGE(level3_ident_pgt) - .quad phys_level2_ident_pgt | 0x007 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .fill 511,8,0 NEXT_PAGE(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad phys_level2_kernel_pgt | 0x007 + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .fill 1,8,0 NEXT_PAGE(level2_ident_pgt) - /* 40MB for bootup. */ - i = 0 - .rept 20 - .quad i << 21 | 0x083 - i = i + 1 - .endr - .fill 492,8,0 + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* 40MB kernel mapping. The kernel code cannot be bigger than that. When you change this change KERNEL_TEXT_SIZE in page.h too. */ /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - i = 0 - .rept 20 - .quad i << 21 | 0x183 - i = i + 1 - .endr + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_TEXT_SIZE/PMD_SIZE) /* Module mapping starts here */ - .fill 492,8,0 - -NEXT_PAGE(level3_physmem_pgt) - .quad phys_level2_kernel_pgt | 0x007 /* so that __va works even before pagetable_init */ - .fill 511,8,0 + .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 +#undef PMDS #undef NEXT_PAGE .data @@ -313,12 +310,10 @@ NEXT_PAGE(level3_physmem_pgt) #ifdef CONFIG_ACPI_SLEEP .align PAGE_SIZE ENTRY(wakeup_level4_pgt) - .quad phys_level3_ident_pgt | 0x007 - .fill 255,8,0 - .quad phys_level3_physmem_pgt | 0x007 - .fill 254,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad phys_level3_kernel_pgt | 0x007 + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE #endif #ifndef CONFIG_HOTPLUG_CPU @@ -332,12 +327,12 @@ ENTRY(wakeup_level4_pgt) */ .align PAGE_SIZE ENTRY(boot_level4_pgt) - .quad phys_level3_ident_pgt | 0x007 - .fill 255,8,0 - .quad phys_level3_physmem_pgt | 0x007 - .fill 254,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad phys_level3_kernel_pgt | 0x007 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE .data diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index c514deb658a3..5a5d43b3ef5d 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -14,7 +14,6 @@ #include extern pud_t level3_kernel_pgt[512]; -extern pud_t level3_physmem_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; From 93fd755e47d7b00fa5470199cfb14cbd9ded0284 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH 041/231] [PATCH] x86-64: Fix early printk to use standard ISA mapping Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/early_printk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 47b6d90349da..b7aa9abe1c6a 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -11,11 +11,10 @@ #ifdef __i386__ #include -#define VGABASE (__ISA_IO_base + 0xb8000) #else #include -#define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif +#define VGABASE (__ISA_IO_base + 0xb8000) static int max_ypos = 25, max_xpos = 80; static int current_ypos = 25, current_xpos = 0; From 278c0eb7f96586c02b2bfaa8e250d951919a2e6a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 042/231] [PATCH] x86-64: modify copy_bootdata to use virtual addresses Use virtual addresses instead of physical addresses in copy bootdata. In addition fix the implementation of the old bootloader convention. Everything is at real_mode_data always. It is just that sometimes real_mode_data was relocated by setup.S to not sit at 0x90000. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head64.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 5f197b0a330a..5c529c1e3d6c 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -29,25 +29,24 @@ static void __init clear_bss(void) } #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC_ADDR 0x20 #define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 +#define OLD_CL_OFFSET 0x22 static void __init copy_bootdata(char *real_mode_data) { - int new_data; + unsigned long new_data; char * command_line; memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { return; } - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); } - command_line = (char *) ((u64)(new_data)); + command_line = __va(new_data); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } @@ -74,7 +73,7 @@ void __init x86_64_start_kernel(char * real_mode_data) cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); - copy_bootdata(real_mode_data); + copy_bootdata(__va(real_mode_data)); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif From 30f472895401fbe8e64f861a2569bc9acb098741 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 043/231] [PATCH] x86-64: cleanup segments Move __KERNEL32_CS up into the unused gdt entry. __KERNEL32_CS is used when entering the kernel so putting it first is useful when trying to keep boot gdt sizes to a minimum. Set the accessed bit on all gdt entries. We don't care so there is no need for the cpu to burn the extra cycles, and it potentially allows the pages to be immutable. Plus it is confusing when debugging and your gdt entries mysteriously change. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 12 ++++++------ include/asm-x86_64/segment.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 23b45e79a27c..2b2e2c51e532 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -362,13 +362,13 @@ gdt: ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ + .quad 0x00af9b000000ffff /* __KERNEL_CS */ + .quad 0x00cf93000000ffff /* __KERNEL_DS */ + .quad 0x00cffb000000ffff /* __USER32_CS */ + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affb000000ffff /* __USER_CS */ .quad 0x0 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h index 334ddcdd8f92..adf2bf1e187c 100644 --- a/include/asm-x86_64/segment.h +++ b/include/asm-x86_64/segment.h @@ -6,7 +6,7 @@ #define __KERNEL_CS 0x10 #define __KERNEL_DS 0x18 -#define __KERNEL32_CS 0x38 +#define __KERNEL32_CS 0x08 /* * we cannot use the same code segment descriptor for user and kernel From 3c321bceb4a626639ab43a5a24d884930e511826 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 044/231] [PATCH] x86-64: Add EFER to the register set saved by save_processor_state EFER varies like %cr4 depending on the cpu capabilities, and which cpu capabilities we want to make use of. So save/restore it make certain we have the same EFER value when we are done. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/suspend.c | 3 ++- include/asm-x86_64/suspend.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 91f7e678bae7..fe865ea4df52 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -33,7 +33,6 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* EFER should be constant for kernel version, no need to handle it. */ /* * segment registers */ @@ -50,6 +49,7 @@ void __save_processor_state(struct saved_context *ctxt) /* * control registers */ + rdmsrl(MSR_EFER, ctxt->efer); asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); @@ -75,6 +75,7 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + wrmsrl(MSR_EFER, ctxt->efer); asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index bc7f81715e5e..a42306c220d6 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -17,6 +17,7 @@ struct saved_context { u16 ds, es, fs, gs, ss; unsigned long gs_base, gs_kernel_base, fs_base; unsigned long cr0, cr2, cr3, cr4, cr8; + unsigned long efer; u16 gdt_pad; u16 gdt_limit; unsigned long gdt_base; From 90b1c2085ea1955641e60a4f0acd63fdc271cd0b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 045/231] [PATCH] x86-64: 64bit PIC SMP trampoline This modifies the SMP trampoline and all of the associated code so it can jump to a 64bit kernel loaded at an arbitrary address. The dependencies on having an idenetity mapped page in the kernel page tables for SMP bootup have all been removed. In addition the trampoline has been modified to verify that long mode is supported. Asking if long mode is implemented is down right silly but we have traditionally had some of these checks, and they can't hurt anything. So when the totally ludicrous happens we just might handle it correctly. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 1 + arch/x86_64/kernel/setup.c | 9 +- arch/x86_64/kernel/trampoline.S | 168 +++++++++++++++++++++++++++++--- 3 files changed, 156 insertions(+), 22 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 2b2e2c51e532..562d62fbd69f 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -101,6 +101,7 @@ startup_32: .org 0x100 .globl startup_64 startup_64: +ENTRY(secondary_startup_64) /* We come here either from startup_32 * or directly from a 64bit bootloader. * Since we may have come directly from a bootloader we diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 3d98b696881d..4b114ee31ebc 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -329,15 +329,8 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index c79b99a9e2f6..13eee63c7bb5 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -3,6 +3,7 @@ * Trampoline.S Derived from Setup.S by Linus Torvalds * * 4 Jan 1997 Michael Chastain: changed to gnu as. + * 15 Sept 2005 Eric Biederman: 64bit PIC support * * Entry: CS:IP point to the start of our code, we are * in real mode with no stack, but the rest of the @@ -17,15 +18,20 @@ * and IP is zero. Thus, data addresses need to be absolute * (no relocation) and are taken with regard to r_base. * + * With the addition of trampoline_level4_pgt this code can + * now enter a 64bit kernel that lives at arbitrary 64bit + * physical addresses. + * * If you work on this file, check the object module with objdump * --full-contents --reloc to make sure there are no relocation - * entries. For the GDT entry we do hand relocation in smpboot.c - * because of 64bit linker limitations. + * entries. */ #include -#include +#include #include +#include +#include .data @@ -33,15 +39,31 @@ ENTRY(trampoline_data) r_base = . + cli # We should be safe anyway wbinvd mov %cs, %ax # Code and data in the same place mov %ax, %ds + mov %ax, %es + mov %ax, %ss - cli # We should be safe anyway movl $0xA5A5A5A5, trampoline_data - r_base # write marker for master knows we're running + # Setup stack + movw $(trampoline_stack_end - r_base), %sp + + call verify_cpu # Verify the cpu supports long mode + + mov %cs, %ax + movzx %ax, %esi # Find the 32bit trampoline location + shll $4, %esi + + # Fixup the vectors + addl %esi, startup_32_vector - r_base + addl %esi, startup_64_vector - r_base + addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default @@ -49,23 +71,141 @@ r_base = . * to 32 bit. */ - lidtl idt_48 - r_base # load idt with 0, 0 - lgdtl gdt_48 - r_base # load gdt with whatever is appropriate + lidtl tidt - r_base # load idt with 0, 0 + lgdtl tgdt - r_base # load gdt with whatever is appropriate xor %ax, %ax inc %ax # protected mode (PE) bit lmsw %ax # into protected mode - # flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S - ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector - r_base) + + .code32 + .balign 4 +startup_32: + movl $__KERNEL_DS, %eax # Initialize the %ds segment register + movl %eax, %ds + + xorl %eax, %eax + btsl $5, %eax # Enable PAE mode + movl %eax, %cr4 + + # Setup trampoline 4 level pagetables + leal (trampoline_level4_pgt - r_base)(%esi), %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + movl $(1 << _EFER_LME), %eax # Enable Long Mode + xorl %edx, %edx + wrmsr + + xorl %eax, %eax + btsl $31, %eax # Enable paging and in turn activate Long Mode + btsl $0, %eax # Enable protected mode + movl %eax, %cr0 + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + ljmp *(startup_64_vector - r_base)(%esi) + + .code64 + .balign 4 +startup_64: + # Now jump into the kernel using virtual addresses + movq $secondary_startup_64, %rax + jmp *%rax + + .code16 +verify_cpu: + pushl $0 # Kill any dangerous flags + popfl + + /* minimum CPUID flags for x86-64 */ + /* see http://www.x86-64.org/lists/discuss/msg02971.html */ +#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ + (1<<13)|(1<<15)|(1<<24)|(1<<25)|(1<<26)) +#define REQUIRED_MASK2 (1<<29) + + pushfl # check for cpuid + popl %eax + movl %eax, %ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + pushl %ebx + popfl + cmpl %eax, %ebx + jz no_longmode + + xorl %eax, %eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1, %eax + jb no_longmode + + movl $0x01, %eax # Does the cpu have what it takes? + cpuid + andl $REQUIRED_MASK1, %edx + xorl $REQUIRED_MASK1, %edx + jnz no_longmode + + movl $0x80000000, %eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001, %eax + jb no_longmode + + movl $0x80000001, %eax # Does the cpu have what it takes? + cpuid + andl $REQUIRED_MASK2, %edx + xorl $REQUIRED_MASK2, %edx + jnz no_longmode + + ret # The cpu supports long mode + +no_longmode: + hlt + jmp no_longmode + # Careful these need to be in the same 64K segment as the above; -idt_48: +tidt: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L -gdt_48: - .short GDT_ENTRIES*8 - 1 # gdt limit - .long cpu_gdt_table-__START_KERNEL_map + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 4 +tgdt: + .short tgdt_end - tgdt # gdt limit + .long tgdt - r_base + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tgdt_end: -.globl trampoline_end -trampoline_end: + .balign 4 +startup_32_vector: + .long startup_32 - r_base + .word __KERNEL32_CS, 0 + + .balign 4 +startup_64_vector: + .long startup_64 - r_base + .word __KERNEL_CS, 0 + +trampoline_stack: + .org 0x1000 +trampoline_stack_end: +ENTRY(trampoline_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + +ENTRY(trampoline_end) From 7c17e70613d2c70f9d5d0b5d37126fd5e8e9b728 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 046/231] [PATCH] x86-64: Get rid of dead code in suspend resume o Get rid of dead code in wakeup.S o We never restore from saved_gdt, saved_idt, saved_ltd, saved_tss, saved_cr3, saved_cr4, saved_cr0, real_save_gdt, saved_efer, saved_efer2. Get rid of of associated code. o Get rid of bogus_magic, bogus_31_magic and bogus_magic2. No longer being used. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/wakeup.S | 57 +------------------------------- 1 file changed, 1 insertion(+), 56 deletions(-) diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 185faa911db5..6ece70e91a32 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -258,8 +258,6 @@ gdt_48a: .word 0, 0 # gdt base (filled in later) -real_save_gdt: .word 0 - .quad 0 real_magic: .quad 0 video_mode: .quad 0 video_flags: .quad 0 @@ -272,10 +270,6 @@ bogus_32_magic: movb $0xb3,%al ; outb %al,$0x80 jmp bogus_32_magic -bogus_31_magic: - movb $0xb1,%al ; outb %al,$0x80 - jmp bogus_31_magic - bogus_cpu: movb $0xbc,%al ; outb %al,$0x80 jmp bogus_cpu @@ -346,16 +340,6 @@ check_vesaa: _setbada: jmp setbada - .code64 -bogus_magic: - movw $0x0e00 + 'B', %ds:(0xb8018) - jmp bogus_magic - -bogus_magic2: - movw $0x0e00 + '2', %ds:(0xb8018) - jmp bogus_magic2 - - wakeup_stack_begin: # Stack grows down .org 0xff0 @@ -373,28 +357,11 @@ ENTRY(wakeup_end) # # Returned address is location of code in low memory (past data and stack) # + .code64 ENTRY(acpi_copy_wakeup_routine) pushq %rax - pushq %rcx pushq %rdx - sgdt saved_gdt - sidt saved_idt - sldt saved_ldt - str saved_tss - - movq %cr3, %rdx - movq %rdx, saved_cr3 - movq %cr4, %rdx - movq %rdx, saved_cr4 - movq %cr0, %rdx - movq %rdx, saved_cr0 - sgdt real_save_gdt - wakeup_start (,%rdi) - movl $MSR_EFER, %ecx - rdmsr - movl %eax, saved_efer - movl %edx, saved_efer2 - movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) movl acpi_video_flags, %edx @@ -407,17 +374,8 @@ ENTRY(acpi_copy_wakeup_routine) cmpl $0x9abcdef0, %eax jne bogus_32_magic - # make sure %cr4 is set correctly (features, etc) - movl saved_cr4 - __START_KERNEL_map, %eax - movq %rax, %cr4 - - movl saved_cr0 - __START_KERNEL_map, %eax - movq %rax, %cr0 - jmp 1f # Flush pipelines -1: # restore the regs we used popq %rdx - popq %rcx popq %rax ENTRY(do_suspend_lowlevel_s4bios) ret @@ -512,16 +470,3 @@ ENTRY(saved_eip) .quad 0 ENTRY(saved_esp) .quad 0 ENTRY(saved_magic) .quad 0 - -ALIGN -# saved registers -saved_gdt: .quad 0,0 -saved_idt: .quad 0,0 -saved_ldt: .quad 0 -saved_tss: .quad 0 - -saved_cr0: .quad 0 -saved_cr3: .quad 0 -saved_cr4: .quad 0 -saved_efer: .quad 0 -saved_efer2: .quad 0 From 7db681d7e4038ad205b5face5cf7f7815633e1b5 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 047/231] [PATCH] x86-64: wakeup.S rename registers to reflect right names o Use appropriate names for 64bit regsiters. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/wakeup.S | 36 ++++++++++++++++---------------- include/asm-x86_64/suspend.h | 12 +++++------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 6ece70e91a32..17dbeff64eef 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -211,16 +211,16 @@ wakeup_long64: movw %ax, %es movw %ax, %fs movw %ax, %gs - movq saved_esp, %rsp + movq saved_rsp, %rsp movw $0x0e00 + 'x', %ds:(0xb8018) - movq saved_ebx, %rbx - movq saved_edi, %rdi - movq saved_esi, %rsi - movq saved_ebp, %rbp + movq saved_rbx, %rbx + movq saved_rdi, %rdi + movq saved_rsi, %rsi + movq saved_rbp, %rbp movw $0x0e00 + '!', %ds:(0xb801a) - movq saved_eip, %rax + movq saved_rip, %rax jmp *%rax .code32 @@ -408,13 +408,13 @@ do_suspend_lowlevel: movq %r15, saved_context_r15(%rip) pushfq ; popq saved_context_eflags(%rip) - movq $.L97, saved_eip(%rip) + movq $.L97, saved_rip(%rip) - movq %rsp,saved_esp - movq %rbp,saved_ebp - movq %rbx,saved_ebx - movq %rdi,saved_edi - movq %rsi,saved_esi + movq %rsp,saved_rsp + movq %rbp,saved_rbp + movq %rbx,saved_rbx + movq %rdi,saved_rdi + movq %rsi,saved_rsi addq $8, %rsp movl $3, %edi @@ -461,12 +461,12 @@ do_suspend_lowlevel: .data ALIGN -ENTRY(saved_ebp) .quad 0 -ENTRY(saved_esi) .quad 0 -ENTRY(saved_edi) .quad 0 -ENTRY(saved_ebx) .quad 0 +ENTRY(saved_rbp) .quad 0 +ENTRY(saved_rsi) .quad 0 +ENTRY(saved_rdi) .quad 0 +ENTRY(saved_rbx) .quad 0 -ENTRY(saved_eip) .quad 0 -ENTRY(saved_esp) .quad 0 +ENTRY(saved_rip) .quad 0 +ENTRY(saved_rsp) .quad 0 ENTRY(saved_magic) .quad 0 diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index a42306c220d6..9c3f8de90d2d 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -45,12 +45,12 @@ extern unsigned long saved_context_eflags; extern void fix_processor_context(void); #ifdef CONFIG_ACPI_SLEEP -extern unsigned long saved_eip; -extern unsigned long saved_esp; -extern unsigned long saved_ebp; -extern unsigned long saved_ebx; -extern unsigned long saved_esi; -extern unsigned long saved_edi; +extern unsigned long saved_rip; +extern unsigned long saved_rsp; +extern unsigned long saved_rbp; +extern unsigned long saved_rbx; +extern unsigned long saved_rsi; +extern unsigned long saved_rdi; /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); From 275f55170ec2b5d777b070cb8ab9e5d58e65a2a8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 048/231] [PATCH] x86-64: wakeup.S misc cleanups o Various cleanups. One of the main purpose of cleanups is that make wakeup.S as close as possible to trampoline.S. o Following are the changes - Indentations for comments. - Changed the gdt table to compact form and to resemble the one in trampoline.S - Take the jump to 32bit from real mode using ljmpl. Makes code more readable. - After enabling long mode, directly take a long jump for 64bit mode. No need to take an extra jump to "reach_comaptibility_mode" - Stack is not used after real mode. So don't load stack in 32 bit mode. - No need to enable PGE here. - No need to do extra EFER read, anyway we trash the read contents. - No need to enable system call (EFER_SCE). Anyway it will be enabled when original EFER is restored. - No need to set MP, ET, NE, WP, AM bits in cr0. Very soon we will reload the original cr0 while restroing the processor state. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/wakeup.S | 112 +++++++++++-------------------- 1 file changed, 40 insertions(+), 72 deletions(-) diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 17dbeff64eef..bd4c6f1a6f32 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -30,11 +30,12 @@ wakeup_code: cld # setup data segment movw %cs, %ax - movw %ax, %ds # Make ds:0 point to wakeup_start + movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss - mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board + # Private stack is needed for ASUS board + mov $(wakeup_stack - wakeup_code), %sp - pushl $0 # Kill any dangerous flags + pushl $0 # Kill any dangerous flags popfl movl real_magic - wakeup_code, %eax @@ -45,7 +46,7 @@ wakeup_code: jz 1f lcall $0xc000,$3 movw %cs, %ax - movw %ax, %ds # Bios might have played with that + movw %ax, %ds # Bios might have played with that movw %ax, %ss 1: @@ -75,9 +76,12 @@ wakeup_code: jmp 1f 1: - .byte 0x66, 0xea # prefix + jmpi-opcode - .long wakeup_32 - __START_KERNEL_map - .word __KERNEL_CS + ljmpl *(wakeup_32_vector - wakeup_code) + + .balign 4 +wakeup_32_vector: + .long wakeup_32 - __START_KERNEL_map + .word __KERNEL32_CS, 0 .code32 wakeup_32: @@ -96,65 +100,50 @@ wakeup_32: jnc bogus_cpu movl %edx,%edi - movw $__KERNEL_DS, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs + movl $__KERNEL_DS, %eax + movl %eax, %ds - movw $__KERNEL_DS, %ax - movw %ax, %ss - - mov $(wakeup_stack - __START_KERNEL_map), %esp movl saved_magic - __START_KERNEL_map, %eax cmpl $0x9abcdef0, %eax jne bogus_32_magic + movw $0x0e00 + 'i', %ds:(0xb8012) + movb $0xa8, %al ; outb %al, $0x80; + /* * Prepare for entering 64bits mode */ - /* Enable PAE mode and PGE */ + /* Enable PAE */ xorl %eax, %eax btsl $5, %eax - btsl $7, %eax movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - /* Fool rdmsr and reset %eax to avoid dependences */ - xorl %eax, %eax /* Enable Long Mode */ + xorl %eax, %eax btsl $_EFER_LME, %eax - /* Enable System Call */ - btsl $_EFER_SCE, %eax - /* No Execute supported? */ + /* No Execute supported? */ btl $20,%edi jnc 1f btsl $_EFER_NX, %eax -1: /* Make changes effective */ +1: movl $MSR_EFER, %ecx + xorl %edx, %edx wrmsr - wbinvd xorl %eax, %eax btsl $31, %eax /* Enable paging and in turn activate Long Mode */ btsl $0, %eax /* Enable protected mode */ - btsl $1, %eax /* Enable MP */ - btsl $4, %eax /* Enable ET */ - btsl $5, %eax /* Enable NE */ - btsl $16, %eax /* Enable WP */ - btsl $18, %eax /* Enable AM */ /* Make changes effective */ movl %eax, %cr0 + /* At this point: CR4.PAE must be 1 CS.L must be 0 @@ -162,11 +151,6 @@ wakeup_32: Next instruction must be a branch This must be on identity-mapped page */ - jmp reach_compatibility_mode -reach_compatibility_mode: - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - /* * At this point we're in long mode but in 32bit compatibility mode * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn @@ -174,24 +158,19 @@ reach_compatibility_mode: * the new gdt/idt that has __KERNEL_CS with CS.L = 1. */ - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - - /* Load new GDT with the 64bit segment using 32bit descriptor */ - movl $(pGDT32 - __START_KERNEL_map), %eax - lgdt (%eax) - - movl $(wakeup_jumpvector - __START_KERNEL_map), %eax /* Finally jump in 64bit mode */ - ljmp *(%eax) + ljmp *(wakeup_long64_vector - __START_KERNEL_map) -wakeup_jumpvector: - .long wakeup_long64 - __START_KERNEL_map - .word __KERNEL_CS + .balign 4 +wakeup_long64_vector: + .long wakeup_long64 - __START_KERNEL_map + .word __KERNEL_CS, 0 .code64 - /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ + /* Hooray, we are in Long 64-bit mode (but still running in + * low memory) + */ wakeup_long64: /* * We must switch to a new descriptor in kernel space for the GDT @@ -201,6 +180,9 @@ wakeup_long64: */ lgdt cpu_gdt_descr - __START_KERNEL_map + movw $0x0e00 + 'n', %ds:(0xb8014) + movb $0xa9, %al ; outb %al, $0x80 + movw $0x0e00 + 'u', %ds:(0xb8016) nop @@ -227,33 +209,19 @@ wakeup_long64: .align 64 gdta: + /* Its good to keep gdt in sync with one in trampoline.S */ .word 0, 0, 0, 0 # dummy - - .word 0, 0, 0, 0 # unused - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9200 # data read/write - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) -# this is 64bit descriptor for code - .word 0xFFFF - .word 0 - .word 0x9A00 # code read/exec - .word 0x00AF # as above, but it is long mode and with D=0 + /* ??? Why I need the accessed bit set in order for this to work? */ + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS idt_48a: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L gdt_48a: - .word 0x8000 # gdt limit=2048, + .word 0x800 # gdt limit=2048, # 256 GDT entries .word 0, 0 # gdt base (filled in later) @@ -263,7 +231,7 @@ video_mode: .quad 0 video_flags: .quad 0 bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 + movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic bogus_32_magic: From d8e1baf10d62c06fc52e89137357e54da3d92672 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 049/231] [PATCH] x86-64: 64bit ACPI wakeup trampoline o Moved wakeup_level4_pgt into the wakeup routine so we can run the kernel above 4G. o Now we first go to 64bit mode and continue to run from trampoline and then then start accessing kernel symbols and restore processor context. This enables us to resume even in relocatable kernel context when kernel might not be loaded at physical addr it has been compiled for. o Removed the need for modifying any existing kernel page table. o Increased the size of the wakeup routine to 8K. This is required as wake page tables are on trampoline itself and they got to be at 4K boundary, hence one page is not sufficient. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/sleep.c | 24 +++---------- arch/x86_64/kernel/acpi/wakeup.S | 59 ++++++++++++++++++++------------ arch/x86_64/kernel/head.S | 9 ----- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index e1548fbe95ae..195b7034a148 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -60,19 +60,6 @@ extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); -static pgd_t low_ptr; - -static void init_low_mapping(void) -{ - pgd_t *slot0 = pgd_offset(current->mm, 0UL); - low_ptr = *slot0; - /* FIXME: We're playing with the current task's page tables here, which - * is potentially dangerous on SMP systems. - */ - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); - local_flush_tlb(); -} - /** * acpi_save_state_mem - save kernel state * @@ -81,8 +68,6 @@ static void init_low_mapping(void) */ int acpi_save_state_mem(void) { - init_low_mapping(); - memcpy((void *)acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); acpi_copy_wakeup_routine(acpi_wakeup_address); @@ -95,8 +80,6 @@ int acpi_save_state_mem(void) */ void acpi_restore_state_mem(void) { - set_pgd(pgd_offset(current->mm, 0UL), low_ptr); - local_flush_tlb(); } /** @@ -109,10 +92,11 @@ void acpi_restore_state_mem(void) */ void __init acpi_reserve_bootmem(void) { - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); + "ACPI: Wakeup code way too big, will crash on attempt" + " to suspend\n"); } static int __init acpi_sleep_setup(char *str) diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index bd4c6f1a6f32..766cfbcac1db 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -1,6 +1,7 @@ .text #include #include +#include #include #include @@ -62,12 +63,15 @@ wakeup_code: movb $0xa2, %al ; outb %al, $0x80 - lidt %ds:idt_48a - wakeup_code - xorl %eax, %eax - movw %ds, %ax # (Convert %ds:gdt to a linear ptr) - shll $4, %eax - addl $(gdta - wakeup_code), %eax - movl %eax, gdt_48a +2 - wakeup_code + mov %ds, %ax # Find 32bit wakeup_code addr + movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) + shll $4, %esi + # Fix up the vectors + addl %esi, wakeup_32_vector - wakeup_code + addl %esi, wakeup_long64_vector - wakeup_code + addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer + + lidtl %ds:idt_48a - wakeup_code lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate @@ -80,7 +84,7 @@ wakeup_code: .balign 4 wakeup_32_vector: - .long wakeup_32 - __START_KERNEL_map + .long wakeup_32 - wakeup_code .word __KERNEL32_CS, 0 .code32 @@ -103,10 +107,6 @@ wakeup_32: movl $__KERNEL_DS, %eax movl %eax, %ds - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic - movw $0x0e00 + 'i', %ds:(0xb8012) movb $0xa8, %al ; outb %al, $0x80; @@ -120,7 +120,7 @@ wakeup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax + leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax movl %eax, %cr3 /* Enable Long Mode */ @@ -159,11 +159,11 @@ wakeup_32: */ /* Finally jump in 64bit mode */ - ljmp *(wakeup_long64_vector - __START_KERNEL_map) + ljmp *(wakeup_long64_vector - wakeup_code)(%esi) .balign 4 wakeup_long64_vector: - .long wakeup_long64 - __START_KERNEL_map + .long wakeup_long64 - wakeup_code .word __KERNEL_CS, 0 .code64 @@ -178,11 +178,16 @@ wakeup_long64: * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr - __START_KERNEL_map + lgdt cpu_gdt_descr movw $0x0e00 + 'n', %ds:(0xb8014) movb $0xa9, %al ; outb %al, $0x80 + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic + movw $0x0e00 + 'u', %ds:(0xb8016) nop @@ -223,20 +228,21 @@ idt_48a: gdt_48a: .word 0x800 # gdt limit=2048, # 256 GDT entries - .word 0, 0 # gdt base (filled in later) - + .long gdta - wakeup_code # gdt base (relocated in later) real_magic: .quad 0 video_mode: .quad 0 video_flags: .quad 0 +.code16 bogus_real_magic: movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic -bogus_32_magic: +.code64 +bogus_64_magic: movb $0xb3,%al ; outb %al,$0x80 - jmp bogus_32_magic + jmp bogus_64_magic bogus_cpu: movb $0xbc,%al ; outb %al,$0x80 @@ -263,6 +269,7 @@ bogus_cpu: #define VIDEO_FIRST_V7 0x0900 # Setting of user mode (AX=mode ID) => CF=success +.code16 mode_seta: movw %ax, %bx #if 0 @@ -313,6 +320,13 @@ wakeup_stack_begin: # Stack grows down .org 0xff0 wakeup_stack: # Just below end of page +.org 0x1000 +ENTRY(wakeup_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + ENTRY(wakeup_end) ## @@ -338,9 +352,10 @@ ENTRY(acpi_copy_wakeup_routine) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic # restore the regs we used popq %rdx diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 562d62fbd69f..926aa2197aaa 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -308,15 +308,6 @@ NEXT_PAGE(level2_kernel_pgt) .data -#ifdef CONFIG_ACPI_SLEEP - .align PAGE_SIZE -ENTRY(wakeup_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE -#endif - #ifndef CONFIG_HOTPLUG_CPU __INITDATA #endif From bdb96a6614cfaba24e23dd9de4040c068c3af19b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 050/231] [PATCH] x86-64: Modify discover_ebda to use virtual addresses Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 4b114ee31ebc..65e2bc551a2b 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -205,10 +205,10 @@ static void discover_ebda(void) * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); ebda_addr <<= 4; - ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + ebda_size = *(unsigned short *)__va(ebda_addr); /* Round EBDA up to pages */ if (ebda_size == 0) From cfd243d4af7c7f8f52f5cb99d3932d9074b039ff Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 051/231] [PATCH] x86-64: Remove the identity mapping as early as possible With the rewrite of the SMP trampoline and the early page allocator there is nothing that needs identity mapped pages, once we start executing C code. So add zap_identity_mappings into head64.c and remove zap_low_mappings() from much later in the code. The functions are subtly different thus the name change. This also kills boot_level4_pgt which was from an earlier attempt to move the identity mappings as early as possible, and is now no longer needed. Essentially I have replaced boot_level4_pgt with trampoline_level4_pgt in trampoline.S Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 39 +++++++++++++----------------------- arch/x86_64/kernel/head64.c | 17 ++++++++++------ arch/x86_64/kernel/setup.c | 2 -- arch/x86_64/kernel/setup64.c | 1 - arch/x86_64/mm/init.c | 24 ---------------------- include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/proto.h | 2 -- 7 files changed, 25 insertions(+), 61 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 926aa2197aaa..c211e52f1333 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -71,7 +71,7 @@ startup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(boot_level4_pgt - __START_KERNEL_map), %eax + movl $(init_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 /* Setup EFER (Extended Feature Enable Register) */ @@ -115,7 +115,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(boot_level4_pgt - __START_KERNEL_map), %rax + movq $(init_level4_pgt - __START_KERNEL_map), %rax movq %rax, %cr3 /* Check if nx is implemented */ @@ -274,9 +274,19 @@ ENTRY(name) i = i + 1 ; \ .endr + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ NEXT_PAGE(init_level4_pgt) - /* This gets initialized in x86_64_start_kernel */ - .fill 512,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -307,27 +317,6 @@ NEXT_PAGE(level2_kernel_pgt) #undef NEXT_PAGE .data - -#ifndef CONFIG_HOTPLUG_CPU - __INITDATA -#endif - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ - .align PAGE_SIZE -ENTRY(boot_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 257,8,0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 252,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE - - .data - .align 16 .globl cpu_gdt_descr cpu_gdt_descr: diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 5c529c1e3d6c..6c34bdd22e26 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -18,8 +18,16 @@ #include #include #include +#include #include +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb(); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -57,18 +65,15 @@ void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + /* Make NULL pointers segfault */ + zap_identity_mappings(); + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); early_printk("Kernel alive\n"); - /* - * switch to init_level4_pgt from boot_level4_pgt - */ - memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - for (i = 0; i < NR_CPUS; i++) cpu_pda(i) = &boot_cpu_pda[i]; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 65e2bc551a2b..0e2b8df0ea64 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -274,8 +274,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - zap_low_mappings(0); - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 6a70b55f719d..53064a9a365f 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -201,7 +201,6 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); - zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 4ab3d40aac90..b0a607892183 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -378,21 +378,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) __flush_tlb_all(); } -void __cpuinit zap_low_mappings(int cpu) -{ - if (cpu == 0) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - } else { - /* - * For AP's, zap the low identity mappings by changing the cr3 - * to init_level4_pgt and doing local flush tlb all - */ - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - } - __flush_tlb_all(); -} - #ifndef CONFIG_NUMA void __init paging_init(void) { @@ -569,15 +554,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - -#ifdef CONFIG_SMP - /* - * Sync boot_level4_pgt mappings with the init_level4_pgt - * except for the low identity mappings which are already zapped - * in init_level4_pgt. This sync-up is essential for AP's bringup - */ - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); -#endif } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 5a5d43b3ef5d..703f0243f27a 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -17,7 +17,6 @@ extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; -extern pgd_t boot_level4_pgt[]; extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 78427021d94a..3f8f285138d2 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -11,8 +11,6 @@ struct pt_regs; extern void start_kernel(void); extern void pda_init(int); -extern void zap_low_mappings(int cpu); - extern void early_idt_handler(void); extern void mcheck_init(struct cpuinfo_x86 *c); From 49c3df6aaa6a51071fc135273d1a2515d019099f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 052/231] [PATCH] x86: Move swsusp __pa() dependent code to arch portion o __pa() should be used only on kernel linearly mapped virtual addresses and not on kernel text and data addresses. o Hibernation code needs to determine the physical address associated with kernel symbol to mark a section boundary which contains pages which don't have to be saved and restored during hibernate/resume operation. o Move this piece of code in arch dependent section. So that architectures which don't have kernel text/data mapped into kernel linearly mapped region can come up with their own ways of determining physical addresses associated with a kernel text. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/power/suspend.c | 14 ++++++++++++++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/suspend.c | 24 ++++++++++++++++++++++++ arch/x86_64/kernel/suspend.c | 14 ++++++++++++++ kernel/power/power.h | 5 ++--- kernel/power/snapshot.c | 11 ----------- 6 files changed, 55 insertions(+), 14 deletions(-) create mode 100644 arch/powerpc/kernel/suspend.c diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c index db5e98d2eb73..a0020b913f31 100644 --- a/arch/i386/power/suspend.c +++ b/arch/i386/power/suspend.c @@ -16,6 +16,9 @@ /* Defined in arch/i386/power/swsusp.S */ extern int restore_image(void); +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; @@ -156,3 +159,14 @@ int swsusp_arch_resume(void) restore_image(); return 0; } + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e0fa80eca366..aa693d0f151a 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o obj32-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_32.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj32-$(CONFIG_MODULES) += module_32.o ifeq ($(CONFIG_PPC_MERGE),y) diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c new file mode 100644 index 000000000000..8cee57107541 --- /dev/null +++ b/arch/powerpc/kernel/suspend.c @@ -0,0 +1,24 @@ +/* + * Suspend support specific for power. + * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index fe865ea4df52..4ca523d58a5b 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -13,6 +13,9 @@ #include #include +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + struct saved_context saved_context; unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; @@ -220,4 +223,15 @@ int swsusp_arch_resume(void) restore_image(); return 0; } + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} #endif /* CONFIG_SOFTWARE_SUSPEND */ diff --git a/kernel/power/power.h b/kernel/power/power.h index eb461b816bf4..1c6eef8df4ad 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -23,6 +23,8 @@ static inline int pm_suspend_disk(void) } #endif +extern int pfn_is_nosave(unsigned long); + extern struct mutex pm_mutex; #define power_attr(_name) \ @@ -37,9 +39,6 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; extern int in_suspend; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fc53ad068128..704c25a3ffec 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -650,17 +650,6 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } static inline unsigned int count_highmem_pages(void) { return 0; } #endif /* CONFIG_HIGHMEM */ -/** - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -static inline int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - /** * saveable - Determine whether a non-highmem page should be included in * the suspend image. From 1b29c1643c0d82512477ccd97dc290198fe23e22 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 053/231] [PATCH] x86-64: do not use virt_to_page on kernel data address o virt_to_page() call should be used on kernel linear addresses and not on kernel text and data addresses. Swsusp code uses it on kernel data (statically allocated swsusp_header). o Allocate swsusp_header dynamically so that virt_to_page() can be used safely. o I am changing this because in next few patches, __pa() on x86_64 will no longer support kernel text and data addresses and hibernation breaks. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- kernel/power/swap.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3581f8f86acd..b18c155cbb60 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -33,12 +33,14 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" -static struct swsusp_header { +struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; sector_t image; char orig_sig[10]; char sig[10]; -} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; +} __attribute__((packed)); + +static struct swsusp_header *swsusp_header; /* * General things @@ -141,14 +143,14 @@ static int mark_swapfiles(sector_t start) { int error; - bio_read_page(swsusp_resume_block, &swsusp_header, NULL); - if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { - memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); - memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.image = start; + bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { + memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); + memcpy(swsusp_header->sig,SWSUSP_SIG, 10); + swsusp_header->image = start; error = bio_write_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); } else { printk(KERN_ERR "swsusp: Swap header not found!\n"); error = -ENODEV; @@ -564,7 +566,7 @@ int swsusp_read(void) if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header.image); + error = get_swap_reader(&handle, swsusp_header->image); if (!error) error = swap_read_page(&handle, header, NULL); if (!error) @@ -591,17 +593,17 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - memset(&swsusp_header, 0, sizeof(swsusp_header)); + memset(swsusp_header, 0, sizeof(PAGE_SIZE)); error = bio_read_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); if (error) return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ error = bio_write_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); } else { return -EINVAL; } @@ -632,3 +634,13 @@ void swsusp_close(void) blkdev_put(resume_bdev); } + +static int swsusp_header_init(void) +{ + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); + if (!swsusp_header) + panic("Could not allocate memory for swsusp_header\n"); + return 0; +} + +core_initcall(swsusp_header_init); From 0dbf7028c0c1f266c9631139450a1502d3cd457e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 054/231] [PATCH] x86: __pa and __pa_symbol address space separation Currently __pa_symbol is for use with symbols in the kernel address map and __pa is for use with pointers into the physical memory map. But the code is implemented so you can usually interchange the two. __pa which is much more common can be implemented much more cheaply if it is it doesn't have to worry about any other kernel address spaces. This is especially true with a relocatable kernel as __pa_symbol needs to peform an extra variable read to resolve the address. There is a third macro that is added for the vsyscall data __pa_vsymbol for finding the physical addesses of vsyscall pages. Most of this patch is simply sorting through the references to __pa or __pa_symbol and using the proper one. A little of it is continuing to use a physical address when we have it instead of recalculating it several times. swapper_pgd is now NULL. leave_mm now uses init_mm.pgd and init_mm.pgd is initialized at boot (instead of compile time) to the physmem virtual mapping of init_level4_pgd. The physical address changed. Except for the for EMPTY_ZERO page all of the remaining references to __pa_symbol appear to be during kernel initialization. So this should reduce the cost of __pa in the common case, even on a relocated kernel. As this is technically a semantic change we need to be on the lookout for anything I missed. But it works for me (tm). Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/alternative.c | 4 ++-- arch/i386/mm/init.c | 15 ++++++++------- arch/x86_64/kernel/machine_kexec.c | 14 +++++++------- arch/x86_64/kernel/setup.c | 9 +++++---- arch/x86_64/kernel/smp.c | 2 +- arch/x86_64/kernel/vsyscall.c | 9 +++++++-- arch/x86_64/mm/init.c | 21 +++++++++++---------- arch/x86_64/mm/pageattr.c | 16 ++++++++-------- include/asm-x86_64/page.h | 6 ++---- include/asm-x86_64/pgtable.h | 4 ++-- 10 files changed, 53 insertions(+), 47 deletions(-) diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 426f59b0106b..a27c8d347364 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -402,8 +402,8 @@ void __init alternative_instructions(void) _text, _etext); } free_init_pages("SMP alternatives", - (unsigned long)__smp_alt_begin, - (unsigned long)__smp_alt_end); + __pa_symbol(&__smp_alt_begin), + __pa_symbol(&__smp_alt_end)); } else { alternatives_smp_save(__smp_alt_instructions, __smp_alt_instructions_end); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index ae436882af7a..23be1b0aafa4 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -774,10 +774,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); @@ -786,14 +787,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 0497e3bd5bff..a8bb33c1a8f2 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; - page_list[PA_PGD] = __pa(kexec_pgd); + page_list[PA_PGD] = __pa_symbol(&kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; - page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0); page_list[VA_PUD_0] = (unsigned long)kexec_pud0; - page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0); page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0); page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1); page_list[VA_PUD_1] = (unsigned long)kexec_pud1; - page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1); page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; - page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; page_list[PA_TABLE_PAGE] = diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0e2b8df0ea64..b9bdfc1b54f8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -243,11 +243,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; + init_mm.pgd = __va(__pa_symbol(&init_level4_pgt)); - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; + code_resource.start = __pa_symbol(&_text); + code_resource.end = __pa_symbol(&_etext)-1; + data_resource.start = __pa_symbol(&_etext); + data_resource.end = __pa_symbol(&_edata)-1; early_identify_cpu(&boot_cpu_data); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index bd1d123947ce..22abae4e9f39 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -76,7 +76,7 @@ static inline void leave_mm(int cpu) if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); + load_cr3(init_mm.pgd); } /* diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b43c698cf7d3..d14cbb3e0ebe 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -45,6 +45,11 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" +#define __pa_vsymbol(x) \ + ({unsigned long v; \ + extern char __vsyscall_0; \ + asm("" : "=r" (v) : "0" (x)); \ + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) struct vsyscall_gtod_data_t { seqlock_t lock; @@ -224,10 +229,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, return ret; /* gcc has some trouble with __va(__pa()), so just do it this way. */ - map1 = ioremap(__pa_symbol(&vsysc1), 2); + map1 = ioremap(__pa_vsymbol(&vsysc1), 2); if (!map1) return -ENOMEM; - map2 = ioremap(__pa_symbol(&vsysc2), 2); + map2 = ioremap(__pa_vsymbol(&vsysc2), 2); if (!map2) { ret = -ENOMEM; goto out; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index b0a607892183..69e22d3c9238 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -565,11 +565,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } } @@ -579,17 +579,18 @@ void free_initmem(void) memset(__initdata_begin, POISON_FREE_INITDATA, __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata)); + unsigned long end = (unsigned long)__va(__pa_symbol(&__end_rodata)); - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) + for (; addr < end; addr += PAGE_SIZE) change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); printk ("Write protecting the kernel read-only data: %luk\n", @@ -608,7 +609,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 081409aa3452..76ee90a5abe0 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, SetPagePrivate(base); page_private(base) = 0; - address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { @@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage) * No more special protections in this 2/4MB area - revert to a * large page again. */ -static void revert_page(unsigned long address, pgprot_t ref_prot) +static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t large_pte; - unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); @@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ struct page *split; ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); + split = split_large_page(pfn << PAGE_SHIFT, prot, + ref_prot2); if (!split) return -ENOMEM; set_pte(kpte, mk_pte(split, ref_prot2)); @@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, if (page_private(kpte_page) == 0) { save_page(kpte_page); - revert_page(address, ref_prot); + revert_page(address, pfn, ref_prot); } return 0; } @@ -180,6 +178,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { + unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; int err = 0; int i; @@ -192,10 +191,11 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) break; /* Handle kernel mapping too which aliases part of the * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { + if ((pfn >= phys_base_pfn) && + ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) { unsigned long addr2; pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); + addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT); /* Make sure the kernel mappings stay executable */ prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); err = __change_page_attr(addr2, pfn, prot2, diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index d554b94485df..4974433bbf34 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -102,17 +102,15 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. Otherwise you risk miscompilation. */ -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - __pa(v); }) + (v - __START_KERNEL_map); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define __boot_va(x) __va(x) -#define __boot_pa(x) __pa(x) #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) ((pfn) < end_pfn) #endif diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 703f0243f27a..c1865e38c7b7 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; extern unsigned long __supported_pte_mask; -#define swapper_pg_dir init_level4_pgt +#define swapper_pg_dir ((pgd_t *)NULL) extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); @@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size); * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT)) #endif /* !__ASSEMBLY__ */ From 1ab60e0f72f71ec54831e525a3e1154f1c092408 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH 055/231] [PATCH] x86-64: Relocatable Kernel Support This patch modifies the x86_64 kernel so that it can be loaded and run at any 2M aligned address, below 512G. The technique used is to compile the decompressor with -fPIC and modify it so the decompressor is fully relocatable. For the main kernel the page tables are modified so the kernel remains at the same virtual address. In addition a variable phys_base is kept that holds the physical address the kernel is loaded at. __pa_symbol is modified to add that when we take the address of a kernel symbol. When loaded with a normal bootloader the decompressor will decompress the kernel to 2M and it will run there. This both ensures the relocation code is always working, and makes it easier to use 2M pages for the kernel and the cpu. AK: changed to not make RELOCATABLE default in Kconfig Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 49 +++- arch/x86_64/boot/compressed/Makefile | 12 +- arch/x86_64/boot/compressed/head.S | 322 +++++++++++++++++------- arch/x86_64/boot/compressed/misc.c | 247 +++++++++--------- arch/x86_64/boot/compressed/vmlinux.lds | 44 ++++ arch/x86_64/boot/compressed/vmlinux.scr | 9 +- arch/x86_64/kernel/head.S | 243 ++++++++++-------- arch/x86_64/kernel/suspend_asm.S | 7 +- include/asm-x86_64/page.h | 6 +- 9 files changed, 602 insertions(+), 337 deletions(-) create mode 100644 arch/x86_64/boot/compressed/vmlinux.lds diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index b3dbf11eb82c..715632026073 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -565,23 +565,56 @@ config CRASH_DUMP PHYSICAL_START. For more details see Documentation/kdump/kdump.txt +config RELOCATABLE + bool "Build a relocatable kernel(EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Builds a relocatable kernel. This enables loading and running + a kernel binary from a different physical address than it has + been compiled for. + + One use is for the kexec on panic case where the recovery kernel + must live at a different physical address than the primary + kernel. + + Note: If CONFIG_RELOCATABLE=y, then kernel run from the address + it has been loaded at and compile time physical address + (CONFIG_PHYSICAL_START) is ignored. + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if CRASH_DUMP default "0x200000" help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x200000 (2MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as + This gives the physical address where the kernel is loaded. It + should be aligned to 2MB boundary. + + If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then + bzImage will decompress itself to above physical address and + run from there. Otherwise, bzImage will run from the address where + it has been loaded by the boot loader and will ignore above physical + address. + + In normal kdump cases one does not have to set/change this option + as now bzImage can be compiled as a completely relocatable image + (CONFIG_RELOCATABLE=y) and be used to load and run from a different + address. This option is mainly useful for the folks who don't want + to use a bzImage for capturing the crash dump and want to use a + vmlinux instead. + + So if you are using bzImage for capturing the crash dump, leave + the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y. + Otherwise if you plan to use vmlinux for capturing the crash dump + change this value to start of the reserved region (Typically 16MB + 0x1000000). In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed kernel. Typically this parameter is set as crashkernel=64M@16M. Please take a look at Documentation/kdump/kdump.txt for more details about crash dumps. + Usage of bzImage for capturing the crash dump is advantageous as + one does not have to build two kernels. Same kernel can be used + as production kernel and capture kernel. + Don't change this unless you know what you are doing. config SECCOMP diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index e70fa6e1da08..705a3e33d7e1 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -8,16 +8,14 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional -AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 -CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -LDFLAGS := -m elf_i386 +CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin +LDFLAGS := -m elf_x86_64 -LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 - -$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE +LDFLAGS_vmlinux := -T +$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T +LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 6f55565e4d42..c353a9266ea4 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -26,116 +26,262 @@ #include #include +#include #include +#include +.section ".text.head" .code32 .globl startup_32 - + startup_32: cld cli - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs + movl $(__KERNEL_DS), %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss - lss stack_start,%esp - xorl %eax,%eax -1: incl %eax # check that A20 really IS enabled - movl %eax,0x000000 # loop forever if it isn't - cmpl %eax,0x100000 - je 1b +/* Calculate the delta between where we were compiled to run + * at and where we were actually loaded at. This can only be done + * with a short local call on x86. Nothing else will tell us what + * address we are running at. The reserved chunk of the real-mode + * data at 0x34-0x3f are used as the stack for this calculation. + * Only 4 bytes are needed. + */ + leal 0x40(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp + +/* Compute the delta between where we were compiled to run at + * and where the code will actually run at. + */ +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. + */ + +#ifdef CONFIG_RELOCATABLE + movl %ebp, %ebx + addl $(LARGE_PAGE_SIZE -1), %ebx + andl $LARGE_PAGE_MASK, %ebx +#else + movl $CONFIG_PHYSICAL_START, %ebx +#endif + + /* Replace the compressed data size with the uncompressed size */ + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx + /* Add 8 bytes for every 32K input block */ + shrl $12, %eax + addl %eax, %ebx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addl $(32768 + 18 + 4095), %ebx + andl $~4095, %ebx /* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. + * Prepare for entering 64 bit mode */ - pushl $0 - popfl + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + leal gdt(%ebp), %eax + movl %eax, gdt+2(%ebp) + lgdt gdt(%ebp) + + /* Enable PAE mode */ + xorl %eax, %eax + orl $(1 << 5), %eax + movl %eax, %cr4 + + /* + * Build early 4G boot pagetable + */ + /* Initialize Page tables to 0*/ + leal pgtable(%ebx), %edi + xorl %eax, %eax + movl $((4096*6)/4), %ecx + rep stosl + + /* Build Level 4 */ + leal pgtable + 0(%ebx), %edi + leal 0x1007 (%edi), %eax + movl %eax, 0(%edi) + + /* Build Level 3 */ + leal pgtable + 0x1000(%ebx), %edi + leal 0x1007(%edi), %eax + movl $4, %ecx +1: movl %eax, 0x00(%edi) + addl $0x00001000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Build Level 2 */ + leal pgtable + 0x2000(%ebx), %edi + movl $0x00000183, %eax + movl $2048, %ecx +1: movl %eax, 0(%edi) + addl $0x00200000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Enable the boot page tables */ + leal pgtable(%ebx), %eax + movl %eax, %cr3 + + /* Enable Long mode in EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Setup for the jump to 64bit mode + * + * When the jump is performend we will be in long mode but + * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 + * (and in turn EFER.LMA = 1). To jump into 64bit mode we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + * We place all of the values on our mini stack so lret can + * used to perform that far jump. + */ + pushl $__KERNEL_CS + leal startup_64(%ebp), %eax + pushl %eax + + /* Enter paged protected Mode, activating Long Mode */ + movl $0x80000001, %eax /* Enable Paging and Protected mode */ + movl %eax, %cr0 + + /* Jump from 32bit compatibility mode into 64bit mode. */ + lret + + /* Be careful here startup_64 needs to be at a predictable + * address so I can export it in an ELF header. Bootloaders + * should look at the ELF header to find this address, as + * it may change in the future. + */ + .code64 + .org 0x100 +ENTRY(startup_64) + /* We come here either from startup_32 or directly from a + * 64bit bootloader. If we come here from a bootloader we depend on + * an identity mapped page table being provied that maps our + * entire text+data+bss and hopefully all of memory. + */ + + /* Setup data segments. */ + xorl %eax, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Compute the decompressed kernel start address. It is where + * we were loaded at aligned to a 2M boundary. %rbp contains the + * decompressed kernel start address. + * + * If it is a relocatable kernel then decompress and run the kernel + * from load address aligned to 2MB addr, otherwise decompress and + * run the kernel from CONFIG_PHYSICAL_START + */ + + /* Start with the delta to where the kernel will run at. */ +#ifdef CONFIG_RELOCATABLE + leaq startup_32(%rip) /* - $startup_32 */, %rbp + addq $(LARGE_PAGE_SIZE - 1), %rbp + andq $LARGE_PAGE_MASK, %rbp + movq %rbp, %rbx +#else + movq $CONFIG_PHYSICAL_START, %rbp + movq %rbp, %rbx +#endif + + /* Replace the compressed data size with the uncompressed size */ + movl input_len(%rip), %eax + subq %rax, %rbx + movl output_len(%rip), %eax + addq %rax, %rbx + /* Add 8 bytes for every 32K input block */ + shrq $12, %rax + addq %rax, %rbx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addq $(32768 + 18 + 4095), %rbx + andq $~4095, %rbx + +/* Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. + */ + leaq _end(%rip), %r8 + leaq _end(%rbx), %r9 + movq $_end /* - $startup_32 */, %rcx +1: subq $8, %r8 + subq $8, %r9 + movq 0(%r8), %rax + movq %rax, 0(%r9) + subq $8, %rcx + jnz 1b + +/* + * Jump to the relocated address. + */ + leaq relocated(%rbx), %rax + jmp *%rax + +.section ".text" +relocated: + /* * Clear BSS */ - xorl %eax,%eax - movl $_edata,%edi - movl $_end,%ecx - subl %edi,%ecx + xorq %rax, %rax + leaq _edata(%rbx), %rdi + leaq _end(%rbx), %rcx + subq %rdi, %rcx cld rep stosb + + /* Setup the stack */ + leaq user_stack_end(%rip), %rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* * Do the decompression, and jump to the new kernel.. */ - subl $16,%esp # place for structure on the stack - movl %esp,%eax - pushl %esi # real mode pointer as second arg - pushl %eax # address of structure as first arg - call decompress_kernel - orl %eax,%eax - jnz 3f - addl $8,%esp - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START + pushq %rsi # Save the real mode argument + movq %rsi, %rdi # real mode address + leaq _heap(%rip), %rsi # _heap + leaq input_data(%rip), %rdx # input_data + movl input_len(%rip), %eax + movq %rax, %rcx # input_len + movq %rbp, %r8 # output + call decompress_kernel + popq %rsi + /* - * We come here, if we were loaded high. - * We need to move the move-in-place routine down to 0x1000 - * and then start it with the buffer addresses in registers, - * which we got from the stack. + * Jump to the decompressed kernel. */ -3: - movl %esi,%ebx - movl $move_routine_start,%esi - movl $0x1000,%edi - movl $move_routine_end,%ecx - subl %esi,%ecx - addl $3,%ecx - shrl $2,%ecx - cld - rep - movsl + jmp *%rbp - popl %esi # discard the address - addl $4,%esp # real mode pointer - popl %esi # low_buffer_start - popl %ecx # lcount - popl %edx # high_buffer_start - popl %eax # hcount - movl $__PHYSICAL_START,%edi - cli # make sure we don't get interrupted - ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine - -/* - * Routine (template) for moving the decompressed kernel in place, - * if we were high loaded. This _must_ PIC-code ! - */ -move_routine_start: - movl %ecx,%ebp - shrl $2,%ecx - rep - movsl - movl %ebp,%ecx - andl $3,%ecx - rep - movsb - movl %edx,%esi - movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 - addl $3,%ecx - shrl $2,%ecx - rep - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START -move_routine_end: - - -/* Stack for uncompression */ - .align 32 -user_stack: + .data +gdt: + .word gdt_end - gdt + .long gdt + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +gdt_end: + .bss +/* Stack for uncompression */ + .balign 4 +user_stack: .fill 4096,4,0 -stack_start: - .long user_stack+4096 - .word __KERNEL_DS - +user_stack_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index 3755b2e394d0..fee54dbf1749 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -9,10 +9,95 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +#define _LINUX_STRING_H_ 1 +#define __LINUX_BITMAP_H 1 + +#include #include #include #include +/* WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically + * at run time, but no relocation processing is performed. + * This means that it is not safe to place pointers in static structures. + */ + +/* + * Getting to provable safe in place decompression is hard. + * Worst case behaviours need to be analized. + * Background information: + * + * The file layout is: + * magic[2] + * method[1] + * flags[1] + * timestamp[4] + * extraflags[1] + * os[1] + * compressed data blocks[N] + * crc[4] orig_len[4] + * + * resulting in 18 bytes of non compressed data overhead. + * + * Files divided into blocks + * 1 bit (last block flag) + * 2 bits (block type) + * + * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. + * The smallest block type encoding is always used. + * + * stored: + * 32 bits length in bytes. + * + * fixed: + * magic fixed tree. + * symbols. + * + * dynamic: + * dynamic tree encoding. + * symbols. + * + * + * The buffer for decompression in place is the length of the + * uncompressed data, plus a small amount extra to keep the algorithm safe. + * The compressed data is placed at the end of the buffer. The output + * pointer is placed at the start of the buffer and the input pointer + * is placed where the compressed data starts. Problems will occur + * when the output pointer overruns the input pointer. + * + * The output pointer can only overrun the input pointer if the input + * pointer is moving faster than the output pointer. A condition only + * triggered by data whose compressed form is larger than the uncompressed + * form. + * + * The worst case at the block level is a growth of the compressed data + * of 5 bytes per 32767 bytes. + * + * The worst case internal to a compressed block is very hard to figure. + * The worst case can at least be boundined by having one bit that represents + * 32764 bytes and then all of the rest of the bytes representing the very + * very last byte. + * + * All of which is enough to compute an amount of extra data that is required + * to be safe. To avoid problems at the block level allocating 5 extra bytes + * per 32767 bytes of data is sufficient. To avoind problems internal to a block + * adding an extra 32767 bytes (the worst case uncompressed block size) is + * sufficient, to ensure that in the worst case the decompressed data for + * block will stop the byte before the compressed data for a block begins. + * To avoid problems with the compressed data's meta information an extra 18 + * bytes are needed. Leading to the formula: + * + * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. + * + * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. + * Adding 32768 instead of 32767 just makes for round numbers. + * Adding the decompressor_size is necessary as it musht live after all + * of the data as well. Last I measured the decompressor is about 14K. + * 10K of actuall data and 4K of bss. + * + */ + /* * gzip declarations */ @@ -28,15 +113,20 @@ typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; -#define WSIZE 0x8000 /* Window size must be at least 32k, */ - /* and a power of two */ +#define WSIZE 0x80000000 /* Window size must be at least 32k, + * and a power of two + * We don't actually have a window just + * a huge output buffer so I report + * a 2G windows size, as that should + * always be larger than our output buffer. + */ -static uch *inbuf; /* input buffer */ -static uch window[WSIZE]; /* Sliding window buffer */ +static uch *inbuf; /* input buffer */ +static uch *window; /* Sliding window buffer, (and final output buffer) */ -static unsigned insize = 0; /* valid bytes in inbuf */ -static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ -static unsigned outcnt = 0; /* bytes in output buffer */ +static unsigned insize; /* valid bytes in inbuf */ +static unsigned inptr; /* index of next byte to be processed in inbuf */ +static unsigned outcnt; /* bytes in output buffer */ /* gzip flag byte */ #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ @@ -87,8 +177,6 @@ extern unsigned char input_data[]; extern int input_len; static long bytes_out = 0; -static uch *output_data; -static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); @@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); -extern int end; -static long free_mem_ptr = (long)&end; +static long free_mem_ptr; static long free_mem_end_ptr; -#define INPLACE_MOVE_ROUTINE 0x1000 -#define LOW_BUFFER_START 0x2000 -#define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 -static unsigned int low_buffer_end, low_buffer_size; -static int high_loaded =0; -static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; +#define HEAP_SIZE 0x6000 static char *vidmem = (char *)0xb8000; static int vidport; @@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n) */ static int fill_inbuf(void) { - if (insize != 0) { - error("ran out of input data"); - } - - inbuf = input_data; - insize = input_len; - inptr = 1; - return inbuf[0]; + error("ran out of input data"); + return 0; } /* =========================================================================== * Write the output window window[0..outcnt-1] and update crc and bytes_out. * (Used for the decompressed data only.) */ -static void flush_window_low(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, *out, ch; - - in = window; - out = &output_data[output_ptr]; - for (n = 0; n < outcnt; n++) { - ch = *out++ = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - output_ptr += (ulg)outcnt; - outcnt = 0; -} - -static void flush_window_high(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - in = window; - for (n = 0; n < outcnt; n++) { - ch = *output_data++ = *in++; - if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - static void flush_window(void) { - if (high_loaded) flush_window_high(); - else flush_window_low(); + /* With my window equal to my output buffer + * I only need to compute the crc here. + */ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, ch; + + in = window; + for (n = 0; n < outcnt; n++) { + ch = *in++; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + outcnt = 0; } static void error(char *x) @@ -281,57 +335,8 @@ static void error(char *x) while(1); /* Halt */ } -static void setup_normal_output_buffer(void) -{ -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); -#endif - output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ - free_mem_end_ptr = (long)real_mode; -} - -struct moveparams { - uch *low_buffer_start; int lcount; - uch *high_buffer_start; int hcount; -}; - -static void setup_output_buffer_if_we_run_high(struct moveparams *mv) -{ - high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); -#endif - mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; - low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX - ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; - low_buffer_size = low_buffer_end - LOW_BUFFER_START; - high_loaded = 1; - free_mem_end_ptr = (long)high_buffer_start; - if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); - mv->hcount = 0; /* say: we need not to move high_buffer */ - } - else mv->hcount = -1; - mv->high_buffer_start = high_buffer_start; -} - -static void close_output_buffer_if_we_run_high(struct moveparams *mv) -{ - if (bytes_out > low_buffer_size) { - mv->lcount = low_buffer_size; - if (mv->hcount) - mv->hcount = bytes_out - low_buffer_size; - } else { - mv->lcount = bytes_out; - mv->hcount = 0; - } -} - -int decompress_kernel(struct moveparams *mv, void *rmode) +asmlinkage void decompress_kernel(void *rmode, unsigned long heap, + uch *input_data, unsigned long input_len, uch *output) { real_mode = rmode; @@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode) lines = RM_SCREEN_INFO.orig_video_lines; cols = RM_SCREEN_INFO.orig_video_cols; - if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); - else setup_output_buffer_if_we_run_high(mv); + window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = heap; /* Heap */ + free_mem_end_ptr = heap + HEAP_SIZE; + inbuf = input_data; /* Input buffer */ + insize = input_len; + inptr = 0; + + if ((ulg)output & 0x1fffffUL) + error("Destination address not 2M aligned"); + if ((ulg)output >= 0xffffffffffUL) + error("Destination address too large"); makecrc(); putstr(".\nDecompressing Linux..."); gunzip(); putstr("done.\nBooting the kernel.\n"); - if (high_loaded) close_output_buffer_if_we_run_high(mv); - return high_loaded; + return; } diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds new file mode 100644 index 000000000000..94c13e557fb4 --- /dev/null +++ b/arch/x86_64/boot/compressed/vmlinux.lds @@ -0,0 +1,44 @@ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(startup_64) +SECTIONS +{ + /* Be careful parts of head.S assume startup_32 is at + * address 0. + */ + . = 0; + .text : { + _head = . ; + *(.text.head) + _ehead = . ; + *(.text.compressed) + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); + _end = . ; + . = ALIGN(4096); + pgtable = . ; + . = . + 4096 * 6; + _heap = .; + } +} diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr index 1ed9d791f863..bd1429ce193e 100644 --- a/arch/x86_64/boot/compressed/vmlinux.scr +++ b/arch/x86_64/boot/compressed/vmlinux.scr @@ -1,9 +1,10 @@ SECTIONS { - .data : { + .text.compressed : { input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - input_data_end = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + output_len = . - 4; + input_data_end = .; } } diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c211e52f1333..36aa98a6d15c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,6 +5,7 @@ * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman */ @@ -17,95 +18,127 @@ #include #include #include - + /* we are not able to switch in one step to the final KERNEL ADRESS SPACE - * because we need identity-mapped pages on setup so define __START_KERNEL to - * 0x100000 for this stage - * + * because we need identity-mapped pages. + * */ .text .section .bootstrap.text - .code32 - .globl startup_32 -/* %bx: 1 if coming from smp trampoline on secondary cpu */ -startup_32: - - /* - * At this point the CPU runs in 32bit protected mode (CS.D = 1) with - * paging disabled and the point of this file is to switch to 64bit - * long mode with a kernel mapping for kerneland to jump into the - * kernel virtual addresses. - * There is no stack until we set one up. - */ - - /* Initialize the %ds segment register */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - - /* Load new GDT with the 64bit segments using 32bit descriptor */ - lgdt pGDT32 - __START_KERNEL_map - - /* If the CPU doesn't support CPUID this will double fault. - * Unfortunately it is hard to check for CPUID without a stack. - */ - - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe no_long_mode - /* Check if long mode is implemented */ - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc no_long_mode - - /* - * Prepare for entering 64bits mode - */ - - /* Enable PAE mode */ - xorl %eax, %eax - btsl $5, %eax - movl %eax, %cr4 - - /* Setup early boot stage 4 level pagetables */ - movl $(init_level4_pgt - __START_KERNEL_map), %eax - movl %eax, %cr3 - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - - /* Enable Long Mode */ - btsl $_EFER_LME, %eax - - /* Make changes effective */ - wrmsr - - xorl %eax, %eax - btsl $31, %eax /* Enable paging and in turn activate Long Mode */ - btsl $0, %eax /* Enable protected mode */ - /* Make changes effective */ - movl %eax, %cr0 - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - */ - ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) - .code64 - .org 0x100 .globl startup_64 startup_64: + + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86_64/boot/compressed/head.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. + */ + + /* Compute the delta between the address I am compiled to run at and the + * address I am actually running at. + */ + leaq _text(%rip), %rbp + subq $_text - __START_KERNEL_map, %rbp + + /* Is the address not 2M aligned? */ + movq %rbp, %rax + andl $~LARGE_PAGE_MASK, %eax + testl %eax, %eax + jnz bad_address + + /* Is the address too large? */ + leaq _text(%rip), %rdx + movq $PGDIR_SIZE, %rax + cmpq %rax, %rdx + jae bad_address + + /* Fixup the physical addresses in the page table + */ + addq %rbp, init_level4_pgt + 0(%rip) + addq %rbp, init_level4_pgt + (258*8)(%rip) + addq %rbp, init_level4_pgt + (511*8)(%rip) + + addq %rbp, level3_ident_pgt + 0(%rip) + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + + /* Add an Identity mapping if I am above 1G */ + leaq _text(%rip), %rdi + andq $LARGE_PAGE_MASK, %rdi + + movq %rdi, %rax + shrq $PUD_SHIFT, %rax + andq $(PTRS_PER_PUD - 1), %rax + jz ident_complete + + leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx + leaq level3_ident_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) + + movq %rdi, %rax + shrq $PMD_SHIFT, %rax + andq $(PTRS_PER_PMD - 1), %rax + leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq level2_spare_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) +ident_complete: + + /* Fixup the kernel text+data virtual addresses + */ + leaq level2_kernel_pgt(%rip), %rdi + leaq 4096(%rdi), %r8 + /* See if it is a valid page table entry */ +1: testq $1, 0(%rdi) + jz 2f + addq %rbp, 0(%rdi) + /* Go to the next page */ +2: addq $8, %rdi + cmp %r8, %rdi + jne 1b + + /* Fixup phys_base */ + addq %rbp, phys_base(%rip) + +#ifdef CONFIG_SMP + addq %rbp, trampoline_level4_pgt + 0(%rip) + addq %rbp, trampoline_level4_pgt + (511*8)(%rip) +#endif +#ifdef CONFIG_ACPI_SLEEP + addq %rbp, wakeup_level4_pgt + 0(%rip) + addq %rbp, wakeup_level4_pgt + (511*8)(%rip) +#endif + + /* Due to ENTRY(), sometimes the empty space gets filled with + * zeros. Better take a jmp than relying on empty space being + * filled with 0x90 (nop) + */ + jmp secondary_startup_64 ENTRY(secondary_startup_64) - /* We come here either from startup_32 - * or directly from a 64bit bootloader. - * Since we may have come directly from a bootloader we - * reload the page tables here. + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded a mapped page table. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). + * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. */ /* Enable PAE mode and PGE */ @@ -116,8 +149,14 @@ ENTRY(secondary_startup_64) /* Setup early boot stage 4 level pagetables. */ movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax movq %rax, %cr3 + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + jmp *%rax +1: + /* Check if nx is implemented */ movl $0x80000001, %eax cpuid @@ -126,17 +165,11 @@ ENTRY(secondary_startup_64) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr - - /* Enable System Call */ - btsl $_EFER_SCE, %eax - - /* No Execute supported? */ - btl $20,%edi + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax -1: - /* Make changes effective */ - wrmsr +1: wrmsr /* Make changes effective */ /* Setup cr0 */ #define CR0_PM 1 /* protected mode */ @@ -163,7 +196,7 @@ ENTRY(secondary_startup_64) * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr + lgdt cpu_gdt_descr(%rip) /* set up data segments. actually 0 would do too */ movl $__KERNEL_DS,%eax @@ -214,6 +247,9 @@ initial_code: init_rsp: .quad init_thread_union+THREAD_SIZE-8 +bad_address: + jmp bad_address + ENTRY(early_idt_handler) cmpl $2,early_recursion_flag(%rip) jz 1f @@ -242,23 +278,7 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" -.code32 -ENTRY(no_long_mode) - /* This isn't an x86-64 CPU so hang */ -1: - jmp 1b - -.org 0xf00 - .globl pGDT32 -pGDT32: - .word gdt_end-cpu_gdt_table-1 - .long cpu_gdt_table-__START_KERNEL_map - -.org 0xf10 -ljumpvector: - .long startup_64-__START_KERNEL_map - .word __KERNEL_CS - +.balign PAGE_SIZE ENTRY(stext) ENTRY(_stext) @@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) - + NEXT_PAGE(level2_kernel_pgt) /* 40MB kernel mapping. The kernel code cannot be bigger than that. When you change this change KERNEL_TEXT_SIZE in page.h too. */ @@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt) /* Module mapping starts here */ .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 +NEXT_PAGE(level2_spare_pgt) + .fill 512,8,0 + #undef PMDS #undef NEXT_PAGE @@ -330,6 +353,10 @@ gdt: .endr #endif +ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ + .quad 0x0000000000000000 + /* We need valid kernel segments for data and code in long mode too * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index bfbe00763c68..16d183f67bc1 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -71,9 +71,10 @@ loop: jmp loop done: /* go back to the original page tables */ - leaq init_level4_pgt(%rip), %rax - subq $__START_KERNEL_map, %rax - movq %rax, %cr3 + movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax + movq %rax, %cr3 + /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 4974433bbf34..40a24d0df090 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -61,6 +61,8 @@ typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; +extern unsigned long phys_base; + #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) #define pud_val(x) ((x).pud) @@ -101,14 +103,14 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define PAGE_OFFSET __PAGE_OFFSET /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. - Otherwise you risk miscompilation. */ + Otherwise you risk miscompilation. */ #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - (v - __START_KERNEL_map); }) + ((v - __START_KERNEL_map) + phys_base); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #ifdef CONFIG_FLATMEM From 6a50a664ca0cfd2a487525f10cec3ff4d570b5e8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 056/231] [PATCH] x86-64: build-time checking o X86_64 kernel should run from 2MB aligned address for two reasons. - Performance. - For relocatable kernels, page tables are updated based on difference between compile time address and load time physical address. This difference should be multiple of 2MB as kernel text and data is mapped using 2MB pages and PMD should be pointing to a 2MB aligned address. Life is simpler if both compile time and load time kernel addresses are 2MB aligned. o Flag the error at compile time if one is trying to build a kernel which does not meet alignment restrictions. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: "Eric W. Biederman" Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/boot/compressed/misc.c | 2 +- arch/x86_64/kernel/head64.c | 7 +++++++ include/asm-x86_64/page.h | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index fee54dbf1749..fed1167159c3 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -358,7 +358,7 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long heap, insize = input_len; inptr = 0; - if ((ulg)output & 0x1fffffUL) + if ((ulg)output & (__KERNEL_ALIGN - 1)) error("Destination address not 2M aligned"); if ((ulg)output >= 0xffffffffffUL) error("Destination address too large"); diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 6c34bdd22e26..213d90e04755 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -62,6 +62,13 @@ void __init x86_64_start_kernel(char * real_mode_data) { int i; + /* + * Make sure kernel is aligned to 2MB address. Catching it at compile + * time is better. Change your config file and compile the kernel + * for a 2MB aligned address (CONFIG_PHYSICAL_START) + */ + BUILD_BUG_ON(CONFIG_PHYSICAL_START & (__KERNEL_ALIGN - 1)); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 40a24d0df090..b17fc16ec2eb 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -78,6 +78,7 @@ extern unsigned long phys_base; #endif /* !__ASSEMBLY__ */ #define __PHYSICAL_START CONFIG_PHYSICAL_START +#define __KERNEL_ALIGN 0x200000 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map 0xffffffff80000000 #define __PAGE_OFFSET 0xffff810000000000 From 8035d3ea78c2a61a9738c7857742370e0aa74d5c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 057/231] [PATCH] x86-64: Extend bzImage protocol for relocatable bzImage o Extend the bzImage protocol (same as i386) to allow bzImage loaders to load the protected mode kernel at non-1MB address. Now protected mode component is relocatable and can be loaded at non-1MB addresses. o As of today kdump uses it to run a second kernel from a reserved memory area. Signed-off-by: Vivek Goyal Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/boot/setup.S | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 770940cc0108..deb3573c7aec 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -80,7 +80,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0204 # header version number (>= 0x0105) + .word 0x0205 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG @@ -155,7 +155,16 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # low memory 0x10000 or higher. ramdisk_max: .long 0xffffffff - +kernel_alignment: .long 0x200000 # physical addr alignment required for + # protected mode relocatable kernel +#ifdef CONFIG_RELOCATABLE +relocatable_kernel: .byte 1 +#else +relocatable_kernel: .byte 0 +#endif +pad2: .byte 0 +pad3: .word 0 + trampoline: call start_of_setup .align 16 # The offset at this point is 0x240 From a4831e08b7f3ed51623c9eb25e8c945b76b3eda3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 058/231] [PATCH] x86-64: Move cpu verification code to common file o This patch moves the code to verify long mode and SSE to a common file. This code is now shared by trampoline.S, wakeup.S, boot/setup.S and boot/compressed/head.S o So far we used to do very limited check in trampoline.S, wakeup.S and in 32bit entry point. Now all the entry paths are forced to do the exhaustive check, including SSE because verify_cpu is shared. o I am keeping this patch as last in the x86 relocatable series because previous patches have got quite some amount of testing done and don't want to distrub that. So that if there is problem introduced by this patch, at least it can be easily isolated. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/boot/compressed/head.S | 19 ++++- arch/x86_64/boot/setup.S | 63 ++--------------- arch/x86_64/kernel/acpi/wakeup.S | 30 ++++---- arch/x86_64/kernel/trampoline.S | 51 +------------ arch/x86_64/kernel/verify_cpu.S | 110 +++++++++++++++++++++++++++++ 5 files changed, 151 insertions(+), 122 deletions(-) create mode 100644 arch/x86_64/kernel/verify_cpu.S diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index c353a9266ea4..f9d5692a0106 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -54,6 +54,15 @@ startup_32: 1: popl %ebp subl $1b, %ebp +/* setup a stack and make sure cpu supports long mode. */ + movl $user_stack_end, %eax + addl %ebp, %eax + movl %eax, %esp + + call verify_cpu + testl %eax, %eax + jnz no_longmode + /* Compute the delta between where we were compiled to run at * and where the code will actually run at. */ @@ -159,13 +168,21 @@ startup_32: /* Jump from 32bit compatibility mode into 64bit mode. */ lret +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + /* Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders * should look at the ELF header to find this address, as * it may change in the future. */ .code64 - .org 0x100 + .org 0x200 ENTRY(startup_64) /* We come here either from startup_32 or directly from a * 64bit bootloader. If we come here from a bootloader we depend on diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index deb3573c7aec..816d04faa2be 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -299,64 +299,10 @@ loader_ok: movw %cs,%ax movw %ax,%ds - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define SSE_MASK ((1<<25)|(1<<26)) -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)) -#define REQUIRED_MASK2 (1<<29) + call verify_cpu + testl %eax,%eax + jz sse_ok - pushfl /* standard way to check for cpuid */ - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - jz no_longmode /* cpu has no cpuid */ - movl $0x0,%eax - cpuid - cmpl $0x1,%eax - jb no_longmode /* no cpuid 1 */ - xor %di,%di - cmpl $0x68747541,%ebx /* AuthenticAMD */ - jnz noamd - cmpl $0x69746e65,%edx - jnz noamd - cmpl $0x444d4163,%ecx - jnz noamd - mov $1,%di /* cpu is from AMD */ -noamd: - movl $0x1,%eax - cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx - jnz no_longmode - movl $0x80000000,%eax - cpuid - cmpl $0x80000001,%eax - jb no_longmode /* no extended cpuid */ - movl $0x80000001,%eax - cpuid - andl $REQUIRED_MASK2,%edx - xorl $REQUIRED_MASK2,%edx - jnz no_longmode -sse_test: - movl $1,%eax - cpuid - andl $SSE_MASK,%edx - cmpl $SSE_MASK,%edx - je sse_ok - test %di,%di - jz no_longmode /* only try to force SSE on AMD */ - movl $0xc0010015,%ecx /* HWCR */ - rdmsr - btr $15,%eax /* enable SSE */ - wrmsr - xor %di,%di /* don't loop */ - jmp sse_test /* try again */ no_longmode: call beep lea long_mode_panic,%si @@ -366,7 +312,8 @@ no_longmode_loop: long_mode_panic: .string "Your CPU does not support long mode. Use a 32bit distribution." .byte 0 - + +#include "../kernel/verify_cpu.S" sse_ok: popw %ds diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 766cfbcac1db..8550a6ffa275 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -43,6 +43,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic + call verify_cpu # Verify the cpu supports long + # mode + testl %eax, %eax + jnz no_longmode + testl $1, video_flags - wakeup_code jz 1f lcall $0xc000,$3 @@ -92,18 +97,6 @@ wakeup_32: # Running in this code, but at low address; paging is not yet turned on. movb $0xa5, %al ; outb %al, $0x80 - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe bogus_cpu - wbinvd - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc bogus_cpu - movl %edx,%edi - movl $__KERNEL_DS, %eax movl %eax, %ds @@ -123,6 +116,11 @@ wakeup_32: leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax movl %eax, %cr3 + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + /* Enable Long Mode */ xorl %eax, %eax btsl $_EFER_LME, %eax @@ -244,10 +242,12 @@ bogus_64_magic: movb $0xb3,%al ; outb %al,$0x80 jmp bogus_64_magic -bogus_cpu: - movb $0xbc,%al ; outb %al,$0x80 - jmp bogus_cpu +.code16 +no_longmode: + movb $0xbc,%al ; outb %al,$0x80 + jmp no_longmode +#include "../verify_cpu.S" /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index 13eee63c7bb5..e7e2764c461b 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -54,6 +54,8 @@ r_base = . movw $(trampoline_stack_end - r_base), %sp call verify_cpu # Verify the cpu supports long mode + testl %eax, %eax # Check for return code + jnz no_longmode mov %cs, %ax movzx %ax, %esi # Find the 32bit trampoline location @@ -121,57 +123,10 @@ startup_64: jmp *%rax .code16 -verify_cpu: - pushl $0 # Kill any dangerous flags - popfl - - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)|(1<<25)|(1<<26)) -#define REQUIRED_MASK2 (1<<29) - - pushfl # check for cpuid - popl %eax - movl %eax, %ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - pushl %ebx - popfl - cmpl %eax, %ebx - jz no_longmode - - xorl %eax, %eax # See if cpuid 1 is implemented - cpuid - cmpl $0x1, %eax - jb no_longmode - - movl $0x01, %eax # Does the cpu have what it takes? - cpuid - andl $REQUIRED_MASK1, %edx - xorl $REQUIRED_MASK1, %edx - jnz no_longmode - - movl $0x80000000, %eax # See if extended cpuid is implemented - cpuid - cmpl $0x80000001, %eax - jb no_longmode - - movl $0x80000001, %eax # Does the cpu have what it takes? - cpuid - andl $REQUIRED_MASK2, %edx - xorl $REQUIRED_MASK2, %edx - jnz no_longmode - - ret # The cpu supports long mode - no_longmode: hlt jmp no_longmode - +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; tidt: diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S new file mode 100644 index 000000000000..72edabd2ef9a --- /dev/null +++ b/arch/x86_64/kernel/verify_cpu.S @@ -0,0 +1,110 @@ +/* + * + * verify_cpu.S - Code for cpu long mode and SSE verification. This + * code has been borrowed from boot/setup.S and was introduced by + * Andi Kleen. + * + * Copyright (c) 2007 Andi Kleen (ak@suse.de) + * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) + * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + * This is a common code for verification whether CPU supports + * long mode and SSE or not. It is not called directly instead this + * file is included at various places and compiled in that context. + * Following are the current usage. + * + * This file is included by both 16bit and 32bit code. + * + * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) + * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) + * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) + * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) + * + * verify_cpu, returns the status of cpu check in register %eax. + * 0: Success 1: Failure + * + * The caller needs to check for the error code and take the action + * appropriately. Either display a message or halt. + */ + +verify_cpu: + + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + + /* minimum CPUID flags for x86-64 */ + /* see http://www.x86-64.org/lists/discuss/msg02971.html */ +#define SSE_MASK ((1<<25)|(1<<26)) +#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ + (1<<13)|(1<<15)|(1<<24)) +#define REQUIRED_MASK2 (1<<29) + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz verify_cpu_no_longmode # cpu has no cpuid + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb verify_cpu_no_longmode # no cpuid 1 + + xor %di,%di + cmpl $0x68747541,%ebx # AuthenticAMD + jnz verify_cpu_noamd + cmpl $0x69746e65,%edx + jnz verify_cpu_noamd + cmpl $0x444d4163,%ecx + jnz verify_cpu_noamd + mov $1,%di # cpu is from AMD + +verify_cpu_noamd: + movl $0x1,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz verify_cpu_no_longmode + + movl $0x80000000,%eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001,%eax + jb verify_cpu_no_longmode # no extended cpuid + + movl $0x80000001,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK2,%edx + xorl $REQUIRED_MASK2,%edx + jnz verify_cpu_no_longmode + +verify_cpu_sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je verify_cpu_sse_ok + test %di,%di + jz verify_cpu_no_longmode # only try to force SSE on AMD + movl $0xc0010015,%ecx # HWCR + rdmsr + btr $15,%eax # enable SSE + wrmsr + xor %di,%di # don't loop + jmp verify_cpu_sse_test # try again + +verify_cpu_no_longmode: + popfl # Restore caller passed flags + movl $1,%eax + ret +verify_cpu_sse_ok: + popfl # Restore caller passed flags + xorl %eax, %eax + ret From 184c44d2049c4db7ef6ec65794546954da2c6a0e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 059/231] [PATCH] x86-64: fix x86_64-mm-sched-clock-share Fix for the following patch. Provide dummy cpufreq functions when CPUFREQ is not compiled in. Cc: Andi Kleen Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- include/linux/cpufreq.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 0899e2cdcdd1..cb9b2ec8849c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -32,7 +32,15 @@ * CPUFREQ NOTIFIER INTERFACE * *********************************************************************/ +#ifdef CONFIG_CPU_FREQ int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list); +#else +static inline int cpufreq_register_notifier(struct notifier_block *nb, + unsigned int list) +{ + return 0; +} +#endif int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list); #define CPUFREQ_TRANSITION_NOTIFIER (0) @@ -261,17 +269,22 @@ int cpufreq_set_policy(struct cpufreq_policy *policy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); -/* query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it */ -unsigned int cpufreq_get(unsigned int cpu); -/* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */ +/* + * query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it + */ #ifdef CONFIG_CPU_FREQ unsigned int cpufreq_quick_get(unsigned int cpu); +unsigned int cpufreq_get(unsigned int cpu); #else static inline unsigned int cpufreq_quick_get(unsigned int cpu) { return 0; } +static inline unsigned int cpufreq_get(unsigned int cpu) +{ + return 0; +} #endif From 1652fcbf37abdbbebaf386b46b20e486769e7b45 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 060/231] [PATCH] x86-64: Don't disable basic block reordering When compiling with -Os (which is default) the compiler defaults to it anyways. And with -O2 it probably generates somewhat better (although also larger) code. Signed-off-by: Andi Kleen --- arch/x86_64/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 2941a915d4ef..803cfcc9b346 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -41,9 +41,6 @@ cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections -# this makes reading assembly source easier, but produces worse code -# actually it makes the kernel smaller too. -cflags-y += -fno-reorder-blocks cflags-y += -Wno-sign-compare cflags-y += -fno-asynchronous-unwind-tables ifneq ($(CONFIG_DEBUG_INFO),y) From fb60b8392ce8ab185a7a0b4f7cefbe18b2afdab5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 061/231] [PATCH] x86-64: Allow sys_uselib unconditionally Previously it wasn't enabled in the binfmt_aout is a module case. Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32entry.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 796df6992f62..c48087db6f75 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -481,11 +481,7 @@ ia32_sys_call_table: .quad sys_symlink .quad sys_lstat .quad sys_readlink /* 85 */ -#ifdef CONFIG_IA32_AOUT .quad sys_uselib -#else - .quad quiet_ni_syscall -#endif .quad sys_swapon .quad sys_reboot .quad compat_sys_old_readdir From d039c688c6b3e7217381d2804bc0ca3171913fec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 062/231] [PATCH] x86-64: Minor white space cleanup in traps.c Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index cceef5bd7302..d76fc32d4599 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -426,8 +426,7 @@ void show_registers(struct pt_regs *regs) const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - rsp = regs->rsp; - + rsp = regs->rsp; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -438,7 +437,6 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - printk("Stack: "); _show_stack(NULL, regs, (unsigned long*)rsp); From e073ae1b34d5600ffc550407625dcb2d4cf46c6e Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 063/231] [PATCH] x86-64: Set HASHDIST_DEFAULT to 1 for x86_64 NUMA Enable system hashtable memory to be distributed among nodes on x86_64 NUMA Forcing the kernel to use node interleaved vmalloc instead of bootmem for the system hashtable memory (alloc_large_system_hash) reduces the memory imbalance on node 0 by around 40MB on a 8 node x86_64 NUMA box: Before the following patch, on bootup of a 8 node box: Node 0 MemTotal: 3407488 kB Node 0 MemFree: 3206296 kB Node 0 MemUsed: 201192 kB Node 0 Active: 7012 kB Node 0 Inactive: 512 kB Node 0 Dirty: 0 kB Node 0 Writeback: 0 kB Node 0 FilePages: 1912 kB Node 0 Mapped: 420 kB Node 0 AnonPages: 5612 kB Node 0 PageTables: 468 kB Node 0 NFS_Unstable: 0 kB Node 0 Bounce: 0 kB Node 0 Slab: 5408 kB Node 0 SReclaimable: 644 kB Node 0 SUnreclaim: 4764 kB After the patch (or using hashdist=1 on the kernel command line): Node 0 MemTotal: 3407488 kB Node 0 MemFree: 3247608 kB Node 0 MemUsed: 159880 kB Node 0 Active: 3012 kB Node 0 Inactive: 616 kB Node 0 Dirty: 0 kB Node 0 Writeback: 0 kB Node 0 FilePages: 2424 kB Node 0 Mapped: 380 kB Node 0 AnonPages: 1200 kB Node 0 PageTables: 396 kB Node 0 NFS_Unstable: 0 kB Node 0 Bounce: 0 kB Node 0 Slab: 6304 kB Node 0 SReclaimable: 1596 kB Node 0 SUnreclaim: 4708 kB I guess it is a good idea to keep HASHDIST_DEFAULT "on" for x86_64 NUMA since x86_64 has no dearth of vmalloc space? Or maybe enable hash distribution for all 64bit NUMA arches? The following patch does it only for x86_64. I ran a HPC MPI benchmark -- 'Ansys wingsolid', which takes up quite a bit of memory and uses up tlb entries. This was on a 4 way, 2 socket Tyan AMD box (non vsmp), with 8G total memory (4G pernode). The results with and without hash distribution are: 1. Vanilla - runtime of 1188.000s 2. With hashdist=1 runtime of 1154.000s Oprofile output for the duration of run is: 1. Vanilla: PU: AMD64 processors, speed 2411.16 MHz (estimated) Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit mask of 0x00 (No unit mask) count 500 samples % app name symbol name 163054 6.5513 libansys1.so MultiFront::decompose(int, int, Elemset *, int *, int, int, int) 162061 6.5114 libansys3.so blockSaxpy6L_fd 162042 6.5107 libansys3.so blockInnerProduct6L_fd 156286 6.2794 libansys3.so maxb33_ 87879 3.5309 libansys1.so elmatrixmultpcg_ 84857 3.4095 libansys4.so saxpy_pcg 58637 2.3560 libansys4.so .st4560 46612 1.8728 libansys4.so .st4282 43043 1.7294 vmlinux-t copy_user_generic_string 41326 1.6604 libansys3.so blockSaxpyBackSolve6L_fd 41288 1.6589 libansys3.so blockInnerProductBackSolve6L_fd 2. With hashdist=1 CPU: AMD64 processors, speed 2411.13 MHz (estimated) Counted L1_AND_L2_DTLB_MISSES events (L1 and L2 DTLB misses) with a unit mask of 0x00 (No unit mask) count 500 samples % app name symbol name 162993 6.9814 libansys1.so MultiFront::decompose(int, int, Elemset *, int *, int, int, int) 160799 6.8874 libansys3.so blockInnerProduct6L_fd 160459 6.8729 libansys3.so blockSaxpy6L_fd 156018 6.6826 libansys3.so maxb33_ 84700 3.6279 libansys4.so saxpy_pcg 83434 3.5737 libansys1.so elmatrixmultpcg_ 58074 2.4875 libansys4.so .st4560 46000 1.9703 libansys4.so .st4282 41166 1.7632 libansys3.so blockSaxpyBackSolve6L_fd 41033 1.7575 libansys3.so blockInnerProductBackSolve6L_fd 35762 1.5318 libansys1.so inner_product_sub 35591 1.5245 libansys1.so inner_product_sub2 28259 1.2104 libansys4.so addVectors Signed-off-by: Pravin B. Shelar Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Acked-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton --- include/linux/bootmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 81c07cd18643..0365ec9fc0c9 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -122,9 +122,9 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ /* Only NUMA needs hash distribution. - * IA64 is known to have sufficient vmalloc space. + * IA64 and x86_64 have sufficient vmalloc space. */ -#if defined(CONFIG_NUMA) && defined(CONFIG_IA64) +#if defined(CONFIG_NUMA) && (defined(CONFIG_IA64) || defined(CONFIG_X86_64)) #define HASHDIST_DEFAULT 1 #else #define HASHDIST_DEFAULT 0 From 1833d6bc72893265f22addd79cf52e6987496e0f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 064/231] [PATCH] i386: modpost apic related warning fixes o Modpost generates warnings for i386 if compiled with CONFIG_RELOCATABLE=y WARNING: vmlinux - Section mismatch: reference to .init.text:find_unisys_acpi_oem_table from .text between 'acpi_madt_oem_check' (at offset 0xc0101eda) and 'enable_apic_mode' WARNING: vmlinux - Section mismatch: reference to .init.text:acpi_get_table_header_early from .text between 'acpi_madt_oem_check' (at offset 0xc0101ef0) and 'enable_apic_mode' WARNING: vmlinux - Section mismatch: reference to .init.text:parse_unisys_oem from .text between 'acpi_madt_oem_check' (at offset 0xc0101f2e) and 'enable_apic_mode' WARNING: vmlinux - Section mismatch: reference to .init.text:setup_unisys from .text between 'acpi_madt_oem_check' (at offset 0xc0101f37) and 'enable_apic_mode'WARNING: vmlinux - Section mismatch: reference to .init.text:parse_unisys_oem from .text between 'mps_oem_check' (at offset 0xc0101ec7) and 'acpi_madt_oem_check' WARNING: vmlinux - Section mismatch: reference to .init.text:es7000_sw_apic from .text between 'enable_apic_mode' (at offset 0xc0101f48) and 'check_apicid_present' o Some functions which are inline (acpi_madt_oem_check) are not inlined by compiler as these functions are accessed using function pointer. These functions are put in .text section and they in-turn access __init type functions hence modpost generates warnings. o Do not iniline acpi_madt_oem_check, instead make it __init. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Len Brown Signed-off-by: Andrew Morton --- arch/i386/mach-generic/es7000.c | 41 +++++++++++++++++++++ include/asm-i386/mach-es7000/mach_apic.h | 7 ---- include/asm-i386/mach-es7000/mach_mpparse.h | 32 ---------------- scripts/mod/modpost.c | 1 + 4 files changed, 42 insertions(+), 39 deletions(-) diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c index b8963a5a3b25..b47f951c0ec2 100644 --- a/arch/i386/mach-generic/es7000.c +++ b/arch/i386/mach-generic/es7000.c @@ -25,4 +25,45 @@ static int probe_es7000(void) return 0; } +extern void es7000_sw_apic(void); +static void __init enable_apic_mode(void) +{ + es7000_sw_apic(); + return; +} + +static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (mpc->mpc_oemptr) { + struct mp_config_oemtable *oem_table = + (struct mp_config_oemtable *)mpc->mpc_oemptr; + if (!strncmp(oem, "UNISYS", 6)) + return parse_unisys_oem((char *)oem_table); + } + return 0; +} + +#ifdef CONFIG_ACPI +/* Hook from generic ACPI tables.c */ +static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + unsigned long oem_addr; + if (!find_unisys_acpi_oem_table(&oem_addr)) { + if (es7000_check_dsdt()) + return parse_unisys_oem((char *)oem_addr); + else { + setup_unisys(); + return 1; + } + } + return 0; +} +#else +static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 0; +} +#endif + struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h index 8e8b3949173a..2d978928a395 100644 --- a/include/asm-i386/mach-es7000/mach_apic.h +++ b/include/asm-i386/mach-es7000/mach_apic.h @@ -73,13 +73,6 @@ static inline void init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -extern void es7000_sw_apic(void); -static inline void enable_apic_mode(void) -{ - es7000_sw_apic(); - return; -} - extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { diff --git a/include/asm-i386/mach-es7000/mach_mpparse.h b/include/asm-i386/mach-es7000/mach_mpparse.h index 24990e546da3..b9fb784e1fd5 100644 --- a/include/asm-i386/mach-es7000/mach_mpparse.h +++ b/include/asm-i386/mach-es7000/mach_mpparse.h @@ -18,18 +18,6 @@ extern int parse_unisys_oem (char *oemptr); extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); extern void setup_unisys(void); -static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (mpc->mpc_oemptr) { - struct mp_config_oemtable *oem_table = - (struct mp_config_oemtable *)mpc->mpc_oemptr; - if (!strncmp(oem, "UNISYS", 6)) - return parse_unisys_oem((char *)oem_table); - } - return 0; -} - #ifdef CONFIG_ACPI static inline int es7000_check_dsdt(void) @@ -41,26 +29,6 @@ static inline int es7000_check_dsdt(void) return 1; return 0; } - -/* Hook from generic ACPI tables.c */ -static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - unsigned long oem_addr; - if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (es7000_check_dsdt()) - return parse_unisys_oem((char *)oem_addr); - else { - setup_unisys(); - return 1; - } - } - return 0; -} -#else -static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 0; -} #endif #endif /* __ASM_MACH_MPPARSE_H */ diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 65bdfdb56877..78d659cbb36a 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -606,6 +606,7 @@ static int secref_whitelist(const char *modname, const char *tosec, "_probe", "_probe_one", "_console", + "apic_es7000", NULL }; From 30a1528d3bf444eac7f2886ba284da22114b2f7c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 065/231] [PATCH] i386: make struct vmi_ops static Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/vmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 697a70e8c0c9..440422482c6d 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -56,7 +56,7 @@ static int disable_noidle; static int disable_vmi_timer; /* Cached VMI operations */ -struct { +static struct { void (*cpuid)(void /* non-c */); void (*_set_ldt)(u32 selector); void (*set_tr)(u32 selector); From e48b30c189559e20e1f616faccae487972486320 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 066/231] [PATCH] i386: type cast clean up for find_next_zero_bit clean up unneeded type cast by properly declare data type. Signed-off-by: Ken Chen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/lib/bitops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/lib/bitops.c b/arch/i386/lib/bitops.c index 97db3853dc82..afd0045595d4 100644 --- a/arch/i386/lib/bitops.c +++ b/arch/i386/lib/bitops.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL(find_next_bit); */ int find_next_zero_bit(const unsigned long *addr, int size, int offset) { - unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + const unsigned long *p = addr + (offset >> 5); int set = 0, bit = offset & 31, res; if (bit) { @@ -64,7 +64,7 @@ int find_next_zero_bit(const unsigned long *addr, int size, int offset) /* * No zero yet, search remaining full bytes for a zero */ - res = find_first_zero_bit (p, size - 32 * (p - (unsigned long *) addr)); + res = find_first_zero_bit(p, size - 32 * (p - addr)); return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); From 2714221985ce6388ec2fa78d7d52e2a5bef78eec Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 067/231] [PATCH] i386: workaround for a -Wmissing-prototypes warning Work around a warning with -Wmissing-prototypes in arch/i386/kernel/asm-offsets.c The warning isn't gcc's fault - asm-offsets.c is simply a special file. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index c37535163bfc..d822792d8e3e 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -25,6 +25,9 @@ #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)); +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + void foo(void) { OFFSET(SIGCONTEXT_eax, sigcontext, eax); From 5a90cf205c922707ffed2d8f87cefd942e96b0ba Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 068/231] [PATCH] x86: Log reason why TSC was marked unstable Change mark_tsc_unstable() so it takes a string argument, which holds the reason the TSC was marked unstable. This is then displayed the first time mark_tsc_unstable is called. This should help us better debug why the TSC was marked unstable on certain systems and allow us to make sure we're not being overly paranoid when throwing out this troublesome clocksource. Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 2 +- arch/i386/kernel/tsc.c | 5 +++-- arch/x86_64/kernel/time.c | 2 +- arch/x86_64/kernel/tsc.c | 5 +++-- arch/x86_64/kernel/tsc_sync.c | 2 +- drivers/acpi/processor_idle.c | 4 ++-- include/asm-i386/mach-summit/mach_mpparse.h | 4 ++-- include/asm-i386/tsc.h | 2 +- include/asm-x86_64/timex.h | 2 +- 9 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index f0badfdd4e45..e77f8e1cf7aa 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if (vendor == PCI_VENDOR_ID_CYRIX && (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) - mark_tsc_unstable(); + mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 6cb8f5336732..755209dc93e1 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -233,7 +233,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) * TSC based sched_clock turns * to junk w/ cpufreq */ - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } } } @@ -281,11 +281,12 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; + printk("Marking TSC unstable due to: %s.\n", reason); /* Can be called before registration */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 5f862e216a42..91c9066a380d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -397,7 +397,7 @@ void __init time_init(void) cpu_khz = tsc_calibrate_cpu_khz(); if (unsynchronized_tsc()) - mark_tsc_unstable(); + mark_tsc_unstable("TSCs unsynchronized"); if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 5c84992c676d..48f9a8e6aa91 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -111,7 +111,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } set_cyc2ns_scale(tsc_khz_ref); @@ -199,10 +199,11 @@ static struct clocksource clocksource_tsc = { .vread = vread_tsc, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + printk("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c index 72d444dede9b..355f5f506c81 100644 --- a/arch/x86_64/kernel/tsc_sync.c +++ b/arch/x86_64/kernel/tsc_sync.c @@ -138,7 +138,7 @@ void __cpuinit check_tsc_sync_source(int cpu) printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); - mark_tsc_unstable(); + mark_tsc_unstable("check_tsc_sync_source failed"); nr_warps = 0; max_warp = 0; last_tsc = 0; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index ae0654cd11ea..ee5759bef945 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -475,7 +475,7 @@ static void acpi_processor_idle(void) #ifdef CONFIG_GENERIC_TIME /* TSC halts in C2, so notify users */ - mark_tsc_unstable(); + mark_tsc_unstable("possible TSC halt in C2"); #endif /* Re-enable interrupts */ local_irq_enable(); @@ -517,7 +517,7 @@ static void acpi_processor_idle(void) #ifdef CONFIG_GENERIC_TIME /* TSC halts in C3, so notify users */ - mark_tsc_unstable(); + mark_tsc_unstable("TSC halts in C3"); #endif /* Re-enable interrupts */ local_irq_enable(); diff --git a/include/asm-i386/mach-summit/mach_mpparse.h b/include/asm-i386/mach-summit/mach_mpparse.h index 94268399170d..c2520539d934 100644 --- a/include/asm-i386/mach-summit/mach_mpparse.h +++ b/include/asm-i386/mach-summit/mach_mpparse.h @@ -30,7 +30,7 @@ static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ - mark_tsc_unstable(); + mark_tsc_unstable("Summit based system"); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; @@ -44,7 +44,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ - mark_tsc_unstable(); + mark_tsc_unstable("Summit based system"); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index 84016ff481b9..346976632e15 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -53,7 +53,7 @@ static __always_inline cycles_t get_cycles_sync(void) } extern void tsc_init(void); -extern void mark_tsc_unstable(void); +extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern void init_tsc_clocksource(void); diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h index 8c6808a3fba4..f6527e1b6c1c 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -27,6 +27,6 @@ extern int read_current_timer(unsigned long *timer_value); #define NS_SCALE 10 /* 2^10, carefully chosen */ #define US_SCALE 32 /* 2^32, arbitralrily chosen */ -extern void mark_tsc_unstable(void); +extern void mark_tsc_unstable(char *msg); extern void set_cyc2ns_scale(unsigned long khz); #endif From 9f7290ed23b1cedf7198ef7b67f2ed256bc8553e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 069/231] [PATCH] x86-64: fix ia32_binfmt.c build error Reorder code to avoid multiple inclusion of elf.h. #undef several symbols to avoid build errors over redefinitions. Signed-off-by: Ralf Baechle Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/ia32/ia32_binfmt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 071100ea1251..185399baaf6d 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -5,6 +5,11 @@ * This tricks binfmt_elf.c into loading 32bit binaries using lots * of ugly preprocessor tricks. Talk about very very poor man's inheritance. */ +#define __ASM_X86_64_ELF_H 1 + +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 + #include #include #include @@ -50,9 +55,6 @@ struct elf_phdr; #undef ELF_ARCH #define ELF_ARCH EM_386 -#undef ELF_CLASS -#define ELF_CLASS ELFCLASS32 - #define ELF_DATA ELFDATA2LSB #define USE_ELF_CORE_DUMP 1 @@ -136,7 +138,7 @@ struct elf_prpsinfo #define user user32 -#define __ASM_X86_64_ELF_H 1 +#undef elf_read_implies_exec #define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) //#include #include From f0e13ae76a607eab9c387544ebca550f2196a876 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH 070/231] [PATCH] x86-64: remove extra smp_processor_id calling Cc: "Eric W. Biederman" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index c6a5bc7e8118..318d9055cd97 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1407,8 +1407,7 @@ static void irq_complete_move(unsigned int irq) vector = ~get_irq_regs()->orig_rax; me = smp_processor_id(); - if ((vector == cfg->vector) && - cpu_isset(smp_processor_id(), cfg->domain)) { + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); From 786142fab86cfddd3e7797e9ccf8a8a3bcaf0456 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 071/231] [PATCH] x86-64: make simnow_init() static Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/early_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index b7aa9abe1c6a..92213d2b7c11 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -175,7 +175,7 @@ static noinline long simnow(long cmd, long a, long b, long c) return ret; } -void __init simnow_init(char *str) +static void __init simnow_init(char *str) { char *fn = "klog"; if (*str == '=') From 8eb68faed9e077c45c2bab5fce7c4e371fe9c28f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 072/231] [PATCH] i386: vmi_pmd_clear() static This patch makes the needlessly global vmi_pmd_clear() static. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Acked-by: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/vmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 440422482c6d..626c82063d19 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -516,7 +516,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } -void vmi_pmd_clear(pmd_t *pmd) +static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { 0 }; vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); From 8280c0c58e9762a9fe29d550a9db81410de77691 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 073/231] [PATCH] i386: fix GDT's number of quadwords in comment Fix comments to represent the true number of quadwords in GDT. Signed-off-by: Ahmed S. Darwish Signed-off-by: Andi Kleen Acked-by: Randy Dunlap Signed-off-by: Andrew Morton --- arch/i386/kernel/head.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 3fa7f9389afe..cb185f40c282 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -612,7 +612,7 @@ ENTRY(boot_gdt_table) .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ /* - * The Global Descriptor Table contains 28 quadwords, per-CPU. + * The Global Descriptor Table contains 32 quadwords, per-CPU. */ .align L1_CACHE_BYTES ENTRY(cpu_gdt_table) @@ -639,7 +639,7 @@ ENTRY(cpu_gdt_table) /* * Segments used for calling PnP BIOS have byte granularity. - * They code segments and data segments have fixed 64k limits, + * The code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ .quad 0x00409a000000ffff /* 0x90 32-bit code */ From 8b8ca80e192b10eecc01fc44a2902510af86f73b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 074/231] [PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 8 +- arch/x86_64/mm/numa.c | 263 +++++++++++++++----------- include/asm-x86_64/mmzone.h | 2 +- 3 files changed, 162 insertions(+), 111 deletions(-) diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 85f51e5a749f..7500aad95f3c 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -149,7 +149,13 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup - numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. + numa=fake=CMDLINE + If a number, fakes CMDLINE nodes and ignores NUMA setup of the + actual machine. Otherwise, system memory is configured + depending on the sizes and coefficients listed. For example: + numa=fake=2*512,1024,4*256 + gives two 512M nodes, a 1024M node, and four 256M nodes. The + remaining system RAM is allocated to an additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 41b8fb069924..c55936bc6be6 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -273,125 +273,172 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -int numa_fake __initdata = 0; +#define E820_ADDR_HOLE_SIZE(start, end) \ + (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ + PAGE_SHIFT) +char *cmdline __initdata; /* - * This function is used to find out if the start and end correspond to - * different zones. + * Setups up nid to range from addr to addr + size. If the end boundary is + * greater than max_addr, then max_addr is used instead. The return value is 0 + * if there is additional memory left for allocation past addr and -1 otherwise. + * addr is adjusted to be at the end of the node. */ -int zone_cross_over(unsigned long start, unsigned long end) +static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, + u64 size, u64 max_addr) { - if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && - (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) - return 1; - return 0; + int ret = 0; + nodes[nid].start = *addr; + *addr += size; + if (*addr >= max_addr) { + *addr = max_addr; + ret = -1; + } + nodes[nid].end = *addr; + node_set_online(nid); + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + nodes[nid].start, nodes[nid].end, + (nodes[nid].end - nodes[nid].start) >> 20); + return ret; } -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +/* + * Splits num_nodes nodes up equally starting at node_start. The return value + * is the number of nodes split up and addr is adjusted to be at the end of the + * last node allocated. + */ +static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, + int num_nodes) { - int i, big; - struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz, old_sz; - unsigned long hole_size; - unsigned long start, end; - unsigned long max_addr = (end_pfn << PAGE_SHIFT); + unsigned int big; + u64 size; + int i; - start = (start_pfn << PAGE_SHIFT); - hole_size = e820_hole_size(start, max_addr); - sz = (max_addr - start - hole_size) / numa_fake; - - /* Kludge needed for the hash function */ - - old_sz = sz; + if (num_nodes <= 0) + return -1; + if (num_nodes > MAX_NUMNODES) + num_nodes = MAX_NUMNODES; + size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / + num_nodes; /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the leftovers. */ - sz &= FAKE_NODE_MIN_HASH_MASK; + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / + FAKE_NODE_MIN_SIZE; - /* - * We ensure that each node is at least 64MB big. Smaller than this - * size can cause VM hiccups. - */ - if (sz == 0) { - printk(KERN_INFO "Not enough memory for %d nodes. Reducing " - "the number of nodes\n", numa_fake); - numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; - printk(KERN_INFO "Number of fake nodes will be = %d\n", - numa_fake); - sz = FAKE_NODE_MIN_SIZE; + /* Round down to nearest FAKE_NODE_MIN_SIZE. */ + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + printk(KERN_ERR "Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; } - /* - * Find out how many nodes can get an extra NODE_MIN_SIZE granule. - * This logic ensures the extra memory gets distributed among as many - * nodes as possible (as compared to one single node getting all that - * extra memory. - */ - big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; - printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " - "%d\n", - (sz >> 20), (hole_size >> 20), big); - memset(&nodes,0,sizeof(nodes)); - end = start; - for (i = 0; i < numa_fake; i++) { - /* - * In case we are not able to allocate enough memory for all - * the nodes, we reduce the number of fake nodes. - */ - if (end >= max_addr) { - numa_fake = i - 1; - break; - } - start = nodes[i].start = end; - /* - * Final node can have all the remaining memory. - */ - if (i == numa_fake-1) - sz = max_addr - start; - end = nodes[i].start + sz; - /* - * Fir "big" number of nodes get extra granule. - */ + + for (i = node_start; i < num_nodes + node_start; i++) { + u64 end = *addr + size; if (i < big) end += FAKE_NODE_MIN_SIZE; /* - * Iterate over the range to ensure that this node gets at - * least sz amount of RAM (excluding holes) + * The final node can have the remaining system RAM. Other + * nodes receive roughly the same amount of available pages. */ - while ((end - start - e820_hole_size(start, end)) < sz) { - end += FAKE_NODE_MIN_SIZE; - if (end >= max_addr) - break; - } - /* - * Look at the next node to make sure there is some real memory - * to map. Bad things happen when the only memory present - * in a zone on a fake node is IO hole. - */ - while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { - if (zone_cross_over(start, end + sz)) { - end = (MAX_DMA32_PFN << PAGE_SHIFT); - break; - } - if (end >= max_addr) - break; - end += FAKE_NODE_MIN_SIZE; - } - if (end > max_addr) + if (i == num_nodes + node_start - 1) end = max_addr; - nodes[i].end = end; - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", - i, - nodes[i].start, nodes[i].end, - (nodes[i].end - nodes[i].start) >> 20); - node_set_online(i); - } - memnode_shift = compute_hash_shift(nodes, numa_fake); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); - return -1; - } - for_each_online_node(i) { + else + while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < + size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + break; + } + return i - node_start + 1; +} + +/* + * Sets up the system RAM area from start_pfn to end_pfn according to the + * numa=fake command-line option. + */ +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + struct bootnode nodes[MAX_NUMNODES]; + u64 addr = start_pfn << PAGE_SHIFT; + u64 max_addr = end_pfn << PAGE_SHIFT; + unsigned int coeff; + unsigned int num = 0; + int num_nodes = 0; + u64 size; + int i; + + memset(&nodes, 0, sizeof(nodes)); + /* + * If the numa=fake command-line is just a single number N, split the + * system RAM into N fake nodes. + */ + if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, + simple_strtol(cmdline, NULL, 0)); + if (num_nodes < 0) + return num_nodes; + goto out; + } + + /* Parse the command line. */ + for (coeff = 1; ; cmdline++) { + if (*cmdline && isdigit(*cmdline)) { + num = num * 10 + *cmdline - '0'; + continue; + } + if (*cmdline == '*') + coeff = num; + if (!*cmdline || *cmdline == ',') { + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + * Command-line coefficients are in megabytes. + */ + size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; + if (size) { + for (i = 0; i < coeff; i++, num_nodes++) + if (setup_node_range(num_nodes, nodes, + &addr, size, max_addr) < 0) + goto done; + coeff = 1; + } + } + if (!*cmdline) + break; + num = 0; + } +done: + if (!num_nodes) + return -1; + /* Fill remainder of system RAM with a final node, if appropriate. */ + if (addr < max_addr) { + setup_node_range(num_nodes, nodes, &addr, max_addr - addr, + max_addr); + num_nodes++; + } +out: + memnode_shift = compute_hash_shift(nodes, num_nodes); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. NUMA emulation " + "disabled.\n"); + return -1; + } + + /* + * We need to vacate all active ranges that may have been registered by + * SRAT. + */ + remove_all_active_ranges(); + for_each_online_node(i) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) numa_init_array(); return 0; } -#endif +#undef E820_ADDR_HOLE_SIZE +#endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; #ifdef CONFIG_NUMA_EMU - if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, end_pfn)) return; #endif @@ -486,11 +534,8 @@ static __init int numa_setup(char *opt) if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU - if(!strncmp(opt, "fake=", 5)) { - numa_fake = simple_strtoul(opt+5,NULL,0); ; - if (numa_fake >= MAX_NUMNODES) - numa_fake = MAX_NUMNODES; - } + if (!strncmp(opt, "fake=", 5)) + cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index fb558fb1d211..19a89377b123 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE (64*1024*1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL)) #endif #endif From 14694d736bb66d0ec250d05c81c6e98a19c229c6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 075/231] [PATCH] x86-64: split remaining fake nodes equally Extends the numa=fake x86_64 command-line option to split the remaining system memory into equal-sized nodes. For example: numa=fake=2*512,4* gives two 512M nodes and the remaining system memory is split into four approximately equal chunks. This is beneficial for systems where the exact size of RAM is unknown or not necessarily relevant, but the granularity with which nodes shall be allocated is known. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 4 +++- arch/x86_64/mm/numa.c | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 7500aad95f3c..12a9aacecaae 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -155,7 +155,9 @@ NUMA depending on the sizes and coefficients listed. For example: numa=fake=2*512,1024,4*256 gives two 512M nodes, a 1024M node, and four 256M nodes. The - remaining system RAM is allocated to an additional node. + remaining system RAM is allocated to an additional node. If + the last character of CMDLINE is a *, the remaining system RAM + is instead divided up equally among its coefficient. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index c55936bc6be6..0ae2d9d5d7ea 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -418,11 +418,25 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) done: if (!num_nodes) return -1; - /* Fill remainder of system RAM with a final node, if appropriate. */ + /* Fill remainder of system RAM, if appropriate. */ if (addr < max_addr) { - setup_node_range(num_nodes, nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; + switch (*(cmdline - 1)) { + case '*': + /* Split remaining nodes into coeff chunks */ + if (coeff <= 0) + break; + num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes, coeff); + break; + case ',': + /* Do not allocate remaining system RAM */ + break; + default: + /* Give one final node */ + setup_node_range(num_nodes, nodes, &addr, + max_addr - addr, max_addr); + num_nodes++; + } } out: memnode_shift = compute_hash_shift(nodes, num_nodes); From 382591d500bbcd20a44416c5e0e292708468587c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 076/231] [PATCH] x86-64: fixed size remaining fake nodes Extends the numa=fake x86_64 command-line option to split the remaining system memory into nodes of fixed size. Any leftover memory is allocated to a final node unless the command-line ends with a comma. For example: numa=fake=2*512,*128 gives two 512M nodes and the remaining system memory is split into nodes of 128M each. This is beneficial for systems where the exact size of RAM is unknown or not necessarily relevant, but the size of the remaining nodes to be allocated is known based on their capacity for resource management. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 14 +++++--- arch/x86_64/mm/numa.c | 47 +++++++++++++++++++++------ 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 12a9aacecaae..6177d881983f 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -153,11 +153,15 @@ NUMA If a number, fakes CMDLINE nodes and ignores NUMA setup of the actual machine. Otherwise, system memory is configured depending on the sizes and coefficients listed. For example: - numa=fake=2*512,1024,4*256 - gives two 512M nodes, a 1024M node, and four 256M nodes. The - remaining system RAM is allocated to an additional node. If - the last character of CMDLINE is a *, the remaining system RAM - is instead divided up equally among its coefficient. + numa=fake=2*512,1024,4*256,*128 + gives two 512M nodes, a 1024M node, four 256M nodes, and the + rest split into 128M chunks. If the last character of CMDLINE + is a *, the remaining memory is divided up equally among its + coefficient: + numa=fake=2*512,2* + gives two 512M nodes and the rest split into two nodes. + Otherwise, the remaining system RAM is allocated to an + additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 0ae2d9d5d7ea..5ee07bc41eb5 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -361,6 +361,21 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, return i - node_start + 1; } +/* + * Splits the remaining system RAM into chunks of size. The remaining memory is + * always assigned to a final node and can be asymmetric. Returns the number of + * nodes split. + */ +static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, u64 size) +{ + int i = node_start; + size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; + while (!setup_node_range(i++, nodes, addr, size, max_addr)) + ; + return i - node_start; +} + /* * Sets up the system RAM area from start_pfn to end_pfn according to the * numa=fake command-line option. @@ -370,9 +385,10 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) struct bootnode nodes[MAX_NUMNODES]; u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - unsigned int coeff; - unsigned int num = 0; int num_nodes = 0; + int coeff_flag; + int coeff = -1; + int num = 0; u64 size; int i; @@ -390,29 +406,34 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) } /* Parse the command line. */ - for (coeff = 1; ; cmdline++) { + for (coeff_flag = 0; ; cmdline++) { if (*cmdline && isdigit(*cmdline)) { num = num * 10 + *cmdline - '0'; continue; } - if (*cmdline == '*') - coeff = num; + if (*cmdline == '*') { + if (num > 0) + coeff = num; + coeff_flag = 1; + } if (!*cmdline || *cmdline == ',') { + if (!coeff_flag) + coeff = 1; /* * Round down to the nearest FAKE_NODE_MIN_SIZE. * Command-line coefficients are in megabytes. */ size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) { + if (size) for (i = 0; i < coeff; i++, num_nodes++) if (setup_node_range(num_nodes, nodes, &addr, size, max_addr) < 0) goto done; - coeff = 1; - } + if (!*cmdline) + break; + coeff_flag = 0; + coeff = -1; } - if (!*cmdline) - break; num = 0; } done: @@ -420,6 +441,12 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) return -1; /* Fill remainder of system RAM, if appropriate. */ if (addr < max_addr) { + if (coeff_flag && coeff < 0) { + /* Split remaining nodes into num-sized chunks */ + num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes, num); + goto out; + } switch (*(cmdline - 1)) { case '*': /* Split remaining nodes into coeff chunks */ From d824395c5994adbf7efe377cc67f732133270554 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 077/231] [PATCH] x86: remove constant_tsc reporting from /proc/cpuinfo' power flags remove the reporting of the constant_tsc flag from the "power management" field in /proc/cpuinfo. The NULL value there was replaced by "" because the former would result in a printout of [8] if the flag is set. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/proc.c | 3 +-- arch/x86_64/kernel/setup.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 47e3ebbfb28d..89d91e6cc972 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, - NULL, /* constant_tsc - moved to flags */ + "", /* constant_tsc - moved to flags */ /* nothing */ }; struct cpuinfo_x86 *c = v; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b9bdfc1b54f8..0a1d539149df 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -979,9 +979,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, /* tsc invariant mapped to constant_tsc */ - NULL, - /* nothing */ /* constant_tsc - moved to flags */ + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ }; From 20280195f2a3d80c42a190959ca22108c93cd7e0 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 078/231] [PATCH] x86-64: fake numa for cpusets document Create a document to explain how to use numa=fake in conjunction with cpusets for coarse memory resource management. An attempt to get more awareness and testing for this feature. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/fake-numa-for-cpusets | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 Documentation/x86_64/fake-numa-for-cpusets diff --git a/Documentation/x86_64/fake-numa-for-cpusets b/Documentation/x86_64/fake-numa-for-cpusets new file mode 100644 index 000000000000..d1a985c5b00a --- /dev/null +++ b/Documentation/x86_64/fake-numa-for-cpusets @@ -0,0 +1,66 @@ +Using numa=fake and CPUSets for Resource Management +Written by David Rientjes + +This document describes how the numa=fake x86_64 command-line option can be used +in conjunction with cpusets for coarse memory management. Using this feature, +you can create fake NUMA nodes that represent contiguous chunks of memory and +assign them to cpusets and their attached tasks. This is a way of limiting the +amount of system memory that are available to a certain class of tasks. + +For more information on the features of cpusets, see Documentation/cpusets.txt. +There are a number of different configurations you can use for your needs. For +more information on the numa=fake command line option and its various ways of +configuring fake nodes, see Documentation/x86_64/boot-options.txt. + +For the purposes of this introduction, we'll assume a very primitive NUMA +emulation setup of "numa=fake=4*512,". This will split our system memory into +four equal chunks of 512M each that we can now use to assign to cpusets. As +you become more familiar with using this combination for resource control, +you'll determine a better setup to minimize the number of nodes you have to deal +with. + +A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: + + Faking node 0 at 0000000000000000-0000000020000000 (512MB) + Faking node 1 at 0000000020000000-0000000040000000 (512MB) + Faking node 2 at 0000000040000000-0000000060000000 (512MB) + Faking node 3 at 0000000060000000-0000000080000000 (512MB) + ... + On node 0 totalpages: 130975 + On node 1 totalpages: 131072 + On node 2 totalpages: 131072 + On node 3 totalpages: 131072 + +Now following the instructions for mounting the cpusets filesystem from +Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +address spaces) to individual cpusets: + + [root@xroads /]# mkdir exampleset + [root@xroads /]# mount -t cpuset none exampleset + [root@xroads /]# mkdir exampleset/ddset + [root@xroads /]# cd exampleset/ddset + [root@xroads /exampleset/ddset]# echo 0-1 > cpus + [root@xroads /exampleset/ddset]# echo 0-1 > mems + +Now this cpuset, 'ddset', will only allowed access to fake nodes 0 and 1 for +memory allocations (1G). + +You can now assign tasks to these cpusets to limit the memory resources +available to them according to the fake nodes assigned as mems: + + [root@xroads /exampleset/ddset]# echo $$ > tasks + [root@xroads /exampleset/ddset]# dd if=/dev/zero of=tmp bs=1024 count=1G + [1] 13425 + +Notice the difference between the system memory usage as reported by +/proc/meminfo between the restricted cpuset case above and the unrestricted +case (i.e. running the same 'dd' command without assigning it to a fake NUMA +cpuset): + Unrestricted Restricted + MemTotal: 3091900 kB 3091900 kB + MemFree: 42113 kB 1513236 kB + +This allows for coarse memory management for the tasks you assign to particular +cpusets. Since cpusets can form a hierarchy, you can create some pretty +interesting combinations of use-cases for various classes of tasks for your +memory management needs. From 1b523fb54977c9bb81b16c4badce581a2b455994 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 079/231] [PATCH] i386: VDSO_PRELINK warning fix The lguest patches somehow managed to trigger this: In file included from arch/i386/lguest/lguest.c:38: include/asm/asm-offsets.h:67:1: warning: "VDSO_PRELINK" redefined In file included from include/linux/elf.h:7, from include/linux/module.h:15, from include/linux/device.h:21, from include/linux/interrupt.h:15, from arch/i386/lguest/lguest.c:27: include/asm/elf.h:140:1: warning: this is the location of the previous definition I assume that using the same identifier twice was a bad idea.. Cc: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/asm-offsets.c | 2 +- arch/i386/kernel/vsyscall.lds.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index d822792d8e3e..655cc8d4c745 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -97,7 +97,7 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VDSO_PRELINK, VDSO_PRELINK); + DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index f66cd11adb72..4a8b0ed9b8fb 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,7 +7,7 @@ SECTIONS { - . = VDSO_PRELINK + SIZEOF_HEADERS; + . = VDSO_PRELINK_asm + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -21,7 +21,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK + 0x400; + . = VDSO_PRELINK_asm + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note From 692174b97d5b871f4b0f648b1fb17aa37b955876 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 080/231] [PATCH] i386: Initialize esp0 properly all the time Whenever we schedule, __switch_to calls load_esp0 which does: tss->esp0 = thread->esp0; This is never initialized for the initial thread (ie "swapper"), so when we're scheduling that, we end up setting esp0 to 0. This is fine: the swapper never leaves ring 0, so this field is never used. lguest, however, gets upset that we're trying to used an unmapped page as our kernel stack. Rather than work around it there, let's initialize it. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- include/asm-i386/processor.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 11bf899de8aa..01ae0ffcf236 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -421,6 +421,7 @@ struct thread_struct { }; #define INIT_THREAD { \ + .esp0 = sizeof(init_stack) + (long)&init_stack, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ From eab0c72aecd7982b2c848f7d493ba379efcef15e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 081/231] [PATCH] x86-64: Introduce load_TLS to the "for" loop. GCC (4.1 at least) unrolls it anyway, but I can't believe this code was ever justifiable. (I've also submitted a patch which cleans up i386, which is even uglier). Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- include/asm-x86_64/desc.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h index 7726e74db536..ac991b5ca0fd 100644 --- a/include/asm-x86_64/desc.h +++ b/include/asm-x86_64/desc.h @@ -135,16 +135,13 @@ static inline void set_ldt_desc(unsigned cpu, void *addr, int size) (info)->useable == 0 && \ (info)->lm == 0) -#if TLS_SIZE != 24 -# error update this code. -#endif - static inline void load_TLS(struct thread_struct *t, unsigned int cpu) { + unsigned int i; u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); - gdt[0] = t->tls_array[0]; - gdt[1] = t->tls_array[1]; - gdt[2] = t->tls_array[2]; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[i] = t->tls_array[i]; } /* From 79e030114a8d97a1dcd593ab84fb986f8c91c536 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH 082/231] [PATCH] i386: Allow i386 crash kernels to handle x86_64 dumps The specific case I am encountering is kdump under Xen with a 64 bit hypervisor and 32 bit kernel/userspace. The dump created is 64 bit due to the hypervisor but the dump kernel is 32 bit for maximum compatibility. It's possibly less likely to be useful in a purely native scenario but I see no reason to disallow it. [akpm@linux-foundation.org: build fix] Signed-off-by: Ian Campbell Signed-off-by: Andi Kleen Acked-by: Vivek Goyal Cc: Horms Cc: Magnus Damm Cc: "Eric W. Biederman" Cc: Andi Kleen Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 2 +- include/asm-i386/kexec.h | 3 +++ include/linux/crash_dump.h | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index d96050728c43..523e1098ae88 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -514,7 +514,7 @@ static int __init parse_crash_elf64_headers(void) /* Do some basic Verification. */ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !elf_check_arch(&ehdr) || + !vmcore_elf_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS64 || ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h index c5b4ab95bdc2..bcb5b21de2d2 100644 --- a/include/asm-i386/kexec.h +++ b/include/asm-i386/kexec.h @@ -42,6 +42,9 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_386 +/* We can also handle crash dumps from 64 bit kernel. */ +#define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) + #define MAX_NOTE_BYTES 1024 /* CPU does not save ss and esp on stack if execution is already diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 32503657f14f..22c7ac5cd80c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -14,5 +14,13 @@ extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, extern const struct file_operations proc_vmcore_operations; extern struct proc_dir_entry *proc_vmcore; +/* Architecture code defines this if there are other possible ELF + * machine types, e.g. on bi-arch capable hardware. */ +#ifndef vmcore_elf_check_arch_cross +#define vmcore_elf_check_arch_cross(x) 0 +#endif + +#define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x)) + #endif /* CONFIG_CRASH_DUMP */ #endif /* LINUX_CRASHDUMP_H */ From 8f9aeca7a081d81c4c9862be1e04f15b5ab5461f Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 083/231] [PATCH] x86: add command line length to boot protocol Because the command line is increased to 2048 characters after 2.6.21, it's not possible for boot loaders and userspace tools to determine the length of the command line the kernel can understand. The benefit of knowing the length is that users can be warned if the command line size is too long which prevents surprise if things don't work after bootup. This patch updates the boot protocol to contain a field called "cmdline_size" that contain the length of the command line (excluding the terminating zero). The patch also adds missing fields (of protocol version 2.05) to the x86_64 setup code. Signed-off-by: Bernhard Walle Signed-off-by: Andi Kleen Cc: Alon Bar-Lev Acked-by: H. Peter Anvin Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/i386/boot.txt | 23 +++++++++++++++++------ arch/i386/boot/setup.S | 7 ++++++- arch/x86_64/boot/setup.S | 7 ++++++- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 38fe1f03fb14..6498666ea330 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt @@ -2,7 +2,7 @@ ---------------------------- H. Peter Anvin - Last update 2007-01-26 + Last update 2007-03-06 On the i386 platform, the Linux kernel uses a rather complicated boot convention. This has evolved partially due to historical aspects, as @@ -35,9 +35,13 @@ Protocol 2.03: (Kernel 2.4.18-pre1) Explicitly makes the highest possible initrd address available to the bootloader. Protocol 2.04: (Kernel 2.6.14) Extend the syssize field to four bytes. + Protocol 2.05: (Kernel 2.6.20) Make protected mode kernel relocatable. Introduce relocatable_kernel and kernel_alignment fields. +Protocol 2.06: (Kernel 2.6.22) Added a field that contains the size of + the boot command line + **** MEMORY LAYOUT @@ -133,6 +137,8 @@ Offset Proto Name Meaning 022C/4 2.03+ initrd_addr_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not +0235/3 N/A pad2 Unused +0238/4 2.06+ cmdline_size Maximum size of the kernel command line (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -233,6 +239,12 @@ filled out, however: if your ramdisk is exactly 131072 bytes long and this field is 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) + cmdline_size: + The maximum size of the command line without the terminating + zero. This means that the command line can contain at most + cmdline_size characters. With protocol version 2.05 and + earlier, the maximum size was 255. + **** THE KERNEL COMMAND LINE @@ -241,11 +253,10 @@ loader to communicate with the kernel. Some of its options are also relevant to the boot loader itself, see "special command line options" below. -The kernel command line is a null-terminated string currently up to -255 characters long, plus the final null. A string that is too long -will be automatically truncated by the kernel, a boot loader may allow -a longer command line to be passed to permit future kernels to extend -this limit. +The kernel command line is a null-terminated string. The maximum +length can be retrieved from the field cmdline_size. Before protocol +version 2.06, the maximum was 255 characters. A string that is too +long will be automatically truncated by the kernel. If the boot protocol version is 2.02 or later, the address of the kernel command line is given by the header field cmd_line_ptr (see diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index 06edf1c66242..b74b7f40b79c 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -52,6 +52,7 @@ #include #include #include +#include /* Signature words to ensure LILO loaded us right */ #define SIG1 0xAA55 @@ -81,7 +82,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0205 # header version number (>= 0x0105) + .word 0x0206 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG @@ -171,6 +172,10 @@ relocatable_kernel: .byte 0 pad2: .byte 0 pad3: .word 0 +cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, + #added with boot protocol + #version 2.06 + trampoline: call start_of_setup .align 16 # The offset at this point is 0x240 diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 816d04faa2be..4f8851000d7e 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -51,6 +51,7 @@ #include #include #include +#include /* Signature words to ensure LILO loaded us right */ #define SIG1 0xAA55 @@ -80,7 +81,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0205 # header version number (>= 0x0105) + .word 0x0206 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG @@ -165,6 +166,10 @@ relocatable_kernel: .byte 0 pad2: .byte 0 pad3: .word 0 +cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, + #added with boot protocol + #version 2.06 + trampoline: call start_of_setup .align 16 # The offset at this point is 0x240 From ae1ee11be77f51cedb6c569887dddc70c163ab6d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 084/231] [PATCH] i386: Use per-cpu variables for GDT, PDA Allocating PDA and GDT at boot is a pain. Using simple per-cpu variables adds happiness (although we need the GDT page-aligned for Xen, which we do in a followup patch). [akpm@linux-foundation.org: build fix] Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 94 ++++------------------------ arch/i386/kernel/smpboot.c | 21 +------ arch/i386/mach-voyager/voyager_smp.c | 10 +-- include/asm-generic/percpu.h | 1 + include/asm-i386/desc.h | 1 + include/asm-i386/pda.h | 7 +-- include/asm-i386/processor.h | 2 +- 7 files changed, 21 insertions(+), 115 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index dcbbd0a8bfc2..2335f4464ead 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -25,8 +25,10 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(_cpu_pda); +DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]); + +DEFINE_PER_CPU(struct i386_pda, _cpu_pda); +EXPORT_PER_CPU_SYMBOL(_cpu_pda); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -609,52 +611,6 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) return regs; } -static __cpuinit int alloc_gdt(int cpu) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - /* - * This is a horrible hack to allocate the GDT. The problem - * is that cpu_init() is called really early for the boot CPU - * (and hence needs bootmem) but much later for the secondary - * CPUs, when bootmem will have gone away - */ - if (NODE_DATA(0)->bdata->node_bootmem_map) { - BUG_ON(gdt != NULL || pda != NULL); - - gdt = alloc_bootmem_pages(PAGE_SIZE); - pda = alloc_bootmem(sizeof(*pda)); - /* alloc_bootmem(_pages) panics on failure, so no check */ - - memset(gdt, 0, PAGE_SIZE); - memset(pda, 0, sizeof(*pda)); - } else { - /* GDT and PDA might already have been allocated if - this is a CPU hotplug re-insertion. */ - if (gdt == NULL) - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - - if (pda == NULL) - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); - - if (unlikely(!gdt || !pda)) { - free_pages((unsigned long)gdt, 0); - kfree(pda); - return 0; - } - } - - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_pda(cpu) = pda; - - return 1; -} - /* Initial PDA used by boot CPU */ struct i386_pda boot_pda = { ._pda = &boot_pda, @@ -670,31 +626,17 @@ static inline void set_kernel_fs(void) asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); } -/* Initialize the CPU's GDT and PDA. The boot CPU does this for - itself, but secondaries find this done for them. */ -__cpuinit int init_gdt(int cpu, struct task_struct *idle) +/* Initialize the CPU's GDT and PDA. This is either the boot CPU doing itself + (still using cpu_gdt_table), or a CPU doing it for a secondary which + will soon come up. */ +__cpuinit void init_gdt(int cpu, struct task_struct *idle) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; + struct desc_struct *gdt = per_cpu(cpu_gdt, cpu); + struct i386_pda *pda = &per_cpu(_cpu_pda, cpu); - /* For non-boot CPUs, the GDT and PDA should already have been - allocated. */ - if (!alloc_gdt(cpu)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); - return 0; - } - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - BUG_ON(gdt == NULL || pda == NULL); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ memcpy(gdt, cpu_gdt_table, GDT_SIZE); + cpu_gdt_descr->address = (unsigned long)gdt; cpu_gdt_descr->size = GDT_SIZE - 1; pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, @@ -706,17 +648,12 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) pda->_pda = pda; pda->cpu_number = cpu; pda->pcurrent = idle; - - return 1; } void __cpuinit cpu_set_gdt(int cpu) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - /* Reinit these anyway, even if they've already been done (on - the boot CPU, this will transition from the boot gdt+pda to - the real ones). */ load_gdt(cpu_gdt_descr); set_kernel_fs(); } @@ -804,13 +741,8 @@ void __cpuinit cpu_init(void) struct task_struct *curr = current; /* Set up the real GDT and PDA, so we can transition from the - boot versions. */ - if (!init_gdt(cpu, curr)) { - /* failed to allocate something; not much we can do... */ - for (;;) - local_irq_enable(); - } - + boot_gdt_table & boot_pda. */ + init_gdt(cpu, curr); cpu_set_gdt(cpu); _cpu_init(cpu, curr); } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 7b14e88b555f..b36a5f174cc9 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -808,13 +808,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - /* Pre-allocate and initialize the CPU's GDT and PDA so it - doesn't have to do any memory allocation during the - delicate CPU-bringup phase. */ - if (!init_gdt(cpu, idle)) { - printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); - return -1; /* ? */ - } + init_gdt(cpu, idle); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ @@ -940,7 +934,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) DECLARE_COMPLETION_ONSTACK(done); struct warm_boot_cpu_info info; int apicid, ret; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -948,18 +941,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) goto exit; } - /* - * the CPU isn't initialized at boot time, allocate gdt table here. - * cpu_init will initialize it - */ - if (!cpu_gdt_descr->address) { - cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); - if (!cpu_gdt_descr->address) - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - ret = -ENOMEM; - goto exit; - } - info.complete = &done; info.apicid = apicid; info.cpu = cpu; diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 74aeedf277f4..15d8132d4f5e 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -580,15 +580,7 @@ do_boot_cpu(__u8 cpu) /* init_tasks (in sched.c) is indexed logically */ stack_start.esp = (void *) idle->thread.esp; - /* Pre-allocate and initialize the CPU's GDT and PDA so it - doesn't have to do any memory allocation during the - delicate CPU-bringup phase. */ - if (!init_gdt(cpu, idle)) { - printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); - cpucount--; - return; - } - + init_gdt(cpu, idle); irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 196376262240..d984a9041436 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -1,6 +1,7 @@ #ifndef _ASM_GENERIC_PERCPU_H_ #define _ASM_GENERIC_PERCPU_H_ #include +#include #define __GENERIC_PER_CPU #ifdef CONFIG_SMP diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 050831f34f71..53c5916687b6 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -22,6 +22,7 @@ struct Xgt_desc_struct { extern struct Xgt_desc_struct idt_descr; DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); +DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]); extern struct Xgt_desc_struct early_gdt_descr; static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index b12d59a318b7..aef7f732f77e 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -8,6 +8,7 @@ #include #include +#include struct i386_pda { @@ -18,10 +19,8 @@ struct i386_pda struct pt_regs *irq_regs; }; -extern struct i386_pda *_cpu_pda[]; - -#define cpu_pda(i) (_cpu_pda[i]) - +DECLARE_PER_CPU(struct i386_pda, _cpu_pda); +#define cpu_pda(i) (&per_cpu(_cpu_pda, (i))) #define pda_offset(field) offsetof(struct i386_pda, field) extern void __bad_pda_field(void); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 01ae0ffcf236..cd940befef53 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -743,7 +743,7 @@ extern unsigned long boot_option_idle_override; extern void enable_sep_cpu(void); extern int sysenter_setup(void); -extern int init_gdt(int cpu, struct task_struct *idle); +extern void init_gdt(int cpu, struct task_struct *idle); extern void cpu_set_gdt(int); extern void secondary_cpu_init(void); From bf50467204b435421d8de33ad080fa46c6f3d50b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 085/231] [PATCH] i386: Use per-cpu GDT immediately upon boot Now we are no longer dynamically allocating the GDT, we don't need the "cpu_gdt_table" at all: we can switch straight from "boot_gdt_table" to the per-cpu GDT. This means initializing the cpu_gdt array in C. The boot CPU uses the per-cpu var directly, then in smp_prepare_cpus() it switches to the per-cpu copy just allocated. For secondary CPUs, the early_gdt_descr is set to point directly to their per-cpu copy. For UP the code is very simple: it keeps using the "per-cpu" GDT as per SMP, but we never have to move. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 72 +++++++++++----------------- arch/i386/kernel/head.S | 55 +-------------------- arch/i386/kernel/smpboot.c | 59 ++++++++++++++++++----- arch/i386/mach-voyager/voyager_smp.c | 6 --- include/asm-i386/desc.h | 2 - include/asm-i386/processor.h | 1 - 6 files changed, 75 insertions(+), 120 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 2335f4464ead..fd6b079f7609 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -25,7 +25,33 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); -DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]); +DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = { + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, + [GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */ +}; DEFINE_PER_CPU(struct i386_pda, _cpu_pda); EXPORT_PER_CPU_SYMBOL(_cpu_pda); @@ -618,46 +644,6 @@ struct i386_pda boot_pda = { .pcurrent = &init_task, }; -static inline void set_kernel_fs(void) -{ - /* Set %fs for this CPU's PDA. Memory clobber is to create a - barrier with respect to any PDA operations, so the compiler - doesn't move any before here. */ - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); -} - -/* Initialize the CPU's GDT and PDA. This is either the boot CPU doing itself - (still using cpu_gdt_table), or a CPU doing it for a secondary which - will soon come up. */ -__cpuinit void init_gdt(int cpu, struct task_struct *idle) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt = per_cpu(cpu_gdt, cpu); - struct i386_pda *pda = &per_cpu(_cpu_pda, cpu); - - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_gdt_descr->size = GDT_SIZE - 1; - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - memset(pda, 0, sizeof(*pda)); - pda->_pda = pda; - pda->cpu_number = cpu; - pda->pcurrent = idle; -} - -void __cpuinit cpu_set_gdt(int cpu) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - - load_gdt(cpu_gdt_descr); - set_kernel_fs(); -} - /* Common CPU init for both boot and secondary CPUs */ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) { @@ -740,10 +726,6 @@ void __cpuinit cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; - /* Set up the real GDT and PDA, so we can transition from the - boot_gdt_table & boot_pda. */ - init_gdt(cpu, curr); - cpu_set_gdt(cpu); _cpu_init(cpu, curr); } diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index cb185f40c282..633fd2f47429 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -599,7 +599,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long cpu_gdt_table + .long per_cpu__cpu_gdt /* Overwritten for secondary CPUs */ /* * The boot_gdt_table must mirror the equivalent in setup.S and is @@ -610,56 +610,3 @@ ENTRY(boot_gdt_table) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ - -/* - * The Global Descriptor Table contains 32 quadwords, per-CPU. - */ - .align L1_CACHE_BYTES -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* 0x0b reserved */ - .quad 0x0000000000000000 /* 0x13 reserved */ - .quad 0x0000000000000000 /* 0x1b reserved */ - .quad 0x0000000000000000 /* 0x20 unused */ - .quad 0x0000000000000000 /* 0x28 unused */ - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ - .quad 0x0000000000000000 /* 0x4b reserved */ - .quad 0x0000000000000000 /* 0x53 reserved */ - .quad 0x0000000000000000 /* 0x5b reserved */ - - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ - - .quad 0x0000000000000000 /* 0x80 TSS descriptor */ - .quad 0x0000000000000000 /* 0x88 LDT descriptor */ - - /* - * Segments used for calling PnP BIOS have byte granularity. - * The code segments and data segments have fixed 64k limits, - * the transfer segment sizes are set at run time. - */ - .quad 0x00409a000000ffff /* 0x90 32-bit code */ - .quad 0x00009a000000ffff /* 0x98 16-bit code */ - .quad 0x000092000000ffff /* 0xa0 16-bit data */ - .quad 0x0000920000000000 /* 0xa8 16-bit data */ - .quad 0x0000920000000000 /* 0xb0 16-bit data */ - - /* - * The APM segments have byte granularity and their bases - * are set at run time. All have 64k limits. - */ - .quad 0x00409a000000ffff /* 0xb8 APM CS code */ - .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x004092000000ffff /* 0xc8 APM DS data */ - - .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x00cf92000000ffff /* 0xd8 - PDA */ - .quad 0x0000000000000000 /* 0xe0 - unused */ - .quad 0x0000000000000000 /* 0xe8 - unused */ - .quad 0x0000000000000000 /* 0xf0 - unused */ - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index b36a5f174cc9..a9447c3e86dd 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -439,12 +439,6 @@ static void __cpuinit start_secondary(void *unused) */ void __devinit initialize_secondary(void) { - /* - * switch to the per CPU GDT we already set up - * in do_boot_cpu() - */ - cpu_set_gdt(current_thread_info()->cpu); - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. @@ -787,6 +781,32 @@ static inline struct task_struct * alloc_idle_task(int cpu) #define alloc_idle_task(cpu) fork_idle(cpu) #endif +/* Initialize the CPU's GDT. This is either the boot CPU doing itself + (still using the master per-cpu area), or a CPU doing it for a + secondary which will soon come up. */ +static __cpuinit void init_gdt(int cpu, struct task_struct *idle) +{ + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt = per_cpu(cpu_gdt, cpu); + struct i386_pda *pda = &per_cpu(_cpu_pda, cpu); + + cpu_gdt_descr->address = (unsigned long)gdt; + cpu_gdt_descr->size = GDT_SIZE - 1; + + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, + (u32 *)&gdt[GDT_ENTRY_PDA].b, + (unsigned long)pda, sizeof(*pda) - 1, + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ + + memset(pda, 0, sizeof(*pda)); + pda->_pda = pda; + pda->cpu_number = cpu; + pda->pcurrent = idle; +} + +/* Defined in head.S */ +extern struct Xgt_desc_struct early_gdt_descr; + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -809,6 +829,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) panic("failed fork for CPU %d", cpu); init_gdt(cpu, idle); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + start_pda = cpu_pda(cpu); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ @@ -1161,13 +1183,26 @@ void __init smp_prepare_cpus(unsigned int max_cpus) smp_boot_cpus(max_cpus); } -void __devinit smp_prepare_boot_cpu(void) +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +static inline void switch_to_new_gdt(void) { - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); - cpu_set(smp_processor_id(), cpu_present_map); - cpu_set(smp_processor_id(), cpu_possible_map); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + load_gdt(&per_cpu(cpu_gdt_descr, smp_processor_id())); + asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); +} + +void __init smp_prepare_boot_cpu(void) +{ + unsigned int cpu = smp_processor_id(); + + init_gdt(cpu, current); + switch_to_new_gdt(); + + cpu_set(cpu, cpu_online_map); + cpu_set(cpu, cpu_callout_map); + cpu_set(cpu, cpu_present_map); + cpu_set(cpu, cpu_possible_map); + __get_cpu_var(cpu_state) = CPU_ONLINE; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 15d8132d4f5e..b9ce33c0c202 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -764,12 +764,6 @@ initialize_secondary(void) set_current(hard_get_current()); #endif - /* - * switch to the per CPU GDT we already set up - * in do_boot_cpu() - */ - cpu_set_gdt(current_thread_info()->cpu); - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 53c5916687b6..a75ae6b97860 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -12,8 +12,6 @@ #include -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; - struct Xgt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index cd940befef53..b25a2f5b5375 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -743,7 +743,6 @@ extern unsigned long boot_option_idle_override; extern void enable_sep_cpu(void); extern int sysenter_setup(void); -extern void init_gdt(int cpu, struct task_struct *idle); extern void cpu_set_gdt(int); extern void secondary_cpu_init(void); From d2cbcc49e2bfd6eaa44d7e4e5e5f171aaa5ec80d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 086/231] [PATCH] i386: clean up cpu_init() We now have cpu_init() and secondary_cpu_init() doing nothing but calling _cpu_init() with the same arguments. Rename _cpu_init() to cpu_init() and use it as a replcement for secondary_cpu_init(). Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 34 +++++++++------------------------- arch/i386/kernel/smpboot.c | 8 ++++---- include/asm-i386/processor.h | 2 +- 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index fd6b079f7609..2a26956fce42 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -644,9 +644,16 @@ struct i386_pda boot_pda = { .pcurrent = &init_task, }; -/* Common CPU init for both boot and secondary CPUs */ -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) { + int cpu = smp_processor_id(); + struct task_struct *curr = current; struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; @@ -706,29 +713,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) mxcsr_feature_mask_init(); } -/* Entrypoint to initialize secondary CPU */ -void __cpuinit secondary_cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - _cpu_init(cpu, curr); -} - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __cpuinit cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - _cpu_init(cpu, curr); -} - #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index a9447c3e86dd..954245f6d307 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -378,14 +378,14 @@ set_cpu_sibling_map(int cpu) static void __cpuinit start_secondary(void *unused) { /* - * Don't put *anything* before secondary_cpu_init(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. + * Don't put *anything* before cpu_init(), SMP booting is too + * fragile that we want to limit the things done here to the + * most necessary things. */ #ifdef CONFIG_VMI vmi_bringup(); #endif - secondary_cpu_init(); + cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index b25a2f5b5375..80f7e8a1e878 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -744,6 +744,6 @@ extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void cpu_set_gdt(int); -extern void secondary_cpu_init(void); +extern void cpu_init(void); #endif /* __ASM_I386_PROCESSOR_H */ From 52de74dd3994e165ef1b35c33d54655a6400e30c Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 087/231] [PATCH] i386: Rename boot_gdt_table to boot_gdt Rename boot_gdt_table to boot_gdt to avoid the duplicate T(able). Signed-off-by: Sebastien Dugue Signed-off-by: Andi Kleen Acked-by: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/head.S | 9 ++++----- arch/i386/kernel/trampoline.S | 12 ++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 633fd2f47429..cc46494787e8 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -147,8 +147,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but - * we know the trampoline has already loaded the boot_gdt_table GDT - * for us. + * we know the trampoline has already loaded the boot_gdt for us. * * If cpu hotplug is not supported then this code can go in init section * which will be freed later @@ -588,7 +587,7 @@ fault_msg: .word 0 # 32 bit align gdt_desc.address boot_gdt_descr: .word __BOOT_DS+7 - .long boot_gdt_table - __PAGE_OFFSET + .long boot_gdt - __PAGE_OFFSET .word 0 # 32-bit align idt_desc.address idt_descr: @@ -602,11 +601,11 @@ ENTRY(early_gdt_descr) .long per_cpu__cpu_gdt /* Overwritten for secondary CPUs */ /* - * The boot_gdt_table must mirror the equivalent in setup.S and is + * The boot_gdt must mirror the equivalent in setup.S and is * used only for booting. */ .align L1_CACHE_BYTES -ENTRY(boot_gdt_table) +ENTRY(boot_gdt) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S index 2f1814c5cfd7..f62815f8d06a 100644 --- a/arch/i386/kernel/trampoline.S +++ b/arch/i386/kernel/trampoline.S @@ -29,7 +29,7 @@ * * TYPE VALUE * R_386_32 startup_32_smp - * R_386_32 boot_gdt_table + * R_386_32 boot_gdt */ #include @@ -62,8 +62,8 @@ r_base = . * to 32 bit. */ - lidtl boot_idt - r_base # load idt with 0, 0 - lgdtl boot_gdt - r_base # load gdt with whatever is appropriate + lidtl boot_idt_descr - r_base # load idt with 0, 0 + lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate xor %ax, %ax inc %ax # protected mode (PE) bit @@ -73,11 +73,11 @@ r_base = . # These need to be in the same 64K segment as the above; # hence we don't use the boot_gdt_descr defined in head.S -boot_gdt: +boot_gdt_descr: .word __BOOT_DS + 7 # gdt limit - .long boot_gdt_table-__PAGE_OFFSET # gdt base + .long boot_gdt - __PAGE_OFFSET # gdt base -boot_idt: +boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L From 90a0a06aa81692028864c21f981905fda46b1208 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 088/231] [PATCH] i386: rationalize paravirt wrappers paravirt.c used to implement native versions of all low-level functions. Far cleaner is to have the native versions exposed in the headers and as inline native_XXX, and if !CONFIG_PARAVIRT, then simply #define XXX native_XXX. There are several nice side effects: 1) write_dt_entry() now takes the correct "struct Xgt_desc_struct *" not "void *". 2) load_TLS is reintroduced to the for loop, not manually unrolled with a #error in case the bounds ever change. 3) Macros become inlines, with type checking. 4) Access to the native versions is trivial for KVM, lguest, Xen and others who might want it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Avi Kivity Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 293 +---------------------------------- include/asm-i386/desc.h | 82 ++++++---- include/asm-i386/io.h | 15 +- include/asm-i386/irqflags.h | 61 +++++--- include/asm-i386/msr.h | 157 +++++++++++++------ include/asm-i386/paravirt.h | 17 +- include/asm-i386/processor.h | 94 ++++++++--- include/asm-i386/system.h | 139 ++++++++++------- 8 files changed, 386 insertions(+), 472 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 2ec331e03fa9..47698756aec5 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -93,294 +93,11 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) return insn_len; } -static unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - asm("movl %%db0, %0" :"=r" (val)); break; - case 1: - asm("movl %%db1, %0" :"=r" (val)); break; - case 2: - asm("movl %%db2, %0" :"=r" (val)); break; - case 3: - asm("movl %%db3, %0" :"=r" (val)); break; - case 6: - asm("movl %%db6, %0" :"=r" (val)); break; - case 7: - asm("movl %%db7, %0" :"=r" (val)); break; - default: - BUG(); - } - return val; -} - -static void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("movl %0,%%db0" : /* no output */ :"r" (value)); - break; - case 1: - asm("movl %0,%%db1" : /* no output */ :"r" (value)); - break; - case 2: - asm("movl %0,%%db2" : /* no output */ :"r" (value)); - break; - case 3: - asm("movl %0,%%db3" : /* no output */ :"r" (value)); - break; - case 6: - asm("movl %0,%%db6" : /* no output */ :"r" (value)); - break; - case 7: - asm("movl %0,%%db7" : /* no output */ :"r" (value)); - break; - default: - BUG(); - } -} - void init_IRQ(void) { paravirt_ops.init_IRQ(); } -static void native_clts(void) -{ - asm volatile ("clts"); -} - -static unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr0(unsigned long val) -{ - asm volatile("movl %0,%%cr0": :"r" (val)); -} - -static unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr2(unsigned long val) -{ - asm volatile("movl %0,%%cr2": :"r" (val)); -} - -static unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr3(unsigned long val) -{ - asm volatile("movl %0,%%cr3": :"r" (val)); -} - -static unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); - return val; -} - -static unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist */ - asm("1: movl %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (val): "0" (0)); - return val; -} - -static void native_write_cr4(unsigned long val) -{ - asm volatile("movl %0,%%cr4": :"r" (val)); -} - -static unsigned long native_save_fl(void) -{ - unsigned long f; - asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); - return f; -} - -static void native_restore_fl(unsigned long f) -{ - asm volatile("pushl %0 ; popfl": /* no output */ - :"g" (f) - :"memory", "cc"); -} - -static void native_irq_disable(void) -{ - asm volatile("cli": : :"memory"); -} - -static void native_irq_enable(void) -{ - asm volatile("sti": : :"memory"); -} - -static void native_safe_halt(void) -{ - asm volatile("sti; hlt": : :"memory"); -} - -static void native_halt(void) -{ - asm volatile("hlt": : :"memory"); -} - -static void native_wbinvd(void) -{ - asm volatile("wbinvd": : :"memory"); -} - -static unsigned long long native_read_msr(unsigned int msr, int *err) -{ - unsigned long long val; - - asm volatile("2: rdmsr ; xorl %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: movl %3,%0 ; jmp 1b\n\t" - ".previous\n\t" - ".section __ex_table,\"a\"\n" - " .align 4\n\t" - " .long 2b,3b\n\t" - ".previous" - : "=r" (*err), "=A" (val) - : "c" (msr), "i" (-EFAULT)); - - return val; -} - -static int native_write_msr(unsigned int msr, unsigned long long val) -{ - int err; - asm volatile("2: wrmsr ; xorl %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: movl %4,%0 ; jmp 1b\n\t" - ".previous\n\t" - ".section __ex_table,\"a\"\n" - " .align 4\n\t" - " .long 2b,3b\n\t" - ".previous" - : "=a" (err) - : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), - "i" (-EFAULT)); - return err; -} - -static unsigned long long native_read_tsc(void) -{ - unsigned long long val; - asm volatile("rdtsc" : "=A" (val)); - return val; -} - -static unsigned long long native_read_pmc(void) -{ - unsigned long long val; - asm volatile("rdpmc" : "=A" (val)); - return val; -} - -static void native_load_tr_desc(void) -{ - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); -} - -static void native_load_gdt(const struct Xgt_desc_struct *dtr) -{ - asm volatile("lgdt %0"::"m" (*dtr)); -} - -static void native_load_idt(const struct Xgt_desc_struct *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} - -static void native_store_gdt(struct Xgt_desc_struct *dtr) -{ - asm ("sgdt %0":"=m" (*dtr)); -} - -static void native_store_idt(struct Xgt_desc_struct *dtr) -{ - asm ("sidt %0":"=m" (*dtr)); -} - -static unsigned long native_store_tr(void) -{ - unsigned long tr; - asm ("str %0":"=r" (tr)); - return tr; -} - -static void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] - C(0); C(1); C(2); -#undef C -} - -static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) -{ - u32 *lp = (u32 *)((char *)dt + entry*8); - lp[0] = entry_low; - lp[1] = entry_high; -} - -static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, entrynum, low, high); -} - -static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, entrynum, low, high); -} - -static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, entrynum, low, high); -} - -static void native_load_esp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - tss->esp0 = thread->esp0; - - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -} - -static void native_io_delay(void) -{ - asm volatile("outb %al,$0x80"); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -517,8 +234,8 @@ struct paravirt_ops paravirt_ops = { .safe_halt = native_safe_halt, .halt = native_halt, .wbinvd = native_wbinvd, - .read_msr = native_read_msr, - .write_msr = native_write_msr, + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .get_scheduled_cycles = native_read_tsc, @@ -531,9 +248,9 @@ struct paravirt_ops paravirt_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, - .write_ldt_entry = native_write_ldt_entry, - .write_gdt_entry = native_write_gdt_entry, - .write_idt_entry = native_write_idt_entry, + .write_ldt_entry = write_dt_entry, + .write_gdt_entry = write_dt_entry, + .write_idt_entry = write_dt_entry, .load_esp0 = native_load_esp0, .set_iopl_mask = native_set_iopl_mask, diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index a75ae6b97860..13f701ea9a8f 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -57,45 +57,33 @@ static inline void pack_gate(__u32 *a, __u32 *b, #ifdef CONFIG_PARAVIRT #include #else -#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) - -#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) -#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) -#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) -#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) -#define store_tr(tr) __asm__ ("str %0":"=m" (tr)) +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) -#if TLS_SIZE != 24 -# error update this code. -#endif - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] - C(0); C(1); C(2); -#undef C -} +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) +#endif -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) +static inline void write_dt_entry(struct desc_struct *dt, + int entry, u32 entry_low, u32 entry_high) { - __u32 *lp = (__u32 *)((char *)dt + entry*8); - *lp = entry_a; - *(lp+1) = entry_b; + dt[entry].a = entry_low; + dt[entry].b = entry_high; } -#define set_ldt native_set_ldt -#endif /* CONFIG_PARAVIRT */ - -static inline fastcall void native_set_ldt(const void *addr, - unsigned int entries) +static inline void native_set_ldt(const void *addr, unsigned int entries) { if (likely(entries == 0)) __asm__ __volatile__("lldt %w0"::"q" (0)); @@ -111,6 +99,48 @@ static inline fastcall void native_set_ldt(const void *addr, } } + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct Xgt_desc_struct *dtr) +{ + asm ("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct Xgt_desc_struct *dtr) +{ + asm ("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + asm ("str %0":"=r" (tr)); + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} + static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) { __u32 a, b; diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h index 59fe616933c4..e797586a5bfc 100644 --- a/include/asm-i386/io.h +++ b/include/asm-i386/io.h @@ -250,19 +250,22 @@ static inline void flush_write_buffers(void) #endif /* __KERNEL__ */ +static inline void native_io_delay(void) +{ + asm volatile("outb %%al,$0x80" : : : "memory"); +} + #if defined(CONFIG_PARAVIRT) #include #else -#define __SLOW_DOWN_IO "outb %%al,$0x80;" - static inline void slow_down_io(void) { - __asm__ __volatile__( - __SLOW_DOWN_IO + native_io_delay(); #ifdef REALLY_SLOW_IO - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); #endif - : : ); } #endif diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index 17b18cf4fe9d..c1cdd094938e 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -10,6 +10,42 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#ifndef __ASSEMBLY__ +static inline unsigned long native_save_fl(void) +{ + unsigned long f; + asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); + return f; +} + +static inline void native_restore_fl(unsigned long f) +{ + asm volatile("pushl %0 ; popfl": /* no output */ + :"g" (f) + :"memory", "cc"); +} + +static inline void native_irq_disable(void) +{ + asm volatile("cli": : :"memory"); +} + +static inline void native_irq_enable(void) +{ + asm volatile("sti": : :"memory"); +} + +static inline void native_safe_halt(void) +{ + asm volatile("sti; hlt": : :"memory"); +} + +static inline void native_halt(void) +{ + asm volatile("hlt": : :"memory"); +} +#endif /* __ASSEMBLY__ */ + #ifdef CONFIG_PARAVIRT #include #else @@ -17,35 +53,22 @@ static inline unsigned long __raw_local_save_flags(void) { - unsigned long flags; - - __asm__ __volatile__( - "pushfl ; popl %0" - : "=g" (flags) - : /* no input */ - ); - - return flags; + return native_save_fl(); } static inline void raw_local_irq_restore(unsigned long flags) { - __asm__ __volatile__( - "pushl %0 ; popfl" - : /* no output */ - :"g" (flags) - :"memory", "cc" - ); + native_restore_fl(flags); } static inline void raw_local_irq_disable(void) { - __asm__ __volatile__("cli" : : : "memory"); + native_irq_disable(); } static inline void raw_local_irq_enable(void) { - __asm__ __volatile__("sti" : : : "memory"); + native_irq_enable(); } /* @@ -54,7 +77,7 @@ static inline void raw_local_irq_enable(void) */ static inline void raw_safe_halt(void) { - __asm__ __volatile__("sti; hlt" : : : "memory"); + native_safe_halt(); } /* @@ -63,7 +86,7 @@ static inline void raw_safe_halt(void) */ static inline void halt(void) { - __asm__ __volatile__("hlt": : :"memory"); + native_halt(); } /* diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index 2ad3f30b1a68..00acaa8b36bb 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -1,6 +1,74 @@ #ifndef __ASM_MSR_H #define __ASM_MSR_H +#include + +static inline unsigned long long native_read_msr(unsigned int msr) +{ + unsigned long long val; + + asm volatile("rdmsr" : "=A" (val) : "c" (msr)); + return val; +} + +static inline unsigned long long native_read_msr_safe(unsigned int msr, + int *err) +{ + unsigned long long val; + + asm volatile("2: rdmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=r" (*err), "=A" (val) + : "c" (msr), "i" (-EFAULT)); + + return val; +} + +static inline void native_write_msr(unsigned int msr, unsigned long long val) +{ + asm volatile("wrmsr" : : "c" (msr), "A"(val)); +} + +static inline int native_write_msr_safe(unsigned int msr, + unsigned long long val) +{ + int err; + asm volatile("2: wrmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=a" (err) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), + "i" (-EFAULT)); + return err; +} + +static inline unsigned long long native_read_tsc(void) +{ + unsigned long long val; + asm volatile("rdtsc" : "=A" (val)); + return val; +} + +static inline unsigned long long native_read_pmc(void) +{ + unsigned long long val; + asm volatile("rdpmc" : "=A" (val)); + return val; +} + #ifdef CONFIG_PARAVIRT #include #else @@ -11,22 +79,20 @@ * pointer indirection), this allows gcc to optimize better */ -#define rdmsr(msr,val1,val2) \ - __asm__ __volatile__("rdmsr" \ - : "=a" (val1), "=d" (val2) \ - : "c" (msr)) +#define rdmsr(msr,val1,val2) \ + do { \ + unsigned long long __val = native_read_msr(msr); \ + val1 = __val; \ + val2 = __val >> 32; \ + } while(0) -#define wrmsr(msr,val1,val2) \ - __asm__ __volatile__("wrmsr" \ - : /* no outputs */ \ - : "c" (msr), "a" (val1), "d" (val2)) +#define wrmsr(msr,val1,val2) \ + native_write_msr(msr, ((unsigned long long)val2 << 32) | val1) -#define rdmsrl(msr,val) do { \ - unsigned long l__,h__; \ - rdmsr (msr, l__, h__); \ - val = l__; \ - val |= ((u64)h__<<32); \ -} while(0) +#define rdmsrl(msr,val) \ + do { \ + (val) = native_read_msr(msr); \ + } while(0) static inline void wrmsrl (unsigned long msr, unsigned long long val) { @@ -37,50 +103,41 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val) } /* wrmsr with exception handling */ -#define wrmsr_safe(msr,a,b) ({ int ret__; \ - asm volatile("2: wrmsr ; xorl %0,%0\n" \ - "1:\n\t" \ - ".section .fixup,\"ax\"\n\t" \ - "3: movl %4,%0 ; jmp 1b\n\t" \ - ".previous\n\t" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n\t" \ - " .long 2b,3b\n\t" \ - ".previous" \ - : "=a" (ret__) \ - : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT));\ - ret__; }) +#define wrmsr_safe(msr,val1,val2) \ + (native_write_msr_safe(msr, ((unsigned long long)val2 << 32) | val1)) /* rdmsr with exception handling */ -#define rdmsr_safe(msr,a,b) ({ int ret__; \ - asm volatile("2: rdmsr ; xorl %0,%0\n" \ - "1:\n\t" \ - ".section .fixup,\"ax\"\n\t" \ - "3: movl %4,%0 ; jmp 1b\n\t" \ - ".previous\n\t" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n\t" \ - " .long 2b,3b\n\t" \ - ".previous" \ - : "=r" (ret__), "=a" (*(a)), "=d" (*(b)) \ - : "c" (msr), "i" (-EFAULT));\ - ret__; }) +#define rdmsr_safe(msr,p1,p2) \ + ({ \ + int __err; \ + unsigned long long __val = native_read_msr_safe(msr, &__err);\ + (*p1) = __val; \ + (*p2) = __val >> 32; \ + __err; \ + }) -#define rdtsc(low,high) \ - __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) +#define rdtsc(low,high) \ + do { \ + u64 _l = native_read_tsc(); \ + (low) = (u32)_l; \ + (high) = _l >> 32; \ + } while(0) -#define rdtscl(low) \ - __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx") +#define rdtscl(low) \ + do { \ + (low) = native_read_tsc(); \ + } while(0) -#define rdtscll(val) \ - __asm__ __volatile__("rdtsc" : "=A" (val)) +#define rdtscll(val) ((val) = native_read_tsc()) #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) -#define rdpmc(counter,low,high) \ - __asm__ __volatile__("rdpmc" \ - : "=a" (low), "=d" (high) \ - : "c" (counter)) +#define rdpmc(counter,low,high) \ + do { \ + u64 _l = native_read_pmc(); \ + low = (u32)_l; \ + high = _l >> 32; \ + } while(0) #endif /* !CONFIG_PARAVIRT */ #ifdef CONFIG_SMP diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index e63f1e444fcf..32acebce9ae2 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -29,6 +29,7 @@ struct thread_struct; struct Xgt_desc_struct; struct tss_struct; struct mm_struct; +struct desc_struct; struct paravirt_ops { unsigned int kernel_rpl; @@ -105,14 +106,13 @@ struct paravirt_ops void (*set_ldt)(const void *desc, unsigned entries); unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); - void (*write_ldt_entry)(void *dt, int entrynum, - u32 low, u32 high); - void (*write_gdt_entry)(void *dt, int entrynum, - u32 low, u32 high); - void (*write_idt_entry)(void *dt, int entrynum, - u32 low, u32 high); - void (*load_esp0)(struct tss_struct *tss, - struct thread_struct *thread); + void (*write_ldt_entry)(struct desc_struct *, + int entrynum, u32 low, u32 high); + void (*write_gdt_entry)(struct desc_struct *, + int entrynum, u32 low, u32 high); + void (*write_idt_entry)(struct desc_struct *, + int entrynum, u32 low, u32 high); + void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t); void (*set_iopl_mask)(unsigned mask); @@ -232,6 +232,7 @@ static inline void halt(void) #define get_kernel_rpl() (paravirt_ops.kernel_rpl) +/* These should all do BUG_ON(_err), but our headers are too tangled. */ #define rdmsr(msr,val1,val2) do { \ int _err; \ u64 _l = paravirt_ops.read_msr(msr,&_err); \ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 80f7e8a1e878..96edfdfe32d1 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -147,7 +147,7 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {} #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ -static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx, +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ @@ -545,13 +545,7 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() -#ifdef CONFIG_PARAVIRT -#include -#else -#define paravirt_enabled() 0 -#define __cpuid native_cpuid - -static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) +static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ @@ -561,24 +555,60 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa } } -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - __asm__("movl %%db" #register ", %0" \ - :"=r" (var)) -#define set_debugreg(value, register) \ - __asm__("movl %0,%%db" #register \ - : /* no output */ \ - :"r" (value)) -#define set_iopl_mask native_set_iopl_mask -#endif /* CONFIG_PARAVIRT */ +static inline unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("movl %%db0, %0" :"=r" (val)); break; + case 1: + asm("movl %%db1, %0" :"=r" (val)); break; + case 2: + asm("movl %%db2, %0" :"=r" (val)); break; + case 3: + asm("movl %%db3, %0" :"=r" (val)); break; + case 6: + asm("movl %%db6, %0" :"=r" (val)); break; + case 7: + asm("movl %%db7, %0" :"=r" (val)); break; + default: + BUG(); + } + return val; +} + +static inline void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("movl %0,%%db0" : /* no output */ :"r" (value)); + break; + case 1: + asm("movl %0,%%db1" : /* no output */ :"r" (value)); + break; + case 2: + asm("movl %0,%%db2" : /* no output */ :"r" (value)); + break; + case 3: + asm("movl %0,%%db3" : /* no output */ :"r" (value)); + break; + case 6: + asm("movl %0,%%db6" : /* no output */ :"r" (value)); + break; + case 7: + asm("movl %0,%%db7" : /* no output */ :"r" (value)); + break; + default: + BUG(); + } +} /* * Set IOPL bits in EFLAGS from given mask */ -static fastcall inline void native_set_iopl_mask(unsigned mask) +static inline void native_set_iopl_mask(unsigned mask) { unsigned int reg; __asm__ __volatile__ ("pushfl;" @@ -591,6 +621,28 @@ static fastcall inline void native_set_iopl_mask(unsigned mask) : "i" (~X86_EFLAGS_IOPL), "r" (mask)); } +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_enabled() 0 +#define __cpuid native_cpuid + +static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) +{ + native_load_esp0(tss, thread); +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) + +#define set_iopl_mask native_set_iopl_mask +#endif /* CONFIG_PARAVIRT */ + /* * Generic CPUID function * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index a6d20d9a1a30..c3a58c08c495 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -88,65 +88,96 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \ #define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) + +static inline void native_clts(void) +{ + asm volatile ("clts"); +} + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} + +static inline void native_write_cr0(unsigned long val) +{ + asm volatile("movl %0,%%cr0": :"r" (val)); +} + +static inline unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); + return val; +} + +static inline void native_write_cr2(unsigned long val) +{ + asm volatile("movl %0,%%cr2": :"r" (val)); +} + +static inline unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); + return val; +} + +static inline void native_write_cr3(unsigned long val) +{ + asm volatile("movl %0,%%cr3": :"r" (val)); +} + +static inline unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); + return val; +} + +static inline unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist */ + asm("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} + +static inline void native_write_cr4(unsigned long val) +{ + asm volatile("movl %0,%%cr4": :"r" (val)); +} + +static inline void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + + #ifdef CONFIG_PARAVIRT #include #else -#define read_cr0() ({ \ - unsigned int __dummy; \ - __asm__ __volatile__( \ - "movl %%cr0,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define write_cr0(x) \ - __asm__ __volatile__("movl %0,%%cr0": :"r" (x)) - -#define read_cr2() ({ \ - unsigned int __dummy; \ - __asm__ __volatile__( \ - "movl %%cr2,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define write_cr2(x) \ - __asm__ __volatile__("movl %0,%%cr2": :"r" (x)) - -#define read_cr3() ({ \ - unsigned int __dummy; \ - __asm__ ( \ - "movl %%cr3,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define write_cr3(x) \ - __asm__ __volatile__("movl %0,%%cr3": :"r" (x)) - -#define read_cr4() ({ \ - unsigned int __dummy; \ - __asm__( \ - "movl %%cr4,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define read_cr4_safe() ({ \ - unsigned int __dummy; \ - /* This could fault if %cr4 does not exist */ \ - __asm__("1: movl %%cr4, %0 \n" \ - "2: \n" \ - ".section __ex_table,\"a\" \n" \ - ".long 1b,2b \n" \ - ".previous \n" \ - : "=r" (__dummy): "0" (0)); \ - __dummy; \ -}) -#define write_cr4(x) \ - __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) - -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory") +#define read_cr0() (native_read_cr0()) +#define write_cr0(x) (native_write_cr0(x)) +#define read_cr2() (native_read_cr2()) +#define write_cr2(x) (native_write_cr2(x)) +#define read_cr3() (native_read_cr3()) +#define write_cr3(x) (native_write_cr3(x)) +#define read_cr4() (native_read_cr4()) +#define read_cr4_safe() (native_read_cr4_safe()) +#define write_cr4(x) (native_write_cr4(x)) +#define wbinvd() (native_wbinvd()) /* Clear the 'TS' bit */ -#define clts() __asm__ __volatile__ ("clts") +#define clts() (native_clts()) + #endif/* CONFIG_PARAVIRT */ /* Set the 'TS' bit */ From d01ad8dd56527be72947b4b9997bb2c05783c3ed Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 089/231] [PATCH] x86: Improve handling of kernel mappings in change_page_attr Fix various broken corner cases in i386 and x86-64 change_page_attr. AK: split off from tighten kernel image access rights Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/pageattr.c | 4 ++-- arch/x86_64/mm/pageattr.c | 16 ++++++++++++---- include/asm-i386/pgtable.h | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 412ebbd8adb0..ea6b6d4a0a2a 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -142,7 +142,7 @@ __change_page_attr(struct page *page, pgprot_t prot) return -EINVAL; kpte_page = virt_to_page(kpte); if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); } else { pgprot_t ref_prot; @@ -158,7 +158,7 @@ __change_page_attr(struct page *page, pgprot_t prot) kpte_page = split; } page_private(kpte_page)++; - } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 76ee90a5abe0..bf4aa8dd4254 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -179,16 +179,24 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; - int err = 0; + int err = 0, kernel_map = 0; int i; + if (address >= __START_KERNEL_map + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { + address = (unsigned long)__va(__pa(address)); + kernel_map = 1; + } + down_write(&init_mm.mmap_sem); for (i = 0; i < numpages; i++, address += PAGE_SIZE) { unsigned long pfn = __pa(address) >> PAGE_SHIFT; - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; + if (!kernel_map || pte_present(pfn_pte(0, prot))) { + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + } /* Handle kernel mapping too which aliases part of the * lowmem */ if ((pfn >= phys_base_pfn) && diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index c3b58d473a55..143ddc42b86f 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -159,6 +159,7 @@ void paging_init(void); extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -166,6 +167,7 @@ extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) From 6fb14755a676282a4e6caa05a08c92db8e45cfff Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 090/231] [PATCH] x86: tighten kernel image page access rights On x86-64, kernel memory freed after init can be entirely unmapped instead of just getting 'poisoned' by overwriting with a debug pattern. On i386 and x86-64 (under CONFIG_DEBUG_RODATA), kernel text and bug table can also be write-protected. Compared to the first version, this one prevents re-creating deleted mappings in the kernel image range on x86-64, if those got removed previously. This, together with the original changes, prevents temporarily having inconsistent mappings when cacheability attributes are being changed on such pages (e.g. from AGP code). While on i386 such duplicate mappings don't exist, the same change is done there, too, both for consistency and because checking pte_present() before using various other pte_XXX functions is a requirement anyway. At once, i386 code gets adjusted to use pte_huge() instead of open coding this. AK: split out cpa() changes Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 4 ++-- arch/i386/mm/init.c | 25 +++++++++++++++++++------ arch/x86_64/kernel/head.S | 1 - arch/x86_64/kernel/vmlinux.lds.S | 5 +++-- arch/x86_64/mm/init.c | 25 ++++++++++++++++--------- include/linux/poison.h | 3 --- 6 files changed, 40 insertions(+), 23 deletions(-) diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6f38f818380b..f4ec72231835 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -61,8 +61,6 @@ SECTIONS __stop___ex_table = .; } - RODATA - BUG_TABLE . = ALIGN(4); @@ -72,6 +70,8 @@ SECTIONS __tracedata_end = .; } + RODATA + /* writeable */ . = ALIGN(4096); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 23be1b0aafa4..bd5ef3718504 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -751,13 +752,25 @@ static int noinline do_test_wp_bit(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() <= 1) +#endif + { + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); + } - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); + start += size; + size = (unsigned long)__end_rodata - start; + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RO); + printk("Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr() requires a global_flush_tlb() call after it. @@ -781,7 +794,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) __free_page(page); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); } void free_initmem(void) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 36aa98a6d15c..fd9fdfdd143e 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -280,7 +280,6 @@ early_idt_ripmsg: .balign PAGE_SIZE ENTRY(stext) -ENTRY(_stext) #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 5176ecf006ee..3bdeb88d28f4 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -29,6 +29,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ *(.bootstrap.text) + _stext = .; /* Then all the functions that are "hot" in profiles, to group them onto the same hugetlb entry */ #include "functionlist" @@ -50,10 +51,10 @@ SECTIONS __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; - RODATA - BUG_TABLE + RODATA + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 69e22d3c9238..e3134bc9a4fc 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -563,21 +564,23 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { struct page *page = pfn_to_page(addr >> PAGE_SHIFT); ClearPageReserved(page); init_page_count(page); memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + if (addr >= __START_KERNEL_map) + change_page_attr_addr(addr, 1, __pgprot(0)); __free_page(page); totalram_pages++; } + if (addr > __START_KERNEL_map) + global_flush_tlb(); } void free_initmem(void) { - memset(__initdata_begin, POISON_FREE_INITDATA, - __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", __pa_symbol(&__init_begin), __pa_symbol(&__init_end)); @@ -587,14 +590,18 @@ void free_initmem(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata)); - unsigned long end = (unsigned long)__va(__pa_symbol(&__end_rodata)); + unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size; - for (; addr < end; addr += PAGE_SIZE) - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() > 1) + start = PFN_ALIGN(__va(__pa_symbol(&_etext))); +#endif + size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start; + change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk ("Write protecting the kernel read-only data: %luk\n", - (__end_rodata - __start_rodata) >> 10); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. diff --git a/include/linux/poison.h b/include/linux/poison.h index 3e628f990fdf..89580b764959 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -26,9 +26,6 @@ /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc -/********** arch/x86_64/mm/init.c **********/ -#define POISON_FREE_INITDATA 0xba - /********** arch/ia64/hp/common/sba_iommu.c **********/ /* * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a From 78eea47ac3e256559d97f0c2434928be39cba524 Mon Sep 17 00:00:00 2001 From: Parag Warudkar Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 091/231] [PATCH] i386: get rid of unused variables Signed-off-by: Parag Warudkar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/apm.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 064bbf2861f4..4d073d390715 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -384,13 +384,6 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -#ifdef CONFIG_APM_RTC_IS_GMT -# define clock_cmos_diff 0 -# define got_clock_diff 1 -#else -static long clock_cmos_diff; -static int got_clock_diff; -#endif static int debug __read_mostly; static int smp __read_mostly; static int apm_disabled = -1; From b4531e863dbd06b5d336afefdb37483b690dea59 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 092/231] [PATCH] i386: Use X86_EFLAGS_IF in irqflags.h. Move X86_EFLAGS_IF et al out to a new header: processor-flags.h, so we can include it from irqflags.h and use it in raw_irqs_disabled_flags(). As a side-effect, we could now use these flags in .S files. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- include/asm-i386/irqflags.h | 3 ++- include/asm-i386/processor-flags.h | 26 ++++++++++++++++++++++++++ include/asm-i386/processor.h | 22 +--------------------- 3 files changed, 29 insertions(+), 22 deletions(-) create mode 100644 include/asm-i386/processor-flags.h diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index c1cdd094938e..eff8585cb741 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -9,6 +9,7 @@ */ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#include #ifndef __ASSEMBLY__ static inline unsigned long native_save_fl(void) @@ -119,7 +120,7 @@ static inline unsigned long __raw_local_irq_save(void) static inline int raw_irqs_disabled_flags(unsigned long flags) { - return !(flags & (1 << 9)); + return !(flags & X86_EFLAGS_IF); } static inline int raw_irqs_disabled(void) diff --git a/include/asm-i386/processor-flags.h b/include/asm-i386/processor-flags.h new file mode 100644 index 000000000000..b4711c222e2b --- /dev/null +++ b/include/asm-i386/processor-flags.h @@ -0,0 +1,26 @@ +#ifndef __ASM_I386_PROCESSOR_FLAGS_H +#define __ASM_I386_PROCESSOR_FLAGS_H +/* Various flags defined: can be included from assembler. */ + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +#endif /* __ASM_I386_PROCESSOR_FLAGS_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 96edfdfe32d1..11838df88603 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -21,6 +21,7 @@ #include #include #include +#include /* flag for disabling the tsc */ extern int tsc_disable; @@ -126,27 +127,6 @@ extern void detect_ht(struct cpuinfo_x86 *c); static inline void detect_ht(struct cpuinfo_x86 *c) {} #endif -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { From b8716890f35dd567a3f8e4dd69c428a8b3f47814 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 093/231] [PATCH] x86-64: Remove unused stext symbol suggested by Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index fd9fdfdd143e..1fab487dee86 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -279,7 +279,6 @@ early_idt_ripmsg: .asciz "RIP %s\n" .balign PAGE_SIZE -ENTRY(stext) #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ From 05f36927eddd83e2840a981ef4d9af754dcb86e9 Mon Sep 17 00:00:00 2001 From: Parag Warudkar Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 094/231] [PATCH] i386: remove the APM_RTC_IS_GMT config option. Signed-off-by: Parag Warudkar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 53d62373a524..c0a3e233fa3f 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1029,19 +1029,6 @@ config APM_DISPLAY_BLANK backlight at all, or it might print a lot of errors to the console, especially if you are using gpm. -config APM_RTC_IS_GMT - bool "RTC stores time in GMT" - depends on APM - help - Say Y here if your RTC (Real Time Clock a.k.a. hardware clock) - stores the time in GMT (Greenwich Mean Time). Say N if your RTC - stores localtime. - - It is in fact recommended to store GMT in your RTC, because then you - don't have to worry about daylight savings time changes. The only - reason not to use GMT in your RTC is if you also run a broken OS - that doesn't understand GMT. - config APM_ALLOW_INTS bool "Allow interrupts during APM BIOS calls" depends on APM From 2bff73830c3df5f575d3bc21bf19df1a10bf7091 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 095/231] [PATCH] x86-64: use lru instead of page->index and page->private for pgd lists management. x86_64 currently simulates a list using the index and private fields of the page struct. Seems that the code was inherited from i386. But x86_64 does not use the slab to allocate pgds and pmds etc. So the lru field is not used by the slab and therefore available. This patch uses standard list operations on page->lru to realize pgd tracking. Signed-off-by: Christoph Lameter Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/mm/fault.c | 5 ++--- include/asm-x86_64/pgalloc.h | 14 +++----------- include/asm-x86_64/pgtable.h | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 6ada7231f3ab..de99dba2c515 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -585,7 +585,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, } DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; +LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { @@ -605,8 +605,7 @@ void vmalloc_sync_all(void) if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); - for (page = pgd_list; page; - page = (struct page *)page->index) { + list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); if (pgd_none(*pgd)) diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h index 31d497171969..8bb564687860 100644 --- a/include/asm-x86_64/pgalloc.h +++ b/include/asm-x86_64/pgalloc.h @@ -44,24 +44,16 @@ static inline void pgd_list_add(pgd_t *pgd) struct page *page = virt_to_page(pgd); spin_lock(&pgd_lock); - page->index = (pgoff_t)pgd_list; - if (pgd_list) - pgd_list->private = (unsigned long)&page->index; - pgd_list = page; - page->private = (unsigned long)&pgd_list; + list_add(&page->lru, &pgd_list); spin_unlock(&pgd_lock); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); + struct page *page = virt_to_page(pgd); spin_lock(&pgd_lock); - next = (struct page *)page->index; - pprev = (struct page **)page->private; - *pprev = next; - if (next) - next->private = (unsigned long)pprev; + list_del(&page->lru); spin_unlock(&pgd_lock); } diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index c1865e38c7b7..599993f6ba84 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -410,7 +410,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) extern spinlock_t pgd_lock; -extern struct page *pgd_list; +extern struct list_head pgd_list; void vmalloc_sync_all(void); extern int kern_addr_valid(unsigned long addr); From ca906e42312781c38b7a9625109fc65b937ca56c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH 096/231] [PATCH] x86: sys_ioperm() prototype cleanup - there's no reason for duplicating the prototype from include/linux/syscalls.h in include/asm-x86_64/unistd.h - every file should #include the headers containing the prototypes for it's global functions Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/ioport.c | 1 + arch/x86_64/kernel/ioport.c | 1 + include/asm-x86_64/unistd.h | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 498e8bc197d5..1b4530e6cd82 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -16,6 +16,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index 745b1f0f494e..387d347b0e07 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -16,6 +16,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index c5f596e71faa..576b29732d3c 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -655,7 +655,6 @@ __SYSCALL(__NR_move_pages, sys_move_pages) #include asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs); -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on); struct sigaction; asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, From bd8559c38ee5be40ce2c57a80fd3c3e978cca267 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 097/231] [PATCH] x86: remove UNEXPECTED_IO_APIC() Many years ago, UNEXPECTED_IO_APIC() contained printk()'s (but nothing more). Now that it's completely empty for years, we can as well remove it. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 30 ------------------------------ arch/x86_64/kernel/io_apic.c | 28 ---------------------------- 2 files changed, 58 deletions(-) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index b3ab8ffebd27..8191583032d0 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1403,10 +1403,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -static inline void UNEXPECTED_IO_APIC(void) -{ -} - void __init print_IO_APIC(void) { int apic, i; @@ -1446,34 +1442,12 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - if (reg_00.bits.ID >= get_physical_broadcast()) - UNEXPECTED_IO_APIC(); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1483,8 +1457,6 @@ void __init print_IO_APIC(void) if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } /* @@ -1496,8 +1468,6 @@ void __init print_IO_APIC(void) reg_03.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - if (reg_03.bits.__reserved_1) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 318d9055cd97..dd14ffa565b1 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -907,10 +907,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -void __init UNEXPECTED_IO_APIC(void) -{ -} - void __apicdebuginit print_IO_APIC(void) { int apic, i; @@ -946,40 +942,16 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) && - (reg_01.bits.entries != 0x03) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); if (reg_01.bits.version >= 0x10) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); From 272a3713bb9e302e0455c894c41180a482d2c8a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 098/231] [PATCH] x86-64: fix vtime() vsyscall There is a tiny probability that the return value from vtime(time_t *t) is Signed-off-by: Andi Kleen different than the value stored in *t Using a temporary variable solves the problem and gives a faster code. 17: 48 85 ff test %rdi,%rdi 1a: 48 8b 05 00 00 00 00 mov 0(%rip),%rax # __vsyscall_gtod_data.wall_time_tv.tv_sec 21: 74 03 je 26 23: 48 89 07 mov %rax,(%rdi) 26: c9 leaveq 27: c3 retq Signed-off-by: Eric Dumazet --- arch/x86_64/kernel/vsyscall.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index d14cbb3e0ebe..ba330f870679 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -156,11 +156,13 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - else if (t) - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; - return __vsyscall_gtod_data.wall_time_tv.tv_sec; + result = __vsyscall_gtod_data.wall_time_tv.tv_sec; + if (t) + *t = result; + return result; } /* Fast way to get current CPU and node. From c8118c6c07f2edfd697aaa0b93e08c3b65a5a675 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 099/231] [PATCH] x86-64: vsyscall_gtod_data diet and vgettimeofday() fix Current vsyscall_gtod_data is large (3 or 4 cache lines dirtied at timer interrupt). We can shrink it to exactly 64 bytes (1 cache line on AMD64) Instead of copying a whole struct clocksource, we copy only needed fields. I deleted an unused field : offset_base This patch fixes one oddity in vgettimeofday(): It can returns a timeval with tv_usec = 1000000. Maybe not a bug, but why not doing the right thing ? Signed-off-by: Eric Dumazet Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vsyscall.c | 53 ++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index ba330f870679..dc32cef96195 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -51,13 +51,28 @@ asm("" : "=r" (v) : "0" (x)); \ ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) +/* + * vsyscall_gtod_data contains data that is : + * - readonly from vsyscalls + * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) + * Try to keep this structure as small as possible to avoid cache line ping pongs + */ struct vsyscall_gtod_data_t { - seqlock_t lock; - int sysctl_enabled; - struct timeval wall_time_tv; + seqlock_t lock; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + + int sysctl_enabled; struct timezone sys_tz; - cycle_t offset_base; - struct clocksource clock; + struct { /* extract of a clocksource struct */ + cycle_t (*vread)(void); + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; }; int __vgetcpu_mode __section_vgetcpu_mode; @@ -73,9 +88,13 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock = *clock; - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -110,7 +129,8 @@ static __always_inline long time_syscall(long *t) static __always_inline void do_vgettimeofday(struct timeval * tv) { cycle_t now, base, mask, cycle_delta; - unsigned long seq, mult, shift, nsec_delta; + unsigned seq; + unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -126,21 +146,20 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) mult = __vsyscall_gtod_data.clock.mult; shift = __vsyscall_gtod_data.clock.shift; - *tv = __vsyscall_gtod_data.wall_time_tv; - + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + nsec = __vsyscall_gtod_data.wall_time_nsec; } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; /* convert to nsecs: */ - nsec_delta = (cycle_delta * mult) >> shift; + nsec += (cycle_delta * mult) >> shift; - /* convert to usecs and add to timespec: */ - tv->tv_usec += nsec_delta / NSEC_PER_USEC; - while (tv->tv_usec > USEC_PER_SEC) { + while (nsec >= NSEC_PER_SEC) { tv->tv_sec += 1; - tv->tv_usec -= USEC_PER_SEC; + nsec -= NSEC_PER_SEC; } + tv->tv_usec = nsec / NSEC_PER_USEC; } int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) @@ -159,7 +178,7 @@ time_t __vsyscall(1) vtime(time_t *t) time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - result = __vsyscall_gtod_data.wall_time_tv.tv_sec; + result = __vsyscall_gtod_data.wall_time_sec; if (t) *t = result; return result; From ae32b1297a77c23fd0badd642bb685062f7a37f8 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 100/231] [PATCH] x86-64: Inhibit machine from asserting an NMI when doing Alt-SysRq-M operation. This patch touches the NMI watchdog every MAX_ORDER_NR_PAGES to inhibit the machine from triggering an NMI while the CPUs are locked. This situation is happening on boxes with more than 64CPUs and 128GB of RAM when Alt-SysRq-m is performed. It has been succesfully tested for regression on uni, 2, 4, 8 32, and 64 CPU boxes with various memory configuration. Signed-off-by: Andi Kleen --- arch/x86_64/mm/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index e3134bc9a4fc..282b0a8f00ad 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,11 @@ void show_mem(void) for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* this loop can take a while with 256 GB and 4k pages + so update the NMI watchdog */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + touch_nmi_watchdog(); + } page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) From dd4ecfc2b10d962d70ff59f8994a29aa048700ec Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 101/231] [PATCH] x86-64: adjust EDID retrieval commit 5e518d7672dea4cd7c60871e40d0490c52f01d13 did the same change to i386's variant. With this change, i386's and x86-64's versions are identical, raising the question whether the x86-64 one should go (just like there's only one instance of edd.S). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/boot/video.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S index 6090516c9c7f..8143c9516cb4 100644 --- a/arch/x86_64/boot/video.S +++ b/arch/x86_64/boot/video.S @@ -1977,7 +1977,7 @@ store_edid: movw $0x4f15, %ax # do VBE/DDC movw $0x01, %bx movw $0x00, %cx - movw $0x01, %dx + movw $0x00, %dx movw $0x140, %di int $0x10 From 141f9cfe0a948c8fe26e5669364ee62c03ea42e8 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 102/231] [PATCH] x86-64: Fix "Section mismatch" compile warning Fix "Section mismatch" warnings in arch/x86_64/kernel/time.c Signed-off-by: Bernhard Walle Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 91c9066a380d..0652e173813b 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -328,7 +328,7 @@ static unsigned int __init pit_calibrate_tsc(void) #define PIT_MODE 0x43 #define PIT_CH0 0x40 -static void __init __pit_init(int val, u8 mode) +static void __pit_init(int val, u8 mode) { unsigned long flags; @@ -344,12 +344,12 @@ void __init pit_init(void) __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ } -void __init pit_stop_interrupt(void) +void pit_stop_interrupt(void) { __pit_init(0, 0x30); /* mode 0 */ } -void __init stop_timer_interrupt(void) +void stop_timer_interrupt(void) { char *name; if (hpet_address) { From 4fbb5968810b237e81977f131986b9efd5245368 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 103/231] [PATCH] i386: cleanup GDT Access Now we have an explicit per-cpu GDT variable, we don't need to keep the descriptors around to use them to find the GDT: expose cpu_gdt directly. We could go further and make load_gdt() pack the descriptor for us, or even assume it means "load the current cpu's GDT" which is what it always does. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 4 +--- arch/i386/kernel/efi.c | 16 ++++++++-------- arch/i386/kernel/entry.S | 3 +-- arch/i386/kernel/smpboot.c | 12 ++++++------ arch/i386/kernel/traps.c | 4 +--- include/asm-i386/desc.h | 7 ++----- 6 files changed, 19 insertions(+), 27 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 2a26956fce42..5faf675aab4b 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -22,9 +22,6 @@ #include "cpu.h" -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); - DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = { [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, @@ -52,6 +49,7 @@ DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = { [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, [GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */ }; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt); DEFINE_PER_CPU(struct i386_pda, _cpu_pda); EXPORT_PER_CPU_SYMBOL(_cpu_pda); diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 8f9c624ace6f..dd9e7faafa7c 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) { unsigned long cr4; unsigned long temp; - struct Xgt_desc_struct *cpu_gdt_descr; + struct Xgt_desc_struct gdt_descr; spin_lock(&efi_rt_lock); local_irq_save(efi_rt_eflags); - cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); - /* * If I don't have PSE, I should just duplicate two entries in page * directory. If I have PSE, I just need to duplicate one entry in @@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) */ local_flush_tlb(); - cpu_gdt_descr->address = __pa(cpu_gdt_descr->address); - load_gdt(cpu_gdt_descr); + gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); } static void efi_call_phys_epilog(void) __releases(efi_rt_lock) { unsigned long cr4; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); + struct Xgt_desc_struct gdt_descr; - cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); - load_gdt(cpu_gdt_descr); + gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); cr4 = read_cr4(); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 922cc38dc405..c61c6b67e856 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -561,8 +561,7 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ movl %fs:PDA_cpu, %ebx; \ - PER_CPU(cpu_gdt_descr, %ebx); \ - movl GDS_address(%ebx), %ebx; \ + PER_CPU(cpu_gdt, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ pushl $__KERNEL_DS; \ diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 954245f6d307..b0ad04d9ef21 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -786,13 +786,9 @@ static inline struct task_struct * alloc_idle_task(int cpu) secondary which will soon come up. */ static __cpuinit void init_gdt(int cpu, struct task_struct *idle) { - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt = per_cpu(cpu_gdt, cpu); + struct desc_struct *gdt = get_cpu_gdt_table(cpu); struct i386_pda *pda = &per_cpu(_cpu_pda, cpu); - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_gdt_descr->size = GDT_SIZE - 1; - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, (u32 *)&gdt[GDT_ENTRY_PDA].b, (unsigned long)pda, sizeof(*pda) - 1, @@ -1187,7 +1183,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) * it's on the real one. */ static inline void switch_to_new_gdt(void) { - load_gdt(&per_cpu(cpu_gdt_descr, smp_processor_id())); + struct Xgt_desc_struct gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 58dfecc8e36c..8722444cacaa 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1030,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, fastcall unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - int cpu = smp_processor_id(); - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + struct desc_struct *gdt = __get_cpu_var(cpu_gdt); unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 13f701ea9a8f..4a974064e928 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -18,16 +18,13 @@ struct Xgt_desc_struct { unsigned short pad; } __attribute__ ((packed)); -extern struct Xgt_desc_struct idt_descr; -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]); -extern struct Xgt_desc_struct early_gdt_descr; - static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { - return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; + return per_cpu(cpu_gdt, cpu); } +extern struct Xgt_desc_struct idt_descr; extern struct desc_struct idt_table[]; extern void set_intr_gate(unsigned int irq, void * addr); From 01a2f435564b4baab61328b4018d36464468f57b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 104/231] [PATCH] i386: Add smp_ops interface Add a smp_ops interface. This abstracts the API defined by for use within arch/i386. The primary intent is that it be used by a paravirtualizing hypervisor to implement SMP, but it could also be used by non-APIC-using sub-architectures. This is related to CONFIG_PARAVIRT, but is implemented unconditionally since it is simpler that way and not a highly performance-sensitive interface. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Ingo Molnar Cc: James Bottomley --- arch/i386/kernel/smp.c | 21 +++++++++++---- arch/i386/kernel/smpboot.c | 8 +++--- include/asm-i386/smp.h | 53 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index b90bebeb1c79..fe38b49a1223 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -483,7 +483,7 @@ void flush_tlb_all(void) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ -void smp_send_reschedule(int cpu) +void native_smp_send_reschedule(int cpu) { WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); @@ -560,9 +560,9 @@ static void __smp_call_function(void (*func) (void *info), void *info, * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +int native_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; cpumask_t allbutself; @@ -681,7 +681,7 @@ static void stop_this_cpu (void * dummy) * this function calls the 'stop' function on all other CPUs in the system. */ -void smp_send_stop(void) +void native_smp_send_stop(void) { /* Don't deadlock on the call lock in panic */ int nolock = !spin_trylock(&call_lock); @@ -757,3 +757,14 @@ int safe_smp_processor_id(void) return cpuid >= 0 ? cpuid : 0; } + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .cpu_up = native_cpu_up, + .smp_cpus_done = native_smp_cpus_done, + + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, + .smp_call_function_mask = native_smp_call_function_mask, +}; diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index b0ad04d9ef21..1c3ad9b406ca 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1171,7 +1171,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ -void __init smp_prepare_cpus(unsigned int max_cpus) +void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_commenced_mask = cpumask_of_cpu(0); cpu_callin_map = cpumask_of_cpu(0); @@ -1191,7 +1191,7 @@ static inline void switch_to_new_gdt(void) asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); } -void __init smp_prepare_boot_cpu(void) +void __init native_smp_prepare_boot_cpu(void) { unsigned int cpu = smp_processor_id(); @@ -1292,7 +1292,7 @@ void __cpu_die(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu) { unsigned long flags; #ifdef CONFIG_HOTPLUG_CPU @@ -1337,7 +1337,7 @@ int __cpuinit __cpu_up(unsigned int cpu) return 0; } -void __init smp_cpus_done(unsigned int max_cpus) +void __init native_smp_cpus_done(unsigned int max_cpus) { #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 9cab1531c613..2d083cb4ca93 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -49,6 +49,59 @@ extern void cpu_exit_clear(void); extern void cpu_uninit(void); #endif +struct smp_ops +{ + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + int (*cpu_up)(unsigned cpu); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*smp_send_stop)(void); + void (*smp_send_reschedule)(int cpu); + int (*smp_call_function_mask)(cpumask_t mask, + void (*func)(void *info), void *info, + int wait); +}; + +extern struct smp_ops smp_ops; + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline void smp_send_stop(void) +{ + smp_ops.smp_send_stop(); +} +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} +static inline int smp_call_function_mask(cpumask_t mask, + void (*func) (void *info), void *info, + int wait) +{ + return smp_ops.smp_call_function_mask(mask, func, info, wait); +} + +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); +void native_smp_cpus_done(unsigned int max_cpus); + #ifndef CONFIG_PARAVIRT #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ do { } while (0) From 07f3331c6bfd27a06dfb0ca9fa4f06dec6606876 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 105/231] [PATCH] i386: Add machine_ops interface to abstract halting and rebooting machine_ops is an interface for the machine_* functions defined in . This is intended to allow hypervisors to intercept the reboot process, but it could be used to implement other x86 subarchtecture reboots. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/apm.c | 3 +-- arch/i386/kernel/reboot.c | 43 ++++++++++++++++++++++++++++++++++----- include/asm-i386/reboot.h | 20 ++++++++++++++++++ 3 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 include/asm-i386/reboot.h diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 4d073d390715..367ff1d930cb 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -233,11 +233,10 @@ #include #include #include +#include #include "io_ports.h" -extern void machine_real_restart(unsigned char *, int); - #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); #endif diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 8b5ff6e15412..14b4de2882be 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -18,6 +18,7 @@ #include #include "mach_reboot.h" #include +#include /* * Power off function, if any @@ -280,7 +281,7 @@ void machine_real_restart(unsigned char *code, int length) EXPORT_SYMBOL(machine_real_restart); #endif -void machine_shutdown(void) +static void native_machine_shutdown(void) { #ifdef CONFIG_SMP int reboot_cpu_id; @@ -320,7 +321,7 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } -void machine_emergency_restart(void) +static void native_machine_emergency_restart(void) { if (!reboot_thru_bios) { if (efi_enabled) { @@ -344,17 +345,17 @@ void machine_emergency_restart(void) machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); } -void machine_restart(char * __unused) +static void native_machine_restart(char * __unused) { machine_shutdown(); machine_emergency_restart(); } -void machine_halt(void) +static void native_machine_halt(void) { } -void machine_power_off(void) +static void native_machine_power_off(void) { if (pm_power_off) { machine_shutdown(); @@ -363,3 +364,35 @@ void machine_power_off(void) } +struct machine_ops machine_ops = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + machine_ops.emergency_restart(); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} diff --git a/include/asm-i386/reboot.h b/include/asm-i386/reboot.h new file mode 100644 index 000000000000..e9e3ffc22c07 --- /dev/null +++ b/include/asm-i386/reboot.h @@ -0,0 +1,20 @@ +#ifndef _ASM_REBOOT_H +#define _ASM_REBOOT_H + +struct pt_regs; + +struct machine_ops +{ + void (*restart)(char *cmd); + void (*halt)(void); + void (*power_off)(void); + void (*shutdown)(void); + void (*crash_shutdown)(struct pt_regs *); + void (*emergency_restart)(void); +}; + +extern struct machine_ops machine_ops; + +void machine_real_restart(unsigned char *code, int length); + +#endif /* _ASM_REBOOT_H */ From 43c3ab308d0e8e6ad9a5fb24020eb59cd423a3db Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 106/231] [PATCH] x86-64: Change my email address Signed-off-by: Andi Kleen --- CREDITS | 5 +++-- MAINTAINERS | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CREDITS b/CREDITS index d7140309e06d..c5f819bacda3 100644 --- a/CREDITS +++ b/CREDITS @@ -1745,8 +1745,9 @@ S: D-64295 S: Germany N: Andi Kleen -E: ak@muc.de -D: network hacker, syncookies +E: andi@firstfloor.org +U: http://www.halobates.de +D: network, x86, NUMA, various hacks S: Schwalbenstr. 96 S: 85551 Ottobrunn S: Germany diff --git a/MAINTAINERS b/MAINTAINERS index af1c7926c153..a57589e2d38e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1605,7 +1605,7 @@ S: Maintained HPET: x86_64 P: Andi Kleen and Vojtech Pavlik -M: ak@muc.de and vojtech@suse.cz +M: andi@firstfloor.org and vojtech@suse.cz S: Maintained HPET: ACPI hpet.c From bbba11c35baaad3f70f32e185a2c1d40d7901fe9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 107/231] [PATCH] i386: Remove unneeded externs in nmi.c All were already in some header Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index aef22c881a0d..2dec1b105737 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -75,9 +75,6 @@ static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -extern void show_registers(struct pt_regs *regs); -extern int unknown_nmi_panic; - /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { From b00742d399513a4100c24cc2accefdc1bb1e0b15 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 108/231] [PATCH] x86-64: Account for module percpu space separately from kernel percpu Rather than using a single constant PERCPU_ENOUGH_ROOM, compute it as the sum of kernel_percpu + PERCPU_MODULE_RESERVE. This is now common to all architectures; if an architecture wants to set PERCPU_ENOUGH_ROOM to something special, then it may do so (ia64 is the only one which does). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Eric W. Biederman Cc: Andi Kleen --- include/asm-alpha/percpu.h | 14 -------------- include/asm-sparc64/percpu.h | 10 ---------- include/asm-x86_64/percpu.h | 10 ---------- include/linux/percpu.h | 9 ++++++++- kernel/module.c | 2 +- 5 files changed, 9 insertions(+), 36 deletions(-) diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 651ebb141b24..48348fe34c19 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -1,20 +1,6 @@ #ifndef __ALPHA_PERCPU_H #define __ALPHA_PERCPU_H -/* - * Increase the per cpu area for Alpha so that - * modules using percpu area can load. - */ -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) - #include #endif /* __ALPHA_PERCPU_H */ diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h index 0d3df76aa47f..ced8cbde046d 100644 --- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -5,16 +5,6 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) - extern void setup_per_cpu_areas(void); extern unsigned long __per_cpu_base; diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 5ed0ef340842..c6fbb67eac90 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -11,16 +11,6 @@ #include -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) - #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) #define __my_cpu_offset() read_pda(data_offset) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 600e3d387ffc..b72be2f79e6a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -11,9 +11,16 @@ /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ #ifndef PERCPU_ENOUGH_ROOM -#define PERCPU_ENOUGH_ROOM 32768 +#ifdef CONFIG_MODULES +#define PERCPU_MODULE_RESERVE 8192 +#else +#define PERCPU_MODULE_RESERVE 0 #endif +#define PERCPU_ENOUGH_ROOM \ + (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE) +#endif /* PERCPU_ENOUGH_ROOM */ + /* * Must be an lvalue. Since @var must be a simple identifier, * we force a syntax error here if it isn't. diff --git a/kernel/module.c b/kernel/module.c index 9da5af668a20..cf49ca25fcce 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -430,7 +430,7 @@ static int percpu_modinit(void) pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, GFP_KERNEL); /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); + pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); /* Free room. */ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; if (pcpu_size[1] < 0) { From b92e9fac400d4ae5bc7a75c568e9844ec53ea329 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 109/231] [PATCH] x86: fix amd64-agp aperture validation Under CONFIG_DISCONTIGMEM, assuming that a !pfn_valid() implies all subsequent pfn-s are also invalid is wrong. Thus replace this by explicitly checking against the E820 map. AK: make e820 on x86-64 not initdata Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Acked-by: Mark Langsdorf --- arch/i386/kernel/e820.c | 20 ++++++++++++++++++++ arch/x86_64/kernel/e820.c | 5 +++-- drivers/char/agp/amd64-agp.c | 13 ++++--------- include/asm-i386/e820.h | 1 + 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 31f4670ef740..829beec9247e 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -825,6 +825,26 @@ void __init limit_regions(unsigned long long size) print_memory_map("limit_regions endfunc"); } +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + /* * This function checks if the entire range is mapped with type. * diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a490fabfcf47..be8965427a93 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -25,7 +25,7 @@ #include #include -struct e820map e820 __initdata; +struct e820map e820; /* * PFN of last memory page. @@ -98,7 +98,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) * This function checks if any part of the range is mapped * with type. */ -int __meminit +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) { int i; @@ -112,6 +112,7 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type) } return 0; } +EXPORT_SYMBOL_GPL(e820_any_mapped); /* * This function checks if the entire range is mapped with type. diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 485720486d60..c9f0f250d78f 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -14,6 +14,7 @@ #include #include #include /* PAGE_SIZE */ +#include #include #include "agp.h" @@ -259,7 +260,6 @@ static const struct agp_bridge_driver amd_8151_driver = { /* Some basic sanity checks for the aperture. */ static int __devinit aperture_valid(u64 aper, u32 size) { - u32 pfn, c; if (aper == 0) { printk(KERN_ERR PFX "No aperture\n"); return 0; @@ -272,14 +272,9 @@ static int __devinit aperture_valid(u64 aper, u32 size) printk(KERN_ERR PFX "Aperture out of bounds\n"); return 0; } - pfn = aper >> PAGE_SHIFT; - for (c = 0; c < size/PAGE_SIZE; c++) { - if (!pfn_valid(pfn + c)) - break; - if (!PageReserved(pfn_to_page(pfn + c))) { - printk(KERN_ERR PFX "Aperture pointing to RAM\n"); - return 0; - } + if (e820_any_mapped(aper, aper + size, E820_RAM)) { + printk(KERN_ERR PFX "Aperture pointing to RAM\n"); + return 0; } /* Request the Aperture. This catches cases when someone else diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index c5b8fc6109d6..096a2a8eb1da 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -38,6 +38,7 @@ extern struct e820map e820; extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); +extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern void find_max_pfn(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void e820_register_memory(void); From 5d02d7ae73ac9446f20bbf604b04a74637178b35 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH 110/231] [PATCH] x86-64: Use X86_EFLAGS_IF in x86-64/irqflags.h. As per i386 patch: move X86_EFLAGS_IF et al out to a new header: processor-flags.h, so we can include it from irqflags.h and use it in raw_irqs_disabled_flags(). As a side-effect, we could now use these flags in .S files. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- include/asm-x86_64/irqflags.h | 9 +++++---- include/asm-x86_64/processor-flags.h | 26 ++++++++++++++++++++++++++ include/asm-x86_64/processor.h | 22 +--------------------- 3 files changed, 32 insertions(+), 25 deletions(-) create mode 100644 include/asm-x86_64/processor-flags.h diff --git a/include/asm-x86_64/irqflags.h b/include/asm-x86_64/irqflags.h index cce6937e87c0..86e70fe23659 100644 --- a/include/asm-x86_64/irqflags.h +++ b/include/asm-x86_64/irqflags.h @@ -9,6 +9,7 @@ */ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#include #ifndef __ASSEMBLY__ /* @@ -53,19 +54,19 @@ static inline void raw_local_irq_disable(void) { unsigned long flags = __raw_local_save_flags(); - raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); + raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); } static inline void raw_local_irq_enable(void) { unsigned long flags = __raw_local_save_flags(); - raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); + raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } static inline int raw_irqs_disabled_flags(unsigned long flags) { - return !(flags & (1<<9)) || (flags & (1 << 18)); + return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC); } #else /* CONFIG_X86_VSMP */ @@ -82,7 +83,7 @@ static inline void raw_local_irq_enable(void) static inline int raw_irqs_disabled_flags(unsigned long flags) { - return !(flags & (1 << 9)); + return !(flags & X86_EFLAGS_IF); } #endif diff --git a/include/asm-x86_64/processor-flags.h b/include/asm-x86_64/processor-flags.h new file mode 100644 index 000000000000..806112fb7985 --- /dev/null +++ b/include/asm-x86_64/processor-flags.h @@ -0,0 +1,26 @@ +#ifndef __ASM_X86_64_PROCESSOR_FLAGS_H +#define __ASM_X86_64_PROCESSOR_FLAGS_H +/* Various flags defined: can be included from assembler. */ + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +#endif /* __ASM_X86_64_PROCESSOR_FLAGS_H */ diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 76552d72804c..2c1497de0b09 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include #define TF_MASK 0x00000100 #define IF_MASK 0x00000200 @@ -102,27 +103,6 @@ extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - /* * Intel CPU features in CR4 */ From bbf30a1650be396b5467f769f4fbee715f16ec36 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 111/231] [PATCH] x86-64: fix arithmetic in comment The xmm space on x86_64 is 256 bytes. Signed-off-by: Avi Kivity Signed-off-by: Andi Kleen --- include/asm-x86_64/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 2c1497de0b09..6a117349a5de 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -181,7 +181,7 @@ struct i387_fxsave_struct { u32 mxcsr; u32 mxcsr_mask; u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ u32 padding[24]; } __attribute__ ((aligned (16))); From 0d08e0d3a97cce22ebf80b54785e00d9b94e1add Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 112/231] [PATCH] x86-64: Fix vmalloc_32 to really allocate <4GB on 64bit platforms Ugly ifdef, but should handle all 64bit platforms that have suitable zones. On some like Altix it's probably impossible without IOMMU use to get memory <4GB this way, but they have to live with that. Signed-off-by: Andi Kleen --- mm/vmalloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9eef486da909..cb5aabda7046 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -431,7 +431,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, - (gfp_mask & ~(__GFP_HIGHMEM | __GFP_ZERO)), + (gfp_mask & GFP_LEVEL_MASK), node); } area->pages = pages; @@ -577,6 +577,14 @@ void *vmalloc_exec(unsigned long size) return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); } +#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) +#define GFP_VMALLOC32 GFP_DMA32 +#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +#define GFP_VMALLOC32 GFP_DMA +#else +#define GFP_VMALLOC32 GFP_KERNEL +#endif + /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size @@ -586,7 +594,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); } EXPORT_SYMBOL(vmalloc_32); @@ -602,7 +610,7 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); if (ret) { write_lock(&vmlist_lock); area = __find_vm_area(ret); From 1353ebb4b48151e3810d9a60449edd43a90ea3c3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 113/231] [PATCH] i386: Clean up asm-i386/bugs.h Most of asm-i386/bugs.h is code which should be in a C file, so put it there. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Linus Torvalds --- arch/i386/kernel/cpu/Makefile | 2 +- arch/i386/kernel/cpu/bugs.c | 191 ++++++++++++++++++++++++++++++++ include/asm-i386/alternative.h | 1 + include/asm-i386/bugs.h | 194 +-------------------------------- 4 files changed, 197 insertions(+), 191 deletions(-) create mode 100644 arch/i386/kernel/cpu/bugs.c diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 010aecfffbc1..5fb1a7560438 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -2,7 +2,7 @@ # Makefile for x86-compatible CPU details and quirks # -obj-y := common.o proc.o +obj-y := common.o proc.o bugs.o obj-y += amd.o obj-y += cyrix.o diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c new file mode 100644 index 000000000000..54428a2500f3 --- /dev/null +++ b/arch/i386/kernel/cpu/bugs.c @@ -0,0 +1,191 @@ +/* + * arch/i386/cpu/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), + * + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). + */ +#include +#include +#include +#include +#include +#include +#include + +static int __init no_halt(char *s) +{ + boot_cpu_data.hlt_works_ok = 0; + return 1; +} + +__setup("no-hlt", no_halt); + +static int __init mca_pentium(char *s) +{ + mca_pentium_flag = 1; + return 1; +} + +__setup("mca-pentium", mca_pentium); + +static int __init no_387(char *s) +{ + boot_cpu_data.hard_math = 0; + write_cr0(0xE | read_cr0()); + return 1; +} + +__setup("no387", no_387); + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + if (!boot_cpu_data.hard_math) { +#ifndef CONFIG_MATH_EMULATION + printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); + printk(KERN_EMERG "Giving up.\n"); + for (;;) ; +#endif + return; + } + +/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ + /* Test for the divl bug.. */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&boot_cpu_data.fdiv_bug) + : "m" (*&x), "m" (*&y)); + if (boot_cpu_data.fdiv_bug) + printk("Hmm, FPU with FDIV bug.\n"); +} + +static void __init check_hlt(void) +{ + if (paravirt_enabled()) + return; + + printk(KERN_INFO "Checking 'hlt' instruction... "); + if (!boot_cpu_data.hlt_works_ok) { + printk("disabled\n"); + return; + } + halt(); + halt(); + halt(); + halt(); + printk("OK.\n"); +} + +/* + * Most 386 processors have a bug where a POPAD can lock the + * machine even from user space. + */ + +static void __init check_popad(void) +{ +#ifndef CONFIG_X86_POPAD_OK + int res, inp = (int) &res; + + printk(KERN_INFO "Checking for popad bug... "); + __asm__ __volatile__( + "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " + : "=&a" (res) + : "d" (inp) + : "ecx", "edi" ); + /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ + if (res != 12345678) printk( "Buggy.\n" ); + else printk( "OK.\n" ); +#endif +} + +/* + * Check whether we are able to run this kernel safely on SMP. + * + * - In order to run on a i386, we need to be compiled for i386 + * (for due to lack of "invlpg" and working WP on a i386) + * - In order to run on anything without a TSC, we need to be + * compiled for a i486. + * - In order to support the local APIC on a buggy Pentium machine, + * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, + * which happens implicitly if compiled for a Pentium or lower + * (unless an advanced selection of CPU features is used) as an + * otherwise config implies a properly working local APIC without + * the need to do extra reads from the APIC. +*/ + +static void __init check_config(void) +{ +/* + * We'd better not be a i386 if we're configured to use some + * i486+ only features! (WP works in supervisor mode and the + * new "invlpg" and "bswap" instructions) + */ +#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) + if (boot_cpu_data.x86 == 3) + panic("Kernel requires i486+ for 'invlpg' and other features"); +#endif + +/* + * If we configured ourselves for a TSC, we'd better have one! + */ +#ifdef CONFIG_X86_TSC + if (!cpu_has_tsc && !tsc_disable) + panic("Kernel compiled for Pentium+, requires TSC feature!"); +#endif + +/* + * If we were told we had a good local APIC, check for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL + && cpu_has_apic + && boot_cpu_data.x86 == 5 + && boot_cpu_data.x86_model == 2 + && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) + panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); +#endif +} + + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#ifndef CONFIG_SMP + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + check_config(); + check_fpu(); + check_hlt(); + check_popad(); + init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + alternative_instructions(); +} diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index b8fa9557c532..dbc1a29284f3 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -16,6 +16,7 @@ struct alt_instr { u8 pad; }; +extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); struct module; diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index c90c7c499302..df539b390448 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -1,198 +1,12 @@ -/* - * include/asm-i386/bugs.h - * - * Copyright (C) 1994 Linus Torvalds - * - * Cyrix stuff, June 1998 by: - * - Rafael R. Reilova (moved everything from head.S), - * - * - Channing Corn (tests & fixes), - * - Andrew D. Balsa (code cleanup). - */ - /* * This is included by init/main.c to check for architecture-dependent bugs. * * Needs: * void check_bugs(void); */ +#ifndef _ASM_I386_BUG_H +#define _ASM_I386_BUG_H -#include -#include -#include -#include -#include +extern void __init check_bugs(void); -static int __init no_halt(char *s) -{ - boot_cpu_data.hlt_works_ok = 0; - return 1; -} - -__setup("no-hlt", no_halt); - -static int __init mca_pentium(char *s) -{ - mca_pentium_flag = 1; - return 1; -} - -__setup("mca-pentium", mca_pentium); - -static int __init no_387(char *s) -{ - boot_cpu_data.hard_math = 0; - write_cr0(0xE | read_cr0()); - return 1; -} - -__setup("no387", no_387); - -static double __initdata x = 4195835.0; -static double __initdata y = 3145727.0; - -/* - * This used to check for exceptions.. - * However, it turns out that to support that, - * the XMM trap handlers basically had to - * be buggy. So let's have a correct XMM trap - * handler, and forget about printing out - * some status at boot. - * - * We should really only care about bugs here - * anyway. Not features. - */ -static void __init check_fpu(void) -{ - if (!boot_cpu_data.hard_math) { -#ifndef CONFIG_MATH_EMULATION - printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); - printk(KERN_EMERG "Giving up.\n"); - for (;;) ; -#endif - return; - } - -/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ - /* Test for the divl bug.. */ - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) - : "m" (*&x), "m" (*&y)); - if (boot_cpu_data.fdiv_bug) - printk("Hmm, FPU with FDIV bug.\n"); -} - -static void __init check_hlt(void) -{ - if (paravirt_enabled()) - return; - - printk(KERN_INFO "Checking 'hlt' instruction... "); - if (!boot_cpu_data.hlt_works_ok) { - printk("disabled\n"); - return; - } - halt(); - halt(); - halt(); - halt(); - printk("OK.\n"); -} - -/* - * Most 386 processors have a bug where a POPAD can lock the - * machine even from user space. - */ - -static void __init check_popad(void) -{ -#ifndef CONFIG_X86_POPAD_OK - int res, inp = (int) &res; - - printk(KERN_INFO "Checking for popad bug... "); - __asm__ __volatile__( - "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " - : "=&a" (res) - : "d" (inp) - : "ecx", "edi" ); - /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ - if (res != 12345678) printk( "Buggy.\n" ); - else printk( "OK.\n" ); -#endif -} - -/* - * Check whether we are able to run this kernel safely on SMP. - * - * - In order to run on a i386, we need to be compiled for i386 - * (for due to lack of "invlpg" and working WP on a i386) - * - In order to run on anything without a TSC, we need to be - * compiled for a i486. - * - In order to support the local APIC on a buggy Pentium machine, - * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, - * which happens implicitly if compiled for a Pentium or lower - * (unless an advanced selection of CPU features is used) as an - * otherwise config implies a properly working local APIC without - * the need to do extra reads from the APIC. -*/ - -static void __init check_config(void) -{ -/* - * We'd better not be a i386 if we're configured to use some - * i486+ only features! (WP works in supervisor mode and the - * new "invlpg" and "bswap" instructions) - */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) - if (boot_cpu_data.x86 == 3) - panic("Kernel requires i486+ for 'invlpg' and other features"); -#endif - -/* - * If we configured ourselves for a TSC, we'd better have one! - */ -#ifdef CONFIG_X86_TSC - if (!cpu_has_tsc && !tsc_disable) - panic("Kernel compiled for Pentium+, requires TSC feature!"); -#endif - -/* - * If we were told we had a good local APIC, check for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL - && cpu_has_apic - && boot_cpu_data.x86 == 5 - && boot_cpu_data.x86_model == 2 - && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) - panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif -} - -extern void alternative_instructions(void); - -static void __init check_bugs(void) -{ - identify_cpu(&boot_cpu_data); -#ifndef CONFIG_SMP - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - check_config(); - check_fpu(); - check_hlt(); - check_popad(); - init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); - alternative_instructions(); -} +#endif /* _ASM_I386_BUG_H */ From a6c4e076ee4c1ea670e4faa55814e63dd08e3f29 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 114/231] [PATCH] i386: clean up identify_cpu identify_cpu() is used to identify both the boot CPU and secondary CPUs, but it performs some actions which only apply to the boot CPU. Those functions are therefore really __init functions, but because they're called by identify_cpu(), they must be marked __cpuinit. This patch splits identify_cpu() into identify_boot_cpu() and identify_secondary_cpu(), and calls the appropriate init functions from each. Also, identify_boot_cpu() and all the functions it dominates are marked __init. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/common.c | 21 ++++++++++++++------- arch/i386/kernel/smpboot.c | 2 +- arch/i386/kernel/sysenter.c | 2 +- include/asm-i386/processor.h | 3 ++- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 5faf675aab4b..58128585ae60 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -394,7 +394,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -505,15 +505,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_init(c); +} - if (c == &boot_cpu_data) - sysenter_setup(); +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + sysenter_setup(); enable_sep_cpu(); + mtrr_bp_init(); +} - if (c == &boot_cpu_data) - mtrr_bp_init(); - else - mtrr_ap_init(); +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + enable_sep_cpu(); + mtrr_ap_init(); } #ifdef CONFIG_X86_HT diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 1c3ad9b406ca..61e2842add36 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -155,7 +155,7 @@ static void __cpuinit smp_store_cpu_info(int id) *c = boot_cpu_data; if (id!=0) - identify_cpu(c); + identify_secondary_cpu(c); /* * Mask B, Pentium, but not Pentium MMX */ diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 168f8147d3b4..13ca54a85a1c 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -72,7 +72,7 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; -int __cpuinit sysenter_setup(void) +int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); syscall_pages[0] = virt_to_page(syscall_page); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 11838df88603..9d895cc2f312 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -116,7 +116,8 @@ extern char ignore_fpu_irq; void __init cpu_detect(struct cpuinfo_x86 *c); -extern void identify_cpu(struct cpuinfo_x86 *); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; From d4f7a2c18e59e0304a1c733589ce14fc02fec1bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 115/231] [PATCH] i386: Relocate VDSO ELF headers to match mapped location with COMPAT_VDSO Some versions of libc can't deal with a VDSO which doesn't have its ELF headers matching its mapped address. COMPAT_VDSO maps the VDSO at a specific system-wide fixed address. Previously this was all done at build time, on the grounds that the fixed VDSO address is always at the top of the address space. However, a hypervisor may reserve some of that address space, pushing the fixmap address down. This patch does the adjustment dynamically at runtime, depending on the runtime location of the VDSO fixmap. [ Patch has been through several hands: Jan Beulich wrote the orignal version; Zach reworked it, and Jeremy converted it to relocate phdrs as well as sections. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: "Jan Beulich" Cc: Eric W. Biederman Cc: Andi Kleen Cc: Ingo Molnar Cc: Roland McGrath --- arch/i386/kernel/entry.S | 4 - arch/i386/kernel/sysenter.c | 158 ++++++++++++++++++++++++++++++++++-- arch/i386/mm/pgtable.c | 6 -- include/asm-i386/elf.h | 28 +++---- include/asm-i386/fixmap.h | 8 +- include/linux/elf.h | 17 ++++ 6 files changed, 178 insertions(+), 43 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index c61c6b67e856..e901952dff37 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -305,16 +305,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ -#ifndef CONFIG_COMPAT_VDSO /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -#else - pushl $SYSENTER_RETURN -#endif CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 13ca54a85a1c..e5a958379ac9 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Should the kernel map a VDSO page into processes and pass its @@ -46,6 +47,129 @@ __setup("vdso=", vdso_setup); extern asmlinkage void sysenter_entry(void); +#ifdef CONFIG_COMPAT_VDSO +static __init void reloc_symtab(Elf32_Ehdr *ehdr, + unsigned offset, unsigned size) +{ + Elf32_Sym *sym = (void *)ehdr + offset; + unsigned nsym = size / sizeof(*sym); + unsigned i; + + for(i = 0; i < nsym; i++, sym++) { + if (sym->st_shndx == SHN_UNDEF || + sym->st_shndx == SHN_ABS) + continue; /* skip */ + + if (sym->st_shndx > SHN_LORESERVE) { + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", + sym->st_shndx); + continue; + } + + switch(ELF_ST_TYPE(sym->st_info)) { + case STT_OBJECT: + case STT_FUNC: + case STT_SECTION: + case STT_FILE: + sym->st_value += VDSO_HIGH_BASE; + } + } +} + +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) +{ + Elf32_Dyn *dyn = (void *)ehdr + offset; + + for(; dyn->d_tag != DT_NULL; dyn++) + switch(dyn->d_tag) { + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_VERSYM: + case DT_VERDEF: + case DT_VERNEED: + case DT_ADDRRNGLO ... DT_ADDRRNGHI: + /* definitely pointers needing relocation */ + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_ENCODING ... OLD_DT_LOOS-1: + case DT_LOOS ... DT_HIOS-1: + /* Tags above DT_ENCODING are pointers if + they're even */ + if (dyn->d_tag >= DT_ENCODING && + (dyn->d_tag & 1) == 0) + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_VERDEFNUM: + case DT_VERNEEDNUM: + case DT_FLAGS_1: + case DT_RELACOUNT: + case DT_RELCOUNT: + case DT_VALRNGLO ... DT_VALRNGHI: + /* definitely not pointers */ + break; + + case OLD_DT_LOOS ... DT_LOOS-1: + case DT_HIOS ... DT_VALRNGLO-1: + default: + if (dyn->d_tag > DT_ENCODING) + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", + dyn->d_tag); + break; + } +} + +static __init void relocate_vdso(Elf32_Ehdr *ehdr) +{ + Elf32_Phdr *phdr; + Elf32_Shdr *shdr; + int i; + + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + !elf_check_arch(ehdr) || + ehdr->e_type != ET_DYN); + + ehdr->e_entry += VDSO_HIGH_BASE; + + /* rebase phdrs */ + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++) { + phdr[i].p_vaddr += VDSO_HIGH_BASE; + + /* relocate dynamic stuff */ + if (phdr[i].p_type == PT_DYNAMIC) + reloc_dyn(ehdr, phdr[i].p_offset); + } + + /* rebase sections */ + shdr = (void *)ehdr + ehdr->e_shoff; + for(i = 0; i < ehdr->e_shnum; i++) { + if (!(shdr[i].sh_flags & SHF_ALLOC)) + continue; + + shdr[i].sh_addr += VDSO_HIGH_BASE; + + if (shdr[i].sh_type == SHT_SYMTAB || + shdr[i].sh_type == SHT_DYNSYM) + reloc_symtab(ehdr, shdr[i].sh_offset, + shdr[i].sh_size); + } +} +#else +static inline void relocate_vdso(Elf32_Ehdr *ehdr) +{ +} +#endif /* COMPAT_VDSO */ + void enable_sep_cpu(void) { int cpu = get_cpu(); @@ -75,6 +199,9 @@ static struct page *syscall_pages[1]; int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vsyscall; + size_t vsyscall_len; + syscall_pages[0] = virt_to_page(syscall_page); #ifdef CONFIG_COMPAT_VDSO @@ -83,23 +210,23 @@ int __init sysenter_setup(void) #endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(syscall_page, - &vsyscall_int80_start, - &vsyscall_int80_end - &vsyscall_int80_start); - return 0; + vsyscall = &vsyscall_int80_start; + vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; + } else { + vsyscall = &vsyscall_sysenter_start; + vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; } - memcpy(syscall_page, - &vsyscall_sysenter_start, - &vsyscall_sysenter_end - &vsyscall_sysenter_start); + memcpy(syscall_page, vsyscall, vsyscall_len); + relocate_vdso(syscall_page); return 0; } -#ifndef CONFIG_COMPAT_VDSO /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; +#ifdef __HAVE_ARCH_GATE_AREA /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { @@ -159,4 +286,17 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } -#endif +#else /* !__HAVE_ARCH_GATE_AREA */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + /* + * If not creating userspace VMA, simply set vdso to point to + * fixmap page. + */ + current->mm->context.vdso = (void *)VDSO_HIGH_BASE; + current_thread_info()->sysenter_return = + (void *)VDSO_SYM(&SYSENTER_RETURN); + + return 0; +} +#endif /* __HAVE_ARCH_GATE_AREA */ diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index fa0cfbd551e1..99c09edc3dbb 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -144,10 +144,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) } static int fixmaps; -#ifndef CONFIG_COMPAT_VDSO unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -#endif void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { @@ -173,12 +171,8 @@ void reserve_top_address(unsigned long reserve) BUG_ON(fixmaps > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); -#ifdef CONFIG_COMPAT_VDSO - BUG_ON(reserve != 0); -#else __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; -#endif } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 952b3ee3c9bb..d304ab4161ff 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -133,39 +133,31 @@ extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs) #define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) -#define VDSO_BASE ((unsigned long)current->mm->context.vdso) - -#ifdef CONFIG_COMPAT_VDSO -# define VDSO_COMPAT_BASE VDSO_HIGH_BASE -# define VDSO_PRELINK VDSO_HIGH_BASE -#else -# define VDSO_COMPAT_BASE VDSO_BASE -# define VDSO_PRELINK 0 -#endif +#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) +#define VDSO_PRELINK 0 #define VDSO_SYM(x) \ - (VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK) + (VDSO_CURRENT_BASE + (unsigned long)(x) - VDSO_PRELINK) #define VDSO_HIGH_EHDR ((const struct elfhdr *) VDSO_HIGH_BASE) -#define VDSO_EHDR ((const struct elfhdr *) VDSO_COMPAT_BASE) +#define VDSO_EHDR ((const struct elfhdr *) VDSO_CURRENT_BASE) extern void __kernel_vsyscall; #define VDSO_ENTRY VDSO_SYM(&__kernel_vsyscall) -#ifndef CONFIG_COMPAT_VDSO -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES extern int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack); -#endif extern unsigned int vdso_enabled; -#define ARCH_DLINFO \ -do if (vdso_enabled) { \ - NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE); \ +#define ARCH_DLINFO \ +do if (vdso_enabled) { \ + NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } while (0) #endif diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h index 3e9f610c35df..e5651b2a585a 100644 --- a/include/asm-i386/fixmap.h +++ b/include/asm-i386/fixmap.h @@ -19,13 +19,9 @@ * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. */ -#ifndef CONFIG_COMPAT_VDSO extern unsigned long __FIXADDR_TOP; -#else -#define __FIXADDR_TOP 0xfffff000 -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) -#endif +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) #ifndef __ASSEMBLY__ #include diff --git a/include/linux/elf.h b/include/linux/elf.h index 60713e6ea297..8b17ffe222c4 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -83,6 +83,23 @@ typedef __s64 Elf64_Sxword; #define DT_DEBUG 21 #define DT_TEXTREL 22 #define DT_JMPREL 23 +#define DT_ENCODING 32 +#define OLD_DT_LOOS 0x60000000 +#define DT_LOOS 0x6000000d +#define DT_HIOS 0x6ffff000 +#define DT_VALRNGLO 0x6ffffd00 +#define DT_VALRNGHI 0x6ffffdff +#define DT_ADDRRNGLO 0x6ffffe00 +#define DT_ADDRRNGHI 0x6ffffeff +#define DT_VERSYM 0x6ffffff0 +#define DT_RELACOUNT 0x6ffffff9 +#define DT_RELCOUNT 0x6ffffffa +#define DT_FLAGS_1 0x6ffffffb +#define DT_VERDEF 0x6ffffffc +#define DT_VERDEFNUM 0x6ffffffd +#define DT_VERNEED 0x6ffffffe +#define DT_VERNEEDNUM 0x6fffffff +#define OLD_DT_HIOS 0x6fffffff #define DT_LOPROC 0x70000000 #define DT_HIPROC 0x7fffffff From 1dbf527c51c6c20c19869c8125cb5b87c3d09506 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 116/231] [PATCH] i386: Make COMPAT_VDSO runtime selectable. Now that relocation of the VDSO for COMPAT_VDSO users is done at runtime rather than compile time, it is possible to enable/disable compat mode at runtime. This patch allows you to enable COMPAT_VDSO mode with "vdso=2" on the kernel command line, or via sysctl. (Switching on a running system shouldn't be done lightly; any process which was relying on the compat VDSO will be upset if it goes away.) The COMPAT_VDSO config option still exists, but if enabled it just makes vdso_enabled default to VDSO_COMPAT. +From: Hugh Dickins Fix oops from i386-make-compat_vdso-runtime-selectable.patch. Even mingetty at system startup finds it easy to trigger an oops while reading /proc/PID/maps: though it has a good hold on the mm itself, that cannot stop exit_mm() from resetting tsk->mm to NULL. (It is usually show_map()'s call to get_gate_vma() which oopses, and I expect we could change that to check priv->tail_vma instead; but no matter, even m_start()'s call just after get_task_mm() is racy.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: "Jan Beulich" Cc: Eric W. Biederman Cc: Andi Kleen Cc: Ingo Molnar Cc: Roland McGrath --- Documentation/kernel-parameters.txt | 1 + arch/i386/kernel/sysenter.c | 145 ++++++++++++++++++---------- include/asm-i386/page.h | 2 - 3 files changed, 95 insertions(+), 53 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 84c3bd05c639..4287696f18dd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1820,6 +1820,7 @@ and is between 256 and 4096 characters. It is defined in the file [USBHID] The interval which mice are to be polled at. vdso= [IA-32,SH] + vdso=2: enable compat VDSO (default with COMPAT_VDSO) vdso=1: enable VDSO (default) vdso=0: disable VDSO mapping diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index e5a958379ac9..0b9768ee1e8d 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -23,16 +23,25 @@ #include #include #include +#include + +enum { + VDSO_DISABLED = 0, + VDSO_ENABLED = 1, + VDSO_COMPAT = 2, +}; + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT VDSO_COMPAT +#else +#define VDSO_DEFAULT VDSO_ENABLED +#endif /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -#ifdef CONFIG_PARAVIRT -unsigned int __read_mostly vdso_enabled = 0; -#else -unsigned int __read_mostly vdso_enabled = 1; -#endif +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; EXPORT_SYMBOL_GPL(vdso_enabled); @@ -47,7 +56,6 @@ __setup("vdso=", vdso_setup); extern asmlinkage void sysenter_entry(void); -#ifdef CONFIG_COMPAT_VDSO static __init void reloc_symtab(Elf32_Ehdr *ehdr, unsigned offset, unsigned size) { @@ -164,11 +172,6 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) shdr[i].sh_size); } } -#else -static inline void relocate_vdso(Elf32_Ehdr *ehdr) -{ -} -#endif /* COMPAT_VDSO */ void enable_sep_cpu(void) { @@ -188,6 +191,25 @@ void enable_sep_cpu(void) put_cpu(); } +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} + /* * These symbols are defined by vsyscall.o to mark the bounds * of the ELF DSO images included therein. @@ -196,6 +218,22 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; +static void map_compat_vdso(int map) +{ + static int vdso_mapped; + + if (map == vdso_mapped) + return; + + vdso_mapped = map; + + __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, + map ? PAGE_READONLY_EXEC : PAGE_NONE); + + /* flush stray tlbs */ + flush_tlb_all(); +} + int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); @@ -204,10 +242,9 @@ int __init sysenter_setup(void) syscall_pages[0] = virt_to_page(syscall_page); -#ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); + gate_vma_init(); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); -#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { vsyscall = &vsyscall_int80_start; @@ -226,42 +263,57 @@ int __init sysenter_setup(void) /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; -#ifdef __HAVE_ARCH_GATE_AREA /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { struct mm_struct *mm = current->mm; unsigned long addr; int ret; + bool compat; down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall_pages); - if (ret) - goto up_fail; + /* Test compat mode once here, in case someone + changes it via sysctl */ + compat = (vdso_enabled == VDSO_COMPAT); + + map_compat_vdso(compat); + + if (compat) + addr = VDSO_HIGH_BASE; + else { + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully + * interpretable later without matching up the same + * kernel and hardware config to see what PC values + * meant. + */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + syscall_pages); + + if (ret) + goto up_fail; + } current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); -up_fail: + (void *)VDSO_SYM(&SYSENTER_RETURN); + + up_fail: up_write(&mm->mmap_sem); + return ret; } @@ -274,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma) struct vm_area_struct *get_gate_vma(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) + return &gate_vma; return NULL; } @@ -286,17 +343,3 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } -#else /* !__HAVE_ARCH_GATE_AREA */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) -{ - /* - * If not creating userspace VMA, simply set vdso to point to - * fixmap page. - */ - current->mm->context.vdso = (void *)VDSO_HIGH_BASE; - current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); - - return 0; -} -#endif /* __HAVE_ARCH_GATE_AREA */ diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 7b19f454761d..fd3f64ace248 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -143,9 +143,7 @@ extern int page_is_ram(unsigned long pagenr); #include #include -#ifndef CONFIG_COMPAT_VDSO #define __HAVE_ARCH_GATE_AREA 1 -#endif #endif /* __KERNEL__ */ #endif /* _I386_PAGE_H */ From c169859d6dfc7471ef9f2dbd720936e17906a084 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 117/231] [PATCH] x86-64: Clean up asm-x86_64/bugs.h Most of asm-x86_64/bugs.h is code which should be in a C file, so put it there. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Linus Torvalds --- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/bugs.c | 28 ++++++++++++++++++++++++++++ include/asm-x86_64/alternative.h | 1 + include/asm-x86_64/bugs.h | 30 ++++-------------------------- 4 files changed, 34 insertions(+), 27 deletions(-) create mode 100644 arch/x86_64/kernel/bugs.c diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 6879b4f01e88..a613e13d0e75 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c new file mode 100644 index 000000000000..131e541e3f7b --- /dev/null +++ b/arch/x86_64/kernel/bugs.c @@ -0,0 +1,28 @@ +/* + * arch/x86_64/kernel/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + * + * This is included by init/main.c to check for architecture-dependent bugs. + * + * Needs: + * void check_bugs(void); + */ + +#include +#include +#include +#include +#include +#include + +void __init check_bugs(void) +{ + identify_cpu(&boot_cpu_data); +#if !defined(CONFIG_SMP) + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); +} diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h index a6657b4f3e0e..67ebea3cc481 100644 --- a/include/asm-x86_64/alternative.h +++ b/include/asm-x86_64/alternative.h @@ -16,6 +16,7 @@ struct alt_instr { u8 pad[5]; }; +extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); struct module; diff --git a/include/asm-x86_64/bugs.h b/include/asm-x86_64/bugs.h index d86c5dd689fa..b33dc04d8f42 100644 --- a/include/asm-x86_64/bugs.h +++ b/include/asm-x86_64/bugs.h @@ -1,28 +1,6 @@ -/* - * include/asm-x86_64/bugs.h - * - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - * - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); - */ +#ifndef _ASM_X86_64_BUGS_H +#define _ASM_X86_64_BUGS_H -#include -#include -#include -#include +void check_bugs(void); -extern void alternative_instructions(void); - -static void __init check_bugs(void) -{ - identify_cpu(&boot_cpu_data); -#if !defined(CONFIG_SMP) - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); -} +#endif /* _ASM_X86_64_BUGS_H */ From f039b754714a422959027cb18bb33760eb8153f0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 118/231] [PATCH] x86: Don't use MWAIT on AMD Family 10 It doesn't put the CPU into deeper sleep states, so it's better to use the standard idle loop to save power. But allow to reenable it anyways for benchmarking. I also removed the obsolete idle=halt on i386 Cc: andreas.herrmann@amd.com Signed-off-by: Andi Kleen --- Documentation/kernel-parameters.txt | 11 +++++++++-- arch/i386/kernel/cpu/amd.c | 5 +++++ arch/i386/kernel/process.c | 17 ++++++++--------- arch/x86_64/kernel/process.c | 12 +++++++----- arch/x86_64/kernel/setup.c | 6 ++++++ include/asm-i386/processor.h | 2 ++ include/asm-x86_64/proto.h | 2 ++ 7 files changed, 39 insertions(+), 16 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4287696f18dd..94ce0d20253d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -695,8 +695,15 @@ and is between 256 and 4096 characters. It is defined in the file idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed See Documentation/ide.txt. - idle= [HW] - Format: idle=poll or idle=halt + idle= [X86] + Format: idle=poll or idle=mwait + Poll forces a polling idle loop that can slightly improves the performance + of waking up a idle CPU, but will use a lot of power and make the system + run hot. Not recommended. + idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose + to not use it because it doesn't save as much power as a normal idle + loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same + as idle=poll. ignore_loglevel [KNL] Ignore loglevel setting - this will print /all/ diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 2d47db482972..197cda62caa3 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +int force_mwait __cpuinitdata; + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); + + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 393a67d5d943..7e8e129b3d7d 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -272,25 +272,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c) } } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; #ifdef CONFIG_X86_SMP if (smp_num_siblings > 1) printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); #endif - } else if (!strncmp(str, "halt", 4)) { - printk("using halt in idle threads.\n"); - pm_idle = default_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index d8d5ccc245c8..4f21765078b7 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -288,16 +288,18 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup (char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0a1d539149df..db30b5bcef61 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -79,6 +79,8 @@ int bootloader_type; unsigned long saved_video_mode; +int force_mwait __cpuinitdata; + /* * Early DMI memory */ @@ -604,6 +606,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + + /* Family 10 doesn't support C states in MWAIT so don't use it */ + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 9d895cc2f312..882d3f8fbbac 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -779,4 +779,6 @@ extern int sysenter_setup(void); extern void cpu_set_gdt(int); extern void cpu_init(void); +extern int force_mwait; + #endif /* __ASM_I386_PROCESSOR_H */ diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 3f8f285138d2..98063bcb3b33 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -119,6 +119,8 @@ extern int gsi_irq_sharing(int gsi); extern void smp_local_timer_interrupt(void); +extern int force_mwait; + long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); void i8254_timer_resume(void); From 7946331856f99bdb00b0a0a53e97d9b621546622 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 119/231] [PATCH] i386: Use menuconfig objects - APM (I hope Andi is the right one to Cc, otherwise please add, thanks!) Use menuconfigs instead of menus, so the whole menu can be disabled at once instead of going through all options. Signed-off-by: Jan Engelhardt Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index c0a3e233fa3f..6964e247f473 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -915,12 +915,9 @@ source kernel/power/Kconfig source "drivers/acpi/Kconfig" -menu "APM (Advanced Power Management) BIOS Support" -depends on PM && !X86_VISWS - -config APM +menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on PM + depends on PM && !X86_VISWS ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -977,9 +974,10 @@ config APM To compile this driver as a module, choose M here: the module will be called apm. +if APM + config APM_IGNORE_USER_SUSPEND bool "Ignore USER SUSPEND" - depends on APM help This option will ignore USER SUSPEND requests. On machines with a compliant APM BIOS, you want to say N. However, on the NEC Versa M @@ -987,7 +985,6 @@ config APM_IGNORE_USER_SUSPEND config APM_DO_ENABLE bool "Enable PM at boot time" - depends on APM ---help--- Enable APM features at boot time. From page 36 of the APM BIOS specification: "When disabled, the APM BIOS does not automatically @@ -1005,7 +1002,6 @@ config APM_DO_ENABLE config APM_CPU_IDLE bool "Make CPU Idle calls when idle" - depends on APM help Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. On some machines, this can activate improved power savings, such as @@ -1017,7 +1013,6 @@ config APM_CPU_IDLE config APM_DISPLAY_BLANK bool "Enable console blanking using APM" - depends on APM help Enable console blanking using the APM. Some laptops can use this to turn off the LCD backlight when the screen blanker of the Linux @@ -1031,7 +1026,6 @@ config APM_DISPLAY_BLANK config APM_ALLOW_INTS bool "Allow interrupts during APM BIOS calls" - depends on APM help Normally we disable external interrupts while we are making calls to the APM BIOS as a measure to lessen the effects of a badly behaving @@ -1042,13 +1036,12 @@ config APM_ALLOW_INTS config APM_REAL_MODE_POWER_OFF bool "Use real mode APM BIOS call to power off" - depends on APM help Use real mode APM BIOS calls to switch off the computer. This is a work-around for a number of buggy BIOSes. Switch this option on if your computer crashes instead of powering off properly. -endmenu +endif # APM source "arch/i386/kernel/cpu/cpufreq/Kconfig" From d479d2cc0802d5c8546a6a7492646e08228effd5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 120/231] [PATCH] i386: Update smp_call_function* comments Update documentation for i386 smp_call_function* functions. As reported by Randy Dunlap [ I've posted this before but it seems to have been lost along the way. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Randy Dunlap --- arch/i386/kernel/smp.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index fe38b49a1223..9d84f6f001bf 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -554,8 +554,10 @@ static void __smp_call_function(void (*func) (void *info), void *info, * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <> or are or have finished. + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. @@ -617,11 +619,13 @@ int native_smp_call_function_mask(cpumask_t mask, * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. + * @nonatomic: Unused. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <> or are or have executed. + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. @@ -633,17 +637,18 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, } EXPORT_SYMBOL(smp_call_function); -/* +/** * smp_call_function_single - Run a function on another CPU + * @cpu: The target CPU. Cannot be the calling CPU. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. + * @nonatomic: Unused. * @wait: If true, wait until function has completed on other CPUs. * - * Retrurns 0 on success, else a negative status code. + * Returns 0 on success, else a negative status code. * - * Does not return until the remote CPU is nearly ready to execute - * or is or has executed. + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int nonatomic, int wait) From de90c5ce832b1218042316260ff9268b00fdcba3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 121/231] [PATCH] i386: Enable bank 0 on non K7 Athlon As a bug workaround bank 0 on K7s is normally disabled, but no need to do that on other AMD CPUs. Cc: davej@redhat.com Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/k7.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index b0862af595aa..7a2472557bbb 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -82,9 +82,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) nr_mce_banks = l & 0xff; /* Clear status for MC index 0 separately, we don't touch CTL, - * as some Athlons cause spurious MCEs when its enabled. */ - wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); - for (i=1; i Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 122/231] [PATCH] x86: Allow percpu variables to be page-aligned Let's allow page-alignment in general for per-cpu data (wanted by Xen, and Ingo suggested KVM as well). Because larger alignments can use more room, we increase the max per-cpu memory to 64k rather than 32k: it's getting a little tight. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/alpha/kernel/vmlinux.lds.S | 2 +- arch/arm/kernel/vmlinux.lds.S | 2 +- arch/cris/arch-v32/vmlinux.lds.S | 1 + arch/frv/kernel/vmlinux.lds.S | 1 + arch/i386/kernel/vmlinux.lds.S | 2 +- arch/m32r/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 2 +- arch/parisc/kernel/vmlinux.lds.S | 2 +- arch/powerpc/kernel/setup_64.c | 4 ++-- arch/powerpc/kernel/vmlinux.lds.S | 6 +----- arch/ppc/kernel/vmlinux.lds.S | 2 +- arch/s390/kernel/vmlinux.lds.S | 2 +- arch/sh/kernel/vmlinux.lds.S | 2 +- arch/sh64/kernel/vmlinux.lds.S | 2 +- arch/sparc/kernel/vmlinux.lds.S | 2 +- arch/sparc64/kernel/smp.c | 6 +++--- arch/x86_64/kernel/setup64.c | 4 ++-- arch/x86_64/kernel/vmlinux.lds.S | 2 +- arch/xtensa/kernel/vmlinux.lds.S | 2 +- init/main.c | 8 ++------ kernel/module.c | 8 ++++---- 21 files changed, 29 insertions(+), 35 deletions(-) diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 4cc44bd33d33..cf1e6fc6c686 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ SECTIONS . = ALIGN(8); SECURITY_INIT - . = ALIGN(64); + . = ALIGN(8192); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index ddbdad48f5b2..d1a6a597ed9a 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -59,7 +59,7 @@ SECTIONS usr/built-in.o(.init.ramfs) __initramfs_end = .; #endif - . = ALIGN(64); + . = ALIGN(4096); __per_cpu_start = .; *(.data.percpu) __per_cpu_end = .; diff --git a/arch/cris/arch-v32/vmlinux.lds.S b/arch/cris/arch-v32/vmlinux.lds.S index e124fcd766d5..dfa25e1542b9 100644 --- a/arch/cris/arch-v32/vmlinux.lds.S +++ b/arch/cris/arch-v32/vmlinux.lds.S @@ -91,6 +91,7 @@ SECTIONS } SECURITY_INIT + . = ALIGN (8192); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 97910e016825..28eae9735ad6 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -57,6 +57,7 @@ SECTIONS __alt_instructions_end = .; .altinstr_replacement : { *(.altinstr_replacement) } + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index f4ec72231835..97fe6eac47c9 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -194,7 +194,7 @@ SECTIONS __initramfs_end = .; } #endif - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(4096); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 439cc257cd1d..6c73bca3f478 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -110,7 +110,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index c76b793310c2..043f637e3d10 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -119,7 +119,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(_PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 2a8253358c6c..c74585990598 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -181,7 +181,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(ASM_PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 22083ce3cc30..6018178708a5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -582,14 +582,14 @@ void __init setup_per_cpu_areas(void) char *ptr; /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); + size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); #ifdef CONFIG_MODULES if (size < PERCPU_ENOUGH_ROOM) size = PERCPU_ENOUGH_ROOM; #endif for_each_possible_cpu(i) { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7eefeb4a30e7..132067313147 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -139,11 +139,7 @@ SECTIONS __initramfs_end = .; } #endif -#ifdef CONFIG_PPC32 - . = ALIGN(32); -#else - . = ALIGN(128); -#endif + . = ALIGN(PAGE_SIZE); .data.percpu : { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index a0625562a44b..44cd128fb719 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -130,7 +130,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 418f6426a949..e9d3432aba60 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -107,7 +107,7 @@ SECTIONS . = ALIGN(2); __initramfs_end = .; #endif - . = ALIGN(256); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 78a6c09875b2..2f606d0ce1f6 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -54,7 +54,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.page_aligned : { *(.data.page_aligned) } - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sh64/kernel/vmlinux.lds.S b/arch/sh64/kernel/vmlinux.lds.S index a59c5e998131..4f9616f39830 100644 --- a/arch/sh64/kernel/vmlinux.lds.S +++ b/arch/sh64/kernel/vmlinux.lds.S @@ -85,7 +85,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) } - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) } __per_cpu_end = . ; diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index e5c24e0521de..f0bb6e60e620 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -65,7 +65,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index d4f0a70f4845..1fac215252e4 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1343,11 +1343,11 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ goal = PERCPU_ENOUGH_ROOM; - __per_cpu_shift = 0; - for (size = 1UL; size < goal; size <<= 1UL) + __per_cpu_shift = PAGE_SHIFT; + for (size = PAGE_SIZE; size < goal; size <<= 1UL) __per_cpu_shift++; - ptr = alloc_bootmem(size * NR_CPUS); + ptr = alloc_bootmem_pages(size * NR_CPUS); __per_cpu_base = ptr - __per_cpu_start; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 53064a9a365f..64379a80d763 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -103,9 +103,9 @@ void __init setup_per_cpu_areas(void) if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); - ptr = alloc_bootmem(size); + ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 3bdeb88d28f4..7ef0b8820cd2 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -195,7 +195,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index ab6370054cee..4fbd66a52a88 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -198,7 +198,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/init/main.c b/init/main.c index a92989e7836a..80f09f31bfab 100644 --- a/init/main.c +++ b/init/main.c @@ -369,12 +369,8 @@ static void __init setup_per_cpu_areas(void) unsigned long nr_possible_cpus = num_possible_cpus(); /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif - ptr = alloc_bootmem(size * nr_possible_cpus); + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; diff --git a/kernel/module.c b/kernel/module.c index cf49ca25fcce..4dc4a257545c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -346,10 +346,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned int i; void *ptr; - if (align > SMP_CACHE_BYTES) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", - name, align, SMP_CACHE_BYTES); - align = SMP_CACHE_BYTES; + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; } ptr = __per_cpu_start; From 4bc5aa91fb1e544ad37805520030a0d9fc6e11d3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH 123/231] [PATCH] x86: Clean up x86 control register and MSR macros (corrected) This patch is based on Rusty's recent cleanup of the EFLAGS-related macros; it extends the same kind of cleanup to control registers and MSRs. It also unifies these between i386 and x86-64; at least with regards to MSRs, the two had definitely gotten out of sync. Signed-off-by: H. Peter Anvin Signed-off-by: Andi Kleen --- include/asm-i386/Kbuild | 2 + include/asm-i386/msr-index.h | 273 ++++++++++++++++++++++++++ include/asm-i386/msr.h | 237 +---------------------- include/asm-i386/processor-flags.h | 65 +++++++ include/asm-i386/processor.h | 35 ---- include/asm-x86_64/Kbuild | 3 +- include/asm-x86_64/msr-index.h | 1 + include/asm-x86_64/msr.h | 274 +-------------------------- include/asm-x86_64/processor-flags.h | 27 +-- include/asm-x86_64/processor.h | 31 --- 10 files changed, 356 insertions(+), 592 deletions(-) create mode 100644 include/asm-i386/msr-index.h create mode 100644 include/asm-x86_64/msr-index.h diff --git a/include/asm-i386/Kbuild b/include/asm-i386/Kbuild index 5ae93afc67e1..cbf6e8f1087b 100644 --- a/include/asm-i386/Kbuild +++ b/include/asm-i386/Kbuild @@ -3,8 +3,10 @@ include include/asm-generic/Kbuild.asm header-y += boot.h header-y += debugreg.h header-y += ldt.h +header-y += msr-index.h header-y += ptrace-abi.h header-y += ucontext.h +unifdef-y += msr.h unifdef-y += mtrr.h unifdef-y += vm86.h diff --git a/include/asm-i386/msr-index.h b/include/asm-i386/msr-index.h new file mode 100644 index 000000000000..f1190802283d --- /dev/null +++ b/include/asm-i386/msr-index.h @@ -0,0 +1,273 @@ +#ifndef __ASM_MSR_INDEX_H +#define __ASM_MSR_INDEX_H + +/* CPU model specific register (MSR) numbers */ + +/* x86-64 specific MSRs */ +#define MSR_EFER 0xc0000080 /* extended feature register */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ + +/* EFER bits: */ +#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +#define _EFER_LME 8 /* Long mode enable */ +#define _EFER_LMA 10 /* Long mode active (read-only) */ +#define _EFER_NX 11 /* No execute enable */ + +#define EFER_SCE (1<<_EFER_SCE) +#define EFER_LME (1<<_EFER_LME) +#define EFER_LMA (1<<_EFER_LMA) +#define EFER_NX (1<<_EFER_NX) + +/* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_PERFCTR0 0x000000c1 +#define MSR_IA32_PERFCTR1 0x000000c2 +#define MSR_FSB_FREQ 0x000000cd + +#define MSR_MTRRcap 0x000000fe +#define MSR_IA32_BBL_CR_CTL 0x00000119 + +#define MSR_IA32_SYSENTER_CS 0x00000174 +#define MSR_IA32_SYSENTER_ESP 0x00000175 +#define MSR_IA32_SYSENTER_EIP 0x00000176 + +#define MSR_IA32_MCG_CAP 0x00000179 +#define MSR_IA32_MCG_STATUS 0x0000017a +#define MSR_IA32_MCG_CTL 0x0000017b + +#define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_IA32_DS_AREA 0x00000600 +#define MSR_IA32_PERF_CAPABILITIES 0x00000345 + +#define MSR_MTRRfix64K_00000 0x00000250 +#define MSR_MTRRfix16K_80000 0x00000258 +#define MSR_MTRRfix16K_A0000 0x00000259 +#define MSR_MTRRfix4K_C0000 0x00000268 +#define MSR_MTRRfix4K_C8000 0x00000269 +#define MSR_MTRRfix4K_D0000 0x0000026a +#define MSR_MTRRfix4K_D8000 0x0000026b +#define MSR_MTRRfix4K_E0000 0x0000026c +#define MSR_MTRRfix4K_E8000 0x0000026d +#define MSR_MTRRfix4K_F0000 0x0000026e +#define MSR_MTRRfix4K_F8000 0x0000026f +#define MSR_MTRRdefType 0x000002ff + +#define MSR_IA32_DEBUGCTLMSR 0x000001d9 +#define MSR_IA32_LASTBRANCHFROMIP 0x000001db +#define MSR_IA32_LASTBRANCHTOIP 0x000001dc +#define MSR_IA32_LASTINTFROMIP 0x000001dd +#define MSR_IA32_LASTINTTOIP 0x000001de + +#define MSR_IA32_MC0_CTL 0x00000400 +#define MSR_IA32_MC0_STATUS 0x00000401 +#define MSR_IA32_MC0_ADDR 0x00000402 +#define MSR_IA32_MC0_MISC 0x00000403 + +#define MSR_P6_PERFCTR0 0x000000c1 +#define MSR_P6_PERFCTR1 0x000000c2 +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +/* K7/K8 MSRs. Not complete. See the architecture manual for a more + complete list. */ +#define MSR_K7_EVNTSEL0 0xc0010000 +#define MSR_K7_PERFCTR0 0xc0010004 +#define MSR_K7_EVNTSEL1 0xc0010001 +#define MSR_K7_PERFCTR1 0xc0010005 +#define MSR_K7_EVNTSEL2 0xc0010002 +#define MSR_K7_PERFCTR2 0xc0010006 +#define MSR_K7_EVNTSEL3 0xc0010003 +#define MSR_K7_PERFCTR3 0xc0010007 +#define MSR_K8_TOP_MEM1 0xc001001a +#define MSR_K7_CLK_CTL 0xc001001b +#define MSR_K8_TOP_MEM2 0xc001001d +#define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K7_HWCR 0xc0010015 +#define MSR_K8_HWCR 0xc0010015 +#define MSR_K7_FID_VID_CTL 0xc0010041 +#define MSR_K7_FID_VID_STATUS 0xc0010042 +#define MSR_K8_ENABLE_C1E 0xc0010055 + +/* K6 MSRs */ +#define MSR_K6_EFER 0xc0000080 +#define MSR_K6_STAR 0xc0000081 +#define MSR_K6_WHCR 0xc0000082 +#define MSR_K6_UWCCR 0xc0000085 +#define MSR_K6_EPMR 0xc0000086 +#define MSR_K6_PSOR 0xc0000087 +#define MSR_K6_PFIR 0xc0000088 + +/* Centaur-Hauls/IDT defined MSRs. */ +#define MSR_IDT_FCR1 0x00000107 +#define MSR_IDT_FCR2 0x00000108 +#define MSR_IDT_FCR3 0x00000109 +#define MSR_IDT_FCR4 0x0000010a + +#define MSR_IDT_MCR0 0x00000110 +#define MSR_IDT_MCR1 0x00000111 +#define MSR_IDT_MCR2 0x00000112 +#define MSR_IDT_MCR3 0x00000113 +#define MSR_IDT_MCR4 0x00000114 +#define MSR_IDT_MCR5 0x00000115 +#define MSR_IDT_MCR6 0x00000116 +#define MSR_IDT_MCR7 0x00000117 +#define MSR_IDT_MCR_CTRL 0x00000120 + +/* VIA Cyrix defined MSRs*/ +#define MSR_VIA_FCR 0x00001107 +#define MSR_VIA_LONGHAUL 0x0000110a +#define MSR_VIA_RNG 0x0000110b +#define MSR_VIA_BCR2 0x00001147 + +/* Transmeta defined MSRs */ +#define MSR_TMTA_LONGRUN_CTRL 0x80868010 +#define MSR_TMTA_LONGRUN_FLAGS 0x80868011 +#define MSR_TMTA_LRTI_READOUT 0x80868018 +#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a + +/* Intel defined MSRs. */ +#define MSR_IA32_P5_MC_ADDR 0x00000000 +#define MSR_IA32_P5_MC_TYPE 0x00000001 +#define MSR_IA32_TSC 0x00000010 +#define MSR_IA32_PLATFORM_ID 0x00000017 +#define MSR_IA32_EBL_CR_POWERON 0x0000002a + +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) + +#define MSR_IA32_UCODE_WRITE 0x00000079 +#define MSR_IA32_UCODE_REV 0x0000008b + +#define MSR_IA32_PERF_STATUS 0x00000198 +#define MSR_IA32_PERF_CTL 0x00000199 + +#define MSR_IA32_MPERF 0x000000e7 +#define MSR_IA32_APERF 0x000000e8 + +#define MSR_IA32_THERM_CONTROL 0x0000019a +#define MSR_IA32_THERM_INTERRUPT 0x0000019b +#define MSR_IA32_THERM_STATUS 0x0000019c +#define MSR_IA32_MISC_ENABLE 0x000001a0 + +/* Intel Model 6 */ +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +/* P4/Xeon+ specific */ +#define MSR_IA32_MCG_EAX 0x00000180 +#define MSR_IA32_MCG_EBX 0x00000181 +#define MSR_IA32_MCG_ECX 0x00000182 +#define MSR_IA32_MCG_EDX 0x00000183 +#define MSR_IA32_MCG_ESI 0x00000184 +#define MSR_IA32_MCG_EDI 0x00000185 +#define MSR_IA32_MCG_EBP 0x00000186 +#define MSR_IA32_MCG_ESP 0x00000187 +#define MSR_IA32_MCG_EFLAGS 0x00000188 +#define MSR_IA32_MCG_EIP 0x00000189 +#define MSR_IA32_MCG_RESERVED 0x0000018a + +/* Pentium IV performance counter MSRs */ +#define MSR_P4_BPU_PERFCTR0 0x00000300 +#define MSR_P4_BPU_PERFCTR1 0x00000301 +#define MSR_P4_BPU_PERFCTR2 0x00000302 +#define MSR_P4_BPU_PERFCTR3 0x00000303 +#define MSR_P4_MS_PERFCTR0 0x00000304 +#define MSR_P4_MS_PERFCTR1 0x00000305 +#define MSR_P4_MS_PERFCTR2 0x00000306 +#define MSR_P4_MS_PERFCTR3 0x00000307 +#define MSR_P4_FLAME_PERFCTR0 0x00000308 +#define MSR_P4_FLAME_PERFCTR1 0x00000309 +#define MSR_P4_FLAME_PERFCTR2 0x0000030a +#define MSR_P4_FLAME_PERFCTR3 0x0000030b +#define MSR_P4_IQ_PERFCTR0 0x0000030c +#define MSR_P4_IQ_PERFCTR1 0x0000030d +#define MSR_P4_IQ_PERFCTR2 0x0000030e +#define MSR_P4_IQ_PERFCTR3 0x0000030f +#define MSR_P4_IQ_PERFCTR4 0x00000310 +#define MSR_P4_IQ_PERFCTR5 0x00000311 +#define MSR_P4_BPU_CCCR0 0x00000360 +#define MSR_P4_BPU_CCCR1 0x00000361 +#define MSR_P4_BPU_CCCR2 0x00000362 +#define MSR_P4_BPU_CCCR3 0x00000363 +#define MSR_P4_MS_CCCR0 0x00000364 +#define MSR_P4_MS_CCCR1 0x00000365 +#define MSR_P4_MS_CCCR2 0x00000366 +#define MSR_P4_MS_CCCR3 0x00000367 +#define MSR_P4_FLAME_CCCR0 0x00000368 +#define MSR_P4_FLAME_CCCR1 0x00000369 +#define MSR_P4_FLAME_CCCR2 0x0000036a +#define MSR_P4_FLAME_CCCR3 0x0000036b +#define MSR_P4_IQ_CCCR0 0x0000036c +#define MSR_P4_IQ_CCCR1 0x0000036d +#define MSR_P4_IQ_CCCR2 0x0000036e +#define MSR_P4_IQ_CCCR3 0x0000036f +#define MSR_P4_IQ_CCCR4 0x00000370 +#define MSR_P4_IQ_CCCR5 0x00000371 +#define MSR_P4_ALF_ESCR0 0x000003ca +#define MSR_P4_ALF_ESCR1 0x000003cb +#define MSR_P4_BPU_ESCR0 0x000003b2 +#define MSR_P4_BPU_ESCR1 0x000003b3 +#define MSR_P4_BSU_ESCR0 0x000003a0 +#define MSR_P4_BSU_ESCR1 0x000003a1 +#define MSR_P4_CRU_ESCR0 0x000003b8 +#define MSR_P4_CRU_ESCR1 0x000003b9 +#define MSR_P4_CRU_ESCR2 0x000003cc +#define MSR_P4_CRU_ESCR3 0x000003cd +#define MSR_P4_CRU_ESCR4 0x000003e0 +#define MSR_P4_CRU_ESCR5 0x000003e1 +#define MSR_P4_DAC_ESCR0 0x000003a8 +#define MSR_P4_DAC_ESCR1 0x000003a9 +#define MSR_P4_FIRM_ESCR0 0x000003a4 +#define MSR_P4_FIRM_ESCR1 0x000003a5 +#define MSR_P4_FLAME_ESCR0 0x000003a6 +#define MSR_P4_FLAME_ESCR1 0x000003a7 +#define MSR_P4_FSB_ESCR0 0x000003a2 +#define MSR_P4_FSB_ESCR1 0x000003a3 +#define MSR_P4_IQ_ESCR0 0x000003ba +#define MSR_P4_IQ_ESCR1 0x000003bb +#define MSR_P4_IS_ESCR0 0x000003b4 +#define MSR_P4_IS_ESCR1 0x000003b5 +#define MSR_P4_ITLB_ESCR0 0x000003b6 +#define MSR_P4_ITLB_ESCR1 0x000003b7 +#define MSR_P4_IX_ESCR0 0x000003c8 +#define MSR_P4_IX_ESCR1 0x000003c9 +#define MSR_P4_MOB_ESCR0 0x000003aa +#define MSR_P4_MOB_ESCR1 0x000003ab +#define MSR_P4_MS_ESCR0 0x000003c0 +#define MSR_P4_MS_ESCR1 0x000003c1 +#define MSR_P4_PMH_ESCR0 0x000003ac +#define MSR_P4_PMH_ESCR1 0x000003ad +#define MSR_P4_RAT_ESCR0 0x000003bc +#define MSR_P4_RAT_ESCR1 0x000003bd +#define MSR_P4_SAAT_ESCR0 0x000003ae +#define MSR_P4_SAAT_ESCR1 0x000003af +#define MSR_P4_SSU_ESCR0 0x000003be +#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */ + +#define MSR_P4_TBPU_ESCR0 0x000003c2 +#define MSR_P4_TBPU_ESCR1 0x000003c3 +#define MSR_P4_TC_ESCR0 0x000003c4 +#define MSR_P4_TC_ESCR1 0x000003c5 +#define MSR_P4_U2L_ESCR0 0x000003b0 +#define MSR_P4_U2L_ESCR1 0x000003b1 + +/* Intel Core-based CPU performance counters */ +#define MSR_CORE_PERF_FIXED_CTR0 0x00000309 +#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a +#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b +#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d +#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e +#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f +#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 + +/* Geode defined MSRs */ +#define MSR_GEODE_BUSCONT_CONF0 0x00001900 + +#endif /* __ASM_MSR_INDEX_H */ diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index 00acaa8b36bb..9559894c7658 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -1,6 +1,11 @@ #ifndef __ASM_MSR_H #define __ASM_MSR_H +#include + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ + #include static inline unsigned long long native_read_msr(unsigned int msr) @@ -153,234 +158,6 @@ static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) wrmsr(msr_no, l, h); } #endif /* CONFIG_SMP */ - -/* symbolic names for some interesting MSRs */ -/* Intel defined MSRs. */ -#define MSR_IA32_P5_MC_ADDR 0 -#define MSR_IA32_P5_MC_TYPE 1 -#define MSR_IA32_PLATFORM_ID 0x17 -#define MSR_IA32_EBL_CR_POWERON 0x2a - -#define MSR_IA32_APICBASE 0x1b -#define MSR_IA32_APICBASE_BSP (1<<8) -#define MSR_IA32_APICBASE_ENABLE (1<<11) -#define MSR_IA32_APICBASE_BASE (0xfffff<<12) - -#define MSR_IA32_UCODE_WRITE 0x79 -#define MSR_IA32_UCODE_REV 0x8b - -#define MSR_P6_PERFCTR0 0xc1 -#define MSR_P6_PERFCTR1 0xc2 -#define MSR_FSB_FREQ 0xcd - - -#define MSR_IA32_BBL_CR_CTL 0x119 - -#define MSR_IA32_SYSENTER_CS 0x174 -#define MSR_IA32_SYSENTER_ESP 0x175 -#define MSR_IA32_SYSENTER_EIP 0x176 - -#define MSR_IA32_MCG_CAP 0x179 -#define MSR_IA32_MCG_STATUS 0x17a -#define MSR_IA32_MCG_CTL 0x17b - -/* P4/Xeon+ specific */ -#define MSR_IA32_MCG_EAX 0x180 -#define MSR_IA32_MCG_EBX 0x181 -#define MSR_IA32_MCG_ECX 0x182 -#define MSR_IA32_MCG_EDX 0x183 -#define MSR_IA32_MCG_ESI 0x184 -#define MSR_IA32_MCG_EDI 0x185 -#define MSR_IA32_MCG_EBP 0x186 -#define MSR_IA32_MCG_ESP 0x187 -#define MSR_IA32_MCG_EFLAGS 0x188 -#define MSR_IA32_MCG_EIP 0x189 -#define MSR_IA32_MCG_RESERVED 0x18A - -#define MSR_P6_EVNTSEL0 0x186 -#define MSR_P6_EVNTSEL1 0x187 - -#define MSR_IA32_PERF_STATUS 0x198 -#define MSR_IA32_PERF_CTL 0x199 - -#define MSR_IA32_MPERF 0xE7 -#define MSR_IA32_APERF 0xE8 - -#define MSR_IA32_THERM_CONTROL 0x19a -#define MSR_IA32_THERM_INTERRUPT 0x19b -#define MSR_IA32_THERM_STATUS 0x19c -#define MSR_IA32_MISC_ENABLE 0x1a0 - -#define MSR_IA32_DEBUGCTLMSR 0x1d9 -#define MSR_IA32_LASTBRANCHFROMIP 0x1db -#define MSR_IA32_LASTBRANCHTOIP 0x1dc -#define MSR_IA32_LASTINTFROMIP 0x1dd -#define MSR_IA32_LASTINTTOIP 0x1de - -#define MSR_IA32_MC0_CTL 0x400 -#define MSR_IA32_MC0_STATUS 0x401 -#define MSR_IA32_MC0_ADDR 0x402 -#define MSR_IA32_MC0_MISC 0x403 - -#define MSR_IA32_PEBS_ENABLE 0x3f1 -#define MSR_IA32_DS_AREA 0x600 -#define MSR_IA32_PERF_CAPABILITIES 0x345 - -/* Pentium IV performance counter MSRs */ -#define MSR_P4_BPU_PERFCTR0 0x300 -#define MSR_P4_BPU_PERFCTR1 0x301 -#define MSR_P4_BPU_PERFCTR2 0x302 -#define MSR_P4_BPU_PERFCTR3 0x303 -#define MSR_P4_MS_PERFCTR0 0x304 -#define MSR_P4_MS_PERFCTR1 0x305 -#define MSR_P4_MS_PERFCTR2 0x306 -#define MSR_P4_MS_PERFCTR3 0x307 -#define MSR_P4_FLAME_PERFCTR0 0x308 -#define MSR_P4_FLAME_PERFCTR1 0x309 -#define MSR_P4_FLAME_PERFCTR2 0x30a -#define MSR_P4_FLAME_PERFCTR3 0x30b -#define MSR_P4_IQ_PERFCTR0 0x30c -#define MSR_P4_IQ_PERFCTR1 0x30d -#define MSR_P4_IQ_PERFCTR2 0x30e -#define MSR_P4_IQ_PERFCTR3 0x30f -#define MSR_P4_IQ_PERFCTR4 0x310 -#define MSR_P4_IQ_PERFCTR5 0x311 -#define MSR_P4_BPU_CCCR0 0x360 -#define MSR_P4_BPU_CCCR1 0x361 -#define MSR_P4_BPU_CCCR2 0x362 -#define MSR_P4_BPU_CCCR3 0x363 -#define MSR_P4_MS_CCCR0 0x364 -#define MSR_P4_MS_CCCR1 0x365 -#define MSR_P4_MS_CCCR2 0x366 -#define MSR_P4_MS_CCCR3 0x367 -#define MSR_P4_FLAME_CCCR0 0x368 -#define MSR_P4_FLAME_CCCR1 0x369 -#define MSR_P4_FLAME_CCCR2 0x36a -#define MSR_P4_FLAME_CCCR3 0x36b -#define MSR_P4_IQ_CCCR0 0x36c -#define MSR_P4_IQ_CCCR1 0x36d -#define MSR_P4_IQ_CCCR2 0x36e -#define MSR_P4_IQ_CCCR3 0x36f -#define MSR_P4_IQ_CCCR4 0x370 -#define MSR_P4_IQ_CCCR5 0x371 -#define MSR_P4_ALF_ESCR0 0x3ca -#define MSR_P4_ALF_ESCR1 0x3cb -#define MSR_P4_BPU_ESCR0 0x3b2 -#define MSR_P4_BPU_ESCR1 0x3b3 -#define MSR_P4_BSU_ESCR0 0x3a0 -#define MSR_P4_BSU_ESCR1 0x3a1 -#define MSR_P4_CRU_ESCR0 0x3b8 -#define MSR_P4_CRU_ESCR1 0x3b9 -#define MSR_P4_CRU_ESCR2 0x3cc -#define MSR_P4_CRU_ESCR3 0x3cd -#define MSR_P4_CRU_ESCR4 0x3e0 -#define MSR_P4_CRU_ESCR5 0x3e1 -#define MSR_P4_DAC_ESCR0 0x3a8 -#define MSR_P4_DAC_ESCR1 0x3a9 -#define MSR_P4_FIRM_ESCR0 0x3a4 -#define MSR_P4_FIRM_ESCR1 0x3a5 -#define MSR_P4_FLAME_ESCR0 0x3a6 -#define MSR_P4_FLAME_ESCR1 0x3a7 -#define MSR_P4_FSB_ESCR0 0x3a2 -#define MSR_P4_FSB_ESCR1 0x3a3 -#define MSR_P4_IQ_ESCR0 0x3ba -#define MSR_P4_IQ_ESCR1 0x3bb -#define MSR_P4_IS_ESCR0 0x3b4 -#define MSR_P4_IS_ESCR1 0x3b5 -#define MSR_P4_ITLB_ESCR0 0x3b6 -#define MSR_P4_ITLB_ESCR1 0x3b7 -#define MSR_P4_IX_ESCR0 0x3c8 -#define MSR_P4_IX_ESCR1 0x3c9 -#define MSR_P4_MOB_ESCR0 0x3aa -#define MSR_P4_MOB_ESCR1 0x3ab -#define MSR_P4_MS_ESCR0 0x3c0 -#define MSR_P4_MS_ESCR1 0x3c1 -#define MSR_P4_PMH_ESCR0 0x3ac -#define MSR_P4_PMH_ESCR1 0x3ad -#define MSR_P4_RAT_ESCR0 0x3bc -#define MSR_P4_RAT_ESCR1 0x3bd -#define MSR_P4_SAAT_ESCR0 0x3ae -#define MSR_P4_SAAT_ESCR1 0x3af -#define MSR_P4_SSU_ESCR0 0x3be -#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */ -#define MSR_P4_TBPU_ESCR0 0x3c2 -#define MSR_P4_TBPU_ESCR1 0x3c3 -#define MSR_P4_TC_ESCR0 0x3c4 -#define MSR_P4_TC_ESCR1 0x3c5 -#define MSR_P4_U2L_ESCR0 0x3b0 -#define MSR_P4_U2L_ESCR1 0x3b1 - -/* AMD Defined MSRs */ -#define MSR_K6_EFER 0xC0000080 -#define MSR_K6_STAR 0xC0000081 -#define MSR_K6_WHCR 0xC0000082 -#define MSR_K6_UWCCR 0xC0000085 -#define MSR_K6_EPMR 0xC0000086 -#define MSR_K6_PSOR 0xC0000087 -#define MSR_K6_PFIR 0xC0000088 - -#define MSR_K7_EVNTSEL0 0xC0010000 -#define MSR_K7_EVNTSEL1 0xC0010001 -#define MSR_K7_EVNTSEL2 0xC0010002 -#define MSR_K7_EVNTSEL3 0xC0010003 -#define MSR_K7_PERFCTR0 0xC0010004 -#define MSR_K7_PERFCTR1 0xC0010005 -#define MSR_K7_PERFCTR2 0xC0010006 -#define MSR_K7_PERFCTR3 0xC0010007 -#define MSR_K7_HWCR 0xC0010015 -#define MSR_K7_CLK_CTL 0xC001001b -#define MSR_K7_FID_VID_CTL 0xC0010041 -#define MSR_K7_FID_VID_STATUS 0xC0010042 - -#define MSR_K8_ENABLE_C1E 0xC0010055 - -/* extended feature register */ -#define MSR_EFER 0xc0000080 - -/* EFER bits: */ - -/* Execute Disable enable */ -#define _EFER_NX 11 -#define EFER_NX (1<<_EFER_NX) - -/* Centaur-Hauls/IDT defined MSRs. */ -#define MSR_IDT_FCR1 0x107 -#define MSR_IDT_FCR2 0x108 -#define MSR_IDT_FCR3 0x109 -#define MSR_IDT_FCR4 0x10a - -#define MSR_IDT_MCR0 0x110 -#define MSR_IDT_MCR1 0x111 -#define MSR_IDT_MCR2 0x112 -#define MSR_IDT_MCR3 0x113 -#define MSR_IDT_MCR4 0x114 -#define MSR_IDT_MCR5 0x115 -#define MSR_IDT_MCR6 0x116 -#define MSR_IDT_MCR7 0x117 -#define MSR_IDT_MCR_CTRL 0x120 - -/* VIA Cyrix defined MSRs*/ -#define MSR_VIA_FCR 0x1107 -#define MSR_VIA_LONGHAUL 0x110a -#define MSR_VIA_RNG 0x110b -#define MSR_VIA_BCR2 0x1147 - -/* Transmeta defined MSRs */ -#define MSR_TMTA_LONGRUN_CTRL 0x80868010 -#define MSR_TMTA_LONGRUN_FLAGS 0x80868011 -#define MSR_TMTA_LRTI_READOUT 0x80868018 -#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a - -/* Intel Core-based CPU performance counters */ -#define MSR_CORE_PERF_FIXED_CTR0 0x309 -#define MSR_CORE_PERF_FIXED_CTR1 0x30a -#define MSR_CORE_PERF_FIXED_CTR2 0x30b -#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x38d -#define MSR_CORE_PERF_GLOBAL_STATUS 0x38e -#define MSR_CORE_PERF_GLOBAL_CTRL 0x38f -#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390 - -/* Geode defined MSRs */ -#define MSR_GEODE_BUSCONT_CONF0 0x1900 - +#endif +#endif #endif /* __ASM_MSR_H */ diff --git a/include/asm-i386/processor-flags.h b/include/asm-i386/processor-flags.h index b4711c222e2b..5404e90edd57 100644 --- a/include/asm-i386/processor-flags.h +++ b/include/asm-i386/processor-flags.h @@ -23,4 +23,69 @@ #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +/* + * Basic CPU control in CR0 + */ +#define X86_CR0_PE 0x00000001 /* Protection Enable */ +#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */ +#define X86_CR0_EM 0x00000004 /* Emulation */ +#define X86_CR0_TS 0x00000008 /* Task Switched */ +#define X86_CR0_ET 0x00000010 /* Extension Type */ +#define X86_CR0_NE 0x00000020 /* Numeric Error */ +#define X86_CR0_WP 0x00010000 /* Write Protect */ +#define X86_CR0_AM 0x00040000 /* Alignment Mask */ +#define X86_CR0_NW 0x20000000 /* Not Write-through */ +#define X86_CR0_CD 0x40000000 /* Cache Disable */ +#define X86_CR0_PG 0x80000000 /* Paging */ + +/* + * Paging options in CR3 + */ +#define X86_CR3_PWT 0x00000008 /* Page Write Through */ +#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x00000008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x00000010 /* enable page size extensions */ +#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x00000040 /* Machine check enable */ +#define X86_CR4_PGE 0x00000080 /* enable global pages */ +#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ +#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ +#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ + +/* + * x86-64 Task Priority Register, CR8 + */ +#define X86_CR8_TPR 0x00000007 /* task priority register */ + +/* + * AMD and Transmeta use MSRs for configuration; see + */ + +/* + * NSC/Cyrix CPU configuration register indexes + */ +#define CX86_PCR0 0x20 +#define CX86_GCR 0xb8 +#define CX86_CCR0 0xc0 +#define CX86_CCR1 0xc1 +#define CX86_CCR2 0xc2 +#define CX86_CCR3 0xc3 +#define CX86_CCR4 0xe8 +#define CX86_CCR5 0xe9 +#define CX86_CCR6 0xea +#define CX86_CCR7 0xeb +#define CX86_PCR1 0xf0 +#define CX86_DIR0 0xfe +#define CX86_DIR1 0xff +#define CX86_ARR_BASE 0xc4 +#define CX86_RCR_BASE 0xdc + #endif /* __ASM_I386_PROCESSOR_FLAGS_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 882d3f8fbbac..77e263267aa6 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -142,21 +142,6 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, #define load_cr3(pgdir) write_cr3(__pa(pgdir)) -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - /* * Save the cr4 feature set we're using (ie * Pentium 4MB enable and PPro Global page @@ -183,26 +168,6 @@ static inline void clear_in_cr4 (unsigned long mask) write_cr4(cr4); } -/* - * NSC/Cyrix CPU configuration register indexes - */ - -#define CX86_PCR0 0x20 -#define CX86_GCR 0xb8 -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_PCR1 0xf0 -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - /* * NSC/Cyrix CPU indexed register access macros */ diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild index 242296ede3dd..89ad1fc27c8b 100644 --- a/include/asm-x86_64/Kbuild +++ b/include/asm-x86_64/Kbuild @@ -8,7 +8,7 @@ header-y += boot.h header-y += bootsetup.h header-y += debugreg.h header-y += ldt.h -header-y += msr.h +header-y += msr-index.h header-y += prctl.h header-y += ptrace-abi.h header-y += sigcontext32.h @@ -16,6 +16,7 @@ header-y += ucontext.h header-y += vsyscall32.h unifdef-y += mce.h +unifdef-y += msr.h unifdef-y += mtrr.h unifdef-y += vsyscall.h unifdef-y += const.h diff --git a/include/asm-x86_64/msr-index.h b/include/asm-x86_64/msr-index.h new file mode 100644 index 000000000000..d77a63f1ddf2 --- /dev/null +++ b/include/asm-x86_64/msr-index.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h index 902f9a58617e..a524f0325673 100644 --- a/include/asm-x86_64/msr.h +++ b/include/asm-x86_64/msr.h @@ -1,6 +1,8 @@ #ifndef X86_64_MSR_H #define X86_64_MSR_H 1 +#include + #ifndef __ASSEMBLY__ /* * Access to machine-specific registers (available on 586 and better only) @@ -157,9 +159,6 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } -#define MSR_IA32_UCODE_WRITE 0x79 -#define MSR_IA32_UCODE_REV 0x8b - #ifdef CONFIG_SMP void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); @@ -172,269 +171,6 @@ static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { wrmsr(msr_no, l, h); } -#endif /* CONFIG_SMP */ - -#endif - -/* AMD/K8 specific MSRs */ -#define MSR_EFER 0xc0000080 /* extended feature register */ -#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ -#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ -#define MSR_CSTAR 0xc0000083 /* compatibility mode SYSCALL target */ -#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ -#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ -#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ -#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ -/* EFER bits: */ -#define _EFER_SCE 0 /* SYSCALL/SYSRET */ -#define _EFER_LME 8 /* Long mode enable */ -#define _EFER_LMA 10 /* Long mode active (read-only) */ -#define _EFER_NX 11 /* No execute enable */ - -#define EFER_SCE (1<<_EFER_SCE) -#define EFER_LME (1<<_EFER_LME) -#define EFER_LMA (1<<_EFER_LMA) -#define EFER_NX (1<<_EFER_NX) - -/* Intel MSRs. Some also available on other CPUs */ -#define MSR_IA32_TSC 0x10 -#define MSR_IA32_PLATFORM_ID 0x17 - -#define MSR_IA32_PERFCTR0 0xc1 -#define MSR_IA32_PERFCTR1 0xc2 -#define MSR_FSB_FREQ 0xcd - -#define MSR_MTRRcap 0x0fe -#define MSR_IA32_BBL_CR_CTL 0x119 - -#define MSR_IA32_SYSENTER_CS 0x174 -#define MSR_IA32_SYSENTER_ESP 0x175 -#define MSR_IA32_SYSENTER_EIP 0x176 - -#define MSR_IA32_MCG_CAP 0x179 -#define MSR_IA32_MCG_STATUS 0x17a -#define MSR_IA32_MCG_CTL 0x17b - -#define MSR_IA32_EVNTSEL0 0x186 -#define MSR_IA32_EVNTSEL1 0x187 - -#define MSR_IA32_DEBUGCTLMSR 0x1d9 -#define MSR_IA32_LASTBRANCHFROMIP 0x1db -#define MSR_IA32_LASTBRANCHTOIP 0x1dc -#define MSR_IA32_LASTINTFROMIP 0x1dd -#define MSR_IA32_LASTINTTOIP 0x1de - -#define MSR_IA32_PEBS_ENABLE 0x3f1 -#define MSR_IA32_DS_AREA 0x600 -#define MSR_IA32_PERF_CAPABILITIES 0x345 - -#define MSR_MTRRfix64K_00000 0x250 -#define MSR_MTRRfix16K_80000 0x258 -#define MSR_MTRRfix16K_A0000 0x259 -#define MSR_MTRRfix4K_C0000 0x268 -#define MSR_MTRRfix4K_C8000 0x269 -#define MSR_MTRRfix4K_D0000 0x26a -#define MSR_MTRRfix4K_D8000 0x26b -#define MSR_MTRRfix4K_E0000 0x26c -#define MSR_MTRRfix4K_E8000 0x26d -#define MSR_MTRRfix4K_F0000 0x26e -#define MSR_MTRRfix4K_F8000 0x26f -#define MSR_MTRRdefType 0x2ff - -#define MSR_IA32_MC0_CTL 0x400 -#define MSR_IA32_MC0_STATUS 0x401 -#define MSR_IA32_MC0_ADDR 0x402 -#define MSR_IA32_MC0_MISC 0x403 - -#define MSR_P6_PERFCTR0 0xc1 -#define MSR_P6_PERFCTR1 0xc2 -#define MSR_P6_EVNTSEL0 0x186 -#define MSR_P6_EVNTSEL1 0x187 - -/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */ -#define MSR_K7_EVNTSEL0 0xC0010000 -#define MSR_K7_PERFCTR0 0xC0010004 -#define MSR_K7_EVNTSEL1 0xC0010001 -#define MSR_K7_PERFCTR1 0xC0010005 -#define MSR_K7_EVNTSEL2 0xC0010002 -#define MSR_K7_PERFCTR2 0xC0010006 -#define MSR_K7_EVNTSEL3 0xC0010003 -#define MSR_K7_PERFCTR3 0xC0010007 -#define MSR_K8_TOP_MEM1 0xC001001A -#define MSR_K8_TOP_MEM2 0xC001001D -#define MSR_K8_SYSCFG 0xC0010010 -#define MSR_K8_HWCR 0xC0010015 - -/* K6 MSRs */ -#define MSR_K6_EFER 0xC0000080 -#define MSR_K6_STAR 0xC0000081 -#define MSR_K6_WHCR 0xC0000082 -#define MSR_K6_UWCCR 0xC0000085 -#define MSR_K6_PSOR 0xC0000087 -#define MSR_K6_PFIR 0xC0000088 - -/* Centaur-Hauls/IDT defined MSRs. */ -#define MSR_IDT_FCR1 0x107 -#define MSR_IDT_FCR2 0x108 -#define MSR_IDT_FCR3 0x109 -#define MSR_IDT_FCR4 0x10a - -#define MSR_IDT_MCR0 0x110 -#define MSR_IDT_MCR1 0x111 -#define MSR_IDT_MCR2 0x112 -#define MSR_IDT_MCR3 0x113 -#define MSR_IDT_MCR4 0x114 -#define MSR_IDT_MCR5 0x115 -#define MSR_IDT_MCR6 0x116 -#define MSR_IDT_MCR7 0x117 -#define MSR_IDT_MCR_CTRL 0x120 - -/* VIA Cyrix defined MSRs*/ -#define MSR_VIA_FCR 0x1107 -#define MSR_VIA_LONGHAUL 0x110a -#define MSR_VIA_RNG 0x110b -#define MSR_VIA_BCR2 0x1147 - -/* Intel defined MSRs. */ -#define MSR_IA32_P5_MC_ADDR 0 -#define MSR_IA32_P5_MC_TYPE 1 -#define MSR_IA32_PLATFORM_ID 0x17 -#define MSR_IA32_EBL_CR_POWERON 0x2a - -#define MSR_IA32_APICBASE 0x1b -#define MSR_IA32_APICBASE_BSP (1<<8) -#define MSR_IA32_APICBASE_ENABLE (1<<11) -#define MSR_IA32_APICBASE_BASE (0xfffff<<12) - -/* P4/Xeon+ specific */ -#define MSR_IA32_MCG_EAX 0x180 -#define MSR_IA32_MCG_EBX 0x181 -#define MSR_IA32_MCG_ECX 0x182 -#define MSR_IA32_MCG_EDX 0x183 -#define MSR_IA32_MCG_ESI 0x184 -#define MSR_IA32_MCG_EDI 0x185 -#define MSR_IA32_MCG_EBP 0x186 -#define MSR_IA32_MCG_ESP 0x187 -#define MSR_IA32_MCG_EFLAGS 0x188 -#define MSR_IA32_MCG_EIP 0x189 -#define MSR_IA32_MCG_RESERVED 0x18A - -#define MSR_P6_EVNTSEL0 0x186 -#define MSR_P6_EVNTSEL1 0x187 - -#define MSR_IA32_PERF_STATUS 0x198 -#define MSR_IA32_PERF_CTL 0x199 - -#define MSR_IA32_MPERF 0xE7 -#define MSR_IA32_APERF 0xE8 - -#define MSR_IA32_THERM_CONTROL 0x19a -#define MSR_IA32_THERM_INTERRUPT 0x19b -#define MSR_IA32_THERM_STATUS 0x19c -#define MSR_IA32_MISC_ENABLE 0x1a0 - -#define MSR_IA32_DEBUGCTLMSR 0x1d9 -#define MSR_IA32_LASTBRANCHFROMIP 0x1db -#define MSR_IA32_LASTBRANCHTOIP 0x1dc -#define MSR_IA32_LASTINTFROMIP 0x1dd -#define MSR_IA32_LASTINTTOIP 0x1de - -#define MSR_IA32_MC0_CTL 0x400 -#define MSR_IA32_MC0_STATUS 0x401 -#define MSR_IA32_MC0_ADDR 0x402 -#define MSR_IA32_MC0_MISC 0x403 - -/* Pentium IV performance counter MSRs */ -#define MSR_P4_BPU_PERFCTR0 0x300 -#define MSR_P4_BPU_PERFCTR1 0x301 -#define MSR_P4_BPU_PERFCTR2 0x302 -#define MSR_P4_BPU_PERFCTR3 0x303 -#define MSR_P4_MS_PERFCTR0 0x304 -#define MSR_P4_MS_PERFCTR1 0x305 -#define MSR_P4_MS_PERFCTR2 0x306 -#define MSR_P4_MS_PERFCTR3 0x307 -#define MSR_P4_FLAME_PERFCTR0 0x308 -#define MSR_P4_FLAME_PERFCTR1 0x309 -#define MSR_P4_FLAME_PERFCTR2 0x30a -#define MSR_P4_FLAME_PERFCTR3 0x30b -#define MSR_P4_IQ_PERFCTR0 0x30c -#define MSR_P4_IQ_PERFCTR1 0x30d -#define MSR_P4_IQ_PERFCTR2 0x30e -#define MSR_P4_IQ_PERFCTR3 0x30f -#define MSR_P4_IQ_PERFCTR4 0x310 -#define MSR_P4_IQ_PERFCTR5 0x311 -#define MSR_P4_BPU_CCCR0 0x360 -#define MSR_P4_BPU_CCCR1 0x361 -#define MSR_P4_BPU_CCCR2 0x362 -#define MSR_P4_BPU_CCCR3 0x363 -#define MSR_P4_MS_CCCR0 0x364 -#define MSR_P4_MS_CCCR1 0x365 -#define MSR_P4_MS_CCCR2 0x366 -#define MSR_P4_MS_CCCR3 0x367 -#define MSR_P4_FLAME_CCCR0 0x368 -#define MSR_P4_FLAME_CCCR1 0x369 -#define MSR_P4_FLAME_CCCR2 0x36a -#define MSR_P4_FLAME_CCCR3 0x36b -#define MSR_P4_IQ_CCCR0 0x36c -#define MSR_P4_IQ_CCCR1 0x36d -#define MSR_P4_IQ_CCCR2 0x36e -#define MSR_P4_IQ_CCCR3 0x36f -#define MSR_P4_IQ_CCCR4 0x370 -#define MSR_P4_IQ_CCCR5 0x371 -#define MSR_P4_ALF_ESCR0 0x3ca -#define MSR_P4_ALF_ESCR1 0x3cb -#define MSR_P4_BPU_ESCR0 0x3b2 -#define MSR_P4_BPU_ESCR1 0x3b3 -#define MSR_P4_BSU_ESCR0 0x3a0 -#define MSR_P4_BSU_ESCR1 0x3a1 -#define MSR_P4_CRU_ESCR0 0x3b8 -#define MSR_P4_CRU_ESCR1 0x3b9 -#define MSR_P4_CRU_ESCR2 0x3cc -#define MSR_P4_CRU_ESCR3 0x3cd -#define MSR_P4_CRU_ESCR4 0x3e0 -#define MSR_P4_CRU_ESCR5 0x3e1 -#define MSR_P4_DAC_ESCR0 0x3a8 -#define MSR_P4_DAC_ESCR1 0x3a9 -#define MSR_P4_FIRM_ESCR0 0x3a4 -#define MSR_P4_FIRM_ESCR1 0x3a5 -#define MSR_P4_FLAME_ESCR0 0x3a6 -#define MSR_P4_FLAME_ESCR1 0x3a7 -#define MSR_P4_FSB_ESCR0 0x3a2 -#define MSR_P4_FSB_ESCR1 0x3a3 -#define MSR_P4_IQ_ESCR0 0x3ba -#define MSR_P4_IQ_ESCR1 0x3bb -#define MSR_P4_IS_ESCR0 0x3b4 -#define MSR_P4_IS_ESCR1 0x3b5 -#define MSR_P4_ITLB_ESCR0 0x3b6 -#define MSR_P4_ITLB_ESCR1 0x3b7 -#define MSR_P4_IX_ESCR0 0x3c8 -#define MSR_P4_IX_ESCR1 0x3c9 -#define MSR_P4_MOB_ESCR0 0x3aa -#define MSR_P4_MOB_ESCR1 0x3ab -#define MSR_P4_MS_ESCR0 0x3c0 -#define MSR_P4_MS_ESCR1 0x3c1 -#define MSR_P4_PMH_ESCR0 0x3ac -#define MSR_P4_PMH_ESCR1 0x3ad -#define MSR_P4_RAT_ESCR0 0x3bc -#define MSR_P4_RAT_ESCR1 0x3bd -#define MSR_P4_SAAT_ESCR0 0x3ae -#define MSR_P4_SAAT_ESCR1 0x3af -#define MSR_P4_SSU_ESCR0 0x3be -#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */ -#define MSR_P4_TBPU_ESCR0 0x3c2 -#define MSR_P4_TBPU_ESCR1 0x3c3 -#define MSR_P4_TC_ESCR0 0x3c4 -#define MSR_P4_TC_ESCR1 0x3c5 -#define MSR_P4_U2L_ESCR0 0x3b0 -#define MSR_P4_U2L_ESCR1 0x3b1 - -/* Intel Core-based CPU performance counters */ -#define MSR_CORE_PERF_FIXED_CTR0 0x309 -#define MSR_CORE_PERF_FIXED_CTR1 0x30a -#define MSR_CORE_PERF_FIXED_CTR2 0x30b -#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x38d -#define MSR_CORE_PERF_GLOBAL_STATUS 0x38e -#define MSR_CORE_PERF_GLOBAL_CTRL 0x38f -#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390 - -#endif +#endif /* CONFIG_SMP */ +#endif /* __ASSEMBLY__ */ +#endif /* X86_64_MSR_H */ diff --git a/include/asm-x86_64/processor-flags.h b/include/asm-x86_64/processor-flags.h index 806112fb7985..ec99a57b2c6a 100644 --- a/include/asm-x86_64/processor-flags.h +++ b/include/asm-x86_64/processor-flags.h @@ -1,26 +1 @@ -#ifndef __ASM_X86_64_PROCESSOR_FLAGS_H -#define __ASM_X86_64_PROCESSOR_FLAGS_H -/* Various flags defined: can be included from assembler. */ - -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - -#endif /* __ASM_X86_64_PROCESSOR_FLAGS_H */ +#include diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 6a117349a5de..461ffe4c1fcc 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -103,21 +103,6 @@ extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - /* * Save the cr4 feature set we're using (ie * Pentium 4MB enable and PPro Global page @@ -406,22 +391,6 @@ static inline void prefetchw(void *x) #define cpu_relax() rep_nop() -/* - * NSC/Cyrix CPU configuration register indexes - */ -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - /* * NSC/Cyrix CPU indexed register access macros */ From d0175ab64412aabc93da8682aaa99124d6815056 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 124/231] [PATCH] i386: Remove smp_alt_instructions The .smp_altinstructions section and its corresponding symbols are completely unused, so remove them. Also, remove stray #ifdef __KENREL__ in asm-i386/alternative.h Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen --- arch/i386/kernel/alternative.c | 38 ++-------------------------------- arch/i386/kernel/vmlinux.lds.S | 11 ---------- include/asm-i386/alternative.h | 6 +----- 3 files changed, 3 insertions(+), 52 deletions(-) diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index a27c8d347364..f09635408049 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -132,11 +132,8 @@ static void nop_out(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; -extern u8 __smp_alt_begin[], __smp_alt_end[]; - /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with self modifying code. This implies that assymetric systems where @@ -171,29 +168,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) #ifdef CONFIG_SMP -static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end) -{ - struct alt_instr *a; - - DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end); - for (a = start; a < end; a++) { - memcpy(a->replacement + a->replacementlen, - a->instr, - a->instrlen); - } -} - -static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end) -{ - struct alt_instr *a; - - for (a = start; a < end; a++) { - memcpy(a->instr, - a->replacement + a->replacementlen, - a->instrlen); - } -} - static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; @@ -319,8 +293,6 @@ void alternatives_smp_switch(int smp) printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - alternatives_smp_apply(__smp_alt_instructions, - __smp_alt_instructions_end); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -328,8 +300,6 @@ void alternatives_smp_switch(int smp) printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - apply_alternatives(__smp_alt_instructions, - __smp_alt_instructions_end); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -396,17 +366,13 @@ void __init alternative_instructions(void) printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - apply_alternatives(__smp_alt_instructions, - __smp_alt_instructions_end); alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } free_init_pages("SMP alternatives", - __pa_symbol(&__smp_alt_begin), - __pa_symbol(&__smp_alt_end)); + __pa_symbol(&__smp_locks), + __pa_symbol(&__smp_locks_end)); } else { - alternatives_smp_save(__smp_alt_instructions, - __smp_alt_instructions_end); alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 97fe6eac47c9..2ce4aa185fc8 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -117,22 +117,11 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); - .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { - __smp_alt_begin = .; - __smp_alt_instructions = .; - *(.smp_altinstructions) - __smp_alt_instructions_end = .; - } - . = ALIGN(4); .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) __smp_locks_end = .; } - .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { - *(.smp_altinstr_replacement) - __smp_alt_end = .; - } /* will be freed after init * Following ALIGN() is required to make sure no other data falls on the * same page where __smp_alt_end is pointing as that page might be freed diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index dbc1a29284f3..4d518eebe461 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -1,8 +1,6 @@ #ifndef _I386_ALTERNATIVE_H #define _I386_ALTERNATIVE_H -#ifdef __KERNEL__ - #include #include #include @@ -32,9 +30,7 @@ static inline void alternatives_smp_module_add(struct module *mod, char *name, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} -#endif - -#endif +#endif /* CONFIG_SMP */ /* * Alternative instructions for different CPU types or capabilities. From b7fb4af06c18496950a45b365f7a09c47ea64c17 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 125/231] [PATCH] i386: Allow boot-time disable of SMP altinstructions Add "noreplace-smp" to disable SMP instruction replacement. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- Documentation/kernel-parameters.txt | 6 ++++++ arch/i386/kernel/alternative.c | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 94ce0d20253d..242b3a09b6ce 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1164,6 +1164,9 @@ and is between 256 and 4096 characters. It is defined in the file nomce [IA-32] Machine Check Exception + noreplace-smp [IA-32,SMP] Don't replace SMP instructions + with UP alternatives + noresidual [PPC] Don't use residual data on PReP machines. noresume [SWSUSP] Disables resume and restores original swap @@ -1569,6 +1572,9 @@ and is between 256 and 4096 characters. It is defined in the file smart2= [HW] Format: [,[,...,]] + smp-alt-once [IA-32,SMP] On a hotplug CPU system, only + attempt to substitute SMP alternatives once at boot. + snd-ad1816a= [HW,ALSA] snd-ad1848= [HW,ALSA] diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index f09635408049..9b8e85a8edec 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -5,6 +5,7 @@ #include #include +static int noreplace_smp = 0; static int smp_alt_once = 0; static int debug_alternative = 0; @@ -13,15 +14,23 @@ static int __init bootonly(char *str) smp_alt_once = 1; return 1; } +__setup("smp-alt-boot", bootonly); + static int __init debug_alt(char *str) { debug_alternative = 1; return 1; } - -__setup("smp-alt-boot", bootonly); __setup("debug-alternative", debug_alt); +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + + #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) @@ -185,6 +194,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end { u8 **ptr; + if (noreplace_smp) + return; + for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -219,6 +231,9 @@ void alternatives_smp_module_add(struct module *mod, char *name, struct smp_alt_module *smp; unsigned long flags; + if (noreplace_smp) + return; + if (smp_alt_once) { if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(locks, locks_end, @@ -253,7 +268,7 @@ void alternatives_smp_module_del(struct module *mod) struct smp_alt_module *item; unsigned long flags; - if (smp_alt_once) + if (smp_alt_once || noreplace_smp) return; spin_lock_irqsave(&smp_alt, flags); @@ -284,7 +299,7 @@ void alternatives_smp_switch(int smp) return; #endif - if (smp_alt_once) + if (noreplace_smp || smp_alt_once) return; BUG_ON(!smp && (num_online_cpus() > 1)); From 1c3d99c11c47c8a1a9ed6a46555dbf6520683c52 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 126/231] [PATCH] x86-64: Fix x86_64 compilation with DEBUG_SIG on Setting the DEBUG_SIG flag breaks compilation due to a wrong struct access. Aditionally, it raises two warnings. This is one patch to fix them all. Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andi Kleen --- arch/x86_64/kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 49ec324cd141..c819625f3316 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -141,7 +141,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); + printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); #endif if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) @@ -301,7 +301,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif @@ -463,7 +463,7 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", + printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); #endif From 82d1bb725e128c97b362a4b33fcbfff08fdaaa5a Mon Sep 17 00:00:00 2001 From: James Puthukattukaran Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 127/231] [PATCH] x86-64: x86-64 system crashes when no memory populating Node 0 I have a 4 socket AMD Operton system. The 2.6.18 kernel I have crashes when there is no memory in node0. AK: changed call to _nopanic Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/aperture.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index b487396c4c5b..a52af5820592 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -51,7 +51,6 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { - pg_data_t *nd0 = NODE_DATA(0); u32 aper_size; void *p; @@ -65,12 +64,12 @@ static u32 __init allocate_aperture(void) * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ - p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); + p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { printk("Cannot allocate aperture memory hole (%p,%uK)\n", p, aper_size>>10); if (p) - free_bootmem_node(nd0, __pa(p), aper_size); + free_bootmem(__pa(p), aper_size); return 0; } printk("Mapping aperture over %d KB of RAM @ %lx\n", From a75c54f933bd8db9f4a609bd128663c179b3e6a1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 128/231] [PATCH] i386: i386 separate hardware-defined TSS from Linux additions On Thu, 2007-03-29 at 13:16 +0200, Andi Kleen wrote: > Please clean it up properly with two structs. Not sure about this, now I've done it. Running it here. If you like it, I can do x86-64 as well. == lguest defines its own TSS struct because the "struct tss_struct" contains linux-specific additions. Andi asked me to split the struct in processor.h. Unfortunately it makes usage a little awkward. Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/asm-offsets.c | 2 +- arch/i386/kernel/doublefault.c | 29 ++++++++++++++++------------- arch/i386/kernel/ioport.c | 2 +- arch/i386/kernel/process.c | 8 ++++---- arch/i386/kernel/sysenter.c | 6 +++--- arch/i386/kernel/traps.c | 4 ++-- arch/i386/kernel/vmi.c | 8 ++++---- include/asm-i386/processor.h | 24 ++++++++++++++++-------- 8 files changed, 47 insertions(+), 36 deletions(-) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 655cc8d4c745..d558adfc293c 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -93,7 +93,7 @@ void foo(void) OFFSET(pbe_next, pbe, next); /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - + DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c index b4d14c2eb345..265c5597efb0 100644 --- a/arch/i386/kernel/doublefault.c +++ b/arch/i386/kernel/doublefault.c @@ -33,7 +33,7 @@ static void doublefault_fn(void) printk("double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { - struct tss_struct *t = (struct tss_struct *)tss; + struct i386_hw_tss *t = (struct i386_hw_tss *)tss; printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp); @@ -49,18 +49,21 @@ static void doublefault_fn(void) } struct tss_struct doublefault_tss __cacheline_aligned = { - .esp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + .x86_tss = { + .esp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .eip = (unsigned long) doublefault_fn, - .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */ - .esp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, + .eip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .eflags = X86_EFLAGS_SF | 0x2, + .esp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __pa(swapper_pg_dir) + } }; diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 1b4530e6cd82..d1e42e0dbe67 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -114,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * Reset the owner so that a process switch will not set * tss->io_bitmap_base to IO_BITMAP_OFFSET. */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; tss->io_bitmap_owner = NULL; put_cpu(); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 7e8e129b3d7d..5fb9524c6f4b 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -375,7 +375,7 @@ void exit_thread(void) t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; tss->io_bitmap_max = 0; - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } } @@ -554,7 +554,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * Disable the bitmap via an invalid offset. We still cache * the previous bitmap owner and the IO bitmap contents: */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; return; } @@ -564,7 +564,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * matches the next task, we dont have to do anything but * to set a valid offset in the TSS: */ - tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; return; } /* @@ -576,7 +576,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * redundant copies when the currently switched task does not * perform any I/O during its timeslice. */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; } /* diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 0b9768ee1e8d..94defac6fc3d 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -183,10 +183,10 @@ void enable_sep_cpu(void) return; } - tss->ss1 = __KERNEL_CS; - tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); put_cpu(); } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 8722444cacaa..e0a23bee6967 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -596,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, * and we set the offset field correctly. Then we let the CPU to * restart the faulting instruction. */ - if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && thread->io_bitmap_ptr) { memcpy(tss->io_bitmap, thread->io_bitmap_ptr, thread->io_bitmap_max); @@ -609,7 +609,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, thread->io_bitmap_max, 0xff, tss->io_bitmap_max - thread->io_bitmap_max); tss->io_bitmap_max = thread->io_bitmap_max; - tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; tss->io_bitmap_owner = thread; put_cpu(); return; diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 626c82063d19..8f3bac473450 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -230,14 +230,14 @@ static void vmi_set_tr(void) static void vmi_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->esp0 = thread->esp0; + tss->x86_tss.esp0 = thread->esp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); } static void vmi_flush_tlb_user(void) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 77e263267aa6..922260474646 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -291,7 +291,8 @@ typedef struct { struct thread_struct; -struct tss_struct { +/* This is the TSS defined by the hardware. */ +struct i386_hw_tss { unsigned short back_link,__blh; unsigned long esp0; unsigned short ss0,__ss0h; @@ -315,6 +316,11 @@ struct tss_struct { unsigned short gs, __gsh; unsigned short ldt, __ldth; unsigned short trace, io_bitmap_base; +} __attribute__((packed)); + +struct tss_struct { + struct i386_hw_tss x86_tss; + /* * The extra 1 is there because the CPU will access an * additional byte beyond the end of the IO permission @@ -381,10 +387,12 @@ struct thread_struct { * be within the limit. */ #define INIT_TSS { \ - .esp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + .x86_tss = { \ + .esp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ } @@ -493,10 +501,10 @@ static inline void rep_nop(void) static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->esp0 = thread->esp0; + tss->x86_tss.esp0 = thread->esp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } } From 4cdf6bc2476157f397f3b71a9bd4e23c7a7aaf80 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 129/231] [PATCH] x86-64: update MAINTAINERS Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chris Wright Cc: Zachary Amsden Cc: Rusty Russell --- MAINTAINERS | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a57589e2d38e..4b307e6263c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2623,6 +2623,19 @@ T: git kernel.org:/pub/scm/linux/kernel/git/kyle/parisc-2.6.git T: cvs cvs.parisc-linux.org:/var/cvs/linux-2.6 S: Maintained +PARAVIRT_OPS INTERFACE +P: Jeremy Fitzhardinge +M: jeremy@xensource.com +P: Chris Wright +M: chrisw@sous-sol.org +P: Zachary Amsden +M: zach@vmware.com +P: Rusty Russell +M: rusty@rustcorp.com.au +L: virtualization@lists.osdl.org +L: linux-kernel@vger.kernel.org +S: Supported + PC87360 HARDWARE MONITORING DRIVER P: Jim Cromie M: jim.cromie@gmail.com @@ -3847,6 +3860,15 @@ M: eis@baty.hanse.de L: linux-x25@vger.kernel.org S: Maintained +XEN HYPERVISOR INTERFACE +P: Jeremy Fitzhardinge +M: jeremy@xensource.com +P: Chris Wright +M: chrisw@sous-sol.org +L: virtualization@lists.osdl.org +L: xen-devel@lists.xensource.com +S: Supported + XFS FILESYSTEM P: Silicon Graphics Inc P: Tim Shimmin, David Chatterton From 7f63c41c6c57371a0931da3940c6620c2301442c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 130/231] [PATCH] i386: PARAVIRT: Remove CONFIG_DEBUG_PARAVIRT Remove CONFIG_DEBUG_PARAVIRT. When inlining code, this option attempts to trash registers in the patch-site's "clobber" field, on the grounds that this should find bugs with incorrect clobbers. Unfortunately, the clobber field really means "registers modified by this patch site", which includes return values. Because of this, this option has outlived its usefulness, so remove it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell --- arch/i386/Kconfig.debug | 10 ---------- arch/i386/kernel/alternative.c | 14 +------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index 458bc1611933..b31c0802e1cc 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -85,14 +85,4 @@ config DOUBLEFAULT option saves about 4k and might cause you much additional grey hair. -config DEBUG_PARAVIRT - bool "Enable some paravirtualization debugging" - default n - depends on PARAVIRT && DEBUG_KERNEL - help - Currently deliberately clobbers regs which are allowed to be - clobbered in inlined paravirt hooks, even in native mode. - If turning this off solves a problem, then DISABLE_INTERRUPTS() or - ENABLE_INTERRUPTS() is lying about what registers can be clobbered. - endmenu diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 9b8e85a8edec..915b6c4d9baf 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -334,19 +334,7 @@ void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, p->len); -#ifdef CONFIG_DEBUG_PARAVIRT - { - int i; - /* Deliberately clobber regs using "not %reg" to find bugs. */ - for (i = 0; i < 3; i++) { - if (p->len - used >= 2 && (p->clobbers & (1 << i))) { - memcpy(p->instr + used, "\xf7\xd0", 2); - p->instr[used+1] |= i; - used += 2; - } - } - } -#endif + /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } From 45876233605c268e929a7875081e129debe34bdc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 131/231] [PATCH] i386: PARAVIRT: use paravirt_nop to consistently mark no-op operations Add a _paravirt_nop function for use as a stub for no-op operations, and paravirt_nop #defined void * version to make using it easier (since all its uses are as a void *). This is useful to allow the patcher to automatically identify noop operations so it can simply nop out the callsite. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: Ingo Molnar [mingo] but only as a cleanup of the current open-coded (void *) casts. My problem with this is that it loses the types. Not that there is much to check for, but still, this adds some assumptions about how function calls look like --- arch/i386/kernel/paravirt.c | 24 ++++++++++++------------ include/asm-i386/paravirt.h | 3 +++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 47698756aec5..3fdbd1f62379 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -35,7 +35,7 @@ #include /* nop stub */ -static void native_nop(void) +void _paravirt_nop(void) { } @@ -207,7 +207,7 @@ struct paravirt_ops paravirt_ops = { .patch = native_patch, .banner = default_banner, - .arch_setup = native_nop, + .arch_setup = paravirt_nop, .memory_setup = machine_specific_memory_setup, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, @@ -263,25 +263,25 @@ struct paravirt_ops paravirt_ops = { .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, #endif - .set_lazy_mode = (void *)native_nop, + .set_lazy_mode = paravirt_nop, .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, - .map_pt_hook = (void *)native_nop, + .map_pt_hook = paravirt_nop, - .alloc_pt = (void *)native_nop, - .alloc_pd = (void *)native_nop, - .alloc_pd_clone = (void *)native_nop, - .release_pt = (void *)native_nop, - .release_pd = (void *)native_nop, + .alloc_pt = paravirt_nop, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pt = paravirt_nop, + .release_pd = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, - .pte_update = (void *)native_nop, - .pte_update_defer = (void *)native_nop, + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, @@ -293,7 +293,7 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, - .startup_ipi_hook = (void *)native_nop, + .startup_ipi_hook = paravirt_nop, }; /* diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 32acebce9ae2..f0bdaea6235d 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -434,6 +434,9 @@ static inline void pmd_clear(pmd_t *pmdp) #define arch_leave_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE) #define arch_flush_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_FLUSH) +void _paravirt_nop(void); +#define paravirt_nop ((void *)_paravirt_nop) + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch { u8 *instr; /* original instructions */ From 3dc494e86d1c93afd4c66385f270899dbfae483d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 132/231] [PATCH] i386: PARAVIRT: Add pagetable accessors to pack and unpack pagetable entries Add a set of accessors to pack, unpack and modify page table entries (at all levels). This allows a paravirt implementation to control the contents of pgd/pmd/pte entries. For example, Xen uses this to convert the (pseudo-)physical address into a machine address when populating a pagetable entry, and converting back to pphys address when an entry is read. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: Ingo Molnar --- arch/i386/kernel/paravirt.c | 84 +++++-------------------------- arch/i386/kernel/vmi.c | 6 +-- include/asm-i386/page.h | 79 +++++++++++++++++++++++++---- include/asm-i386/paravirt.h | 52 ++++++++++++++----- include/asm-i386/pgtable-2level.h | 26 ++++++++-- include/asm-i386/pgtable-3level.h | 63 ++++++++++++++--------- include/asm-i386/pgtable.h | 2 + 7 files changed, 184 insertions(+), 128 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 3fdbd1f62379..cba7a15ce1b0 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -117,78 +117,6 @@ static void native_flush_tlb_single(u32 addr) __native_flush_tlb_single(addr); } -#ifndef CONFIG_X86_PAE -static void native_set_pte(pte_t *ptep, pte_t pteval) -{ - *ptep = pteval; -} - -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) -{ - *ptep = pteval; -} - -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - *pmdp = pmdval; -} - -#else /* CONFIG_X86_PAE */ - -static void native_set_pte(pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) -{ - set_64bit((unsigned long long *)ptep,pte_val(pteval)); -} - -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); -} - -static void native_set_pud(pud_t *pudp, pud_t pudval) -{ - *pudp = pudval; -} - -static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = 0; -} - -static void native_pmd_clear(pmd_t *pmd) -{ - u32 *tmp = (u32 *)pmd; - *tmp = 0; - smp_wmb(); - *(tmp + 1) = 0; -} -#endif /* CONFIG_X86_PAE */ - /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -282,14 +210,26 @@ struct paravirt_ops paravirt_ops = { .set_pmd = native_set_pmd, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + + .ptep_get_and_clear = native_ptep_get_and_clear, + #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, .set_pud = native_set_pud, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, + + .pmd_val = native_pmd_val, + .make_pmd = native_make_pmd, #endif + .pte_val = native_pte_val, + .pgd_val = native_pgd_val, + + .make_pte = native_make_pte, + .make_pgd = native_make_pgd, + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 8f3bac473450..ea77d93f59dd 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -443,13 +443,13 @@ static void vmi_release_pd(u32 pfn) ((level) | (is_current_as(mm, user) ? \ (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) -static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) +static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } -static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); @@ -462,7 +462,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte) vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } -static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index fd3f64ace248..818ac8bf01e2 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -12,7 +12,6 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ - #ifdef CONFIG_X86_USE_3DNOW #include @@ -42,26 +41,81 @@ * These are used to make use of C type-checking.. */ extern int nx_enabled; + #ifdef CONFIG_X86_PAE extern unsigned long long __supported_pte_mask; typedef struct { unsigned long pte_low, pte_high; } pte_t; typedef struct { unsigned long long pmd; } pmd_t; typedef struct { unsigned long long pgd; } pgd_t; typedef struct { unsigned long long pgprot; } pgprot_t; -#define pmd_val(x) ((x).pmd) -#define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) -#define __pmd(x) ((pmd_t) { (x) } ) + +static inline unsigned long long native_pgd_val(pgd_t pgd) +{ + return pgd.pgd; +} + +static inline unsigned long long native_pmd_val(pmd_t pmd) +{ + return pmd.pmd; +} + +static inline unsigned long long native_pte_val(pte_t pte) +{ + return pte.pte_low | ((unsigned long long)pte.pte_high << 32); +} + +static inline pgd_t native_make_pgd(unsigned long long val) +{ + return (pgd_t) { val }; +} + +static inline pmd_t native_make_pmd(unsigned long long val) +{ + return (pmd_t) { val }; +} + +static inline pte_t native_make_pte(unsigned long long val) +{ + return (pte_t) { .pte_low = val, .pte_high = (val >> 32) } ; +} + +#ifndef CONFIG_PARAVIRT +#define pmd_val(x) native_pmd_val(x) +#define __pmd(x) native_make_pmd(x) +#endif + #define HPAGE_SHIFT 21 #include -#else +#else /* !CONFIG_X86_PAE */ typedef struct { unsigned long pte_low; } pte_t; typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define boot_pte_t pte_t /* or would you rather have a typedef */ -#define pte_val(x) ((x).pte_low) + +static inline unsigned long native_pgd_val(pgd_t pgd) +{ + return pgd.pgd; +} + +static inline unsigned long native_pte_val(pte_t pte) +{ + return pte.pte_low; +} + +static inline pgd_t native_make_pgd(unsigned long val) +{ + return (pgd_t) { val }; +} + +static inline pte_t native_make_pte(unsigned long val) +{ + return (pte_t) { .pte_low = val }; +} + #define HPAGE_SHIFT 22 #include -#endif +#endif /* CONFIG_X86_PAE */ + #define PTE_MASK PAGE_MASK #ifdef CONFIG_HUGETLB_PAGE @@ -71,13 +125,16 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif -#define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) - -#define __pte(x) ((pte_t) { (x) } ) -#define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) +#ifndef CONFIG_PARAVIRT +#define pgd_val(x) native_pgd_val(x) +#define __pgd(x) native_make_pgd(x) +#define pte_val(x) native_pte_val(x) +#define __pte(x) native_make_pte(x) +#endif + #endif /* !__ASSEMBLY__ */ /* to align the pointer to the (next) page boundary */ diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index f0bdaea6235d..0aacb13bb929 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -2,7 +2,6 @@ #define __ASM_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ -#include #include #include @@ -25,6 +24,8 @@ #define CLBR_ANY 0x7 #ifndef __ASSEMBLY__ +#include + struct thread_struct; struct Xgt_desc_struct; struct tss_struct; @@ -55,11 +56,6 @@ struct paravirt_ops int (*set_wallclock)(unsigned long); void (*time_init)(void); - /* All the function pointers here are declared as "fastcall" - so that we get a specific register-based calling - convention. This makes it easier to implement inline - assembler replacements. */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); @@ -139,16 +135,33 @@ struct paravirt_ops void (*release_pd)(u32 pfn); void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + + pte_t (*ptep_get_and_clear)(pte_t *ptep); + #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); + void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); void (*set_pud)(pud_t *pudp, pud_t pudval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pmd_clear)(pmd_t *pmdp); + + unsigned long long (*pte_val)(pte_t); + unsigned long long (*pmd_val)(pmd_t); + unsigned long long (*pgd_val)(pgd_t); + + pte_t (*make_pte)(unsigned long long pte); + pmd_t (*make_pmd)(unsigned long long pmd); + pgd_t (*make_pgd)(unsigned long long pgd); +#else + unsigned long (*pte_val)(pte_t); + unsigned long (*pgd_val)(pgd_t); + + pte_t (*make_pte)(unsigned long pte); + pgd_t (*make_pgd)(unsigned long pgd); #endif void (*set_lazy_mode)(int mode); @@ -219,6 +232,8 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, #define read_cr4_safe(x) paravirt_ops.read_cr4_safe() #define write_cr4(x) paravirt_ops.write_cr4(x) +#define raw_ptep_get_and_clear(xp) (paravirt_ops.ptep_get_and_clear(xp)) + static inline void raw_safe_halt(void) { paravirt_ops.safe_halt(); @@ -304,6 +319,17 @@ static inline void halt(void) (paravirt_ops.write_idt_entry((dt), (entry), (low), (high))) #define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) +#define __pte(x) paravirt_ops.make_pte(x) +#define __pgd(x) paravirt_ops.make_pgd(x) + +#define pte_val(x) paravirt_ops.pte_val(x) +#define pgd_val(x) paravirt_ops.pgd_val(x) + +#ifdef CONFIG_X86_PAE +#define __pmd(x) paravirt_ops.make_pmd(x) +#define pmd_val(x) paravirt_ops.pmd_val(x) +#endif + /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { paravirt_ops.io_delay(); @@ -344,6 +370,7 @@ static inline void setup_secondary_clock(void) } #endif + #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) @@ -371,7 +398,8 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) paravirt_ops.set_pte(ptep, pteval); } -static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) { paravirt_ops.set_pte_at(mm, addr, ptep, pteval); } diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 38c3fcc0676d..043a2bcfa86a 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -11,10 +11,23 @@ * within a page table are directly modified. Thus, the following * hook is made available. */ +static inline void native_set_pte(pte_t *ptep , pte_t pte) +{ + *ptep = pte; +} +static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + native_set_pte(ptep, pte); +} +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + *pmdp = pmd; +} #ifndef CONFIG_PARAVIRT -#define set_pte(pteptr, pteval) (*(pteptr) = pteval) -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) +#define set_pte(pteptr, pteval) native_set_pte(pteptr, pteval) +#define set_pte_at(mm,addr,ptep,pteval) native_set_pte_at(mm, addr, ptep, pteval) +#define set_pmd(pmdptr, pmdval) native_set_pmd(pmdptr, pmdval) #endif #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) @@ -23,11 +36,14 @@ #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define raw_ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) +static inline pte_t native_ptep_get_and_clear(pte_t *xp) +{ + return __pte(xchg(&xp->pte_low, 0)); +} #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low) -#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) +#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index 7a2318f38303..be6017f37a91 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -42,20 +42,23 @@ static inline int pte_exec_kernel(pte_t pte) return pte_x(pte); } -#ifndef CONFIG_PARAVIRT /* Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. In places where this is * not possible, use pte_get_and_clear to obtain the old pte * value and then use set_pte to update it. -ben */ -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void native_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) +static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + native_set_pte(ptep, pte); +} /* * Since this is only called on user PTEs, and the page fault handler @@ -63,7 +66,8 @@ static inline void set_pte(pte_t *ptep, pte_t pte) * we are justified in merely clearing the PTE present bit, followed * by a set. The ordering here is important. */ -static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +static inline void native_set_pte_present(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) { ptep->pte_low = 0; smp_wmb(); @@ -72,32 +76,48 @@ static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte ptep->pte_low = pte.pte_low; } -#define set_pte_atomic(pteptr,pteval) \ - set_64bit((unsigned long long *)(pteptr),pte_val(pteval)) -#define set_pmd(pmdptr,pmdval) \ - set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval)) -#define set_pud(pudptr,pudval) \ - (*(pudptr) = (pudval)) +static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((unsigned long long *)(ptep),native_pte_val(pte)); +} +static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + set_64bit((unsigned long long *)(pmdp),native_pmd_val(pmd)); +} +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ + *pudp = pud; +} /* * For PTEs and PDEs, we must clear the P-bit first when clearing a page table * entry, so clear the bottom half first and enforce ordering with a compiler * barrier. */ -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep->pte_low = 0; smp_wmb(); ptep->pte_high = 0; } -static inline void pmd_clear(pmd_t *pmd) +static inline void native_pmd_clear(pmd_t *pmd) { u32 *tmp = (u32 *)pmd; *tmp = 0; smp_wmb(); *(tmp + 1) = 0; } + +#ifndef CONFIG_PARAVIRT +#define set_pte(ptep, pte) native_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) +#define set_pte_present(mm, addr, ptep, pte) native_set_pte_present(mm, addr, ptep, pte) +#define set_pte_atomic(ptep, pte) native_set_pte_atomic(ptep, pte) +#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) +#define set_pud(pudp, pud) native_set_pud(pudp, pud) +#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) native_pmd_clear(pmd) #endif /* @@ -119,7 +139,7 @@ static inline void pud_clear (pud_t * pud) { } #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ pmd_index(address)) -static inline pte_t raw_ptep_get_and_clear(pte_t *ptep) +static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { pte_t res; @@ -146,28 +166,21 @@ static inline int pte_none(pte_t pte) static inline unsigned long pte_pfn(pte_t pte) { - return (pte.pte_low >> PAGE_SHIFT) | - (pte.pte_high << (32 - PAGE_SHIFT)); + return pte_val(pte) >> PAGE_SHIFT; } extern unsigned long long __supported_pte_mask; static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - pte_t pte; - - pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \ - (pgprot_val(pgprot) >> 32); - pte.pte_high &= (__supported_pte_mask >> 32); - pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \ - __supported_pte_mask; - return pte; + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \ - pgprot_val(pgprot)) & __supported_pte_mask); + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); } /* diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 143ddc42b86f..147f2553784d 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -266,6 +266,8 @@ static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return p #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) #define paravirt_map_pt_hook(slot, va, pfn) do { } while (0) + +#define raw_ptep_get_and_clear(xp) native_ptep_get_and_clear(xp) #endif /* From b239fb2501117bf3aeb4dd6926edd855be92333d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 133/231] [PATCH] i386: PARAVIRT: Hooks to set up initial pagetable This patch introduces paravirt_ops hooks to control how the kernel's initial pagetable is set up. In the case of a native boot, the very early bootstrap code creates a simple non-PAE pagetable to map the kernel and physical memory. When the VM subsystem is initialized, it creates a proper pagetable which respects the PAE mode, large pages, etc. When booting under a hypervisor, there are many possibilities for what paging environment the hypervisor establishes for the guest kernel, so the constructon of the kernel's pagetable depends on the hypervisor. In the case of Xen, the hypervisor boots the kernel with a fully constructed pagetable, which is already using PAE if necessary. Also, Xen requires particular care when constructing pagetables to make sure all pagetables are always mapped read-only. In order to make this easier, kernel's initial pagetable construction has been changed to only allocate and initialize a pagetable page if there's no page already present in the pagetable. This allows the Xen paravirt backend to make a copy of the hypervisor-provided pagetable, allowing the kernel to establish any more mappings it needs while keeping the existing ones. A slightly subtle point which is worth highlighting here is that Xen requires all kernel mappings to share the same pte_t pages between all pagetables, so that updating a kernel page's mapping in one pagetable is reflected in all other pagetables. This makes it possible to allocate a page and attach it to a pagetable without having to explicitly enumerate that page's mapping in all pagetables. And: +From: "Eric W. Biederman" If we don't set the leaf page table entries it is quite possible that will inherit and incorrect page table entry from the initial boot page table setup in head.S. So we need to redo the effort here, so we pick up PSE, PGE and the like. Hypervisors like Xen require that their page tables be read-only, which is slightly incompatible with our low identity mappings, however I discussed this with Jeremy he has modified the Xen early set_pte function to avoid problems in this area. Signed-off-by: Eric W. Biederman Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: William Irwin Cc: Ingo Molnar --- arch/i386/kernel/paravirt.c | 3 + arch/i386/mm/init.c | 138 ++++++++++++++++++++++++------------ include/asm-i386/paravirt.h | 17 ++++- include/asm-i386/pgtable.h | 16 +++++ 4 files changed, 126 insertions(+), 48 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index cba7a15ce1b0..47d075bdfb95 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -193,6 +193,9 @@ struct paravirt_ops paravirt_ops = { #endif .set_lazy_mode = paravirt_nop, + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index bd5ef3718504..e8545dcf06c5 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -43,6 +43,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -62,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#else + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); + } +#endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); -#endif - return pmd_table; } @@ -82,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) */ static pte_t * __init one_page_table_init(pmd_t *pmd) { - if (pmd_none(*pmd)) { + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } return pte_offset_kernel(pmd, 0); @@ -109,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; - pud_t *pud; pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; @@ -120,13 +119,10 @@ static void __init page_table_range_init (unsigned long start, unsigned long end pgd = pgd_base + pgd_idx; for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); + one_page_table_init(pmd); vaddr += PMD_SIZE; } @@ -168,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); else set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } } } @@ -338,24 +336,78 @@ extern void __init remap_numa_kva(void); #define remap_numa_kva() do {} while (0) #endif -static void __init pagetable_init (void) +void __init native_pagetable_setup_start(pgd_t *base) { - unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; - #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + + /* + * Init entries of the first-level page table to the + * zero page, if they haven't already been set up. + * + * In a normal native boot, we'll be running on a + * pagetable rooted in swapper_pg_dir, but not in PAE + * mode, so this will end up clobbering the mappings + * for the lower 24Mbytes of the address space, + * without affecting the kernel address space. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) + set_pgd(&base[i], + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); + + /* Make sure kernel address space is empty so that a pagetable + will be allocated for it. */ + memset(&base[USER_PTRS_PER_PGD], 0, + KERNEL_PGD_PTRS * sizeof(pgd_t)); #else paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); #endif +} + +void __init native_pagetable_setup_done(pgd_t *base) +{ +#ifdef CONFIG_X86_PAE + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + set_pgd(&base[0], base[USER_PTRS_PER_PGD]); +#endif +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/i386/kernel/head.S, and not running in PAE mode + * (even if we'll end up running in PAE). The root of the pagetable + * will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base = swapper_pg_dir; + + paravirt_pagetable_setup_start(pgd_base); /* Enable PSE if available */ - if (cpu_has_pse) { + if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); - } /* Enable PGE if available */ if (cpu_has_pge) { @@ -372,20 +424,12 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); permanent_kmaps_init(pgd_base); -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); -#endif + paravirt_pagetable_setup_done(pgd_base); } #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 0aacb13bb929..c49b44cdd8ee 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -2,10 +2,11 @@ #define __ASM_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ + +#ifdef CONFIG_PARAVIRT #include #include -#ifdef CONFIG_PARAVIRT /* These are the most performance critical ops, so we want to be able to patch * callers */ #define PARAVIRT_IRQ_DISABLE 0 @@ -50,6 +51,9 @@ struct paravirt_ops char *(*memory_setup)(void); void (*init_IRQ)(void); + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + void (*banner)(void); unsigned long (*get_wallclock)(void); @@ -370,6 +374,17 @@ static inline void setup_secondary_clock(void) } #endif +static inline void paravirt_pagetable_setup_start(pgd_t *base) +{ + if (paravirt_ops.pagetable_setup_start) + (*paravirt_ops.pagetable_setup_start)(base); +} + +static inline void paravirt_pagetable_setup_done(pgd_t *base) +{ + if (paravirt_ops.pagetable_setup_done) + (*paravirt_ops.pagetable_setup_done)(base); +} #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 147f2553784d..0790ad6ed440 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -514,6 +514,22 @@ do { \ * tables contain all the necessary information. */ #define update_mmu_cache(vma,address,pte) do { } while (0) + +void native_pagetable_setup_start(pgd_t *base); +void native_pagetable_setup_done(pgd_t *base); + +#ifndef CONFIG_PARAVIRT +static inline void paravirt_pagetable_setup_start(pgd_t *base) +{ + native_pagetable_setup_start(base); +} + +static inline void paravirt_pagetable_setup_done(pgd_t *base) +{ + native_pagetable_setup_done(base); +} +#endif /* !CONFIG_PARAVIRT */ + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM From 90caccb9758e88db68a69553689baee38254287b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 134/231] [PATCH] i386: PARAVIRT: Allocate a fixmap slot Allocate a fixmap slot for use by a paravirt_ops implementation. This is intended for early-boot bootstrap mappings. Once the zones and allocator have been set up, it would be better to use get_vm_area() to allocate some virtual space. Xen uses this to map the hypervisor's shared info page, which doesn't have a pseudo-physical page number, and therefore can't be mapped ordinarily. It is needed early because it contains the vcpu state, including the interrupt mask. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: Ingo Molnar --- include/asm-i386/fixmap.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h index e5651b2a585a..80ea052ee3a4 100644 --- a/include/asm-i386/fixmap.h +++ b/include/asm-i386/fixmap.h @@ -83,6 +83,9 @@ enum fixed_addresses { #endif #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, #endif __end_of_permanent_fixed_addresses, /* temporary boot-time mappings, used before ioremap() is functional */ From 5311ab62cdc7788784971ed816ce85e926f3e994 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH 135/231] [PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing Normally when running in PAE mode, the 4th PMD maps the kernel address space, which can be shared among all processes (since they all need the same kernel mappings). Xen, however, does not allow guests to have the kernel pmd shared between page tables, so parameterize pgtable.c to allow both modes of operation. There are several side-effects of this. One is that vmalloc will update the kernel address space mappings, and those updates need to be propagated into all processes if the kernel mappings are not intrinsically shared. In the non-PAE case, this is done by maintaining a pgd_list of all processes; this list is used when all process pagetables must be updated. pgd_list is threaded via otherwise unused entries in the page structure for the pgd, which means that the pgd must be page-sized for this to work. Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE pgd to page aligned anyway, so this patch forces the pgd to be page aligned+sized when the kernel pmd is unshared, to accomodate both these requirements. Also, since there may be several distinct kernel pmds (if the user/kernel split is below 3G), there's no point in allocating them from a slab cache; they're just allocated with get_free_page and initialized appropriately. (Of course the could be cached if there is just a single kernel pmd - which is the default with a 3G user/kernel split - but it doesn't seem worthwhile to add yet another case into this code). [ Many thanks to wli for review comments. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: William Lee Irwin III Signed-off-by: Andi Kleen Cc: Zachary Amsden Cc: Christoph Lameter Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 1 + arch/i386/mm/fault.c | 5 +- arch/i386/mm/init.c | 18 +++++- arch/i386/mm/pageattr.c | 2 +- arch/i386/mm/pgtable.c | 88 ++++++++++++++++++++------ include/asm-i386/paravirt.h | 1 + include/asm-i386/pgtable-2level-defs.h | 2 + include/asm-i386/pgtable-2level.h | 2 - include/asm-i386/pgtable-3level-defs.h | 6 ++ include/asm-i386/pgtable-3level.h | 2 - include/asm-i386/pgtable.h | 2 + 11 files changed, 101 insertions(+), 28 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 47d075bdfb95..2040a831d5b3 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -132,6 +132,7 @@ struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ .patch = native_patch, .banner = default_banner, diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index c6a0a06258e6..f534c29e80b2 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -603,7 +603,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -#ifndef CONFIG_X86_PAE void vmalloc_sync_all(void) { /* @@ -616,6 +615,9 @@ void vmalloc_sync_all(void) static unsigned long start = TASK_SIZE; unsigned long address; + if (SHARED_KERNEL_PMD) + return; + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { @@ -638,4 +640,3 @@ void vmalloc_sync_all(void) start = address + PGDIR_SIZE; } } -#endif diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index e8545dcf06c5..dbe16f63a566 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -745,6 +745,8 @@ struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), @@ -754,13 +756,23 @@ void __init pgtable_cache_init(void) NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + if (!SHARED_KERNEL_PMD) { + /* If we're in PAE mode and have a non-shared + kernel pmd, then the pgd size must be a + page size. This is because the pgd_list + links through the page structure, so there + can only be one pgd per page for this to + work. */ + pgd_size = PAGE_SIZE; + } } pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), + pgd_size, + pgd_size, 0, pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index ea6b6d4a0a2a..47bd477c8ecc 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) unsigned long flags; set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) + if (SHARED_KERNEL_PMD) return; spin_lock_irqsave(&pgd_lock, flags); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 99c09edc3dbb..9a96c1647428 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -232,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd) set_page_private(next, (unsigned long)pprev); } +#if (PTRS_PER_PMD == 1) +/* Non-PAE pgd constructor */ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; - if (PTRS_PER_PMD == 1) { - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - spin_lock_irqsave(&pgd_lock, flags); - } + /* !PAE, no pagetable sharing */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + + /* must happen under lock */ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - - if (PTRS_PER_PMD > 1) - return; - - /* must happen under lock */ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); - + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#else /* PTRS_PER_PMD > 1 */ +/* PAE pgd constructor */ +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +{ + /* PAE, kernel PMD may be shared */ + + if (SHARED_KERNEL_PMD) { + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } else { + unsigned long flags; + + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } +} +#endif /* PTRS_PER_PMD */ -/* never called when PTRS_PER_PMD > 1 */ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + BUG_ON(SHARED_KERNEL_PMD); + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + +/* If we allocate a pmd for part of the kernel address space, then + make sure its initialized with the appropriate kernel mappings. + Otherwise use a cached zeroed pmd. */ +static pmd_t *pmd_cache_alloc(int idx) +{ + pmd_t *pmd; + + if (idx >= USER_PTRS_PER_PGD) { + pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + + if (pmd) + memcpy(pmd, + (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + sizeof(pmd_t) * PTRS_PER_PMD); + } else + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + + return pmd; +} + +static void pmd_cache_free(pmd_t *pmd, int idx) +{ + if (idx >= USER_PTRS_PER_PGD) + free_page((unsigned long)pmd); + else + kmem_cache_free(pmd_cache, pmd); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { int i; @@ -276,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { + pmd_t *pmd = pmd_cache_alloc(i); + if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } @@ -290,7 +342,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } kmem_cache_free(pgd_cache, pgd); return NULL; @@ -302,11 +354,11 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index c49b44cdd8ee..f93599dc7756 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -35,6 +35,7 @@ struct desc_struct; struct paravirt_ops { unsigned int kernel_rpl; + int shared_kernel_pmd; int paravirt_enabled; const char *name; diff --git a/include/asm-i386/pgtable-2level-defs.h b/include/asm-i386/pgtable-2level-defs.h index 02518079f816..0f71c9f13da4 100644 --- a/include/asm-i386/pgtable-2level-defs.h +++ b/include/asm-i386/pgtable-2level-defs.h @@ -1,6 +1,8 @@ #ifndef _I386_PGTABLE_2LEVEL_DEFS_H #define _I386_PGTABLE_2LEVEL_DEFS_H +#define SHARED_KERNEL_PMD 0 + /* * traditional i386 two-level paging structure: */ diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 043a2bcfa86a..781fe4bcc962 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -82,6 +82,4 @@ static inline int pte_exec_kernel(pte_t pte) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -void vmalloc_sync_all(void); - #endif /* _I386_PGTABLE_2LEVEL_H */ diff --git a/include/asm-i386/pgtable-3level-defs.h b/include/asm-i386/pgtable-3level-defs.h index eb3a1ea88671..c0df89f66e8b 100644 --- a/include/asm-i386/pgtable-3level-defs.h +++ b/include/asm-i386/pgtable-3level-defs.h @@ -1,6 +1,12 @@ #ifndef _I386_PGTABLE_3LEVEL_DEFS_H #define _I386_PGTABLE_3LEVEL_DEFS_H +#ifdef CONFIG_PARAVIRT +#define SHARED_KERNEL_PMD (paravirt_ops.shared_kernel_pmd) +#else +#define SHARED_KERNEL_PMD 1 +#endif + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index be6017f37a91..664bfee5a2f2 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -200,6 +200,4 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) #define __pmd_free_tlb(tlb, x) do { } while (0) -#define vmalloc_sync_all() ((void)0) - #endif /* _I386_PGTABLE_3LEVEL_H */ diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 0790ad6ed440..5b88a6a1278e 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -243,6 +243,8 @@ static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; re static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } +extern void vmalloc_sync_all(void); + #ifdef CONFIG_X86_PAE # include #else From d6dd61c831226f9cd7750885da04d360d6455101 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 136/231] [PATCH] x86: PARAVIRT: add hooks to intercept mm creation and destruction Add hooks to allow a paravirt implementation to track the lifetime of an mm. Paravirtualization requires three hooks, but only two are needed in common code. They are: arch_dup_mmap, which is called when a new mmap is created at fork arch_exit_mmap, which is called when the last process reference to an mm is dropped, which typically happens on exit and exec. The third hook is activate_mm, which is called from the arch-specific activate_mm() macro/function, and so doesn't need stub versions for other architectures. It's called when an mm is first used. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: linux-arch@vger.kernel.org Cc: James Bottomley Acked-by: Ingo Molnar --- arch/i386/kernel/paravirt.c | 4 ++++ include/asm-alpha/mmu_context.h | 1 + include/asm-arm/mmu_context.h | 1 + include/asm-arm26/mmu_context.h | 2 ++ include/asm-avr32/mmu_context.h | 1 + include/asm-cris/mmu_context.h | 2 ++ include/asm-frv/mmu_context.h | 1 + include/asm-generic/mm_hooks.h | 18 ++++++++++++++++++ include/asm-h8300/mmu_context.h | 1 + include/asm-i386/mmu_context.h | 17 +++++++++++++++-- include/asm-i386/paravirt.h | 23 +++++++++++++++++++++++ include/asm-ia64/mmu_context.h | 1 + include/asm-m32r/mmu_context.h | 1 + include/asm-m68k/mmu_context.h | 1 + include/asm-m68knommu/mmu_context.h | 1 + include/asm-mips/mmu_context.h | 1 + include/asm-parisc/mmu_context.h | 1 + include/asm-powerpc/mmu_context.h | 1 + include/asm-ppc/mmu_context.h | 1 + include/asm-s390/mmu_context.h | 2 ++ include/asm-sh/mmu_context.h | 1 + include/asm-sh64/mmu_context.h | 2 +- include/asm-sparc/mmu_context.h | 2 ++ include/asm-sparc64/mmu_context.h | 1 + include/asm-um/mmu_context.h | 2 ++ include/asm-v850/mmu_context.h | 2 ++ include/asm-x86_64/mmu_context.h | 1 + include/asm-xtensa/mmu_context.h | 1 + kernel/fork.c | 2 ++ mm/mmap.c | 4 ++++ 30 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 include/asm-generic/mm_hooks.h diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 2040a831d5b3..54cc14d99740 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -237,6 +237,10 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, + .dup_mmap = paravirt_nop, + .exit_mmap = paravirt_nop, + .activate_mm = paravirt_nop, + .startup_ipi_hook = paravirt_nop, }; diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h index fe249e9d3360..0bd7bd2ccb90 100644 --- a/include/asm-alpha/mmu_context.h +++ b/include/asm-alpha/mmu_context.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * Force a context reload. This is needed when we change the page diff --git a/include/asm-arm/mmu_context.h b/include/asm-arm/mmu_context.h index d1a65b1edcaa..f8755c818b54 100644 --- a/include/asm-arm/mmu_context.h +++ b/include/asm-arm/mmu_context.h @@ -16,6 +16,7 @@ #include #include #include +#include void __check_kvm_seq(struct mm_struct *mm); diff --git a/include/asm-arm26/mmu_context.h b/include/asm-arm26/mmu_context.h index 1a929bfe5c3a..16c821f81b8d 100644 --- a/include/asm-arm26/mmu_context.h +++ b/include/asm-arm26/mmu_context.h @@ -13,6 +13,8 @@ #ifndef __ASM_ARM_MMU_CONTEXT_H #define __ASM_ARM_MMU_CONTEXT_H +#include + #define init_new_context(tsk,mm) 0 #define destroy_context(mm) do { } while(0) diff --git a/include/asm-avr32/mmu_context.h b/include/asm-avr32/mmu_context.h index 31add1ae8089..c37c391faef6 100644 --- a/include/asm-avr32/mmu_context.h +++ b/include/asm-avr32/mmu_context.h @@ -15,6 +15,7 @@ #include #include #include +#include /* * The MMU "context" consists of two things: diff --git a/include/asm-cris/mmu_context.h b/include/asm-cris/mmu_context.h index e6e659dc757b..72ba08dcfd18 100644 --- a/include/asm-cris/mmu_context.h +++ b/include/asm-cris/mmu_context.h @@ -1,6 +1,8 @@ #ifndef __CRIS_MMU_CONTEXT_H #define __CRIS_MMU_CONTEXT_H +#include + extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm); extern void get_mmu_context(struct mm_struct *mm); extern void destroy_context(struct mm_struct *mm); diff --git a/include/asm-frv/mmu_context.h b/include/asm-frv/mmu_context.h index 72edcaaccd5d..c7daa395156a 100644 --- a/include/asm-frv/mmu_context.h +++ b/include/asm-frv/mmu_context.h @@ -15,6 +15,7 @@ #include #include #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h new file mode 100644 index 000000000000..67dea8123683 --- /dev/null +++ b/include/asm-generic/mm_hooks.h @@ -0,0 +1,18 @@ +/* + * Define generic no-op hooks for arch_dup_mmap and arch_exit_mmap, to + * be included in asm-FOO/mmu_context.h for any arch FOO which doesn't + * need to hook these. + */ +#ifndef _ASM_GENERIC_MM_HOOKS_H +#define _ASM_GENERIC_MM_HOOKS_H + +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +#endif /* _ASM_GENERIC_MM_HOOKS_H */ diff --git a/include/asm-h8300/mmu_context.h b/include/asm-h8300/mmu_context.h index 5c165f7bee0e..f44b730da54d 100644 --- a/include/asm-h8300/mmu_context.h +++ b/include/asm-h8300/mmu_context.h @@ -4,6 +4,7 @@ #include #include #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h index e6aa30f8de5b..8198d1cca1f3 100644 --- a/include/asm-i386/mmu_context.h +++ b/include/asm-i386/mmu_context.h @@ -5,6 +5,16 @@ #include #include #include +#include +#ifndef CONFIG_PARAVIRT +#include + +static inline void paravirt_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ +} +#endif /* !CONFIG_PARAVIRT */ + /* * Used for LDT copy/destruction. @@ -65,7 +75,10 @@ static inline void switch_mm(struct mm_struct *prev, #define deactivate_mm(tsk, mm) \ asm("movl %0,%%gs": :"r" (0)); -#define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL) +#define activate_mm(prev, next) \ + do { \ + paravirt_activate_mm(prev, next); \ + switch_mm((prev),(next),NULL); \ + } while(0); #endif diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index f93599dc7756..61c03f1e0c29 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -119,6 +119,12 @@ struct paravirt_ops void (*io_delay)(void); + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + #ifdef CONFIG_X86_LOCAL_APIC void (*apic_write)(unsigned long reg, unsigned long v); void (*apic_write_atomic)(unsigned long reg, unsigned long v); @@ -395,6 +401,23 @@ static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif +static inline void paravirt_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + paravirt_ops.activate_mm(prev, next); +} + +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + paravirt_ops.dup_mmap(oldmm, mm); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ + paravirt_ops.exit_mmap(mm); +} + #define __flush_tlb() paravirt_ops.flush_tlb_user() #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() #define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) diff --git a/include/asm-ia64/mmu_context.h b/include/asm-ia64/mmu_context.h index b5c65081a3aa..cef2400983fa 100644 --- a/include/asm-ia64/mmu_context.h +++ b/include/asm-ia64/mmu_context.h @@ -29,6 +29,7 @@ #include #include +#include struct ia64_ctx { spinlock_t lock; diff --git a/include/asm-m32r/mmu_context.h b/include/asm-m32r/mmu_context.h index 1f40d4a0acf1..91909e5dd9d0 100644 --- a/include/asm-m32r/mmu_context.h +++ b/include/asm-m32r/mmu_context.h @@ -15,6 +15,7 @@ #include #include #include +#include /* * Cache of MMU context last used. diff --git a/include/asm-m68k/mmu_context.h b/include/asm-m68k/mmu_context.h index 231d11bd8e32..894dacbcee14 100644 --- a/include/asm-m68k/mmu_context.h +++ b/include/asm-m68k/mmu_context.h @@ -1,6 +1,7 @@ #ifndef __M68K_MMU_CONTEXT_H #define __M68K_MMU_CONTEXT_H +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/include/asm-m68knommu/mmu_context.h b/include/asm-m68knommu/mmu_context.h index 6c077d3a2572..9ccee4278c97 100644 --- a/include/asm-m68knommu/mmu_context.h +++ b/include/asm-m68knommu/mmu_context.h @@ -4,6 +4,7 @@ #include #include #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/include/asm-mips/mmu_context.h b/include/asm-mips/mmu_context.h index fe065d6070ca..65024ffd7879 100644 --- a/include/asm-mips/mmu_context.h +++ b/include/asm-mips/mmu_context.h @@ -20,6 +20,7 @@ #include #include #endif /* SMTC */ +#include /* * For the fast tlb miss handlers, we keep a per cpu array of pointers diff --git a/include/asm-parisc/mmu_context.h b/include/asm-parisc/mmu_context.h index 9c05836239a2..bad690298f0c 100644 --- a/include/asm-parisc/mmu_context.h +++ b/include/asm-parisc/mmu_context.h @@ -5,6 +5,7 @@ #include #include #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/include/asm-powerpc/mmu_context.h b/include/asm-powerpc/mmu_context.h index 083ac917bd29..c0d7795e3d25 100644 --- a/include/asm-powerpc/mmu_context.h +++ b/include/asm-powerpc/mmu_context.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * Copyright (C) 2001 PPC 64 Team, IBM Corp diff --git a/include/asm-ppc/mmu_context.h b/include/asm-ppc/mmu_context.h index 2bc8589cc451..a6441a063e5d 100644 --- a/include/asm-ppc/mmu_context.h +++ b/include/asm-ppc/mmu_context.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs diff --git a/include/asm-s390/mmu_context.h b/include/asm-s390/mmu_context.h index 1d21da220d49..501cb9b06314 100644 --- a/include/asm-s390/mmu_context.h +++ b/include/asm-s390/mmu_context.h @@ -10,6 +10,8 @@ #define __S390_MMU_CONTEXT_H #include +#include + /* * get a new mmu context.. S390 don't know about contexts. */ diff --git a/include/asm-sh/mmu_context.h b/include/asm-sh/mmu_context.h index 342024425b7d..01acaaae9751 100644 --- a/include/asm-sh/mmu_context.h +++ b/include/asm-sh/mmu_context.h @@ -12,6 +12,7 @@ #include #include #include +#include /* * The MMU "context" consists of two things: diff --git a/include/asm-sh64/mmu_context.h b/include/asm-sh64/mmu_context.h index 8c860dab2d0e..507bf72bb8e1 100644 --- a/include/asm-sh64/mmu_context.h +++ b/include/asm-sh64/mmu_context.h @@ -27,7 +27,7 @@ extern unsigned long mmu_context_cache; #include - +#include /* Current mm's pgd */ extern pgd_t *mmu_pdtp_cache; diff --git a/include/asm-sparc/mmu_context.h b/include/asm-sparc/mmu_context.h index ed1e01d04d21..671a997b9e69 100644 --- a/include/asm-sparc/mmu_context.h +++ b/include/asm-sparc/mmu_context.h @@ -5,6 +5,8 @@ #ifndef __ASSEMBLY__ +#include + static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h index 2337eb487719..8d129032013e 100644 --- a/include/asm-sparc64/mmu_context.h +++ b/include/asm-sparc64/mmu_context.h @@ -9,6 +9,7 @@ #include #include #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/include/asm-um/mmu_context.h b/include/asm-um/mmu_context.h index f709c784bf12..9aa4b44e8cc1 100644 --- a/include/asm-um/mmu_context.h +++ b/include/asm-um/mmu_context.h @@ -6,6 +6,8 @@ #ifndef __UM_MMU_CONTEXT_H #define __UM_MMU_CONTEXT_H +#include + #include "linux/sched.h" #include "choose-mode.h" #include "um_mmu.h" diff --git a/include/asm-v850/mmu_context.h b/include/asm-v850/mmu_context.h index f521c8050d3c..01daacd5474e 100644 --- a/include/asm-v850/mmu_context.h +++ b/include/asm-v850/mmu_context.h @@ -1,6 +1,8 @@ #ifndef __V850_MMU_CONTEXT_H__ #define __V850_MMU_CONTEXT_H__ +#include + #define destroy_context(mm) ((void)0) #define init_new_context(tsk,mm) 0 #define switch_mm(prev,next,tsk) ((void)0) diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h index af03b9f852d6..0cce83a78378 100644 --- a/include/asm-x86_64/mmu_context.h +++ b/include/asm-x86_64/mmu_context.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * possibly do the LDT unload here? diff --git a/include/asm-xtensa/mmu_context.h b/include/asm-xtensa/mmu_context.h index f14851f086c3..92f948392ebc 100644 --- a/include/asm-xtensa/mmu_context.h +++ b/include/asm-xtensa/mmu_context.h @@ -18,6 +18,7 @@ #include #include #include +#include #define XCHAL_MMU_ASID_BITS 8 diff --git a/kernel/fork.c b/kernel/fork.c index 6af959c034d8..ffccefb28b6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -286,6 +286,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); diff --git a/mm/mmap.c b/mm/mmap.c index 84f997da78d7..88da687bde89 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ #include #include #include +#include #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) @@ -1979,6 +1980,9 @@ void exit_mmap(struct mm_struct *mm) unsigned long nr_accounted = 0; unsigned long end; + /* mm's last user has gone, and its about to be pulled down */ + arch_exit_mmap(mm); + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); From 98de032b681d8a7532d44dfc66aa5c0c1c755a9d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 137/231] [PATCH] i386: PARAVIRT: rename struct paravirt_patch to paravirt_patch_site for clarity Rename struct paravirt_patch to paravirt_patch_site, so that it clearly refers to a callsite, and not the patch which may be applied to that callsite. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Zachary Amsden --- arch/i386/kernel/alternative.c | 7 ++++--- include/asm-i386/alternative.h | 8 +++++--- include/asm-i386/paravirt.h | 5 ++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 915b6c4d9baf..dae3ded9041c 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -325,9 +325,10 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { - struct paravirt_patch *p; + struct paravirt_patch_site *p; for (p = start; p < end; p++) { unsigned int used; @@ -342,7 +343,7 @@ void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) /* Sync to be conservative, in case we patched following instructions */ sync_core(); } -extern struct paravirt_patch __start_parainstructions[], +extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index 4d518eebe461..5b59d07e9d29 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -115,12 +115,14 @@ static inline void alternatives_smp_switch(int smp) {} #define LOCK_PREFIX "" #endif -struct paravirt_patch; +struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); #else static inline void -apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) {} #define __start_parainstructions NULL #define __stop_parainstructions NULL diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 61c03f1e0c29..b4cc2fc4031e 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -505,13 +505,16 @@ void _paravirt_nop(void); #define paravirt_nop ((void *)_paravirt_nop) /* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch { +struct paravirt_patch_site { u8 *instr; /* original instructions */ u8 instrtype; /* type of this instruction */ u8 len; /* length of original instruction */ u16 clobbers; /* what registers you may clobber */ }; +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + #define paravirt_alt(insn_string, typenum, clobber) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ From d582203578a1f3d408e27bb9042e8635954cd320 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 138/231] [PATCH] i386: PARAVIRT: Use patch site IDs computed from offset in paravirt_ops structure Use patch type identifiers derived from the offset of the operation in the paravirt_ops structure. This avoids having to maintain a separate enum for patch site types. Also, since the identifier is derived from the offset into paravirt_ops, the offset can be derived from the identifier. This is used to remove replicated information in the various callsite macros, which has been a source of bugs in the past. This patch also drops the fused save_fl+cli operation, which doesn't really add much and makes things more complex - specifically because it breaks the 1:1 relationship between identifiers and offsets. If this operation turns out to be particularly beneficial, then the right answer is to define a new entrypoint for it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Zachary Amsden --- arch/i386/kernel/paravirt.c | 14 ++- arch/i386/kernel/vmi.c | 39 ++------ include/asm-i386/paravirt.h | 171 +++++++++++++++++++----------------- 3 files changed, 101 insertions(+), 123 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 54cc14d99740..f2982832d3b9 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -58,7 +58,6 @@ DEF_NATIVE(cli, "cli"); DEF_NATIVE(sti, "sti"); DEF_NATIVE(popf, "push %eax; popf"); DEF_NATIVE(pushf, "pushf; pop %eax"); -DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); DEF_NATIVE(iret, "iret"); DEF_NATIVE(sti_sysexit, "sti; sysexit"); @@ -66,13 +65,12 @@ static const struct native_insns { const char *start, *end; } native_insns[] = { - [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, - [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, - [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, - [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, - [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, - [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, - [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, + [PARAVIRT_PATCH(irq_disable)] = { start_cli, end_cli }, + [PARAVIRT_PATCH(irq_enable)] = { start_sti, end_sti }, + [PARAVIRT_PATCH(restore_fl)] = { start_popf, end_popf }, + [PARAVIRT_PATCH(save_fl)] = { start_pushf, end_pushf }, + [PARAVIRT_PATCH(iret)] = { start_iret, end_iret }, + [PARAVIRT_PATCH(irq_enable_sysexit)] = { start_sti_sysexit, end_sti_sysexit }, }; static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index ea77d93f59dd..b8d01c3cbff4 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[], #define MNEM_JMP 0xe9 #define MNEM_RET 0xc3 -static char irq_save_disable_callout[] = { - MNEM_CALL, 0, 0, 0, 0, - MNEM_CALL, 0, 0, 0, 0, - MNEM_RET -}; #define IRQ_PATCH_INT_MASK 0 #define IRQ_PATCH_DISABLE 5 @@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns) static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) { switch (type) { - case PARAVIRT_IRQ_DISABLE: + case PARAVIRT_PATCH(irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, insns); - case PARAVIRT_IRQ_ENABLE: + case PARAVIRT_PATCH(irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, insns); - case PARAVIRT_RESTORE_FLAGS: + case PARAVIRT_PATCH(restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, insns); - case PARAVIRT_SAVE_FLAGS: + case PARAVIRT_PATCH(save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, insns); - case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: - if (len >= 10) { - patch_internal(VMI_CALL_GetInterruptMask, len, insns); - patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5); - return 10; - } else { - /* - * You bastards didn't leave enough room to - * patch save_flags_irq_disable inline. Patch - * to a helper - */ - BUG_ON(len < 5); - *(char *)insns = MNEM_CALL; - patch_offset(insns, irq_save_disable_callout); - return 5; - } - case PARAVIRT_INTERRUPT_RETURN: + case PARAVIRT_PATCH(iret): return patch_internal(VMI_CALL_IRET, len, insns); - case PARAVIRT_STI_SYSEXIT: + case PARAVIRT_PATCH(irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns); default: break; @@ -796,12 +775,6 @@ static inline int __init activate_vmi(void) para_fill(irq_disable, DisableInterrupts); para_fill(irq_enable, EnableInterrupts); - /* irq_save_disable !!! sheer pain */ - patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK], - (char *)paravirt_ops.save_fl); - patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], - (char *)paravirt_ops.irq_disable); - para_fill(wbinvd, WBINVD); para_fill(read_tsc, RDTSC); diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index b4cc2fc4031e..1dbc01f4ed4d 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -4,19 +4,8 @@ * para-virtualization: those hooks are defined here. */ #ifdef CONFIG_PARAVIRT -#include #include -/* These are the most performance critical ops, so we want to be able to patch - * callers */ -#define PARAVIRT_IRQ_DISABLE 0 -#define PARAVIRT_IRQ_ENABLE 1 -#define PARAVIRT_RESTORE_FLAGS 2 -#define PARAVIRT_SAVE_FLAGS 3 -#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4 -#define PARAVIRT_INTERRUPT_RETURN 5 -#define PARAVIRT_STI_SYSEXIT 6 - /* Bitmask of what can be clobbered: usually at least eax. */ #define CLBR_NONE 0x0 #define CLBR_EAX 0x1 @@ -191,6 +180,28 @@ struct paravirt_ops extern struct paravirt_ops paravirt_ops; +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_ops, x) / sizeof(void *)) + +#define paravirt_type(type) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(type)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +#define PARAVIRT_CALL "call *paravirt_ops+%c[paravirt_typenum]*4;" + +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + " .long 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + #define paravirt_enabled() (paravirt_ops.paravirt_enabled) static inline void load_esp0(struct tss_struct *tss, @@ -515,93 +526,89 @@ struct paravirt_patch_site { extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; -#define paravirt_alt(insn_string, typenum, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - " .long 771b\n" \ - " .byte " __stringify(typenum) "\n" \ - " .byte 772b-771b\n" \ - " .short " __stringify(clobber) "\n" \ - ".popsection" - static inline unsigned long __raw_local_save_flags(void) { unsigned long f; - __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" - "call *%1;" - "popl %%edx; popl %%ecx", - PARAVIRT_SAVE_FLAGS, CLBR_NONE) - : "=a"(f): "m"(paravirt_ops.save_fl) - : "memory", "cc"); + asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" + PARAVIRT_CALL + "popl %%edx; popl %%ecx") + : "=a"(f) + : paravirt_type(save_fl), + paravirt_clobber(CLBR_NONE) + : "memory", "cc"); return f; } static inline void raw_local_irq_restore(unsigned long f) { - __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" - "call *%1;" - "popl %%edx; popl %%ecx", - PARAVIRT_RESTORE_FLAGS, CLBR_EAX) - : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f) - : "memory", "cc"); + asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" + PARAVIRT_CALL + "popl %%edx; popl %%ecx") + : "=a"(f) + : "0"(f), + paravirt_type(restore_fl), + paravirt_clobber(CLBR_EAX) + : "memory", "cc"); } static inline void raw_local_irq_disable(void) { - __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" - "call *%0;" - "popl %%edx; popl %%ecx", - PARAVIRT_IRQ_DISABLE, CLBR_EAX) - : : "m" (paravirt_ops.irq_disable) - : "memory", "eax", "cc"); + asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" + PARAVIRT_CALL + "popl %%edx; popl %%ecx") + : + : paravirt_type(irq_disable), + paravirt_clobber(CLBR_EAX) + : "memory", "eax", "cc"); } static inline void raw_local_irq_enable(void) { - __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" - "call *%0;" - "popl %%edx; popl %%ecx", - PARAVIRT_IRQ_ENABLE, CLBR_EAX) - : : "m" (paravirt_ops.irq_enable) - : "memory", "eax", "cc"); + asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;" + PARAVIRT_CALL + "popl %%edx; popl %%ecx") + : + : paravirt_type(irq_enable), + paravirt_clobber(CLBR_EAX) + : "memory", "eax", "cc"); } static inline unsigned long __raw_local_irq_save(void) { unsigned long f; - __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" - "call *%1; pushl %%eax;" - "call *%2; popl %%eax;" - "popl %%edx; popl %%ecx", - PARAVIRT_SAVE_FLAGS_IRQ_DISABLE, - CLBR_NONE) - : "=a"(f) - : "m" (paravirt_ops.save_fl), - "m" (paravirt_ops.irq_disable) - : "memory", "cc"); + f = __raw_local_save_flags(); + raw_local_irq_disable(); return f; } -#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ - "call *paravirt_ops+%c[irq_disable];" \ - "popl %%edx; popl %%ecx", \ - PARAVIRT_IRQ_DISABLE, CLBR_EAX) +#define CLI_STRING \ + _paravirt_alt("pushl %%ecx; pushl %%edx;" \ + "call *paravirt_ops+%c[paravirt_cli_type]*4;" \ + "popl %%edx; popl %%ecx", \ + "%c[paravirt_cli_type]", "%c[paravirt_clobber]") + +#define STI_STRING \ + _paravirt_alt("pushl %%ecx; pushl %%edx;" \ + "call *paravirt_ops+%c[paravirt_sti_type]*4;" \ + "popl %%edx; popl %%ecx", \ + "%c[paravirt_sti_type]", "%c[paravirt_clobber]") -#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ - "call *paravirt_ops+%c[irq_enable];" \ - "popl %%edx; popl %%ecx", \ - PARAVIRT_IRQ_ENABLE, CLBR_EAX) #define CLI_STI_CLOBBERS , "%eax" -#define CLI_STI_INPUT_ARGS \ +#define CLI_STI_INPUT_ARGS \ , \ - [irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)), \ - [irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable)) + [paravirt_cli_type] "i" (PARAVIRT_PATCH(irq_disable)), \ + [paravirt_sti_type] "i" (PARAVIRT_PATCH(irq_enable)), \ + paravirt_clobber(CLBR_EAX) + +#undef PARAVIRT_CALL #else /* __ASSEMBLY__ */ -#define PARA_PATCH(ptype, clobbers, ops) \ +#define PARA_PATCH(off) ((off) / 4) + +#define PARA_SITE(ptype, clobbers, ops) \ 771:; \ ops; \ 772:; \ @@ -612,25 +619,25 @@ static inline unsigned long __raw_local_irq_save(void) .short clobbers; \ .popsection -#define INTERRUPT_RETURN \ - PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY, \ - jmp *%cs:paravirt_ops+PARAVIRT_iret) +#define INTERRUPT_RETURN \ + PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_iret) -#define DISABLE_INTERRUPTS(clobbers) \ - PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \ - pushl %ecx; pushl %edx; \ - call *paravirt_ops+PARAVIRT_irq_disable; \ - popl %edx; popl %ecx) \ +#define DISABLE_INTERRUPTS(clobbers) \ + PARA_SITE(PARA_PATCH(PARAVIRT_irq_disable), clobbers, \ + pushl %ecx; pushl %edx; \ + call *%cs:paravirt_ops+PARAVIRT_irq_disable; \ + popl %edx; popl %ecx) \ -#define ENABLE_INTERRUPTS(clobbers) \ - PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \ - pushl %ecx; pushl %edx; \ - call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ - popl %edx; popl %ecx) +#define ENABLE_INTERRUPTS(clobbers) \ + PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable), clobbers, \ + pushl %ecx; pushl %edx; \ + call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ + popl %edx; popl %ecx) -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY, \ - jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) #define GET_CR0_INTO_EAX \ call *paravirt_ops+PARAVIRT_read_cr0 From 42c24fa22e86365055fc931d833f26165e687c19 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 139/231] [PATCH] i386: PARAVIRT: Fix patch site clobbers to include return register Fix a few clobbers to include the return register. The clobbers set is the set of all registers modified (or may be modified) by the code snippet, regardless of whether it was deliberate or accidental. Also, make sure that callsites which are used in contexts which don't allow clobbers actually save and restore all clobberable registers. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Zachary Amsden --- arch/i386/kernel/entry.S | 2 +- include/asm-i386/paravirt.h | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index e901952dff37..e07473c0d3e7 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -338,7 +338,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 1dbc01f4ed4d..87fd4317bee9 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -535,7 +535,7 @@ static inline unsigned long __raw_local_save_flags(void) "popl %%edx; popl %%ecx") : "=a"(f) : paravirt_type(save_fl), - paravirt_clobber(CLBR_NONE) + paravirt_clobber(CLBR_EAX) : "memory", "cc"); return f; } @@ -620,27 +620,29 @@ static inline unsigned long __raw_local_irq_save(void) .popsection #define INTERRUPT_RETURN \ - PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_ANY, \ + PARA_SITE(PARA_PATCH(PARAVIRT_iret), CLBR_NONE, \ jmp *%cs:paravirt_ops+PARAVIRT_iret) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(PARAVIRT_irq_disable), clobbers, \ - pushl %ecx; pushl %edx; \ + pushl %eax; pushl %ecx; pushl %edx; \ call *%cs:paravirt_ops+PARAVIRT_irq_disable; \ - popl %edx; popl %ecx) \ + popl %edx; popl %ecx; popl %eax) \ #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable), clobbers, \ - pushl %ecx; pushl %edx; \ + pushl %eax; pushl %ecx; pushl %edx; \ call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ - popl %edx; popl %ecx) + popl %edx; popl %ecx; popl %eax) #define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_ANY, \ + PARA_SITE(PARA_PATCH(PARAVIRT_irq_enable_sysexit), CLBR_NONE, \ jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) #define GET_CR0_INTO_EAX \ - call *paravirt_ops+PARAVIRT_read_cr0 + push %ecx; push %edx; \ + call *paravirt_ops+PARAVIRT_read_cr0; \ + pop %edx; pop %ecx #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT */ From f8822f42019eceed19cc6c0f985a489e17796ed8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 140/231] [PATCH] i386: PARAVIRT: Consistently wrap paravirt ops callsites to make them patchable Wrap a set of interesting paravirt_ops calls in a wrapper which makes the callsites available for patching. Unfortunately this is pretty ugly because there's no way to get gcc to generate a function call, but also wrap just the callsite itself with the necessary labels. This patch supports functions with 0-4 arguments, and either void or returning a value. 64-bit arguments must be split into a pair of 32-bit arguments (lower word first). Small structures are returned in registers. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Zachary Amsden Cc: Anthony Liguori --- include/asm-i386/paravirt.h | 704 +++++++++++++++++++++++++++++------- 1 file changed, 569 insertions(+), 135 deletions(-) diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 87fd4317bee9..837457b42dbe 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -124,7 +124,7 @@ struct paravirt_ops void (*flush_tlb_user)(void); void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(u32 addr); + void (*flush_tlb_single)(unsigned long addr); void (*map_pt_hook)(int type, pte_t *va, u32 pfn); @@ -188,7 +188,7 @@ extern struct paravirt_ops paravirt_ops; #define paravirt_clobber(clobber) \ [paravirt_clobber] "i" (clobber) -#define PARAVIRT_CALL "call *paravirt_ops+%c[paravirt_typenum]*4;" +#define PARAVIRT_CALL "call *(paravirt_ops+%c[paravirt_typenum]*4);" #define _paravirt_alt(insn_string, type, clobber) \ "771:\n\t" insn_string "\n" "772:\n" \ @@ -199,26 +199,234 @@ extern struct paravirt_ops paravirt_ops; " .short " clobber "\n" \ ".popsection\n" -#define paravirt_alt(insn_string) \ +#define paravirt_alt(insn_string) \ _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") -#define paravirt_enabled() (paravirt_ops.paravirt_enabled) +#define PVOP_CALL0(__rettype, __op) \ + ({ \ + __rettype __ret; \ + if (sizeof(__rettype) > sizeof(unsigned long)) { \ + unsigned long long __tmp; \ + unsigned long __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=A" (__tmp), "=c" (__ecx) \ + : paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } else { \ + unsigned long __tmp, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__tmp), "=d" (__edx), \ + "=c" (__ecx) \ + : paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } \ + __ret; \ + }) +#define PVOP_VCALL0(__op) \ + ({ \ + unsigned long __eax, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ + : paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + }) + +#define PVOP_CALL1(__rettype, __op, arg1) \ + ({ \ + __rettype __ret; \ + if (sizeof(__rettype) > sizeof(unsigned long)) { \ + unsigned long long __tmp; \ + unsigned long __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=A" (__tmp), "=c" (__ecx) \ + : "a" ((u32)(arg1)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } else { \ + unsigned long __tmp, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__tmp), "=d" (__edx), \ + "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } \ + __ret; \ + }) +#define PVOP_VCALL1(__op, arg1) \ + ({ \ + unsigned long __eax, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + }) + +#define PVOP_CALL2(__rettype, __op, arg1, arg2) \ + ({ \ + __rettype __ret; \ + if (sizeof(__rettype) > sizeof(unsigned long)) { \ + unsigned long long __tmp; \ + unsigned long __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=A" (__tmp), "=c" (__ecx) \ + : "a" ((u32)(arg1)), \ + "d" ((u32)(arg2)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } else { \ + unsigned long __tmp, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__tmp), "=d" (__edx), \ + "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + "1" ((u32)(arg2)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } \ + __ret; \ + }) +#define PVOP_VCALL2(__op, arg1, arg2) \ + ({ \ + unsigned long __eax, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + "1" ((u32)(arg2)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + }) + +#define PVOP_CALL3(__rettype, __op, arg1, arg2, arg3) \ + ({ \ + __rettype __ret; \ + if (sizeof(__rettype) > sizeof(unsigned long)) { \ + unsigned long long __tmp; \ + unsigned long __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=A" (__tmp), "=c" (__ecx) \ + : "a" ((u32)(arg1)), \ + "d" ((u32)(arg2)), \ + "1" ((u32)(arg3)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } else { \ + unsigned long __tmp, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__tmp), "=d" (__edx), \ + "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } \ + __ret; \ + }) +#define PVOP_VCALL3(__op, arg1, arg2, arg3) \ + ({ \ + unsigned long __eax, __edx, __ecx; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + }) + +#define PVOP_CALL4(__rettype, __op, arg1, arg2, arg3, arg4) \ + ({ \ + __rettype __ret; \ + if (sizeof(__rettype) > sizeof(unsigned long)) { \ + unsigned long long __tmp; \ + unsigned long __ecx; \ + asm volatile("push %[_arg4]; " \ + paravirt_alt(PARAVIRT_CALL) \ + "lea 4(%%esp),%%esp" \ + : "=A" (__tmp), "=c" (__ecx) \ + : "a" ((u32)(arg1)), \ + "d" ((u32)(arg2)), \ + "1" ((u32)(arg3)), \ + [_arg4] "mr" ((u32)(arg4)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc",); \ + __ret = (__rettype)__tmp; \ + } else { \ + unsigned long __tmp, __edx, __ecx; \ + asm volatile("push %[_arg4]; " \ + paravirt_alt(PARAVIRT_CALL) \ + "lea 4(%%esp),%%esp" \ + : "=a" (__tmp), "=d" (__edx), "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), \ + [_arg4]"mr" ((u32)(arg4)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + __ret = (__rettype)__tmp; \ + } \ + __ret; \ + }) +#define PVOP_VCALL4(__op, arg1, arg2, arg3, arg4) \ + ({ \ + unsigned long __eax, __edx, __ecx; \ + asm volatile("push %[_arg4]; " \ + paravirt_alt(PARAVIRT_CALL) \ + "lea 4(%%esp),%%esp" \ + : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ + : "0" ((u32)(arg1)), \ + "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), \ + [_arg4]"mr" ((u32)(arg4)), \ + paravirt_type(__op), \ + paravirt_clobber(CLBR_ANY) \ + : "memory", "cc"); \ + }) + +static inline int paravirt_enabled(void) +{ + return paravirt_ops.paravirt_enabled; +} static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - paravirt_ops.load_esp0(tss, thread); + PVOP_VCALL2(load_esp0, tss, thread); } #define ARCH_SETUP paravirt_ops.arch_setup(); static inline unsigned long get_wallclock(void) { - return paravirt_ops.get_wallclock(); + return PVOP_CALL0(unsigned long, get_wallclock); } static inline int set_wallclock(unsigned long nowtime) { - return paravirt_ops.set_wallclock(nowtime); + return PVOP_CALL1(int, set_wallclock, nowtime); } static inline void (*choose_time_init(void))(void) @@ -230,127 +438,208 @@ static inline void (*choose_time_init(void))(void) static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - paravirt_ops.cpuid(eax, ebx, ecx, edx); + PVOP_VCALL4(cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ -#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg) -#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val) +static inline unsigned long paravirt_get_debugreg(int reg) +{ + return PVOP_CALL1(unsigned long, get_debugreg, reg); +} +#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) +static inline void set_debugreg(unsigned long val, int reg) +{ + PVOP_VCALL2(set_debugreg, reg, val); +} -#define clts() paravirt_ops.clts() +static inline void clts(void) +{ + PVOP_VCALL0(clts); +} -#define read_cr0() paravirt_ops.read_cr0() -#define write_cr0(x) paravirt_ops.write_cr0(x) +static inline unsigned long read_cr0(void) +{ + return PVOP_CALL0(unsigned long, read_cr0); +} -#define read_cr2() paravirt_ops.read_cr2() -#define write_cr2(x) paravirt_ops.write_cr2(x) +static inline void write_cr0(unsigned long x) +{ + PVOP_VCALL1(write_cr0, x); +} -#define read_cr3() paravirt_ops.read_cr3() -#define write_cr3(x) paravirt_ops.write_cr3(x) +static inline unsigned long read_cr2(void) +{ + return PVOP_CALL0(unsigned long, read_cr2); +} -#define read_cr4() paravirt_ops.read_cr4() -#define read_cr4_safe(x) paravirt_ops.read_cr4_safe() -#define write_cr4(x) paravirt_ops.write_cr4(x) +static inline void write_cr2(unsigned long x) +{ + PVOP_VCALL1(write_cr2, x); +} -#define raw_ptep_get_and_clear(xp) (paravirt_ops.ptep_get_and_clear(xp)) +static inline unsigned long read_cr3(void) +{ + return PVOP_CALL0(unsigned long, read_cr3); +} + +static inline void write_cr3(unsigned long x) +{ + PVOP_VCALL1(write_cr3, x); +} + +static inline unsigned long read_cr4(void) +{ + return PVOP_CALL0(unsigned long, read_cr4); +} +static inline unsigned long read_cr4_safe(void) +{ + return PVOP_CALL0(unsigned long, read_cr4_safe); +} + +static inline void write_cr4(unsigned long x) +{ + PVOP_VCALL1(write_cr4, x); +} static inline void raw_safe_halt(void) { - paravirt_ops.safe_halt(); + PVOP_VCALL0(safe_halt); } static inline void halt(void) { - paravirt_ops.safe_halt(); + PVOP_VCALL0(safe_halt); +} + +static inline void wbinvd(void) +{ + PVOP_VCALL0(wbinvd); } -#define wbinvd() paravirt_ops.wbinvd() #define get_kernel_rpl() (paravirt_ops.kernel_rpl) +static inline u64 paravirt_read_msr(unsigned msr, int *err) +{ + return PVOP_CALL2(u64, read_msr, msr, err); +} +static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) +{ + return PVOP_CALL3(int, write_msr, msr, low, high); +} + /* These should all do BUG_ON(_err), but our headers are too tangled. */ -#define rdmsr(msr,val1,val2) do { \ - int _err; \ - u64 _l = paravirt_ops.read_msr(msr,&_err); \ - val1 = (u32)_l; \ - val2 = _l >> 32; \ +#define rdmsr(msr,val1,val2) do { \ + int _err; \ + u64 _l = paravirt_read_msr(msr, &_err); \ + val1 = (u32)_l; \ + val2 = _l >> 32; \ } while(0) -#define wrmsr(msr,val1,val2) do { \ - u64 _l = ((u64)(val2) << 32) | (val1); \ - paravirt_ops.write_msr((msr), _l); \ +#define wrmsr(msr,val1,val2) do { \ + paravirt_write_msr(msr, val1, val2); \ } while(0) -#define rdmsrl(msr,val) do { \ - int _err; \ - val = paravirt_ops.read_msr((msr),&_err); \ +#define rdmsrl(msr,val) do { \ + int _err; \ + val = paravirt_read_msr(msr, &_err); \ } while(0) -#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) -#define wrmsr_safe(msr,a,b) ({ \ - u64 _l = ((u64)(b) << 32) | (a); \ - paravirt_ops.write_msr((msr),_l); \ -}) +#define wrmsrl(msr,val) ((void)paravirt_write_msr(msr, val, 0)) +#define wrmsr_safe(msr,a,b) paravirt_write_msr(msr, a, b) /* rdmsr with exception handling */ -#define rdmsr_safe(msr,a,b) ({ \ - int _err; \ - u64 _l = paravirt_ops.read_msr(msr,&_err); \ - (*a) = (u32)_l; \ - (*b) = _l >> 32; \ +#define rdmsr_safe(msr,a,b) ({ \ + int _err; \ + u64 _l = paravirt_read_msr(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ _err; }) -#define rdtsc(low,high) do { \ - u64 _l = paravirt_ops.read_tsc(); \ - low = (u32)_l; \ - high = _l >> 32; \ + +static inline u64 paravirt_read_tsc(void) +{ + return PVOP_CALL0(u64, read_tsc); +} +#define rdtsc(low,high) do { \ + u64 _l = paravirt_read_tsc(); \ + low = (u32)_l; \ + high = _l >> 32; \ } while(0) -#define rdtscl(low) do { \ - u64 _l = paravirt_ops.read_tsc(); \ - low = (int)_l; \ +#define rdtscl(low) do { \ + u64 _l = paravirt_read_tsc(); \ + low = (int)_l; \ } while(0) -#define rdtscll(val) (val = paravirt_ops.read_tsc()) +#define rdtscll(val) (val = paravirt_read_tsc()) #define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles()) #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz()) #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) -#define rdpmc(counter,low,high) do { \ - u64 _l = paravirt_ops.read_pmc(); \ - low = (u32)_l; \ - high = _l >> 32; \ +static inline unsigned long long paravirt_read_pmc(int counter) +{ + return PVOP_CALL1(u64, read_pmc, counter); +} + +#define rdpmc(counter,low,high) do { \ + u64 _l = paravirt_read_pmc(counter); \ + low = (u32)_l; \ + high = _l >> 32; \ } while(0) -#define load_TR_desc() (paravirt_ops.load_tr_desc()) -#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr)) -#define load_idt(dtr) (paravirt_ops.load_idt(dtr)) -#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries))) -#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr)) -#define store_idt(dtr) (paravirt_ops.store_idt(dtr)) -#define store_tr(tr) ((tr) = paravirt_ops.store_tr()) -#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu))) -#define write_ldt_entry(dt, entry, low, high) \ - (paravirt_ops.write_ldt_entry((dt), (entry), (low), (high))) -#define write_gdt_entry(dt, entry, low, high) \ - (paravirt_ops.write_gdt_entry((dt), (entry), (low), (high))) -#define write_idt_entry(dt, entry, low, high) \ - (paravirt_ops.write_idt_entry((dt), (entry), (low), (high))) -#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) - -#define __pte(x) paravirt_ops.make_pte(x) -#define __pgd(x) paravirt_ops.make_pgd(x) - -#define pte_val(x) paravirt_ops.pte_val(x) -#define pgd_val(x) paravirt_ops.pgd_val(x) - -#ifdef CONFIG_X86_PAE -#define __pmd(x) paravirt_ops.make_pmd(x) -#define pmd_val(x) paravirt_ops.pmd_val(x) -#endif +static inline void load_TR_desc(void) +{ + PVOP_VCALL0(load_tr_desc); +} +static inline void load_gdt(const struct Xgt_desc_struct *dtr) +{ + PVOP_VCALL1(load_gdt, dtr); +} +static inline void load_idt(const struct Xgt_desc_struct *dtr) +{ + PVOP_VCALL1(load_idt, dtr); +} +static inline void set_ldt(const void *addr, unsigned entries) +{ + PVOP_VCALL2(set_ldt, addr, entries); +} +static inline void store_gdt(struct Xgt_desc_struct *dtr) +{ + PVOP_VCALL1(store_gdt, dtr); +} +static inline void store_idt(struct Xgt_desc_struct *dtr) +{ + PVOP_VCALL1(store_idt, dtr); +} +static inline unsigned long paravirt_store_tr(void) +{ + return PVOP_CALL0(unsigned long, store_tr); +} +#define store_tr(tr) ((tr) = paravirt_store_tr()) +static inline void load_TLS(struct thread_struct *t, unsigned cpu) +{ + PVOP_VCALL2(load_tls, t, cpu); +} +static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high) +{ + PVOP_VCALL4(write_ldt_entry, dt, entry, low, high); +} +static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high) +{ + PVOP_VCALL4(write_gdt_entry, dt, entry, low, high); +} +static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high) +{ + PVOP_VCALL4(write_idt_entry, dt, entry, low, high); +} +static inline void set_iopl_mask(unsigned mask) +{ + PVOP_VCALL1(set_iopl_mask, mask); +} /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { @@ -368,27 +657,27 @@ static inline void slow_down_io(void) { */ static inline void apic_write(unsigned long reg, unsigned long v) { - paravirt_ops.apic_write(reg,v); + PVOP_VCALL2(apic_write, reg, v); } static inline void apic_write_atomic(unsigned long reg, unsigned long v) { - paravirt_ops.apic_write_atomic(reg,v); + PVOP_VCALL2(apic_write_atomic, reg, v); } static inline unsigned long apic_read(unsigned long reg) { - return paravirt_ops.apic_read(reg); + return PVOP_CALL1(unsigned long, apic_read, reg); } static inline void setup_boot_clock(void) { - paravirt_ops.setup_boot_clock(); + PVOP_VCALL0(setup_boot_clock); } static inline void setup_secondary_clock(void) { - paravirt_ops.setup_secondary_clock(); + PVOP_VCALL0(setup_secondary_clock); } #endif @@ -408,93 +697,205 @@ static inline void paravirt_pagetable_setup_done(pgd_t *base) static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) { - return paravirt_ops.startup_ipi_hook(phys_apicid, start_eip, start_esp); + PVOP_VCALL3(startup_ipi_hook, phys_apicid, start_eip, start_esp); } #endif static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { - paravirt_ops.activate_mm(prev, next); + PVOP_VCALL2(activate_mm, prev, next); } static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { - paravirt_ops.dup_mmap(oldmm, mm); + PVOP_VCALL2(dup_mmap, oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { - paravirt_ops.exit_mmap(mm); + PVOP_VCALL1(exit_mmap, mm); } -#define __flush_tlb() paravirt_ops.flush_tlb_user() -#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() -#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) +static inline void __flush_tlb(void) +{ + PVOP_VCALL0(flush_tlb_user); +} +static inline void __flush_tlb_global(void) +{ + PVOP_VCALL0(flush_tlb_kernel); +} +static inline void __flush_tlb_single(unsigned long addr) +{ + PVOP_VCALL1(flush_tlb_single, addr); +} -#define paravirt_map_pt_hook(type, va, pfn) paravirt_ops.map_pt_hook(type, va, pfn) +static inline void paravirt_map_pt_hook(int type, pte_t *va, u32 pfn) +{ + PVOP_VCALL3(map_pt_hook, type, va, pfn); +} -#define paravirt_alloc_pt(pfn) paravirt_ops.alloc_pt(pfn) -#define paravirt_release_pt(pfn) paravirt_ops.release_pt(pfn) +static inline void paravirt_alloc_pt(unsigned pfn) +{ + PVOP_VCALL1(alloc_pt, pfn); +} +static inline void paravirt_release_pt(unsigned pfn) +{ + PVOP_VCALL1(release_pt, pfn); +} -#define paravirt_alloc_pd(pfn) paravirt_ops.alloc_pd(pfn) -#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) \ - paravirt_ops.alloc_pd_clone(pfn, clonepfn, start, count) -#define paravirt_release_pd(pfn) paravirt_ops.release_pd(pfn) +static inline void paravirt_alloc_pd(unsigned pfn) +{ + PVOP_VCALL1(alloc_pd, pfn); +} + +static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn, + unsigned start, unsigned count) +{ + PVOP_VCALL4(alloc_pd_clone, pfn, clonepfn, start, count); +} +static inline void paravirt_release_pd(unsigned pfn) +{ + PVOP_VCALL1(release_pd, pfn); +} + +static inline void pte_update(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + PVOP_VCALL3(pte_update, mm, addr, ptep); +} + +static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + PVOP_VCALL3(pte_update_defer, mm, addr, ptep); +} + +#ifdef CONFIG_X86_PAE +static inline pte_t __pte(unsigned long long val) +{ + unsigned long long ret = PVOP_CALL2(unsigned long long, make_pte, + val, val >> 32); + return (pte_t) { ret, ret >> 32 }; +} + +static inline pmd_t __pmd(unsigned long long val) +{ + return (pmd_t) { PVOP_CALL2(unsigned long long, make_pmd, val, val >> 32) }; +} + +static inline pgd_t __pgd(unsigned long long val) +{ + return (pgd_t) { PVOP_CALL2(unsigned long long, make_pgd, val, val >> 32) }; +} + +static inline unsigned long long pte_val(pte_t x) +{ + return PVOP_CALL2(unsigned long long, pte_val, x.pte_low, x.pte_high); +} + +static inline unsigned long long pmd_val(pmd_t x) +{ + return PVOP_CALL2(unsigned long long, pmd_val, x.pmd, x.pmd >> 32); +} + +static inline unsigned long long pgd_val(pgd_t x) +{ + return PVOP_CALL2(unsigned long long, pgd_val, x.pgd, x.pgd >> 32); +} static inline void set_pte(pte_t *ptep, pte_t pteval) { - paravirt_ops.set_pte(ptep, pteval); + PVOP_VCALL3(set_pte, ptep, pteval.pte_low, pteval.pte_high); } static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + /* 5 arg words */ paravirt_ops.set_pte_at(mm, addr, ptep, pteval); } +static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + PVOP_VCALL3(set_pte_atomic, ptep, pteval.pte_low, pteval.pte_high); +} + +static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* 5 arg words */ + paravirt_ops.set_pte_present(mm, addr, ptep, pte); +} + static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) { - paravirt_ops.set_pmd(pmdp, pmdval); -} - -static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep) -{ - paravirt_ops.pte_update(mm, addr, ptep); -} - -static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) -{ - paravirt_ops.pte_update_defer(mm, addr, ptep); -} - -#ifdef CONFIG_X86_PAE -static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) -{ - paravirt_ops.set_pte_atomic(ptep, pteval); -} - -static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - paravirt_ops.set_pte_present(mm, addr, ptep, pte); + PVOP_VCALL3(set_pmd, pmdp, pmdval.pmd, pmdval.pmd >> 32); } static inline void set_pud(pud_t *pudp, pud_t pudval) { - paravirt_ops.set_pud(pudp, pudval); + PVOP_VCALL3(set_pud, pudp, pudval.pgd.pgd, pudval.pgd.pgd >> 32); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - paravirt_ops.pte_clear(mm, addr, ptep); + PVOP_VCALL3(pte_clear, mm, addr, ptep); } static inline void pmd_clear(pmd_t *pmdp) { - paravirt_ops.pmd_clear(pmdp); + PVOP_VCALL1(pmd_clear, pmdp); } -#endif + +static inline pte_t raw_ptep_get_and_clear(pte_t *p) +{ + unsigned long long val = PVOP_CALL1(unsigned long long, ptep_get_and_clear, p); + return (pte_t) { val, val >> 32 }; +} +#else /* !CONFIG_X86_PAE */ +static inline pte_t __pte(unsigned long val) +{ + return (pte_t) { PVOP_CALL1(unsigned long, make_pte, val) }; +} + +static inline pgd_t __pgd(unsigned long val) +{ + return (pgd_t) { PVOP_CALL1(unsigned long, make_pgd, val) }; +} + +static inline unsigned long pte_val(pte_t x) +{ + return PVOP_CALL1(unsigned long, pte_val, x.pte_low); +} + +static inline unsigned long pgd_val(pgd_t x) +{ + return PVOP_CALL1(unsigned long, pgd_val, x.pgd); +} + +static inline void set_pte(pte_t *ptep, pte_t pteval) +{ + PVOP_VCALL2(set_pte, ptep, pteval.pte_low); +} + +static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + PVOP_VCALL4(set_pte_at, mm, addr, ptep, pteval.pte_low); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + PVOP_VCALL2(set_pmd, pmdp, pmdval.pud.pgd.pgd); +} + +static inline pte_t raw_ptep_get_and_clear(pte_t *p) +{ + return (pte_t) { PVOP_CALL1(unsigned long, ptep_get_and_clear, p) }; +} +#endif /* CONFIG_X86_PAE */ /* Lazy mode for batching updates / context switch */ #define PARAVIRT_LAZY_NONE 0 @@ -503,14 +904,37 @@ static inline void pmd_clear(pmd_t *pmdp) #define PARAVIRT_LAZY_FLUSH 3 #define __HAVE_ARCH_ENTER_LAZY_CPU_MODE -#define arch_enter_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_CPU) -#define arch_leave_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE) -#define arch_flush_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_FLUSH) +static inline void arch_enter_lazy_cpu_mode(void) +{ + PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_CPU); +} + +static inline void arch_leave_lazy_cpu_mode(void) +{ + PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE); +} + +static inline void arch_flush_lazy_cpu_mode(void) +{ + PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH); +} + #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_MMU) -#define arch_leave_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE) -#define arch_flush_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_FLUSH) +static inline void arch_enter_lazy_mmu_mode(void) +{ + PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_MMU); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_NONE); +} + +static inline void arch_flush_lazy_mmu_mode(void) +{ + PVOP_VCALL1(set_lazy_mode, PARAVIRT_LAZY_FLUSH); +} void _paravirt_nop(void); #define paravirt_nop ((void *)_paravirt_nop) @@ -603,6 +1027,16 @@ static inline unsigned long __raw_local_irq_save(void) paravirt_clobber(CLBR_EAX) #undef PARAVIRT_CALL +#undef PVOP_VCALL0 +#undef PVOP_CALL0 +#undef PVOP_VCALL1 +#undef PVOP_CALL1 +#undef PVOP_VCALL2 +#undef PVOP_CALL2 +#undef PVOP_VCALL3 +#undef PVOP_CALL3 +#undef PVOP_VCALL4 +#undef PVOP_CALL4 #else /* __ASSEMBLY__ */ From 294688c028e80fd467cdd22da79f62c5f311eaf5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 141/231] [PATCH] i386: PARAVIRT: Document asm-i386/paravirt.h Clean things up, and broadly document: - the paravirt_ops functions themselves - the patching mechanism Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell --- include/asm-i386/paravirt.h | 131 +++++++++++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 10 deletions(-) diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 837457b42dbe..8bfaf10d9961 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -21,6 +21,14 @@ struct Xgt_desc_struct; struct tss_struct; struct mm_struct; struct desc_struct; + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE = 0, + PARAVIRT_LAZY_MMU = 1, + PARAVIRT_LAZY_CPU = 2, +}; + struct paravirt_ops { unsigned int kernel_rpl; @@ -37,22 +45,33 @@ struct paravirt_ops */ unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len); + /* Basic arch-specific setup */ void (*arch_setup)(void); char *(*memory_setup)(void); void (*init_IRQ)(void); + void (*time_init)(void); + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ void (*pagetable_setup_start)(pgd_t *pgd_base); void (*pagetable_setup_done)(pgd_t *pgd_base); + /* Print a banner to identify the environment */ void (*banner)(void); + /* Set and set time of day */ unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long); - void (*time_init)(void); + /* cpuid emulation, mostly so that caps bits can be disabled */ void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); + /* hooks for various privileged instructions */ unsigned long (*get_debugreg)(int regno); void (*set_debugreg)(int regno, unsigned long value); @@ -71,15 +90,23 @@ struct paravirt_ops unsigned long (*read_cr4)(void); void (*write_cr4)(unsigned long); + /* + * Get/set interrupt state. save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + */ unsigned long (*save_fl)(void); void (*restore_fl)(unsigned long); void (*irq_disable)(void); void (*irq_enable)(void); void (*safe_halt)(void); void (*halt)(void); + void (*wbinvd)(void); - /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + /* MSR, PMC and TSR operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ u64 (*read_msr)(unsigned int msr, int *err); int (*write_msr)(unsigned int msr, u64 val); @@ -88,6 +115,7 @@ struct paravirt_ops u64 (*get_scheduled_cycles)(void); unsigned long (*get_cpu_khz)(void); + /* Segment descriptor handling */ void (*load_tr_desc)(void); void (*load_gdt)(const struct Xgt_desc_struct *); void (*load_idt)(const struct Xgt_desc_struct *); @@ -105,9 +133,12 @@ struct paravirt_ops void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t); void (*set_iopl_mask)(unsigned mask); - void (*io_delay)(void); + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ void (*activate_mm)(struct mm_struct *prev, struct mm_struct *next); void (*dup_mmap)(struct mm_struct *oldmm, @@ -115,30 +146,43 @@ struct paravirt_ops void (*exit_mmap)(struct mm_struct *mm); #ifdef CONFIG_X86_LOCAL_APIC + /* + * Direct APIC operations, principally for VMI. Ideally + * these shouldn't be in this interface. + */ void (*apic_write)(unsigned long reg, unsigned long v); void (*apic_write_atomic)(unsigned long reg, unsigned long v); unsigned long (*apic_read)(unsigned long reg); void (*setup_boot_clock)(void); void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); #endif + /* TLB operations */ void (*flush_tlb_user)(void); void (*flush_tlb_kernel)(void); void (*flush_tlb_single)(unsigned long addr); void (*map_pt_hook)(int type, pte_t *va, u32 pfn); + /* Hooks for allocating/releasing pagetable pages */ void (*alloc_pt)(u32 pfn); void (*alloc_pd)(u32 pfn); void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); void (*release_pt)(u32 pfn); void (*release_pd)(u32 pfn); + /* Pagetable manipulation functions */ void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); pte_t (*ptep_get_and_clear)(pte_t *ptep); @@ -164,13 +208,12 @@ struct paravirt_ops pgd_t (*make_pgd)(unsigned long pgd); #endif - void (*set_lazy_mode)(int mode); + /* Set deferred update mode, used for batching operations. */ + void (*set_lazy_mode)(enum paravirt_lazy_mode mode); /* These two are jmp to, not actually called. */ void (*irq_enable_sysexit)(void); void (*iret)(void); - - void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); }; /* Mark a paravirt probe function. */ @@ -188,8 +231,10 @@ extern struct paravirt_ops paravirt_ops; #define paravirt_clobber(clobber) \ [paravirt_clobber] "i" (clobber) -#define PARAVIRT_CALL "call *(paravirt_ops+%c[paravirt_typenum]*4);" - +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ #define _paravirt_alt(insn_string, type, clobber) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ @@ -199,9 +244,74 @@ extern struct paravirt_ops paravirt_ops; " .short " clobber "\n" \ ".popsection\n" +/* Generate patchable code, with the default asm parameters. */ #define paravirt_alt(insn_string) \ _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_ops structure, and can therefore be freely + * converted back into a structure offset. + */ +#define PARAVIRT_CALL "call *(paravirt_ops+%c[paravirt_typenum]*4);" + +/* + * These macros are intended to wrap calls into a paravirt_ops + * operation, so that they can be later identified and patched at + * runtime. + * + * Normally, a call to a pv_op function is a simple indirect call: + * (paravirt_ops.operations)(args...). + * + * Unfortunately, this is a relatively slow operation for modern CPUs, + * because it cannot necessarily determine what the destination + * address is. In this case, the address is a runtime constant, so at + * the very least we can patch the call to e a simple direct call, or + * ideally, patch an inline implementation into the callsite. (Direct + * calls are essentially free, because the call and return addresses + * are completely predictable.) + * + * These macros rely on the standard gcc "regparm(3)" calling + * convention, in which the first three arguments are placed in %eax, + * %edx, %ecx (in that order), and the remaining arguments are placed + * on the stack. All caller-save registers (eax,edx,ecx) are expected + * to be modified (either clobbered or used for return values). + * + * The call instruction itself is marked by placing its start address + * and size into the .parainstructions section, so that + * apply_paravirt() in arch/i386/kernel/alternative.c can do the + * appropriate patching under the control of the backend paravirt_ops + * implementation. + * + * Unfortunately there's no way to get gcc to generate the args setup + * for the call, and then allow the call itself to be generated by an + * inline asm. Because of this, we must do the complete arg setup and + * return value handling from within these macros. This is fairly + * cumbersome. + * + * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. + * It could be extended to more arguments, but there would be little + * to be gained from that. For each number of arguments, there are + * the two VCALL and CALL variants for void and non-void functions. + * + * When there is a return value, the invoker of the macro must specify + * the return type. The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * in low,high order. + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ #define PVOP_CALL0(__rettype, __op) \ ({ \ __rettype __ret; \ @@ -1026,6 +1136,7 @@ static inline unsigned long __raw_local_irq_save(void) [paravirt_sti_type] "i" (PARAVIRT_PATCH(irq_enable)), \ paravirt_clobber(CLBR_EAX) +/* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL #undef PVOP_VCALL0 #undef PVOP_CALL0 From 63f70270ccd981ce40a8ff58c03a8c2e97e368be Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:14 +0200 Subject: [PATCH 142/231] [PATCH] i386: PARAVIRT: add common patching machinery Implement the actual patching machinery. paravirt_patch_default() contains the logic to automatically patch a callsite based on a few simple rules: - if the paravirt_op function is paravirt_nop, then patch nops - if the paravirt_op function is a jmp target, then jmp to it - if the paravirt_op function is callable and doesn't clobber too much for the callsite, call it directly paravirt_patch_default is suitable as a default implementation of paravirt_ops.patch, will remove most of the expensive indirect calls in favour of either a direct call or a pile of nops. Backends may implement their own patcher, however. There are several helper functions to help with this: paravirt_patch_nop nop out a callsite paravirt_patch_ignore leave the callsite as-is paravirt_patch_call patch a call if the caller and callee have compatible clobbers paravirt_patch_jmp patch in a jmp paravirt_patch_insns patch some literal instructions over the callsite, if they fit This patch also implements more direct patches for the native case, so that when running on native hardware many common operations are implemented inline. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Zachary Amsden Cc: Anthony Liguori Acked-by: Ingo Molnar --- arch/i386/kernel/alternative.c | 5 +- arch/i386/kernel/paravirt.c | 156 +++++++++++++++++++++++++++------ include/asm-i386/paravirt.h | 12 +++ 3 files changed, 145 insertions(+), 28 deletions(-) diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index dae3ded9041c..c5d037c60950 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -336,11 +336,14 @@ void apply_paravirt(struct paravirt_patch_site *start, used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, p->len); + BUG_ON(used > p->len); + /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - /* Sync to be conservative, in case we patched following instructions */ + /* Sync to be conservative, in case we patched following + * instructions */ sync_core(); } extern struct paravirt_patch_site __start_parainstructions[], diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index f2982832d3b9..b0ed163e6f70 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -54,40 +54,142 @@ char *memory_setup(void) #define DEF_NATIVE(name, code) \ extern const char start_##name[], end_##name[]; \ asm("start_" #name ": " code "; end_" #name ":") -DEF_NATIVE(cli, "cli"); -DEF_NATIVE(sti, "sti"); -DEF_NATIVE(popf, "push %eax; popf"); -DEF_NATIVE(pushf, "pushf; pop %eax"); -DEF_NATIVE(iret, "iret"); -DEF_NATIVE(sti_sysexit, "sti; sysexit"); -static const struct native_insns -{ - const char *start, *end; -} native_insns[] = { - [PARAVIRT_PATCH(irq_disable)] = { start_cli, end_cli }, - [PARAVIRT_PATCH(irq_enable)] = { start_sti, end_sti }, - [PARAVIRT_PATCH(restore_fl)] = { start_popf, end_popf }, - [PARAVIRT_PATCH(save_fl)] = { start_pushf, end_pushf }, - [PARAVIRT_PATCH(iret)] = { start_iret, end_iret }, - [PARAVIRT_PATCH(irq_enable_sysexit)] = { start_sti_sysexit, end_sti_sysexit }, -}; +DEF_NATIVE(irq_disable, "cli"); +DEF_NATIVE(irq_enable, "sti"); +DEF_NATIVE(restore_fl, "push %eax; popf"); +DEF_NATIVE(save_fl, "pushf; pop %eax"); +DEF_NATIVE(iret, "iret"); +DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(clts, "clts"); +DEF_NATIVE(read_tsc, "rdtsc"); + +DEF_NATIVE(ud2a, "ud2a"); static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) { - unsigned int insn_len; + const unsigned char *start, *end; + unsigned ret; - /* Don't touch it if we don't have a replacement */ - if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) - return len; + switch(type) { +#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site + SITE(irq_disable); + SITE(irq_enable); + SITE(restore_fl); + SITE(save_fl); + SITE(iret); + SITE(irq_enable_sysexit); + SITE(read_cr2); + SITE(read_cr3); + SITE(write_cr3); + SITE(clts); + SITE(read_tsc); +#undef SITE - insn_len = native_insns[type].end - native_insns[type].start; + patch_site: + ret = paravirt_patch_insns(insns, len, start, end); + break; - /* Similarly if we can't fit replacement. */ - if (len < insn_len) - return len; + case PARAVIRT_PATCH(make_pgd): + case PARAVIRT_PATCH(make_pte): + case PARAVIRT_PATCH(pgd_val): + case PARAVIRT_PATCH(pte_val): +#ifdef CONFIG_X86_PAE + case PARAVIRT_PATCH(make_pmd): + case PARAVIRT_PATCH(pmd_val): +#endif + /* These functions end up returning exactly what + they're passed, in the same registers. */ + ret = paravirt_patch_nop(); + break; + + default: + ret = paravirt_patch_default(type, clobbers, insns, len); + break; + } + + return ret; +} + +unsigned paravirt_patch_nop(void) +{ + return 0; +} + +unsigned paravirt_patch_ignore(unsigned len) +{ + return len; +} + +unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, + void *site, u16 site_clobbers, + unsigned len) +{ + unsigned char *call = site; + unsigned long delta = (unsigned long)target - (unsigned long)(call+5); + + if (tgt_clobbers & ~site_clobbers) + return len; /* target would clobber too much for this site */ + if (len < 5) + return len; /* call too long for patch site */ + + *call++ = 0xe8; /* call */ + *(unsigned long *)call = delta; + + return 5; +} + +unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) +{ + unsigned char *jmp = site; + unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); + + if (len < 5) + return len; /* call too long for patch site */ + + *jmp++ = 0xe9; /* jmp */ + *(unsigned long *)jmp = delta; + + return 5; +} + +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len) +{ + void *opfunc = *((void **)¶virt_ops + type); + unsigned ret; + + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a); + else if (opfunc == paravirt_nop) + /* If the operation is a nop, then nop the callsite */ + ret = paravirt_patch_nop(); + else if (type == PARAVIRT_PATCH(iret) || + type == PARAVIRT_PATCH(irq_enable_sysexit)) + /* If operation requires a jmp, then jmp */ + ret = paravirt_patch_jmp(opfunc, site, len); + else + /* Otherwise call the function; assume target could + clobber any caller-save reg */ + ret = paravirt_patch_call(opfunc, CLBR_ANY, + site, clobbers, len); + + return ret; +} + +unsigned paravirt_patch_insns(void *site, unsigned len, + const char *start, const char *end) +{ + unsigned insn_len = end - start; + + if (insn_len > len || start == NULL) + insn_len = len; + else + memcpy(site, start, insn_len); - memcpy(insns, native_insns[type].start, insn_len); return insn_len; } @@ -110,7 +212,7 @@ static void native_flush_tlb_global(void) __native_flush_tlb_global(); } -static void native_flush_tlb_single(u32 addr) +static void native_flush_tlb_single(unsigned long addr) { __native_flush_tlb_single(addr); } diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 8bfaf10d9961..4b3d50858670 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -248,6 +248,18 @@ extern struct paravirt_ops paravirt_ops; #define paravirt_alt(insn_string) \ _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") +unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ignore(unsigned len); +unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, + void *site, u16 site_clobbers, + unsigned len); +unsigned paravirt_patch_jmp(void *target, void *site, unsigned len); +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len); + +unsigned paravirt_patch_insns(void *site, unsigned len, + const char *start, const char *end); + + /* * This generates an indirect call based on the operation type number. * The type number, computed in PARAVIRT_PATCH, is derived from the From d4c104771a1c58e3de2a888b73b0ba1b54c0ae76 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 143/231] [PATCH] i386: PARAVIRT: add flush_tlb_others paravirt_op This patch adds a pv_op for flush_tlb_others. Linux running on native hardware uses cross-CPU IPIs to flush the TLB on any CPU which may have a particular mm's pagetable entries cached in its TLB. This is inefficient in a paravirtualized environment, since the hypervisor knows which real CPUs actually contain cached mappings, which may be a small subset of a guest's VCPUs. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/paravirt.c | 1 + arch/i386/kernel/smp.c | 13 +++++++------ include/asm-i386/paravirt.h | 9 +++++++++ include/asm-i386/tlbflush.h | 19 +++++++++++++++++-- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index b0ed163e6f70..c7f0cf92925f 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -300,6 +300,7 @@ struct paravirt_ops paravirt_ops = { .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_others = native_flush_tlb_others, .map_pt_hook = paravirt_nop, diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 9d84f6f001bf..892cd64130bc 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -256,7 +256,6 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff /* * We cannot call mmdrop() because we are in interrupt context, @@ -338,7 +337,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); @@ -353,9 +352,11 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) put_cpu_no_resched(); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { + cpumask_t cpumask = *cpumaskp; + /* * A couple of (to be removed) sanity checks: * @@ -417,7 +418,7 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -436,7 +437,7 @@ void flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 4b3d50858670..f880b06d6d56 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -15,6 +15,7 @@ #ifndef __ASSEMBLY__ #include +#include struct thread_struct; struct Xgt_desc_struct; @@ -165,6 +166,8 @@ struct paravirt_ops void (*flush_tlb_user)(void); void (*flush_tlb_kernel)(void); void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, + unsigned long va); void (*map_pt_hook)(int type, pte_t *va, u32 pfn); @@ -853,6 +856,12 @@ static inline void __flush_tlb_single(unsigned long addr) PVOP_VCALL1(flush_tlb_single, addr); } +static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) +{ + PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va); +} + static inline void paravirt_map_pt_hook(int type, pte_t *va, u32 pfn) { PVOP_VCALL3(map_pt_hook, type, va, pfn); diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h index 4dd82840d53b..db7f77eacfa0 100644 --- a/include/asm-i386/tlbflush.h +++ b/include/asm-i386/tlbflush.h @@ -79,11 +79,15 @@ * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables + * - flush_tlb_others(cpumask, mm, va) flushes a TLBs on other cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. */ +#define TLB_FLUSH_ALL 0xffffffff + + #ifndef CONFIG_SMP #define flush_tlb() __flush_tlb() @@ -110,7 +114,12 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, __flush_tlb(); } -#else +static inline void native_flush_tlb_others(const cpumask_t *cpumask, + struct mm_struct *mm, unsigned long va) +{ +} + +#else /* SMP */ #include @@ -129,6 +138,9 @@ static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long st flush_tlb_mm(vma->vm_mm); } +void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, + unsigned long va); + #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 @@ -139,8 +151,11 @@ struct tlb_state char __cacheline_padding[L1_CACHE_BYTES-8]; }; DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); +#endif /* SMP */ - +#ifndef CONFIG_PARAVIRT +#define flush_tlb_others(mask, mm, va) \ + native_flush_tlb_others(&mask, mm, va) #endif #define flush_tlb_kernel_range(start, end) flush_tlb_all() From a27fe809b82c5e18932fcceded28d0d1481ce7bb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 144/231] [PATCH] i386: PARAVIRT: revert map_pt_hook. Back out the map_pt_hook to clear the way for kmap_atomic_pte. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden --- arch/i386/kernel/paravirt.c | 2 -- arch/i386/kernel/vmi.c | 2 ++ include/asm-i386/paravirt.h | 7 ------- include/asm-i386/pgtable.h | 23 ++++------------------- 4 files changed, 6 insertions(+), 28 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index c7f0cf92925f..13f41b5c887a 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -302,8 +302,6 @@ struct paravirt_ops paravirt_ops = { .flush_tlb_single = native_flush_tlb_single, .flush_tlb_others = native_flush_tlb_others, - .map_pt_hook = paravirt_nop, - .alloc_pt = paravirt_nop, .alloc_pd = paravirt_nop, .alloc_pd_clone = paravirt_nop, diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index b8d01c3cbff4..ccad7ee960aa 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -851,8 +851,10 @@ static inline int __init activate_vmi(void) paravirt_ops.release_pt = vmi_release_pt; paravirt_ops.release_pd = vmi_release_pd; } +#if 0 para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, SetLinearMapping); +#endif /* * These MUST always be patched. Don't support indirect jumps diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index f880b06d6d56..10f44af76b49 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -169,8 +169,6 @@ struct paravirt_ops void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, unsigned long va); - void (*map_pt_hook)(int type, pte_t *va, u32 pfn); - /* Hooks for allocating/releasing pagetable pages */ void (*alloc_pt)(u32 pfn); void (*alloc_pd)(u32 pfn); @@ -862,11 +860,6 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va); } -static inline void paravirt_map_pt_hook(int type, pte_t *va, u32 pfn) -{ - PVOP_VCALL3(map_pt_hook, type, va, pfn); -} - static inline void paravirt_alloc_pt(unsigned pfn) { PVOP_VCALL1(alloc_pt, pfn); diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 5b88a6a1278e..6599f2aa91b7 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -267,7 +267,6 @@ extern void vmalloc_sync_all(void); */ #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0) #define raw_ptep_get_and_clear(xp) native_ptep_get_and_clear(xp) #endif @@ -476,24 +475,10 @@ extern pte_t *lookup_address(unsigned long address); #endif #if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ -({ \ - pte_t *__ptep; \ - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \ - __ptep = (pte_t *)kmap_atomic(pfn_to_page(pfn),KM_PTE0);\ - paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \ - __ptep = __ptep + pte_index(address); \ - __ptep; \ -}) -#define pte_offset_map_nested(dir, address) \ -({ \ - pte_t *__ptep; \ - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \ - __ptep = (pte_t *)kmap_atomic(pfn_to_page(pfn),KM_PTE1);\ - paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \ - __ptep = __ptep + pte_index(address); \ - __ptep; \ -}) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else From ce6234b5298902aaec831a67d5f8d9bd2ef5a488 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 145/231] [PATCH] i386: PARAVIRT: add kmap_atomic_pte for mapping highpte pages Xen and VMI both have special requirements when mapping a highmem pte page into the kernel address space. These can be dealt with by adding a new kmap_atomic_pte() function for mapping highptes, and hooking it into the paravirt_ops infrastructure. Xen specifically wants to map the pte page RO, so this patch exposes a helper function, kmap_atomic_prot, which maps the page with the specified page protections. This also adds a kmap_flush_unused() function to clear out the cached kmap mappings. Xen needs this to clear out any potential stray RW mappings of pages which will become part of a pagetable. [ Zach - vmi.c will need some attention after this patch. It wasn't immediately obvious to me what needs to be done. ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Zachary Amsden --- arch/i386/kernel/paravirt.c | 5 +++++ arch/i386/mm/highmem.c | 9 +++++++-- include/asm-i386/highmem.h | 6 ++++++ include/asm-i386/paravirt.h | 15 +++++++++++++++ include/asm-i386/pgtable.h | 4 ++-- include/linux/highmem.h | 6 ++++++ mm/highmem.c | 9 +++++++++ 7 files changed, 50 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 13f41b5c887a..596f382c641c 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -316,6 +317,10 @@ struct paravirt_ops paravirt_ops = { .ptep_get_and_clear = native_ptep_get_and_clear, +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = kmap_atomic, +#endif + #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index ac70d09df7ee..a1a21abf742e 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -26,7 +26,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -41,12 +41,17 @@ void *kmap_atomic(struct page *page, enum km_type type) return page_address(page); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); arch_flush_lazy_mmu_mode(); return (void*) vaddr; } +void *kmap_atomic(struct page *page, enum km_type type) +{ + return kmap_atomic_prot(page, type, kmap_prot); +} + void kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h index e9a34ebc25d5..13cdcd66fff2 100644 --- a/include/asm-i386/highmem.h +++ b/include/asm-i386/highmem.h @@ -24,6 +24,7 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; @@ -67,11 +68,16 @@ extern void FASTCALL(kunmap_high(struct page *page)); void *kmap(struct page *page); void kunmap(struct page *page); +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); void kunmap_atomic(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); struct page *kmap_atomic_to_page(void *ptr); +#ifndef CONFIG_PARAVIRT +#define kmap_atomic_pte(page, type) kmap_atomic(page, type) +#endif + #define flush_cache_kmaps() do { } while (0) #endif /* __KERNEL__ */ diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 10f44af76b49..5048b41428fa 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -16,7 +16,9 @@ #ifndef __ASSEMBLY__ #include #include +#include +struct page; struct thread_struct; struct Xgt_desc_struct; struct tss_struct; @@ -187,6 +189,10 @@ struct paravirt_ops pte_t (*ptep_get_and_clear)(pte_t *ptep); +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); @@ -884,6 +890,15 @@ static inline void paravirt_release_pd(unsigned pfn) PVOP_VCALL1(release_pd, pfn); } +#ifdef CONFIG_HIGHPTE +static inline void *kmap_atomic_pte(struct page *page, enum km_type type) +{ + unsigned long ret; + ret = PVOP_CALL2(unsigned long, kmap_atomic_pte, page, type); + return (void *)ret; +} +#endif + static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 6599f2aa91b7..befc697821e5 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -476,9 +476,9 @@ extern pte_t *lookup_address(unsigned long address); #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 645d440807c2..bca8e2dfa355 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -27,6 +27,8 @@ static inline void flush_kernel_dcache_page(struct page *page) unsigned int nr_free_highpages(void); extern unsigned long totalhigh_pages; +void kmap_flush_unused(void); + #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } @@ -44,9 +46,13 @@ static inline void *kmap(struct page *page) #define kmap_atomic(page, idx) \ ({ pagefault_disable(); page_address(page); }) +#define kmap_atomic_prot(page, idx, prot) kmap_atomic(page, idx) + #define kunmap_atomic(addr, idx) do { pagefault_enable(); } while (0) #define kmap_atomic_pfn(pfn, idx) kmap_atomic(pfn_to_page(pfn), (idx)) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) + +#define kmap_flush_unused() do {} while(0) #endif #endif /* CONFIG_HIGHMEM */ diff --git a/mm/highmem.c b/mm/highmem.c index 51e1c1995fec..be8f8d36a8b9 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -99,6 +99,15 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } +/* Flush all unused kmap mappings in order to remove stray + mappings. */ +void kmap_flush_unused(void) +{ + spin_lock(&kmap_lock); + flush_all_zero_pkmaps(); + spin_unlock(&kmap_lock); +} + static inline unsigned long map_new_virtual(struct page *page) { unsigned long vaddr; From 7b2f27f4e1818fad980da7bea688dca2b9e9c3f3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 146/231] [PATCH] i386: PARAVIRT: flush lazy mmu updates on kunmap_atomic kunmap_atomic should flush any pending lazy mmu updates, mainly to be consistent with kmap_atomic, and to preserve its normal behaviour. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/mm/highmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index a1a21abf742e..ad8d86cc683e 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -72,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } + arch_flush_lazy_mmu_mode(); pagefault_enable(); } From 4e0fa85602a4fa219fc3a9c053d5140bf987d3e3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 147/231] [PATCH] i386: PARAVIRT: Use enums for paravirt lazy flush modi Remove #defines, add enum for PARAVIRT_LAZY_FLUSH. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- include/asm-i386/paravirt.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 5048b41428fa..c5451923c79e 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -30,6 +30,7 @@ enum paravirt_lazy_mode { PARAVIRT_LAZY_NONE = 0, PARAVIRT_LAZY_MMU = 1, PARAVIRT_LAZY_CPU = 2, + PARAVIRT_LAZY_FLUSH = 3, }; struct paravirt_ops @@ -1036,12 +1037,6 @@ static inline pte_t raw_ptep_get_and_clear(pte_t *p) } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -#define PARAVIRT_LAZY_NONE 0 -#define PARAVIRT_LAZY_MMU 1 -#define PARAVIRT_LAZY_CPU 2 -#define PARAVIRT_LAZY_FLUSH 3 - #define __HAVE_ARCH_ENTER_LAZY_CPU_MODE static inline void arch_enter_lazy_cpu_mode(void) { From 1a45b7aaa5051489b46afbc48509bd91f8b4a1ba Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 148/231] [PATCH] i386: PARAVIRT: Clean up paravirt patchable wrappers Replace all the open-coded macros for generating calls with a pair of more general macros (__PVOP_CALL/VCALL), and redefine all the PVOP_V?CALL[0-4] in terms of them. [ Andrew, Andi: this should slot in immediately after "Document asm-i386/paravirt.h" (paravirt_ops-document-asm-i386-paravirth.patch) ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Ingo Molnar --- include/asm-i386/paravirt.h | 254 +++++++++--------------------------- 1 file changed, 63 insertions(+), 191 deletions(-) diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index c5451923c79e..2ba18963e114 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -332,211 +332,81 @@ unsigned paravirt_patch_insns(void *site, unsigned len, * means that all uses must be wrapped in inline functions. This also * makes sure the incoming and outgoing types are always correct. */ -#define PVOP_CALL0(__rettype, __op) \ - ({ \ - __rettype __ret; \ - if (sizeof(__rettype) > sizeof(unsigned long)) { \ - unsigned long long __tmp; \ - unsigned long __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=A" (__tmp), "=c" (__ecx) \ - : paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } else { \ - unsigned long __tmp, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__tmp), "=d" (__edx), \ - "=c" (__ecx) \ - : paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } \ - __ret; \ - }) -#define PVOP_VCALL0(__op) \ +#define __PVOP_CALL(rettype, op, pre, post, ...) \ ({ \ + rettype __ret; \ unsigned long __eax, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ - : paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - }) - -#define PVOP_CALL1(__rettype, __op, arg1) \ - ({ \ - __rettype __ret; \ - if (sizeof(__rettype) > sizeof(unsigned long)) { \ - unsigned long long __tmp; \ - unsigned long __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=A" (__tmp), "=c" (__ecx) \ - : "a" ((u32)(arg1)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } else { \ - unsigned long __tmp, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__tmp), "=d" (__edx), \ - "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } \ - __ret; \ - }) -#define PVOP_VCALL1(__op, arg1) \ - ({ \ - unsigned long __eax, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - }) - -#define PVOP_CALL2(__rettype, __op, arg1, arg2) \ - ({ \ - __rettype __ret; \ - if (sizeof(__rettype) > sizeof(unsigned long)) { \ - unsigned long long __tmp; \ - unsigned long __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=A" (__tmp), "=c" (__ecx) \ - : "a" ((u32)(arg1)), \ - "d" ((u32)(arg2)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } else { \ - unsigned long __tmp, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__tmp), "=d" (__edx), \ - "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - "1" ((u32)(arg2)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } \ - __ret; \ - }) -#define PVOP_VCALL2(__op, arg1, arg2) \ - ({ \ - unsigned long __eax, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - "1" ((u32)(arg2)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - }) - -#define PVOP_CALL3(__rettype, __op, arg1, arg2, arg3) \ - ({ \ - __rettype __ret; \ - if (sizeof(__rettype) > sizeof(unsigned long)) { \ - unsigned long long __tmp; \ - unsigned long __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=A" (__tmp), "=c" (__ecx) \ - : "a" ((u32)(arg1)), \ - "d" ((u32)(arg2)), \ - "1" ((u32)(arg3)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } else { \ - unsigned long __tmp, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__tmp), "=d" (__edx), \ - "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ - } \ - __ret; \ - }) -#define PVOP_VCALL3(__op, arg1, arg2, arg3) \ - ({ \ - unsigned long __eax, __edx, __ecx; \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ - : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc"); \ - }) - -#define PVOP_CALL4(__rettype, __op, arg1, arg2, arg3, arg4) \ - ({ \ - __rettype __ret; \ - if (sizeof(__rettype) > sizeof(unsigned long)) { \ - unsigned long long __tmp; \ - unsigned long __ecx; \ - asm volatile("push %[_arg4]; " \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ - "lea 4(%%esp),%%esp" \ - : "=A" (__tmp), "=c" (__ecx) \ - : "a" ((u32)(arg1)), \ - "d" ((u32)(arg2)), \ - "1" ((u32)(arg3)), \ - [_arg4] "mr" ((u32)(arg4)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ - : "memory", "cc",); \ - __ret = (__rettype)__tmp; \ - } else { \ - unsigned long __tmp, __edx, __ecx; \ - asm volatile("push %[_arg4]; " \ - paravirt_alt(PARAVIRT_CALL) \ - "lea 4(%%esp),%%esp" \ - : "=a" (__tmp), "=d" (__edx), "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), \ - [_arg4]"mr" ((u32)(arg4)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ + post \ + : "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) \ + : paravirt_type(op), \ + paravirt_clobber(CLBR_ANY), \ + ##__VA_ARGS__ \ : "memory", "cc"); \ - __ret = (__rettype)__tmp; \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) \ + : paravirt_type(op), \ + paravirt_clobber(CLBR_ANY), \ + ##__VA_ARGS__ \ + : "memory", "cc"); \ + __ret = (rettype)__eax; \ } \ __ret; \ }) -#define PVOP_VCALL4(__op, arg1, arg2, arg3, arg4) \ +#define __PVOP_VCALL(op, pre, post, ...) \ ({ \ unsigned long __eax, __edx, __ecx; \ - asm volatile("push %[_arg4]; " \ + asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ - "lea 4(%%esp),%%esp" \ + post \ : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \ - : "0" ((u32)(arg1)), \ - "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), \ - [_arg4]"mr" ((u32)(arg4)), \ - paravirt_type(__op), \ - paravirt_clobber(CLBR_ANY) \ + : paravirt_type(op), \ + paravirt_clobber(CLBR_ANY), \ + ##__VA_ARGS__ \ : "memory", "cc"); \ }) +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1))) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", "0" ((u32)(arg1))) + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2))) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2))) + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), \ + "1"((u32)(arg2)), "2"((u32)(arg3))) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)), \ + "2"((u32)(arg3))) + +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) + static inline int paravirt_enabled(void) { return paravirt_ops.paravirt_enabled; @@ -1162,6 +1032,8 @@ static inline unsigned long __raw_local_irq_save(void) /* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL +#undef __PVOP_CALL +#undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 From 4cdd9c8931767e1c56a51a1078d33a8c340f4405 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 149/231] [PATCH] i386: PARAVIRT: drop unused ptep_get_and_clear In shadow mode hypervisors, ptep_get_and_clear achieves the desired purpose of keeping the shadows in sync by issuing a native_get_and_clear, followed by a call to pte_update, which indicates the PTE has been modified. Direct mode hypervisors (Xen) have no need for this anyway, and will trap the update using writable pagetables. This means no hypervisor makes use of ptep_get_and_clear; there is no reason to have it in the paravirt-ops structure. Change confusing terminology about raw vs. native functions into consistent use of native_pte_xxx for operations which do not invoke paravirt-ops. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/paravirt.c | 2 -- include/asm-i386/paravirt.h | 13 +------------ include/asm-i386/pgtable.h | 4 +--- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 596f382c641c..c4850ddd6a9d 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -315,8 +315,6 @@ struct paravirt_ops paravirt_ops = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .ptep_get_and_clear = native_ptep_get_and_clear, - #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = kmap_atomic, #endif diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 2ba18963e114..e2e7f98723c5 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -188,8 +188,6 @@ struct paravirt_ops void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - pte_t (*ptep_get_and_clear)(pte_t *ptep); - #ifdef CONFIG_HIGHPTE void *(*kmap_atomic_pte)(struct page *page, enum km_type type); #endif @@ -859,12 +857,8 @@ static inline void pmd_clear(pmd_t *pmdp) PVOP_VCALL1(pmd_clear, pmdp); } -static inline pte_t raw_ptep_get_and_clear(pte_t *p) -{ - unsigned long long val = PVOP_CALL1(unsigned long long, ptep_get_and_clear, p); - return (pte_t) { val, val >> 32 }; -} #else /* !CONFIG_X86_PAE */ + static inline pte_t __pte(unsigned long val) { return (pte_t) { PVOP_CALL1(unsigned long, make_pte, val) }; @@ -900,11 +894,6 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) { PVOP_VCALL2(set_pmd, pmdp, pmdval.pud.pgd.pgd); } - -static inline pte_t raw_ptep_get_and_clear(pte_t *p) -{ - return (pte_t) { PVOP_CALL1(unsigned long, ptep_get_and_clear, p) }; -} #endif /* CONFIG_X86_PAE */ #define __HAVE_ARCH_ENTER_LAZY_CPU_MODE diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index befc697821e5..e7ddd2341309 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -267,8 +267,6 @@ extern void vmalloc_sync_all(void); */ #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) - -#define raw_ptep_get_and_clear(xp) native_ptep_get_and_clear(xp) #endif /* @@ -335,7 +333,7 @@ do { \ #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = raw_ptep_get_and_clear(ptep); + pte_t pte = native_ptep_get_and_clear(ptep); pte_update(mm, addr, ptep); return pte; } From 35c7422649ee7a3d0eb4ebd32c997eeb45f81046 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 150/231] [PATCH] x86: deflate stack usage in lib/inflate.c inflate_fixed and huft_build together use around 2.7k of stack. When using 4k stacks, I saw stack overflows from interrupts arriving while unpacking the root initrd: do_IRQ: stack overflow: 384 [] show_trace_log_lvl+0x1a/0x30 [] show_trace+0x12/0x14 [] dump_stack+0x16/0x18 [] do_IRQ+0x6d/0xd9 [] xen_evtchn_do_upcall+0x6e/0xa2 [] xen_hypervisor_callback+0x25/0x2c [] xen_restore_fl+0x27/0x29 [] _spin_unlock_irqrestore+0x4a/0x50 [] change_page_attr+0x577/0x584 [] kernel_map_pages+0x8d/0xb4 [] cache_alloc_refill+0x53f/0x632 [] __kmalloc+0xc1/0x10d [] malloc+0x10/0x12 [] huft_build+0x2a7/0x5fa [] inflate_fixed+0x91/0x136 [] unpack_to_rootfs+0x5f2/0x8c1 [] populate_rootfs+0x1e/0xe4 (This was under Xen, but there's no reason it couldn't happen on bare hardware.) This patch mallocs the local variables, thereby reducing the stack usage to sane levels. Also, up the heap size for the kernel decompressor to deal with the extra allocation. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Tim Yamin Cc: Andi Kleen Cc: Matt Mackall Cc: Ivan Kokshaysky Cc: Richard Henderson Cc: Russell King Cc: Ian Molton --- arch/alpha/boot/misc.c | 2 +- arch/arm/boot/compressed/misc.c | 2 +- arch/arm26/boot/compressed/misc.c | 2 +- arch/i386/boot/compressed/misc.c | 2 +- arch/x86_64/boot/compressed/misc.c | 2 +- lib/inflate.c | 66 ++++++++++++++++++++++-------- 6 files changed, 54 insertions(+), 22 deletions(-) diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c index 1d65adf5691e..c00646b25f6e 100644 --- a/arch/alpha/boot/misc.c +++ b/arch/alpha/boot/misc.c @@ -98,7 +98,7 @@ extern int end; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../lib/inflate.c" diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c index 283891c736c4..9b444022cb9b 100644 --- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -239,7 +239,7 @@ extern int end; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../../lib/inflate.c" diff --git a/arch/arm26/boot/compressed/misc.c b/arch/arm26/boot/compressed/misc.c index f17f50e5516f..0714d19c5776 100644 --- a/arch/arm26/boot/compressed/misc.c +++ b/arch/arm26/boot/compressed/misc.c @@ -182,7 +182,7 @@ extern int end; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../../lib/inflate.c" diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 1ce7017fd627..b28505c544c9 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -189,7 +189,7 @@ static void putstr(const char *); static unsigned long free_mem_ptr; static unsigned long free_mem_end_ptr; -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static char *vidmem = (char *)0xb8000; static int vidport; diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index fed1167159c3..f932b0e89096 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -189,7 +189,7 @@ static void putstr(const char *); static long free_mem_ptr; static long free_mem_end_ptr; -#define HEAP_SIZE 0x6000 +#define HEAP_SIZE 0x7000 static char *vidmem = (char *)0xb8000; static int vidport; diff --git a/lib/inflate.c b/lib/inflate.c index 6db6e98d1637..88a22f4a49df 100644 --- a/lib/inflate.c +++ b/lib/inflate.c @@ -292,7 +292,6 @@ STATIC int INIT huft_build( oversubscribed set of lengths), and three if not enough memory. */ { unsigned a; /* counter for codes of length k */ - unsigned c[BMAX+1]; /* bit length count table */ unsigned f; /* i repeats in table every f entries */ int g; /* maximum code length */ int h; /* table level */ @@ -303,18 +302,33 @@ STATIC int INIT huft_build( register unsigned *p; /* pointer into c[], b[], or v[] */ register struct huft *q; /* points to current table */ struct huft r; /* table entry for structure assignment */ - struct huft *u[BMAX]; /* table stack */ - unsigned v[N_MAX]; /* values in order of bit length */ register int w; /* bits before this table == (l * h) */ - unsigned x[BMAX+1]; /* bit offsets, then code stack */ unsigned *xp; /* pointer into x */ int y; /* number of dummy codes added */ unsigned z; /* number of entries in current table */ + struct { + unsigned c[BMAX+1]; /* bit length count table */ + struct huft *u[BMAX]; /* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + unsigned x[BMAX+1]; /* bit offsets, then code stack */ + } *stk; + unsigned *c, *v, *x; + struct huft **u; + int ret; DEBG("huft1 "); + stk = malloc(sizeof(*stk)); + if (stk == NULL) + return 3; /* out of memory */ + + c = stk->c; + v = stk->v; + x = stk->x; + u = stk->u; + /* Generate counts for each bit length */ - memzero(c, sizeof(c)); + memzero(stk->c, sizeof(stk->c)); p = b; i = n; do { Tracecv(*p, (stderr, (n-i >= ' ' && n-i <= '~' ? "%c %d\n" : "0x%x %d\n"), @@ -326,7 +340,8 @@ DEBG("huft1 "); { *t = (struct huft *)NULL; *m = 0; - return 2; + ret = 2; + goto out; } DEBG("huft2 "); @@ -351,10 +366,14 @@ DEBG("huft3 "); /* Adjust last length count to fill out codes, if needed */ for (y = 1 << j; j < i; j++, y <<= 1) - if ((y -= c[j]) < 0) - return 2; /* bad input: more codes than bits */ - if ((y -= c[i]) < 0) - return 2; + if ((y -= c[j]) < 0) { + ret = 2; /* bad input: more codes than bits */ + goto out; + } + if ((y -= c[i]) < 0) { + ret = 2; + goto out; + } c[i] += y; DEBG("huft4 "); @@ -428,7 +447,8 @@ DEBG1("3 "); { if (h) huft_free(u[0]); - return 3; /* not enough memory */ + ret = 3; /* not enough memory */ + goto out; } DEBG1("4 "); hufts += z + 1; /* track memory usage */ @@ -492,7 +512,11 @@ DEBG("h6f "); DEBG("huft7 "); /* Return true (1) if we were given an incomplete table */ - return y != 0 && g != 1; + ret = y != 0 && g != 1; + + out: + free(stk); + return ret; } @@ -705,10 +729,14 @@ STATIC int noinline INIT inflate_fixed(void) struct huft *td; /* distance code table */ int bl; /* lookup bits for tl */ int bd; /* lookup bits for td */ - unsigned l[288]; /* length list for huft_build */ + unsigned *l; /* length list for huft_build */ DEBG(" 1) { huft_free(tl); + free(l); DEBG(">"); return i; @@ -737,11 +767,13 @@ DEBG(" Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 151/231] [PATCH] x86-64: deflate inflate_dynamic too inflate_dynamic() has piggy stack usage too, so heap allocate it too. I'm not sure it actually gets used, but it shows up large in "make checkstack". Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- lib/inflate.c | 61 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/lib/inflate.c b/lib/inflate.c index 88a22f4a49df..845f91d3ac12 100644 --- a/lib/inflate.c +++ b/lib/inflate.c @@ -798,16 +798,19 @@ STATIC int noinline INIT inflate_dynamic(void) unsigned nb; /* number of bit length codes */ unsigned nl; /* number of literal/length codes */ unsigned nd; /* number of distance codes */ -#ifdef PKZIP_BUG_WORKAROUND - unsigned ll[288+32]; /* literal/length and distance code lengths */ -#else - unsigned ll[286+30]; /* literal/length and distance code lengths */ -#endif + unsigned *ll; /* literal/length and distance code lengths */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ + int ret; DEBG(" 286 || nd > 30) #endif - return 1; /* bad lengths */ + { + ret = 1; /* bad lengths */ + goto out; + } DEBG("dyn1 "); @@ -850,7 +856,8 @@ DEBG("dyn2 "); { if (i == 1) huft_free(tl); - return i; /* incomplete code set */ + ret = i; /* incomplete code set */ + goto out; } DEBG("dyn3 "); @@ -872,8 +879,10 @@ DEBG("dyn3 "); NEEDBITS(2) j = 3 + ((unsigned)b & 3); DUMPBITS(2) - if ((unsigned)i + j > n) - return 1; + if ((unsigned)i + j > n) { + ret = 1; + goto out; + } while (j--) ll[i++] = l; } @@ -882,8 +891,10 @@ DEBG("dyn3 "); NEEDBITS(3) j = 3 + ((unsigned)b & 7); DUMPBITS(3) - if ((unsigned)i + j > n) - return 1; + if ((unsigned)i + j > n) { + ret = 1; + goto out; + } while (j--) ll[i++] = 0; l = 0; @@ -893,8 +904,10 @@ DEBG("dyn3 "); NEEDBITS(7) j = 11 + ((unsigned)b & 0x7f); DUMPBITS(7) - if ((unsigned)i + j > n) - return 1; + if ((unsigned)i + j > n) { + ret = 1; + goto out; + } while (j--) ll[i++] = 0; l = 0; @@ -923,7 +936,8 @@ DEBG("dyn5b "); error("incomplete literal tree"); huft_free(tl); } - return i; /* incomplete code set */ + ret = i; /* incomplete code set */ + goto out; } DEBG("dyn5c "); bd = dbits; @@ -939,15 +953,18 @@ DEBG("dyn5d "); huft_free(td); } huft_free(tl); - return i; /* incomplete code set */ + ret = i; /* incomplete code set */ + goto out; #endif } DEBG("dyn6 "); /* decompress until an end-of-block code */ - if (inflate_codes(tl, td, bl, bd)) - return 1; + if (inflate_codes(tl, td, bl, bd)) { + ret = 1; + goto out; + } DEBG("dyn7 "); @@ -956,10 +973,14 @@ DEBG("dyn7 "); huft_free(td); DEBG(">"); - return 0; + ret = 0; +out: + free(ll); + return ret; - underrun: - return 4; /* Input underrun */ +underrun: + ret = 4; /* Input underrun */ + goto out; } From 7a61d35d4b4056e7711031202da7605e052f4137 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:15 +0200 Subject: [PATCH 152/231] [PATCH] i386: Page-align the GDT Xen wants a dedicated page for the GDT. I believe VMI likes it too. lguest, KVM and native don't care. Simple transformation to page-aligned "struct gdt_page". Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Acked-by: Jeremy Fitzhardinge --- arch/i386/kernel/cpu/common.c | 6 +++--- arch/i386/kernel/entry.S | 2 +- arch/i386/kernel/head.S | 2 +- arch/i386/kernel/traps.c | 2 +- include/asm-i386/desc.h | 9 +++++++-- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 58128585ae60..7a4c036d93c8 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -22,7 +22,7 @@ #include "cpu.h" -DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = { +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, @@ -48,8 +48,8 @@ DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = { [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, [GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */ -}; -EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt); +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); DEFINE_PER_CPU(struct i386_pda, _cpu_pda); EXPORT_PER_CPU_SYMBOL(_cpu_pda); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index e07473c0d3e7..3e4aa1fd33e2 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -557,7 +557,7 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ movl %fs:PDA_cpu, %ebx; \ - PER_CPU(cpu_gdt, %ebx); \ + PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ pushl $__KERNEL_DS; \ diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index cc46494787e8..bb36c24311b4 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -598,7 +598,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__cpu_gdt /* Overwritten for secondary CPUs */ + .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index e0a23bee6967..f21b41e7770c 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1030,7 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, fastcall unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - struct desc_struct *gdt = __get_cpu_var(cpu_gdt); + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 4a974064e928..c547403f341d 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -18,10 +18,15 @@ struct Xgt_desc_struct { unsigned short pad; } __attribute__ ((packed)); -DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]); +struct gdt_page +{ + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU(struct gdt_page, gdt_page); + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { - return per_cpu(cpu_gdt, cpu); + return per_cpu(gdt_page, cpu).gdt; } extern struct Xgt_desc_struct idt_descr; From 7c3576d261ce046789a7db14f43303f8120910c7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 153/231] [PATCH] i386: Convert PDA into the percpu section Currently x86 (similar to x84-64) has a special per-cpu structure called "i386_pda" which can be easily and efficiently referenced via the %fs register. An ELF section is more flexible than a structure, allowing any piece of code to use this area. Indeed, such a section already exists: the per-cpu area. So this patch: (1) Removes the PDA and uses per-cpu variables for each current member. (2) Replaces the __KERNEL_PDA segment with __KERNEL_PERCPU. (3) Creates a per-cpu mirror of __per_cpu_offset called this_cpu_off, which can be used to calculate addresses for this CPU's variables. (4) Simplifies startup, because %fs doesn't need to be loaded with a special segment at early boot; it can be deferred until the first percpu area is allocated (or never for UP). The result is less code and one less x86-specific concept. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen --- arch/i386/kernel/asm-offsets.c | 5 -- arch/i386/kernel/cpu/common.c | 17 +---- arch/i386/kernel/entry.S | 5 +- arch/i386/kernel/head.S | 31 ++------ arch/i386/kernel/i386_ksyms.c | 2 - arch/i386/kernel/irq.c | 3 + arch/i386/kernel/process.c | 12 ++- arch/i386/kernel/smpboot.c | 30 ++++---- arch/i386/kernel/vmi.c | 6 +- arch/i386/kernel/vmlinux.lds.S | 1 - include/asm-i386/current.h | 5 +- include/asm-i386/irq_regs.h | 12 +-- include/asm-i386/pda.h | 99 ------------------------- include/asm-i386/percpu.h | 132 ++++++++++++++++++++++++++++++--- include/asm-i386/processor.h | 2 +- include/asm-i386/segment.h | 6 +- include/asm-i386/smp.h | 4 +- 17 files changed, 177 insertions(+), 195 deletions(-) delete mode 100644 include/asm-i386/pda.h diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index d558adfc293c..b05e85fd1c1e 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -15,7 +15,6 @@ #include #include #include -#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -101,10 +100,6 @@ void foo(void) OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - BLANK(); - OFFSET(PDA_cpu, i386_pda, cpu_number); - OFFSET(PDA_pcurrent, i386_pda, pcurrent); - #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 7a4c036d93c8..27e00565f5e4 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -18,7 +18,6 @@ #include #include #endif -#include #include "cpu.h" @@ -47,13 +46,10 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, - [GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */ + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -DEFINE_PER_CPU(struct i386_pda, _cpu_pda); -EXPORT_PER_CPU_SYMBOL(_cpu_pda); - static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -634,21 +630,14 @@ void __init early_cpu_init(void) #endif } -/* Make sure %gs is initialized properly in idle threads */ +/* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PDA; + regs->xfs = __KERNEL_PERCPU; return regs; } -/* Initial PDA used by boot CPU */ -struct i386_pda boot_pda = { - ._pda = &boot_pda, - .cpu_number = 0, - .pcurrent = &init_task, -}; - /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3e4aa1fd33e2..7f92ceb428ad 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -132,7 +132,7 @@ VM_MASK = 0x00020000 movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ - movl $(__KERNEL_PDA), %edx; \ + movl $(__KERNEL_PERCPU), %edx; \ movl %edx, %fs #define RESTORE_INT_REGS \ @@ -556,7 +556,6 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %fs:PDA_cpu, %ebx; \ PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ @@ -681,7 +680,7 @@ error_code: pushl %fs CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PDA), %ecx + movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index bb36c24311b4..12277d8938df 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -317,12 +317,12 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 - call setup_pda lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. + movl %eax,%fs # gets reset once there's real percpu movl $(__USER_DS),%eax # DS/ES contains default USER segment movl %eax,%ds @@ -332,16 +332,17 @@ is386: movl $2,%ecx # set MP movl %eax,%gs lldt %ax - movl $(__KERNEL_PDA),%eax - mov %eax,%fs - cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP movb ready, %cl movb $1, ready cmpb $0,%cl # the first CPU calls start_kernel - jne initialize_secondary # all other CPUs call initialize_secondary + je 1f + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + jmp initialize_secondary # all other CPUs call initialize_secondary +1: #endif /* CONFIG_SMP */ jmp start_kernel @@ -364,23 +365,6 @@ check_x87: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret -/* - * Point the GDT at this CPU's PDA. On boot this will be - * cpu_gdt_table and boot_pda; for secondary CPUs, these will be - * that CPU's GDT and PDA. - */ -ENTRY(setup_pda) - /* get the PDA pointer */ - movl start_pda, %eax - - /* slot the PDA address into the GDT */ - mov early_gdt_descr+2, %ecx - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ - shr $16, %eax - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ - ret - /* * setup_idt * @@ -553,9 +537,6 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data -ENTRY(start_pda) - .long boot_pda - ENTRY(stack_start) .long init_thread_union+THREAD_SIZE .long __BOOT_DS diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 4afe26e86260..e3d4b73bfdb0 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed); #endif EXPORT_SYMBOL(csum_partial); - -EXPORT_SYMBOL(_proxy_pda); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 8db8d514c9c0..d2daf672f4a2 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -24,6 +24,9 @@ DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 5fb9524c6f4b..61999479b7a4 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -57,7 +58,6 @@ #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -66,6 +66,12 @@ static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * Return saved PC of a blocked thread. */ @@ -342,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xfs = __KERNEL_PDA; + regs.xfs = __KERNEL_PERCPU; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -711,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (prev->gs | next->gs) loadsegment(gs, next->gs); - write_pda(pcurrent, next_p); + x86_write_percpu(current_task, next_p); return prev_p; } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 61e2842add36..f79b6233db78 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -53,7 +53,6 @@ #include #include #include -#include #include #include @@ -99,6 +98,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; +DEFINE_PER_CPU(unsigned long, this_cpu_off); +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + /* * Trampoline 80x86 program as an array. */ @@ -456,7 +458,6 @@ extern struct { void * esp; unsigned short ss; } stack_start; -extern struct i386_pda *start_pda; #ifdef CONFIG_NUMA @@ -784,20 +785,17 @@ static inline struct task_struct * alloc_idle_task(int cpu) /* Initialize the CPU's GDT. This is either the boot CPU doing itself (still using the master per-cpu area), or a CPU doing it for a secondary which will soon come up. */ -static __cpuinit void init_gdt(int cpu, struct task_struct *idle) +static __cpuinit void init_gdt(int cpu) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - struct i386_pda *pda = &per_cpu(_cpu_pda, cpu); - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, + (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + __per_cpu_offset[cpu], 0xFFFFF, + 0x80 | DESCTYPE_S | 0x2, 0x8); - memset(pda, 0, sizeof(*pda)); - pda->_pda = pda; - pda->cpu_number = cpu; - pda->pcurrent = idle; + per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; + per_cpu(cpu_number, cpu) = cpu; } /* Defined in head.S */ @@ -824,9 +822,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - init_gdt(cpu, idle); + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); - start_pda = cpu_pda(cpu); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ @@ -1188,14 +1186,14 @@ static inline void switch_to_new_gdt(void) gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); } void __init native_smp_prepare_boot_cpu(void) { unsigned int cpu = smp_processor_id(); - init_gdt(cpu, current); + init_gdt(cpu); switch_to_new_gdt(); cpu_set(cpu, cpu_online_map); diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index ccad7ee960aa..12312988c626 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -504,8 +504,6 @@ static void vmi_pmd_clear(pmd_t *pmd) #endif #ifdef CONFIG_SMP -extern void setup_pda(void); - static void __devinit vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) @@ -530,13 +528,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; - ap.fs = __KERNEL_PDA; + ap.fs = __KERNEL_PERCPU; ap.gs = 0; ap.eflags = 0; - setup_pda(); - #ifdef CONFIG_X86_PAE /* efer should match BSP efer. */ if (cpu_has_nx) { diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 2ce4aa185fc8..d125784ddf5e 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -26,7 +26,6 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; -_proxy_pda = 1; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ diff --git a/include/asm-i386/current.h b/include/asm-i386/current.h index 5252ee0f6d7a..d35248539912 100644 --- a/include/asm-i386/current.h +++ b/include/asm-i386/current.h @@ -1,14 +1,15 @@ #ifndef _I386_CURRENT_H #define _I386_CURRENT_H -#include #include +#include struct task_struct; +DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return read_pda(pcurrent); + return x86_read_percpu(current_task); } #define current get_current() diff --git a/include/asm-i386/irq_regs.h b/include/asm-i386/irq_regs.h index a1b3f7f594a2..3368b20c0b48 100644 --- a/include/asm-i386/irq_regs.h +++ b/include/asm-i386/irq_regs.h @@ -1,25 +1,27 @@ /* * Per-cpu current frame pointer - the location of the last exception frame on - * the stack, stored in the PDA. + * the stack, stored in the per-cpu area. * * Jeremy Fitzhardinge */ #ifndef _ASM_I386_IRQ_REGS_H #define _ASM_I386_IRQ_REGS_H -#include +#include + +DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return read_pda(irq_regs); + return x86_read_percpu(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) { struct pt_regs *old_regs; - old_regs = read_pda(irq_regs); - write_pda(irq_regs, new_regs); + old_regs = get_irq_regs(); + x86_write_percpu(irq_regs, new_regs); return old_regs; } diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h deleted file mode 100644 index aef7f732f77e..000000000000 --- a/include/asm-i386/pda.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - Per-processor Data Areas - Jeremy Fitzhardinge 2006 - Based on asm-x86_64/pda.h by Andi Kleen. - */ -#ifndef _I386_PDA_H -#define _I386_PDA_H - -#include -#include -#include - -struct i386_pda -{ - struct i386_pda *_pda; /* pointer to self */ - - int cpu_number; - struct task_struct *pcurrent; /* current process */ - struct pt_regs *irq_regs; -}; - -DECLARE_PER_CPU(struct i386_pda, _cpu_pda); -#define cpu_pda(i) (&per_cpu(_cpu_pda, (i))) -#define pda_offset(field) offsetof(struct i386_pda, field) - -extern void __bad_pda_field(void); - -/* This variable is never instantiated. It is only used as a stand-in - for the real per-cpu PDA memory, so that gcc can understand what - memory operations the inline asms() below are performing. This - eliminates the need to make the asms volatile or have memory - clobbers, so gcc can readily analyse them. */ -extern struct i386_pda _proxy_pda; - -#define pda_to_op(op,field,val) \ - do { \ - typedef typeof(_proxy_pda.field) T__; \ - if (0) { T__ tmp__; tmp__ = (val); } \ - switch (sizeof(_proxy_pda.field)) { \ - case 1: \ - asm(op "b %1,%%fs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - case 2: \ - asm(op "w %1,%%fs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - case 4: \ - asm(op "l %1,%%fs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - default: __bad_pda_field(); \ - } \ - } while (0) - -#define pda_from_op(op,field) \ - ({ \ - typeof(_proxy_pda.field) ret__; \ - switch (sizeof(_proxy_pda.field)) { \ - case 1: \ - asm(op "b %%fs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - case 2: \ - asm(op "w %%fs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - case 4: \ - asm(op "l %%fs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - default: __bad_pda_field(); \ - } \ - ret__; }) - -/* Return a pointer to a pda field */ -#define pda_addr(field) \ - ((typeof(_proxy_pda.field) *)((unsigned char *)read_pda(_pda) + \ - pda_offset(field))) - -#define read_pda(field) pda_from_op("mov",field) -#define write_pda(field,val) pda_to_op("mov",field,val) -#define add_pda(field,val) pda_to_op("add",field,val) -#define sub_pda(field,val) pda_to_op("sub",field,val) -#define or_pda(field,val) pda_to_op("or",field,val) - -#endif /* _I386_PDA_H */ diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h index a10e7c68ae9d..c5f12f0d9c23 100644 --- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -1,9 +1,30 @@ #ifndef __ARCH_I386_PERCPU__ #define __ARCH_I386_PERCPU__ -#ifndef __ASSEMBLY__ -#include -#else +#ifdef __ASSEMBLY__ + +/* + * PER_CPU finds an address of a per-cpu variable. + * + * Args: + * var - variable name + * reg - 32bit register + * + * The resulting address is stored in the "reg" argument. + * + * Example: + * PER_CPU(cpu_gdt_descr, %ebx) + */ +#ifdef CONFIG_SMP +#define PER_CPU(var, reg) \ + movl %fs:per_cpu__this_cpu_off, reg; \ + addl $per_cpu__##var, reg +#else /* ! SMP */ +#define PER_CPU(var, reg) \ + movl $per_cpu__##var, reg; +#endif /* SMP */ + +#else /* ...!ASSEMBLY */ /* * PER_CPU finds an address of a per-cpu variable. @@ -18,14 +39,107 @@ * PER_CPU(cpu_gdt_descr, %ebx) */ #ifdef CONFIG_SMP -#define PER_CPU(var, cpu) \ - movl __per_cpu_offset(,cpu,4), cpu; \ - addl $per_cpu__##var, cpu; -#else /* ! SMP */ -#define PER_CPU(var, cpu) \ - movl $per_cpu__##var, cpu; +/* Same as generic implementation except for optimized local access. */ +#define __GENERIC_PER_CPU + +/* This is used for other cpus to find our section. */ +extern unsigned long __per_cpu_offset[]; + +/* Separate out the type, so (int[3], foo) works. */ +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU(type, name) \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name + +/* We can use this directly for local CPU (faster). */ +DECLARE_PER_CPU(unsigned long, this_cpu_off); + +/* var is in discarded region: offset to particular copy we want */ +#define per_cpu(var, cpu) (*({ \ + extern int simple_indentifier_##var(void); \ + RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); })) + +#define __raw_get_cpu_var(var) (*({ \ + extern int simple_indentifier_##var(void); \ + RELOC_HIDE(&per_cpu__##var, x86_read_percpu(this_cpu_off)); \ +})) + +#define __get_cpu_var(var) __raw_get_cpu_var(var) + +/* A macro to avoid #include hell... */ +#define percpu_modcopy(pcpudst, src, size) \ +do { \ + unsigned int __i; \ + for_each_possible_cpu(__i) \ + memcpy((pcpudst)+__per_cpu_offset[__i], \ + (src), (size)); \ +} while (0) + +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) + +/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ +#define __percpu_seg "%%fs:" +#else /* !SMP */ +#include +#define __percpu_seg "" #endif /* SMP */ +/* For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though). */ +extern void __bad_percpu_size(void); + +#define percpu_to_op(op,var,val) \ + do { \ + typedef typeof(var) T__; \ + if (0) { T__ tmp__; tmp__ = (val); } \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + case 2: \ + asm(op "w %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + case 4: \ + asm(op "l %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + } while (0) + +#define percpu_from_op(op,var) \ + ({ \ + typeof(var) ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + ret__; }) + +#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) +#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val) +#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val) +#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val) +#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val) #endif /* !__ASSEMBLY__ */ #endif /* __ARCH_I386_PERCPU__ */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 922260474646..ced2da8a0d65 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -377,7 +377,7 @@ struct thread_struct { .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PDA, \ + .fs = __KERNEL_PERCPU, \ } /* diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index 065f10bfa487..07e70624d87c 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -39,7 +39,7 @@ * 25 - APM BIOS support * * 26 - ESPFIX small SS - * 27 - PDA [ per-cpu private data area ] + * 27 - per-cpu [ offset to per-cpu data area ] * 28 - unused * 29 - unused * 30 - unused @@ -74,8 +74,8 @@ #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15) -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8) +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) #define GDT_ENTRY_DOUBLEFAULT_TSS 31 diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 2d083cb4ca93..090abc1da32a 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -8,7 +8,6 @@ #include #include #include -#include #endif #if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) @@ -112,7 +111,8 @@ do { } while (0) * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (read_pda(cpu_number)) +DECLARE_PER_CPU(int, cpu_number); +#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; From 978c038ec944e4f2c940b0975c6acb433203a9be Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 154/231] [PATCH] i386: cleanups to help using per-cpu variables from asm This patch does a few small cleanups: - use PER_CPU_NAME to generate the names of per-cpu variables - use lea to add the per_cpu offset in PER_CPU(), because it doesn't affect condition flags - add PER_CPU_VAR which allows direct access to pre-cpu variables with the %fs: prefix on SMP. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Andi Kleen --- include/asm-i386/percpu.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h index c5f12f0d9c23..cdcb63db30a9 100644 --- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -16,12 +16,14 @@ * PER_CPU(cpu_gdt_descr, %ebx) */ #ifdef CONFIG_SMP -#define PER_CPU(var, reg) \ - movl %fs:per_cpu__this_cpu_off, reg; \ - addl $per_cpu__##var, reg +#define PER_CPU(var, reg) \ + movl %fs:per_cpu__##this_cpu_off, reg; \ + lea per_cpu__##var(reg), reg +#define PER_CPU_VAR(var) %fs:per_cpu__##var #else /* ! SMP */ -#define PER_CPU(var, reg) \ - movl $per_cpu__##var, reg; +#define PER_CPU(var, reg) \ + movl $per_cpu__##var, reg +#define PER_CPU_VAR(var) per_cpu__##var #endif /* SMP */ #else /* ...!ASSEMBLY */ From 1956c73bb5bf81ee577ed7d3c64e3cad876ad2a5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 155/231] [PATCH] i386: Define per_cpu_offset Define per_cpu_offset in asm-i386/percpu.h when SMP defined, like asm-generic/percpu.h does for UP. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Andi Kleen --- include/asm-i386/percpu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h index cdcb63db30a9..f54830b5d5ac 100644 --- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -47,6 +47,8 @@ /* This is used for other cpus to find our section. */ extern unsigned long __per_cpu_offset[]; +#define per_cpu_offset(x) (__per_cpu_offset[x]) + /* Separate out the type, so (int[3], foo) works. */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name #define DEFINE_PER_CPU(type, name) \ From c5413fbe894924ddb8aa474a4d4da52e7a6c7e0b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 156/231] [PATCH] i386: Fix UP gdt bugs Fixes two problems with the GDT when compiling for uniprocessor: - There's no percpu segment, so trying to load its selector into %fs fails. Use a null selector instead. - The real gdt needs to be loaded at some point. Do it in cpu_init(). Signed-off-by: Chris Wright Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell --- arch/i386/kernel/cpu/common.c | 13 +++++++++++++ arch/i386/kernel/smpboot.c | 12 ------------ include/asm-i386/processor.h | 1 + include/asm-i386/segment.h | 4 ++++ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 27e00565f5e4..794d593c47eb 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -638,6 +638,18 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) return regs; } +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct Xgt_desc_struct gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -668,6 +680,7 @@ void __cpuinit cpu_init(void) } load_idt(&idt_descr); + switch_to_new_gdt(); /* * Set up and load the per-CPU TSS and LDT diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index f79b6233db78..7c1dbef399cd 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1177,18 +1177,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_boot_cpus(max_cpus); } -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -static inline void switch_to_new_gdt(void) -{ - struct Xgt_desc_struct gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -} - void __init native_smp_prepare_boot_cpu(void) { unsigned int cpu = smp_processor_id(); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index ced2da8a0d65..70f3515c3db0 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -750,6 +750,7 @@ extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(void); extern void cpu_init(void); extern int force_mwait; diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index 07e70624d87c..597a47c2515f 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -75,7 +75,11 @@ #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) #define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#ifdef CONFIG_SMP #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +#else +#define __KERNEL_PERCPU 0 +#endif #define GDT_ENTRY_DOUBLEFAULT_TSS 31 From 9ce8c2ed12550f90fd6e902990652b13df647793 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 157/231] [PATCH] i386: map enough initial memory to create lowmem mappings head.S creates the very initial pagetable for the kernel. This just maps enough space for the kernel itself, and an allocation bitmap. The amount of mapped memory is rounded up to 4Mbytes, and so this typically ends up mapping 8Mbytes of memory. When booting, pagetable_init() needs to create mappings for all lowmem, and the pagetables for these mappings are allocated from the free pages around the kernel in low memory. If the number of pagetable pages + kernel size exceeds head.S's initial mapping, it will end up faulting on an unmapped page. This will only happen with specific combinations of kernel size and memory size. This patch makes sure that head.S also maps enough space to fit the kernel pagetables as well as the kernel itself. It ends up using an additional two pages of unreclaimable memory. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: "H. Peter Anvin" Cc: Andi Kleen Cc: Zachary Amsden Cc: Chris Wright Cc: "Eric W. Biederman" Cc: Linus Torvalds , --- arch/i386/kernel/asm-offsets.c | 6 ++++++ arch/i386/kernel/head.S | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index b05e85fd1c1e..27a776c9044d 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -11,6 +11,7 @@ #include #include #include "sigframe.h" +#include #include #include #include @@ -96,6 +97,11 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); + DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); + DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); + DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); + DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 12277d8938df..9b10af65faaa 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -34,17 +34,32 @@ /* * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. We need one bit for - * each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) + * and including _end* we need mapped initially. + * We need: + * - one bit for each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * - enough space to map all low memory, which means + * (2^32/4096) / 1024 pages (worst case, non PAE) + * (2^32/4096) / 512 + 4 pages (worst case for PAE) + * - a few pages for allocator use before the kernel pagetable has + * been set up * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. */ -#define INIT_MAP_BEYOND_END (128*1024) +LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +#if PTRS_PER_PMD > 1 +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#else +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#endif +BOOTBITMAP_SIZE = LOW_PAGES / 8 +ALLOCATOR_SLOP = 4 + +INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, From 57decbda6a2a7c400b2a3b3b12e52ccbdc977118 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 158/231] [PATCH] x86: update for i386 and x86-64 check_bugs Remove spurious comments, headers and keywords from x86-64 bugs.[ch]. Use identify_boot_cpu() AK: merged with other patch Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/x86_64/kernel/bugs.c | 9 +-------- include/asm-i386/bugs.h | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c index 131e541e3f7b..12b585b5345d 100644 --- a/arch/x86_64/kernel/bugs.c +++ b/arch/x86_64/kernel/bugs.c @@ -3,19 +3,12 @@ * * Copyright (C) 1994 Linus Torvalds * Copyright (C) 2000 SuSE - * - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); */ #include +#include #include #include -#include -#include -#include void __init check_bugs(void) { diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index df539b390448..d28979ff73be 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -7,6 +7,6 @@ #ifndef _ASM_I386_BUG_H #define _ASM_I386_BUG_H -extern void __init check_bugs(void); +void check_bugs(void); #endif /* _ASM_I386_BUG_H */ From 752783c050f1729452a89b2baea45b0124ac91c7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 159/231] [PATCH] i386: In compat mode, the return value here was uninitialized. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 94defac6fc3d..ff4ee6f3326b 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -268,7 +268,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { struct mm_struct *mm = current->mm; unsigned long addr; - int ret; + int ret = 0; bool compat; down_write(&mm->mmap_sem); From 959b4fdfe7e27bcf101e2381e500e4076f2bb9ce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 160/231] [PATCH] i386: PARAVIRT: Allow boot-time disable of paravirt_ops patching Add "noreplace-paravirt" to disable paravirt_ops patching. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/kernel-parameters.txt | 3 +++ arch/i386/kernel/alternative.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 242b3a09b6ce..38d7db3262c7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -64,6 +64,7 @@ parameter is applicable: GENERIC_TIME The generic timeofday code is enabled. NFS Appropriate NFS support is enabled. OSS OSS sound support is enabled. + PV_OPS A paravirtualized kernel PARIDE The ParIDE subsystem is enabled. PARISC The PA-RISC architecture is enabled. PCI PCI bus support is enabled. @@ -1164,6 +1165,8 @@ and is between 256 and 4096 characters. It is defined in the file nomce [IA-32] Machine Check Exception + noreplace-paravirt [IA-32,PV_OPS] Don't patch paravirt_ops + noreplace-smp [IA-32,SMP] Don't replace SMP instructions with UP alternatives diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index c5d037c60950..080a59d56ea6 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -30,6 +30,16 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); +#ifdef CONFIG_PARAVIRT +static int noreplace_paravirt = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); +#endif #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) @@ -330,6 +340,9 @@ void apply_paravirt(struct paravirt_patch_site *start, { struct paravirt_patch_site *p; + if (noreplace_paravirt) + return; + for (p = start; p < end; p++) { unsigned int used; From 18420001d6ceafbe094a6f911126c6eee34d25c4 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 161/231] [PATCH] i386: Clean up arch/i386/kernel/cpu/mcheck/p4.c No, just no. You do not use goto to skip a code block. You do not return an obvious variable from a singly-inlined function and give the function a return value. You don't put unexplained comments about kmalloc in code which doesn't do dynamic allocation. And you don't leave stray warnings around for no good reason. Also, when possible, it is better to use block scoped variables because gcc can sometime generate better code. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/p4.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 504434a46011..1509edfb2313 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; - if (mce_num_extended_msrs == 0) - goto done; - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); @@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) rdmsr (MSR_IA32_MCG_ESP, r->esp, h); rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); rdmsr (MSR_IA32_MCG_EIP, r->eip, h); - - /* can we rely on kmalloc to do a dynamic - * allocation for the reserved registers? - */ -done: - return mce_num_extended_msrs; } static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) @@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - struct intel_mce_extended_msrs dbg; rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ @@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - if (intel_get_extended_msrs(&dbg)) { + if (mce_num_extended_msrs > 0) { + struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", smp_processor_id(), dbg.eip, dbg.eflags); printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", From 9f53a729dbf0ba8abdc464f6eb828f485d3417f7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 162/231] [PATCH] i386: Now that the VDSO can be relocated, we can support it in VMI configurations. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 6964e247f473..1a94a73fe801 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -220,7 +220,7 @@ config PARAVIRT config VMI bool "VMI Paravirt-ops support" - depends on PARAVIRT && !COMPAT_VDSO + depends on PARAVIRT help VMI provides a paravirtualized interface to the VMware ESX server (it could be used by other hypervisors in theory too, but is not From eeef9c68aae2f4f21ab810d0339e0f22d30b0cd8 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 163/231] [PATCH] i386: Implement vmi_kmap_atomic_pte Implement vmi_kmap_atomic_pte in terms of the backend set_linear_mapping operation. The conversion is rather straighforward; call kmap_atomic and then inform the hypervisor of the page mapping. The _flush_tlb damage is due to macros being pulled in from highmem.h. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/vmi.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 12312988c626..0df0b2cd3617 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -65,8 +66,8 @@ static struct { void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); void (*update_pte)(pte_t *, unsigned); - void (*set_linear_mapping)(int, u32, u32, u32); - void (*flush_tlb)(int); + void (*set_linear_mapping)(int, void *, u32, u32); + void (*_flush_tlb)(int); void (*set_initial_ap_state)(int, int); void (*halt)(void); void (*set_lazy_mode)(int mode); @@ -221,12 +222,12 @@ static void vmi_load_esp0(struct tss_struct *tss, static void vmi_flush_tlb_user(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB); + vmi_ops._flush_tlb(VMI_FLUSH_TLB); } static void vmi_flush_tlb_kernel(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); + vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); } /* Stub to do nothing at all; used for delays and unimplemented calls */ @@ -349,8 +350,11 @@ static void vmi_check_page_type(u32 pfn, int type) #define vmi_check_page_type(p,t) do { } while (0) #endif -static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) +#ifdef CONFIG_HIGHPTE +static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { + void *va = kmap_atomic(page, type); + /* * Internally, the VMI ROM must map virtual addresses to physical * addresses for processing MMU updates. By the time MMU updates @@ -364,8 +368,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) * args: SLOT VA COUNT PFN */ BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn); + vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + return va; } +#endif static void vmi_allocate_pt(u32 pfn) { @@ -660,7 +667,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); } /* @@ -800,8 +807,8 @@ static inline int __init activate_vmi(void) para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB); + para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); para_fill(flush_tlb_single, InvalPage); /* @@ -847,9 +854,12 @@ static inline int __init activate_vmi(void) paravirt_ops.release_pt = vmi_release_pt; paravirt_ops.release_pd = vmi_release_pd; } -#if 0 - para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, - SetLinearMapping); + + /* Set linear is needed in all cases */ + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); +#ifdef CONFIG_HIGHPTE + if (vmi_ops.set_linear_mapping) + paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; #endif /* From e0bb8643974397a8d36670e06e6a54bb84f3289f Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 164/231] [PATCH] i386: Convert VMI timer to use clock events Convert VMI timer to use clock events, making it properly able to use the NO_HZ infrastructure. On UP systems, with no local APIC, we just continue to route these events through the PIT. On systems with a local APIC, or SMP, we provide a single source interrupt chip which creates the local timer IRQ. It actually gets delivered by the APIC hardware, but we don't want to use the same local APIC clocksource processing, so we create our own handler here. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen CC: Dan Hecht CC: Ingo Molnar CC: Thomas Gleixner --- arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/entry.S | 5 - arch/i386/kernel/vmi.c | 26 +- arch/i386/kernel/vmiclock.c | 318 ++++++++++++++++++++++++ arch/i386/kernel/vmitime.c | 482 ------------------------------------ include/asm-i386/vmi_time.h | 18 +- 6 files changed, 327 insertions(+), 524 deletions(-) create mode 100644 arch/i386/kernel/vmiclock.c delete mode 100644 arch/i386/kernel/vmitime.c diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index bd7753cb9e69..4f98516b9f94 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,7 +39,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_VMI) += vmi.o vmitime.o +obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-y += pcspeaker.o diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 7f92ceb428ad..90ffcdb69838 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -637,11 +637,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -/* This alternate entry is needed because we hijack the apic LVTT */ -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) -#endif - KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 0df0b2cd3617..0fae15dee765 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -77,6 +77,9 @@ static struct { extern struct paravirt_patch __start_parainstructions[], __stop_parainstructions[]; +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; + /* * VMI patching routines. */ @@ -235,18 +238,6 @@ static void vmi_nop(void) { } -/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ -static fastcall void vmi_safe_halt(void) -{ - int idle = vmi_stop_hz_timer(); - vmi_ops.halt(); - if (idle) { - local_irq_disable(); - vmi_account_time_restart_hz_timer(); - local_irq_enable(); - } -} - #ifdef CONFIG_DEBUG_PAGE_TYPE #ifdef CONFIG_X86_PAE @@ -722,7 +713,6 @@ do { \ } \ } while (0) - /* * Activate the VMI interface and switch into paravirtualized mode */ @@ -901,8 +891,8 @@ static inline int __init activate_vmi(void) paravirt_ops.get_wallclock = vmi_get_wallclock; paravirt_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; - paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; + paravirt_ops.setup_boot_clock = vmi_time_bsp_init; + paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; paravirt_ops.get_cpu_khz = vmi_cpu_khz; @@ -914,11 +904,7 @@ static inline int __init activate_vmi(void) disable_vmi_timer = 1; } - /* No idle HZ mode only works if VMI timer and no idle is enabled */ - if (disable_noidle || disable_vmi_timer) - para_fill(safe_halt, Halt); - else - para_wrap(safe_halt, vmi_safe_halt, halt, Halt); + para_fill(safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c new file mode 100644 index 000000000000..26a37f8a8762 --- /dev/null +++ b/arch/i386/kernel/vmiclock.c @@ -0,0 +1,318 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2007, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "io_ports.h" + +#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) +#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) + +static DEFINE_PER_CPU(struct clock_event_device, local_events); + +static inline u32 vmi_counter(u32 flags) +{ + /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding + * cycle counter. */ + return flags & VMI_ALARM_COUNTER_MASK; +} + +/* paravirt_ops.get_wallclock = vmi_get_wallclock */ +unsigned long vmi_get_wallclock(void) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec + (void)do_div(wallclock, 1000000000); // sec + + return wallclock; +} + +/* paravirt_ops.set_wallclock = vmi_set_wallclock */ +int vmi_set_wallclock(unsigned long now) +{ + return 0; +} + +/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ +unsigned long long vmi_get_sched_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ +unsigned long vmi_cpu_khz(void) +{ + unsigned long long khz; + khz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(khz, 1000); + return khz; +} + +static inline unsigned int vmi_get_timer_vector(void) +{ +#ifdef CONFIG_X86_IO_APIC + return FIRST_DEVICE_VECTOR; +#else + return FIRST_EXTERNAL_VECTOR; +#endif +} + +/** vmi clockchip */ +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int startup_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, vmi_get_timer_vector()); + + return (val & APIC_SEND_PENDING); +} + +static void mask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val | APIC_LVT_MASKED); +} + +static void unmask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); +} + +static void ack_timer_irq(unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip vmi_chip __read_mostly = { + .name = "VMI-LOCAL", + .startup = startup_timer_irq, + .mask = mask_timer_irq, + .unmask = unmask_timer_irq, + .ack = ack_timer_irq +}; +#endif + +/** vmi clockevent */ +#define VMI_ALARM_WIRED_IRQ0 0x00000000 +#define VMI_ALARM_WIRED_LVTT 0x00010000 +static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; + +static inline int vmi_get_alarm_wiring(void) +{ + return vmi_wiring; +} + +static void vmi_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + cycle_t now, cycles_per_hz; + BUG_ON(!irqs_disabled()); + + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + break; + case CLOCK_EVT_MODE_PERIODIC: + cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_hz, HZ); + now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); + vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + switch (evt->mode) { + case CLOCK_EVT_MODE_ONESHOT: + vmi_timer_ops.cancel_alarm(VMI_ONESHOT); + break; + case CLOCK_EVT_MODE_PERIODIC: + vmi_timer_ops.cancel_alarm(VMI_PERIODIC); + break; + default: + break; + } + break; + default: + break; + } +} + +static int vmi_timer_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + /* Unfortunately, set_next_event interface only passes relative + * expiry, but we want absolute expiry. It'd be better if were + * were passed an aboslute expiry, since a bunch of time may + * have been stolen between the time the delta is computed and + * when we set the alarm below. */ + cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); + + BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); + return 0; +} + +static struct clock_event_device vmi_clockevent = { + .name = "vmi-timer", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .shift = 22, + .set_mode = vmi_timer_set_mode, + .set_next_event = vmi_timer_next_event, + .rating = 1000, + .irq = 0, +}; + +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(local_events); + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static struct irqaction vmi_clock_action = { + .name = "vmi-timer", + .handler = vmi_timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .mask = CPU_MASK_ALL, +}; + +static void __devinit vmi_time_init_clockevent(void) +{ + cycle_t cycles_per_msec; + struct clock_event_device *evt; + + int cpu = smp_processor_id(); + evt = &__get_cpu_var(local_events); + + /* Use cycles_per_msec since div_sc params are 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + memcpy(evt, &vmi_clockevent, sizeof(*evt)); + /* Must pick .shift such that .mult fits in 32-bits. Choosing + * .shift to be 22 allows 2^(32-22) cycles per nano-seconds + * before overflow. */ + evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); + /* Upper bound is clockevent's use of ulong for cycle deltas. */ + evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); + evt->min_delta_ns = clockevent_delta2ns(1, evt); + evt->cpumask = cpumask_of_cpu(cpu); + + printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + evt->name, evt->mult, evt->shift); + clockevents_register_device(evt); +} + +void __init vmi_time_init(void) +{ + /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + vmi_time_init_clockevent(); + setup_irq(0, &vmi_clock_action); +} + +#ifdef CONFIG_X86_LOCAL_APIC +void __devinit vmi_time_bsp_init(void) +{ + /* + * On APIC systems, we want local timers to fire on each cpu. We do + * this by programming LVTT to deliver timer events to the IRQ handler + * for IRQ-0, since we can't re-use the APIC local timer handler + * without interfering with that code. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + local_irq_disable(); +#ifdef CONFIG_X86_SMP + /* + * XXX handle_percpu_irq only defined for SMP; we need to switch over + * to using it, since this is a local interrupt, which each CPU must + * handle individually without locking out or dropping simultaneous + * local timers on other CPUs. We also don't want to trigger the + * quirk workaround code for interrupts which gets invoked from + * handle_percpu_irq via eoi, so we use our own IRQ chip. + */ + set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); +#else + set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); +#endif + vmi_wiring = VMI_ALARM_WIRED_LVTT; + apic_write(APIC_LVTT, vmi_get_timer_vector()); + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); +} + +void __devinit vmi_time_ap_init(void) +{ + vmi_time_init_clockevent(); + apic_write(APIC_LVTT, vmi_get_timer_vector()); +} +#endif + +/** vmi clocksource */ + +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int __init init_vmi_clocksource(void) +{ + cycle_t cycles_per_msec; + + if (!vmi_timer_ops.get_cycle_frequency) + return 0; + /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + /* Note that clocksource.{mult, shift} converts in the opposite direction + * as clockevents. */ + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + + printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); + return clocksource_register(&clocksource_vmi); + +} +module_init(init_vmi_clocksource); diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c deleted file mode 100644 index 9dfb17739b67..000000000000 --- a/arch/i386/kernel/vmitime.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * VMI paravirtual timer support routines. - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to dhecht@vmware.com - * - */ - -/* - * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. - * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#ifdef CONFIG_X86_LOCAL_APIC -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT -#else -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 -#endif - -/* Cached VMI operations */ -struct vmi_timer_ops vmi_timer_ops; - -#ifdef CONFIG_NO_IDLE_HZ - -/* /proc/sys/kernel/hz_timer state. */ -int sysctl_hz_timer; - -/* Some stats */ -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); -static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ -static int alarm_hz = CONFIG_VMI_ALARM_HZ; - -/* Cache of the value get_cycle_frequency / HZ. */ -static signed long long cycles_per_jiffy; - -/* Cache of the value get_cycle_frequency / alarm_hz. */ -static signed long long cycles_per_alarm; - -/* The number of cycles accounted for by the 'jiffies'/'xtime' count. - * Protected by xtime_lock. */ -static unsigned long long real_cycles_accounted_system; - -/* The number of cycles accounted for by update_process_times(), per cpu. */ -static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); - -/* The number of stolen cycles accounted, per cpu. */ -static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); - -/* Clock source. */ -static cycle_t read_real_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); -} - -static cycle_t read_available_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); -} - -#if 0 -static cycle_t read_stolen_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); -} -#endif /* 0 */ - -static struct clocksource clocksource_vmi = { - .name = "vmi-timer", - .rating = 450, - .read = read_real_cycles, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - - -/* Timer interrupt handler. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); - -static struct irqaction vmi_timer_irq = { - .handler = vmi_timer_interrupt, - .flags = IRQF_DISABLED, - .mask = CPU_MASK_NONE, - .name = "VMI-alarm", -}; - -/* Alarm rate */ -static int __init vmi_timer_alarm_rate_setup(char* str) -{ - int alarm_rate; - if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { - alarm_hz = alarm_rate; - printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); - } - return 1; -} -__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); - - -/* Initialization */ -static void vmi_get_wallclock_ts(struct timespec *ts) -{ - unsigned long long wallclock; - wallclock = vmi_timer_ops.get_wallclock(); // nsec units - ts->tv_nsec = do_div(wallclock, 1000000000); - ts->tv_sec = wallclock; -} - -unsigned long vmi_get_wallclock(void) -{ - struct timespec ts; - vmi_get_wallclock_ts(&ts); - return ts.tv_sec; -} - -int vmi_set_wallclock(unsigned long now) -{ - return -1; -} - -unsigned long long vmi_get_sched_cycles(void) -{ - return read_available_cycles(); -} - -unsigned long vmi_cpu_khz(void) -{ - unsigned long long khz; - - khz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(khz, 1000); - return khz; -} - -void __init vmi_time_init(void) -{ - unsigned long long cycles_per_sec, cycles_per_msec; - unsigned long flags; - - local_irq_save(flags); - setup_irq(0, &vmi_timer_irq); -#ifdef CONFIG_X86_LOCAL_APIC - set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); -#endif - - real_cycles_accounted_system = read_real_cycles(); - per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); - - cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); - cycles_per_jiffy = cycles_per_sec; - (void)do_div(cycles_per_jiffy, HZ); - cycles_per_alarm = cycles_per_sec; - (void)do_div(cycles_per_alarm, alarm_hz); - cycles_per_msec = cycles_per_sec; - (void)do_div(cycles_per_msec, 1000); - - printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" - "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, - cycles_per_alarm); - - clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, - clocksource_vmi.shift); - if (clocksource_register(&clocksource_vmi)) - printk(KERN_WARNING "Error registering VMITIME clocksource."); - - /* Disable PIT. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ - - /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu - * reduce the latency calling update_process_times. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - - local_irq_restore(flags); -} - -#ifdef CONFIG_X86_LOCAL_APIC - -void __init vmi_timer_setup_boot_alarm(void) -{ - local_irq_disable(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - local_irq_enable(); -} - -/* Initialize the time accounting variables for an AP on an SMP system. - * Also, set the local alarm for the AP. */ -void __devinit vmi_timer_setup_secondary_alarm(void) -{ - int cpu = smp_processor_id(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); - - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); -} - -#endif - -/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ -static void vmi_account_real_cycles(unsigned long long cur_real_cycles) -{ - long long cycles_not_accounted; - - write_seqlock(&xtime_lock); - - cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; - while (cycles_not_accounted >= cycles_per_jiffy) { - /* systems wide jiffies. */ - do_timer(1); - - cycles_not_accounted -= cycles_per_jiffy; - real_cycles_accounted_system += cycles_per_jiffy; - } - - write_sequnlock(&xtime_lock); -} - -/* Update per-cpu process times. */ -static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - /* Account time to the current process. This includes - * calling into the scheduler to decrement the timeslice - * and possibly reschedule.*/ - update_process_times(user_mode(regs)); - /* XXX handle /proc/profile multiplier. */ - profile_tick(CPU_PROFILING); - - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } -} - -#ifdef CONFIG_NO_IDLE_HZ -/* Update per-cpu idle times. Used when a no-hz halt is ended. */ -static void vmi_account_no_hz_idle_cycles(int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - unsigned long no_idle_hz_jiffies = 0; - - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - no_idle_hz_jiffies++; - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* Account time to the idle process. */ - account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); -} -#endif - -/* Update per-cpu stolen time. */ -static void vmi_account_stolen_cycles(int cpu, - unsigned long long cur_real_cycles, - unsigned long long cur_avail_cycles) -{ - long long stolen_cycles_not_accounted; - unsigned long stolen_jiffies = 0; - - if (cur_real_cycles < cur_avail_cycles) - return; - - stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - - per_cpu(stolen_cycles_accounted_cpu, cpu); - - while (stolen_cycles_not_accounted >= cycles_per_jiffy) { - stolen_jiffies++; - stolen_cycles_not_accounted -= cycles_per_jiffy; - per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* HACK: pass NULL to force time onto cpustat->steal. */ - account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); -} - -/* Body of either IRQ0 interrupt handler (UP no local-APIC) or - * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */ -static void vmi_local_timer_interrupt(int cpu) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu process times. */ - vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); -} - -#ifdef CONFIG_NO_IDLE_HZ - -/* Must be called only from idle loop, with interrupts disabled. */ -int vmi_stop_hz_timer(void) -{ - /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ - - unsigned long seq, next; - unsigned long long real_cycles_expiry; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - if (sysctl_hz_timer != 0) - return 0; - - cpu_set(cpu, nohz_cpu_mask); - smp_mb(); - - if (rcu_needs_cpu(cpu) || local_softirq_pending() || - (next = next_timer_interrupt(), - time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) { - cpu_clear(cpu, nohz_cpu_mask); - return 0; - } - - /* Convert jiffies to the real cycle counter. */ - do { - seq = read_seqbegin(&xtime_lock); - real_cycles_expiry = real_cycles_accounted_system + - (long)(next - jiffies) * cycles_per_jiffy; - } while (read_seqretry(&xtime_lock, seq)); - - /* This cpu is going idle. Disable the periodic alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - per_cpu(idle_start_jiffies, cpu) = jiffies; - /* Set the real time alarm to expire at the next event. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, - real_cycles_expiry, 0); - return 1; -} - -static void vmi_reenable_hz_timer(int cpu) -{ - /* For /proc/vmi/info idle_hz stat. */ - per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); - per_cpu(vmi_idle_no_hz_irqs, cpu)++; - - /* Don't bother explicitly cancelling the one-shot alarm -- at - * worse we will receive a spurious timer interrupt. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); - /* Indicate this cpu is no longer nohz idle. */ - cpu_clear(cpu, nohz_cpu_mask); -} - -/* Called from interrupt handlers when (local) HZ timer is disabled. */ -void vmi_account_time_restart_hz_timer(void) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - /* Account the time during which the HZ timer was disabled. */ - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu idle times. */ - vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); - /* Reenable the hz timer. */ - vmi_reenable_hz_timer(cpu); -} - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* UP (and no local-APIC) VMI-timer alarm interrupt handler. - * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after - * APIC setup and setup_boot_vmi_alarm() is called. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) -{ - vmi_local_timer_interrupt(smp_processor_id()); - return IRQ_HANDLED; -} - -#ifdef CONFIG_X86_LOCAL_APIC - -/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. - * Also used in UP when CONFIG_X86_LOCAL_APIC. - * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ -void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat,cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - vmi_local_timer_interrupt(cpu); - irq_exit(); - set_irq_regs(old_regs); -} - -#endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h index c3a1fcf66c96..213930b995cb 100644 --- a/include/asm-i386/vmi_time.h +++ b/include/asm-i386/vmi_time.h @@ -53,22 +53,8 @@ extern unsigned long long vmi_get_sched_cycles(void); extern unsigned long vmi_cpu_khz(void); #ifdef CONFIG_X86_LOCAL_APIC -extern void __init vmi_timer_setup_boot_alarm(void); -extern void __devinit vmi_timer_setup_secondary_alarm(void); -extern void apic_vmi_timer_interrupt(void); -#endif - -#ifdef CONFIG_NO_IDLE_HZ -extern int vmi_stop_hz_timer(void); -extern void vmi_account_time_restart_hz_timer(void); -#else -static inline int vmi_stop_hz_timer(void) -{ - return 0; -} -static inline void vmi_account_time_restart_hz_timer(void) -{ -} +extern void __devinit vmi_time_bsp_init(void); +extern void __devinit vmi_time_ap_init(void); #endif /* From 441d40dca024deb305a5e3d5003e8cd9d364d10f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH 165/231] [PATCH] x86: PARAVIRT: Jeremy Fitzhardinge The other symbols used to delineate the alt-instructions sections have the form __foo/__foo_end. Rename parainstructions to match. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Rusty Russell Signed-off-by: Andrew Morton --- arch/i386/kernel/alternative.c | 2 +- arch/i386/kernel/vmi.c | 10 +++------- arch/i386/kernel/vmlinux.lds.S | 4 ++-- include/asm-i386/alternative.h | 4 ++-- include/asm-x86_64/alternative.h | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 080a59d56ea6..e5cec6685cc5 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -399,6 +399,6 @@ void __init alternative_instructions(void) alternatives_smp_switch(0); } #endif - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); } diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 0fae15dee765..c8726c424b35 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -73,10 +73,6 @@ static struct { void (*set_lazy_mode)(int mode); } vmi_ops; -/* XXX move this to alternative.h */ -extern struct paravirt_patch __start_parainstructions[], - __stop_parainstructions[]; - /* Cached VMI operations */ struct vmi_timer_ops vmi_timer_ops; @@ -548,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_set_lazy_mode(int mode) +static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) { - static DEFINE_PER_CPU(int, lazy_mode); + static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); if (!vmi_ops.set_lazy_mode) return; @@ -912,7 +908,7 @@ static inline int __init activate_vmi(void) * to do this before IRQs get reenabled. Fortunately, it is * idempotent. */ - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); vmi_bringup(); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index d125784ddf5e..568caca87715 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -166,9 +166,9 @@ SECTIONS } . = ALIGN(4); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __start_parainstructions = .; + __parainstructions = .; *(.parainstructions) - __stop_parainstructions = .; + __parainstructions_end = .; } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index 5b59d07e9d29..277467329583 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -124,8 +124,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) {} -#define __start_parainstructions NULL -#define __stop_parainstructions NULL +#define __parainstructions NULL +#define __parainstructions_end NULL #endif #endif /* _I386_ALTERNATIVE_H */ diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h index 67ebea3cc481..a09fe85c268e 100644 --- a/include/asm-x86_64/alternative.h +++ b/include/asm-x86_64/alternative.h @@ -142,8 +142,8 @@ void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); static inline void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) {} -#define __start_parainstructions NULL -#define __stop_parainstructions NULL +#define __parainstructions NULL +#define __parainstructions_end NULL #endif #endif /* _X86_64_ALTERNATIVE_H */ From 21564fd2a3deb48200b595332f9ed4c9f311f2a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 166/231] [PATCH] i386: PARAVIRT: Export paravirt_ops for non GPL modules too Otherwise non GPL modules cannot even do basic operations like disabling interrupts anymore, which would be excessive. Longer term should split the single structure up into internal and external symbols and not export the internal ones at all. Signed-off-by: Andi Kleen --- arch/i386/kernel/paravirt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index c4850ddd6a9d..f99d3d78c53f 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -346,10 +346,4 @@ struct paravirt_ops paravirt_ops = { .startup_ipi_hook = paravirt_nop, }; -/* - * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops - * semantics are subject to change. Hence we only do this - * internal-only export of this, until it gets sorted out and - * all lowlevel CPU ops used by modules are separately exported. - */ -EXPORT_SYMBOL_GPL(paravirt_ops); +EXPORT_SYMBOL(paravirt_ops); From 03df4f6ee997589a84d5f9492c6419183724c710 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 167/231] [PATCH] i386: Clean up ELF note generation Three cleanups: 1: ELF notes are never mapped, so there's no need to have any access flags in their phdr. 2: When generating them from asm, tell the assembler to use a SHT_NOTE section type. There doesn't seem to be a way to do this from C. 3: Use ANSI rather than traditional cpp behaviour to stringify the macro argument. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Eric W. Biederman --- arch/i386/kernel/vmlinux.lds.S | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/elfnote.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 568caca87715..23e8614edeee 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -30,7 +30,7 @@ jiffies = jiffies_64; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 9fcc8d9fbb14..f3806a74c478 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -208,7 +208,7 @@ } #define NOTES \ - .notes : { *(.note.*) } :note + .notes : { *(.note.*) } :note #define INITCALLS \ *(.initcall0.init) \ diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 67396db141e8..9a1e0674e56c 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -39,12 +39,12 @@ * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) */ #define ELFNOTE(name, type, desctype, descdata) \ -.pushsection .note.name ; \ +.pushsection .note.name, "",@note ; \ .align 4 ; \ .long 2f - 1f /* namesz */ ; \ .long 4f - 3f /* descsz */ ; \ .long type ; \ -1:.asciz "name" ; \ +1:.asciz #name ; \ 2:.align 4 ; \ 3:desctype descdata ; \ 4:.align 4 ; \ From 856f44ff4af6e57fdc39a8b2bec498c88438bd27 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 168/231] [PATCH] x86-64: Move mtrr prototypes from proto.h to mtrr.h Signed-off-by: Andi Kleen --- include/asm-x86_64/mtrr.h | 8 ++++++++ include/asm-x86_64/proto.h | 7 ------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h index d6135b2549bf..66809eca98cf 100644 --- a/include/asm-x86_64/mtrr.h +++ b/include/asm-x86_64/mtrr.h @@ -135,6 +135,14 @@ struct mtrr_gentry32 #endif /* CONFIG_COMPAT */ +#ifdef CONFIG_MTRR +extern void mtrr_ap_init(void); +extern void mtrr_bp_init(void); +#else +#define mtrr_ap_init() do {} while (0) +#define mtrr_bp_init() do {} while (0) +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MTRR_H */ diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 98063bcb3b33..85255db1e82d 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -14,13 +14,6 @@ extern void pda_init(int); extern void early_idt_handler(void); extern void mcheck_init(struct cpuinfo_x86 *c); -#ifdef CONFIG_MTRR -extern void mtrr_ap_init(void); -extern void mtrr_bp_init(void); -#else -#define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) -#endif extern void init_memory_mapping(unsigned long start, unsigned long end); extern void system_call(void); From 2b3b4835c94226681c496de9446d456dcf42ed08 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 169/231] [PATCH] x86: Adds mtrr_save_fixed_ranges() for use in two later patches. In this current implementation which is used in other patches, mtrr_save_fixed_ranges() accepts a dummy void pointer because in the current implementation of one of these patches, this function may be called from smp_call_function_single() which requires that this function takes a void pointer argument. This function calls get_fixed_ranges(), passing mtrr_state.fixed_ranges which is the element of the static struct which stores our current backup of the fixed-range MTRR values which all CPUs shall be using. Because mtrr_save_fixed_ranges calls get_fixed_ranges after kernel initialisation time, __init needs to be removed from the declaration of get_fixed_ranges(). If CONFIG_MTRR is not set, we define mtrr_save_fixed_ranges as an empty statement because there is nothing to do. AK: Moved prototypes for x86-64 around to fix warnings Signed-off-by: Bernhard Kaindl Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andi Kleen Cc: Dave Jones --- arch/i386/kernel/cpu/mtrr/generic.c | 7 ++++++- include/asm-i386/mtrr.h | 2 ++ include/asm-x86_64/mtrr.h | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index 68b383788882..150cf5055a3c 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -37,7 +37,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -static void __init +static void get_fixed_ranges(mtrr_type * frs) { unsigned int *p = (unsigned int *) frs; @@ -51,6 +51,11 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } +void mtrr_save_fixed_ranges(void *info) +{ + get_fixed_ranges(mtrr_state.fixed_ranges); +} + static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) { unsigned i; diff --git a/include/asm-i386/mtrr.h b/include/asm-i386/mtrr.h index 07f063ae26ea..02a41b99bd7e 100644 --- a/include/asm-i386/mtrr.h +++ b/include/asm-i386/mtrr.h @@ -69,6 +69,7 @@ struct mtrr_gentry /* The following functions are for use by other drivers */ # ifdef CONFIG_MTRR +extern void mtrr_save_fixed_ranges(void *); extern int mtrr_add (unsigned long base, unsigned long size, unsigned int type, char increment); extern int mtrr_add_page (unsigned long base, unsigned long size, @@ -79,6 +80,7 @@ extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); # else +#define mtrr_save_fixed_ranges(arg) do {} while (0) static __inline__ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, char increment) { diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h index 66809eca98cf..1b326cbb9305 100644 --- a/include/asm-x86_64/mtrr.h +++ b/include/asm-x86_64/mtrr.h @@ -138,9 +138,11 @@ struct mtrr_gentry32 #ifdef CONFIG_MTRR extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void mtrr_save_fixed_ranges(void *); #else #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define mtrr_save_fixed_ranges(arg) do {} while (0) #endif #endif /* __KERNEL__ */ From 2b1f6278d77c1f2f669346fc2bb48012b5e9495a Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 170/231] [PATCH] x86: Save the MTRRs of the BSP before booting an AP Applied fix by Andew Morton: http://lkml.org/lkml/2007/4/8/88 - Fix `make headers_check'. AMD and Intel x86 CPU manuals state that it is the responsibility of system software to initialize and maintain MTRR consistency across all processors in Multi-Processing Environments. Quote from page 188 of the AMD64 System Programming manual (Volume 2): 7.6.5 MTRRs in Multi-Processing Environments "In multi-processing environments, the MTRRs located in all processors must characterize memory in the same way. Generally, this means that identical values are written to the MTRRs used by the processors." (short omission here) "Failure to do so may result in coherency violations or loss of atomicity. Processor implementations do not check the MTRR settings in other processors to ensure consistency. It is the responsibility of system software to initialize and maintain MTRR consistency across all processors." Current Linux MTRR code already implements the above in the case that the BIOS does not properly initialize MTRRs on the secondary processors, but the case where the fixed-range MTRRs of the boot processor are changed after Linux started to boot, before the initialsation of a secondary processor, is not handled yet. In this case, secondary processors are currently initialized by Linux with MTRRs which the boot processor had very early, when mtrr_bp_init() did run, but not with the MTRRs which the boot processor uses at the time when that secondary processors is actually booted, causing differing MTRR contents on the secondary processors. Such situation happens on Acer Ferrari 1000 and 5000 notebooks where the BIOS enables and sets AMD-specific IORR bits in the fixed-range MTRRs of the boot processor when it transitions the system into ACPI mode. The SMI handler of the BIOS does this in SMM, entered while Linux ACPI code runs acpi_enable(). Other occasions where the SMI handler of the BIOS may change bits in the MTRRs could occur as well. To initialize newly booted secodary processors with the fixed-range MTRRs which the boot processor uses at that time, this patch saves the fixed-range MTRRs of the boot processor before new secondary processors are started. When the secondary processors run their Linux initialisation code, their fixed-range MTRRs will be updated with the saved fixed-range MTRRs. If CONFIG_MTRR is not set, we define mtrr_save_state as an empty statement because there is nothing to do. Possible TODOs: *) CPU-hotplugging outside of SMP suspend/resume is not yet tested with this patch. *) If, even in this case, an AP never runs i386/do_boot_cpu or x86_64/cpu_up, then the calls to mtrr_save_state() could be replaced by calls to mtrr_save_fixed_ranges(NULL) and mtrr_save_state() would not be needed. That would need either verification of the CPU-hotplug code or at least a test on a >2 CPU machine. *) The MTRRs of other running processors are not yet checked at this time but it might be interesting to syncronize the MTTRs of all processors before booting. That would be an incremental patch, but of rather low priority since there is no machine known so far which would require this. AK: moved prototypes on x86-64 around to fix warnings Signed-off-by: Bernhard Kaindl Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Dave Jones --- arch/i386/kernel/cpu/mtrr/main.c | 11 +++++++++++ arch/i386/kernel/smpboot.c | 7 +++++++ arch/x86_64/kernel/smpboot.c | 6 ++++++ include/asm-i386/mtrr.h | 2 ++ include/asm-x86_64/mtrr.h | 2 ++ 5 files changed, 28 insertions(+) diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 0acfb6a5a220..02a2f39e5e0a 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -729,6 +729,17 @@ void mtrr_ap_init(void) local_irq_restore(flags); } +/** + * Save current fixed-range MTRR state of the BSP + */ +void mtrr_save_state(void) +{ + if (smp_processor_id() == 0) + mtrr_save_fixed_ranges(NULL); + else + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 7c1dbef399cd..f14d93351a82 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -58,6 +58,7 @@ #include #include #include +#include /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -814,6 +815,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + /* * We can't use kernel_thread since we must avoid to * reschedule the child. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 14724be48bec..ddc392bee243 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -944,6 +944,12 @@ int __cpuinit __cpu_up(unsigned int cpu) return -ENOSYS; } + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); diff --git a/include/asm-i386/mtrr.h b/include/asm-i386/mtrr.h index 02a41b99bd7e..7e9c7ccbdcfe 100644 --- a/include/asm-i386/mtrr.h +++ b/include/asm-i386/mtrr.h @@ -70,6 +70,7 @@ struct mtrr_gentry /* The following functions are for use by other drivers */ # ifdef CONFIG_MTRR extern void mtrr_save_fixed_ranges(void *); +extern void mtrr_save_state(void); extern int mtrr_add (unsigned long base, unsigned long size, unsigned int type, char increment); extern int mtrr_add_page (unsigned long base, unsigned long size, @@ -81,6 +82,7 @@ extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); # else #define mtrr_save_fixed_ranges(arg) do {} while (0) +#define mtrr_save_state() do {} while (0) static __inline__ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, char increment) { diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h index 1b326cbb9305..b557c486bef8 100644 --- a/include/asm-x86_64/mtrr.h +++ b/include/asm-x86_64/mtrr.h @@ -139,10 +139,12 @@ struct mtrr_gentry32 extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); extern void mtrr_save_fixed_ranges(void *); +extern void mtrr_save_state(void); #else #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) #define mtrr_save_fixed_ranges(arg) do {} while (0) +#define mtrr_save_state() do {} while (0) #endif #endif /* __KERNEL__ */ From 3ebad5905609476a4ff1151a66b21d9794009961 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 171/231] [PATCH] x86: Save and restore the fixed-range MTRRs of the BSP when suspending Note: This patch didn'nt need an update since it's initial post. Some BIOSes may modify fixed-range MTRRs in SMM, e.g. when they transition the system into ACPI mode, which is entered thru an SMI, triggered by Linux in acpi_enable(). SMIs which cause that Linux is interrupted and BIOS code is executed (which may change e.g. fixed-range MTRRs) in SMM may be raised by an embedded system controller which is often found in notebooks also at other occasions. If we would not update our copy of the fixed-range MTRRs before suspending to RAM or to disk, restore_processor_state() would set the fixed-range MTRRs of the BSP using old backup values which may be outdated and this could cause the system to fail later during resume. This patch ensures that our copy of the fixed-range MTRRs is updated when saving the boot processor state on suspend to disk and suspend to RAM. In combination with other patches this allows to fix s2ram and s2disk on the Acer Ferrari 1000 notebook and at least s2disk on the Acer Ferrari 5000 notebook. Signed-off-by: Bernhard Kaindl Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andi Kleen Cc: Dave Jones --- arch/i386/power/cpu.c | 1 + arch/x86_64/kernel/suspend.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 2c15500f8713..998fd3ec0d68 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -21,6 +21,7 @@ unsigned long saved_context_eflags; void __save_processor_state(struct saved_context *ctxt) { + mtrr_save_fixed_ranges(NULL); kernel_fpu_begin(); /* diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 4ca523d58a5b..6a5a98f2a75c 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -12,6 +12,7 @@ #include #include #include +#include /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; @@ -48,6 +49,7 @@ void __save_processor_state(struct saved_context *ctxt) rdmsrl(MSR_FS_BASE, ctxt->fs_base); rdmsrl(MSR_GS_BASE, ctxt->gs_base); rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); /* * control registers From de938c51d5fec4ae03af64b06beb15d4423ec611 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 172/231] [PATCH] i386: Enable support for fixed-range IORRs to keep RdMem & WrMem in sync If our copy of the MTRRs of the BSP has RdMem or WrMem set, and we are running on an AMD64/K8 system, the boot CPU must have had MtrrFixDramEn and MtrrFixDramModEn set (otherwise our RDMSR would have copied these bits cleared), so we set them on this CPU as well. This allows us to keep the AMD64/K8 RdMem and WrMem bits in sync across the CPUs of SMP systems in order to fullfill the duty of system software to "initialize and maintain MTRR consistency across all processors." as written in the AMD and Intel manuals. If an WRMSR instruction fails because MtrrFixDramModEn is not set, I expect that also the Intel-style MTRR bits are not updated. AK: minor cleanup, moved MSR defines around Signed-off-by: Bernhard Kaindl Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andi Kleen Cc: Dave Jones --- arch/i386/kernel/cpu/mtrr/generic.c | 85 ++++++++++++++++++++--------- include/asm-i386/msr-index.h | 5 ++ 2 files changed, 65 insertions(+), 25 deletions(-) diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index 150cf5055a3c..000c07e86433 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -20,6 +20,18 @@ struct mtrr_state { mtrr_type def_type; }; +struct fixed_range_block { + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ +}; + +static struct fixed_range_block fixed_range_blocks[] = { + { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + {} +}; + static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; @@ -152,6 +164,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } +/** + * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs + * see AMD publication no. 24593, chapter 3.2.1 for more information + */ +static inline void k8_enable_fixed_iorrs(void) +{ + unsigned lo, hi; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_K8_SYSCFG, lo + | K8_MTRRFIXRANGE_DRAM_ENABLE + | K8_MTRRFIXRANGE_DRAM_MODIFY, hi); +} + +/** + * Checks and updates an fixed-range MTRR if it differs from the value it + * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also. + * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information + * \param msr MSR address of the MTTR which should be checked and updated + * \param changed pointer which indicates whether the MTRR needed to be changed + * \param msrwords pointer to the MSR values which the MSR should have + */ +static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) +{ + unsigned lo, hi; + + rdmsr(msr, lo, hi); + + if (lo != msrwords[0] || hi != msrwords[1]) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 15 && + ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) + k8_enable_fixed_iorrs(); + mtrr_wrmsr(msr, msrwords[0], msrwords[1]); + *changed = TRUE; + } +} + int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. @@ -201,36 +251,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *type = base_lo & 0xff; } +/** + * Checks and updates the fixed-range MTRRs if they differ from the saved set + * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() + */ static int set_fixed_ranges(mtrr_type * frs) { - unsigned int *p = (unsigned int *) frs; + unsigned long long *saved = (unsigned long long *) frs; int changed = FALSE; - int i; - unsigned int lo, hi; + int block=-1, range; - rdmsr(MTRRfix64K_00000_MSR, lo, hi); - if (p[0] != lo || p[1] != hi) { - mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); - changed = TRUE; - } + while (fixed_range_blocks[++block].ranges) + for (range=0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *) saved++); - for (i = 0; i < 2; i++) { - rdmsr(MTRRfix16K_80000_MSR + i, lo, hi); - if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) { - mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], - p[3 + i * 2]); - changed = TRUE; - } - } - - for (i = 0; i < 8; i++) { - rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi); - if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) { - mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], - p[7 + i * 2]); - changed = TRUE; - } - } return changed; } diff --git a/include/asm-i386/msr-index.h b/include/asm-i386/msr-index.h index f1190802283d..a02eb2991349 100644 --- a/include/asm-i386/msr-index.h +++ b/include/asm-i386/msr-index.h @@ -87,6 +87,11 @@ #define MSR_K7_CLK_CTL 0xc001001b #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 + +#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */ +#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */ +#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */ + #define MSR_K7_HWCR 0xc0010015 #define MSR_K8_HWCR 0xc0010015 #define MSR_K7_FID_VID_CTL 0xc0010041 From f2b218dd6199983b120a96bc6531c1b81f4090d8 Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 173/231] [PATCH] i386: safe_apic_wait_icr_idle - i386 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apic_wait_icr_idle looks like this: static __inline__ void apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } The busy loop in this function would not be problematic if the corresponding status bit in the ICR were always updated, but that does not seem to be the case under certain crash scenarios. Kdump uses an IPI to stop the other CPUs in the event of a crash, but when any of the other CPUs are locked-up inside the NMI handler the CPU that sends the IPI will end up looping forever in the ICR check, effectively hard-locking the whole system. Quoting from Intel's "MultiProcessor Specification" (Version 1.4), B-3: "A local APIC unit indicates successful dispatch of an IPI by resetting the Delivery Status bit in the Interrupt Command Register (ICR). The operating system polls the delivery status bit after sending an INIT or STARTUP IPI until the command has been dispatched. A period of 20 microseconds should be sufficient for IPI dispatch to complete under normal operating conditions. If the IPI is not successfully dispatched, the operating system can abort the command. Alternatively, the operating system can retry the IPI by writing the lower 32-bit double word of the ICR. This “time-out” mechanism can be implemented through an external interrupt, if interrupts are enabled on the processor, or through execution of an instruction or time-stamp counter spin loop." Intel's documentation suggests the implementation of a time-out mechanism, which, by the way, is already being open-coded in some parts of the kernel that tinker with ICR. Create a apic_wait_icr_idle replacement that implements the time-out mechanism and that can be used to solve the aforementioned problem. AK: moved both functions out of line AK: added improved loop from Keith Owens Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 22 ++++++++++++++++++++++ include/asm-i386/apic.h | 9 +++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 93aa911646ad..aca054cc0552 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -129,6 +129,28 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned long safe_apic_wait_icr_idle(void) +{ + unsigned long send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index a19810a08ae9..1e8f6f252dd3 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -2,6 +2,7 @@ #define __ASM_APIC_H #include +#include #include #include #include @@ -64,12 +65,8 @@ static __inline fastcall unsigned long native_apic_read(unsigned long reg) return *((volatile unsigned long *)(APIC_BASE+reg)); } -static __inline__ void apic_wait_icr_idle(void) -{ - while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY ) - cpu_relax(); -} - +void apic_wait_icr_idle(void); +unsigned long safe_apic_wait_icr_idle(void); int get_physical_broadcast(void); #ifdef CONFIG_X86_GOOD_APIC From 8339e9fba33aa3205f541478c413982c0ac5a37f Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 174/231] [PATCH] x86-64: safe_apic_wait_icr_idle - x86_64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apic_wait_icr_idle looks like this: static __inline__ void apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } The busy loop in this function would not be problematic if the corresponding status bit in the ICR were always updated, but that does not seem to be the case under certain crash scenarios. Kdump uses an IPI to stop the other CPUs in the event of a crash, but when any of the other CPUs are locked-up inside the NMI handler the CPU that sends the IPI will end up looping forever in the ICR check, effectively hard-locking the whole system. Quoting from Intel's "MultiProcessor Specification" (Version 1.4), B-3: "A local APIC unit indicates successful dispatch of an IPI by resetting the Delivery Status bit in the Interrupt Command Register (ICR). The operating system polls the delivery status bit after sending an INIT or STARTUP IPI until the command has been dispatched. A period of 20 microseconds should be sufficient for IPI dispatch to complete under normal operating conditions. If the IPI is not successfully dispatched, the operating system can abort the command. Alternatively, the operating system can retry the IPI by writing the lower 32-bit double word of the ICR. This “time-out” mechanism can be implemented through an external interrupt, if interrupts are enabled on the processor, or through execution of an instruction or time-stamp counter spin loop." Intel's documentation suggests the implementation of a time-out mechanism, which, by the way, is already being open-coded in some parts of the kernel that tinker with ICR. Create a apic_wait_icr_idle replacement that implements the time-out mechanism and that can be used to solve the aforementioned problem. AK: moved both functions out of line AK: Added improved loop from Keith Owens Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 22 ++++++++++++++++++++++ include/asm-x86_64/apic.h | 8 +++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 3421f21b6c70..943ec4d1bd8e 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -68,6 +68,28 @@ int using_apic_timer __read_mostly = 0; static void apic_pm_activate(void); +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned int safe_apic_wait_icr_idle(void) +{ + unsigned int send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 2f3b013595ab..45e9fca1febc 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -2,6 +2,7 @@ #define __ASM_APIC_H #include +#include #include #include #include @@ -47,11 +48,8 @@ static __inline unsigned int apic_read(unsigned long reg) return *((volatile unsigned int *)(APIC_BASE+reg)); } -static __inline__ void apic_wait_icr_idle(void) -{ - while (apic_read( APIC_ICR ) & APIC_ICR_BUSY) - cpu_relax(); -} +extern void apic_wait_icr_idle(void); +extern unsigned int safe_apic_wait_icr_idle(void); static inline void ack_APIC_irq(void) { From ae08e43eecd250e50ffa40c4bd62a65e005b875b Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 175/231] [PATCH] i386: use safe_apic_wait_icr_idle - i386 The functionality provided by the new safe_apic_wait_icr_idle is being open-coded all over "kernel/smpboot.c". Use safe_apic_wait_icr_idle instead to consolidate code and ease maintenance. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/i386/kernel/smpboot.c | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index f14d93351a82..94dce14a1b37 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -563,8 +563,8 @@ static inline void __inquire_remote_apic(int apicid) static int __devinit wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int timeout, maxlvt; + unsigned long send_status, accept_status = 0; + int maxlvt; /* Target chip */ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); @@ -574,12 +574,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. @@ -609,8 +604,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) static int __devinit wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; /* * Be paranoid about clearing APIC errors. @@ -635,12 +630,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mdelay(10); @@ -653,12 +643,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); atomic_set(&init_deasserted, 1); @@ -714,12 +699,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. From ea8c733b98b73289421ed8f8fc15cfbce3adc3bc Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 176/231] [PATCH] x86-64: use safe_apic_wait_icr_idle in smpboot.c - x86_64 The functionality provided by the new safe_apic_wait_icr_idle is being open-coded all over "kernel/smpboot.c". Use safe_apic_wait_icr_idle instead to consolidate code and ease maintenance. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index ddc392bee243..f2a234b390e8 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -429,8 +429,8 @@ static void inquire_remote_apic(int apicid) */ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; Dprintk("Asserting INIT.\n"); @@ -446,12 +446,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mdelay(10); @@ -464,12 +459,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mb(); atomic_set(&init_deasserted, 1); @@ -508,12 +498,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. From 4312fa8157f5ebdaf36e615cf40f9d7a6137a22c Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 177/231] [PATCH] i386: use safe_apic_wait_icr_idle in smpboot.c __inquire_remote_apic is used for APIC debugging, so use safe_apic_wait_icr_idle instead of apic_wait_icr_idle to avoid possible lockups when APIC delivery fails. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/i386/kernel/smpboot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 94dce14a1b37..a768eceeac37 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -521,7 +521,8 @@ static inline void __inquire_remote_apic(int apicid) { int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; + int timeout; + unsigned long status; printk("Inquiring remote APIC #%d...\n", apicid); @@ -531,7 +532,9 @@ static inline void __inquire_remote_apic(int apicid) /* * Wait for idle. */ - apic_wait_icr_idle(); + status = safe_apic_wait_icr_idle(); + if (status) + printk("a previous APIC delivery may have failed\n"); apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); From 3144c332fa2ee4c4b9804aae5fe31098f04bffd9 Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH 178/231] [PATCH] x86-64: use safe_apic_wait_icr_idle in smpboot.c - x86_64 inquire_remote_apic is used for APIC debugging, so use safe_apic_wait_icr_idle instead of apic_wait_icr_idle to avoid possible lockups when APIC delivery fails. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f2a234b390e8..082f478fb777 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -391,7 +391,8 @@ static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; + int timeout; + unsigned int status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); @@ -401,7 +402,9 @@ static void inquire_remote_apic(int apicid) /* * Wait for idle. */ - apic_wait_icr_idle(); + status = safe_apic_wait_icr_idle(); + if (status) + printk("a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); From 45ae5e968ea01d8326833ca2863cec5183ce1930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20=5B**=20ISO-8859-1=20charset=20**=5D=20V?= =?UTF-8?q?=E1zquezCao?= Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 179/231] [PATCH] i386: __send_IPI_dest_field - i386 Implement __send_IPI_dest_field which can be used to send IPIs when the "destination shorthand" field of the ICR is set to 00 (destination field). Use it whenever possible. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/i386/kernel/smp.c | 47 ++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 892cd64130bc..d14ffe2109b8 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -165,16 +165,13 @@ void fastcall send_IPI_self(int vector) } /* - * This is only used on smaller machines. + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +static inline void __send_IPI_dest_field(unsigned long mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; unsigned long cfg; - unsigned long flags; - local_irq_save(flags); - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. */ @@ -195,13 +192,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); +} +/* + * This is only used on smaller machines. + */ +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long flags; + + local_irq_save(flags); + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + __send_IPI_dest_field(mask, vector); local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) { - unsigned long cfg, flags; + unsigned long flags; unsigned int query_cpu; /* @@ -211,30 +220,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { if (cpu_isset(query_cpu, mask)) { - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu)); - apic_write_around(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), + vector); } } local_irq_restore(flags); From 9062d888aa448318e38792b6879a795dd10adda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20=5B**=20ISO-8859-1=20charset=20**=5D=20V?= =?UTF-8?q?=E1zquezCao?= Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 180/231] [PATCH] x86-64: __send_IPI_dest_field - x86_64 Implement __send_IPI_dest_field which can be used to send IPIs when the "destination shorthand" field of the ICR is set to 00 (destination field). Use it whenever possible. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_flat.c | 23 +------------ include/asm-x86_64/ipi.h | 54 ++++++++++++++++++------------- 2 files changed, 33 insertions(+), 44 deletions(-) diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 9e0a552f0e4a..01d3939e3a9c 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -61,31 +61,10 @@ static void flat_init_apic_ldr(void) static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; - unsigned long cfg; unsigned long flags; local_irq_save(flags); - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - apic_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); + __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h index ffa6f1517f1a..26961e671948 100644 --- a/include/asm-x86_64/ipi.h +++ b/include/asm-x86_64/ipi.h @@ -74,10 +74,39 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, unsign apic_write(APIC_ICR, cfg); } +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +static inline void __send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + apic_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) { - unsigned long cfg, flags; + unsigned long flags; unsigned long query_cpu; /* @@ -86,28 +115,9 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) * - mbligh */ local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(x86_cpu_to_apicid[query_cpu]); - apic_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, APIC_DEST_PHYSICAL); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); + __send_IPI_dest_field(x86_cpu_to_apicid[query_cpu], + vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } From f5efb41e793ce587836b89b7191083b9a6185ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20=5B**=20ISO-8859-1=20charset=20**=5D=20V?= =?UTF-8?q?=E1zquezCao?= Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 181/231] [PATCH] i386: Use safe_apic_wait_icr_idle in safe_apic_wait_icr_idle - i386 Use safe_apic_wait_icr_idle to check ICR idle bit if the vector is NMI_VECTOR to avoid potential hangups in the event of crash when kdump tries to stop the other CPUs. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/i386/kernel/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index d14ffe2109b8..f8667109db1c 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -175,7 +175,10 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * Wait for idle. */ - apic_wait_icr_idle(); + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + apic_wait_icr_idle(); /* * prepare target chip field From 70ae77f497a57b3ef6b0987b6310327264517cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20=5B**=20ISO-8859-1=20charset=20**=5D=20V?= =?UTF-8?q?=E1zquezCao?= Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 182/231] [PATCH] x86-64: Use safe_apic_wait_icr_idle in __send_IPI_dest_field - x86_64 Use safe_apic_wait_icr_idle to check ICR idle bit if the vector is NMI_VECTOR to avoid potential hangups in the event of crash when kdump tries to stop the other CPUs. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- include/asm-x86_64/ipi.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h index 26961e671948..a7c75ea408a8 100644 --- a/include/asm-x86_64/ipi.h +++ b/include/asm-x86_64/ipi.h @@ -85,7 +85,10 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector, unsigned /* * Wait for idle. */ - apic_wait_icr_idle(); + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + apic_wait_icr_idle(); /* * prepare target chip field From 25c16b992c32db4bc7e085b763abd866a505ca72 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 183/231] [PATCH] i386: fix mtrr sections Fix section mismatch warnings in mtrr code. Fix line length on one source line. WARNING: arch/x86_64/kernel/built-in.o - Section mismatch: reference to .init.data: from .text.get_mtrr_state after 'get_mtrr_state' (at offset 0x103) WARNING: arch/x86_64/kernel/built-in.o - Section mismatch: reference to .init.text: from .text.get_mtrr_state after 'get_mtrr_state' (at offset 0x180) WARNING: arch/x86_64/kernel/built-in.o - Section mismatch: reference to .init.text: from .text.get_mtrr_state after 'get_mtrr_state' (at offset 0x199) WARNING: arch/x86_64/kernel/built-in.o - Section mismatch: reference to .init.text: from .text.get_mtrr_state after 'get_mtrr_state' (at offset 0x1c1) Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/mtrr/generic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index 000c07e86433..5367e32e0403 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -38,7 +38,7 @@ static struct mtrr_state mtrr_state = {}; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." -static __initdata int mtrr_show; +static int mtrr_show; module_param_named(show, mtrr_show, bool, 0); /* Get the MSR pair relating to a var range */ @@ -68,12 +68,13 @@ void mtrr_save_fixed_ranges(void *info) get_fixed_ranges(mtrr_state.fixed_ranges); } -static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) +static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types) { unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); + printk(KERN_INFO "MTRR %05X-%05X %s\n", + base, base + step - 1, mtrr_attrib_to_str(*types)); } /* Grab all of the MTRR state for this CPU into *state */ From 0260c196c97e48e4b821031ae55912c22113ed87 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 184/231] [PATCH] i386: PARAVIRT: fix startup_ipi_hook config dependency startup_ipi_hook depends on CONFIG_X86_LOCAL_APIC, so move it to the right part of the paravirt_ops initialization. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/i386/kernel/paravirt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index f99d3d78c53f..5c10f376bce1 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -292,6 +292,7 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, + .startup_ipi_hook = paravirt_nop, #endif .set_lazy_mode = paravirt_nop, @@ -342,8 +343,6 @@ struct paravirt_ops paravirt_ops = { .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, .activate_mm = paravirt_nop, - - .startup_ipi_hook = paravirt_nop, }; EXPORT_SYMBOL(paravirt_ops); From 141a892f57972b01891df7036f567a70459c19ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 185/231] [PATCH] x86-64: move __vgetcpu_mode & __jiffies to the vsyscall_2 zone We apparently hit the 1024 limit of vsyscall_0 zone when some debugging options are set, or if __vsyscall_gtod_data is 64 bytes larger. In order to save 128 bytes from the vsyscall_0 zone, we move __vgetcpu_mode & __jiffies to vsyscall_2 zone where they really belong, since they are used only from vgetcpu() (which is in this vsyscall_2 area). After patch is applied, new layout is : ffffffffff600000 T vgettimeofday ffffffffff60004e t vsysc2 ffffffffff600140 t vread_hpet ffffffffff600150 t vread_tsc ffffffffff600180 D __vsyscall_gtod_data ffffffffff600400 T vtime ffffffffff600413 t vsysc1 ffffffffff600800 T vgetcpu ffffffffff600870 D __vgetcpu_mode ffffffffff600880 D __jiffies ffffffffff600c00 T venosys_1 Signed-off-by: Eric Dumazet Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/vmlinux.lds.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7ef0b8820cd2..32e427a95ffd 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -95,6 +95,12 @@ SECTIONS { *(.vsyscall_gtod_data) } vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) + { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) + { *(.vsyscall_2) } + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } vgetcpu_mode = VVIRT(.vgetcpu_mode); @@ -102,10 +108,6 @@ SECTIONS .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } From e8a72ffa3aa618fb25b5727c0e0ae939d30d66c0 Mon Sep 17 00:00:00 2001 From: "Keshavamurthy, Anil S" Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 186/231] [PATCH] i386: avoid checking for cpu gone when CONFIG_HOTPLUG_CPU not defined Avoid checking for cpu gone in mm hot path when CONFIG_HOTPLUG_CPU is not defined. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Gautham R Shenoy Signed-off-by: Andrew Morton --- arch/i386/kernel/smp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index f8667109db1c..f98c3ffd6fc3 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -359,10 +359,12 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); +#ifdef CONFIG_HOTPLUG_CPU /* If a CPU which we ran on has gone down, OK. */ cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) + if (unlikely(cpus_empty(cpumask))) return; +#endif /* * i'm not happy about this global shared spinlock in the From 62dbc210e2532dec061ca65eeb8bc31b6c898b01 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 187/231] [PATCH] i386: replace spin_lock_irqsave with spin_lock IRQ is already disabled through local_irq_disable(). So spin_lock_irqsave() can be replaced with spin_lock(). Signed-off-by: Hisashi Hifumi Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/reboot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 14b4de2882be..50dfc65319cd 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -198,8 +198,6 @@ static unsigned char jump_to_bios [] = */ void machine_real_restart(unsigned char *code, int length) { - unsigned long flags; - local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -212,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length) safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) */ - spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); - spin_unlock_irqrestore(&rtc_lock, flags); + spin_unlock(&rtc_lock); /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at From 1bdae4583e7abd2c1daedfc9f46ac6420a26c1b0 Mon Sep 17 00:00:00 2001 From: "Keshavamurthy, Anil S" Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 188/231] [PATCH] i386: clean up flush_tlb_others fn Cleanup flush_tlb_others(), no functional change. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/smp.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index f98c3ffd6fc3..89a45a9ddcd4 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -375,17 +375,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, flush_mm = mm; flush_va = va; -#if NR_CPUS <= BITS_PER_LONG - atomic_set_mask(cpumask, &flush_cpumask); -#else - { - int k; - unsigned long *flush_mask = (unsigned long *)&flush_cpumask; - unsigned long *cpu_mask = (unsigned long *)&cpumask; - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) - atomic_set_mask(cpu_mask[k], &flush_mask[k]); - } -#endif + cpus_or(flush_cpumask, cpumask, flush_cpumask); /* * We have to send the IPI only to * CPUs affected. From 2f3c30e6a886ddaf65cb74df82c03245050ff0aa Mon Sep 17 00:00:00 2001 From: Joachim Deguara Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 189/231] [PATCH] i386: check capability Currently the i386 architecture checks the family for mce capability and this removes that and uses the CPUID information. Tested on a K8 revE and a family10h processor. This eliminates checking of a set AMD procesor family if mce is allowed and relies on the information being in CPUID. Signed-off-by: Joachim Deguara Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/mcheck/k7.c | 3 +++ arch/i386/kernel/cpu/mcheck/mce.c | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index 7a2472557bbb..f9fa4142551e 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = k7_machine_check; wmb(); + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + printk (KERN_INFO "Intel machine check architecture supported.\n"); rdmsr (MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 4f10c62d180c..56cd485b127c 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_AMD: - if (c->x86==6 || c->x86==15) - amd_mcheck_init(c); + amd_mcheck_init(c); break; case X86_VENDOR_INTEL: From 57a4f91ae5571edd7c0428285d8df16bb8bf5f40 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 190/231] [PATCH] x86-64: Auto compute __NR_syscall_max at compile time No need to maintain it anymore Signed-off-by: Andi Kleen --- arch/x86_64/kernel/asm-offsets.c | 10 ++++++++++ arch/x86_64/kernel/syscall.c | 1 + include/asm-x86_64/unistd.h | 2 -- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c index 96687e2beb2c..778953bc636c 100644 --- a/arch/x86_64/kernel/asm-offsets.c +++ b/arch/x86_64/kernel/asm-offsets.c @@ -21,6 +21,14 @@ #define BLANK() asm volatile("\n->" : : ) +#define __NO_STUBS 1 +#undef __SYSCALL +#undef _ASM_X86_64_UNISTD_H_ +#define __SYSCALL(nr, sym) [nr] = 1, +static char syscalls[] = { +#include +}; + int main(void) { #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) @@ -71,5 +79,7 @@ int main(void) DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); BLANK(); DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); return 0; } diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c index 213fd6ab789d..63d592c276cc 100644 --- a/arch/x86_64/kernel/syscall.c +++ b/arch/x86_64/kernel/syscall.c @@ -3,6 +3,7 @@ #include #include #include +#include #define __NO_STUBS diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 576b29732d3c..26e23e01c54a 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -620,8 +620,6 @@ __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 279 __SYSCALL(__NR_move_pages, sys_move_pages) -#define __NR_syscall_max __NR_move_pages - #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT From 62918a036148230ba1ad175dc8a0952e3752ac57 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 191/231] [PATCH] x86-64: skip cache_free_alien() on non NUMA Set use_alien_caches to 0 on non NUMA platforms. And avoid calling the cache_free_alien() when use_alien_caches is not set. This will avoid the cache miss that happens while dereferencing slabp to get nodeid. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Eric Dumazet Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton --- mm/slab.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 4cbac24ae2f1..168bfe9d8ffe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1146,7 +1146,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) + if (likely(slabp->nodeid == node)) return 0; l3 = cachep->nodelists[node]; @@ -1394,6 +1394,9 @@ void __init kmem_cache_init(void) int order; int node; + if (num_possible_nodes() == 1) + use_alien_caches = 0; + for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) @@ -3563,7 +3566,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + if (use_alien_caches && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { From df3624aa293dfa2d46089747d919711089a702eb Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH 192/231] [PATCH] i386: remove xtime_lock'ing around cpufreq notifier The locking of the xtime_lock around the cpu notifier is unessesary now. At one time the tsc was used after a frequency change for timekeeping, but the re-write of timekeeping no longer uses the TSC unless the frequency is constant. The variables that are changed in this section of code had also once been used for timekeeping, but not any longer .. Signed-off-by: Daniel Walker Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton --- arch/i386/kernel/tsc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 755209dc93e1..f64b81f3033b 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { if (!freq->old){ ref_freq = freq->new; - goto end; + return 0; } ref_freq = freq->old; loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; @@ -237,9 +234,6 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) } } } -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); return 0; } From c2c1accd4b2f9c82fb89d40611c7f581948db255 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 193/231] [PATCH] i386: pte clear optimization When exiting from an address space, no special hypervisor notification of page table updates needs to occur; direct page table hypervisors, such as Xen, switch to another address space first (init_mm) and unprotects the page tables to avoid the cost of trapping to the hypervisor for each pte_clear. Shadow mode hypervisors, such as VMI and lhype don't need to do the extra work of calling through paravirt-ops, and can just directly clear the page table entries without notifiying the hypervisor, since all the page tables are about to be freed. So introduce native_pte_clear functions which bypass any paravirt-ops notification. This results in a significant performance win for VMI and removes some indirect calls from zap_pte_range. Note the 3-level paging already had a native_pte_clear function, thus demanding argument conformance and extra args for the 2-level definition. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- include/asm-i386/pgtable-2level.h | 5 +++++ include/asm-i386/pgtable.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 781fe4bcc962..85d9005c0cdf 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -36,6 +36,11 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) +{ + *xp = __pte(0); +} + static inline pte_t native_ptep_get_and_clear(pte_t *xp) { return __pte(xchg(&xp->pte_low, 0)); diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index e7ddd2341309..00e97a9d367e 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -344,7 +344,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long pte_t pte; if (full) { pte = *ptep; - pte_clear(mm, addr, ptep); + native_pte_clear(mm, addr, ptep); } else { pte = ptep_get_and_clear(mm, addr, ptep); } From 142dd975911fdd82b1b6f6617cd20ac90a8ccf00 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 194/231] [PATCH] i386: pte xchg optimization In situations where page table updates need only be made locally, and there is no cross-processor A/D bit races involved, we need not use the heavyweight xchg instruction to atomically fetch and clear page table entries. Instead, we can just read and clear them directly. This introduces a neat optimization for non-SMP kernels; drop the atomic xchg operations from page table updates. Thanks to Michel Lespinasse for noting this potential optimization. Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- include/asm-i386/pgtable-2level.h | 14 ++++++++++++++ include/asm-i386/pgtable-3level.h | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 85d9005c0cdf..3daab67cd366 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -41,10 +41,24 @@ static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pt *xp = __pte(0); } +/* local pte updates need not use xchg for locking */ +static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res; + + res = *ptep; + native_pte_clear(NULL, 0, ptep); + return res; +} + +#ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *xp) { return __pte(xchg(&xp->pte_low, 0)); } +#else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) +#endif #define pte_page(x) pfn_to_page(pte_pfn(x)) #define pte_none(x) (!(x).pte_low) diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index 664bfee5a2f2..45b024181507 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -139,6 +139,17 @@ static inline void pud_clear (pud_t * pud) { } #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ pmd_index(address)) +/* local pte updates need not use xchg for locking */ +static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res; + + res = *ptep; + native_pte_clear(NULL, 0, ptep); + return res; +} + +#ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { pte_t res; @@ -150,6 +161,9 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) return res; } +#else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) +#endif #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t a, pte_t b) From 9e5e3162b2d5e4466187ecd63c9eec2de33cb7bc Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 195/231] [PATCH] i386: pte simplify ops Add comment and condense code to make use of native_local_ptep_get_and_clear function. Also, it turns out the 2-level and 3-level paging definitions were identical, so move the common definition into pgtable.h Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- include/asm-i386/pgtable-2level.h | 10 ---------- include/asm-i386/pgtable-3level.h | 10 ---------- include/asm-i386/pgtable.h | 17 +++++++++++++++-- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 3daab67cd366..a50fd1773de8 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -41,16 +41,6 @@ static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pt *xp = __pte(0); } -/* local pte updates need not use xchg for locking */ -static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) -{ - pte_t res; - - res = *ptep; - native_pte_clear(NULL, 0, ptep); - return res; -} - #ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *xp) { diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index 45b024181507..eb0f1d7e96a1 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -139,16 +139,6 @@ static inline void pud_clear (pud_t * pud) { } #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ pmd_index(address)) -/* local pte updates need not use xchg for locking */ -static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) -{ - pte_t res; - - res = *ptep; - native_pte_clear(NULL, 0, ptep); - return res; -} - #ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 00e97a9d367e..c6b8b944120c 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -269,6 +269,16 @@ extern void vmalloc_sync_all(void); #define pte_update_defer(mm, addr, ptep) do { } while (0) #endif +/* local pte updates need not use xchg for locking */ +static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res = *ptep; + + /* Pure native function needs no input for mm, addr */ + native_pte_clear(NULL, 0, ptep); + return res; +} + /* * We only update the dirty/accessed state if we set * the dirty bit by hand in the kernel, since the hardware @@ -343,8 +353,11 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long { pte_t pte; if (full) { - pte = *ptep; - native_pte_clear(mm, addr, ptep); + /* + * Full address destruction in progress; paravirt does not + * care about updates and native needs no locking + */ + pte = native_local_ptep_get_and_clear(ptep); } else { pte = ptep_get_and_clear(mm, addr, ptep); } From f26d6a2bbcf381230df771123578380584004631 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 196/231] [PATCH] i386: convert to the kthread API This patch just trivial converts from calling kernel_thread and daemonize to just calling kthread_run. Signed-off-by: Eric W. Biederman Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 8191583032d0..f23a17b3b8cf 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -661,8 +662,6 @@ static int balanced_irq(void *unused) unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; - daemonize("kirqd"); - /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < NR_IRQS ; i++) { irq_desc[i].pending_mask = cpumask_of_cpu(0); @@ -722,10 +721,9 @@ static int __init balanced_irq_init(void) } printk(KERN_INFO "Starting balanced_irq\n"); - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) return 0; - else - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: for_each_possible_cpu(i) { kfree(irq_cpu_data[i].irq_delta); From 425001fea782dd072cfec5694b93778eb59a1fd3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 197/231] [PATCH] x86-64: unexport cpu_llc_id WARNING: arch/x86_64/kernel/built-in.o - Section mismatch: reference to .init.data:cpu_llc_id from __ksymtab between '__ksymtab_cpu_llc_id' (at offset 0x4a0) and '__ksymtab_smp_num_siblings' It is strange to export a __cpuinitdata symbols to modules, and no module appears to use it anyway. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 082f478fb777..4d9dacfae575 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -67,7 +67,6 @@ EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; From 889f21ce272e38db19c8114a7e0a5793d4590077 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 198/231] [PATCH] i386: fix wrong comment for syscall stack layout `ret_from_sys_call' label no longer exist and `syscall_exit' label was introduced instead. Signed-off-by: Satoru Takeuchi Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 90ffcdb69838..b1f16ee65e4d 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -15,7 +15,7 @@ * I changed all the .align's to 4 (16 byte alignment), as that's faster * on a 486. * - * Stack layout in 'ret_from_system_call': + * Stack layout in 'syscall_exit': * ptrace needs to have all regs on the stack. * if the order here is changed, it needs to be * updated in fork.c:copy_process, signal.c:do_signal, From f82af20e1a028e16b9bb11da081fa1148d40fa6a Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 199/231] [PATCH] x86-64: ignore vgacon if hardware not present Avoid trying to set up vgacon if there's no vga hardware present. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Alan Acked-by: Ingo Molnar --- drivers/video/console/vgacon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index 91a20785108a..3e67c34df9a5 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -371,7 +371,8 @@ static const char *vgacon_startup(void) } /* VGA16 modes are not handled by VGACON */ - if ((ORIG_VIDEO_MODE == 0x0D) || /* 320x200/4 */ + if ((ORIG_VIDEO_MODE == 0x00) || /* SCREEN_INFO not initialized */ + (ORIG_VIDEO_MODE == 0x0D) || /* 320x200/4 */ (ORIG_VIDEO_MODE == 0x0E) || /* 640x200/4 */ (ORIG_VIDEO_MODE == 0x10) || /* 640x350/4 */ (ORIG_VIDEO_MODE == 0x12) || /* 640x480/4 */ From 8a336b0a4b6dfacc8cc5fd617ba1e1904077de2d Mon Sep 17 00:00:00 2001 From: Tim Hockin Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH 200/231] [PATCH] x86-64: Dynamically adjust machine check interval Background: We've found that MCEs (specifically DRAM SBEs) tend to come in bunches, especially when we are trying really hard to stress the system out. The current MCE poller uses a static interval which does not care whether it has or has not found MCEs recently. Description: This patch makes the MCE poller adjust the polling interval dynamically. If we find an MCE, poll 2x faster (down to 10 ms). When we stop finding MCEs, poll 2x slower (up to check_interval seconds). The check_interval tunable becomes the max polling interval. The "Machine check events logged" printk() is rate limited to the check_interval, which should be identical behavior to the old functionality. Result: If you start to take a lot of correctable errors (not exceptions), you log them faster and more accurately (less chance of overflowing the MCA registers). If you don't take a lot of errors, you will see no change. Alternatives: I considered simply reducing the polling interval to 10 ms immediately and keeping it there as long as we continue to find errors. This felt a bit heavy handed, but does perform significantly better for the default check_interval of 5 minutes (we're using a few seconds when testing for DRAM errors). I could be convinced to go with this, if anyone felt it was not too aggressive. Testing: I used an error-injecting DIMM to create lots of correctable DRAM errors and verified that the polling interval accelerates. The printk() only happens once per check_interval seconds. Patch: This patch is against 2.6.21-rc7. Signed-Off-By: Tim Hockin Signed-off-by: Andi Kleen --- Documentation/x86_64/machinecheck | 7 ++++++- arch/x86_64/kernel/mce.c | 32 +++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck index 068a6d9904b9..feaeaf6f6e4d 100644 --- a/Documentation/x86_64/machinecheck +++ b/Documentation/x86_64/machinecheck @@ -36,7 +36,12 @@ between all CPUs. check_interval How often to poll for corrected machine check errors, in seconds - (Note output is hexademical). Default 5 minutes. + (Note output is hexademical). Default 5 minutes. When the poller + finds MCEs it triggers an exponential speedup (poll more often) on + the polling interval. When the poller stops finding MCEs, it + triggers an exponential backoff (poll less often) on the polling + interval. The check_interval variable is both the initial and + maximum polling interval. tolerant Tolerance level. When a machine check exception occurs for a non diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 8011a8e1c7d4..fa2672682477 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -323,10 +323,13 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) #endif /* CONFIG_X86_MCE_INTEL */ /* - * Periodic polling timer for "silent" machine check errors. + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). */ static int check_interval = 5 * 60; /* 5 minutes */ +static int next_interval; /* in jiffies */ static void mcheck_timer(struct work_struct *work); static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); @@ -339,7 +342,6 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { on_each_cpu(mcheck_check_cpu, NULL, 1, 1); - schedule_delayed_work(&mcheck_work, check_interval * HZ); /* * It's ok to read stale data here for notify_user and @@ -349,17 +351,30 @@ static void mcheck_timer(struct work_struct *work) * writes. */ if (notify_user && console_logged) { + static unsigned long last_print; + unsigned long now = jiffies; + + /* if we logged an MCE, reduce the polling interval */ + next_interval = max(next_interval/2, HZ/100); notify_user = 0; clear_bit(0, &console_logged); - printk(KERN_INFO "Machine check events logged\n"); + if (time_after_eq(now, last_print + (check_interval*HZ))) { + last_print = now; + printk(KERN_INFO "Machine check events logged\n"); + } + } else { + next_interval = min(next_interval*2, check_interval*HZ); } + + schedule_delayed_work(&mcheck_work, next_interval); } static __init int periodic_mcheck_init(void) { - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, next_interval); return 0; } __initcall(periodic_mcheck_init); @@ -597,12 +612,13 @@ static int mce_resume(struct sys_device *dev) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (check_interval) + if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ on_each_cpu(mce_init, NULL, 1, 1); - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, next_interval); } static struct sysdev_class mce_sysclass = { From e3f1caeef9a70b0699518092d653c15274b025ab Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 201/231] [PATCH] x86-64: set node_possible_map at runtime - try 2 Set the node_possible_map at runtime on x86_64. On a non NUMA system, num_possible_nodes() will now say '1'. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Eric Dumazet Cc: David Rientjes Cc: Christoph Lameter --- arch/x86_64/mm/k8topology.c | 7 ++----- arch/x86_64/mm/numa.c | 10 ++++++++-- arch/x86_64/mm/srat.c | 8 +++++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index b5b8dba28b4e..60e860e5ef4c 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -49,11 +49,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) int found = 0; u32 reg; unsigned numnodes; - nodemask_t nodes_parsed; unsigned dualcore = 0; - nodes_clear(nodes_parsed); - if (!early_pci_allowed()) return -1; @@ -102,7 +99,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodeid, (base>>8)&3, (limit>>8) & 3); return -1; } - if (node_isset(nodeid, nodes_parsed)) { + if (node_isset(nodeid, node_possible_map)) { printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); continue; @@ -155,7 +152,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; - node_set(nodeid, nodes_parsed); + node_set(nodeid, node_possible_map); } if (!found) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 5ee07bc41eb5..51548947ad3b 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -295,7 +295,7 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, ret = -1; } nodes[nid].end = *addr; - node_set_online(nid); + node_set(nid, node_possible_map); printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, nodes[nid].start, nodes[nid].end, (nodes[nid].end - nodes[nid].start) >> 20); @@ -479,7 +479,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) * SRAT. */ remove_all_active_ranges(); - for_each_online_node(i) { + for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -494,20 +494,25 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; + nodes_clear(node_possible_map); + #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, end_pfn)) return; + nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) return; + nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA if (!numa_off && !k8_scan_nodes(start_pfn< Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 202/231] [PATCH] i386: Clean up NMI watchdog code - Introduce a wd_ops structure - Convert the various nmi watchdogs over to it - This allows to split the perfctr reservation from the watchdog setup cleanly. - Do perfctr reservation globally as it should have always been - Remove dead code referenced only by unused EXPORT_SYMBOLs Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/Makefile | 2 + arch/i386/kernel/cpu/perfctr-watchdog.c | 658 +++++++++++++++++++ arch/i386/kernel/nmi.c | 831 ++---------------------- include/asm-i386/nmi.h | 8 + 4 files changed, 722 insertions(+), 777 deletions(-) create mode 100644 arch/i386/kernel/cpu/perfctr-watchdog.c diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 5fb1a7560438..74f27a463db0 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c new file mode 100644 index 000000000000..2b04c8f1db62 --- /dev/null +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -0,0 +1,658 @@ +/* local apic based NMI watchdog for various CPUs. + This file also handles reservation of performance counters for coordination + with other users (like oprofile). + + Note that these events normally don't tick when the CPU idles. This means + the frequency varies with CPU load. + + Original code for K7/P6 written by Keith Owens */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nmi_watchdog_ctlblk { + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; + +/* Interface defining a CPU specific perfctr watchdog */ +struct wd_ops { + int (*reserve)(void); + void (*unreserve)(void); + int (*setup)(unsigned nmi_hz); + void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); + void (*stop)(void *); + unsigned perfctr; + unsigned evntsel; + u64 checkbit; +}; + +static struct wd_ops *wd_ops; + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); +static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); + +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + return wd_ops ? msr - wd_ops->perfctr : 0; +} + +/* converts an msr to an appropriate reservation bit */ +/* returns the bit offset of the event selection register */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + return wd_ops ? msr - wd_ops->evntsel : 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, perfctr_nmi_owner)); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, perfctr_nmi_owner)); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, perfctr_nmi_owner)) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, perfctr_nmi_owner); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, evntsel_nmi_owner)) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, evntsel_nmi_owner); +} + +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); + +void disable_lapic_nmi_watchdog(void) +{ + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) + return; + + on_each_cpu(wd_ops->stop, NULL, 0, 1); + wd_ops->unreserve(); + + BUG_ON(atomic_read(&nmi_active) != 0); +} + +void enable_lapic_nmi_watchdog(void) +{ + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (!wd_ops) + return; + if (!wd_ops->reserve()) { + printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); + return; + } + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); +} + +/* + * Activate the NMI watchdog via the local APIC. + */ + +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + u64 counter_val; + unsigned int retval = hz; + + /* + * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + counter_val = (u64)cpu_khz * 1000; + do_div(counter_val, retval); + if (counter_val > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + retval = count + 1; + } + return retval; +} + +static void +write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsrl(perfctr_msr, 0 - count); +} + +static void write_watchdog_counter32(unsigned int perfctr_msr, + const char *descr, unsigned nmi_hz) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsr(perfctr_msr, (u32)(-count), 0); +} + +/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface + nicely stable so there is not much variety */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +static int setup_k7_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = K7_EVNTSEL_INT + | K7_EVNTSEL_OS + | K7_EVNTSEL_USR + | K7_NMI_EVENT; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + return 1; +} + +static void single_msr_stop_watchdog(void *arg) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); +} + +static int single_msr_reserve(void) +{ + if (!reserve_perfctr_nmi(wd_ops->perfctr)) + return 0; + + if (!reserve_evntsel_nmi(wd_ops->evntsel)) { + release_perfctr_nmi(wd_ops->perfctr); + return 0; + } + return 1; +} + +static void single_msr_unreserve(void) +{ + release_evntsel_nmi(wd_ops->perfctr); + release_perfctr_nmi(wd_ops->evntsel); +} + +static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); +} + +static struct wd_ops k7_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_k7_watchdog, + .rearm = single_msr_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_K7_PERFCTR0, + .evntsel = MSR_K7_EVNTSEL0, + .checkbit = 1ULL<<63, +}; + +/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +static int setup_p6_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + perfctr_msr = MSR_P6_PERFCTR0; + evntsel_msr = MSR_P6_EVNTSEL0; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = P6_EVNTSEL_INT + | P6_EVNTSEL_OS + | P6_EVNTSEL_USR + | P6_NMI_EVENT; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + return 1; +} + +static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + /* P6 based Pentium M need to re-unmask + * the apic vector but it doesn't hurt + * other P6 variant. + * ArchPerfom/Core Duo also needs this */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ + write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); +} + +static struct wd_ops p6_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_p6_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_P6_PERFCTR0, + .evntsel = MSR_P6_EVNTSEL0, + .checkbit = 1ULL<<39, +}; + +/* Intel P4 performance counters. By far the most complicated of all. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) + +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ + +static int setup_p4_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; + unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) + return 0; + +#ifdef CONFIG_SMP + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else +#endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } + + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + return 1; +} + +static void stop_p4_watchdog(void *arg) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); +} + +static int p4_reserve(void) +{ + if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) + return 0; +#ifdef CONFIG_SMP + if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) + goto fail1; +#endif + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail2; + /* RED-PEN why is ESCR1 not reserved here? */ + return 1; + fail2: +#ifdef CONFIG_SMP + if (smp_num_siblings > 1) + release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); + fail1: +#endif + release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); + return 0; +} + +static void p4_unreserve(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings > 1) + release_evntsel_nmi(MSR_P4_IQ_PERFCTR1); +#endif + release_evntsel_nmi(MSR_P4_IQ_PERFCTR0); + release_perfctr_nmi(MSR_P4_CRU_ESCR0); +} + +static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + unsigned dummy; + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); +} + +static struct wd_ops p4_wd_ops = { + .reserve = p4_reserve, + .unreserve = p4_unreserve, + .setup = setup_p4_watchdog, + .rearm = p4_rearm, + .stop = stop_p4_watchdog, + /* RED-PEN this is wrong for the other sibling */ + .perfctr = MSR_P4_BPU_PERFCTR0, + .evntsel = MSR_P4_BSU_ESCR0, + .checkbit = 1ULL<<39, +}; + +/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully + all future Intel CPUs. */ + +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(unsigned nmi_hz) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return 0; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1); + return 1; +} + +static struct wd_ops intel_arch_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_intel_arch_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, +}; + +static void probe_nmi_watchdog(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) + return; + wd_ops = &k7_wd_ops; + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + wd_ops = &intel_arch_wd_ops; + break; + } + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + return; + + wd_ops = &p6_wd_ops; + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + return; + + wd_ops = &p4_wd_ops; + break; + default: + return; + } + break; + } +} + +/* Interface to nmi.c */ + +int lapic_watchdog_init(unsigned nmi_hz) +{ + if (!wd_ops) { + probe_nmi_watchdog(); + if (!wd_ops) + return -1; + } + + if (!(wd_ops->setup(nmi_hz))) { + printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", + raw_smp_processor_id()); + return -1; + } + + return 0; +} + +void lapic_watchdog_stop(void) +{ + if (wd_ops) + wd_ops->stop(NULL); +} + +unsigned lapic_adjust_nmi_hz(unsigned hz) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) + hz = adjust_for_32bit_ctr(hz); + return hz; +} + +int lapic_wd_event(unsigned nmi_hz) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 ctr; + rdmsrl(wd->perfctr_msr, ctr); + if (ctr & wd_ops->checkbit) { /* perfctr still running? */ + return 0; + } + wd_ops->rearm(wd, nmi_hz); + return 1; +} + +int lapic_watchdog_ok(void) +{ + return wd_ops != NULL; +} diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 2dec1b105737..33cf2f3c444f 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -28,30 +27,14 @@ #include #include #include -#include #include "mach_traps.h" int unknown_nmi_panic; int nmi_watchdog_enabled; -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: - * evtsel_nmi_owner tracks the ownership of the event selection - * - different performance counters/ event selection may be reserved for - * different subsystems this reservation system just tries to coordinate - * things a little - */ - -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) - */ -#define NMI_MAX_COUNTER_BITS 66 -#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS) - -static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]); -static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]); - static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -63,203 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -struct nmi_watchdog_ctlblk { - int enabled; - u64 check_bit; - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); +static DEFINE_PER_CPU(short, wd_enabled); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the performance counter register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); - - switch (boot_cpu_data.x86) { - case 6: - return (msr - MSR_P6_PERFCTR0); - case 15: - return (msr - MSR_P4_BPU_PERFCTR0); - } - } - return 0; -} - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the event selection register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); - - switch (boot_cpu_data.x86) { - case 6: - return (msr - MSR_P6_EVNTSEL0); - case 15: - return (msr - MSR_P4_BSU_ESCR0); - } - } - return 0; -} - -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - int cpu; - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 0; - } - return 1; -} - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - int cpu; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 0; - } - return 1; -} - -static int __reserve_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]); -} - -int reserve_perfctr_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_perfctr_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_perfctr_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_perfctr_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_perfctr_nmi(cpu, msr); - } -} - -int __reserve_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]); -} - -int reserve_evntsel_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_evntsel_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_evntsel_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_evntsel_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_evntsel_nmi(cpu, msr); - } -} - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) - || (boot_cpu_data.x86 == 16)); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); - } - return 0; -} - static int endflag __initdata = 0; #ifdef CONFIG_SMP @@ -281,28 +72,6 @@ static __init void nmi_cpu_busy(void *data) } #endif -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - static int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -335,14 +104,14 @@ static int __init check_nmi_watchdog(void) if (!cpu_isset(cpu, cpu_callin_map)) continue; #endif - if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + if (!per_cpu(wd_enabled, cpu)) continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], nmi_count(cpu)); - per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } @@ -356,16 +125,8 @@ static int __init check_nmi_watchdog(void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - nmi_hz = 1; - - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - } - } + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); kfree(prev_nmi_count); return 0; @@ -388,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -static void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (nmi_known_cpu() <= 0) - return; - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - touch_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - disable_irq(0); - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) == 0) { - touch_nmi_watchdog(); - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - enable_irq(0); - } -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} +/* Suspend/resume support */ #ifdef CONFIG_PM @@ -513,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void) if (nmi_watchdog != NMI_LOCAL_APIC) return 0; - if ( atomic_read(&nmi_active) < 0 ) + if (atomic_read(&nmi_active) < 0) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -526,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + /* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. + * Enable timer based NMIs on all CPUs: */ - -static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) +void acpi_nmi_enable(void) { - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); } -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr) +static void __acpi_nmi_disable(void *__unused) { - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(void) +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) { - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_K7_PERFCTR0; - evntsel_msr = MSR_K7_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<63; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_k7_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_P6_PERFCTR0; - evntsel_msr = MSR_P6_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p6_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - -static int setup_p4_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); - } - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static int setup_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - goto fail; - - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL << (eax.split.bit_width - 1); - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return; - - wrmsr(wd->evntsel_msr, 0, 0); - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); } void setup_apic_nmi_watchdog (void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - - if (wd->enabled == 1) - return; + if (__get_cpu_var(wd_enabled)) + return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) return; - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - return; - - if (!setup_p6_watchdog()) - return; - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - return; - - if (!setup_p4_watchdog()) - return; - break; - default: - return; - } - break; - default: + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; return; } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); } - wd->enabled = 1; - atomic_inc(&nmi_active); } void stop_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; - - if (wd->enabled == 0) + if (__get_cpu_var(wd_enabled) == 0) return; - - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - stop_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - stop_intel_arch_watchdog(); - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - break; - stop_p6_watchdog(); - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - break; - stop_p4_watchdog(); - break; - } - break; - default: - return; - } - } - wd->enabled = 0; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -1008,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 dummy; int rc=0; /* check for other users first */ @@ -1052,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) alert_counter[cpu] = 0; } /* see if the nmi watchdog went off */ - if (wd->enabled) { - if (nmi_watchdog == NMI_LOCAL_APIC) { - rdmsrl(wd->perfctr_msr, dummy); - if (dummy & wd->check_bit){ - /* this wasn't a watchdog timer interrupt */ - goto done; - } - - /* only Intel P4 uses the cccr msr */ - if (wd->cccr_msr != 0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); - } - else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { - /* P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL); - } else { - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); - } - rc = 1; - } else if (nmi_watchdog == NMI_IO_APIC) { - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - } + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; } -done: return rc; } @@ -1143,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, } if (nmi_watchdog == NMI_DEFAULT) { - if (nmi_known_cpu() > 0) + if (lapic_watchdog_ok()) nmi_watchdog = NMI_LOCAL_APIC; else nmi_watchdog = NMI_IO_APIC; @@ -1179,11 +464,3 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); -EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index b04333ea6f31..fb1e133efd9f 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -50,4 +50,12 @@ void __trigger_all_cpu_backtrace(void); #endif +void lapic_watchdog_stop(void); +int lapic_watchdog_init(unsigned nmi_hz); +int lapic_wd_event(unsigned nmi_hz); +unsigned lapic_adjust_nmi_hz(unsigned hz); +int lapic_watchdog_ok(void); +void disable_lapic_nmi_watchdog(void); +void enable_lapic_nmi_watchdog(void); + #endif /* ASM_NMI_H */ From 05cb007dac9a50148daf87d0b9469e0cd05fd5e7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 203/231] [PATCH] x86-64: Use the 32bit wd_ops for 64bit too. This mainly removes a lot of code, replacing it with calls into the new 32bit perfctr-watchdog.c Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 4 +- arch/x86_64/kernel/nmi.c | 678 ++---------------------------------- include/asm-x86_64/nmi.h | 9 + 3 files changed, 45 insertions(+), 646 deletions(-) diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index a613e13d0e75..4d94c51803d8 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,8 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ + perfctr-watchdog.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -57,3 +58,4 @@ i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o pcspeaker-y += ../../i386/kernel/pcspeaker.o +perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 010d3d9bd56a..6cd2b30e2ffc 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,28 +27,11 @@ #include #include #include -#include int unknown_nmi_panic; int nmi_watchdog_enabled; int panic_on_unrecovered_nmi; -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: - * evtsel_nmi_owner tracks the ownership of the event selection - * - different performance counters/ event selection may be reserved for - * different subsystems this reservation system just tries to coordinate - * things a little - */ - -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) - */ -#define NMI_MAX_COUNTER_BITS 66 -#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS) - -static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]); -static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]); - static cpumask_t backtrace_mask = CPU_MASK_NONE; /* nmi_active: @@ -63,191 +46,11 @@ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -struct nmi_watchdog_ctlblk { - int enabled; - u64 check_bit; - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); +static DEFINE_PER_CPU(short, wd_enabled); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the performance counter register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); - else - return (msr - MSR_P4_BPU_PERFCTR0); - } - return 0; -} - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the event selection register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); - else - return (msr - MSR_P4_BSU_ESCR0); - } - return 0; -} - -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - int cpu; - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 0; - } - return 1; -} - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - int cpu; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 0; - } - return 1; -} - -static int __reserve_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 1; - return 0; -} - -static void __release_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)); -} - -int reserve_perfctr_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_perfctr_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_perfctr_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_perfctr_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) - __release_perfctr_nmi(cpu, msr); -} - -int __reserve_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]); -} - -int reserve_evntsel_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_evntsel_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_evntsel_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_evntsel_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_evntsel_nmi(cpu, msr); - } -} - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return (boot_cpu_data.x86 == 15); - } - return 0; -} - /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -277,23 +80,6 @@ static __init void nmi_cpu_busy(void *data) } #endif -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - unsigned int retval = hz; - - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { - retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } - return retval; -} - int __init check_nmi_watchdog (void) { int *counts; @@ -322,14 +108,14 @@ int __init check_nmi_watchdog (void) mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { - if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + if (!per_cpu(wd_enabled, cpu)) continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } @@ -344,13 +130,8 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - nmi_hz = 1; - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - } + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); kfree(counts); return 0; @@ -379,57 +160,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -static void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (nmi_known_cpu() <= 0) - return; - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - touch_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - disable_irq(0); - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) == 0) { - touch_nmi_watchdog(); - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - enable_irq(0); - } -} static void __acpi_nmi_disable(void *__unused) { @@ -515,275 +245,9 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_K7_PERFCTR0; - evntsel_msr = MSR_K7_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - /* Simulator may not support it */ - if (checking_wrmsrl(evntsel_msr, 0UL)) - goto fail2; - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<63; - return 1; -fail2: - __release_evntsel_nmi(-1, evntsel_msr); -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_k7_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - -static int setup_p4_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); - } - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static int setup_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - goto fail; - - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL << (eax.split.bit_width - 1); - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return; - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - void setup_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - - if (wd->enabled == 1) + if (__get_cpu_var(wd_enabled) == 1) return; /* cheap hack to support suspend/resume */ @@ -791,62 +255,31 @@ void setup_apic_nmi_watchdog(void *unused) if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) return; - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } - if (!setup_p4_watchdog()) - return; - break; - default: + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; return; } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); } - wd->enabled = 1; - atomic_inc(&nmi_active); } void stop_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; - - if (wd->enabled == 0) + if (__get_cpu_var(wd_enabled) == 0) return; - - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - stop_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - stop_intel_arch_watchdog(); - break; - } - stop_p4_watchdog(); - break; - default: - return; - } - } - wd->enabled = 0; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -885,9 +318,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 dummy; - int rc=0; + int rc = 0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -934,55 +365,20 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) } /* see if the nmi watchdog went off */ - if (wd->enabled) { - if (nmi_watchdog == NMI_LOCAL_APIC) { - rdmsrl(wd->perfctr_msr, dummy); - if (dummy & wd->check_bit){ - /* this wasn't a watchdog timer interrupt */ - goto done; - } - - /* only Intel uses the cccr msr */ - if (wd->cccr_msr != 0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, - -((u64)cpu_khz * 1000 / nmi_hz)); - } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { - /* - * ArchPerfom/Core Duo needs to re-unmask - * the apic vector - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* ARCH_PERFMON has 32 bit counter writes */ - wrmsr(wd->perfctr_msr, - (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); - } else { - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, - -((u64)cpu_khz * 1000 / nmi_hz)); - } - rc = 1; - } else if (nmi_watchdog == NMI_IO_APIC) { - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - } else - printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; } -done: return rc; } @@ -1067,12 +463,4 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); -EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 72375e7d32a8..d0a7f53b1497 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -80,4 +80,13 @@ extern int unknown_nmi_panic; void __trigger_all_cpu_backtrace(void); #define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() + +void lapic_watchdog_stop(void); +int lapic_watchdog_init(unsigned nmi_hz); +int lapic_wd_event(unsigned nmi_hz); +unsigned lapic_adjust_nmi_hz(unsigned hz); +int lapic_watchdog_ok(void); +void disable_lapic_nmi_watchdog(void); +void enable_lapic_nmi_watchdog(void); + #endif /* ASM_NMI_H */ From 421f028100cbef0ba15086d63cad87fb6e5c3da9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 204/231] [PATCH] x86-64: Define IGNORE_IOCTL() macro for compat_ioctls Define a new IGNORE_IOCTL() to let a compat ioctl not be warned about even when it is not implemented. This is the same as COMPATIBLE_IOCTL internally, but better self documentng. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs call it on others too. - The ioctl is not implemented in the native kernel, but programs call it commonly anyways. Most other reasons are not valid. Signed-off-by: Andi Kleen --- fs/compat_ioctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c68b055fa26e..c5a4f5459a07 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2396,6 +2396,14 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) #define ULONG_IOCTL(cmd) \ { (cmd), (ioctl_trans_handler_t)sys_ioctl }, +/* ioctl should not be warned about even if it's not implemented. + Valid reasons to use this: + - It is implemented with ->compat_ioctl on some device, but programs + call it on others too. + - The ioctl is not implemented in the native kernel, but programs + call it commonly anyways. + Most other reasons are not valid. */ +#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) struct ioctl_trans ioctl_start[] = { #include From 9d016dd43b8df0228f1022f483f582eeb52d256e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 205/231] [PATCH] x86-64: Shut up 32bit emulation for SIOCGIFCOUNT The kernel doesn't implement it, but some programs like java use it anyways. Shut the code up. Signed-off-by: Andi Kleen --- fs/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c5a4f5459a07..f6f5e0849d9b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2602,6 +2602,8 @@ HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl) HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl) HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl) HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl) +/* Not implemented in the native kernel */ +IGNORE_IOCTL(SIOCGIFCOUNT) HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl) HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl) HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl) From 4637a74cf2ac3a3696d385c8624d84de789d1bbe Mon Sep 17 00:00:00 2001 From: "David P. Reed" Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 206/231] [PATCH] x86-64: Avoid overflows during apic timer calibration - Use 64bit TSC calculations to avoid handling overflow - Use 32bit unsigned arithmetic for the APIC timer. This way overflows are handled correctly. - Fix exit check of loop to account for apic timer counting down Signed-off-by: dpreed@reed.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 943ec4d1bd8e..d198f7d82e5a 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -839,14 +839,15 @@ static void setup_APIC_timer(unsigned int clocks) static int __init calibrate_APIC_clock(void) { - int apic, apic_start, tsc, tsc_start; + unsigned apic, apic_start; + unsigned long tsc, tsc_start; int result; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(4000000000); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -857,13 +858,13 @@ static int __init calibrate_APIC_clock(void) } else #endif { - rdtscl(tsc_start); + rdtscll(tsc_start); do { apic = apic_read(APIC_TMCCT); - rdtscl(tsc); + rdtscll(tsc); } while ((tsc - tsc_start) < TICK_COUNT && - (apic - apic_start) < TICK_COUNT); + (apic_start - apic) < TICK_COUNT); result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); From 72b1b1d0133d7eb4040697f1052bf92123fb051b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 207/231] [PATCH] x86-64: Use symbolic CPU features in early CPUID check Dead to magic numbers! Generated code is the same. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/verify_cpu.S | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S index 72edabd2ef9a..e035f5948199 100644 --- a/arch/x86_64/kernel/verify_cpu.S +++ b/arch/x86_64/kernel/verify_cpu.S @@ -30,18 +30,27 @@ * appropriately. Either display a message or halt. */ -verify_cpu: +#include +verify_cpu: pushfl # Save caller passed flags pushl $0 # Kill any dangerous flags popfl - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define SSE_MASK ((1<<25)|(1<<26)) -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)) -#define REQUIRED_MASK2 (1<<29) + /* minimum CPUID flags for x86-64 as defined by AMD */ +#define M(x) (1<<(x)) +#define M2(a,b) M(a)|M(b) +#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d) + +#define SSE_MASK \ + (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2)) +#define REQUIRED_MASK1 \ + (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\ + M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\ + M(X86_FEATURE_FXSR)) +#define REQUIRED_MASK2 \ + (M(X86_FEATURE_LM - 32)) + pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx From fa0a00910925fdbd3a528404a47817b3a9fda5be Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 208/231] [PATCH] x86-64: Drop -traditional for arch/x86_64/boot Follows i386 and useful cleanup. Signed-off-by: Andi Kleen --- arch/x86_64/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index deb063e7762d..ee6f6505f95f 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -36,7 +36,7 @@ subdir- := compressed/ #Let make clean descend in compressed/ # --------------------------------------------------------------------------- $(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ From 484ad393659f20d784a3a93613fb3fd3d9f171fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 209/231] [PATCH] i386: Drop -traditional in arch/i386/boot Needed for followon patch Signed-off-by: Andi Kleen --- arch/i386/boot/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index e97946626064..bfbc32098a4a 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -36,9 +36,9 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE) # --------------------------------------------------------------------------- $(obj)/zImage: IMAGE_OFFSET := 0x1000 -$(obj)/zImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) +$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) $(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ From c7f81c9453375d6416658995eafd3397cb9bba1d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 210/231] [PATCH] i386: Verify important CPUID bits in real mode Check some CPUID bits that are needed for compiler generated early in boot. When the system is still in real mode before changing the VESA BIOS mode it is possible to still display an visible error message on the screen. Similar to x86-64. Includes cleanups from Eric Biederman Signed-off-by: Andi Kleen --- arch/i386/Kconfig.cpu | 22 +++++++++- arch/i386/boot/setup.S | 17 ++++++++ arch/i386/kernel/verify_cpu.S | 65 ++++++++++++++++++++++++++++ include/asm-i386/cpufeature.h | 3 ++ include/asm-i386/required-features.h | 34 +++++++++++++++ 5 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 arch/i386/kernel/verify_cpu.S create mode 100644 include/asm-i386/required-features.h diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index b1af9f50a143..dce6124cb842 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -240,14 +240,19 @@ config X86_L1_CACHE_SHIFT default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 +config X86_XADD + bool + depends on !M386 + default y + config RWSEM_GENERIC_SPINLOCK bool - depends on M386 + depends on !X86_XADD default y config RWSEM_XCHGADD_ALGORITHM bool - depends on !M386 + depends on X86_XADD default y config ARCH_HAS_ILOG2_U32 @@ -331,3 +336,16 @@ config X86_TSC bool depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ default y + +# this should be set for all -march=.. options where the compiler +# generates cmov. +config X86_CMOV + bool + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) + default y + +config X86_MINIMUM_CPU_MODEL + int + default "4" if X86_XADD || X86_CMPXCHG || X86_BSWAP + default "0" + diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index b74b7f40b79c..f8b3b9cda2b1 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -302,7 +302,24 @@ good_sig: loader_panic_mess: .string "Wrong loader, giving up..." +# check minimum cpuid +# we do this here because it is the last place we can actually +# show a user visible error message. Later the video modus +# might be already messed up. loader_ok: + call verify_cpu + testl %eax,%eax + jz cpu_ok + lea cpu_panic_mess,%si + call prtstr +1: jmp 1b + +cpu_panic_mess: + .asciz "PANIC: CPU too old for this kernel." + +#include "../kernel/verify_cpu.S" + +cpu_ok: # Get memory size (extended mem, kB) xorl %eax, %eax diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S new file mode 100644 index 000000000000..e51a8695d54e --- /dev/null +++ b/arch/i386/kernel/verify_cpu.S @@ -0,0 +1,65 @@ +/* Check if CPU has some minimum CPUID bits + This runs in 16bit mode so that the caller can still use the BIOS + to output errors on the screen */ +#include + +verify_cpu: + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + +#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4 + pushfl + orl $(1<<18),(%esp) # try setting AC + popfl + pushfl + popl %eax + testl $(1<<18),%eax + jz bad +#endif +#if REQUIRED_MASK1 != 0 + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz bad # REQUIRED_MASK1 != 0 requires CPUID + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb bad # no cpuid 1 + + movl $0x1,%eax # Does the cpu have what it takes + cpuid + +#if CONFIG_X86_MINIMUM_CPU_MODEL > 4 +#error add proper model checking here +#endif + + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz bad +#endif /* REQUIRED_MASK1 */ + + popfl + xor %eax,%eax + ret + +bad: + popfl + movl $1,%eax + ret diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index d1b8e4ab6c1a..e66d004aa651 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -7,7 +7,10 @@ #ifndef __ASM_I386_CPUFEATURE_H #define __ASM_I386_CPUFEATURE_H +#ifndef __ASSEMBLY__ #include +#endif +#include #define NCAPINTS 7 /* N 32-bit words worth of info */ diff --git a/include/asm-i386/required-features.h b/include/asm-i386/required-features.h new file mode 100644 index 000000000000..9db866c1e64c --- /dev/null +++ b/include/asm-i386/required-features.h @@ -0,0 +1,34 @@ +#ifndef _ASM_REQUIRED_FEATURES_H +#define _ASM_REQUIRED_FEATURES_H 1 + +/* Define minimum CPUID feature set for kernel These bits are checked + really early to actually display a visible error message before the + kernel dies. Only add word 0 bits here + + Some requirements that are not in CPUID yet are also in the + CONFIG_X86_MINIMUM_CPU mode which is checked too. + + The real information is in arch/i386/Kconfig.cpu, this just converts + the CONFIGs into a bitmask */ + +#ifdef CONFIG_X86_PAE +#define NEED_PAE (1< Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 211/231] [PATCH] i386: Evaluate constant cpu features at runtime Redefine cpu_has() to evaluate cpu features already checked in early boot at compile time. This way the compiler might eliminate some dead code. Signed-off-by: Andi Kleen --- include/asm-i386/cpufeature.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index e66d004aa651..20e849ae6ddc 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -106,8 +106,12 @@ #define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ #define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ -#define cpu_has(c, bit) test_bit(bit, (c)->x86_capability) -#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability) +#define cpu_has(c, bit) \ + ((__builtin_constant_p(bit) && (bit) < 32 && \ + (1UL << (bit)) & REQUIRED_MASK1) ? \ + 1 : \ + test_bit(bit, (c)->x86_capability)) +#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) #define cpu_has_vme boot_cpu_has(X86_FEATURE_VME) From e859dc553c857f4672b3bbb73ee9170a901f8712 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 212/231] [PATCH] i386: Implement alternative_io for i386 Ported from x86-64. Signed-off-by: Andi Kleen --- include/asm-i386/alternative.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index 277467329583..0f70b379b029 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -82,6 +82,21 @@ static inline void alternatives_smp_switch(int smp) {} "663:\n\t" newinstr "\n664:\n" /* replacement */\ ".previous" :: "i" (feature), ##input) +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, newinstr, feature, output, input...) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + " .align 4\n" \ + " .long 661b\n" /* label */ \ + " .long 663f\n" /* new instruction */ \ + " .byte %c[feat]\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" : output : [feat] "i" (feature), ##input) + /* * Alternative inline assembly for SMP. * From 3aefbe0746580a710d4392a884ac1e4aac7c728f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 213/231] [PATCH] i386: Implement X86_FEATURE_SYNC_RDTSC on i386 Syncs up with x86-64. Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel.c | 4 +++- include/asm-i386/cpufeature.h | 1 + include/asm-i386/tsc.h | 4 ---- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 56fe26584957..dc4e08147b1f 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif - if (c->x86 == 15) + if (c->x86 == 15) { set_bit(X86_FEATURE_P4, c->x86_capability); + set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability); + } if (c->x86 == 6) set_bit(X86_FEATURE_P3, c->x86_capability); if ((c->x86 == 0xf && c->x86_model >= 0x03) || diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index 20e849ae6ddc..b8a3a5a85fd3 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -79,6 +79,7 @@ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ #define X86_FEATURE_LAPIC_TIMER_BROKEN (3*32+ 14) /* lapic timer broken in C1 */ +#define X86_FEATURE_SYNC_RDTSC (3*32+15) /* RDTSC synchronizes the CPU */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index 346976632e15..0181f9df7539 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -35,7 +35,6 @@ static inline cycles_t get_cycles(void) static __always_inline cycles_t get_cycles_sync(void) { unsigned long long ret; -#ifdef X86_FEATURE_SYNC_RDTSC unsigned eax; /* @@ -44,9 +43,6 @@ static __always_inline cycles_t get_cycles_sync(void) */ alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC, "=a" (eax), "0" (1) : "ebx","ecx","edx","memory"); -#else - sync_core(); -#endif rdtscll(ret); return ret; From 9bccb23dc5fc2d268ab676e2d4212d29e230fd86 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH 214/231] [PATCH] i386: Add X86_FEATURE_RDTSCP Following x86-64 Signed-off-by: Andi Kleen --- include/asm-i386/cpufeature.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index b8a3a5a85fd3..f514e906643a 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -52,6 +52,7 @@ #define X86_FEATURE_MP (1*32+19) /* MP Capable. */ #define X86_FEATURE_NX (1*32+20) /* Execute Disable */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ #define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ #define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ #define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */ From c5bcb5635a03da3158f121ae20ccbbf72b4fc62a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 215/231] [PATCH] x86: Use RDTSCP for synchronous get_cycles if possible RDTSCP is already synchronous and doesn't need an explicit CPUID. This is a little faster and more importantly avoids VMEXITs on Hypervisors. Original patch from Joerg Roedel, but reworked by AK Also includes miscompilation fix by Eric Biederman Cc: "Joerg Roedel" Signed-off-by: Andi Kleen --- include/asm-i386/tsc.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index 0181f9df7539..3f3c1fa000b4 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -37,6 +37,15 @@ static __always_inline cycles_t get_cycles_sync(void) unsigned long long ret; unsigned eax; + /* + * Use RDTSCP if possible; it is guaranteed to be synchronous + * and doesn't cause a VMEXIT on Hypervisors + */ + alternative_io(ASM_NOP3, ".byte 0x0f,0x01,0xf9", X86_FEATURE_RDTSCP, + "=A" (ret), "0" (0ULL) : "ecx", "memory"); + if (ret) + return ret; + /* * Don't do an additional sync on CPUs where we know * RDTSC is already synchronous: From 3bea9c9793a17053e05d970e5d90d48fc9fce07d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 216/231] [PATCH] x86-64: Don't enable NUMA for a single node in K8 NUMA scanning This was supposed to see the full memory on a ASUS A8SX motherboard with 4GB RAM where the northbridge reports less memory, but it didn't help there. But it's a reasonable change so let's include it anyways. Signed-off-by: Andi Kleen --- arch/x86_64/mm/k8topology.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index 60e860e5ef4c..f983c75825d0 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -62,6 +62,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -1; printk(KERN_INFO "Number of nodes %d\n", numnodes); From ec1180db2ca2ec8692ce101560eff9eedf561781 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 217/231] [PATCH] i386: Little cleanups in smpboot.c - Remove #if that is always set - Fix warning Signed-off-by: Andi Kleen --- arch/i386/kernel/smpboot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index a768eceeac37..a4b7ad283f49 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -516,7 +516,6 @@ static void unmap_cpu_to_logical_apicid(int cpu) unmap_cpu_to_node(cpu); } -#if APIC_DEBUG static inline void __inquire_remote_apic(int apicid) { int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; @@ -548,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk("%lx\n", status); break; default: printk("failed\n"); } } } -#endif #ifdef WAKE_SECONDARY_VIA_NMI /* From c812d6c198c5f39a6ea22a9a12d518cd3714af16 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 218/231] [PATCH] i386: Remove copy_*_user BUG_ONs for (size < 0) access_ok checks this case anyways, no need to check twice. Signed-off-by: Andi Kleen --- arch/i386/lib/usercopy.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c index 086b3726862a..9f38b12b4af1 100644 --- a/arch/i386/lib/usercopy.c +++ b/arch/i386/lib/usercopy.c @@ -716,7 +716,6 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { - BUG_ON((long) n < 0); #ifndef CONFIG_X86_WP_WORKS_OK if (unlikely(boot_cpu_data.wp_works_ok == 0) && ((unsigned long )to) < TASK_SIZE) { @@ -785,7 +784,6 @@ EXPORT_SYMBOL(__copy_to_user_ll); unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); if (movsl_is_ok(to, from, n)) __copy_user_zeroing(to, from, n); else @@ -797,7 +795,6 @@ EXPORT_SYMBOL(__copy_from_user_ll); unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else @@ -810,7 +807,6 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero); unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); #ifdef CONFIG_X86_INTEL_USERCOPY if ( n > 64 && cpu_has_xmm2) n = __copy_user_zeroing_intel_nocache(to, from, n); @@ -825,7 +821,6 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { - BUG_ON((long)n < 0); #ifdef CONFIG_X86_INTEL_USERCOPY if ( n > 64 && cpu_has_xmm2) n = __copy_user_intel_nocache(to, from, n); @@ -853,7 +848,6 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) { - BUG_ON((long) n < 0); if (access_ok(VERIFY_WRITE, to, n)) n = __copy_to_user(to, from, n); return n; @@ -879,7 +873,6 @@ EXPORT_SYMBOL(copy_to_user); unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) { - BUG_ON((long) n < 0); if (access_ok(VERIFY_READ, from, n)) n = __copy_from_user(to, from, n); else From a106009bdfa12b6452b724cc0718ca8e1334745d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 219/231] [PATCH] x86-64: Print type and size correctly for unknown compat ioctls Signed-off-by: Andi Kleen --- fs/compat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index 040a8be38a48..72e5e6923828 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -371,13 +371,14 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, fn = "?"; } - sprintf(buf,"'%c'", (cmd>>24) & 0x3f); + sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); if (!isprint(buf[1])) sprintf(buf, "%02x", buf[1]); compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " - "cmd(%08x){%s} arg(%08x) on %s\n", + "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, (unsigned int)arg, fn); if (path) From 2136220d00d84e5dd923f23552f75b1864a76f21 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 220/231] [PATCH] x86-64: Remove CONFIG_REORDER The option never worked well and functionlist wasn't well maintained. Also it made the build very slow on many binutils version. So just remove it. Cc: arjan@linux.intel.com Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 8 - arch/x86_64/Makefile | 1 - arch/x86_64/kernel/functionlist | 1284 ------------------------------ arch/x86_64/kernel/vmlinux.lds.S | 3 - 4 files changed, 1296 deletions(-) delete mode 100644 arch/x86_64/kernel/functionlist diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 715632026073..1cad418373c3 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -660,14 +660,6 @@ config CC_STACKPROTECTOR_ALL source kernel/Kconfig.hz -config REORDER - bool "Function reordering" - default n - help - This option enables the toolchain to reorder functions for a more - optimal TLB usage. If you have pretty much any version of binutils, - this can increase your kernel build time by roughly one minute. - config K8_NB def_bool y depends on AGP_AMD64 || IOMMU || (PCI && NUMA) diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 803cfcc9b346..29617ae3926d 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -40,7 +40,6 @@ cflags-y += -m64 cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe -cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections cflags-y += -Wno-sign-compare cflags-y += -fno-asynchronous-unwind-tables ifneq ($(CONFIG_DEBUG_INFO),y) diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist deleted file mode 100644 index 7ae18ec12454..000000000000 --- a/arch/x86_64/kernel/functionlist +++ /dev/null @@ -1,1284 +0,0 @@ -*(.text.flush_thread) -*(.text.check_poison_obj) -*(.text.copy_page) -*(.text.__set_personality) -*(.text.gart_map_sg) -*(.text.kmem_cache_free) -*(.text.find_get_page) -*(.text._raw_spin_lock) -*(.text.ide_outb) -*(.text.unmap_vmas) -*(.text.copy_page_range) -*(.text.kprobe_handler) -*(.text.__handle_mm_fault) -*(.text.__d_lookup) -*(.text.copy_user_generic) -*(.text.__link_path_walk) -*(.text.get_page_from_freelist) -*(.text.kmem_cache_alloc) -*(.text.drive_cmd_intr) -*(.text.ia32_setup_sigcontext) -*(.text.huge_pte_offset) -*(.text.do_page_fault) -*(.text.page_remove_rmap) -*(.text.release_pages) -*(.text.ide_end_request) -*(.text.__mutex_lock_slowpath) -*(.text.__find_get_block) -*(.text.kfree) -*(.text.vfs_read) -*(.text._raw_spin_unlock) -*(.text.free_hot_cold_page) -*(.text.fget_light) -*(.text.schedule) -*(.text.memcmp) -*(.text.touch_atime) -*(.text.__might_sleep) -*(.text.__down_read_trylock) -*(.text.arch_pick_mmap_layout) -*(.text.find_vma) -*(.text.__make_request) -*(.text.do_generic_mapping_read) -*(.text.mutex_lock_interruptible) -*(.text.__generic_file_aio_read) -*(.text._atomic_dec_and_lock) -*(.text.__wake_up_bit) -*(.text.add_to_page_cache) -*(.text.cache_alloc_debugcheck_after) -*(.text.vm_normal_page) -*(.text.mutex_debug_check_no_locks_freed) -*(.text.net_rx_action) -*(.text.__find_first_zero_bit) -*(.text.put_page) -*(.text._raw_read_lock) -*(.text.__delay) -*(.text.dnotify_parent) -*(.text.do_path_lookup) -*(.text.do_sync_read) -*(.text.do_lookup) -*(.text.bit_waitqueue) -*(.text.file_read_actor) -*(.text.strncpy_from_user) -*(.text.__pagevec_lru_add_active) -*(.text.fget) -*(.text.dput) -*(.text.__strnlen_user) -*(.text.inotify_inode_queue_event) -*(.text.rw_verify_area) -*(.text.ide_intr) -*(.text.inotify_dentry_parent_queue_event) -*(.text.permission) -*(.text.memscan) -*(.text.hpet_rtc_interrupt) -*(.text.do_mmap_pgoff) -*(.text.current_fs_time) -*(.text.vfs_getattr) -*(.text.kmem_flagcheck) -*(.text.mark_page_accessed) -*(.text.free_pages_and_swap_cache) -*(.text.generic_fillattr) -*(.text.__block_prepare_write) -*(.text.__set_page_dirty_nobuffers) -*(.text.link_path_walk) -*(.text.find_get_pages_tag) -*(.text.ide_do_request) -*(.text.__alloc_pages) -*(.text.generic_permission) -*(.text.mod_page_state_offset) -*(.text.free_pgd_range) -*(.text.generic_file_buffered_write) -*(.text.number) -*(.text.ide_do_rw_disk) -*(.text.__brelse) -*(.text.__mod_page_state_offset) -*(.text.rotate_reclaimable_page) -*(.text.find_vma_prepare) -*(.text.find_vma_prev) -*(.text.lru_cache_add_active) -*(.text.__kmalloc_track_caller) -*(.text.smp_invalidate_interrupt) -*(.text.handle_IRQ_event) -*(.text.__find_get_block_slow) -*(.text.do_wp_page) -*(.text.do_select) -*(.text.set_user_nice) -*(.text.sys_read) -*(.text.do_munmap) -*(.text.csum_partial) -*(.text.__do_softirq) -*(.text.may_open) -*(.text.getname) -*(.text.get_empty_filp) -*(.text.__fput) -*(.text.remove_mapping) -*(.text.filp_ctor) -*(.text.poison_obj) -*(.text.unmap_region) -*(.text.test_set_page_writeback) -*(.text.__do_page_cache_readahead) -*(.text.sock_def_readable) -*(.text.ide_outl) -*(.text.shrink_zone) -*(.text.rb_insert_color) -*(.text.get_request) -*(.text.sys_pread64) -*(.text.spin_bug) -*(.text.ide_outsl) -*(.text.mask_and_ack_8259A) -*(.text.filemap_nopage) -*(.text.page_add_file_rmap) -*(.text.find_lock_page) -*(.text.tcp_poll) -*(.text.__mark_inode_dirty) -*(.text.file_ra_state_init) -*(.text.generic_file_llseek) -*(.text.__pagevec_lru_add) -*(.text.page_cache_readahead) -*(.text.n_tty_receive_buf) -*(.text.zonelist_policy) -*(.text.vma_adjust) -*(.text.test_clear_page_dirty) -*(.text.sync_buffer) -*(.text.do_exit) -*(.text.__bitmap_weight) -*(.text.alloc_pages_current) -*(.text.get_unused_fd) -*(.text.zone_watermark_ok) -*(.text.cpuset_update_task_memory_state) -*(.text.__bitmap_empty) -*(.text.sys_munmap) -*(.text.__inode_dir_notify) -*(.text.__generic_file_aio_write_nolock) -*(.text.__pte_alloc) -*(.text.sys_select) -*(.text.vm_acct_memory) -*(.text.vfs_write) -*(.text.__lru_add_drain) -*(.text.prio_tree_insert) -*(.text.generic_file_aio_read) -*(.text.vma_merge) -*(.text.block_write_full_page) -*(.text.__page_set_anon_rmap) -*(.text.apic_timer_interrupt) -*(.text.release_console_sem) -*(.text.sys_write) -*(.text.sys_brk) -*(.text.dup_mm) -*(.text.read_current_timer) -*(.text.ll_rw_block) -*(.text.blk_rq_map_sg) -*(.text.dbg_userword) -*(.text.__block_commit_write) -*(.text.cache_grow) -*(.text.copy_strings) -*(.text.release_task) -*(.text.do_sync_write) -*(.text.unlock_page) -*(.text.load_elf_binary) -*(.text.__follow_mount) -*(.text.__getblk) -*(.text.do_sys_open) -*(.text.current_kernel_time) -*(.text.call_rcu) -*(.text.write_chan) -*(.text.vsnprintf) -*(.text.dummy_inode_setsecurity) -*(.text.submit_bh) -*(.text.poll_freewait) -*(.text.bio_alloc_bioset) -*(.text.skb_clone) -*(.text.page_waitqueue) -*(.text.__mutex_lock_interruptible_slowpath) -*(.text.get_index) -*(.text.csum_partial_copy_generic) -*(.text.bad_range) -*(.text.remove_vma) -*(.text.cp_new_stat) -*(.text.alloc_arraycache) -*(.text.test_clear_page_writeback) -*(.text.strsep) -*(.text.open_namei) -*(.text._raw_read_unlock) -*(.text.get_vma_policy) -*(.text.__down_write_trylock) -*(.text.find_get_pages) -*(.text.tcp_rcv_established) -*(.text.generic_make_request) -*(.text.__block_write_full_page) -*(.text.cfq_set_request) -*(.text.sys_inotify_init) -*(.text.split_vma) -*(.text.__mod_timer) -*(.text.get_options) -*(.text.vma_link) -*(.text.mpage_writepages) -*(.text.truncate_complete_page) -*(.text.tcp_recvmsg) -*(.text.sigprocmask) -*(.text.filemap_populate) -*(.text.sys_close) -*(.text.inotify_dev_queue_event) -*(.text.do_task_stat) -*(.text.__dentry_open) -*(.text.unlink_file_vma) -*(.text.__pollwait) -*(.text.packet_rcv_spkt) -*(.text.drop_buffers) -*(.text.free_pgtables) -*(.text.generic_file_direct_write) -*(.text.copy_process) -*(.text.netif_receive_skb) -*(.text.dnotify_flush) -*(.text.print_bad_pte) -*(.text.anon_vma_unlink) -*(.text.sys_mprotect) -*(.text.sync_sb_inodes) -*(.text.find_inode_fast) -*(.text.dummy_inode_readlink) -*(.text.putname) -*(.text.init_smp_flush) -*(.text.dbg_redzone2) -*(.text.sk_run_filter) -*(.text.may_expand_vm) -*(.text.generic_file_aio_write) -*(.text.find_next_zero_bit) -*(.text.file_kill) -*(.text.audit_getname) -*(.text.arch_unmap_area_topdown) -*(.text.alloc_page_vma) -*(.text.tcp_transmit_skb) -*(.text.rb_next) -*(.text.dbg_redzone1) -*(.text.generic_file_mmap) -*(.text.vfs_fstat) -*(.text.sys_time) -*(.text.page_lock_anon_vma) -*(.text.get_unmapped_area) -*(.text.remote_llseek) -*(.text.__up_read) -*(.text.fd_install) -*(.text.eventpoll_init_file) -*(.text.dma_alloc_coherent) -*(.text.create_empty_buffers) -*(.text.__mutex_unlock_slowpath) -*(.text.dup_fd) -*(.text.d_alloc) -*(.text.tty_ldisc_try) -*(.text.sys_stime) -*(.text.__rb_rotate_right) -*(.text.d_validate) -*(.text.rb_erase) -*(.text.path_release) -*(.text.memmove) -*(.text.invalidate_complete_page) -*(.text.clear_inode) -*(.text.cache_estimate) -*(.text.alloc_buffer_head) -*(.text.smp_call_function_interrupt) -*(.text.flush_tlb_others) -*(.text.file_move) -*(.text.balance_dirty_pages_ratelimited) -*(.text.vma_prio_tree_add) -*(.text.timespec_trunc) -*(.text.mempool_alloc) -*(.text.iget_locked) -*(.text.d_alloc_root) -*(.text.cpuset_populate_dir) -*(.text.anon_vma_prepare) -*(.text.sys_newstat) -*(.text.alloc_page_interleave) -*(.text.__path_lookup_intent_open) -*(.text.__pagevec_free) -*(.text.inode_init_once) -*(.text.free_vfsmnt) -*(.text.__user_walk_fd) -*(.text.cfq_idle_slice_timer) -*(.text.sys_mmap) -*(.text.sys_llseek) -*(.text.prio_tree_remove) -*(.text.filp_close) -*(.text.file_permission) -*(.text.vma_prio_tree_remove) -*(.text.tcp_ack) -*(.text.nameidata_to_filp) -*(.text.sys_lseek) -*(.text.percpu_counter_mod) -*(.text.igrab) -*(.text.__bread) -*(.text.alloc_inode) -*(.text.filldir) -*(.text.__rb_rotate_left) -*(.text.irq_affinity_write_proc) -*(.text.init_request_from_bio) -*(.text.find_or_create_page) -*(.text.tty_poll) -*(.text.tcp_sendmsg) -*(.text.ide_wait_stat) -*(.text.free_buffer_head) -*(.text.flush_signal_handlers) -*(.text.tcp_v4_rcv) -*(.text.nr_blockdev_pages) -*(.text.locks_remove_flock) -*(.text.__iowrite32_copy) -*(.text.do_filp_open) -*(.text.try_to_release_page) -*(.text.page_add_new_anon_rmap) -*(.text.kmem_cache_size) -*(.text.eth_type_trans) -*(.text.try_to_free_buffers) -*(.text.schedule_tail) -*(.text.proc_lookup) -*(.text.no_llseek) -*(.text.kfree_skbmem) -*(.text.do_wait) -*(.text.do_mpage_readpage) -*(.text.vfs_stat_fd) -*(.text.tty_write) -*(.text.705) -*(.text.sync_page) -*(.text.__remove_shared_vm_struct) -*(.text.__kfree_skb) -*(.text.sock_poll) -*(.text.get_request_wait) -*(.text.do_sigaction) -*(.text.do_brk) -*(.text.tcp_event_data_recv) -*(.text.read_chan) -*(.text.pipe_writev) -*(.text.__emul_lookup_dentry) -*(.text.rtc_get_rtc_time) -*(.text.print_objinfo) -*(.text.file_update_time) -*(.text.do_signal) -*(.text.disable_8259A_irq) -*(.text.blk_queue_bounce) -*(.text.__anon_vma_link) -*(.text.__vma_link) -*(.text.vfs_rename) -*(.text.sys_newlstat) -*(.text.sys_newfstat) -*(.text.sys_mknod) -*(.text.__show_regs) -*(.text.iput) -*(.text.get_signal_to_deliver) -*(.text.flush_tlb_page) -*(.text.debug_mutex_wake_waiter) -*(.text.copy_thread) -*(.text.clear_page_dirty_for_io) -*(.text.buffer_io_error) -*(.text.vfs_permission) -*(.text.truncate_inode_pages_range) -*(.text.sys_recvfrom) -*(.text.remove_suid) -*(.text.mark_buffer_dirty) -*(.text.local_bh_enable) -*(.text.get_zeroed_page) -*(.text.get_vmalloc_info) -*(.text.flush_old_exec) -*(.text.dummy_inode_permission) -*(.text.__bio_add_page) -*(.text.prio_tree_replace) -*(.text.notify_change) -*(.text.mntput_no_expire) -*(.text.fput) -*(.text.__end_that_request_first) -*(.text.wake_up_bit) -*(.text.unuse_mm) -*(.text.shrink_icache_memory) -*(.text.sched_balance_self) -*(.text.__pmd_alloc) -*(.text.pipe_poll) -*(.text.normal_poll) -*(.text.__free_pages) -*(.text.follow_mount) -*(.text.cdrom_start_packet_command) -*(.text.blk_recount_segments) -*(.text.bio_put) -*(.text.__alloc_skb) -*(.text.__wake_up) -*(.text.vm_stat_account) -*(.text.sys_fcntl) -*(.text.sys_fadvise64) -*(.text._raw_write_unlock) -*(.text.__pud_alloc) -*(.text.alloc_page_buffers) -*(.text.vfs_llseek) -*(.text.sockfd_lookup) -*(.text._raw_write_lock) -*(.text.put_compound_page) -*(.text.prune_dcache) -*(.text.pipe_readv) -*(.text.mempool_free) -*(.text.make_ahead_window) -*(.text.lru_add_drain) -*(.text.constant_test_bit) -*(.text.__clear_user) -*(.text.arch_unmap_area) -*(.text.anon_vma_link) -*(.text.sys_chroot) -*(.text.setup_arg_pages) -*(.text.radix_tree_preload) -*(.text.init_rwsem) -*(.text.generic_osync_inode) -*(.text.generic_delete_inode) -*(.text.do_sys_poll) -*(.text.dev_queue_xmit) -*(.text.default_llseek) -*(.text.__writeback_single_inode) -*(.text.vfs_ioctl) -*(.text.__up_write) -*(.text.unix_poll) -*(.text.sys_rt_sigprocmask) -*(.text.sock_recvmsg) -*(.text.recalc_bh_state) -*(.text.__put_unused_fd) -*(.text.process_backlog) -*(.text.locks_remove_posix) -*(.text.lease_modify) -*(.text.expand_files) -*(.text.end_buffer_read_nobh) -*(.text.d_splice_alias) -*(.text.debug_mutex_init_waiter) -*(.text.copy_from_user) -*(.text.cap_vm_enough_memory) -*(.text.show_vfsmnt) -*(.text.release_sock) -*(.text.pfifo_fast_enqueue) -*(.text.half_md4_transform) -*(.text.fs_may_remount_ro) -*(.text.do_fork) -*(.text.copy_hugetlb_page_range) -*(.text.cache_free_debugcheck) -*(.text.__tcp_select_window) -*(.text.task_handoff_register) -*(.text.sys_open) -*(.text.strlcpy) -*(.text.skb_copy_datagram_iovec) -*(.text.set_up_list3s) -*(.text.release_open_intent) -*(.text.qdisc_restart) -*(.text.n_tty_chars_in_buffer) -*(.text.inode_change_ok) -*(.text.__downgrade_write) -*(.text.debug_mutex_unlock) -*(.text.add_timer_randomness) -*(.text.sock_common_recvmsg) -*(.text.set_bh_page) -*(.text.printk_lock) -*(.text.path_release_on_umount) -*(.text.ip_output) -*(.text.ide_build_dmatable) -*(.text.__get_user_8) -*(.text.end_buffer_read_sync) -*(.text.__d_path) -*(.text.d_move) -*(.text.del_timer) -*(.text.constant_test_bit) -*(.text.blockable_page_cache_readahead) -*(.text.tty_read) -*(.text.sys_readlink) -*(.text.sys_faccessat) -*(.text.read_swap_cache_async) -*(.text.pty_write_room) -*(.text.page_address_in_vma) -*(.text.kthread) -*(.text.cfq_exit_io_context) -*(.text.__tcp_push_pending_frames) -*(.text.sys_pipe) -*(.text.submit_bio) -*(.text.pid_revalidate) -*(.text.page_referenced_file) -*(.text.lock_sock) -*(.text.get_page_state_node) -*(.text.generic_block_bmap) -*(.text.do_setitimer) -*(.text.dev_queue_xmit_nit) -*(.text.copy_from_read_buf) -*(.text.__const_udelay) -*(.text.console_conditional_schedule) -*(.text.wake_up_new_task) -*(.text.wait_for_completion_interruptible) -*(.text.tcp_rcv_rtt_update) -*(.text.sys_mlockall) -*(.text.set_fs_altroot) -*(.text.schedule_timeout) -*(.text.nr_free_pagecache_pages) -*(.text.nf_iterate) -*(.text.mapping_tagged) -*(.text.ip_queue_xmit) -*(.text.ip_local_deliver) -*(.text.follow_page) -*(.text.elf_map) -*(.text.dummy_file_permission) -*(.text.dispose_list) -*(.text.dentry_open) -*(.text.dentry_iput) -*(.text.bio_alloc) -*(.text.wait_on_page_bit) -*(.text.vfs_readdir) -*(.text.vfs_lstat) -*(.text.seq_escape) -*(.text.__posix_lock_file) -*(.text.mm_release) -*(.text.kref_put) -*(.text.ip_rcv) -*(.text.__iget) -*(.text.free_pages) -*(.text.find_mergeable_anon_vma) -*(.text.find_extend_vma) -*(.text.dummy_inode_listsecurity) -*(.text.bio_add_page) -*(.text.__vm_enough_memory) -*(.text.vfs_stat) -*(.text.tty_paranoia_check) -*(.text.tcp_read_sock) -*(.text.tcp_data_queue) -*(.text.sys_uname) -*(.text.sys_renameat) -*(.text.__strncpy_from_user) -*(.text.__mutex_init) -*(.text.__lookup_hash) -*(.text.kref_get) -*(.text.ip_route_input) -*(.text.__insert_inode_hash) -*(.text.do_sock_write) -*(.text.blk_done_softirq) -*(.text.__wake_up_sync) -*(.text.__vma_link_rb) -*(.text.tty_ioctl) -*(.text.tracesys) -*(.text.sys_getdents) -*(.text.sys_dup) -*(.text.stub_execve) -*(.text.sha_transform) -*(.text.radix_tree_tag_clear) -*(.text.put_unused_fd) -*(.text.put_files_struct) -*(.text.mpage_readpages) -*(.text.may_delete) -*(.text.kmem_cache_create) -*(.text.ip_mc_output) -*(.text.interleave_nodes) -*(.text.groups_search) -*(.text.generic_drop_inode) -*(.text.generic_commit_write) -*(.text.fcntl_setlk) -*(.text.exit_mmap) -*(.text.end_page_writeback) -*(.text.__d_rehash) -*(.text.debug_mutex_free_waiter) -*(.text.csum_ipv6_magic) -*(.text.count) -*(.text.cleanup_rbuf) -*(.text.check_spinlock_acquired_node) -*(.text.can_vma_merge_after) -*(.text.bio_endio) -*(.text.alloc_pidmap) -*(.text.write_ldt) -*(.text.vmtruncate_range) -*(.text.vfs_create) -*(.text.__user_walk) -*(.text.update_send_head) -*(.text.unmap_underlying_metadata) -*(.text.tty_ldisc_deref) -*(.text.tcp_setsockopt) -*(.text.tcp_send_ack) -*(.text.sys_pause) -*(.text.sys_gettimeofday) -*(.text.sync_dirty_buffer) -*(.text.strncmp) -*(.text.release_posix_timer) -*(.text.proc_file_read) -*(.text.prepare_to_wait) -*(.text.locks_mandatory_locked) -*(.text.interruptible_sleep_on_timeout) -*(.text.inode_sub_bytes) -*(.text.in_group_p) -*(.text.hrtimer_try_to_cancel) -*(.text.filldir64) -*(.text.fasync_helper) -*(.text.dummy_sb_pivotroot) -*(.text.d_lookup) -*(.text.d_instantiate) -*(.text.__d_find_alias) -*(.text.cpu_idle_wait) -*(.text.cond_resched_lock) -*(.text.chown_common) -*(.text.blk_congestion_wait) -*(.text.activate_page) -*(.text.unlock_buffer) -*(.text.tty_wakeup) -*(.text.tcp_v4_do_rcv) -*(.text.tcp_current_mss) -*(.text.sys_openat) -*(.text.sys_fchdir) -*(.text.strnlen_user) -*(.text.strnlen) -*(.text.strchr) -*(.text.sock_common_getsockopt) -*(.text.skb_checksum) -*(.text.remove_wait_queue) -*(.text.rb_replace_node) -*(.text.radix_tree_node_ctor) -*(.text.pty_chars_in_buffer) -*(.text.profile_hit) -*(.text.prio_tree_left) -*(.text.pgd_clear_bad) -*(.text.pfifo_fast_dequeue) -*(.text.page_referenced) -*(.text.open_exec) -*(.text.mmput) -*(.text.mm_init) -*(.text.__ide_dma_off_quietly) -*(.text.ide_dma_intr) -*(.text.hrtimer_start) -*(.text.get_io_context) -*(.text.__get_free_pages) -*(.text.find_first_zero_bit) -*(.text.file_free_rcu) -*(.text.dummy_socket_sendmsg) -*(.text.do_unlinkat) -*(.text.do_arch_prctl) -*(.text.destroy_inode) -*(.text.can_vma_merge_before) -*(.text.block_sync_page) -*(.text.block_prepare_write) -*(.text.bio_init) -*(.text.arch_ptrace) -*(.text.wake_up_inode) -*(.text.wait_on_retry_sync_kiocb) -*(.text.vma_prio_tree_next) -*(.text.tcp_rcv_space_adjust) -*(.text.__tcp_ack_snd_check) -*(.text.sys_utime) -*(.text.sys_recvmsg) -*(.text.sys_mremap) -*(.text.sys_bdflush) -*(.text.sleep_on) -*(.text.set_page_dirty_lock) -*(.text.seq_path) -*(.text.schedule_timeout_interruptible) -*(.text.sched_fork) -*(.text.rt_run_flush) -*(.text.profile_munmap) -*(.text.prepare_binprm) -*(.text.__pagevec_release_nonlru) -*(.text.m_show) -*(.text.lookup_mnt) -*(.text.__lookup_mnt) -*(.text.lock_timer_base) -*(.text.is_subdir) -*(.text.invalidate_bh_lru) -*(.text.init_buffer_head) -*(.text.ifind_fast) -*(.text.ide_dma_start) -*(.text.__get_page_state) -*(.text.flock_to_posix_lock) -*(.text.__find_symbol) -*(.text.do_futex) -*(.text.do_execve) -*(.text.dirty_writeback_centisecs_handler) -*(.text.dev_watchdog) -*(.text.can_share_swap_page) -*(.text.blkdev_put) -*(.text.bio_get_nr_vecs) -*(.text.xfrm_compile_policy) -*(.text.vma_prio_tree_insert) -*(.text.vfs_lstat_fd) -*(.text.__user_path_lookup_open) -*(.text.thread_return) -*(.text.tcp_send_delayed_ack) -*(.text.sock_def_error_report) -*(.text.shrink_slab) -*(.text.serial_out) -*(.text.seq_read) -*(.text.secure_ip_id) -*(.text.search_binary_handler) -*(.text.proc_pid_unhash) -*(.text.pagevec_lookup) -*(.text.new_inode) -*(.text.memcpy_toiovec) -*(.text.locks_free_lock) -*(.text.__lock_page) -*(.text.__lock_buffer) -*(.text.load_module) -*(.text.is_bad_inode) -*(.text.invalidate_inode_buffers) -*(.text.insert_vm_struct) -*(.text.inode_setattr) -*(.text.inode_add_bytes) -*(.text.ide_read_24) -*(.text.ide_get_error_location) -*(.text.ide_do_drive_cmd) -*(.text.get_locked_pte) -*(.text.get_filesystem_list) -*(.text.generic_file_open) -*(.text.follow_down) -*(.text.find_next_bit) -*(.text.__find_first_bit) -*(.text.exit_mm) -*(.text.exec_keys) -*(.text.end_buffer_write_sync) -*(.text.end_bio_bh_io_sync) -*(.text.dummy_socket_shutdown) -*(.text.d_rehash) -*(.text.d_path) -*(.text.do_ioctl) -*(.text.dget_locked) -*(.text.copy_thread_group_keys) -*(.text.cdrom_end_request) -*(.text.cap_bprm_apply_creds) -*(.text.blk_rq_bio_prep) -*(.text.__bitmap_intersects) -*(.text.bio_phys_segments) -*(.text.bio_free) -*(.text.arch_get_unmapped_area_topdown) -*(.text.writeback_in_progress) -*(.text.vfs_follow_link) -*(.text.tcp_rcv_state_process) -*(.text.tcp_check_space) -*(.text.sys_stat) -*(.text.sys_rt_sigreturn) -*(.text.sys_rt_sigaction) -*(.text.sys_remap_file_pages) -*(.text.sys_pwrite64) -*(.text.sys_fchownat) -*(.text.sys_fchmodat) -*(.text.strncat) -*(.text.strlcat) -*(.text.strcmp) -*(.text.steal_locks) -*(.text.sock_create) -*(.text.sk_stream_rfree) -*(.text.sk_stream_mem_schedule) -*(.text.skip_atoi) -*(.text.sk_alloc) -*(.text.show_stat) -*(.text.set_fs_pwd) -*(.text.set_binfmt) -*(.text.pty_unthrottle) -*(.text.proc_symlink) -*(.text.pipe_release) -*(.text.pageout) -*(.text.n_tty_write_wakeup) -*(.text.n_tty_ioctl) -*(.text.nr_free_zone_pages) -*(.text.migration_thread) -*(.text.mempool_free_slab) -*(.text.meminfo_read_proc) -*(.text.max_sane_readahead) -*(.text.lru_cache_add) -*(.text.kill_fasync) -*(.text.kernel_read) -*(.text.invalidate_mapping_pages) -*(.text.inode_has_buffers) -*(.text.init_once) -*(.text.inet_sendmsg) -*(.text.idedisk_issue_flush) -*(.text.generic_file_write) -*(.text.free_more_memory) -*(.text.__free_fdtable) -*(.text.filp_dtor) -*(.text.exit_sem) -*(.text.exit_itimers) -*(.text.error_interrupt) -*(.text.end_buffer_async_write) -*(.text.eligible_child) -*(.text.elf_map) -*(.text.dump_task_regs) -*(.text.dummy_task_setscheduler) -*(.text.dummy_socket_accept) -*(.text.dummy_file_free_security) -*(.text.__down_read) -*(.text.do_sock_read) -*(.text.do_sigaltstack) -*(.text.do_mremap) -*(.text.current_io_context) -*(.text.cpu_swap_callback) -*(.text.copy_vma) -*(.text.cap_bprm_set_security) -*(.text.blk_insert_request) -*(.text.bio_map_kern_endio) -*(.text.bio_hw_segments) -*(.text.bictcp_cong_avoid) -*(.text.add_interrupt_randomness) -*(.text.wait_for_completion) -*(.text.version_read_proc) -*(.text.unix_write_space) -*(.text.tty_ldisc_ref_wait) -*(.text.tty_ldisc_put) -*(.text.try_to_wake_up) -*(.text.tcp_v4_tw_remember_stamp) -*(.text.tcp_try_undo_dsack) -*(.text.tcp_may_send_now) -*(.text.sys_waitid) -*(.text.sys_sched_getparam) -*(.text.sys_getppid) -*(.text.sys_getcwd) -*(.text.sys_dup2) -*(.text.sys_chmod) -*(.text.sys_chdir) -*(.text.sprintf) -*(.text.sock_wfree) -*(.text.sock_aio_write) -*(.text.skb_drop_fraglist) -*(.text.skb_dequeue) -*(.text.set_close_on_exec) -*(.text.set_brk) -*(.text.seq_puts) -*(.text.SELECT_DRIVE) -*(.text.sched_exec) -*(.text.return_EIO) -*(.text.remove_from_page_cache) -*(.text.rcu_start_batch) -*(.text.__put_task_struct) -*(.text.proc_pid_readdir) -*(.text.proc_get_inode) -*(.text.prepare_to_wait_exclusive) -*(.text.pipe_wait) -*(.text.pipe_new) -*(.text.pdflush_operation) -*(.text.__pagevec_release) -*(.text.pagevec_lookup_tag) -*(.text.packet_rcv) -*(.text.n_tty_set_room) -*(.text.nr_free_pages) -*(.text.__net_timestamp) -*(.text.mpage_end_io_read) -*(.text.mod_timer) -*(.text.__memcpy) -*(.text.mb_cache_shrink_fn) -*(.text.lock_rename) -*(.text.kstrdup) -*(.text.is_ignored) -*(.text.int_very_careful) -*(.text.inotify_inode_is_dead) -*(.text.inotify_get_cookie) -*(.text.inode_get_bytes) -*(.text.init_timer) -*(.text.init_dev) -*(.text.inet_getname) -*(.text.ide_map_sg) -*(.text.__ide_dma_end) -*(.text.hrtimer_get_remaining) -*(.text.get_task_mm) -*(.text.get_random_int) -*(.text.free_pipe_info) -*(.text.filemap_write_and_wait_range) -*(.text.exit_thread) -*(.text.enter_idle) -*(.text.end_that_request_first) -*(.text.end_8259A_irq) -*(.text.dummy_file_alloc_security) -*(.text.do_group_exit) -*(.text.debug_mutex_init) -*(.text.cpuset_exit) -*(.text.cpu_idle) -*(.text.copy_semundo) -*(.text.copy_files) -*(.text.chrdev_open) -*(.text.cdrom_transfer_packet_command) -*(.text.cdrom_mode_sense) -*(.text.blk_phys_contig_segment) -*(.text.blk_get_queue) -*(.text.bio_split) -*(.text.audit_alloc) -*(.text.anon_pipe_buf_release) -*(.text.add_wait_queue_exclusive) -*(.text.add_wait_queue) -*(.text.acct_process) -*(.text.account) -*(.text.zeromap_page_range) -*(.text.yield) -*(.text.writeback_acquire) -*(.text.worker_thread) -*(.text.wait_on_page_writeback_range) -*(.text.__wait_on_buffer) -*(.text.vscnprintf) -*(.text.vmalloc_to_pfn) -*(.text.vgacon_save_screen) -*(.text.vfs_unlink) -*(.text.vfs_rmdir) -*(.text.unregister_md_personality) -*(.text.unlock_new_inode) -*(.text.unix_stream_sendmsg) -*(.text.unix_stream_recvmsg) -*(.text.unhash_process) -*(.text.udp_v4_lookup_longway) -*(.text.tty_ldisc_flush) -*(.text.tty_ldisc_enable) -*(.text.tty_hung_up_p) -*(.text.tty_buffer_free_all) -*(.text.tso_fragment) -*(.text.try_to_del_timer_sync) -*(.text.tcp_v4_err) -*(.text.tcp_unhash) -*(.text.tcp_seq_next) -*(.text.tcp_select_initial_window) -*(.text.tcp_sacktag_write_queue) -*(.text.tcp_cwnd_validate) -*(.text.sys_vhangup) -*(.text.sys_uselib) -*(.text.sys_symlink) -*(.text.sys_signal) -*(.text.sys_poll) -*(.text.sys_mount) -*(.text.sys_kill) -*(.text.sys_ioctl) -*(.text.sys_inotify_add_watch) -*(.text.sys_getuid) -*(.text.sys_getrlimit) -*(.text.sys_getitimer) -*(.text.sys_getgroups) -*(.text.sys_ftruncate) -*(.text.sysfs_lookup) -*(.text.sys_exit_group) -*(.text.stub_fork) -*(.text.sscanf) -*(.text.sock_map_fd) -*(.text.sock_get_timestamp) -*(.text.__sock_create) -*(.text.smp_call_function_single) -*(.text.sk_stop_timer) -*(.text.skb_copy_and_csum_datagram) -*(.text.__skb_checksum_complete) -*(.text.single_next) -*(.text.sigqueue_alloc) -*(.text.shrink_dcache_parent) -*(.text.select_idle_routine) -*(.text.run_workqueue) -*(.text.run_local_timers) -*(.text.remove_inode_hash) -*(.text.remove_dquot_ref) -*(.text.register_binfmt) -*(.text.read_cache_pages) -*(.text.rb_last) -*(.text.pty_open) -*(.text.proc_root_readdir) -*(.text.proc_pid_flush) -*(.text.proc_pident_lookup) -*(.text.proc_fill_super) -*(.text.proc_exe_link) -*(.text.posix_locks_deadlock) -*(.text.pipe_iov_copy_from_user) -*(.text.opost) -*(.text.nf_register_hook) -*(.text.netif_rx_ni) -*(.text.m_start) -*(.text.mpage_writepage) -*(.text.mm_alloc) -*(.text.memory_open) -*(.text.mark_buffer_async_write) -*(.text.lru_add_drain_all) -*(.text.locks_init_lock) -*(.text.locks_delete_lock) -*(.text.lock_hrtimer_base) -*(.text.load_script) -*(.text.__kill_fasync) -*(.text.ip_mc_sf_allow) -*(.text.__ioremap) -*(.text.int_with_check) -*(.text.int_sqrt) -*(.text.install_thread_keyring) -*(.text.init_page_buffers) -*(.text.inet_sock_destruct) -*(.text.idle_notifier_register) -*(.text.ide_execute_command) -*(.text.ide_end_drive_cmd) -*(.text.__ide_dma_host_on) -*(.text.hrtimer_run_queues) -*(.text.hpet_mask_rtc_irq_bit) -*(.text.__get_zone_counts) -*(.text.get_zone_counts) -*(.text.get_write_access) -*(.text.get_fs_struct) -*(.text.get_dirty_limits) -*(.text.generic_readlink) -*(.text.free_hot_page) -*(.text.finish_wait) -*(.text.find_inode) -*(.text.find_first_bit) -*(.text.__filemap_fdatawrite_range) -*(.text.__filemap_copy_from_user_iovec) -*(.text.exit_aio) -*(.text.elv_set_request) -*(.text.elv_former_request) -*(.text.dup_namespace) -*(.text.dupfd) -*(.text.dummy_socket_getsockopt) -*(.text.dummy_sb_post_mountroot) -*(.text.dummy_quotactl) -*(.text.dummy_inode_rename) -*(.text.__do_SAK) -*(.text.do_pipe) -*(.text.do_fsync) -*(.text.d_instantiate_unique) -*(.text.d_find_alias) -*(.text.deny_write_access) -*(.text.dentry_unhash) -*(.text.d_delete) -*(.text.datagram_poll) -*(.text.cpuset_fork) -*(.text.cpuid_read) -*(.text.copy_namespace) -*(.text.cond_resched) -*(.text.check_version) -*(.text.__change_page_attr) -*(.text.cfq_slab_kill) -*(.text.cfq_completed_request) -*(.text.cdrom_pc_intr) -*(.text.cdrom_decode_status) -*(.text.cap_capset_check) -*(.text.blk_put_request) -*(.text.bio_fs_destructor) -*(.text.bictcp_min_cwnd) -*(.text.alloc_chrdev_region) -*(.text.add_element) -*(.text.acct_update_integrals) -*(.text.write_boundary_block) -*(.text.writeback_release) -*(.text.writeback_inodes) -*(.text.wake_up_state) -*(.text.__wake_up_locked) -*(.text.wake_futex) -*(.text.wait_task_inactive) -*(.text.__wait_on_freeing_inode) -*(.text.wait_noreap_copyout) -*(.text.vmstat_start) -*(.text.vgacon_do_font_op) -*(.text.vfs_readv) -*(.text.vfs_quota_sync) -*(.text.update_queue) -*(.text.unshare_files) -*(.text.unmap_vm_area) -*(.text.unix_socketpair) -*(.text.unix_release_sock) -*(.text.unix_detach_fds) -*(.text.unix_create1) -*(.text.unix_bind) -*(.text.udp_sendmsg) -*(.text.udp_rcv) -*(.text.udp_queue_rcv_skb) -*(.text.uart_write) -*(.text.uart_startup) -*(.text.uart_open) -*(.text.tty_vhangup) -*(.text.tty_termios_baud_rate) -*(.text.tty_release) -*(.text.tty_ldisc_ref) -*(.text.throttle_vm_writeout) -*(.text.058) -*(.text.tcp_xmit_probe_skb) -*(.text.tcp_v4_send_check) -*(.text.tcp_v4_destroy_sock) -*(.text.tcp_sync_mss) -*(.text.tcp_snd_test) -*(.text.tcp_slow_start) -*(.text.tcp_send_fin) -*(.text.tcp_rtt_estimator) -*(.text.tcp_parse_options) -*(.text.tcp_ioctl) -*(.text.tcp_init_tso_segs) -*(.text.tcp_init_cwnd) -*(.text.tcp_getsockopt) -*(.text.tcp_fin) -*(.text.tcp_connect) -*(.text.tcp_cong_avoid) -*(.text.__tcp_checksum_complete_user) -*(.text.task_dumpable) -*(.text.sys_wait4) -*(.text.sys_utimes) -*(.text.sys_symlinkat) -*(.text.sys_socketpair) -*(.text.sys_rmdir) -*(.text.sys_readahead) -*(.text.sys_nanosleep) -*(.text.sys_linkat) -*(.text.sys_fstat) -*(.text.sysfs_readdir) -*(.text.sys_execve) -*(.text.sysenter_tracesys) -*(.text.sys_chown) -*(.text.stub_clone) -*(.text.strrchr) -*(.text.strncpy) -*(.text.stopmachine_set_state) -*(.text.sock_sendmsg) -*(.text.sock_release) -*(.text.sock_fasync) -*(.text.sock_close) -*(.text.sk_stream_write_space) -*(.text.sk_reset_timer) -*(.text.skb_split) -*(.text.skb_recv_datagram) -*(.text.skb_queue_tail) -*(.text.sk_attach_filter) -*(.text.si_swapinfo) -*(.text.simple_strtoll) -*(.text.set_termios) -*(.text.set_task_comm) -*(.text.set_shrinker) -*(.text.set_normalized_timespec) -*(.text.set_brk) -*(.text.serial_in) -*(.text.seq_printf) -*(.text.secure_dccp_sequence_number) -*(.text.rwlock_bug) -*(.text.rt_hash_code) -*(.text.__rta_fill) -*(.text.__request_resource) -*(.text.relocate_new_kernel) -*(.text.release_thread) -*(.text.release_mem) -*(.text.rb_prev) -*(.text.rb_first) -*(.text.random_poll) -*(.text.__put_super_and_need_restart) -*(.text.pty_write) -*(.text.ptrace_stop) -*(.text.proc_self_readlink) -*(.text.proc_root_lookup) -*(.text.proc_root_link) -*(.text.proc_pid_make_inode) -*(.text.proc_pid_attr_write) -*(.text.proc_lookupfd) -*(.text.proc_delete_inode) -*(.text.posix_same_owner) -*(.text.posix_block_lock) -*(.text.poll_initwait) -*(.text.pipe_write) -*(.text.pipe_read_fasync) -*(.text.pipe_ioctl) -*(.text.pdflush) -*(.text.pci_user_read_config_dword) -*(.text.page_readlink) -*(.text.null_lseek) -*(.text.nf_hook_slow) -*(.text.netlink_sock_destruct) -*(.text.netlink_broadcast) -*(.text.neigh_resolve_output) -*(.text.name_to_int) -*(.text.mwait_idle) -*(.text.mutex_trylock) -*(.text.mutex_debug_check_no_locks_held) -*(.text.m_stop) -*(.text.mpage_end_io_write) -*(.text.mpage_alloc) -*(.text.move_page_tables) -*(.text.mounts_open) -*(.text.__memset) -*(.text.memcpy_fromiovec) -*(.text.make_8259A_irq) -*(.text.lookup_user_key_possessed) -*(.text.lookup_create) -*(.text.locks_insert_lock) -*(.text.locks_alloc_lock) -*(.text.kthread_should_stop) -*(.text.kswapd) -*(.text.kobject_uevent) -*(.text.kobject_get_path) -*(.text.kobject_get) -*(.text.klist_children_put) -*(.text.__ip_route_output_key) -*(.text.ip_flush_pending_frames) -*(.text.ip_compute_csum) -*(.text.ip_append_data) -*(.text.ioc_set_batching) -*(.text.invalidate_inode_pages) -*(.text.__invalidate_device) -*(.text.install_arg_page) -*(.text.in_sched_functions) -*(.text.inotify_unmount_inodes) -*(.text.init_once) -*(.text.init_cdrom_command) -*(.text.inet_stream_connect) -*(.text.inet_sk_rebuild_header) -*(.text.inet_csk_addr2sockaddr) -*(.text.inet_create) -*(.text.ifind) -*(.text.ide_setup_dma) -*(.text.ide_outsw) -*(.text.ide_fixstring) -*(.text.ide_dma_setup) -*(.text.ide_cdrom_packet) -*(.text.ide_cd_put) -*(.text.ide_build_sglist) -*(.text.i8259A_shutdown) -*(.text.hung_up_tty_ioctl) -*(.text.hrtimer_nanosleep) -*(.text.hrtimer_init) -*(.text.hrtimer_cancel) -*(.text.hash_futex) -*(.text.group_send_sig_info) -*(.text.grab_cache_page_nowait) -*(.text.get_wchan) -*(.text.get_stack) -*(.text.get_page_state) -*(.text.getnstimeofday) -*(.text.get_node) -*(.text.get_kprobe) -*(.text.generic_unplug_device) -*(.text.free_task) -*(.text.frag_show) -*(.text.find_next_zero_string) -*(.text.filp_open) -*(.text.fillonedir) -*(.text.exit_io_context) -*(.text.exit_idle) -*(.text.exact_lock) -*(.text.eth_header) -*(.text.dummy_unregister_security) -*(.text.dummy_socket_post_create) -*(.text.dummy_socket_listen) -*(.text.dummy_quota_on) -*(.text.dummy_inode_follow_link) -*(.text.dummy_file_receive) -*(.text.dummy_file_mprotect) -*(.text.dummy_file_lock) -*(.text.dummy_file_ioctl) -*(.text.dummy_bprm_post_apply_creds) -*(.text.do_writepages) -*(.text.__down_interruptible) -*(.text.do_notify_resume) -*(.text.do_acct_process) -*(.text.del_timer_sync) -*(.text.default_rebuild_header) -*(.text.d_callback) -*(.text.dcache_readdir) -*(.text.ctrl_dumpfamily) -*(.text.cpuset_rmdir) -*(.text.copy_strings_kernel) -*(.text.con_write_room) -*(.text.complete_all) -*(.text.collect_sigign_sigcatch) -*(.text.clear_user) -*(.text.check_unthrottle) -*(.text.cdrom_release) -*(.text.cdrom_newpc_intr) -*(.text.cdrom_ioctl) -*(.text.cdrom_check_status) -*(.text.cdev_put) -*(.text.cdev_add) -*(.text.cap_ptrace) -*(.text.cap_bprm_secureexec) -*(.text.cache_alloc_refill) -*(.text.bmap) -*(.text.blk_run_queue) -*(.text.blk_queue_dma_alignment) -*(.text.blk_ordered_req_seq) -*(.text.blk_backing_dev_unplug) -*(.text.__bitmap_subset) -*(.text.__bitmap_and) -*(.text.bio_unmap_user) -*(.text.__bforget) -*(.text.bd_forget) -*(.text.bad_pipe_w) -*(.text.bad_get_user) -*(.text.audit_free) -*(.text.anon_vma_ctor) -*(.text.anon_pipe_buf_map) -*(.text.alloc_sock_iocb) -*(.text.alloc_fdset) -*(.text.aio_kick_handler) -*(.text.__add_entropy_words) -*(.text.add_disk_randomness) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 32e427a95ffd..88cfa50b424d 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -30,9 +30,6 @@ SECTIONS /* First the code that has to be first for bootstrapping */ *(.bootstrap.text) _stext = .; - /* Then all the functions that are "hot" in profiles, to group them - onto the same hugetlb entry */ - #include "functionlist" /* Then the rest */ *(.text) SCHED_TEXT From fac15a8e4d4c36eb4676bfd2c30ea16ad05c401a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 221/231] [PATCH] x86-64: Share identical video.S between i386 and x86-64 Signed-off-by: Andi Kleen --- arch/x86_64/boot/setup.S | 2 +- arch/x86_64/boot/video.S | 2043 -------------------------------------- 2 files changed, 1 insertion(+), 2044 deletions(-) delete mode 100644 arch/x86_64/boot/video.S diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 4f8851000d7e..e9e33f949697 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -807,7 +807,7 @@ gdt_48: # Include video setup & detection code -#include "video.S" +#include "../../i386/boot/video.S" # Setup signature -- must be last setup_sig1: .word SIG1 diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S deleted file mode 100644 index 8143c9516cb4..000000000000 --- a/arch/x86_64/boot/video.S +++ /dev/null @@ -1,2043 +0,0 @@ -/* video.S - * - * Display adapter & video mode setup, version 2.13 (14-May-99) - * - * Copyright (C) 1995 -- 1998 Martin Mares - * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson - * - * Rewritten to use GNU 'as' by Chris Noe May 1999 - * - * For further information, look at Documentation/svga.txt. - * - */ - -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Enable compacting of mode table */ -#define CONFIG_VIDEO_COMPACT - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Enable local mode list */ -#undef CONFIG_VIDEO_LOCAL - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* Hack that lets you force specific BIOS mode ID and specific dimensions */ -#undef CONFIG_VIDEO_GFX_HACK -#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */ -#define VIDEO_GFX_BIOS_BX 0x0102 -#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */ - -/* This code uses an extended set of video mode numbers. These include: - * Aliases for standard modes - * NORMAL_VGA (-1) - * EXTENDED_VGA (-2) - * ASK_VGA (-3) - * Video modes numbered by menu position -- NOT RECOMMENDED because of lack - * of compatibility when extending the table. These are between 0x00 and 0xff. - */ -#define VIDEO_FIRST_MENU 0x0000 - -/* Standard BIOS video modes (BIOS number + 0x0100) */ -#define VIDEO_FIRST_BIOS 0x0100 - -/* VESA BIOS video modes (VESA number + 0x0200) */ -#define VIDEO_FIRST_VESA 0x0200 - -/* Video7 special modes (BIOS number + 0x0900) */ -#define VIDEO_FIRST_V7 0x0900 - -/* Special video modes */ -#define VIDEO_FIRST_SPECIAL 0x0f00 -#define VIDEO_80x25 0x0f00 -#define VIDEO_8POINT 0x0f01 -#define VIDEO_80x43 0x0f02 -#define VIDEO_80x28 0x0f03 -#define VIDEO_CURRENT_MODE 0x0f04 -#define VIDEO_80x30 0x0f05 -#define VIDEO_80x34 0x0f06 -#define VIDEO_80x60 0x0f07 -#define VIDEO_GFX_HACK 0x0f08 -#define VIDEO_LAST_SPECIAL 0x0f09 - -/* Video modes given by resolution */ -#define VIDEO_FIRST_RESOLUTION 0x1000 - -/* The "recalculate timings" flag */ -#define VIDEO_RECALC 0x8000 - -/* Positions of various video parameters passed to the kernel */ -/* (see also include/linux/tty.h) */ -#define PARAM_CURSOR_POS 0x00 -#define PARAM_VIDEO_PAGE 0x04 -#define PARAM_VIDEO_MODE 0x06 -#define PARAM_VIDEO_COLS 0x07 -#define PARAM_VIDEO_EGA_BX 0x0a -#define PARAM_VIDEO_LINES 0x0e -#define PARAM_HAVE_VGA 0x0f -#define PARAM_FONT_POINTS 0x10 - -#define PARAM_LFB_WIDTH 0x12 -#define PARAM_LFB_HEIGHT 0x14 -#define PARAM_LFB_DEPTH 0x16 -#define PARAM_LFB_BASE 0x18 -#define PARAM_LFB_SIZE 0x1c -#define PARAM_LFB_LINELENGTH 0x24 -#define PARAM_LFB_COLORS 0x26 -#define PARAM_VESAPM_SEG 0x2e -#define PARAM_VESAPM_OFF 0x30 -#define PARAM_LFB_PAGES 0x32 -#define PARAM_VESA_ATTRIB 0x34 -#define PARAM_CAPABILITIES 0x36 - -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN -#define DO_STORE call store_screen -#else -#define DO_STORE -#endif /* CONFIG_VIDEO_RETAIN */ - -# This is the main entry point called by setup.S -# %ds *must* be pointing to the bootsector -video: pushw %ds # We use different segments - pushw %ds # FS contains original DS - popw %fs - pushw %cs # DS is equal to CS - popw %ds - pushw %cs # ES is equal to CS - popw %es - xorw %ax, %ax - movw %ax, %gs # GS is zero - cld - call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) -#ifdef CONFIG_VIDEO_SELECT - movw %fs:(0x01fa), %ax # User selected video mode - cmpw $ASK_VGA, %ax # Bring up the menu - jz vid2 - - call mode_set # Set the mode - jc vid1 - - leaw badmdt, %si # Invalid mode ID - call prtstr -vid2: call mode_menu -vid1: -#ifdef CONFIG_VIDEO_RETAIN - call restore_screen # Restore screen contents -#endif /* CONFIG_VIDEO_RETAIN */ - call store_edid -#endif /* CONFIG_VIDEO_SELECT */ - call mode_params # Store mode parameters - popw %ds # Restore original DS - ret - -# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. -basic_detect: - movb $0, %fs:(PARAM_HAVE_VGA) - movb $0x12, %ah # Check EGA/VGA - movb $0x10, %bl - int $0x10 - movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel - cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card. - je basret - - incb adapter - movw $0x1a00, %ax # Check EGA or VGA? - int $0x10 - cmpb $0x1a, %al # 1a means VGA... - jne basret # anything else is EGA. - - incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA - incb adapter -basret: ret - -# Store the video mode parameters for later usage by the kernel. -# This is done by asking the BIOS except for the rows/columns -# parameters in the default 80x25 mode -- these are set directly, -# because some very obscure BIOSes supply insane values. -mode_params: -#ifdef CONFIG_VIDEO_SELECT - cmpb $0, graphic_mode - jnz mopar_gr -#endif - movb $0x03, %ah # Read cursor position - xorb %bh, %bh - int $0x10 - movw %dx, %fs:(PARAM_CURSOR_POS) - movb $0x0f, %ah # Read page/mode/width - int $0x10 - movw %bx, %fs:(PARAM_VIDEO_PAGE) - movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width - cmpb $0x7, %al # MDA/HGA => segment differs - jnz mopar0 - - movw $0xb000, video_segment -mopar0: movw %gs:(0x485), %ax # Font size - movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA) - movw force_size, %ax # Forced size? - orw %ax, %ax - jz mopar1 - - movb %ah, %fs:(PARAM_VIDEO_COLS) - movb %al, %fs:(PARAM_VIDEO_LINES) - ret - -mopar1: movb $25, %al - cmpb $0, adapter # If we are on CGA/MDA/HGA, the - jz mopar2 # screen must have 25 lines. - - movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS - incb %al # location of max lines. -mopar2: movb %al, %fs:(PARAM_VIDEO_LINES) - ret - -#ifdef CONFIG_VIDEO_SELECT -# Fetching of VESA frame buffer parameters -mopar_gr: - leaw modelist+1024, %di - movb $0x23, %fs:(PARAM_HAVE_VGA) - movw 16(%di), %ax - movw %ax, %fs:(PARAM_LFB_LINELENGTH) - movw 18(%di), %ax - movw %ax, %fs:(PARAM_LFB_WIDTH) - movw 20(%di), %ax - movw %ax, %fs:(PARAM_LFB_HEIGHT) - movb 25(%di), %al - movb $0, %ah - movw %ax, %fs:(PARAM_LFB_DEPTH) - movb 29(%di), %al - movb $0, %ah - movw %ax, %fs:(PARAM_LFB_PAGES) - movl 40(%di), %eax - movl %eax, %fs:(PARAM_LFB_BASE) - movl 31(%di), %eax - movl %eax, %fs:(PARAM_LFB_COLORS) - movl 35(%di), %eax - movl %eax, %fs:(PARAM_LFB_COLORS+4) - movw 0(%di), %ax - movw %ax, %fs:(PARAM_VESA_ATTRIB) - -# get video mem size - leaw modelist+1024, %di - movw $0x4f00, %ax - int $0x10 - xorl %eax, %eax - movw 18(%di), %ax - movl %eax, %fs:(PARAM_LFB_SIZE) - -# store mode capabilities - movl 10(%di), %eax - movl %eax, %fs:(PARAM_CAPABILITIES) - -# switching the DAC to 8-bit is for <= 8 bpp only - movw %fs:(PARAM_LFB_DEPTH), %ax - cmpw $8, %ax - jg dac_done - -# get DAC switching capability - xorl %eax, %eax - movb 10(%di), %al - testb $1, %al - jz dac_set - -# attempt to switch DAC to 8-bit - movw $0x4f08, %ax - movw $0x0800, %bx - int $0x10 - cmpw $0x004f, %ax - jne dac_set - movb %bh, dac_size # store actual DAC size - -dac_set: -# set color size to DAC size - movb dac_size, %al - movb %al, %fs:(PARAM_LFB_COLORS+0) - movb %al, %fs:(PARAM_LFB_COLORS+2) - movb %al, %fs:(PARAM_LFB_COLORS+4) - movb %al, %fs:(PARAM_LFB_COLORS+6) - -# set color offsets to 0 - movb $0, %fs:(PARAM_LFB_COLORS+1) - movb $0, %fs:(PARAM_LFB_COLORS+3) - movb $0, %fs:(PARAM_LFB_COLORS+5) - movb $0, %fs:(PARAM_LFB_COLORS+7) - -dac_done: -# get protected mode interface informations - movw $0x4f0a, %ax - xorw %bx, %bx - xorw %di, %di - int $0x10 - cmp $0x004f, %ax - jnz no_pm - - movw %es, %fs:(PARAM_VESAPM_SEG) - movw %di, %fs:(PARAM_VESAPM_OFF) -no_pm: ret - -# The video mode menu -mode_menu: - leaw keymsg, %si # "Return/Space/Timeout" message - call prtstr - call flush -nokey: call getkt - - cmpb $0x0d, %al # ENTER ? - je listm # yes - manual mode selection - - cmpb $0x20, %al # SPACE ? - je defmd1 # no - repeat - - call beep - jmp nokey - -defmd1: ret # No mode chosen? Default 80x25 - -listm: call mode_table # List mode table -listm0: leaw name_bann, %si # Print adapter name - call prtstr - movw card_name, %si - orw %si, %si - jnz an2 - - movb adapter, %al - leaw old_name, %si - orb %al, %al - jz an1 - - leaw ega_name, %si - decb %al - jz an1 - - leaw vga_name, %si - jmp an1 - -an2: call prtstr - leaw svga_name, %si -an1: call prtstr - leaw listhdr, %si # Table header - call prtstr - movb $0x30, %dl # DL holds mode number - leaw modelist, %si -lm1: cmpw $ASK_VGA, (%si) # End? - jz lm2 - - movb %dl, %al # Menu selection number - call prtchr - call prtsp2 - lodsw - call prthw # Mode ID - call prtsp2 - movb 0x1(%si), %al - call prtdec # Rows - movb $0x78, %al # the letter 'x' - call prtchr - lodsw - call prtdec # Columns - movb $0x0d, %al # New line - call prtchr - movb $0x0a, %al - call prtchr - incb %dl # Next character - cmpb $0x3a, %dl - jnz lm1 - - movb $0x61, %dl - jmp lm1 - -lm2: leaw prompt, %si # Mode prompt - call prtstr - leaw edit_buf, %di # Editor buffer -lm3: call getkey - cmpb $0x0d, %al # Enter? - jz lment - - cmpb $0x08, %al # Backspace? - jz lmbs - - cmpb $0x20, %al # Printable? - jc lm3 - - cmpw $edit_buf+4, %di # Enough space? - jz lm3 - - stosb - call prtchr - jmp lm3 - -lmbs: cmpw $edit_buf, %di # Backspace - jz lm3 - - decw %di - movb $0x08, %al - call prtchr - call prtspc - movb $0x08, %al - call prtchr - jmp lm3 - -lment: movb $0, (%di) - leaw crlft, %si - call prtstr - leaw edit_buf, %si - cmpb $0, (%si) # Empty string = default mode - jz lmdef - - cmpb $0, 1(%si) # One character = menu selection - jz mnusel - - cmpw $0x6373, (%si) # "scan" => mode scanning - jnz lmhx - - cmpw $0x6e61, 2(%si) - jz lmscan - -lmhx: xorw %bx, %bx # Else => mode ID in hex -lmhex: lodsb - orb %al, %al - jz lmuse1 - - subb $0x30, %al - jc lmbad - - cmpb $10, %al - jc lmhx1 - - subb $7, %al - andb $0xdf, %al - cmpb $10, %al - jc lmbad - - cmpb $16, %al - jnc lmbad - -lmhx1: shlw $4, %bx - orb %al, %bl - jmp lmhex - -lmuse1: movw %bx, %ax - jmp lmuse - -mnusel: lodsb # Menu selection - xorb %ah, %ah - subb $0x30, %al - jc lmbad - - cmpb $10, %al - jc lmuse - - cmpb $0x61-0x30, %al - jc lmbad - - subb $0x61-0x30-10, %al - cmpb $36, %al - jnc lmbad - -lmuse: call mode_set - jc lmdef - -lmbad: leaw unknt, %si - call prtstr - jmp lm2 -lmscan: cmpb $0, adapter # Scanning only on EGA/VGA - jz lmbad - - movw $0, mt_end # Scanning of modes is - movb $1, scanning # done as new autodetection. - call mode_table - jmp listm0 -lmdef: ret - -# Additional parts of mode_set... (relative jumps, you know) -setv7: # Video7 extended modes - DO_STORE - subb $VIDEO_FIRST_V7>>8, %bh - movw $0x6f05, %ax - int $0x10 - stc - ret - -_setrec: jmp setrec # Ugly... -_set_80x25: jmp set_80x25 - -# Aliases for backward compatibility. -setalias: - movw $VIDEO_80x25, %ax - incw %bx - jz mode_set - - movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al - incw %bx - jnz setbad # Fall-through! - -# Setting of user mode (AX=mode ID) => CF=success -mode_set: - movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S - movw %ax, %bx - cmpb $0xff, %ah - jz setalias - - testb $VIDEO_RECALC>>8, %ah - jnz _setrec - - cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah - jnc setres - - cmpb $VIDEO_FIRST_SPECIAL>>8, %ah - jz setspc - - cmpb $VIDEO_FIRST_V7>>8, %ah - jz setv7 - - cmpb $VIDEO_FIRST_VESA>>8, %ah - jnc check_vesa - - orb %ah, %ah - jz setmenu - - decb %ah - jz setbios - -setbad: clc - movb $0, do_restore # The screen needn't be restored - ret - -setvesa: - DO_STORE - subb $VIDEO_FIRST_VESA>>8, %bh - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz setbad # AH=0 if OK - - stc - ret - -setbios: - DO_STORE - int $0x10 # Standard BIOS mode set call - pushw %bx - movb $0x0f, %ah # Check if really set - int $0x10 - popw %bx - cmpb %bl, %al - jnz setbad - - stc - ret - -setspc: xorb %bh, %bh # Set special mode - cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl - jnc setbad - - addw %bx, %bx - jmp *spec_inits(%bx) - -setmenu: - orb %al, %al # 80x25 is an exception - jz _set_80x25 - - pushw %bx # Set mode chosen from menu - call mode_table # Build the mode table - popw %ax - shlw $2, %ax - addw %ax, %si - cmpw %di, %si - jnc setbad - - movw (%si), %ax # Fetch mode ID -_m_s: jmp mode_set - -setres: pushw %bx # Set mode chosen by resolution - call mode_table - popw %bx - xchgb %bl, %bh -setr1: lodsw - cmpw $ASK_VGA, %ax # End of the list? - jz setbad - - lodsw - cmpw %bx, %ax - jnz setr1 - - movw -4(%si), %ax # Fetch mode ID - jmp _m_s - -check_vesa: -#ifdef CONFIG_FIRMWARE_EDID - leaw modelist+1024, %di - movw $0x4f00, %ax - int $0x10 - cmpw $0x004f, %ax - jnz setbad - - movw 4(%di), %ax - movw %ax, vbe_version -#endif - leaw modelist+1024, %di - subb $VIDEO_FIRST_VESA>>8, %bh - movw %bx, %cx # Get mode information structure - movw $0x4f01, %ax - int $0x10 - addb $VIDEO_FIRST_VESA>>8, %bh - cmpw $0x004f, %ax - jnz setbad - - movb (%di), %al # Check capabilities. - andb $0x19, %al - cmpb $0x09, %al - jz setvesa # This is a text mode - - movb (%di), %al # Check capabilities. - andb $0x99, %al - cmpb $0x99, %al - jnz _setbad # Doh! No linear frame buffer. - - subb $VIDEO_FIRST_VESA>>8, %bh - orw $0x4000, %bx # Use linear frame buffer - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz _setbad # AH=0 if OK - - movb $1, graphic_mode # flag graphic mode - movb $0, do_restore # no screen restore - stc - ret - -_setbad: jmp setbad # Ugly... - -# Recalculate vertical display end registers -- this fixes various -# inconsistencies of extended modes on many adapters. Called when -# the VIDEO_RECALC flag is set in the mode ID. - -setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode - call mode_set - jnc rct3 - - movw %gs:(0x485), %ax # Font size in pixels - movb %gs:(0x484), %bl # Number of rows - incb %bl - mulb %bl # Number of visible - decw %ax # scan lines - 1 - movw $0x3d4, %dx - movw %ax, %bx - movb $0x12, %al # Lower 8 bits - movb %bl, %ah - outw %ax, %dx - movb $0x07, %al # Bits 8 and 9 in the overflow register - call inidx - xchgb %al, %ah - andb $0xbd, %ah - shrb %bh - jnc rct1 - orb $0x02, %ah -rct1: shrb %bh - jnc rct2 - orb $0x40, %ah -rct2: movb $0x07, %al - outw %ax, %dx - stc -rct3: ret - -# Table of routines for setting of the special modes. -spec_inits: - .word set_80x25 - .word set_8pixel - .word set_80x43 - .word set_80x28 - .word set_current - .word set_80x30 - .word set_80x34 - .word set_80x60 - .word set_gfx - -# Set the 80x25 mode. If already set, do nothing. -set_80x25: - movw $0x5019, force_size # Override possibly broken BIOS -use_80x25: -#ifdef CONFIG_VIDEO_400_HACK - movw $0x1202, %ax # Force 400 scan lines - movb $0x30, %bl - int $0x10 -#else - movb $0x0f, %ah # Get current mode ID - int $0x10 - cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available - jz st80 # on CGA/MDA/HGA and is also available on EGAM - - cmpw $0x5003, %ax # Unknown mode, force 80x25 color - jnz force3 - -st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25 - jz set80 - - movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc. - orb %al, %al # Some buggy BIOS'es set 0 rows - jz set80 - - cmpb $24, %al # It's hopefully correct - jz set80 -#endif /* CONFIG_VIDEO_400_HACK */ -force3: DO_STORE - movw $0x0003, %ax # Forced set - int $0x10 -set80: stc - ret - -# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. -set_8pixel: - DO_STORE - call use_80x25 # The base is 80x25 -set_8pt: - movw $0x1112, %ax # Use 8x8 font - xorb %bl, %bl - int $0x10 - movw $0x1200, %ax # Use alternate print screen - movb $0x20, %bl - int $0x10 - movw $0x1201, %ax # Turn off cursor emulation - movb $0x34, %bl - int $0x10 - movb $0x01, %ah # Define cursor scan lines 6-7 - movw $0x0607, %cx - int $0x10 -set_current: - stc - ret - -# Set the 80x28 mode. This mode works on all VGA's, because it's a standard -# 80x25 mode with 14-point fonts instead of 16-point. -set_80x28: - DO_STORE - call use_80x25 # The base is 80x25 -set14: movw $0x1111, %ax # Use 9x14 font - xorb %bl, %bl - int $0x10 - movb $0x01, %ah # Define cursor scan lines 11-12 - movw $0x0b0c, %cx - int $0x10 - stc - ret - -# Set the 80x43 mode. This mode is works on all VGA's. -# It's a 350-scanline mode with 8-pixel font. -set_80x43: - DO_STORE - movw $0x1201, %ax # Set 350 scans - movb $0x30, %bl - int $0x10 - movw $0x0003, %ax # Reset video mode - int $0x10 - jmp set_8pt # Use 8-pixel font - -# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. -set_80x30: - call use_80x25 # Start with real 80x25 - DO_STORE - movw $0x3cc, %dx # Get CRTC port - inb %dx, %al - movb $0xd4, %dl - rorb %al # Mono or color? - jc set48a - - movb $0xb4, %dl -set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) - call outidx - movw $0x0b06, %ax # Vertical total - call outidx - movw $0x3e07, %ax # (Vertical) overflow - call outidx - movw $0xea10, %ax # Vertical sync start - call outidx - movw $0xdf12, %ax # Vertical display end - call outidx - movw $0xe715, %ax # Vertical blank start - call outidx - movw $0x0416, %ax # Vertical blank end - call outidx - pushw %dx - movb $0xcc, %dl # Misc output register (read) - inb %dx, %al - movb $0xc2, %dl # (write) - andb $0x0d, %al # Preserve clock select bits and color bit - orb $0xe2, %al # Set correct sync polarity - outb %al, %dx - popw %dx - movw $0x501e, force_size - stc # That's all. - ret - -# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. -set_80x34: - call set_80x30 # Set 480 scans - call set14 # And 14-pt font - movw $0xdb12, %ax # VGA vertical display end - movw $0x5022, force_size -setvde: call outidx - stc - ret - -# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. -set_80x60: - call set_80x30 # Set 480 scans - call set_8pt # And 8-pt font - movw $0xdf12, %ax # VGA vertical display end - movw $0x503c, force_size - jmp setvde - -# Special hack for ThinkPad graphics -set_gfx: -#ifdef CONFIG_VIDEO_GFX_HACK - movw $VIDEO_GFX_BIOS_AX, %ax - movw $VIDEO_GFX_BIOS_BX, %bx - int $0x10 - movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size - stc -#endif - ret - -#ifdef CONFIG_VIDEO_RETAIN - -# Store screen contents to temporary buffer. -store_screen: - cmpb $0, do_restore # Already stored? - jnz stsr - - testb $CAN_USE_HEAP, loadflags # Have we space for storing? - jz stsr - - pushw %ax - pushw %bx - pushw force_size # Don't force specific size - movw $0, force_size - call mode_params # Obtain params of current mode - popw force_size - movb %fs:(PARAM_VIDEO_LINES), %ah - movb %fs:(PARAM_VIDEO_COLS), %al - movw %ax, %bx # BX=dimensions - mulb %ah - movw %ax, %cx # CX=number of characters - addw %ax, %ax # Calculate image size - addw $modelist+1024+4, %ax - cmpw heap_end_ptr, %ax - jnc sts1 # Unfortunately, out of memory - - movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params - leaw modelist+1024, %di - stosw - movw %bx, %ax - stosw - pushw %ds # Store the screen - movw video_segment, %ds - xorw %si, %si - rep - movsw - popw %ds - incb do_restore # Screen will be restored later -sts1: popw %bx - popw %ax -stsr: ret - -# Restore screen contents from temporary buffer. -restore_screen: - cmpb $0, do_restore # Has the screen been stored? - jz res1 - - call mode_params # Get parameters of current mode - movb %fs:(PARAM_VIDEO_LINES), %cl - movb %fs:(PARAM_VIDEO_COLS), %ch - leaw modelist+1024, %si # Screen buffer - lodsw # Set cursor position - movw %ax, %dx - cmpb %cl, %dh - jc res2 - - movb %cl, %dh - decb %dh -res2: cmpb %ch, %dl - jc res3 - - movb %ch, %dl - decb %dl -res3: movb $0x02, %ah - movb $0x00, %bh - int $0x10 - lodsw # Display size - movb %ah, %dl # DL=number of lines - movb $0, %ah # BX=phys. length of orig. line - movw %ax, %bx - cmpb %cl, %dl # Too many? - jc res4 - - pushw %ax - movb %dl, %al - subb %cl, %al - mulb %bl - addw %ax, %si - addw %ax, %si - popw %ax - movb %cl, %dl -res4: cmpb %ch, %al # Too wide? - jc res5 - - movb %ch, %al # AX=width of src. line -res5: movb $0, %cl - xchgb %ch, %cl - movw %cx, %bp # BP=width of dest. line - pushw %es - movw video_segment, %es - xorw %di, %di # Move the data - addw %bx, %bx # Convert BX and BP to _bytes_ - addw %bp, %bp -res6: pushw %si - pushw %di - movw %ax, %cx - rep - movsw - popw %di - popw %si - addw %bp, %di - addw %bx, %si - decb %dl - jnz res6 - - popw %es # Done -res1: ret -#endif /* CONFIG_VIDEO_RETAIN */ - -# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port) -outidx: outb %al, %dx - pushw %ax - movb %ah, %al - incw %dx - outb %al, %dx - decw %dx - popw %ax - ret - -# Build the table of video modes (stored after the setup.S code at the -# `modelist' label. Each video mode record looks like: -# .word MODE-ID (our special mode ID (see above)) -# .byte rows (number of rows) -# .byte columns (number of columns) -# Returns address of the end of the table in DI, the end is marked -# with a ASK_VGA ID. -mode_table: - movw mt_end, %di # Already filled? - orw %di, %di - jnz mtab1x - - leaw modelist, %di # Store standard modes: - movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL) - stosl - movb adapter, %al # CGA/MDA/HGA -- no more modes - orb %al, %al - jz mtabe - - decb %al - jnz mtabv - - movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode - stosl - jmp mtabe - -mtab1x: jmp mtab1 - -mtabv: leaw vga_modes, %si # All modes for std VGA - movw $vga_modes_end-vga_modes, %cx - rep # I'm unable to use movsw as I don't know how to store a half - movsb # of the expression above to cx without using explicit shr. - - cmpb $0, scanning # Mode scan requested? - jz mscan1 - - call mode_scan -mscan1: - -#ifdef CONFIG_VIDEO_LOCAL - call local_modes -#endif /* CONFIG_VIDEO_LOCAL */ - -#ifdef CONFIG_VIDEO_VESA - call vesa_modes # Detect VESA VGA modes -#endif /* CONFIG_VIDEO_VESA */ - -#ifdef CONFIG_VIDEO_SVGA - cmpb $0, scanning # Bypass when scanning - jnz mscan2 - - call svga_modes # Detect SVGA cards & modes -mscan2: -#endif /* CONFIG_VIDEO_SVGA */ - -mtabe: - -#ifdef CONFIG_VIDEO_COMPACT - leaw modelist, %si - movw %di, %dx - movw %si, %di -cmt1: cmpw %dx, %si # Scan all modes - jz cmt2 - - leaw modelist, %bx # Find in previous entries - movw 2(%si), %cx -cmt3: cmpw %bx, %si - jz cmt4 - - cmpw 2(%bx), %cx # Found => don't copy this entry - jz cmt5 - - addw $4, %bx - jmp cmt3 - -cmt4: movsl # Copy entry - jmp cmt1 - -cmt5: addw $4, %si # Skip entry - jmp cmt1 - -cmt2: -#endif /* CONFIG_VIDEO_COMPACT */ - - movw $ASK_VGA, (%di) # End marker - movw %di, mt_end -mtab1: leaw modelist, %si # SI=mode list, DI=list end -ret0: ret - -# Modes usable on all standard VGAs -vga_modes: - .word VIDEO_8POINT - .word 0x5032 # 80x50 - .word VIDEO_80x43 - .word 0x502b # 80x43 - .word VIDEO_80x28 - .word 0x501c # 80x28 - .word VIDEO_80x30 - .word 0x501e # 80x30 - .word VIDEO_80x34 - .word 0x5022 # 80x34 - .word VIDEO_80x60 - .word 0x503c # 80x60 -#ifdef CONFIG_VIDEO_GFX_HACK - .word VIDEO_GFX_HACK - .word VIDEO_GFX_DUMMY_RESOLUTION -#endif - -vga_modes_end: -# Detect VESA modes. - -#ifdef CONFIG_VIDEO_VESA -vesa_modes: - cmpb $2, adapter # VGA only - jnz ret0 - - movw %di, %bp # BP=original mode table end - addw $0x200, %di # Buffer space - movw $0x4f00, %ax # VESA Get card info call - int $0x10 - movw %bp, %di - cmpw $0x004f, %ax # Successful? - jnz ret0 - - cmpw $0x4556, 0x200(%di) - jnz ret0 - - cmpw $0x4153, 0x202(%di) - jnz ret0 - - movw $vesa_name, card_name # Set name to "VESA VGA" - pushw %gs - lgsw 0x20e(%di), %si # GS:SI=mode list - movw $128, %cx # Iteration limit -vesa1: -# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst. -# XXX: lodsw %gs:(%si), %ax # Get next mode in the list - gs; lodsw - cmpw $0xffff, %ax # End of the table? - jz vesar - - cmpw $0x0080, %ax # Check validity of mode ID - jc vesa2 - - orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff - jz vesan # Certain BIOSes report 0x80-0xff! - - cmpw $0x0800, %ax - jnc vesae - -vesa2: pushw %cx - movw %ax, %cx # Get mode information structure - movw $0x4f01, %ax - int $0x10 - movw %cx, %bx # BX=mode number - addb $VIDEO_FIRST_VESA>>8, %bh - popw %cx - cmpw $0x004f, %ax - jnz vesan # Don't report errors (buggy BIOSES) - - movb (%di), %al # Check capabilities. We require - andb $0x19, %al # a color text mode. - cmpb $0x09, %al - jnz vesan - - cmpw $0xb800, 8(%di) # Standard video memory address required - jnz vesan - - testb $2, (%di) # Mode characteristics supplied? - movw %bx, (%di) # Store mode number - jz vesa3 - - xorw %dx, %dx - movw 0x12(%di), %bx # Width - orb %bh, %bh - jnz vesan - - movb %bl, 0x3(%di) - movw 0x14(%di), %ax # Height - orb %ah, %ah - jnz vesan - - movb %al, 2(%di) - mulb %bl - cmpw $8193, %ax # Small enough for Linux console driver? - jnc vesan - - jmp vesaok - -vesa3: subw $0x8108, %bx # This mode has no detailed info specified, - jc vesan # so it must be a standard VESA mode. - - cmpw $5, %bx - jnc vesan - - movw vesa_text_mode_table(%bx), %ax - movw %ax, 2(%di) -vesaok: addw $4, %di # The mode is valid. Store it. -vesan: loop vesa1 # Next mode. Limit exceeded => error -vesae: leaw vesaer, %si - call prtstr - movw %bp, %di # Discard already found modes. -vesar: popw %gs - ret - -# Dimensions of standard VESA text modes -vesa_text_mode_table: - .byte 60, 80 # 0108 - .byte 25, 132 # 0109 - .byte 43, 132 # 010A - .byte 50, 132 # 010B - .byte 60, 132 # 010C -#endif /* CONFIG_VIDEO_VESA */ - -# Scan for video modes. A bit dirty, but should work. -mode_scan: - movw $0x0100, %cx # Start with mode 0 -scm1: movb $0, %ah # Test the mode - movb %cl, %al - int $0x10 - movb $0x0f, %ah - int $0x10 - cmpb %cl, %al - jnz scm2 # Mode not set - - movw $0x3c0, %dx # Test if it's a text mode - movb $0x10, %al # Mode bits - call inidx - andb $0x03, %al - jnz scm2 - - movb $0xce, %dl # Another set of mode bits - movb $0x06, %al - call inidx - shrb %al - jc scm2 - - movb $0xd4, %dl # Cursor location - movb $0x0f, %al - call inidx - orb %al, %al - jnz scm2 - - movw %cx, %ax # Ok, store the mode - stosw - movb %gs:(0x484), %al # Number of rows - incb %al - stosb - movw %gs:(0x44a), %ax # Number of columns - stosb -scm2: incb %cl - jns scm1 - - movw $0x0003, %ax # Return back to mode 3 - int $0x10 - ret - -tstidx: outw %ax, %dx # OUT DX,AX and inidx -inidx: outb %al, %dx # Read from indexed VGA register - incw %dx # AL=index, DX=index reg port -> AL=data - inb %dx, %al - decw %dx - ret - -# Try to detect type of SVGA card and supply (usually approximate) video -# mode table for it. - -#ifdef CONFIG_VIDEO_SVGA -svga_modes: - leaw svga_table, %si # Test all known SVGA adapters -dosvga: lodsw - movw %ax, %bp # Default mode table - orw %ax, %ax - jz didsv1 - - lodsw # Pointer to test routine - pushw %si - pushw %di - pushw %es - movw $0xc000, %bx - movw %bx, %es - call *%ax # Call test routine - popw %es - popw %di - popw %si - orw %bp, %bp - jz dosvga - - movw %bp, %si # Found, copy the modes - movb svga_prefix, %ah -cpsvga: lodsb - orb %al, %al - jz didsv - - stosw - movsw - jmp cpsvga - -didsv: movw %si, card_name # Store pointer to card name -didsv1: ret - -# Table of all known SVGA cards. For each card, we store a pointer to -# a table of video modes supported by the card and a pointer to a routine -# used for testing of presence of the card. The video mode table is always -# followed by the name of the card or the chipset. -svga_table: - .word ati_md, ati_test - .word oak_md, oak_test - .word paradise_md, paradise_test - .word realtek_md, realtek_test - .word s3_md, s3_test - .word chips_md, chips_test - .word video7_md, video7_test - .word cirrus5_md, cirrus5_test - .word cirrus6_md, cirrus6_test - .word cirrus1_md, cirrus1_test - .word ahead_md, ahead_test - .word everex_md, everex_test - .word genoa_md, genoa_test - .word trident_md, trident_test - .word tseng_md, tseng_test - .word 0 - -# Test routines and mode tables: - -# S3 - The test algorithm was taken from the SuperProbe package -# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org -s3_test: - movw $0x0f35, %cx # we store some constants in cl/ch - movw $0x03d4, %dx - movb $0x38, %al - call inidx - movb %al, %bh # store current CRT-register 0x38 - movw $0x0038, %ax - call outidx # disable writing to special regs - movb %cl, %al # check whether we can write special reg 0x35 - call inidx - movb %al, %bl # save the current value of CRT reg 0x35 - andb $0xf0, %al # clear bits 0-3 - movb %al, %ah - movb %cl, %al # and write it to CRT reg 0x35 - call outidx - call inidx # now read it back - andb %ch, %al # clear the upper 4 bits - jz s3_2 # the first test failed. But we have a - - movb %bl, %ah # second chance - movb %cl, %al - call outidx - jmp s3_1 # do the other tests - -s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35 - orb %bl, %ah # set the upper 4 bits of ah with the orig value - call outidx # write ... - call inidx # ... and reread - andb %cl, %al # turn off the upper 4 bits - pushw %ax - movb %bl, %ah # restore old value in register 0x35 - movb %cl, %al - call outidx - popw %ax - cmpb %ch, %al # setting lower 4 bits was successful => bad - je no_s3 # writing is allowed => this is not an S3 - -s3_1: movw $0x4838, %ax # allow writing to special regs by putting - call outidx # magic number into CRT-register 0x38 - movb %cl, %al # check whether we can write special reg 0x35 - call inidx - movb %al, %bl - andb $0xf0, %al - movb %al, %ah - movb %cl, %al - call outidx - call inidx - andb %ch, %al - jnz no_s3 # no, we can't write => no S3 - - movw %cx, %ax - orb %bl, %ah - call outidx - call inidx - andb %ch, %al - pushw %ax - movb %bl, %ah # restore old value in register 0x35 - movb %cl, %al - call outidx - popw %ax - cmpb %ch, %al - jne no_s31 # writing not possible => no S3 - movb $0x30, %al - call inidx # now get the S3 id ... - leaw idS3, %di - movw $0x10, %cx - repne - scasb - je no_s31 - - movb %bh, %ah - movb $0x38, %al - jmp s3rest - -no_s3: movb $0x35, %al # restore CRT register 0x35 - movb %bl, %ah - call outidx -no_s31: xorw %bp, %bp # Detection failed -s3rest: movb %bh, %ah - movb $0x38, %al # restore old value of CRT register 0x38 - jmp outidx - -idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95 - .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0 - -s3_md: .byte 0x54, 0x2b, 0x84 - .byte 0x55, 0x19, 0x84 - .byte 0 - .ascii "S3" - .byte 0 - -# ATI cards. -ati_test: - leaw idati, %si - movw $0x31, %di - movw $0x09, %cx - repe - cmpsb - je atiok - - xorw %bp, %bp -atiok: ret - -idati: .ascii "761295520" - -ati_md: .byte 0x23, 0x19, 0x84 - .byte 0x33, 0x2c, 0x84 - .byte 0x22, 0x1e, 0x64 - .byte 0x21, 0x19, 0x64 - .byte 0x58, 0x21, 0x50 - .byte 0x5b, 0x1e, 0x50 - .byte 0 - .ascii "ATI" - .byte 0 - -# AHEAD -ahead_test: - movw $0x200f, %ax - movw $0x3ce, %dx - outw %ax, %dx - incw %dx - inb %dx, %al - cmpb $0x20, %al - je isahed - - cmpb $0x21, %al - je isahed - - xorw %bp, %bp -isahed: ret - -ahead_md: - .byte 0x22, 0x2c, 0x84 - .byte 0x23, 0x19, 0x84 - .byte 0x24, 0x1c, 0x84 - .byte 0x2f, 0x32, 0xa0 - .byte 0x32, 0x22, 0x50 - .byte 0x34, 0x42, 0x50 - .byte 0 - .ascii "Ahead" - .byte 0 - -# Chips & Tech. -chips_test: - movw $0x3c3, %dx - inb %dx, %al - orb $0x10, %al - outb %al, %dx - movw $0x104, %dx - inb %dx, %al - movb %al, %bl - movw $0x3c3, %dx - inb %dx, %al - andb $0xef, %al - outb %al, %dx - cmpb $0xa5, %bl - je cantok - - xorw %bp, %bp -cantok: ret - -chips_md: - .byte 0x60, 0x19, 0x84 - .byte 0x61, 0x32, 0x84 - .byte 0 - .ascii "Chips & Technologies" - .byte 0 - -# Cirrus Logic 5X0 -cirrus1_test: - movw $0x3d4, %dx - movb $0x0c, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bl - xorb %al, %al - outb %al, %dx - decw %dx - movb $0x1f, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bh - xorb %ah, %ah - shlb $4, %al - movw %ax, %cx - movb %bh, %al - shrb $4, %al - addw %ax, %cx - shlw $8, %cx - addw $6, %cx - movw %cx, %ax - movw $0x3c4, %dx - outw %ax, %dx - incw %dx - inb %dx, %al - andb %al, %al - jnz nocirr - - movb %bh, %al - outb %al, %dx - inb %dx, %al - cmpb $0x01, %al - je iscirr - -nocirr: xorw %bp, %bp -iscirr: movw $0x3d4, %dx - movb %bl, %al - xorb %ah, %ah - shlw $8, %ax - addw $0x0c, %ax - outw %ax, %dx - ret - -cirrus1_md: - .byte 0x1f, 0x19, 0x84 - .byte 0x20, 0x2c, 0x84 - .byte 0x22, 0x1e, 0x84 - .byte 0x31, 0x25, 0x64 - .byte 0 - .ascii "Cirrus Logic 5X0" - .byte 0 - -# Cirrus Logic 54XX -cirrus5_test: - movw $0x3c4, %dx - movb $6, %al - call inidx - movb %al, %bl # BL=backup - movw $6, %ax - call tstidx - cmpb $0x0f, %al - jne c5fail - - movw $0x1206, %ax - call tstidx - cmpb $0x12, %al - jne c5fail - - movb $0x1e, %al - call inidx - movb %al, %bh - movb %bh, %ah - andb $0xc0, %ah - movb $0x1e, %al - call tstidx - andb $0x3f, %al - jne c5xx - - movb $0x1e, %al - movb %bh, %ah - orb $0x3f, %ah - call tstidx - xorb $0x3f, %al - andb $0x3f, %al -c5xx: pushf - movb $0x1e, %al - movb %bh, %ah - outw %ax, %dx - popf - je c5done - -c5fail: xorw %bp, %bp -c5done: movb $6, %al - movb %bl, %ah - outw %ax, %dx - ret - -cirrus5_md: - .byte 0x14, 0x19, 0x84 - .byte 0x54, 0x2b, 0x84 - .byte 0 - .ascii "Cirrus Logic 54XX" - .byte 0 - -# Cirrus Logic 64XX -- no known extra modes, but must be identified, because -# it's misidentified by the Ahead test. -cirrus6_test: - movw $0x3ce, %dx - movb $0x0a, %al - call inidx - movb %al, %bl # BL=backup - movw $0xce0a, %ax - call tstidx - orb %al, %al - jne c2fail - - movw $0xec0a, %ax - call tstidx - cmpb $0x01, %al - jne c2fail - - movb $0xaa, %al - call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's. - shrb $4, %al - subb $4, %al - jz c6done - - decb %al - jz c6done - - subb $2, %al - jz c6done - - decb %al - jz c6done - -c2fail: xorw %bp, %bp -c6done: movb $0x0a, %al - movb %bl, %ah - outw %ax, %dx - ret - -cirrus6_md: - .byte 0 - .ascii "Cirrus Logic 64XX" - .byte 0 - -# Everex / Trident -everex_test: - movw $0x7000, %ax - xorw %bx, %bx - int $0x10 - cmpb $0x70, %al - jne noevrx - - shrw $4, %dx - cmpw $0x678, %dx - je evtrid - - cmpw $0x236, %dx - jne evrxok - -evtrid: leaw trident_md, %bp -evrxok: ret - -noevrx: xorw %bp, %bp - ret - -everex_md: - .byte 0x03, 0x22, 0x50 - .byte 0x04, 0x3c, 0x50 - .byte 0x07, 0x2b, 0x64 - .byte 0x08, 0x4b, 0x64 - .byte 0x0a, 0x19, 0x84 - .byte 0x0b, 0x2c, 0x84 - .byte 0x16, 0x1e, 0x50 - .byte 0x18, 0x1b, 0x64 - .byte 0x21, 0x40, 0xa0 - .byte 0x40, 0x1e, 0x84 - .byte 0 - .ascii "Everex/Trident" - .byte 0 - -# Genoa. -genoa_test: - leaw idgenoa, %si # Check Genoa 'clues' - xorw %ax, %ax - movb %es:(0x37), %al - movw %ax, %di - movw $0x04, %cx - decw %si - decw %di -l1: incw %si - incw %di - movb (%si), %al - testb %al, %al - jz l2 - - cmpb %es:(%di), %al -l2: loope l1 - orw %cx, %cx - je isgen - - xorw %bp, %bp -isgen: ret - -idgenoa: .byte 0x77, 0x00, 0x99, 0x66 - -genoa_md: - .byte 0x58, 0x20, 0x50 - .byte 0x5a, 0x2a, 0x64 - .byte 0x60, 0x19, 0x84 - .byte 0x61, 0x1d, 0x84 - .byte 0x62, 0x20, 0x84 - .byte 0x63, 0x2c, 0x84 - .byte 0x64, 0x3c, 0x84 - .byte 0x6b, 0x4f, 0x64 - .byte 0x72, 0x3c, 0x50 - .byte 0x74, 0x42, 0x50 - .byte 0x78, 0x4b, 0x64 - .byte 0 - .ascii "Genoa" - .byte 0 - -# OAK -oak_test: - leaw idoakvga, %si - movw $0x08, %di - movw $0x08, %cx - repe - cmpsb - je isoak - - xorw %bp, %bp -isoak: ret - -idoakvga: .ascii "OAK VGA " - -oak_md: .byte 0x4e, 0x3c, 0x50 - .byte 0x4f, 0x3c, 0x84 - .byte 0x50, 0x19, 0x84 - .byte 0x51, 0x2b, 0x84 - .byte 0 - .ascii "OAK" - .byte 0 - -# WD Paradise. -paradise_test: - leaw idparadise, %si - movw $0x7d, %di - movw $0x04, %cx - repe - cmpsb - je ispara - - xorw %bp, %bp -ispara: ret - -idparadise: .ascii "VGA=" - -paradise_md: - .byte 0x41, 0x22, 0x50 - .byte 0x47, 0x1c, 0x84 - .byte 0x55, 0x19, 0x84 - .byte 0x54, 0x2c, 0x84 - .byte 0 - .ascii "Paradise" - .byte 0 - -# Trident. -trident_test: - movw $0x3c4, %dx - movb $0x0e, %al - outb %al, %dx - incw %dx - inb %dx, %al - xchgb %al, %ah - xorb %al, %al - outb %al, %dx - inb %dx, %al - xchgb %ah, %al - movb %al, %bl # Strange thing ... in the book this wasn't - andb $0x02, %bl # necessary but it worked on my card which - jz setb2 # is a trident. Without it the screen goes - # blurred ... - andb $0xfd, %al - jmp clrb2 - -setb2: orb $0x02, %al -clrb2: outb %al, %dx - andb $0x0f, %ah - cmpb $0x02, %ah - je istrid - - xorw %bp, %bp -istrid: ret - -trident_md: - .byte 0x50, 0x1e, 0x50 - .byte 0x51, 0x2b, 0x50 - .byte 0x52, 0x3c, 0x50 - .byte 0x57, 0x19, 0x84 - .byte 0x58, 0x1e, 0x84 - .byte 0x59, 0x2b, 0x84 - .byte 0x5a, 0x3c, 0x84 - .byte 0 - .ascii "Trident" - .byte 0 - -# Tseng. -tseng_test: - movw $0x3cd, %dx - inb %dx, %al # Could things be this simple ! :-) - movb %al, %bl - movb $0x55, %al - outb %al, %dx - inb %dx, %al - movb %al, %ah - movb %bl, %al - outb %al, %dx - cmpb $0x55, %ah - je istsen - -isnot: xorw %bp, %bp -istsen: ret - -tseng_md: - .byte 0x26, 0x3c, 0x50 - .byte 0x2a, 0x28, 0x64 - .byte 0x23, 0x19, 0x84 - .byte 0x24, 0x1c, 0x84 - .byte 0x22, 0x2c, 0x84 - .byte 0x21, 0x3c, 0x84 - .byte 0 - .ascii "Tseng" - .byte 0 - -# Video7. -video7_test: - movw $0x3cc, %dx - inb %dx, %al - movw $0x3b4, %dx - andb $0x01, %al - jz even7 - - movw $0x3d4, %dx -even7: movb $0x0c, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bl - movb $0x55, %al - outb %al, %dx - inb %dx, %al - decw %dx - movb $0x1f, %al - outb %al, %dx - incw %dx - inb %dx, %al - movb %al, %bh - decw %dx - movb $0x0c, %al - outb %al, %dx - incw %dx - movb %bl, %al - outb %al, %dx - movb $0x55, %al - xorb $0xea, %al - cmpb %bh, %al - jne isnot - - movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching - ret - -video7_md: - .byte 0x40, 0x2b, 0x50 - .byte 0x43, 0x3c, 0x50 - .byte 0x44, 0x3c, 0x64 - .byte 0x41, 0x19, 0x84 - .byte 0x42, 0x2c, 0x84 - .byte 0x45, 0x1c, 0x84 - .byte 0 - .ascii "Video 7" - .byte 0 - -# Realtek VGA -realtek_test: - leaw idrtvga, %si - movw $0x45, %di - movw $0x0b, %cx - repe - cmpsb - je isrt - - xorw %bp, %bp -isrt: ret - -idrtvga: .ascii "REALTEK VGA" - -realtek_md: - .byte 0x1a, 0x3c, 0x50 - .byte 0x1b, 0x19, 0x84 - .byte 0x1c, 0x1e, 0x84 - .byte 0x1d, 0x2b, 0x84 - .byte 0x1e, 0x3c, 0x84 - .byte 0 - .ascii "REALTEK" - .byte 0 - -#endif /* CONFIG_VIDEO_SVGA */ - -# User-defined local mode table (VGA only) -#ifdef CONFIG_VIDEO_LOCAL -local_modes: - leaw local_mode_table, %si -locm1: lodsw - orw %ax, %ax - jz locm2 - - stosw - movsw - jmp locm1 - -locm2: ret - -# This is the table of local video modes which can be supplied manually -# by the user. Each entry consists of mode ID (word) and dimensions -# (byte for column count and another byte for row count). These modes -# are placed before all SVGA and VESA modes and override them if table -# compacting is enabled. The table must end with a zero word followed -# by NUL-terminated video adapter name. -local_mode_table: - .word 0x0100 # Example: 40x25 - .byte 25,40 - .word 0 - .ascii "Local" - .byte 0 -#endif /* CONFIG_VIDEO_LOCAL */ - -# Read a key and return the ASCII code in al, scan code in ah -getkey: xorb %ah, %ah - int $0x16 - ret - -# Read a key with a timeout of 30 seconds. -# The hardware clock is used to get the time. -getkt: call gettime - addb $30, %al # Wait 30 seconds - cmpb $60, %al - jl lminute - - subb $60, %al -lminute: - movb %al, %cl -again: movb $0x01, %ah - int $0x16 - jnz getkey # key pressed, so get it - - call gettime - cmpb %cl, %al - jne again - - movb $0x20, %al # timeout, return `space' - ret - -# Flush the keyboard buffer -flush: movb $0x01, %ah - int $0x16 - jz empty - - xorb %ah, %ah - int $0x16 - jmp flush - -empty: ret - -# Print hexadecimal number. -prthw: pushw %ax - movb %ah, %al - call prthb - popw %ax -prthb: pushw %ax - shrb $4, %al - call prthn - popw %ax - andb $0x0f, %al -prthn: cmpb $0x0a, %al - jc prth1 - - addb $0x07, %al -prth1: addb $0x30, %al - jmp prtchr - -# Print decimal number in al -prtdec: pushw %ax - pushw %cx - xorb %ah, %ah - movb $0x0a, %cl - idivb %cl - cmpb $0x09, %al - jbe lt100 - - call prtdec - jmp skip10 - -lt100: addb $0x30, %al - call prtchr -skip10: movb %ah, %al - addb $0x30, %al - call prtchr - popw %cx - popw %ax - ret - -store_edid: -#ifdef CONFIG_FIRMWARE_EDID - pushw %es # just save all registers - pushw %ax - pushw %bx - pushw %cx - pushw %dx - pushw %di - - pushw %fs - popw %es - - movl $0x13131313, %eax # memset block with 0x13 - movw $32, %cx - movw $0x140, %di - cld - rep - stosl - - cmpw $0x0200, vbe_version # only do EDID on >= VBE2.0 - jl no_edid - - pushw %es # save ES - xorw %di, %di # Report Capability - pushw %di - popw %es # ES:DI must be 0:0 - movw $0x4f15, %ax - xorw %bx, %bx - xorw %cx, %cx - int $0x10 - popw %es # restore ES - - cmpb $0x00, %ah # call successful - jne no_edid - - cmpb $0x4f, %al # function supported - jne no_edid - - movw $0x4f15, %ax # do VBE/DDC - movw $0x01, %bx - movw $0x00, %cx - movw $0x00, %dx - movw $0x140, %di - int $0x10 - -no_edid: - popw %di # restore all registers - popw %dx - popw %cx - popw %bx - popw %ax - popw %es -#endif - ret - -# VIDEO_SELECT-only variables -mt_end: .word 0 # End of video mode table if built -edit_buf: .space 6 # Line editor buffer -card_name: .word 0 # Pointer to adapter name -scanning: .byte 0 # Performing mode scan -do_restore: .byte 0 # Screen contents altered during mode change -svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes -graphic_mode: .byte 0 # Graphic mode with a linear frame buffer -dac_size: .byte 6 # DAC bit depth -vbe_version: .word 0 # VBE bios version - -# Status messages -keymsg: .ascii "Press to see video modes available, " - .ascii " to continue or wait 30 secs" - .byte 0x0d, 0x0a, 0 - -listhdr: .byte 0x0d, 0x0a - .ascii "Mode: COLSxROWS:" - -crlft: .byte 0x0d, 0x0a, 0 - -prompt: .byte 0x0d, 0x0a - .asciz "Enter mode number or `scan': " - -unknt: .asciz "Unknown mode ID. Try again." - -badmdt: .ascii "You passed an undefined mode number." - .byte 0x0d, 0x0a, 0 - -vesaer: .ascii "Error: Scanning of VESA modes failed. Please " - .ascii "report to ." - .byte 0x0d, 0x0a, 0 - -old_name: .asciz "CGA/MDA/HGA" - -ega_name: .asciz "EGA" - -svga_name: .ascii " " - -vga_name: .asciz "VGA" - -vesa_name: .asciz "VESA" - -name_bann: .asciz "Video adapter: " -#endif /* CONFIG_VIDEO_SELECT */ - -# Other variables: -adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA -video_segment: .word 0xb800 # Video memory segment -force_size: .word 0 # Use this size instead of the one in BIOS vars From 2724b6db663a4efb69462ff6eb8f88d7528cace3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 222/231] [PATCH] x86-64: Shut up warnings for vfat compat ioctls on other file systems vfat implements compat handlers for these ioctls, but when they were executed on other file systems the kernel would still complain about an unknown compat ioctl. Just declare them as compatible and let them be rejected when not needed by the normal path. This makes wine runs a lot quieter Signed-off-by: Andi Kleen --- fs/compat_ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f6f5e0849d9b..464c04a9541d 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2627,6 +2627,15 @@ COMPATIBLE_IOCTL(LPRESET) /*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/ COMPATIBLE_IOCTL(LPGETFLAGS) HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) + +/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, + but we don't want warnings on other file systems. So declare + them as compatible here. */ +#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) +#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) + +IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) +IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) }; int ioctl_table_size = ARRAY_SIZE(ioctl_start); From f19cccf366a07e05703c90038704a3a5ffcb0607 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 223/231] [PATCH] x86-64: Fix allnoconfig error in genapic_flat.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: In file included from include2/asm/apic.h:5, from include2/asm/smp.h:15, from linux/arch/x86_64/kernel/genapic_flat.c:18: linux/include/linux/pm.h: In function ‘call_platform_enable_wakeup’: linux/include/linux/pm.h:331: error: ‘EIO’ undeclared (first use in this function) linux/include/linux/pm.h:331: error: (Each undeclared identifier is reported only once linux/include/linux/pm.h:331: error: for each function it appears in.) Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_flat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 01d3939e3a9c..ecb01eefdd27 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -8,6 +8,7 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ +#include #include #include #include From 1306383282aaf58e85e5461404db367be0f88dca Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 224/231] [PATCH] i386: Drop noisy e820 debugging printks Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 829beec9247e..9645bb51f76a 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -393,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) ____________________33__ ______________________4_ */ - printk("sanitize start\n"); /* if there's only one memory region, don't bother */ if (*pnr_map < 2) { - printk("sanitize bail 0\n"); return -1; } @@ -405,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) /* bail out if we find any unreasonable addresses in bios map */ for (i=0; isize; unsigned long long end = start + size; unsigned long type = biosmap->type; - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) @@ -543,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) * Not right. Fix it up. */ if (type == E820_RAM) { - printk("copy_e820_map() type is E820_RAM\n"); if (start < 0x100000ULL && end > 0xA0000ULL) { - printk("copy_e820_map() lies in range...\n"); - if (start < 0xA0000ULL) { - printk("copy_e820_map() start < 0xA0000ULL\n"); + if (start < 0xA0000ULL) add_memory_region(start, 0xA0000ULL-start, type); - } - if (end <= 0x100000ULL) { - printk("copy_e820_map() end <= 0x100000ULL\n"); + if (end <= 0x100000ULL) continue; - } start = 0x100000ULL; size = end - start; } From 02b64dab5675bc08048c7f70cbb0d8a417d20dbe Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 225/231] [PATCH] i386: white space fixes in i387.h Signed-off-by: Jan Kiszka Signed-off-by: Andi Kleen --- include/asm-i386/i387.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h index 434936c732d6..49dc8e141310 100644 --- a/include/asm-i386/i387.h +++ b/include/asm-i386/i387.h @@ -83,8 +83,8 @@ static inline void __save_init_fpu( struct task_struct *tsk ) #define __clear_fpu( tsk ) \ do { \ - if (task_thread_info(tsk)->status & TS_USEDFPU) { \ - asm volatile("fnclex ; fwait"); \ + if (task_thread_info(tsk)->status & TS_USEDFPU) { \ + asm volatile("fnclex ; fwait"); \ task_thread_info(tsk)->status &= ~TS_USEDFPU; \ stts(); \ } \ @@ -113,7 +113,7 @@ static inline void save_init_fpu( struct task_struct *tsk ) __clear_fpu( tsk ); \ preempt_enable(); \ } while (0) - \ + /* * FPU state interaction... */ From c41bf8fa5e777b6a8a19cf2484937a7167eac77f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 226/231] [PATCH] i386: avoid redundant preempt_disable in __unlazy_fpu There are two callers of __unlazy_fpu, unlazy_fpu and __switch_to, and none of them appear to require additional preempt_disable/enable here. Let's open-code save_init_fpu in __unlazy_fpu to save a few ops. Signed-off-by: Jan Kiszka Signed-off-by: Andi Kleen --- include/asm-i386/i387.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h index 49dc8e141310..cdd1e248e3b4 100644 --- a/include/asm-i386/i387.h +++ b/include/asm-i386/i387.h @@ -74,11 +74,12 @@ static inline void __save_init_fpu( struct task_struct *tsk ) task_thread_info(tsk)->status &= ~TS_USEDFPU; } -#define __unlazy_fpu( tsk ) do { \ - if (task_thread_info(tsk)->status & TS_USEDFPU) \ - save_init_fpu( tsk ); \ - else \ - tsk->fpu_counter = 0; \ +#define __unlazy_fpu( tsk ) do { \ + if (task_thread_info(tsk)->status & TS_USEDFPU) { \ + __save_init_fpu(tsk); \ + stts(); \ + } else \ + tsk->fpu_counter = 0; \ } while (0) #define __clear_fpu( tsk ) \ From b466004a660c490f3bfb12be8b5ca18bfc393261 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH 227/231] [PATCH] x86-64: Don't exclude asm-offsets.c in Documentation/dontdiff asm-offsets.c is valid source code and needs to be diffed. Signed-off-by: Andi Kleen --- Documentation/dontdiff | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 63c2d0c55aa2..64e9f6c4826b 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -55,8 +55,8 @@ aic7*seq.h* aicasm aicdb.h* asm -asm-offsets.* -asm_offsets.* +asm-offsets.h +asm_offsets.h autoconf.h* bbootsect bin2c From 6c2af35820f100bde7b9de8a00a76faa7af6bede Mon Sep 17 00:00:00 2001 From: Bill Irwin Date: Wed, 2 May 2007 19:27:22 +0200 Subject: [PATCH 228/231] [PATCH] i386: Add missing !X86_PAE dependincy to the 2G/2G split. Only 1GB-aligned kernel/user splits are now handled for PAE. The 2GB/2GB split attempts to avoid aliasing vmallocspace with the 1:1 mapping for physical memory by using an actual split of 1.875/2.125 to accommodate 128MB of vmallocspace out of what would otherwise be a full 2GB for userspace. That attempt disturbs the alignment required by PAE for 2GB/2GB splits, and furthermore does not provide a 2GB/2GB split as advertised. This patch resolves the issues here in two manners. The first is by providing a true 2GB/2GB split in addition to the 1.875/2.125 split. The second is by renaming the 1.875/2.125 split to CONFIG_VMSPLIT_2G_OPT analogously to CONFIG_VMSPLIT_3G_OPT, which performs a similar manuever to avoid aliasing vmallocspace with the 1:1 mapping for physical memory around the 3GB boundary. With the 1.875/2.125 split properly-named, its config option is then tagged as depending on !HIGHMEM to express the PAE implementation's current inability to deal with such unaligned splits. This patch is essentially a combination of two patches, one written by Eric Biederman and the other by Eric Dumazet. If they could add their Signed-off-by: to this, I'd be much obliged. Signed-off-by: William Irwin Signed-off-by: Andi Kleen Cc: Eric Dumazet Cc: Mark Lord Cc: Eric W. Biederman Cc: Andi Kleen --- arch/i386/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 1a94a73fe801..c6f8d6856c4d 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -571,6 +571,9 @@ choice bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" + config VMSPLIT_2G_OPT + depends on !HIGHMEM + bool "2G/2G user/kernel split (for full 2G low memory)" config VMSPLIT_1G bool "1G/3G user/kernel split" endchoice @@ -578,7 +581,8 @@ endchoice config PAGE_OFFSET hex default 0xB0000000 if VMSPLIT_3G_OPT - default 0x78000000 if VMSPLIT_2G + default 0x80000000 if VMSPLIT_2G + default 0x78000000 if VMSPLIT_2G_OPT default 0x40000000 if VMSPLIT_1G default 0xC0000000 From b5229dbb857f61d77d8d4048d9033387a5411b8e Mon Sep 17 00:00:00 2001 From: Olivier Galibert Date: Wed, 2 May 2007 19:27:22 +0200 Subject: [PATCH 229/231] [PATCH] i386: Some additional chipset register values validation. On i945, a mmconfig range hitting the f0000000-ffffffff zone conflicts with the APIC registers and others. Consider it invalid. On E7520, values 0000 and f000 for the window register are defined invalid in the documentation. I haven't seen a bios use these values, but who trusts biosen these days? Signed-off-by: Olivier Galibert Signed-off-by: Andi Kleen arch/i386/pci/mmconfig-shared.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) --- arch/i386/pci/mmconfig-shared.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 747d8c63b0c4..c7cabeed4d7b 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -60,14 +60,19 @@ static const char __init *pci_mmcfg_e7520(void) u32 win; pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); - pci_mmcfg_config_num = 1; - pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); - if (!pci_mmcfg_config) - return NULL; - pci_mmcfg_config[0].address = (win & 0xf000) << 16; - pci_mmcfg_config[0].pci_segment = 0; - pci_mmcfg_config[0].start_bus_number = 0; - pci_mmcfg_config[0].end_bus_number = 255; + win = win & 0xf000; + if(win == 0x0000 || win == 0xf000) + pci_mmcfg_config_num = 0; + else { + pci_mmcfg_config_num = 1; + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + pci_mmcfg_config[0].address = win << 16; + pci_mmcfg_config[0].pci_segment = 0; + pci_mmcfg_config[0].start_bus_number = 0; + pci_mmcfg_config[0].end_bus_number = 255; + } return "Intel Corporation E7520 Memory Controller Hub"; } @@ -108,6 +113,10 @@ static const char __init *pci_mmcfg_intel_945(void) if ((pciexbar & mask) & 0x0fffffffU) pci_mmcfg_config_num = 0; + /* Don't hit the APIC registers and their friends */ + if ((pciexbar & mask) >= 0xf0000000U) + pci_mmcfg_config_num = 0; + if (pci_mmcfg_config_num) { pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); if (!pci_mmcfg_config) From a3193348d407baaa7aef79decfa0e9a7fef74a17 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:22 +0200 Subject: [PATCH 230/231] [PATCH] i386: type may be unused In the case of !CONFIG_PCI_DIRECT && !CONFIG_PCI_MMCONFIG, type is unreferened. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen --- arch/i386/pci/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c index b21b6da8ab1d..1cf11af96de2 100644 --- a/arch/i386/pci/init.c +++ b/arch/i386/pci/init.c @@ -6,7 +6,7 @@ in the right sequence from here. */ static __init int pci_access_init(void) { - int type = 0; + int type __attribute__((unused)) = 0; #ifdef CONFIG_PCI_DIRECT type = pci_direct_probe(); From 35060b6a9a4e1c89bc6fbea61090e302dbc61847 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Wed, 2 May 2007 19:27:22 +0200 Subject: [PATCH 231/231] [PATCH] i386: Don't delete cpu_devs data to identify different x86 types in late_initcall In arch/i386/cpu/common.c there is: cpu_devs[X86_VENDOR_INTEL] cpu_devs[X86_VENDOR_CYRIX] cpu_devs[X86_VENDOR_AMD] ... They are all filled with data early. The data (struct) got set to NULL for all, but Intel in different late_initcall (exit_cpu_vendor) calls. I don't see what sense this makes at all, maybe something that got forgotten with the HOTPLUG_CPU extenstions? Please check/review whether initdata, cpuinitdata is still ok and this still works with HOTPLUG_CPU and without, it should... Signed-off-by: Thomas Renninger Signed-off-by: Andi Kleen Cc: davej@redhat.com --- arch/i386/kernel/cpu/amd.c | 10 ---------- arch/i386/kernel/cpu/centaur.c | 10 ---------- arch/i386/kernel/cpu/cyrix.c | 19 ------------------- arch/i386/kernel/cpu/nexgen.c | 10 ---------- arch/i386/kernel/cpu/rise.c | 9 --------- arch/i386/kernel/cpu/transmeta.c | 10 ---------- arch/i386/kernel/cpu/umc.c | 10 ---------- 7 files changed, 78 deletions(-) diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 197cda62caa3..4fec702afd7e 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -319,13 +319,3 @@ int __init amd_init_cpu(void) cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; return 0; } - -//early_arch_initcall(amd_init_cpu); - -static int __init amd_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_AMD] = NULL; - return 0; -} - -late_initcall(amd_exit_cpu); diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index 8c25047975c0..473eac883c7b 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -469,13 +469,3 @@ int __init centaur_init_cpu(void) cpu_devs[X86_VENDOR_CENTAUR] = ¢aur_cpu_dev; return 0; } - -//early_arch_initcall(centaur_init_cpu); - -static int __init centaur_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_CENTAUR] = NULL; - return 0; -} - -late_initcall(centaur_exit_cpu); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index e77f8e1cf7aa..0b8411a864fb 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void) return 0; } -//early_arch_initcall(cyrix_init_cpu); - -static int __init cyrix_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_CYRIX] = NULL; - return 0; -} - -late_initcall(cyrix_exit_cpu); - static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, @@ -470,12 +460,3 @@ int __init nsc_init_cpu(void) return 0; } -//early_arch_initcall(nsc_init_cpu); - -static int __init nsc_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_NSC] = NULL; - return 0; -} - -late_initcall(nsc_exit_cpu); diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index 8bf23cc80c63..961fbe1a748f 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void) cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; return 0; } - -//early_arch_initcall(nexgen_init_cpu); - -static int __init nexgen_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_NEXGEN] = NULL; - return 0; -} - -late_initcall(nexgen_exit_cpu); diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index 9317f7414989..50076f22e90f 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -50,12 +50,3 @@ int __init rise_init_cpu(void) return 0; } -//early_arch_initcall(rise_init_cpu); - -static int __init rise_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = NULL; - return 0; -} - -late_initcall(rise_exit_cpu); diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 5678d46863c6..6471a5a13202 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void) cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev; return 0; } - -//early_arch_initcall(transmeta_init_cpu); - -static int __init transmeta_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_TRANSMETA] = NULL; - return 0; -} - -late_initcall(transmeta_exit_cpu); diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 1bf3f87e9c5b..a7a4e75bdcd7 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -24,13 +24,3 @@ int __init umc_init_cpu(void) cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev; return 0; } - -//early_arch_initcall(umc_init_cpu); - -static int __init umc_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_UMC] = NULL; - return 0; -} - -late_initcall(umc_exit_cpu);