--- linux-ec2-2.6.32.orig/MAINTAINERS +++ linux-ec2-2.6.32/MAINTAINERS @@ -5594,9 +5594,11 @@ F: drivers/net/wireless/rndis_wlan.c USB XHCI DRIVER -M: Sarah Sharp +M: Sarah Sharp L: linux-usb@vger.kernel.org S: Supported +F: drivers/usb/host/xhci* +F: drivers/usb/host/pci-quirks* USB ZC0301 DRIVER M: Luca Risolia @@ -5718,6 +5720,14 @@ S: Maintained F: drivers/net/vmxnet3/ +VMware PVSCSI driver +M: Alok Kataria +M: VMware PV-Drivers +L: linux-scsi@vger.kernel.org +S: Maintained +F: drivers/scsi/vmw_pvscsi.c +F: drivers/scsi/vmw_pvscsi.h + VOLTAGE AND CURRENT REGULATOR FRAMEWORK M: Liam Girdwood M: Mark Brown --- linux-ec2-2.6.32.orig/Makefile +++ linux-ec2-2.6.32/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 32 -EXTRAVERSION = +EXTRAVERSION = .11+drm33.2 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* @@ -331,14 +331,23 @@ AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage +# Prefer linux-backports-modules +ifneq ($(KBUILD_SRC),) +ifneq ($(shell if test -e $(KBUILD_OUTPUT)/ubuntu-build; then echo yes; fi),yes) +UBUNTUINCLUDE := -I/usr/src/linux-headers-lbm-$(KERNELRELEASE) +endif +endif # Use LINUXINCLUDE when you must reference the include/ directory. # Needed to be compatible with the O= option -LINUXINCLUDE := -Iinclude \ +LINUXINCLUDE := $(UBUNTUINCLUDE) -Iinclude \ $(if $(KBUILD_SRC),-Iinclude2 -I$(srctree)/include) \ -I$(srctree)/arch/$(hdr-arch)/include \ -include include/linux/autoconf.h +# UBUNTU: Include our third party driver stuff too +LINUXINCLUDE += -Iubuntu/include $(if $(KBUILD_SRC),-I$(srctree)/ubuntu/include) + KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ @@ -464,12 +473,12 @@ # Carefully list dependencies so we do not try to build scripts twice # in parallel PHONY += scripts -scripts: scripts_basic include/config/auto.conf +scripts: scripts_basic include/config/auto.conf include/config/tristate.conf $(Q)$(MAKE) $(build)=$(@) # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ firmware/ +drivers-y := drivers/ sound/ firmware/ ubuntu/ net-y := net/ libs-y := lib/ core-y := usr/ @@ -491,7 +500,7 @@ # with it and forgot to run make oldconfig. 
# if auto.conf.cmd is missing then we are probably in a cleaned tree so # we execute the config step to be sure to catch updated Kconfig files -include/config/auto.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd +include/config/%.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd $(Q)$(MAKE) -f $(srctree)/Makefile silentoldconfig else # external modules needs include/linux/autoconf.h and include/config/auto.conf @@ -876,6 +885,9 @@ PHONY += $(vmlinux-dirs) $(vmlinux-dirs): prepare scripts $(Q)$(MAKE) $(build)=$@ +ifdef CONFIG_MODULES + $(Q)$(MAKE) $(modbuiltin)=$@ +endif # Build the kernel release string # @@ -1126,6 +1138,7 @@ PHONY += modules modules: $(vmlinux-dirs) $(if $(KBUILD_BUILTIN),vmlinux) $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.order) > $(objtree)/modules.order + $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.builtin) > $(objtree)/modules.builtin @$(kecho) ' Building modules, stage 2.'; $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modbuild @@ -1155,6 +1168,7 @@ ln -s $(objtree) $(MODLIB)/build ; \ fi @cp -f $(objtree)/modules.order $(MODLIB)/ + @cp -f $(objtree)/modules.builtin $(MODLIB)/ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst # This depmod is only for convenience to give the initial @@ -1218,6 +1232,7 @@ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name 'Module.markers' -o -name '.tmp_*.o.*' \ + -o -name 'modules.builtin' \ -o -name '*.gcno' \) -type f -print | xargs rm -f # mrproper - Delete all generated files, including .config @@ -1416,7 +1431,8 @@ clean: rm-dirs := $(MODVERDIR) clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers \ $(KBUILD_EXTMOD)/Module.markers \ - $(KBUILD_EXTMOD)/modules.order + $(KBUILD_EXTMOD)/modules.order \ + $(KBUILD_EXTMOD)/modules.builtin clean: $(clean-dirs) $(call cmd,rmdirs) $(call cmd,rmfiles)
--- linux-ec2-2.6.32.orig/Documentation/Changes +++ linux-ec2-2.6.32/Documentation/Changes @@ -49,6 +49,8 @@ o udev 081 # udevinfo -V o grub 0.93 # grub --version o mcelog 0.6 +o iptables 1.4.1 # iptables -V + Kernel compilation ==================
--- linux-ec2-2.6.32.orig/Documentation/kernel-parameters.txt +++ linux-ec2-2.6.32/Documentation/kernel-parameters.txt @@ -803,6 +803,24 @@ gpt [EFI] Forces disk with valid GPT signature but invalid Protective MBR to be treated as GPT. + guestdev= [PCI,ACPI,XEN] + Format: {<device path>|<sbdf>}[,{<device path>|<sbdf>}[,...]] + Format of device path: <hid>[:<uid>]-<dev>.<func>[-<dev>.<func>[,...]][+iomul] + Format of sbdf: [<segment>:]<bus>:<dev>.<func>[+iomul] + Specifies PCI device for guest domain. + If PCI-PCI bridge is specified, all PCI devices + behind PCI-PCI bridge are reserved. + +iomul means that this PCI function will share + IO ports with other +iomul functions under same + switch. NOTE: if +iomul is specified, all the functions + of the device will share IO ports. + + guestiomuldev= [PCI,ACPI,XEN] + Format: [<sbdf>][,<sbdf>][,...] + Format of sbdf: [<segment>:]<bus>:<dev> + Note: function shouldn't be specified. + Specifies PCI device for IO port multiplexing driver. + gvp11= [HW,SCSI] hashdist= [KNL,NUMA] Large hashes allocated during boot @@ -1960,6 +1978,13 @@ off: Turn ECRC off on: Turn ECRC on. + pci_reserve= [PCI] + Format: [<sbdf>[+IO<size>][+MEM<size>]][,...] + Format of sbdf: [<segment>:]<bus>:<dev>.<func> + Specifies the least reserved io size or memory size + which is assigned to PCI bridge even when no child + pci device exists. This is useful with PCI hotplug. + pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. off Disable ASPM.
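For concreteness, a hypothetical command line combining the two Xen passthrough parameters above (every segment/bus/device/function number and size here is invented for illustration, following the formats just given):

	Example: guestdev=0000:01:02.0+iomul pci_reserve=0000:02:04.0+IO100+MEM200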
@@ -2114,6 +2139,10 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. + reassign_resources [PCI,ACPI,XEN] + Use the guestdev= parameter to reassign a device's + resources. + reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode Format: <reboot_mode>[,<reboot_mode2>[,...]] See arch/*/kernel/reboot.c or arch/*/kernel/process.c @@ -2645,6 +2674,8 @@ to a common usb-storage quirk flag as follows: a = SANE_SENSE (collect more than 18 bytes of sense data); + b = BAD_SENSE (don't collect more than 18 + bytes of sense data); c = FIX_CAPACITY (decrease the reported device capacity by one sector); h = CAPACITY_HEURISTICS (decrease the @@ -2666,6 +2697,13 @@ medium is write-protected). Example: quirks=0419:aaf5:rl,0421:0433:rc + userpte= + [X86] Flags controlling user PTE allocations. + + nohigh = do not allocate PTE pages in + HIGHMEM regardless of setting + of CONFIG_HIGHPTE. + vdso= [X86,SH] vdso=2: enable compat VDSO (default with COMPAT_VDSO) vdso=1: enable VDSO (default)
--- linux-ec2-2.6.32.orig/Documentation/DocBook/Makefile +++ linux-ec2-2.6.32/Documentation/DocBook/Makefile @@ -32,28 +32,37 @@ ### # The targets that may be used. -PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs media +PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs xmldoclinks BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) -xmldocs: $(BOOKS) +xmldocs: xmldoclinks $(BOOKS) sgmldocs: xmldocs PS := $(patsubst %.xml, %.ps, $(BOOKS)) -psdocs: $(PS) +psdocs: xmldoclinks $(PS) PDF := $(patsubst %.xml, %.pdf, $(BOOKS)) -pdfdocs: $(PDF) +pdfdocs: xmldoclinks $(PDF) HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS))) -htmldocs: media $(HTML) +htmldocs: xmldoclinks $(HTML) $(call build_main_index) + $(call build_images) MAN := $(patsubst %.xml, %.9, $(BOOKS)) mandocs: $(MAN) -media: - mkdir -p $(srctree)/Documentation/DocBook/media/ - cp $(srctree)/Documentation/DocBook/dvb/*.png $(srctree)/Documentation/DocBook/v4l/*.gif $(srctree)/Documentation/DocBook/media/ +build_images = mkdir -p $(objtree)/Documentation/DocBook/media/ && \ + cp $(srctree)/Documentation/DocBook/dvb/*.png $(srctree)/Documentation/DocBook/v4l/*.gif $(objtree)/Documentation/DocBook/media/ + +xmldoclinks: +ifneq ($(objtree),$(srctree)) + for dep in dvb media-entities.tmpl media-indices.tmpl v4l; do \ + rm -f $(objtree)/Documentation/DocBook/$$dep \ + && ln -s $(srctree)/Documentation/DocBook/$$dep $(objtree)/Documentation/DocBook/ \ + || exit; \ + done +endif installmandocs: mandocs mkdir -p /usr/local/man/man9/
--- linux-ec2-2.6.32.orig/Documentation/filesystems/ext4.txt +++ linux-ec2-2.6.32/Documentation/filesystems/ext4.txt @@ -153,8 +153,8 @@ identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that - if the filesystem was not unmounted cleanly, +norecovery Don't load the journal on mounting. Note that +noload if the filesystem was not unmounted cleanly, skipping the journal replay will lead to the filesystem containing inconsistencies that can lead to any number of problems. @@ -196,7 +196,7 @@ also be used to enable or disable barriers, for consistency with other ext4 mount options. -inode_readahead=n This tuning parameter controls the maximum +inode_readahead_blks=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks.
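As a concrete illustration of the options documented above, a read-only mount that skips journal replay and widens inode-table readahead might look like this (device and mount point are hypothetical):

	Example: mount -t ext4 -o ro,norecovery,inode_readahead_blks=64 /dev/sdb1 /mnt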
@@ -353,6 +353,12 @@ system crashes before the delayed allocation blocks are forced to disk. +discard Controls whether ext4 should issue discard/TRIM +nodiscard(*) commands to the underlying block device when + blocks are freed. This is useful for SSD devices + and sparse/thinly-provisioned LUNs, but it is off + by default until sufficient testing has been done. + Data Mode ========= There are 3 different data modes:
--- linux-ec2-2.6.32.orig/Documentation/filesystems/tmpfs.txt +++ linux-ec2-2.6.32/Documentation/filesystems/tmpfs.txt @@ -82,11 +82,13 @@ all files in that instance (if CONFIG_NUMA is enabled) - which can be adjusted on the fly via 'mount -o remount ...' -mpol=default prefers to allocate memory from the local node +mpol=default use the process allocation policy + (see set_mempolicy(2)) mpol=prefer:Node prefers to allocate memory from the given Node mpol=bind:NodeList allocates memory only from nodes in NodeList mpol=interleave prefers to allocate from each node in turn mpol=interleave:NodeList allocates from each node of NodeList in turn +mpol=local prefers to allocate memory from the local node NodeList format is a comma-separated list of decimal numbers and ranges, a range being two hyphen-separated decimal numbers, the smallest and @@ -134,3 +136,5 @@ Christoph Rohland , 1.12.01 Updated: Hugh Dickins, 4 June 2007 +Updated: + KOSAKI Motohiro, 16 Mar 2010
--- linux-ec2-2.6.32.orig/Documentation/kbuild/kbuild.txt +++ linux-ec2-2.6.32/Documentation/kbuild/kbuild.txt @@ -1,3 +1,17 @@ +Output files + +modules.order +-------------------------------------------------- +This file records the order in which modules appear in Makefiles. This +is used by modprobe to deterministically resolve aliases that match +multiple modules. + +modules.builtin +-------------------------------------------------- +This file lists all modules that are built into the kernel. This is used +by modprobe to not fail when trying to load something builtin. + + Environment variables KCPPFLAGS
--- linux-ec2-2.6.32.orig/Documentation/kbuild/kconfig.txt +++ linux-ec2-2.6.32/Documentation/kbuild/kconfig.txt @@ -103,6 +103,11 @@ This environment variable can be set to specify the path & name of the "auto.conf" file. Its default value is "include/config/auto.conf". +KCONFIG_TRISTATE +-------------------------------------------------- +This environment variable can be set to specify the path & name of the +"tristate.conf" file. Its default value is "include/config/tristate.conf". + KCONFIG_AUTOHEADER -------------------------------------------------- This environment variable can be set to specify the path & name of the
--- linux-ec2-2.6.32.orig/Documentation/kvm/api.txt +++ linux-ec2-2.6.32/Documentation/kvm/api.txt @@ -593,6 +593,42 @@ } chip; }; +4.27 KVM_GET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (out) +Returns: 0 on success, -1 on error + +Gets the current timestamp of kvmclock as seen by the current guest. In +conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity in scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + +4.28 KVM_SET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (in) +Returns: 0 on success, -1 on error + +Sets the current timestamp of kvmclock to the value specified in its parameter.
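To make the pairing of the two ioctls concrete, here is a minimal userspace sketch of carrying kvmclock across a migration (the helper name and both VM file descriptors are hypothetical; error handling is abbreviated):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Copy kvmclock from a source VM fd to a destination VM fd. */
	static int migrate_kvmclock(int src_vmfd, int dst_vmfd)
	{
		struct kvm_clock_data data;

		if (ioctl(src_vmfd, KVM_GET_CLOCK, &data) < 0) {
			perror("KVM_GET_CLOCK");
			return -1;
		}
		/* ... the rest of the guest state is transferred here ... */
		if (ioctl(dst_vmfd, KVM_SET_CLOCK, &data) < 0) {
			perror("KVM_SET_CLOCK");
			return -1;
		}
		return 0;
	}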
+In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity in scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by
--- linux-ec2-2.6.32.orig/Documentation/networking/README.ipw2200 +++ linux-ec2-2.6.32/Documentation/networking/README.ipw2200 @@ -171,7 +171,7 @@ led Can be used to turn on experimental LED code. - 0 = Off, 1 = On. Default is 0. + 0 = Off, 1 = On. Default is 1. mode Can be used to set the default mode of the adapter.
--- linux-ec2-2.6.32.orig/Documentation/video4linux/CARDLIST.saa7134 +++ linux-ec2-2.6.32/Documentation/video4linux/CARDLIST.saa7134 @@ -172,3 +172,4 @@ 171 -> Beholder BeholdTV X7 [5ace:7595] 172 -> RoverMedia TV Link Pro FM [19d1:0138] 173 -> Zolid Hybrid TV Tuner PCI [1131:2004] +174 -> Asus Europa Hybrid OEM [1043:4847]
--- linux-ec2-2.6.32.orig/Documentation/video4linux/gspca.txt +++ linux-ec2-2.6.32/Documentation/video4linux/gspca.txt @@ -37,6 +37,7 @@ ov519 041e:4060 Creative Live! VISTA VF0350 ov519 041e:4061 Creative Live! VISTA VF0400 ov519 041e:4064 Creative Live! VISTA VF0420 +ov519 041e:4067 Creative Live! Cam Video IM (VF0350) ov519 041e:4068 Creative Live! VISTA VF0470 spca561 0458:7004 Genius VideoCAM Express V2 sunplus 0458:7006 Genius Dsc 1.3 Smart
--- linux-ec2-2.6.32.orig/arch/Kconfig +++ linux-ec2-2.6.32/arch/Kconfig @@ -6,8 +6,6 @@ tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE - depends on TRACING_SUPPORT - select TRACING select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help @@ -20,7 +18,7 @@ config OPROFILE_IBS bool "OProfile AMD IBS support (EXPERIMENTAL)" default n - depends on OPROFILE && SMP && X86 + depends on OPROFILE && SMP && X86 && !XEN help Instruction-Based Sampling (IBS) is a new profiling technique that provides rich, precise program performance
--- linux-ec2-2.6.32.orig/arch/alpha/kernel/osf_sys.c +++ linux-ec2-2.6.32/arch/alpha/kernel/osf_sys.c @@ -178,25 +178,18 @@ unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - struct file *file = NULL; - unsigned long ret = -EBADF; + unsigned long ret = -EINVAL; #if 0 if (flags & (_MAP_HASSEMAPHORE | _MAP_INHERIT | _MAP_UNALIGNED)) printk("%s: unimplemented OSF mmap flags %04lx\n", current->comm, flags); #endif - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(&current->mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flags, off); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); + if ((off + PAGE_ALIGN(len)) < off) + goto out; + if (off & ~PAGE_MASK) + goto out; + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return ret; }
--- linux-ec2-2.6.32.orig/arch/arm/Kconfig +++ linux-ec2-2.6.32/arch/arm/Kconfig @@ -1508,6 +1508,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "fs/Kconfig" source "arch/arm/Kconfig.debug"
--- linux-ec2-2.6.32.orig/arch/arm/boot/compressed/head.S +++ linux-ec2-2.6.32/arch/arm/boot/compressed/head.S @@ -162,8 +162,8 @@ .text adr r0, LC0 - ARM( ldmia r0, {r1, r2, r3, r4, r5, r6, ip, sp} ) - THUMB( ldmia r0, {r1, r2, r3, r4, r5, r6, ip} ) + ARM( ldmia r0, {r1, r2, r3, r4, r5, r6, r11, ip, sp}) + THUMB( ldmia r0, {r1, r2, r3, r4, r5, r6, r11, ip} ) THUMB( ldr sp, [r0, #28] ) subs r0, r0, r1 @ calculate the delta offset @@ -174,12 +174,13 @@ /* * We're running at a
different address. We need to fix * up various pointers: - * r5 - zImage base address - * r6 - GOT start + * r5 - zImage base address (_start) + * r6 - size of decompressed image + * r11 - GOT start * ip - GOT end */ add r5, r5, r0 - add r6, r6, r0 + add r11, r11, r0 add ip, ip, r0 #ifndef CONFIG_ZBOOT_ROM @@ -197,10 +198,10 @@ /* * Relocate all entries in the GOT table. */ -1: ldr r1, [r6, #0] @ relocate entries in the GOT +1: ldr r1, [r11, #0] @ relocate entries in the GOT add r1, r1, r0 @ table. This fixes up the - str r1, [r6], #4 @ C references. - cmp r6, ip + str r1, [r11], #4 @ C references. + cmp r11, ip blo 1b #else @@ -208,12 +209,12 @@ * Relocate entries in the GOT table. We only relocate * the entries that are outside the (relocated) BSS region. */ -1: ldr r1, [r6, #0] @ relocate entries in the GOT +1: ldr r1, [r11, #0] @ relocate entries in the GOT cmp r1, r2 @ entry < bss_start || cmphs r3, r1 @ _end < entry addlo r1, r1, r0 @ table. This fixes up the - str r1, [r6], #4 @ C references. - cmp r6, ip + str r1, [r11], #4 @ C references. + cmp r11, ip blo 1b #endif @@ -239,6 +240,7 @@ * Check to see if we will overwrite ourselves. * r4 = final kernel address * r5 = start of this image + * r6 = size of decompressed image * r2 = end of malloc space (and therefore this image) * We basically want: * r4 >= r2 -> OK @@ -246,8 +248,7 @@ */ cmp r4, r2 bhs wont_overwrite - sub r3, sp, r5 @ > compressed kernel size - add r0, r4, r3, lsl #2 @ allow for 4x expansion + add r0, r4, r6 cmp r0, r5 bls wont_overwrite @@ -263,7 +264,6 @@ * r1-r3 = unused * r4 = kernel execution address * r5 = decompressed kernel start - * r6 = processor ID * r7 = architecture ID * r8 = atags pointer * r9-r12,r14 = corrupted @@ -304,7 +304,8 @@ .word _end @ r3 .word zreladdr @ r4 .word _start @ r5 - .word _got_start @ r6 + .word _image_size @ r6 + .word _got_start @ r11 .word _got_end @ ip .word user_stack+4096 @ sp LC1: .word reloc_end - reloc_start @@ -328,7 +329,6 @@ * * On entry, * r4 = kernel execution address - * r6 = processor ID * r7 = architecture number * r8 = atags pointer * r9 = run-time address of "start" (???) @@ -534,7 +534,6 @@ * r1-r3 = unused * r4 = kernel execution address * r5 = decompressed kernel start - * r6 = processor ID * r7 = architecture ID * r8 = atags pointer * r9-r12,r14 = corrupted @@ -573,19 +572,19 @@ * r1 = corrupted * r2 = corrupted * r3 = block offset - * r6 = corrupted + * r9 = corrupted * r12 = corrupted */ call_cache_fn: adr r12, proc_types #ifdef CONFIG_CPU_CP15 - mrc p15, 0, r6, c0, c0 @ get processor ID + mrc p15, 0, r9, c0, c0 @ get processor ID #else - ldr r6, =CONFIG_PROCESSOR_ID + ldr r9, =CONFIG_PROCESSOR_ID #endif 1: ldr r1, [r12, #0] @ get value ldr r2, [r12, #4] @ get mask - eor r1, r1, r6 @ (real ^ match) + eor r1, r1, r9 @ (real ^ match) tst r1, r2 @ & mask ARM( addeq pc, r12, r3 ) @ call cache function THUMB( addeq r12, r3 ) @@ -764,8 +763,7 @@ * Turn off the Cache and MMU. ARMv3 does not support * reading the control register, but ARMv4 does. * - * On entry, r6 = processor ID - * On exit, r0, r1, r2, r3, r12 corrupted + * On exit, r0, r1, r2, r3, r9, r12 corrupted * This routine must preserve: r4, r6, r7 */ .align 5 @@ -838,10 +836,8 @@ /* * Clean and flush the cache to maintain consistency. 
* - * On entry, - r6 = processor ID * On exit, - * r1, r2, r3, r11, r12 corrupted + * r1, r2, r3, r9, r11, r12 corrupted * This routine must preserve: * r0, r4, r5, r6, r7 */ @@ -953,7 +949,7 @@ mov r2, #64*1024 @ default: 32K dcache size (*2) mov r11, #32 @ default: 32 byte line size mrc p15, 0, r3, c0, c0, 1 @ read cache type - teq r3, r6 @ cache ID register present? + teq r3, r9 @ cache ID register present? beq no_cache_id mov r1, r3, lsr #18 and r1, r1, #7
--- linux-ec2-2.6.32.orig/arch/arm/boot/compressed/vmlinux.lds.in +++ linux-ec2-2.6.32/arch/arm/boot/compressed/vmlinux.lds.in @@ -36,6 +36,9 @@ _etext = .; + /* Assume size of decompressed image is 4x the compressed image */ + _image_size = (_etext - _text) * 4; + _got_start = .; .got : { *(.got) } _got_end = .;
--- linux-ec2-2.6.32.orig/arch/arm/include/asm/mman.h +++ linux-ec2-2.6.32/arch/arm/include/asm/mman.h @@ -1 +1,4 @@ #include + +#define arch_mmap_check(addr, len, flags) \ + (((flags) & MAP_FIXED && (addr) < FIRST_USER_ADDRESS) ? -EINVAL : 0)
--- linux-ec2-2.6.32.orig/arch/arm/kernel/calls.S +++ linux-ec2-2.6.32/arch/arm/kernel/calls.S @@ -172,7 +172,7 @@ /* 160 */ CALL(sys_sched_get_priority_min) CALL(sys_sched_rr_get_interval) CALL(sys_nanosleep) - CALL(sys_arm_mremap) + CALL(sys_mremap) CALL(sys_setresuid16) /* 165 */ CALL(sys_getresuid16) CALL(sys_ni_syscall) /* vm86 */
--- linux-ec2-2.6.32.orig/arch/arm/kernel/entry-common.S +++ linux-ec2-2.6.32/arch/arm/kernel/entry-common.S @@ -416,12 +416,12 @@ tst r5, #PGOFF_MASK moveq r5, r5, lsr #PAGE_SHIFT - 12 streq r5, [sp, #4] - beq do_mmap2 + beq sys_mmap_pgoff mov r0, #-EINVAL mov pc, lr #else str r5, [sp, #4] - b do_mmap2 + b sys_mmap_pgoff #endif ENDPROC(sys_mmap2)
--- linux-ec2-2.6.32.orig/arch/arm/kernel/sys_arm.c +++ linux-ec2-2.6.32/arch/arm/kernel/sys_arm.c @@ -28,41 +28,6 @@ #include #include -extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, - unsigned long new_len, unsigned long flags, - unsigned long new_addr); - -/* common code for old and new mmaps */ -inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EINVAL; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - if (flags & MAP_FIXED && addr < FIRST_USER_ADDRESS) - goto out; - - error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - struct mmap_arg_struct { unsigned long addr; unsigned long len; @@ -84,29 +49,11 @@ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return error; } -asmlinkage unsigned long -sys_arm_mremap(unsigned long addr, unsigned long old_len, - unsigned long new_len, unsigned long flags, - unsigned long new_addr) -{ - unsigned long ret = -EINVAL; - - if (flags & MREMAP_FIXED && new_addr < FIRST_USER_ADDRESS) - goto out; - - down_write(&current->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(&current->mm->mmap_sem); - -out: - return ret; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls.
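The do_mmap2()/sys_arm_mremap() removal above is the first of many: each architecture below drops its private mmap wrapper in favour of one generic syscall. Roughly, the consolidated helper looks like this sketch (simplified; the real sys_mmap_pgoff in mm/ additionally handles MAP_HUGETLB):

	SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
			unsigned long, prot, unsigned long, flags,
			unsigned long, fd, unsigned long, pgoff)
	{
		struct file *file = NULL;
		unsigned long retval = -EBADF;

		flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
		if (!(flags & MAP_ANONYMOUS)) {
			file = fget(fd);	/* pin the backing file */
			if (!file)
				goto out;
		}

		down_write(&current->mm->mmap_sem);
		retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
		up_write(&current->mm->mmap_sem);

		if (file)
			fput(file);
	out:
		return retval;
	}

Per-architecture quirks (minimum mapping address, cache colouring, non-4K page scaling) stay behind in small arch hooks such as the arch_mmap_check() added to arch/arm/include/asm/mman.h above.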
--- linux-ec2-2.6.32.orig/arch/arm/mach-davinci/dm646x.c +++ linux-ec2-2.6.32/arch/arm/mach-davinci/dm646x.c @@ -789,7 +789,14 @@ .part_no = 0xb770, .manufacturer = 0x017, .cpu_id = DAVINCI_CPU_ID_DM6467, - .name = "dm6467", + .name = "dm6467_rev1.x", + }, + { + .variant = 0x1, + .part_no = 0xb770, + .manufacturer = 0x017, + .cpu_id = DAVINCI_CPU_ID_DM6467, + .name = "dm6467_rev3.x", }, }; --- linux-ec2-2.6.32.orig/arch/arm/mach-pxa/em-x270.c +++ linux-ec2-2.6.32/arch/arm/mach-pxa/em-x270.c @@ -497,16 +497,15 @@ goto err_free_vbus_gpio; /* USB Hub power-on and reset */ - gpio_direction_output(usb_hub_reset, 0); + gpio_direction_output(usb_hub_reset, 1); + gpio_direction_output(GPIO9_USB_VBUS_EN, 0); regulator_enable(em_x270_usb_ldo); - gpio_set_value(usb_hub_reset, 1); gpio_set_value(usb_hub_reset, 0); + gpio_set_value(usb_hub_reset, 1); regulator_disable(em_x270_usb_ldo); regulator_enable(em_x270_usb_ldo); - gpio_set_value(usb_hub_reset, 1); - - /* enable VBUS */ - gpio_direction_output(GPIO9_USB_VBUS_EN, 1); + gpio_set_value(usb_hub_reset, 0); + gpio_set_value(GPIO9_USB_VBUS_EN, 1); return 0; --- linux-ec2-2.6.32.orig/arch/arm/mach-versatile/Kconfig +++ linux-ec2-2.6.32/arch/arm/mach-versatile/Kconfig @@ -3,14 +3,14 @@ config ARCH_VERSATILE_PB bool "Support Versatile/PB platform" - select CPU_ARM926T + select CPU_V7 default y help Include support for the ARM(R) Versatile/PB platform. config MACH_VERSATILE_AB bool "Support Versatile/AB platform" - select CPU_ARM926T + select CPU_V7 help Include support for the ARM(R) Versatile/AP platform. --- linux-ec2-2.6.32.orig/arch/arm/mm/mmap.c +++ linux-ec2-2.6.32/arch/arm/mm/mmap.c @@ -54,7 +54,8 @@ * We enforce the MAP_FIXED case. */ if (flags & MAP_FIXED) { - if (aliasing && flags & MAP_SHARED && addr & (SHMLBA - 1)) + if (aliasing && flags & MAP_SHARED && + (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr; } --- linux-ec2-2.6.32.orig/arch/avr32/include/asm/syscalls.h +++ linux-ec2-2.6.32/arch/avr32/include/asm/syscalls.h @@ -29,10 +29,6 @@ struct pt_regs *); asmlinkage int sys_rt_sigreturn(struct pt_regs *); -/* kernel/sys_avr32.c */ -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, off_t); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); --- linux-ec2-2.6.32.orig/arch/avr32/kernel/sys_avr32.c +++ linux-ec2-2.6.32/arch/avr32/kernel/sys_avr32.c @@ -5,39 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ -#include -#include -#include -#include #include -#include -#include -#include - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return error; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, offset); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); - return error; -} - int kernel_execve(const char *file, char **argv, char **envp) { register long scno asm("r8") = __NR_execve;
--- linux-ec2-2.6.32.orig/arch/avr32/kernel/syscall-stubs.S +++ linux-ec2-2.6.32/arch/avr32/kernel/syscall-stubs.S @@ -61,7 +61,7 @@ __sys_mmap2: pushm lr st.w --sp, ARG6 - call sys_mmap2 + call sys_mmap_pgoff sub sp, -4 popm pc
--- linux-ec2-2.6.32.orig/arch/blackfin/include/asm/page.h +++ linux-ec2-2.6.32/arch/blackfin/include/asm/page.h @@ -10,4 +10,9 @@ #include #define MAP_NR(addr) (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT) +#define VM_DATA_DEFAULT_FLAGS \ + (VM_READ | VM_WRITE | \ + ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + #endif
--- linux-ec2-2.6.32.orig/arch/blackfin/kernel/sys_bfin.c +++ linux-ec2-2.6.32/arch/blackfin/kernel/sys_bfin.c @@ -22,39 +22,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long -do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); - out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - asmlinkage void *sys_sram_alloc(size_t size, unsigned long flags) { return sram_alloc_with_lsl(size, flags);
--- linux-ec2-2.6.32.orig/arch/blackfin/mach-common/entry.S +++ linux-ec2-2.6.32/arch/blackfin/mach-common/entry.S @@ -1422,7 +1422,7 @@ .long _sys_ni_syscall /* streams2 */ .long _sys_vfork /* 190 */ .long _sys_getrlimit - .long _sys_mmap2 + .long _sys_mmap_pgoff .long _sys_truncate64 .long _sys_ftruncate64 .long _sys_stat64 /* 195 */
--- linux-ec2-2.6.32.orig/arch/cris/kernel/sys_cris.c +++ linux-ec2-2.6.32/arch/cris/kernel/sys_cris.c @@ -26,31 +26,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage unsigned long old_mmap(unsigned long __user *args) { unsigned long buffer[6]; @@ -63,7 +38,7 @@ if (buffer[5] & ~PAGE_MASK) /* verify that offset is on page boundary
*/ goto out; - err = do_mmap2(buffer[0], buffer[1], buffer[2], buffer[3], + err = sys_mmap_pgoff(buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], buffer[5] >> PAGE_SHIFT); out: return err; @@ -73,7 +48,8 @@ sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* bug(?): 8Kb pages here */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /*
--- linux-ec2-2.6.32.orig/arch/frv/include/asm/page.h +++ linux-ec2-2.6.32/arch/frv/include/asm/page.h @@ -63,12 +63,10 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_MMU #define VM_DATA_DEFAULT_FLAGS \ (VM_READ | VM_WRITE | \ ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif #endif /* __ASSEMBLY__ */
--- linux-ec2-2.6.32.orig/arch/frv/kernel/sys_frv.c +++ linux-ec2-2.6.32/arch/frv/kernel/sys_frv.c @@ -31,9 +31,6 @@ unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - int error = -EBADF; - struct file * file = NULL; - /* As with sparc32, make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have.... */ @@ -41,69 +38,10 @@ trying to map something we can't */ if (pgoff & ((1 << (PAGE_SHIFT - 12)) - 1)) return -EINVAL; - pgoff >>= PAGE_SHIFT - 12; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); -out: - return error; + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } -#endif /* * sys_ipc() is the de-multiplexer for the SysV IPC calls..
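The PAGE_SHIFT - 12 arithmetic in the frv hunk above deserves a concrete reading: the mmap2 ABI fixes its offset argument in 4 KB units regardless of the kernel's page size, so larger-page kernels must reject offsets that do not fall on a page boundary and rescale the rest. A sketch under the assumption of a 16 KB page kernel (PAGE_SHIFT == 14, so four 4 KB units per page; the helper name is invented):

	/* pgoff4k arrives in fixed 4 KB units, as mmap2 defines it. */
	static long mmap2_units_to_pgoff(unsigned long pgoff4k)
	{
		if (pgoff4k & ((1UL << (PAGE_SHIFT - 12)) - 1))
			return -EINVAL;			/* e.g. 12 KB: not page-aligned */
		return pgoff4k >> (PAGE_SHIFT - 12);	/* e.g. 32 KB = 8 units -> page 2 */
	}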
--- linux-ec2-2.6.32.orig/arch/h8300/kernel/sys_h8300.c +++ linux-ec2-2.6.32/arch/h8300/kernel/sys_h8300.c @@ -26,39 +26,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/m68k cloned Linux/i386, which didn't use to be able to @@ -87,58 +54,12 @@ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); -out: - return error; -} -#endif - struct sel_arg_struct { unsigned long n; fd_set *inp, *outp, *exp;
--- linux-ec2-2.6.32.orig/arch/h8300/kernel/syscalls.S +++ linux-ec2-2.6.32/arch/h8300/kernel/syscalls.S @@ -206,7 +206,7 @@ .long SYMBOL_NAME(sys_ni_syscall) /* streams2 */ .long SYMBOL_NAME(sys_vfork) /* 190 */ .long SYMBOL_NAME(sys_getrlimit) - .long SYMBOL_NAME(sys_mmap2) + .long SYMBOL_NAME(sys_mmap_pgoff) .long SYMBOL_NAME(sys_truncate64) .long SYMBOL_NAME(sys_ftruncate64) .long SYMBOL_NAME(sys_stat64) /* 195 */
--- linux-ec2-2.6.32.orig/arch/ia64/Kconfig +++ linux-ec2-2.6.32/arch/ia64/Kconfig @@ -231,7 +231,7 @@ config IA64_XEN_GUEST bool "Xen guest" select SWIOTLB - depends on XEN + depends on PARAVIRT_XEN help Build a kernel that runs on Xen guest domain. At this moment only 16KB page size is supported.
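The depends-on change above belongs to a tree-wide XEN to PARAVIRT_XEN rename on ia64: it keeps the pvops guest port's Kconfig symbol distinct from the forward-ported XenoLinux patches that claim CONFIG_XEN elsewhere in this tree. Code shared between the two builds can then test the new symbol; a hypothetical helper (the function name is invented) on top of the xen_domain_type definitions patched below:

	#include <asm/xen/hypervisor.h>

	static inline int running_on_pv_xen(void)
	{
	#ifdef CONFIG_PARAVIRT_XEN
		return xen_domain_type != XEN_NATIVE;
	#else
		return 0;	/* the header pins xen_domain_type to XEN_NATIVE */
	#endif
	}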
--- linux-ec2-2.6.32.orig/arch/ia64/Makefile +++ linux-ec2-2.6.32/arch/ia64/Makefile @@ -56,7 +56,7 @@ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ core-$(CONFIG_KVM) += arch/ia64/kvm/ -core-$(CONFIG_XEN) += arch/ia64/xen/ +core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
--- linux-ec2-2.6.32.orig/arch/ia64/ia32/sys_ia32.c +++ linux-ec2-2.6.32/arch/ia64/ia32/sys_ia32.c @@ -858,6 +858,9 @@ prot = get_prot32(prot); + if (flags & MAP_HUGETLB) + return -ENOMEM; + #if PAGE_SHIFT > IA32_PAGE_SHIFT mutex_lock(&ia32_mmap_mutex); {
--- linux-ec2-2.6.32.orig/arch/ia64/include/asm/acpi.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/acpi.h @@ -94,6 +94,7 @@ #define acpi_noirq 0 /* ACPI always enabled on IA64 */ #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */ #define acpi_strict 1 /* no ACPI spec workarounds on IA64 */ +#define acpi_ht 0 /* no HT-only mode on IA64 */ #endif #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */ static inline void disable_acpi(void) { }
--- linux-ec2-2.6.32.orig/arch/ia64/include/asm/io.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/io.h @@ -424,6 +424,8 @@ extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); /* * String version of IO memory access ops:
--- linux-ec2-2.6.32.orig/arch/ia64/include/asm/xen/hypervisor.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/xen/hypervisor.h @@ -34,10 +34,12 @@ #define _ASM_IA64_XEN_HYPERVISOR_H #include +#ifdef CONFIG_PARAVIRT_XEN #include #include /* to compile feature.c */ #include /* to compile xen-netfront.c */ #include +#endif /* xen_domain_type is set before executing any C code by early_xen_setup */ enum xen_domain_type { @@ -46,7 +48,7 @@ XEN_HVM_DOMAIN, /* running in a Xen hvm domain*/ }; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern enum xen_domain_type xen_domain_type; #else #define xen_domain_type XEN_NATIVE @@ -66,7 +68,7 @@ #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info;
--- linux-ec2-2.6.32.orig/arch/ia64/include/asm/xen/interface.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/xen/interface.h @@ -56,29 +56,21 @@ #ifndef _ASM_IA64_XEN_INTERFACE_H #define _ASM_IA64_XEN_INTERFACE_H -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name -#define GUEST_HANDLE_64(name) GUEST_HANDLE(name) + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types.
*/ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -__DEFINE_GUEST_HANDLE(u64, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); +__DEFINE_XEN_GUEST_HANDLE(u64, unsigned long); +typedef unsigned long xen_ulong_t; typedef unsigned long xen_pfn_t; -DEFINE_GUEST_HANDLE(xen_pfn_t); #define PRI_xen_pfn "lx" #endif
--- linux-ec2-2.6.32.orig/arch/ia64/kernel/asm-offsets.c +++ linux-ec2-2.6.32/arch/ia64/kernel/asm-offsets.c @@ -290,7 +290,7 @@ DEFINE(IA64_ITC_LASTCYCLE_OFFSET, offsetof (struct itc_jitter_data_t, itc_lastcycle)); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); DEFINE(XEN_NATIVE_ASM, XEN_NATIVE);
--- linux-ec2-2.6.32.orig/arch/ia64/kernel/sys_ia64.c +++ linux-ec2-2.6.32/arch/ia64/kernel/sys_ia64.c @@ -100,51 +100,7 @@ asmlinkage unsigned long ia64_brk (unsigned long brk) { - unsigned long rlim, retval, newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - /* - * Most of this replicates the code in sys_brk() except for an additional safety - * check and the clearing of r8. However, we can't call sys_brk() because we need - * to acquire the mmap_sem before we can do the test... - */ - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against unimplemented/unmapped addresses: */ - if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) - goto out; - - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); + unsigned long retval = sys_brk(brk); force_successful_syscall_return(); return retval; } @@ -185,39 +141,6 @@ return 0; } -static inline unsigned long -do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) -{ - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - - if (!file->f_op || !file->f_op->mmap) { - addr = -ENODEV; - goto out; - } - } - - /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { - addr = -EINVAL; - goto out; - } - - down_write(&current->mm->mmap_sem); - addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - -out: if (file) - fput(file); - return addr; -} - /* * mmap2() is like mmap() except that the offset is expressed in units * of PAGE_SIZE (instead of bytes).
This allows to mmap2() (pieces @@ -226,7 +149,7 @@ asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) { - addr = do_mmap2(addr, len, prot, flags, fd, pgoff); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -238,7 +161,7 @@ if (offset_in_page(off) != 0) return -EINVAL; - addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr;
--- linux-ec2-2.6.32.orig/arch/ia64/kernel/vmlinux.lds.S +++ linux-ec2-2.6.32/arch/ia64/kernel/vmlinux.lds.S @@ -176,7 +176,7 @@ __start_gate_section = .; *(.data.gate) __stop_gate_section = .; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN . = ALIGN(PAGE_SIZE); __xen_start_gate_section = .; *(.data.gate.xen)
--- linux-ec2-2.6.32.orig/arch/ia64/mm/ioremap.c +++ linux-ec2-2.6.32/arch/ia64/mm/ioremap.c @@ -22,6 +22,12 @@ } void __iomem * +early_ioremap (unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr); +} + +void __iomem * ioremap (unsigned long phys_addr, unsigned long size) { void __iomem *addr; @@ -102,6 +108,11 @@ EXPORT_SYMBOL(ioremap_nocache); void +early_iounmap (volatile void __iomem *addr, unsigned long size) +{ +} + +void iounmap (volatile void __iomem *addr) { if (REGION_NUMBER(addr) == RGN_GATE)
--- linux-ec2-2.6.32.orig/arch/ia64/xen/Kconfig +++ linux-ec2-2.6.32/arch/ia64/xen/Kconfig @@ -2,7 +2,7 @@ # This Kconfig describes xen/ia64 options # -config XEN +config PARAVIRT_XEN bool "Xen hypervisor support" default y depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL @@ -17,9 +17,9 @@ both as a guest OS on Xen and natively on hardware. config XEN_XENCOMM - depends on XEN + depends on PARAVIRT_XEN bool config NO_IDLE_HZ - depends on XEN + depends on PARAVIRT_XEN bool
--- linux-ec2-2.6.32.orig/arch/ia64/xen/xcom_hcall.c +++ linux-ec2-2.6.32/arch/ia64/xen/xcom_hcall.c @@ -343,7 +343,7 @@ int xencomm_hypercall_memory_op(unsigned int cmd, void *arg) { - GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; struct xen_memory_reservation *xmr = NULL; int rc; struct xencomm_handle *desc;
--- linux-ec2-2.6.32.orig/arch/m32r/kernel/sys_m32r.c +++ linux-ec2-2.6.32/arch/m32r/kernel/sys_m32r.c @@ -76,30 +76,6 @@ return oldval; } -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls..
* --- linux-ec2-2.6.32.orig/arch/m32r/kernel/syscall_table.S +++ linux-ec2-2.6.32/arch/m32r/kernel/syscall_table.S @@ -191,7 +191,7 @@ .long sys_ni_syscall /* streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */
--- linux-ec2-2.6.32.orig/arch/m68k/kernel/sys_m68k.c +++ linux-ec2-2.6.32/arch/m68k/kernel/sys_m68k.c @@ -29,37 +29,16 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* + * This is wrong for sun3 - there PAGE_SIZE is 8Kb, + * so we need to shift the argument down by 1; m68k mmap64(3) + * (in libc) expects the last argument of mmap2 in 4Kb units. + */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* @@ -90,57 +69,11 @@ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); -out: - return error; -} - -#if 0 -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#endif struct sel_arg_struct { unsigned long n;
--- linux-ec2-2.6.32.orig/arch/m68knommu/kernel/sys_m68k.c +++ linux-ec2-2.6.32/arch/m68knommu/kernel/sys_m68k.c @@ -27,39 +27,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff)
-{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/m68k cloned Linux/i386, which didn't use to be able to @@ -88,9 +55,8 @@ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; }
--- linux-ec2-2.6.32.orig/arch/m68knommu/kernel/syscalltable.S +++ linux-ec2-2.6.32/arch/m68knommu/kernel/syscalltable.S @@ -210,7 +210,7 @@ .long sys_ni_syscall /* streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */
--- linux-ec2-2.6.32.orig/arch/microblaze/kernel/sys_microblaze.c +++ linux-ec2-2.6.32/arch/microblaze/kernel/sys_microblaze.c @@ -62,46 +62,14 @@ return error; } -asmlinkage long -sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - int ret = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) { - printk(KERN_INFO "no fd in mmap\r\n"); - goto out; - } - } - - down_write(&current->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); -out: - return ret; -} - asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, off_t pgoff) { - int err = -EINVAL; - - if (pgoff & ~PAGE_MASK) { - printk(KERN_INFO "no pagemask in mmap\r\n"); - goto out; - } + if (pgoff & ~PAGE_MASK) + return -EINVAL; - err = sys_mmap2(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); -out: - return err; + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); } /*
--- linux-ec2-2.6.32.orig/arch/microblaze/kernel/syscall_table.S +++ linux-ec2-2.6.32/arch/microblaze/kernel/syscall_table.S @@ -196,7 +196,7 @@ .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 /* mmap2 */ + .long sys_mmap_pgoff /* mmap2 */ .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */
--- linux-ec2-2.6.32.orig/arch/mips/kernel/linux32.c +++ linux-ec2-2.6.32/arch/mips/kernel/linux32.c @@ -67,28 +67,13 @@ unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { - struct file * file = NULL; unsigned long error; error = -EINVAL; if (pgoff & (~PAGE_MASK >> 12)) goto out; - pgoff >>= PAGE_SHIFT-12; - - if (!(flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); - + error = sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT-12)); out: return error; }
--- linux-ec2-2.6.32.orig/arch/mips/kernel/syscall.c +++ linux-ec2-2.6.32/arch/mips/kernel/syscall.c @@ -93,7 +93,8 @@ * We do not accept a shared mapping if it would violate * cache aliasing constraints.
*/ - if ((flags & MAP_SHARED) && (addr & shm_align_mask)) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) return -EINVAL; return addr; } @@ -129,31 +130,6 @@ } } -/* common code for old and new mmaps */ -static inline unsigned long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long pgoff) -{ - unsigned long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, off_t, offset) @@ -164,7 +140,7 @@ if (offset & ~PAGE_MASK) goto out; - result = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + result = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return result; @@ -177,7 +153,7 @@ if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); } save_static_function(sys_fork);
--- linux-ec2-2.6.32.orig/arch/mips/mm/tlbex.c +++ linux-ec2-2.6.32/arch/mips/mm/tlbex.c @@ -73,9 +73,6 @@ enum label_id { label_second_part = 1, label_leave, -#ifdef MODULE_START - label_module_alloc, -#endif label_vmalloc, label_vmalloc_done, label_tlbw_hazard, @@ -92,9 +89,6 @@ UASM_L_LA(_second_part) UASM_L_LA(_leave) -#ifdef MODULE_START -UASM_L_LA(_module_alloc) -#endif UASM_L_LA(_vmalloc) UASM_L_LA(_vmalloc_done) UASM_L_LA(_tlbw_hazard) @@ -802,8 +796,6 @@ } else { #if defined(CONFIG_HUGETLB_PAGE) const enum label_id ls = label_tlb_huge_update; -#elif defined(MODULE_START) - const enum label_id ls = label_module_alloc; #else const enum label_id ls = label_vmalloc; #endif
--- linux-ec2-2.6.32.orig/arch/mn10300/include/asm/mman.h +++ linux-ec2-2.6.32/arch/mn10300/include/asm/mman.h @@ -1 +1,6 @@ #include + +#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ + +#define arch_mmap_check(addr, len, flags) \ + (((flags) & MAP_FIXED && (addr) < MIN_MAP_ADDR) ?
-EINVAL : 0)
--- linux-ec2-2.6.32.orig/arch/mn10300/kernel/entry.S +++ linux-ec2-2.6.32/arch/mn10300/kernel/entry.S @@ -578,7 +578,7 @@ .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */
--- linux-ec2-2.6.32.orig/arch/mn10300/kernel/sys_mn10300.c +++ linux-ec2-2.6.32/arch/mn10300/kernel/sys_mn10300.c @@ -23,47 +23,13 @@ #include -#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ - -/* - * memory mapping syscall - */ -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - long error = -EINVAL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - if (flags & MAP_FIXED && addr < MIN_MAP_ADDR) - goto out; - - error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) { if (offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } struct sel_arg_struct {
--- linux-ec2-2.6.32.orig/arch/parisc/kernel/sys_parisc.c +++ linux-ec2-2.6.32/arch/parisc/kernel/sys_parisc.c @@ -110,37 +110,14 @@ return addr; } -static unsigned long do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long fd, - unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file != NULL) - fput(file); -out: - return error; -} - asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have.
*/ - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, @@ -148,7 +125,8 @@ unsigned long offset) { if (!(offset & ~PAGE_MASK)) { - return do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> PAGE_SHIFT); } else { return -EINVAL; } --- linux-ec2-2.6.32.orig/arch/powerpc/Kconfig +++ linux-ec2-2.6.32/arch/powerpc/Kconfig @@ -935,6 +935,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "fs/Kconfig" source "arch/powerpc/sysdev/qe_lib/Kconfig" --- linux-ec2-2.6.32.orig/arch/powerpc/include/asm/elf.h +++ linux-ec2-2.6.32/arch/powerpc/include/asm/elf.h @@ -236,14 +236,10 @@ #ifdef __powerpc64__ # define SET_PERSONALITY(ex) \ do { \ - unsigned long new_flags = 0; \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags = _TIF_32BIT; \ - if ((current_thread_info()->flags & _TIF_32BIT) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ + set_thread_flag(TIF_32BIT); \ else \ - clear_thread_flag(TIF_ABI_PENDING); \ + clear_thread_flag(TIF_32BIT); \ if (personality(current->personality) != PER_LINUX32) \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ --- linux-ec2-2.6.32.orig/arch/powerpc/include/asm/module.h +++ linux-ec2-2.6.32/arch/powerpc/include/asm/module.h @@ -87,5 +87,10 @@ void sort_ex_table(struct exception_table_entry *start, struct exception_table_entry *finish); +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB + +extern const unsigned long reloc_start[]; +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MODULE_H */ --- linux-ec2-2.6.32.orig/arch/powerpc/include/asm/thread_info.h +++ linux-ec2-2.6.32/arch/powerpc/include/asm/thread_info.h @@ -111,7 +111,6 @@ #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_FREEZE 14 /* Freezing for suspend */ #define TIF_RUNLATCH 15 /* Is the runlatch enabled? 
*/ -#define TIF_ABI_PENDING 16 /* 32/64 bit switch needed */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<thread.vr[reg - 32]; - if (flags & ST) - ret = __copy_to_user(addr, ptr, length); - else { - if (flags & SPLT){ - ret = __copy_from_user(ptr, addr, length); - ptr += length; + lptr = (unsigned long *) ptr; + + if (flags & SW) + sw = elsize-1; + + for (j = 0; j < length; j += elsize) { + for (i = 0; i < elsize; ++i) { + if (flags & ST) + ret |= __put_user(ptr[i^sw], addr + i); + else + ret |= __get_user(ptr[i^sw], addr + i); } - ret |= __copy_from_user(ptr, addr, length); + ptr += elsize; + addr += elsize; } - if (flags & U) - regs->gpr[areg] = regs->dar; - if (ret) + + if (!ret) { + if (flags & U) + regs->gpr[areg] = regs->dar; + + /* Splat load copies the same data to top and bottom 8 bytes */ + if (flags & SPLT) + lptr[1] = lptr[0]; + /* For 8 byte loads, zero the top 8 bytes */ + else if (!(flags & ST) && (8 == length)) + lptr[1] = 0; + } else return -EFAULT; + return 1; } #endif @@ -767,16 +787,25 @@ #ifdef CONFIG_VSX if ((instruction & 0xfc00003e) == 0x7c000018) { - /* Additional register addressing bit (64 VSX vs 32 FPR/GPR */ + unsigned int elsize; + + /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ reg |= (instruction & 0x1) << 5; /* Simple inline decoder instead of a table */ + /* VSX has only 8 and 16 byte memory accesses */ + nb = 8; if (instruction & 0x200) nb = 16; - else if (instruction & 0x080) - nb = 8; - else - nb = 4; + + /* Vector stores in little-endian mode swap individual + elements, so process them separately */ + elsize = 4; + if (instruction & 0x80) + elsize = 8; + flags = 0; + if (regs->msr & MSR_LE) + flags |= SW; if (instruction & 0x100) flags |= ST; if (instruction & 0x040) @@ -787,7 +816,7 @@ nb = 8; } PPC_WARN_EMULATED(vsx); - return emulate_vsx(addr, reg, areg, regs, flags, nb); + return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); } #endif /* A size of 0 indicates an instruction we don't support, with --- linux-ec2-2.6.32.orig/arch/powerpc/kernel/pci-common.c +++ linux-ec2-2.6.32/arch/powerpc/kernel/pci-common.c @@ -1107,6 +1107,12 @@ list_for_each_entry(dev, &bus->devices, bus_list) { struct dev_archdata *sd = &dev->dev.archdata; + /* Cardbus can call us to add new devices to a bus, so ignore + * those who are already fully discovered + */ + if (dev->is_added) + continue; + /* Setup OF node pointer in archdata */ sd->of_node = pci_device_to_OF_node(dev); @@ -1147,6 +1153,13 @@ } EXPORT_SYMBOL(pcibios_fixup_bus); +void __devinit pci_fixup_cardbus(struct pci_bus *bus) +{ + /* Now fixup devices on that bus */ + pcibios_setup_bus_devices(bus); +} + + static int skip_isa_ioresource_align(struct pci_dev *dev) { if ((ppc_pci_flags & PPC_PCI_CAN_SKIP_ISA_ALIGN) && --- linux-ec2-2.6.32.orig/arch/powerpc/kernel/process.c +++ linux-ec2-2.6.32/arch/powerpc/kernel/process.c @@ -554,18 +554,6 @@ void flush_thread(void) { -#ifdef CONFIG_PPC64 - struct thread_info *t = current_thread_info(); - - if (test_ti_thread_flag(t, TIF_ABI_PENDING)) { - clear_ti_thread_flag(t, TIF_ABI_PENDING); - if (test_ti_thread_flag(t, TIF_32BIT)) - clear_ti_thread_flag(t, TIF_32BIT); - else - set_ti_thread_flag(t, TIF_32BIT); - } -#endif - discard_lazy_cpu_state(); if (current->thread.dabr) { --- linux-ec2-2.6.32.orig/arch/powerpc/kernel/syscalls.c +++ linux-ec2-2.6.32/arch/powerpc/kernel/syscalls.c @@ -140,7 +140,6 @@ unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - struct file * file = NULL; unsigned 
long ret = -EINVAL; if (!arch_validate_prot(prot)) @@ -151,20 +150,8 @@ goto out; off >>= shift; } - - ret = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - if (!(file = fget(fd))) - goto out; - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(&current->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, off); - up_write(&current->mm->mmap_sem); - if (file) - fput(file); + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off); out: return ret; } --- linux-ec2-2.6.32.orig/arch/powerpc/kernel/vector.S +++ linux-ec2-2.6.32/arch/powerpc/kernel/vector.S @@ -58,7 +58,7 @@ * all 1's */ mfspr r4,SPRN_VRSAVE - cmpdi 0,r4,0 + cmpwi 0,r4,0 bne+ 1f li r4,-1 mtspr SPRN_VRSAVE,r4 --- linux-ec2-2.6.32.orig/arch/powerpc/kernel/vmlinux.lds.S +++ linux-ec2-2.6.32/arch/powerpc/kernel/vmlinux.lds.S @@ -38,6 +38,9 @@ #endif SECTIONS { + . = 0; + reloc_start = .; + . = KERNELBASE; /* --- linux-ec2-2.6.32.orig/arch/powerpc/sysdev/fsl_pci.c +++ linux-ec2-2.6.32/arch/powerpc/sysdev/fsl_pci.c @@ -392,8 +392,22 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641D, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8610, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1011E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1011, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1013E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1013, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1020E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1020, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1022E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1022, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2010E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2010, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4040E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4040, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080, quirk_fsl_pcie_header); #endif /* CONFIG_PPC_85xx || CONFIG_PPC_86xx */ #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x) --- linux-ec2-2.6.32.orig/arch/s390/include/asm/kvm.h +++ linux-ec2-2.6.32/arch/s390/include/asm/kvm.h @@ -1,6 +1,5 @@ #ifndef __LINUX_KVM_S390_H #define __LINUX_KVM_S390_H - /* * asm-s390/kvm.h - KVM s390 specific structures and definitions * @@ -15,6 +14,8 @@ */ #include +#define __KVM_S390 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ --- linux-ec2-2.6.32.orig/arch/s390/kernel/compat_linux.c +++ linux-ec2-2.6.32/arch/s390/kernel/compat_linux.c @@ -683,38 +683,6 @@ u32 offset; }; -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto
out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - if (!IS_ERR((void *) error) && error + len >= 0x80000000ULL) { - /* Result is out of bounds. */ - do_munmap(current->mm, addr, len); - error = -ENOMEM; - } - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - - asmlinkage unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg) { @@ -728,7 +696,8 @@ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } @@ -741,7 +710,7 @@ if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } --- linux-ec2-2.6.32.orig/arch/s390/kernel/entry.S +++ linux-ec2-2.6.32/arch/s390/kernel/entry.S @@ -571,6 +571,7 @@ mvc __THREAD_per+__PER_access_id(1,%r8),__LC_PER_ACCESS_ID oi __TI_flags+3(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP TRACE_IRQS_ON + lm %r2,%r6,SP_R2(%r15) # load svc arguments stosm __SF_EMPTY(%r15),0x03 # reenable interrupts b BASED(sysc_do_svc) --- linux-ec2-2.6.32.orig/arch/s390/kernel/entry64.S +++ linux-ec2-2.6.32/arch/s390/kernel/entry64.S @@ -549,6 +549,7 @@ mvc __THREAD_per+__PER_access_id(1,%r8),__LC_PER_ACCESS_ID oi __TI_flags+7(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP TRACE_IRQS_ON + lmg %r2,%r6,SP_R2(%r15) # load svc arguments stosm __SF_EMPTY(%r15),0x03 # reenable interrupts j sysc_do_svc --- linux-ec2-2.6.32.orig/arch/s390/kernel/head64.S +++ linux-ec2-2.6.32/arch/s390/kernel/head64.S @@ -83,6 +83,8 @@ slr %r0,%r0 # set cpuid to zero sigp %r1,%r0,0x12 # switch to esame mode sam64 # switch to 64 bit mode + llgfr %r13,%r13 # clear high-order half of base reg + lmh %r0,%r15,.Lzero64-.LPG1(%r13) # clear high-order half lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore @@ -127,6 +129,7 @@ .L4malign:.quad 0xffffffffffc00000 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 .Lnop: .long 0x07000700 +.Lzero64:.fill 16,4,0x0 #ifdef CONFIG_ZFCPDUMP .Lcurrent_cpu: .long 0x0 --- linux-ec2-2.6.32.orig/arch/s390/kernel/sys_s390.c +++ linux-ec2-2.6.32/arch/s390/kernel/sys_s390.c @@ -32,32 +32,6 @@ #include #include "entry.h" -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls.
Linux for S/390 isn't able to handle more than 5 @@ -81,7 +55,7 @@ if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } @@ -98,7 +72,7 @@ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return error; } --- linux-ec2-2.6.32.orig/arch/s390/kvm/intercept.c +++ linux-ec2-2.6.32/arch/s390/kvm/intercept.c @@ -213,7 +213,7 @@ return rc2; } -static const intercept_handler_t intercept_funcs[0x48 >> 2] = { +static const intercept_handler_t intercept_funcs[] = { [0x00 >> 2] = handle_noop, [0x04 >> 2] = handle_instruction, [0x08 >> 2] = handle_prog, @@ -230,7 +230,7 @@ intercept_handler_t func; u8 code = vcpu->arch.sie_block->icptcode; - if (code & 3 || code > 0x48) + if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) return -ENOTSUPP; func = intercept_funcs[code >> 2]; if (func) --- linux-ec2-2.6.32.orig/arch/s390/kvm/kvm-s390.c +++ linux-ec2-2.6.32/arch/s390/kvm/kvm-s390.c @@ -116,10 +116,16 @@ int kvm_dev_ioctl_check_extension(long ext) { + int r; + switch (ext) { + case KVM_CAP_S390_PSW: + r = 1; + break; default: - return 0; + r = 0; } + return r; } /* Section: vm related */ @@ -419,8 +425,10 @@ vcpu_load(vcpu); if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) rc = -EBUSY; - else - vcpu->arch.sie_block->gpsw = psw; + else { + vcpu->run->psw_mask = psw.mask; + vcpu->run->psw_addr = psw.addr; + } vcpu_put(vcpu); return rc; } @@ -508,9 +516,6 @@ switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: - vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask; - vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; - break; case KVM_EXIT_UNKNOWN: case KVM_EXIT_INTR: case KVM_EXIT_S390_RESET: @@ -519,6 +524,9 @@ BUG(); } + vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; + vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; + might_fault(); do { @@ -538,8 +546,6 @@ /* intercept cannot be handled in-kernel, prepare kvm-run */ kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode; - kvm_run->s390_sieic.mask = vcpu->arch.sie_block->gpsw.mask; - kvm_run->s390_sieic.addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s390_sieic.ipa = vcpu->arch.sie_block->ipa; kvm_run->s390_sieic.ipb = vcpu->arch.sie_block->ipb; rc = 0; @@ -551,6 +557,9 @@ rc = 0; } + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; + kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); --- linux-ec2-2.6.32.orig/arch/s390/kvm/sigp.c +++ linux-ec2-2.6.32/arch/s390/kvm/sigp.c @@ -188,9 +188,9 @@ /* make sure that the new value is valid memory */ address = address & 0x7fffe000u; - if ((copy_from_guest(vcpu, &tmp, - (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) || - (copy_from_guest(vcpu, &tmp, (u64) (address + + if ((copy_from_user(&tmp, (void __user *) + (address + vcpu->arch.sie_block->gmsor) , 1)) || + (copy_from_user(&tmp, (void __user *)(address + vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) { *reg |= SIGP_STAT_INVALID_PARAMETER; return 1; /* invalid parameter */ --- linux-ec2-2.6.32.orig/arch/sh/boot/compressed/misc.c +++ linux-ec2-2.6.32/arch/sh/boot/compressed/misc.c @@ -132,7 +132,7 @@ output_addr = (CONFIG_MEMORY_START + 0x2000); #else 
output_addr = PHYSADDR((unsigned long)&_text+PAGE_SIZE); -#ifdef CONFIG_29BIT +#if defined(CONFIG_29BIT) || defined(CONFIG_PMB_FIXED) output_addr |= P2SEG; #endif #endif --- linux-ec2-2.6.32.orig/arch/sh/include/asm/pgtable_32.h +++ linux-ec2-2.6.32/arch/sh/include/asm/pgtable_32.h @@ -344,7 +344,8 @@ #define pte_special(pte) ((pte).pte_low & _PAGE_SPECIAL) #ifdef CONFIG_X2TLB -#define pte_write(pte) ((pte).pte_high & _PAGE_EXT_USER_WRITE) +#define pte_write(pte) \ + ((pte).pte_high & (_PAGE_EXT_USER_WRITE | _PAGE_EXT_KERN_WRITE)) #else #define pte_write(pte) ((pte).pte_low & _PAGE_RW) #endif @@ -358,7 +359,7 @@ * individually toggled (and user permissions are entirely decoupled from * kernel permissions), we attempt to couple them a bit more sanely here. */ -PTE_BIT_FUNC(high, wrprotect, &= ~_PAGE_EXT_USER_WRITE); +PTE_BIT_FUNC(high, wrprotect, &= ~(_PAGE_EXT_USER_WRITE | _PAGE_EXT_KERN_WRITE)); PTE_BIT_FUNC(high, mkwrite, |= _PAGE_EXT_USER_WRITE | _PAGE_EXT_KERN_WRITE); PTE_BIT_FUNC(high, mkhuge, |= _PAGE_SZHUGE); #else --- linux-ec2-2.6.32.orig/arch/sh/kernel/process_64.c +++ linux-ec2-2.6.32/arch/sh/kernel/process_64.c @@ -367,7 +367,7 @@ void flush_thread(void) { - /* Called by fs/exec.c (flush_old_exec) to remove traces of a + /* Called by fs/exec.c (setup_new_exec) to remove traces of a * previously running executable. */ #ifdef CONFIG_SH_FPU if (last_task_used_math == current) { --- linux-ec2-2.6.32.orig/arch/sh/kernel/sys_sh.c +++ linux-ec2-2.6.32/arch/sh/kernel/sys_sh.c @@ -28,37 +28,13 @@ #include #include -static inline long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, int fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage int old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, int fd, unsigned long off) { if (off & ~PAGE_MASK) return -EINVAL; - return do_mmap2(addr, len, prot, flags, fd, off>>PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, off>>PAGE_SHIFT); } asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, @@ -74,7 +50,7 @@ pgoff >>= PAGE_SHIFT - 12; - return do_mmap2(addr, len, prot, flags, fd, pgoff); + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* --- linux-ec2-2.6.32.orig/arch/sh/mm/mmap.c +++ linux-ec2-2.6.32/arch/sh/mm/mmap.c @@ -54,7 +54,8 @@ /* We do not accept a shared mapping if it would violate * cache aliasing constraints.
*/ - if ((flags & MAP_SHARED) && (addr & shm_align_mask)) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) return -EINVAL; return addr; } --- linux-ec2-2.6.32.orig/arch/sparc/Makefile +++ linux-ec2-2.6.32/arch/sparc/Makefile @@ -27,6 +27,7 @@ LDFLAGS := -m elf32_sparc CHECKFLAGS += -D__sparc__ export BITS := 32 +UTS_MACHINE := sparc #KBUILD_CFLAGS += -g -pipe -fcall-used-g5 -fcall-used-g7 KBUILD_CFLAGS += -m32 -pipe -mno-fpu -fcall-used-g5 -fcall-used-g7 @@ -46,6 +47,7 @@ LDFLAGS := -m elf64_sparc export BITS := 64 +UTS_MACHINE := sparc64 KBUILD_CFLAGS += -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow \ -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wno-sign-compare \ --- linux-ec2-2.6.32.orig/arch/sparc/include/asm/elf_64.h +++ linux-ec2-2.6.32/arch/sparc/include/asm/elf_64.h @@ -196,17 +196,10 @@ #define ELF_PLATFORM (NULL) #define SET_PERSONALITY(ex) \ -do { unsigned long new_flags = current_thread_info()->flags; \ - new_flags &= _TIF_32BIT; \ - if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags |= _TIF_32BIT; \ +do { if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ + set_thread_flag(TIF_32BIT); \ else \ - new_flags &= ~_TIF_32BIT; \ - if ((current_thread_info()->flags & _TIF_32BIT) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ - else \ - clear_thread_flag(TIF_ABI_PENDING); \ + clear_thread_flag(TIF_32BIT); \ /* flush_thread will update pgd cache */ \ if (personality(current->personality) != PER_LINUX32) \ set_personality(PER_LINUX | \ --- linux-ec2-2.6.32.orig/arch/sparc/include/asm/io_32.h +++ linux-ec2-2.6.32/arch/sparc/include/asm/io_32.h @@ -8,7 +8,7 @@ #include /* IO address mapping routines need this */ #include -#define page_to_phys(page) (((page) - mem_map) << PAGE_SHIFT) +#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) static inline u32 flip_dword (u32 l) { --- linux-ec2-2.6.32.orig/arch/sparc/include/asm/page_32.h +++ linux-ec2-2.6.32/arch/sparc/include/asm/page_32.h @@ -143,7 +143,7 @@ #define phys_to_virt __va #define ARCH_PFN_OFFSET (pfn_base) -#define virt_to_page(kaddr) (mem_map + ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT))) +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr)) #define virt_addr_valid(kaddr) ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT) < max_mapnr) --- linux-ec2-2.6.32.orig/arch/sparc/include/asm/stat.h +++ linux-ec2-2.6.32/arch/sparc/include/asm/stat.h @@ -53,8 +53,8 @@ ino_t st_ino; mode_t st_mode; short st_nlink; - uid_t st_uid; - gid_t st_gid; + uid16_t st_uid; + gid16_t st_gid; unsigned short st_rdev; off_t st_size; time_t st_atime; --- linux-ec2-2.6.32.orig/arch/sparc/include/asm/thread_info_64.h +++ linux-ec2-2.6.32/arch/sparc/include/asm/thread_info_64.h @@ -227,12 +227,11 @@ /* flag bit 8 is available */ #define TIF_SECCOMP 9 /* secure computing */ #define TIF_SYSCALL_AUDIT 10 /* syscall auditing active */ -/* flag bit 11 is available */ /* NOTE: Thread flags >= 12 should be ones we have no interest * in using in assembly, else we can't use the mask as * an immediate value in instructions such as andcc. 
*/ -#define TIF_ABI_PENDING 12 +/* flag bit 12 is available */ #define TIF_MEMDIE 13 #define TIF_POLLING_NRFLAG 14 #define TIF_FREEZE 15 /* is freezing for suspend */ @@ -246,7 +245,6 @@ #define _TIF_32BIT (1<leds_resource.start = (unsigned long) (p->clock_regs + CLOCK_CTRL); - p->leds_resource.end = p->leds_resource.end; + p->leds_resource.end = p->leds_resource.start; p->leds_resource.name = "leds"; p->leds_pdev.name = "sunfire-clockboard-leds"; @@ -194,7 +194,7 @@ if (!p->central) { p->leds_resource.start = (unsigned long) (p->pregs + FHC_PREGS_CTRL); - p->leds_resource.end = p->leds_resource.end; + p->leds_resource.end = p->leds_resource.start; p->leds_resource.name = "leds"; p->leds_pdev.name = "sunfire-fhc-leds"; --- linux-ec2-2.6.32.orig/arch/sparc/kernel/ldc.c +++ linux-ec2-2.6.32/arch/sparc/kernel/ldc.c @@ -1242,13 +1242,13 @@ snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name); err = request_irq(lp->cfg.rx_irq, ldc_rx, - IRQF_SAMPLE_RANDOM | IRQF_DISABLED | IRQF_SHARED, + IRQF_SAMPLE_RANDOM | IRQF_DISABLED, lp->rx_irq_name, lp); if (err) return err; err = request_irq(lp->cfg.tx_irq, ldc_tx, - IRQF_SAMPLE_RANDOM | IRQF_DISABLED | IRQF_SHARED, + IRQF_SAMPLE_RANDOM | IRQF_DISABLED, lp->tx_irq_name, lp); if (err) { free_irq(lp->cfg.rx_irq, lp); --- linux-ec2-2.6.32.orig/arch/sparc/kernel/nmi.c +++ linux-ec2-2.6.32/arch/sparc/kernel/nmi.c @@ -96,7 +96,6 @@ int cpu = smp_processor_id(); clear_softint(1 << irq); - pcr_ops->write(PCR_PIC_PRIV); local_cpu_data().__nmi_count++; @@ -105,6 +104,8 @@ if (notify_die(DIE_NMI, "nmi", regs, 0, pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP) touched = 1; + else + pcr_ops->write(PCR_PIC_PRIV); sum = kstat_irqs_cpu(0, cpu); if (__get_cpu_var(nmi_touch)) { --- linux-ec2-2.6.32.orig/arch/sparc/kernel/of_device_64.c +++ linux-ec2-2.6.32/arch/sparc/kernel/of_device_64.c @@ -104,9 +104,19 @@ int i; /* Check address type match */ - if ((addr[0] ^ range[0]) & 0x03000000) - return -EINVAL; + if (!((addr[0] ^ range[0]) & 0x03000000)) + goto type_match; + + /* Special exception, we can map a 64-bit address into + * a 32-bit range. + */ + if ((addr[0] & 0x03000000) == 0x03000000 && + (range[0] & 0x03000000) == 0x02000000) + goto type_match; + + return -EINVAL; +type_match: if (of_out_of_range(addr + 1, range + 1, range + na + pna, na - 1, ns)) return -EINVAL; --- linux-ec2-2.6.32.orig/arch/sparc/kernel/process_32.c +++ linux-ec2-2.6.32/arch/sparc/kernel/process_32.c @@ -526,7 +526,7 @@ * Set some valid stack frames to give to the child. */ childstack = (struct sparc_stackf __user *) - (sp & ~0x7UL); + (sp & ~0xfUL); parentstack = (struct sparc_stackf __user *) regs->u_regs[UREG_FP]; --- linux-ec2-2.6.32.orig/arch/sparc/kernel/process_64.c +++ linux-ec2-2.6.32/arch/sparc/kernel/process_64.c @@ -365,14 +365,6 @@ struct thread_info *t = current_thread_info(); struct mm_struct *mm; - if (test_ti_thread_flag(t, TIF_ABI_PENDING)) { - clear_ti_thread_flag(t, TIF_ABI_PENDING); - if (test_ti_thread_flag(t, TIF_32BIT)) - clear_ti_thread_flag(t, TIF_32BIT); - else - set_ti_thread_flag(t, TIF_32BIT); - } - mm = t->task->mm; if (mm) tsb_context_switch(mm); @@ -406,11 +398,11 @@ } else __get_user(fp, &(((struct reg_window32 __user *)psp)->ins[6])); - /* Now 8-byte align the stack as this is mandatory in the - * Sparc ABI due to how register windows work. This hides - * the restriction from thread libraries etc. -DaveM + /* Now align the stack as this is mandatory in the Sparc ABI + * due to how register windows work. 
This hides the + * restriction from thread libraries etc. */ - csp &= ~7UL; + csp &= ~15UL; distance = fp - psp; rval = (csp - distance); --- linux-ec2-2.6.32.orig/arch/sparc/kernel/signal32.c +++ linux-ec2-2.6.32/arch/sparc/kernel/signal32.c @@ -120,8 +120,8 @@ }; /* Align macros */ -#define SF_ALIGNEDSZ (((sizeof(struct signal_frame32) + 7) & (~7))) -#define RT_ALIGNEDSZ (((sizeof(struct rt_signal_frame32) + 7) & (~7))) +#define SF_ALIGNEDSZ (((sizeof(struct signal_frame32) + 15) & (~15))) +#define RT_ALIGNEDSZ (((sizeof(struct rt_signal_frame32) + 15) & (~15))) int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { @@ -420,15 +420,17 @@ sp = current->sas_ss_sp + current->sas_ss_size; } + sp -= framesize; + /* Always align the stack frame. This handles two cases. First, * sigaltstack need not be mindful of platform specific stack * alignment. Second, if we took this signal because the stack * is not aligned properly, we'd like to take the signal cleanly * and report that. */ - sp &= ~7UL; + sp &= ~15UL; - return (void __user *)(sp - framesize); + return (void __user *) sp; } static int save_fpu_state32(struct pt_regs *regs, __siginfo_fpu_t __user *fpu) --- linux-ec2-2.6.32.orig/arch/sparc/kernel/signal_32.c +++ linux-ec2-2.6.32/arch/sparc/kernel/signal_32.c @@ -267,15 +267,17 @@ sp = current->sas_ss_sp + current->sas_ss_size; } + sp -= framesize; + /* Always align the stack frame. This handles two cases. First, * sigaltstack need not be mindful of platform specific stack * alignment. Second, if we took this signal because the stack * is not aligned properly, we'd like to take the signal cleanly * and report that. */ - sp &= ~7UL; + sp &= ~15UL; - return (void __user *)(sp - framesize); + return (void __user *) sp; } static inline int --- linux-ec2-2.6.32.orig/arch/sparc/kernel/signal_64.c +++ linux-ec2-2.6.32/arch/sparc/kernel/signal_64.c @@ -353,7 +353,7 @@ /* Checks if the fp is valid */ static int invalid_frame_pointer(void __user *fp, int fplen) { - if (((unsigned long) fp) & 7) + if (((unsigned long) fp) & 15) return 1; return 0; } @@ -396,15 +396,17 @@ sp = current->sas_ss_sp + current->sas_ss_size; } + sp -= framesize; + /* Always align the stack frame. This handles two cases. First, * sigaltstack need not be mindful of platform specific stack * alignment. Second, if we took this signal because the stack * is not aligned properly, we'd like to take the signal cleanly * and report that. */ - sp &= ~7UL; + sp &= ~15UL; - return (void __user *)(sp - framesize); + return (void __user *) sp; } static inline void --- linux-ec2-2.6.32.orig/arch/sparc/kernel/sys_sparc_32.c +++ linux-ec2-2.6.32/arch/sparc/kernel/sys_sparc_32.c @@ -45,7 +45,8 @@ /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && (addr & (SHMLBA - 1))) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; } @@ -79,15 +80,6 @@ } } -asmlinkage unsigned long sparc_brk(unsigned long brk) -{ - if(ARCH_SUN4C) { - if ((brk & 0xe0000000) != (current->mm->brk & 0xe0000000)) - return current->mm->brk; - } - return sys_brk(brk); -} - /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way unix traditionally does this, though. 
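The hunks on either side of this point repeat one refactoring: each architecture's private do_mmap2() wrapper, which duplicated the fget()/mmap_sem/do_mmap_pgoff()/fput() dance, is deleted in favour of the shared sys_mmap_pgoff(), leaving the arch entry points to do only argument massaging such as shifting a byte offset into pages. A rough userspace sketch of the shape of that consolidation (all names here are illustrative, not the kernel's):

    #include <stdio.h>

    /* stands in for sys_mmap_pgoff(): file lookup, locking and the
     * actual mapping now live in exactly one shared place */
    static long shared_mmap_pgoff(unsigned long addr, unsigned long len,
                                  unsigned long pgoff)
    {
        printf("map addr=%#lx len=%lu pgoff=%lu\n", addr, len, pgoff);
        return 0;
    }

    /* what remains per arch: convert a byte offset to a page offset,
     * reject unaligned offsets, then call the shared helper */
    static long arch_old_mmap(unsigned long addr, unsigned long len,
                              unsigned long off)
    {
        const unsigned long page_size = 4096;

        if (off & (page_size - 1))
            return -22;         /* -EINVAL on an unaligned offset */
        return shared_mmap_pgoff(addr, len, off / page_size);
    }

    int main(void)
    {
        return (int)arch_old_mmap(0, 8192, 12288);
    }
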
@@ -234,31 +226,6 @@ } /* Linux version of mmap */ -static unsigned long do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long fd, - unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - len = PAGE_ALIGN(len); - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(&current->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return retval; -} asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, @@ -266,14 +233,16 @@ { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off) { - return do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + /* no alignment check? */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } long sparc_remap_file_pages(unsigned long start, unsigned long size, @@ -287,27 +256,6 @@ (pgoff >> (PAGE_SHIFT - 12)), flags); } -extern unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); - -asmlinkage unsigned long sparc_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) -{ - unsigned long ret = -EINVAL; - - if (unlikely(sparc_mmap_check(addr, old_len))) - goto out; - if (unlikely(sparc_mmap_check(new_addr, new_len))) - goto out; - down_write(&current->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(&current->mm->mmap_sem); -out: - return ret; -} - /* we come to here via sys_nis_syscall so it can setup the regs argument */ asmlinkage unsigned long c_sys_nis_syscall (struct pt_regs *regs) --- linux-ec2-2.6.32.orig/arch/sparc/kernel/sys_sparc_64.c +++ linux-ec2-2.6.32/arch/sparc/kernel/sys_sparc_64.c @@ -317,10 +317,14 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long align_goal, addr = -ENOMEM; + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return get_unmapped_area(NULL, orig_addr, len, pgoff, flags); + return get_area(NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -333,7 +337,7 @@ align_goal = (64UL * 1024); do { - addr = get_unmapped_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); + addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); break; @@ -351,7 +355,7 @@ * be obtained.
*/ if (addr & ~PAGE_MASK) - addr = get_unmapped_area(NULL, orig_addr, len, pgoff, flags); + addr = get_area(NULL, orig_addr, len, pgoff, flags); return addr; } @@ -399,18 +403,6 @@ } } -SYSCALL_DEFINE1(sparc_brk, unsigned long, brk) -{ - /* People could try to be nasty and use ta 0x6d in 32bit programs */ - if (test_thread_flag(TIF_32BIT) && brk >= STACK_TOP32) - return current->mm->brk; - - if (unlikely(straddles_64bit_va_hole(current->mm->brk, brk))) - return current->mm->brk; - - return sys_brk(brk); -} - /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way unix traditionally does this, though. @@ -568,23 +560,13 @@ unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - len = PAGE_ALIGN(len); + unsigned long retval = -EINVAL; - down_write(&current->mm->mmap_sem); - retval = do_mmap(file, addr, len, prot, flags, off); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); + if ((off + PAGE_ALIGN(len)) < off) + goto out; + if (off & ~PAGE_MASK) + goto out; + retval = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return retval; } @@ -614,12 +596,6 @@ if (test_thread_flag(TIF_32BIT)) goto out; - if (unlikely(new_len >= VA_EXCLUDE_START)) - goto out; - if (unlikely(sparc_mmap_check(addr, old_len))) - goto out; - if (unlikely(sparc_mmap_check(new_addr, new_len))) - goto out; down_write(&current->mm->mmap_sem); ret = do_mremap(addr, old_len, new_len, flags, new_addr); --- linux-ec2-2.6.32.orig/arch/sparc/kernel/systbls.h +++ linux-ec2-2.6.32/arch/sparc/kernel/systbls.h @@ -9,7 +9,6 @@ struct new_utsname; extern asmlinkage unsigned long sys_getpagesize(void); -extern asmlinkage unsigned long sparc_brk(unsigned long brk); extern asmlinkage long sparc_pipe(struct pt_regs *regs); extern asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second, --- linux-ec2-2.6.32.orig/arch/sparc/kernel/systbls_32.S +++ linux-ec2-2.6.32/arch/sparc/kernel/systbls_32.S @@ -19,7 +19,7 @@ /*0*/ .long sys_restart_syscall, sys_exit, sys_fork, sys_read, sys_write /*5*/ .long sys_open, sys_close, sys_wait4, sys_creat, sys_link /*10*/ .long sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys_mknod -/*15*/ .long sys_chmod, sys_lchown16, sparc_brk, sys_nis_syscall, sys_lseek +/*15*/ .long sys_chmod, sys_lchown16, sys_brk, sys_nis_syscall, sys_lseek /*20*/ .long sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 /*25*/ .long sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_pause /*30*/ .long sys_utime, sys_lchown, sys_fchown, sys_access, sys_nice @@ -67,7 +67,7 @@ /*235*/ .long sys_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .long sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler /*245*/ .long sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep -/*250*/ .long sparc_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl +/*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun /*265*/ .long sys_timer_delete, sys_timer_create,
sys_nis_syscall, sys_io_setup, sys_io_destroy --- linux-ec2-2.6.32.orig/arch/sparc/kernel/systbls_64.S +++ linux-ec2-2.6.32/arch/sparc/kernel/systbls_64.S @@ -21,7 +21,7 @@ /*0*/ .word sys_restart_syscall, sys32_exit, sys_fork, sys_read, sys_write /*5*/ .word sys32_open, sys_close, sys32_wait4, sys32_creat, sys_link /*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys32_mknod -/*15*/ .word sys_chmod, sys_lchown16, sys_sparc_brk, sys32_perfctr, sys32_lseek +/*15*/ .word sys_chmod, sys_lchown16, sys_brk, sys32_perfctr, sys32_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 /*25*/ .word sys32_vmsplice, compat_sys_ptrace, sys_alarm, sys32_sigaltstack, sys_pause /*30*/ .word compat_sys_utime, sys_lchown, sys_fchown, sys32_access, sys32_nice @@ -96,7 +96,7 @@ /*0*/ .word sys_restart_syscall, sparc_exit, sys_fork, sys_read, sys_write /*5*/ .word sys_open, sys_close, sys_wait4, sys_creat, sys_link /*10*/ .word sys_unlink, sys_nis_syscall, sys_chdir, sys_chown, sys_mknod -/*15*/ .word sys_chmod, sys_lchown, sys_sparc_brk, sys_perfctr, sys_lseek +/*15*/ .word sys_chmod, sys_lchown, sys_brk, sys_perfctr, sys_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid /*25*/ .word sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall /*30*/ .word sys_utime, sys_nis_syscall, sys_nis_syscall, sys_access, sys_nice --- linux-ec2-2.6.32.orig/arch/sparc/kernel/tsb.S +++ linux-ec2-2.6.32/arch/sparc/kernel/tsb.S @@ -191,10 +191,12 @@ tsb_itlb_load: /* Executable bit must be set. */ -661: andcc %g5, _PAGE_EXEC_4U, %g0 - .section .sun4v_1insn_patch, "ax" +661: sethi %hi(_PAGE_EXEC_4U), %g4 + andcc %g5, %g4, %g0 + .section .sun4v_2insn_patch, "ax" .word 661b andcc %g5, _PAGE_EXEC_4V, %g0 + nop .previous be,pn %xcc, tsb_do_fault --- linux-ec2-2.6.32.orig/arch/sparc/kernel/perf_event.c +++ linux-ec2-2.6.32/arch/sparc/kernel/perf_event.c @@ -986,6 +986,17 @@ data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_events); + + /* If the PMU has the TOE IRQ enable bits, we need to do a + * dummy write to the %pcr to clear the overflow bits and thus + * the interrupt. + * + * Do this before we peek at the counters to determine + * overflow so we don't lose any events. + */ + if (sparc_pmu->irq_bit) + pcr_ops->write(cpuc->pcr); + for (idx = 0; idx < MAX_HWEVENTS; idx++) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; --- linux-ec2-2.6.32.orig/arch/sparc/lib/mcount.S +++ linux-ec2-2.6.32/arch/sparc/lib/mcount.S @@ -64,8 +64,9 @@ 2: sethi %hi(softirq_stack), %g3 or %g3, %lo(softirq_stack), %g3 ldx [%g3 + %g1], %g7 + sub %g7, STACK_BIAS, %g7 cmp %sp, %g7 - bleu,pt %xcc, 2f + bleu,pt %xcc, 3f sethi %hi(THREAD_SIZE), %g3 add %g7, %g3, %g7 cmp %sp, %g7 @@ -75,7 +76,7 @@ * again, we are already trying to output the stack overflow * message. */ - sethi %hi(ovstack), %g7 ! cant move to panic stack fast enough +3: sethi %hi(ovstack), %g7 ! cant move to panic stack fast enough or %g7, %lo(ovstack), %g7 add %g7, OVSTACKSIZE, %g3 sub %g3, STACK_BIAS + 192, %g3 --- linux-ec2-2.6.32.orig/arch/sparc/prom/p1275.c +++ linux-ec2-2.6.32/arch/sparc/prom/p1275.c @@ -32,8 +32,7 @@ extern void prom_cif_callback(void); /* - * This provides SMP safety on the p1275buf. prom_callback() drops this lock - * to allow recursuve acquisition. + * This provides SMP safety on the p1275buf. 
*/ DEFINE_SPINLOCK(prom_entry_lock); @@ -47,7 +46,9 @@ p = p1275buf.prom_buffer; - spin_lock_irqsave(&prom_entry_lock, flags); + raw_local_save_flags(flags); + raw_local_irq_restore(PIL_NMI); + spin_lock(&prom_entry_lock); p1275buf.prom_args[0] = (unsigned long)p; /* service */ strcpy (p, service); @@ -139,7 +140,8 @@ va_end(list); x = p1275buf.prom_args [nargs + 3]; - spin_unlock_irqrestore(&prom_entry_lock, flags); + spin_unlock(&prom_entry_lock); + raw_local_irq_restore(flags); return x; } --- linux-ec2-2.6.32.orig/arch/um/kernel/syscall.c +++ linux-ec2-2.6.32/arch/um/kernel/syscall.c @@ -8,6 +8,7 @@ #include "linux/mm.h" #include "linux/sched.h" #include "linux/utsname.h" +#include "linux/syscalls.h" #include "asm/current.h" #include "asm/mman.h" #include "asm/uaccess.h" @@ -37,31 +38,6 @@ return ret; } -/* common code for old and new mmaps */ -long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); - out: - return error; -} - long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) @@ -70,7 +46,7 @@ if (offset & ~PAGE_MASK) goto out; - err = sys_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return err; } --- linux-ec2-2.6.32.orig/arch/um/sys-i386/shared/sysdep/syscalls.h +++ linux-ec2-2.6.32/arch/um/sys-i386/shared/sysdep/syscalls.h @@ -20,7 +20,3 @@ #define EXECUTE_SYSCALL(syscall, regs) \ ((long (*)(struct syscall_args)) \ (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs)) - -extern long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); --- linux-ec2-2.6.32.orig/arch/x86/Kbuild +++ linux-ec2-2.6.32/arch/x86/Kbuild @@ -2,7 +2,7 @@ obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support -obj-$(CONFIG_XEN) += xen/ +obj-$(CONFIG_PARAVIRT_XEN) += xen/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ --- linux-ec2-2.6.32.orig/arch/x86/Kconfig +++ linux-ec2-2.6.32/arch/x86/Kconfig @@ -24,7 +24,7 @@ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486 && !XEN) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -39,16 +39,16 @@ select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS - select HAVE_KVM - select HAVE_ARCH_KGDB + select HAVE_KVM if !XEN + select HAVE_ARCH_KGDB if !XEN select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select USER_STACKTRACE_SUPPORT select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select HAVE_KERNEL_LZMA + select HAVE_KERNEL_BZIP2 if !XEN + select HAVE_KERNEL_LZMA if !XEN select HAVE_ARCH_KMEMCHECK config OUTPUT_FORMAT @@ -69,13 +69,16 @@ config CLOCKSOURCE_WATCHDOG def_bool y + depends on !XEN config GENERIC_CLOCKEVENTS def_bool y + depends on !XEN config GENERIC_CLOCKEVENTS_BROADCAST
def_bool y depends on X86_64 || (X86_32 && X86_LOCAL_APIC) + depends on !XEN config LOCKDEP_SUPPORT def_bool y @@ -157,6 +160,7 @@ config ARCH_HIBERNATION_POSSIBLE def_bool y + depends on !XEN config ARCH_SUSPEND_POSSIBLE def_bool y @@ -213,14 +217,23 @@ config X86_HT bool - depends on SMP + depends on SMP && !XEN default y config X86_TRAMPOLINE bool depends on SMP || (64BIT && ACPI_SLEEP) + depends on !XEN default y +config X86_NO_TSS + def_bool y + depends on XEN + +config X86_NO_IDT + def_bool y + depends on XEN + config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -298,13 +311,25 @@ For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it +config X86_XEN + bool "Xen-compatible" + depends on X86_32 + select XEN + select X86_PAE + select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST + select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST + select SWIOTLB + help + Choose this option if you plan to run this kernel on top of the + Xen Hypervisor. + config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" - depends on X86_32 && SMP + depends on X86_32 && SMP && !XEN ---help--- This option is needed for the systems that have more than 8 CPUs -if X86_32 +if X86_32 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -327,7 +352,15 @@ generic distribution kernel, say Y here - otherwise say N. endif -if X86_64 +config X86_64_XEN + bool "Enable Xen compatible kernel" + depends on X86_64 + select XEN + select SWIOTLB + help + This option will compile a kernel compatible with Xen hypervisor + +if X86_64 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -370,6 +403,12 @@ # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions +config X86_LPIA + bool "LPIA-compatible" + depends on X86_32 && X86_PC + help + Choose this option if your computer is an LPIA platform. + config X86_ELAN bool "AMD Elan" depends on X86_32 @@ -480,6 +519,7 @@ menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" + depends on !XEN ---help--- Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -565,6 +605,7 @@ config MEMTEST bool "Memtest" + depends on !XEN ---help--- This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -587,6 +628,7 @@ config HPET_TIMER def_bool X86_64 prompt "HPET Timer Support" if X86_32 + depends on !XEN ---help--- Use the IA-PC HPET (High Precision Event Timer) to manage time in preference to the PIT and RTC, if a HPET is @@ -612,6 +654,7 @@ config DMI default y bool "Enable DMI scanning" if EMBEDDED + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Enabled scanning of DMI to identify machine quirks. Say Y here unless you have verified that your setup is not @@ -622,7 +665,7 @@ bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI + depends on X86_64 && PCI && !X86_64_XEN ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. 
This is usually needed for USB, @@ -637,7 +680,7 @@ config CALGARY_IOMMU bool "IBM Calgary IOMMU support" select SWIOTLB - depends on X86_64 && PCI && EXPERIMENTAL + depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL ---help--- Support for hardware IOMMUs in IBM's xSeries x366 and x460 systems. Needed to run systems with more than 3GB of memory @@ -667,7 +710,7 @@ bool "AMD IOMMU support" select SWIOTLB select PCI_MSI - depends on X86_64 && PCI && ACPI + depends on X86_64 && PCI && ACPI && !XEN ---help--- With this option you can enable support for AMD IOMMU hardware in your system. An IOMMU is a hardware component which provides @@ -716,11 +759,12 @@ config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN range 2 512 if SMP && !MAXSMP default "1" if !SMP default "4096" if MAXSMP default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "16" if X86_64_XEN default "8" if SMP ---help--- This allows you to specify the maximum number of CPUs which this @@ -752,7 +796,7 @@ config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -778,10 +822,12 @@ config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_VISWS_APIC def_bool y @@ -790,7 +836,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" default n - depends on X86_IO_APIC + depends on X86_IO_APIC && !XEN ---help--- This option enables a workaround that fixes a source of spurious interrupts. This is recommended when threaded @@ -813,6 +859,7 @@ config X86_MCE bool "Machine Check / overheating reporting" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). @@ -822,22 +869,30 @@ config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for intel specific MCE features such as the thermal monitor. +config X86_MCE_XEON75XX + tristate "Intel Xeon 7500 series corrected memory error driver" + depends on X86_MCE_INTEL && PCI + ---help--- + Add support for a Intel Xeon 7500 series specific memory error driver. + This allows to report the DIMM and physical address on a corrected + memory error machine check event. + config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. config X86_ANCIENT_MCE def_bool n - depends on X86_32 && X86_MCE + depends on X86_32 && X86_MCE && !XEN prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip @@ -857,6 +912,10 @@ If you don't know what a machine check is and you don't do kernel QA it is safe to say n. 
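A pattern worth noting across these Kconfig hunks: features the hypervisor owns (machine checks, MTRR, APIC setup, kexec, EFI) gain a `depends on !XEN` or `!XEN_UNPRIVILEGED_GUEST` guard rather than being patched out of the C sources. Once Kconfig resolves, the guard surfaces in C only as a present or absent CONFIG_* macro; a toy illustration of that mechanism (the function and message below are invented, not kernel APIs):

    #include <stdio.h>

    /* Uncomment to simulate a Xen build; Kconfig would normally emit
     * this into autoconf.h when CONFIG_XEN is selected. */
    /* #define CONFIG_XEN 1 */

    static void mce_init(void)
    {
    #ifdef CONFIG_XEN
        /* "depends on !XEN" made the native option unselectable, so
         * only a hypervisor-mediated path can be compiled in */
        printf("machine checks delegated to the hypervisor\n");
    #else
        printf("native machine-check handler registered\n");
    #endif
    }

    int main(void)
    {
        mce_init();
        return 0;
    }
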
+config X86_XEN_MCE + def_bool y + depends on XEN && X86_MCE + config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL @@ -909,7 +968,7 @@ config X86_REBOOTFIXUPS bool "Enable X86 board specific fixups for reboot" - depends on X86_32 + depends on X86_32 && !XEN ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -926,6 +985,7 @@ config MICROCODE tristate "/dev/cpu/microcode - microcode support" + depends on !XEN_UNPRIVILEGED_GUEST select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on @@ -944,7 +1004,7 @@ config MICROCODE_INTEL bool "Intel microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN default MICROCODE select FW_LOADER ---help--- @@ -957,7 +1017,7 @@ config MICROCODE_AMD bool "AMD microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN select FW_LOADER ---help--- If you select this option, microcode patch loading support for AMD @@ -984,12 +1044,6 @@ with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -config X86_CPU_DEBUG - tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support" - ---help--- - If you select this option, this will provide various x86 CPUs - information through debugfs. - choice prompt "High Memory Support" default HIGHMEM4G if !X86_NUMAQ @@ -1113,7 +1167,7 @@ config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EMBEDDED default y - depends on X86_64 + depends on X86_64 && !XEN ---help--- Allow the kernel linear mapping to use 1GB pages on CPUs that support it. This can improve the kernel's performance a tiny bit by @@ -1122,7 +1176,7 @@ # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && !XEN depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) ---help--- @@ -1226,11 +1280,12 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on X86_64 + depends on X86_64 && !X86_64_XEN config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD + depends on !XEN select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -1255,6 +1310,7 @@ config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" + depends on !XEN ---help--- Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. Even when enabled in the @@ -1285,6 +1341,7 @@ config X86_RESERVE_LOW_64K bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" + depends on !XEN default y ---help--- Reserve the first 64K of physical RAM on BIOSes that are known @@ -1306,6 +1363,7 @@ config MATH_EMULATION bool prompt "Math emulation" if X86_32 + depends on !XEN ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -1331,6 +1389,7 @@ config MTRR bool "MTRR (Memory Type Range Register) support" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -1366,7 +1425,7 @@ config MTRR_SANITIZER def_bool y prompt "MTRR cleanup support" - depends on MTRR + depends on MTRR && !XEN ---help--- Convert MTRR layout from continuous to discrete, so X drivers can add writeback entries. 
@@ -1415,7 +1474,7 @@ config EFI bool "EFI runtime service support" - depends on ACPI + depends on ACPI && !XEN ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1463,6 +1522,7 @@ config KEXEC bool "kexec system call" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1480,6 +1540,7 @@ config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN ---help--- Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels @@ -1500,7 +1561,8 @@ code in physical address mode via KEXEC config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP || XEN) + default 0x100000 if XEN default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. @@ -1542,6 +1604,7 @@ config RELOCATABLE bool "Build a relocatable kernel" + depends on !XEN default y ---help--- This builds a kernel image that retains relocation information @@ -1564,7 +1627,8 @@ config PHYSICAL_ALIGN hex - prompt "Alignment value to which kernel should be aligned" if X86_32 + prompt "Alignment value to which kernel should be aligned" if X86_32 && !XEN + default 0x2000 if XEN default "0x1000000" range 0x2000 0x1000000 ---help--- @@ -1659,6 +1723,7 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y @@ -1669,6 +1734,7 @@ depends on NUMA menu "Power management and ACPI options" + depends on !XEN_UNPRIVILEGED_GUEST config ARCH_HIBERNATION_HEADER def_bool y @@ -1687,7 +1753,7 @@ menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on X86_32 && PM_SLEEP + depends on X86_32 && PM_SLEEP && !XEN ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1821,6 +1887,7 @@ bool "PCI support" default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) + select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Find out whether you have a PCI motherboard. PCI is the name of a bus system, i.e. the way the CPU talks to the other stuff inside @@ -1848,6 +1915,7 @@ config PCI_GOBIOS bool "BIOS" + depends on !XEN config PCI_GOMMCONFIG bool "MMConfig" @@ -1859,6 +1927,13 @@ bool "OLPC" depends on OLPC +config PCI_GOXEN_FE + bool "Xen PCI Frontend" + depends on X86_XEN + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. + config PCI_GOANY bool "Any" @@ -1866,7 +1941,7 @@ config PCI_BIOS def_bool y - depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY) + depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY) # x86-64 doesn't support PCI BIOS access from long mode so always go direct. 
config PCI_DIRECT @@ -1889,9 +1964,24 @@ bool "Support mmconfig PCI config space access" depends on X86_64 && PCI && ACPI +config XEN_PCIDEV_FRONTEND + def_bool y + prompt "Xen PCI Frontend" if X86_64 + depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64) + select HOTPLUG + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. + +config XEN_PCIDEV_FE_DEBUG + bool "Xen PCI Frontend Debugging" + depends on XEN_PCIDEV_FRONTEND + help + Enables some debug statements within the PCI Frontend. + config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" - depends on PCI_MSI && ACPI && EXPERIMENTAL + depends on PCI_MSI && ACPI && !XEN && EXPERIMENTAL help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. @@ -1933,7 +2023,7 @@ config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" - depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && !XEN && EXPERIMENTAL ---help--- Supports Interrupt remapping for IO-APIC and MSI devices. To use x2apic mode in the CPU's which support x2APIC enhancements or @@ -1951,6 +2041,7 @@ config ISA bool "ISA support" + depends on !XEN ---help--- Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -1978,6 +2069,7 @@ config MCA bool "MCA support" + depends on !XEN ---help--- MicroChannel Architecture is found in some IBM PS/2 machines and laptops. It is a bus system similar to PCI or ISA. See @@ -2079,7 +2171,11 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + +if !XEN_UNPRIVILEGED_GUEST source "drivers/firmware/Kconfig" +endif source "fs/Kconfig" --- linux-ec2-2.6.32.orig/arch/x86/Kconfig.cpu +++ linux-ec2-2.6.32/arch/x86/Kconfig.cpu @@ -340,7 +340,7 @@ config X86_F00F_BUG def_bool y - depends on M586MMX || M586TSC || M586 || M486 || M386 + depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT config X86_WP_WORKS_OK def_bool y @@ -397,10 +397,11 @@ config X86_TSC def_bool y depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 + depends on !XEN config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. @@ -496,7 +497,7 @@ config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for UMC processors @@ -509,13 +510,13 @@ config X86_DS def_bool X86_PTRACE_BTS - depends on X86_DEBUGCTLMSR + depends on X86_DEBUGCTLMSR && !XEN select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" default y - depends on X86_DEBUGCTLMSR + depends on X86_DEBUGCTLMSR && !XEN depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store. 
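The X86_CMPXCHG64 hunk above replaces the loose `!M386 && !M486` test with an explicit list of CPU choices known to provide cmpxchg8b, the 8-byte compare-and-exchange instruction the option advertises. As a hedged userspace analogue of what that primitive provides (using a GCC builtin rather than the kernel's cmpxchg64()):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t seqno;

    /* atomically: if (*p == old) { *p = new; return 1; } return 0;
     * on the CPUs covered by the revised dependency list this lowers
     * to a single lock cmpxchg8b on 32-bit x86 */
    static int cas64(uint64_t *p, uint64_t old, uint64_t new)
    {
        return __sync_bool_compare_and_swap(p, old, new);
    }

    int main(void)
    {
        if (cas64(&seqno, 0, 42))
            printf("seqno is now %llu\n", (unsigned long long)seqno);
        return 0;
    }
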
--- linux-ec2-2.6.32.orig/arch/x86/Kconfig.debug +++ linux-ec2-2.6.32/arch/x86/Kconfig.debug @@ -25,6 +25,7 @@ config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y + depends on !XEN ---help--- Enables the informational output from the decompression stage (e.g. bzImage) of the boot. If you disable this you will still @@ -136,7 +137,7 @@ config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED - depends on X86_32 + depends on X86_32 && !X86_NO_TSS ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this @@ -185,6 +186,7 @@ config HAVE_MMIOTRACE_SUPPORT def_bool y + depends on !XEN # # IO delay types: @@ -264,6 +266,7 @@ bool "Debug boot parameters" depends on DEBUG_KERNEL depends on DEBUG_FS + depends on !XEN ---help--- This option will cause struct boot_params to be exported via debugfs. --- linux-ec2-2.6.32.orig/arch/x86/Makefile +++ linux-ec2-2.6.32/arch/x86/Makefile @@ -146,8 +146,27 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage -PHONY += bzImage $(BOOT_TARGETS) +PHONY += bzImage vmlinuz $(BOOT_TARGETS) +ifdef CONFIG_XEN +LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ + -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE) + +ifdef CONFIG_X86_64 +LDFLAGS_vmlinux := -e startup_64 +endif + +# Default kernel to build +all: vmlinuz + +# KBUILD_IMAGE specifies the target image being built +KBUILD_IMAGE := $(boot)/vmlinuz + +vmlinuz: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +else # Default kernel to build all: bzImage @@ -158,6 +177,7 @@ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +endif $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ --- linux-ec2-2.6.32.orig/arch/x86/Makefile_32.cpu +++ linux-ec2-2.6.32/arch/x86/Makefile_32.cpu @@ -46,6 +46,13 @@ # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Work around the pentium-mmx code generator madness of gcc4.4.x which +# does stack alignment by generating horrible code _before_ the mcount +# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph +# tracer assumptions. For i686, generic, core2 this is set by the +# compiler anyway +cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) + # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. 
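The Makefile_32.cpu comment above explains why CONFIG_FUNCTION_GRAPH_TRACER forces -maccumulate-outgoing-args. A small illustration of the prologue the tracer depends on (our sketch; the file name and function are hypothetical). The binutils note immediately above belongs to the CONFIG_X86_P6_NOP test that follows it.

/* prologue.c -- inspect with:
 * gcc -m32 -pg -mtune=pentium-mmx -maccumulate-outgoing-args -S prologue.c */
void traced(void)
{
	/*
	 * The function graph tracer assumes every instrumented function
	 * opens with the classic frame setup before mcount runs:
	 *	push %ebp
	 *	mov  %esp, %ebp
	 *	call mcount
	 * gcc 4.4's pentium-mmx tuning may emit stack-realignment code
	 * ahead of that sequence; -maccumulate-outgoing-args avoids it.
	 */
}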
ifneq ($(CONFIG_X86_P6_NOP),y) --- linux-ec2-2.6.32.orig/arch/x86/boot/Makefile +++ linux-ec2-2.6.32/arch/x86/boot/Makefile @@ -23,6 +23,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage +targets += vmlinuz vmlinux-stripped targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed @@ -195,6 +196,20 @@ cp System.map $(INSTALL_PATH)/ if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi +$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE + $(call if_changed,gzip) + @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + +$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded +$(obj)/vmlinux-stripped: vmlinux FORCE + $(call if_changed,objcopy) + +ifndef CONFIG_XEN +bzImage := bzImage +else +bzImage := vmlinuz +endif + install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \ System.map "$(INSTALL_PATH)" --- linux-ec2-2.6.32.orig/arch/x86/crypto/Makefile +++ linux-ec2-2.6.32/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o +obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o @@ -24,3 +25,5 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o + +ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o --- linux-ec2-2.6.32.orig/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ linux-ec2-2.6.32/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -0,0 +1,157 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains accelerated part of ghash + * implementation. More information about PCLMULQDQ can be found at: + * + * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * Vinodh Gopal + * Erdinc Ozturk + * Deniz Karakoyunlu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */
+
+#include <linux/linkage.h>
+
+.align 16
+.Lbswap_mask:
+	.octa 0x000102030405060708090a0b0c0d0e0f
+.Lpoly:
+	.octa 0xc2000000000000000000000000000001
+.Ltwo_one:
+	.octa 0x00000001000000000000000000000001
+
+#define DATA	%xmm0
+#define SHASH	%xmm1
+#define T1	%xmm2
+#define T2	%xmm3
+#define T3	%xmm4
+#define BSWAP	%xmm5
+#define IN1	%xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble:	internal ABI
+ * input:
+ *	DATA:			operand1
+ *	SHASH:			operand2, hash_key << 1 mod poly
+ * output:
+ *	DATA:			operand1 * operand2 mod poly
+ * changed:
+ *	T1
+ *	T2
+ *	T3
+ */
+__clmul_gf128mul_ble:
+	movaps DATA, T1
+	pshufd $0b01001110, DATA, T2
+	pshufd $0b01001110, SHASH, T3
+	pxor DATA, T2
+	pxor SHASH, T3
+
+	# pclmulqdq $0x00, SHASH, DATA	# DATA = a0 * b0
+	.byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00
+	# pclmulqdq $0x11, SHASH, T1	# T1 = a1 * b1
+	.byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11
+	# pclmulqdq $0x00, T3, T2	# T2 = (a1 + a0) * (b1 + b0)
+	.byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00
+	pxor DATA, T2
+	pxor T1, T2		# T2 = a0 * b1 + a1 * b0
+
+	movaps T2, T3
+	pslldq $8, T3
+	psrldq $8, T2
+	pxor T3, DATA
+	pxor T2, T1		# <T1:DATA> is result of
+				# carry-less multiplication
+
+	# first phase of the reduction
+	movaps DATA, T3
+	psllq $1, T3
+	pxor DATA, T3
+	psllq $5, T3
+	pxor DATA, T3
+	psllq $57, T3
+	movaps T3, T2
+	pslldq $8, T2
+	psrldq $8, T3
+	pxor T2, DATA
+	pxor T3, T1
+
+	# second phase of the reduction
+	movaps DATA, T2
+	psrlq $5, T2
+	pxor DATA, T2
+	psrlq $1, T2
+	pxor DATA, T2
+	psrlq $1, T2
+	pxor T2, T1
+	pxor T1, DATA
+	ret
+
+/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+ENTRY(clmul_ghash_mul)
+	movups (%rdi), DATA
+	movups (%rsi), SHASH
+	movaps .Lbswap_mask, BSWAP
+	pshufb BSWAP, DATA
+	call __clmul_gf128mul_ble
+	pshufb BSWAP, DATA
+	movups DATA, (%rdi)
+	ret
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ *			   const be128 *shash);
+ */
+ENTRY(clmul_ghash_update)
+	cmp $16, %rdx
+	jb .Lupdate_just_ret	# check length
+	movaps .Lbswap_mask, BSWAP
+	movups (%rdi), DATA
+	movups (%rcx), SHASH
+	pshufb BSWAP, DATA
+.align 4
+.Lupdate_loop:
+	movups (%rsi), IN1
+	pshufb BSWAP, IN1
+	pxor IN1, DATA
+	call __clmul_gf128mul_ble
+	sub $16, %rdx
+	add $16, %rsi
+	cmp $16, %rdx
+	jge .Lupdate_loop
+	pshufb BSWAP, DATA
+	movups DATA, (%rdi)
+.Lupdate_just_ret:
+	ret
+
+/*
+ * void clmul_ghash_setkey(be128 *shash, const u8 *key);
+ *
+ * Calculate hash_key << 1 mod poly
+ */
+ENTRY(clmul_ghash_setkey)
+	movaps .Lbswap_mask, BSWAP
+	movups (%rsi), %xmm0
+	pshufb BSWAP, %xmm0
+	movaps %xmm0, %xmm1
+	psllq $1, %xmm0
+	psrlq $63, %xmm1
+	movaps %xmm1, %xmm2
+	pslldq $8, %xmm1
+	psrldq $8, %xmm2
+	por %xmm1, %xmm0
+	# reduction
+	pshufd $0b00100100, %xmm2, %xmm1
+	pcmpeqd .Ltwo_one, %xmm1
+	pand .Lpoly, %xmm1
+	pxor %xmm1, %xmm0
+	movups %xmm0, (%rdi)
+	ret
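For reference, __clmul_gf128mul_ble above is a Karatsuba-style GF(2^128) multiplication: three PCLMULQDQ products (a0*b0, a1*b1, (a0+a1)*(b0+b1)) followed by a two-phase reduction modulo x^128 + x^7 + x^2 + x + 1. A bit-serial C equivalent, useful for checking the fast path against test vectors (our sketch, not part of the patch; it uses the textbook GCM bit order, which is why the assembly byte-swaps its operands through .Lbswap_mask):

/* ghash_ref.c -- self-contained reference, no kernel dependencies */
#include <stdint.h>
#include <string.h>

static void gf128mul_ref(uint8_t r[16], const uint8_t x[16],
			 const uint8_t y[16])
{
	uint8_t z[16] = { 0 };
	uint8_t v[16];
	int i, j, carry;

	memcpy(v, y, 16);
	for (i = 0; i < 128; i++) {
		if (x[i / 8] & (0x80 >> (i % 8)))	/* bit i of x set? */
			for (j = 0; j < 16; j++)
				z[j] ^= v[j];
		carry = v[15] & 1;			/* v = v * x mod poly */
		for (j = 15; j > 0; j--)
			v[j] = (v[j] >> 1) | (v[j - 1] << 7);
		v[0] >>= 1;
		if (carry)
			v[0] ^= 0xe1;	/* x^128 = x^7 + x^2 + x + 1 */
	}
	memcpy(r, z, 16);
}

The 0xe1 reduction byte is the bit-reflected form of the same polynomial whose analogue, in the assembly's shifted-key convention, is the .Lpoly constant above.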
--- linux-ec2-2.6.32.orig/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ linux-ec2-2.6.32/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/hash.h>
+#include <asm/i387.h>
+
+#define GHASH_BLOCK_SIZE	16
+#define GHASH_DIGEST_SIZE	16
+
+void clmul_ghash_mul(char *dst, const be128 *shash);
+
+void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+			const be128 *shash);
+
+void clmul_ghash_setkey(be128 *shash, const u8 *key);
+
+struct ghash_async_ctx {
+	struct cryptd_ahash *cryptd_tfm;
+};
+
+struct ghash_ctx {
+	be128 shash;
+};
+
+struct ghash_desc_ctx {
+	u8 buffer[GHASH_BLOCK_SIZE];
+	u32 bytes;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+	memset(dctx, 0, sizeof(*dctx));
+
+	return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+			const u8 *key, unsigned int keylen)
+{
+	struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+
+	if (keylen != GHASH_BLOCK_SIZE) {
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	clmul_ghash_setkey(&ctx->shash, key);
+
+	return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+			const u8 *src, unsigned int srclen)
+{
+	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	u8 *dst = dctx->buffer;
+
+	kernel_fpu_begin();
+	if (dctx->bytes) {
+		int n = min(srclen, dctx->bytes);
+		u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+		dctx->bytes -= n;
+		srclen -= n;
+
+		while (n--)
+			*pos++ ^= *src++;
+
+		if (!dctx->bytes)
+			clmul_ghash_mul(dst, &ctx->shash);
+	}
+
+	clmul_ghash_update(dst, src, srclen, &ctx->shash);
+	kernel_fpu_end();
+
+	if (srclen & 0xf) {
+		src += srclen - (srclen & 0xf);
+		srclen &= 0xf;
+		dctx->bytes = GHASH_BLOCK_SIZE - srclen;
+		while (srclen--)
+			*dst++ ^= *src++;
+	}
+
+	return 0;
+}
+
+static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+{
+	u8 *dst = dctx->buffer;
+
+	if (dctx->bytes) {
+		u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+		while (dctx->bytes--)
+			*tmp++ ^= 0;
+
+		kernel_fpu_begin();
+		clmul_ghash_mul(dst, &ctx->shash);
+		kernel_fpu_end();
+	}
+
+	dctx->bytes = 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+	u8 *buf = dctx->buffer;
+
+	ghash_flush(ctx, dctx);
+	memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg ghash_alg = {
+	.digestsize	= GHASH_DIGEST_SIZE,
+	.init		= ghash_init,
+	.update		= ghash_update,
+	.final		= ghash_final,
+	.setkey		= ghash_setkey,
+	.descsize	= sizeof(struct ghash_desc_ctx),
+	.base		= {
+		.cra_name		= "__ghash",
+		.cra_driver_name	= "__ghash-pclmulqdqni",
+		.cra_priority		= 0,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= GHASH_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct ghash_ctx),
+		.cra_module		= THIS_MODULE,
+		.cra_list		= LIST_HEAD_INIT(ghash_alg.base.cra_list),
+	},
+};
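For context, this is how a kernel caller would drive the outer "ghash" instance that the async wrapper below registers; the completion dance is needed because the cryptd path finishes asynchronously (our sketch, not part of the patch; error handling kept minimal):

#include <crypto/hash.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>

struct ghash_wait {
	struct completion done;
	int err;
};

static void ghash_done(struct crypto_async_request *req, int err)
{
	struct ghash_wait *w = req->data;

	if (err == -EINPROGRESS)
		return;
	w->err = err;
	complete(&w->done);
}

static int ghash_digest_buf(const u8 key[16], const u8 *data,
			    unsigned int len, u8 out[16])
{
	struct crypto_ahash *tfm;
	struct ahash_request *req;
	struct scatterlist sg;
	struct ghash_wait w;
	int err;

	tfm = crypto_alloc_ahash("ghash", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);
	err = crypto_ahash_setkey(tfm, key, 16);
	if (err)
		goto out_tfm;
	req = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_tfm;
	}
	init_completion(&w.done);
	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   ghash_done, &w);
	sg_init_one(&sg, data, len);
	ahash_request_set_crypt(req, &sg, out, len);
	err = crypto_ahash_digest(req);
	if (err == -EINPROGRESS || err == -EBUSY) {
		/* request was queued to cryptd; wait for its worker */
		wait_for_completion(&w.done);
		err = w.err;
	}
	ahash_request_free(req);
out_tfm:
	crypto_free_ahash(tfm);
	return err;
}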
+
+static int ghash_async_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+	if (!irq_fpu_usable()) {
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_init(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+		desc->tfm = child;
+		desc->flags = req->base.flags;
+		return crypto_shash_init(desc);
+	}
+}
+
+static int ghash_async_update(struct ahash_request *req)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+	if (!irq_fpu_usable()) {
+		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_update(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		return shash_ahash_update(req, desc);
+	}
+}
+
+static int ghash_async_final(struct ahash_request *req)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+	if (!irq_fpu_usable()) {
+		struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+		struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+		struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_final(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		return crypto_shash_final(desc, req->result);
+	}
+}
+
+static int ghash_async_digest(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+	if (!irq_fpu_usable()) {
+		memcpy(cryptd_req, req, sizeof(*req));
+		ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+		return crypto_ahash_digest(cryptd_req);
+	} else {
+		struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+		struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+		desc->tfm = child;
+		desc->flags = req->base.flags;
+		return shash_ahash_digest(req, desc);
+	}
+}
+
+static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct crypto_ahash *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
+			       & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ahash_setkey(child, key, keylen);
+	crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
+			       & CRYPTO_TFM_RES_MASK);
+
+	return err;
+}
+
+static int ghash_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct cryptd_ahash *cryptd_tfm;
+	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct ahash_request) +
+				 crypto_ahash_reqsize(&cryptd_tfm->base));
+
+	return 0;
+}
+
+static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ahash(ctx->cryptd_tfm);
+}
+
+static struct ahash_alg ghash_async_alg = {
+	.init		= ghash_async_init,
+	.update		= ghash_async_update,
+	.final		= ghash_async_final,
+	.setkey		= ghash_async_setkey,
+	.digest		= ghash_async_digest,
+	.halg = {
+		.digestsize	= GHASH_DIGEST_SIZE,
+		.base = {
+			.cra_name		= "ghash",
+			.cra_driver_name	= "ghash-clmulni",
+			.cra_priority		= 400,
+			.cra_flags		= CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= GHASH_BLOCK_SIZE,
+			.cra_type		= &crypto_ahash_type,
+			.cra_module		= THIS_MODULE,
+			.cra_list		= LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
+			.cra_init		= ghash_async_init_tfm,
+			.cra_exit		= ghash_async_exit_tfm,
+		},
+	},
+};
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+	int err;
+
+	if (!cpu_has_pclmulqdq) {
+		printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
+		       " detected.\n");
+		return -ENODEV;
+	}
+
+	err = crypto_register_shash(&ghash_alg);
+	if (err)
+		goto err_out;
+	err = crypto_register_ahash(&ghash_async_alg);
+	if (err)
+		goto err_shash;
+
+	return 0;
+
+err_shash:
+	crypto_unregister_shash(&ghash_alg);
+err_out:
+	return err;
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+	crypto_unregister_ahash(&ghash_async_alg);
+	crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
+		   "accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS("ghash");
--- linux-ec2-2.6.32.orig/arch/x86/ia32/ia32_aout.c
+++ linux-ec2-2.6.32/arch/x86/ia32/ia32_aout.c
@@ -308,14 +308,15 @@
 	if (retval)
 		return retval;
 
-	regs->cs = __USER32_CS;
-	regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
-		regs->r13 = regs->r14 = regs->r15 = 0;
-
 	/* OK, This is the point of no return */
 	set_personality(PER_LINUX);
 	set_thread_flag(TIF_IA32);
-	clear_thread_flag(TIF_ABI_PENDING);
+
+	setup_new_exec(bprm);
+
+	regs->cs = __USER32_CS;
+	regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+		regs->r13 = regs->r14 = regs->r15 = 0;
 
 	current->mm->end_code = ex.a_text +
 		(current->mm->start_code = N_TXTADDR(ex));
@@ -326,7 +327,6 @@
 
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
-	current->mm->mmap = NULL;
 
 	install_exec_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
--- linux-ec2-2.6.32.orig/arch/x86/ia32/ia32entry.S
+++ linux-ec2-2.6.32/arch/x86/ia32/ia32entry.S
@@ -696,7 +696,7 @@
 	.quad quiet_ni_syscall		/* streams2 */
 	.quad stub32_vfork            /* 190 */
 	.quad compat_sys_getrlimit
-	.quad sys32_mmap2
+	.quad sys_mmap_pgoff
 	.quad sys32_truncate64
 	.quad sys32_ftruncate64
 	.quad sys32_stat64		/* 195 */
--- linux-ec2-2.6.32.orig/arch/x86/ia32/sys_ia32.c
+++ linux-ec2-2.6.32/arch/x86/ia32/sys_ia32.c
@@ -155,9 +155,6 @@
 asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
 {
 	struct mmap_arg_struct a;
-	struct file *file = NULL;
-	unsigned long retval;
-	struct mm_struct *mm ;
 
 	if (copy_from_user(&a, arg, sizeof(a)))
 		return -EFAULT;
@@ -165,22 +162,8 @@
 	if (a.offset & ~PAGE_MASK)
 		return -EINVAL;
 
-	if (!(a.flags & MAP_ANONYMOUS)) {
-		file = fget(a.fd);
-		if (!file)
-			return -EBADF;
-	}
-
-	mm = current->mm;
-	down_write(&mm->mmap_sem);
-	retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
+	return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
 			      a.fd, a.offset>>PAGE_SHIFT);
-	if (file)
-		fput(file);
-
-	up_write(&mm->mmap_sem);
-
-	return retval;
 }
 
 asmlinkage long sys32_mprotect(unsigned long start, size_t len,
@@ -539,30 +522,6 @@
 	return ret;
 }
 
-asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
-			    unsigned long prot, unsigned long flags,
-			    unsigned long fd, unsigned long pgoff)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long error;
-	struct file *file = NULL;
-
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-	if (!(flags & MAP_ANONYMOUS)) {
-		file = fget(fd);
-		if (!file)
-			return -EBADF;
-	}
-
-	down_write(&mm->mmap_sem);
-	error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-	up_write(&mm->mmap_sem);
-
-	if (file)
-		fput(file);
-	return error;
-}
-
 asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
 {
 	char *arch = "x86_64";
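The three hunks above are part of the upstream sys_mmap_pgoff() consolidation: each arch's private copy of the fget()/do_mmap_pgoff()/fput() boilerplate is deleted and the syscall table points at the common helper instead. Roughly what that helper does (our sketch; the mm/util.c original is equivalent apart from bookkeeping details):

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/sched.h>

unsigned long mmap_pgoff_sketch(unsigned long addr, unsigned long len,
				unsigned long prot, unsigned long flags,
				unsigned long fd, unsigned long pgoff)
{
	struct file *file = NULL;
	unsigned long retval = -EBADF;

	if (!(flags & MAP_ANONYMOUS)) {
		file = fget(fd);
		if (!file)
			goto out;
	}

	/* callers may not smuggle these in; the kernel sets them itself */
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	down_write(&current->mm->mmap_sem);
	retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	up_write(&current->mm->mmap_sem);

	if (file)
		fput(file);
out:
	return retval;
}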
--- linux-ec2-2.6.32.orig/arch/x86/ia32/ia32entry-xen.S
+++ linux-ec2-2.6.32/arch/x86/ia32/ia32entry-xen.S
@@ -0,0 +1,782 @@
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <linux/linkage.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE		0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysexit_audit ia32_ret_from_sys_call
+#define sysretl_audit ia32_ret_from_sys_call
+#endif
+
+#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
+
+	.macro IA32_ARG_FIXUP noebp=0
+	movl	%edi,%r8d
+	.if \noebp
+	.else
+	movl	%ebp,%r9d
+	.endif
+	xchg	%ecx,%esi
+	movl	%ebx,%edi
+	movl	%edx,%edx	/* zero extension */
+	.endm
+
+	/* clobbers %eax */
+	.macro CLEAR_RREGS offset=0, _r9=rax
+	xorl	%eax,%eax
+	movq	%rax,\offset+R11(%rsp)
+	movq	%rax,\offset+R10(%rsp)
+	movq	%\_r9,\offset+R9(%rsp)
+	movq	%rax,\offset+R8(%rsp)
+	.endm
+
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * We don't reload %eax because syscall_trace_enter() returned
+	 * the value it wants us to use in the table lookup.
+	 */
+	.macro LOAD_ARGS32 offset, _r9=0
+	.if \_r9
+	movl \offset+16(%rsp),%r9d
+	.endif
+	movl \offset+40(%rsp),%ecx
+	movl \offset+48(%rsp),%edx
+	movl \offset+56(%rsp),%esi
+	movl \offset+64(%rsp),%edi
+	.endm
+
+	.macro CFI_STARTPROC32 simple
+	CFI_STARTPROC	\simple
+	CFI_UNDEFINED	r8
+	CFI_UNDEFINED	r9
+	CFI_UNDEFINED	r10
+	CFI_UNDEFINED	r11
+	CFI_UNDEFINED	r12
+	CFI_UNDEFINED	r13
+	CFI_UNDEFINED	r14
+	CFI_UNDEFINED	r15
+	.endm
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+	swapgs
+	sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+	swapgs
+	sti
+	sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
+/*
+ * 32bit SYSENTER instruction entry.
+ *
+ * Arguments:
+ * %eax	System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp user stack
+ * 0(%ebp) Arg6
+ *
+ * Interrupts on.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */ +ENTRY(ia32_sysenter_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + movl %ebp,%ebp /* zero extension */ + movl %eax,%eax + movl 48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d + movl $__USER32_DS,40(%rsp) + movq %rbp,32(%rsp) + movl $__USER32_CS,16(%rsp) + movq %r10,8(%rsp) + movq %rax,(%rsp) + cld + SAVE_ARGS 0,0,1 + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%ebp + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + jnz sysenter_tracesys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys +sysenter_do_call: + IA32_ARG_FIXUP +sysenter_dispatch: + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%r10) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK,TI_flags(%r10) + jnz sysexit_audit + jmp ia32_ret_from_sys_call + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r9d /* 6th arg: 4th syscall arg */ + movl %edx,%r8d /* 5th arg: 3rd syscall arg */ + /* (already in %ecx) 4th arg: 2nd syscall arg */ + movl %ebx,%edx /* 3rd arg: 1st syscall arg */ + movl %eax,%esi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax,%esi /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + CLEAR_RREGS -ARGOFFSET + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jz sysenter_auditsys +#endif + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpl $(IA32_NR_syscalls-1),%eax + ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ + jmp sysenter_do_call + CFI_ENDPROC +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. 
+ * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx return EIP + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched] + * %esp user stack + * 0(%esp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + movl %eax,%eax /* zero extension */ + movl RSP-RIP+16(%rsp),%r8d + SAVE_ARGS -8,1,1 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movl %ebp,%ecx + movl $__USER32_CS,CS-ARGOFFSET(%rsp) + movl $__USER32_DS,SS-ARGOFFSET(%rsp) + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ + /* hardware stack frame is complete now */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + jnz cstar_tracesys + cmpl $IA32_NR_syscalls-1,%eax + ja ia32_badsys +cstar_do_call: + IA32_ARG_FIXUP 1 +cstar_dispatch: + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%r10) + DISABLE_INTERRUPTS(CLBR_NONE) + testl $_TIF_ALLWORK_MASK,TI_flags(%r10) + jnz sysretl_audit + jmp ia32_ret_from_sys_call + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_REST + CLEAR_RREGS 0, r9 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ + RESTORE_REST + xchgl %ebp,%r9d + cmpl $(IA32_NR_syscalls-1),%eax + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + movq $-EFAULT,%rax + jmp ia32_sysret + CFI_ENDPROC + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except %eax must be saved (but ptrace may violate that) + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts on. 
+ */ + +ENTRY(ia32_syscall) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + movl %eax,%eax + movq %rax,(%rsp) + cld + /* note the registers are not zero extended to the sf. + this could be a problem. */ + SAVE_ARGS 0,0,1 + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + jnz ia32_tracesys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys +ia32_do_call: + IA32_ARG_FIXUP + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX-ARGOFFSET(%rsp) +ia32_ret_from_sys_call: + CLEAR_RREGS -ARGOFFSET + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpl $(IA32_NR_syscalls-1),%eax + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ + jmp ia32_do_call +END(ia32_syscall) + +ia32_badsys: + movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $-ENOSYS,%rax + jmp ia32_sysret + +quiet_ni_syscall: + movq $-ENOSYS,%rax + ret + CFI_ENDPROC + + .macro PTREGSCALL label, func, arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ia32_ptregs_common + .endm + + CFI_STARTPROC32 + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx + PTREGSCALL stub32_execve, sys32_execve, %rcx + PTREGSCALL stub32_fork, sys_fork, %rdi + PTREGSCALL stub32_clone, sys32_clone, %rdx + PTREGSCALL stub32_vfork, sys_vfork, %rdi + PTREGSCALL stub32_iopl, sys_iopl, %rsi + +ENTRY(ia32_ptregs_common) + popq %r11 + CFI_ENDPROC + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-ARGOFFSET + CFI_REL_OFFSET rax,RAX-ARGOFFSET + CFI_REL_OFFSET rcx,RCX-ARGOFFSET + CFI_REL_OFFSET rdx,RDX-ARGOFFSET + CFI_REL_OFFSET rsi,RSI-ARGOFFSET + CFI_REL_OFFSET rdi,RDI-ARGOFFSET + CFI_REL_OFFSET rip,RIP-ARGOFFSET +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP-ARGOFFSET +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + SAVE_REST + call *%rax + RESTORE_REST + jmp ia32_sysret /* misbalances the return cache */ + CFI_ENDPROC +END(ia32_ptregs_common) + + .section .rodata,"a" + .align 8 +ia32_sys_call_table: + .quad sys_restart_syscall + .quad sys_exit + .quad stub32_fork + .quad sys_read + .quad sys_write + .quad compat_sys_open /* 5 */ + .quad sys_close + .quad sys32_waitpid + .quad sys_creat + .quad sys_link + .quad sys_unlink /* 10 */ + .quad stub32_execve + .quad sys_chdir + .quad compat_sys_time + .quad sys_mknod + .quad sys_chmod /* 15 */ + .quad sys_lchown16 + .quad quiet_ni_syscall /* old break syscall holder */ + .quad sys_stat + .quad sys32_lseek + .quad sys_getpid /* 20 */ + .quad compat_sys_mount /* mount */ + .quad sys_oldumount /* old_umount */ + .quad sys_setuid16 + .quad sys_getuid16 + .quad compat_sys_stime /* stime */ /* 25 */ + .quad compat_sys_ptrace /* ptrace */ + .quad sys_alarm + .quad sys_fstat /* (old)fstat */ + .quad sys_pause + 
.quad compat_sys_utime /* 30 */ + .quad quiet_ni_syscall /* old stty syscall holder */ + .quad quiet_ni_syscall /* old gtty syscall holder */ + .quad sys_access + .quad sys_nice + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ + .quad sys_sync + .quad sys32_kill + .quad sys_rename + .quad sys_mkdir + .quad sys_rmdir /* 40 */ + .quad sys_dup + .quad sys_pipe + .quad compat_sys_times + .quad quiet_ni_syscall /* old prof syscall holder */ + .quad sys_brk /* 45 */ + .quad sys_setgid16 + .quad sys_getgid16 + .quad sys_signal + .quad sys_geteuid16 + .quad sys_getegid16 /* 50 */ + .quad sys_acct + .quad sys_umount /* new_umount */ + .quad quiet_ni_syscall /* old lock syscall holder */ + .quad compat_sys_ioctl + .quad compat_sys_fcntl64 /* 55 */ + .quad quiet_ni_syscall /* old mpx syscall holder */ + .quad sys_setpgid + .quad quiet_ni_syscall /* old ulimit syscall holder */ + .quad sys32_olduname + .quad sys_umask /* 60 */ + .quad sys_chroot + .quad compat_sys_ustat + .quad sys_dup2 + .quad sys_getppid + .quad sys_getpgrp /* 65 */ + .quad sys_setsid + .quad sys32_sigaction + .quad sys_sgetmask + .quad sys_ssetmask + .quad sys_setreuid16 /* 70 */ + .quad sys_setregid16 + .quad sys32_sigsuspend + .quad compat_sys_sigpending + .quad sys_sethostname + .quad compat_sys_setrlimit /* 75 */ + .quad compat_sys_old_getrlimit /* old_getrlimit */ + .quad compat_sys_getrusage + .quad compat_sys_gettimeofday + .quad compat_sys_settimeofday + .quad sys_getgroups16 /* 80 */ + .quad sys_setgroups16 + .quad sys32_old_select + .quad sys_symlink + .quad sys_lstat + .quad sys_readlink /* 85 */ + .quad sys_uselib + .quad sys_swapon + .quad sys_reboot + .quad compat_sys_old_readdir + .quad sys32_mmap /* 90 */ + .quad sys_munmap + .quad sys_truncate + .quad sys_ftruncate + .quad sys_fchmod + .quad sys_fchown16 /* 95 */ + .quad sys_getpriority + .quad sys_setpriority + .quad quiet_ni_syscall /* old profil syscall holder */ + .quad compat_sys_statfs + .quad compat_sys_fstatfs /* 100 */ + .quad sys_ioperm + .quad compat_sys_socketcall + .quad sys_syslog + .quad compat_sys_setitimer + .quad compat_sys_getitimer /* 105 */ + .quad compat_sys_newstat + .quad compat_sys_newlstat + .quad compat_sys_newfstat + .quad sys32_uname + .quad stub32_iopl /* 110 */ + .quad sys_vhangup + .quad quiet_ni_syscall /* old "idle" system call */ + .quad sys32_vm86_warning /* vm86old */ + .quad compat_sys_wait4 + .quad sys_swapoff /* 115 */ + .quad compat_sys_sysinfo + .quad sys32_ipc + .quad sys_fsync + .quad stub32_sigreturn + .quad stub32_clone /* 120 */ + .quad sys_setdomainname + .quad sys_uname + .quad sys_modify_ldt + .quad compat_sys_adjtimex + .quad sys32_mprotect /* 125 */ + .quad compat_sys_sigprocmask + .quad quiet_ni_syscall /* create_module */ + .quad sys_init_module + .quad sys_delete_module + .quad quiet_ni_syscall /* 130 get_kernel_syms */ + .quad sys32_quotactl + .quad sys_getpgid + .quad sys_fchdir + .quad quiet_ni_syscall /* bdflush */ + .quad sys_sysfs /* 135 */ + .quad sys_personality + .quad quiet_ni_syscall /* for afs_syscall */ + .quad sys_setfsuid16 + .quad sys_setfsgid16 + .quad sys_llseek /* 140 */ + .quad compat_sys_getdents + .quad compat_sys_select + .quad sys_flock + .quad sys_msync + .quad compat_sys_readv /* 145 */ + .quad compat_sys_writev + .quad sys_getsid + .quad sys_fdatasync + .quad sys32_sysctl /* sysctl */ + .quad sys_mlock /* 150 */ + .quad sys_munlock + .quad sys_mlockall + .quad sys_munlockall + .quad sys_sched_setparam + .quad sys_sched_getparam /* 155 */ + .quad sys_sched_setscheduler 
+ .quad sys_sched_getscheduler + .quad sys_sched_yield + .quad sys_sched_get_priority_max + .quad sys_sched_get_priority_min /* 160 */ + .quad sys32_sched_rr_get_interval + .quad compat_sys_nanosleep + .quad sys_mremap + .quad sys_setresuid16 + .quad sys_getresuid16 /* 165 */ + .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* query_module */ + .quad sys_poll + .quad compat_sys_nfsservctl + .quad sys_setresgid16 /* 170 */ + .quad sys_getresgid16 + .quad sys_prctl + .quad stub32_rt_sigreturn + .quad sys32_rt_sigaction + .quad sys32_rt_sigprocmask /* 175 */ + .quad sys32_rt_sigpending + .quad compat_sys_rt_sigtimedwait + .quad sys32_rt_sigqueueinfo + .quad sys_rt_sigsuspend + .quad sys32_pread /* 180 */ + .quad sys32_pwrite + .quad sys_chown16 + .quad sys_getcwd + .quad sys_capget + .quad sys_capset + .quad stub32_sigaltstack + .quad sys32_sendfile + .quad quiet_ni_syscall /* streams1 */ + .quad quiet_ni_syscall /* streams2 */ + .quad stub32_vfork /* 190 */ + .quad compat_sys_getrlimit + .quad sys_mmap_pgoff + .quad sys32_truncate64 + .quad sys32_ftruncate64 + .quad sys32_stat64 /* 195 */ + .quad sys32_lstat64 + .quad sys32_fstat64 + .quad sys_lchown + .quad sys_getuid + .quad sys_getgid /* 200 */ + .quad sys_geteuid + .quad sys_getegid + .quad sys_setreuid + .quad sys_setregid + .quad sys_getgroups /* 205 */ + .quad sys_setgroups + .quad sys_fchown + .quad sys_setresuid + .quad sys_getresuid + .quad sys_setresgid /* 210 */ + .quad sys_getresgid + .quad sys_chown + .quad sys_setuid + .quad sys_setgid + .quad sys_setfsuid /* 215 */ + .quad sys_setfsgid + .quad sys_pivot_root + .quad sys_mincore + .quad sys_madvise + .quad compat_sys_getdents64 /* 220 getdents64 */ + .quad compat_sys_fcntl64 + .quad quiet_ni_syscall /* tux */ + .quad quiet_ni_syscall /* security */ + .quad sys_gettid + .quad sys32_readahead /* 225 */ + .quad sys_setxattr + .quad sys_lsetxattr + .quad sys_fsetxattr + .quad sys_getxattr + .quad sys_lgetxattr /* 230 */ + .quad sys_fgetxattr + .quad sys_listxattr + .quad sys_llistxattr + .quad sys_flistxattr + .quad sys_removexattr /* 235 */ + .quad sys_lremovexattr + .quad sys_fremovexattr + .quad sys_tkill + .quad sys_sendfile64 + .quad compat_sys_futex /* 240 */ + .quad compat_sys_sched_setaffinity + .quad compat_sys_sched_getaffinity + .quad sys_set_thread_area + .quad sys_get_thread_area + .quad compat_sys_io_setup /* 245 */ + .quad sys_io_destroy + .quad compat_sys_io_getevents + .quad compat_sys_io_submit + .quad sys_io_cancel + .quad sys32_fadvise64 /* 250 */ + .quad quiet_ni_syscall /* free_huge_pages */ + .quad sys_exit_group + .quad sys32_lookup_dcookie + .quad sys_epoll_create + .quad sys_epoll_ctl /* 255 */ + .quad sys_epoll_wait + .quad sys_remap_file_pages + .quad sys_set_tid_address + .quad compat_sys_timer_create + .quad compat_sys_timer_settime /* 260 */ + .quad compat_sys_timer_gettime + .quad sys_timer_getoverrun + .quad sys_timer_delete + .quad compat_sys_clock_settime + .quad compat_sys_clock_gettime /* 265 */ + .quad compat_sys_clock_getres + .quad compat_sys_clock_nanosleep + .quad compat_sys_statfs64 + .quad compat_sys_fstatfs64 + .quad sys_tgkill /* 270 */ + .quad compat_sys_utimes + .quad sys32_fadvise64_64 + .quad quiet_ni_syscall /* sys_vserver */ + .quad sys_mbind + .quad compat_sys_get_mempolicy /* 275 */ + .quad sys_set_mempolicy + .quad compat_sys_mq_open + .quad sys_mq_unlink + .quad compat_sys_mq_timedsend + .quad compat_sys_mq_timedreceive /* 280 */ + .quad compat_sys_mq_notify + .quad compat_sys_mq_getsetattr + .quad 
compat_sys_kexec_load /* reserved for kexec */ + .quad compat_sys_waitid + .quad quiet_ni_syscall /* 285: sys_altroot */ + .quad sys_add_key + .quad sys_request_key + .quad sys_keyctl + .quad sys_ioprio_set + .quad sys_ioprio_get /* 290 */ + .quad sys_inotify_init + .quad sys_inotify_add_watch + .quad sys_inotify_rm_watch + .quad sys_migrate_pages + .quad compat_sys_openat /* 295 */ + .quad sys_mkdirat + .quad sys_mknodat + .quad sys_fchownat + .quad compat_sys_futimesat + .quad sys32_fstatat /* 300 */ + .quad sys_unlinkat + .quad sys_renameat + .quad sys_linkat + .quad sys_symlinkat + .quad sys_readlinkat /* 305 */ + .quad sys_fchmodat + .quad sys_faccessat + .quad compat_sys_pselect6 + .quad compat_sys_ppoll + .quad sys_unshare /* 310 */ + .quad compat_sys_set_robust_list + .quad compat_sys_get_robust_list + .quad sys_splice + .quad sys32_sync_file_range + .quad sys_tee /* 315 */ + .quad compat_sys_vmsplice + .quad compat_sys_move_pages + .quad sys_getcpu + .quad sys_epoll_pwait + .quad compat_sys_utimensat /* 320 */ + .quad compat_sys_signalfd + .quad sys_timerfd_create + .quad sys_eventfd + .quad sys32_fallocate + .quad compat_sys_timerfd_settime /* 325 */ + .quad compat_sys_timerfd_gettime + .quad compat_sys_signalfd4 + .quad sys_eventfd2 + .quad sys_epoll_create1 + .quad sys_dup3 /* 330 */ + .quad sys_pipe2 + .quad sys_inotify_init1 + .quad compat_sys_preadv + .quad compat_sys_pwritev + .quad compat_sys_rt_tgsigqueueinfo /* 335 */ + .quad sys_perf_event_open +ia32_syscall_end: --- linux-ec2-2.6.32.orig/arch/x86/include/asm/acpi.h +++ linux-ec2-2.6.32/arch/x86/include/asm/acpi.h @@ -30,6 +30,10 @@ #include #include +#ifdef CONFIG_XEN +#include +#endif + #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -120,6 +124,27 @@ /* early initialization routine */ extern void acpi_reserve_bootmem(void); +#ifdef CONFIG_XEN +static inline int acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt_val, + u32 pm1b_cnt_val) +{ + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u = { + .enter_acpi_sleep = { + .pm1a_cnt_val = pm1a_cnt_val, + .pm1b_cnt_val = pm1b_cnt_val, + .sleep_state = sleep_state, + }, + }, + }; + + return HYPERVISOR_platform_op(&op); +} +#endif /* CONFIG_XEN */ + /* * Check if the CPU can handle C2 and deeper */ @@ -152,7 +177,9 @@ #endif /* !CONFIG_ACPI */ +#ifndef CONFIG_XEN #define ARCH_HAS_POWER_INIT 1 +#endif struct bootnode; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/amd_iommu.h +++ linux-ec2-2.6.32/arch/x86/include/asm/amd_iommu.h @@ -32,6 +32,7 @@ extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_shutdown(void); extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_init_api(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/apic.h +++ linux-ec2-2.6.32/arch/x86/include/asm/apic.h @@ -10,12 +10,16 @@ #include #include #include +#ifndef CONFIG_XEN #include +#endif #include #include #include +#ifndef CONFIG_XEN #define ARCH_APICTIMER_STOPS_ON_C3 1 +#endif /* * Debugging macros @@ -47,6 +51,7 @@ #ifdef CONFIG_X86_LOCAL_APIC extern unsigned int apic_verbosity; +#ifndef CONFIG_XEN extern int local_apic_timer_c2_ok; extern int disable_apic; @@ -119,6 +124,8 @@ extern int x2apic_mode; +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -365,6 
+372,8 @@ */ extern struct apic *apic; +#ifndef CONFIG_XEN + /* * APIC functionality to boot other CPUs - only used on SMP: */ @@ -458,6 +467,8 @@ extern void generic_bigsmp_probe(void); +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_LOCAL_APIC @@ -477,6 +488,8 @@ DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#ifndef CONFIG_XEN + static inline unsigned int read_apic_id(void) { unsigned int reg; @@ -588,6 +601,8 @@ return physid_mask_of_physid(phys_apicid); } +#endif /* CONFIG_XEN */ + #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_32 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/apicdef.h +++ linux-ec2-2.6.32/arch/x86/include/asm/apicdef.h @@ -11,6 +11,8 @@ #define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 #define APIC_DEFAULT_PHYS_BASE 0xfee00000 +#ifndef CONFIG_XEN + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -136,6 +138,16 @@ #define APIC_BASE_MSR 0x800 #define X2APIC_ENABLE (1UL << 10) +#else /* CONFIG_XEN */ + +enum { + APIC_DEST_ALLBUT = 0x1, + APIC_DEST_SELF, + APIC_DEST_ALLINC +}; + +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 #else @@ -143,6 +155,8 @@ # define MAX_LOCAL_APIC 32768 #endif +#ifndef CONFIG_XEN + /* * All x86-64 systems are xAPIC compatible. * In the following, "apicid" is a physical APIC ID. @@ -413,6 +427,8 @@ #undef u32 +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else --- linux-ec2-2.6.32.orig/arch/x86/include/asm/boot.h +++ linux-ec2-2.6.32/arch/x86/include/asm/boot.h @@ -16,7 +16,7 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/desc.h +++ linux-ec2-2.6.32/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include #include #include +#include static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) @@ -93,6 +94,9 @@ #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt +#ifdef CONFIG_X86_32 +#define load_user_cs_desc native_load_user_cs_desc +#endif /*CONFIG_X86_32*/ #define write_ldt_entry(dt, entry, desc) \ native_write_ldt_entry(dt, entry, desc) @@ -392,4 +396,25 @@ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } +#ifdef CONFIG_X86_32 +static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) +{ + limit = (limit - 1) / PAGE_SIZE; + desc->a = limit & 0xffff; + desc->b = (limit & 0xf0000) | 0x00c0fb00; +} + +static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; +} + +#define arch_add_exec_range arch_add_exec_range +#define arch_remove_exec_range arch_remove_exec_range +#define arch_flush_exec_range arch_flush_exec_range +extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_flush_exec_range(struct mm_struct *mm); +#endif /* CONFIG_X86_32 */ + #endif /* _ASM_X86_DESC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/cpufeature.h +++ linux-ec2-2.6.32/arch/x86/include/asm/cpufeature.h @@ -246,8 +246,13 @@ #define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) +#ifndef CONFIG_XEN #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#else 
+#define cpu_has_xsave boot_cpu_has(X86_FEATURE_OSXSAVE) +#endif #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) +#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/elf.h +++ linux-ec2-2.6.32/arch/x86/include/asm/elf.h @@ -197,14 +197,8 @@ set_fs(USER_DS); \ } while (0) -#define COMPAT_SET_PERSONALITY(ex) \ -do { \ - if (test_thread_flag(TIF_IA32)) \ - clear_thread_flag(TIF_ABI_PENDING); \ - else \ - set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ -} while (0) +void set_personality_ia32(void); +#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() #define COMPAT_ELF_PLATFORM ("i686") --- linux-ec2-2.6.32.orig/arch/x86/include/asm/e820.h +++ linux-ec2-2.6.32/arch/x86/include/asm/e820.h @@ -129,7 +129,11 @@ #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ +#ifndef CONFIG_XEN #define ISA_START_ADDRESS 0xa0000 +#else +#define ISA_START_ADDRESS 0 +#endif #define ISA_END_ADDRESS 0x100000 #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/dma-mapping.h +++ linux-ec2-2.6.32/arch/x86/include/asm/dma-mapping.h @@ -151,8 +151,6 @@ { struct dma_map_ops *ops = get_dma_ops(dev); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_from_coherent(dev, get_order(size), vaddr)) return; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hpet.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hpet.h @@ -66,6 +66,7 @@ extern unsigned long hpet_address; extern unsigned long force_hpet_address; extern int hpet_force_user; +extern u8 hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/fixmap.h +++ linux-ec2-2.6.32/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -126,9 +129,6 @@ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hw_irq.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hw_irq.h @@ -107,6 +107,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP +#ifndef CONFIG_XEN extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); extern void smp_call_function_single_interrupt(struct pt_regs *); @@ -115,9 +116,18 @@ #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +#else +#include +extern irqreturn_t smp_reschedule_interrupt(int, void *); +extern irqreturn_t smp_call_function_interrupt(int, void *); +extern irqreturn_t smp_call_function_single_interrupt(int, void *); +extern irqreturn_t smp_reboot_interrupt(int, void *); +#endif #endif +#ifndef CONFIG_XEN extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); +#endif typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/mce.h +++ linux-ec2-2.6.32/arch/x86/include/asm/mce.h @@ 
-67,6 +67,8 @@ __u32 socketid; /* CPU socket ID */ __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 aux0; /* model specific */ + __u64 aux1; /* model specific */ }; /* @@ -214,5 +216,11 @@ void mce_log_therm_throt_event(__u64 status); +#ifdef CONFIG_X86_THERMAL_VECTOR +extern void mcheck_intel_therm_init(void); +#else +static inline void mcheck_intel_therm_init(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hypervisor.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hypervisor.h @@ -24,3 +24,7 @@ extern void init_hypervisor_platform(void); #endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include_next +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/i8253.h +++ linux-ec2-2.6.32/arch/x86/include/asm/i8253.h @@ -8,10 +8,14 @@ extern spinlock_t i8253_lock; +#ifdef CONFIG_GENERIC_CLOCKEVENTS + extern struct clock_event_device *global_clock_event; extern void setup_pit_timer(void); +#endif + #define inb_pit inb_p #define outb_pit outb_p --- linux-ec2-2.6.32.orig/arch/x86/include/asm/io_apic.h +++ linux-ec2-2.6.32/arch/x86/include/asm/io_apic.h @@ -160,6 +160,7 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); +void setup_IO_APIC_irq_extra(u32 gsi); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); extern void ioapic_insert_resources(void); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/irq.h +++ linux-ec2-2.6.32/arch/x86/include/asm/irq.h @@ -15,7 +15,7 @@ return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) # define ARCH_HAS_NMI_WATCHDOG #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/irq_vectors.h +++ linux-ec2-2.6.32/arch/x86/include/asm/irq_vectors.h @@ -113,7 +113,7 @@ */ #define LOCAL_PENDING_VECTOR 0xec -#define UV_BAU_MESSAGE 0xec +#define UV_BAU_MESSAGE 0xea /* * Self IPI vector for machine checks --- linux-ec2-2.6.32.orig/arch/x86/include/asm/kexec.h +++ linux-ec2-2.6.32/arch/x86/include/asm/kexec.h @@ -5,14 +5,30 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN */ +/* + * The hypervisor interface implicitly requires that all entries (except + * for possibly the final one) are arranged in matching PA_/VA_ pairs. +# define VA_PGD 3 + */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_TABLE_PAGE 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN, see comment above +# define VA_TABLE_PAGE 3 */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #endif # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 @@ -163,6 +179,19 @@ }; #endif +/* Under Xen we need to work with machine addresses. These macros give the + * machine address of a certain page to the generic kexec code instead of + * the pseudo physical address which would be given by the default macros. 
+ */ + +#ifdef CONFIG_XEN +#define KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) +#define kexec_virt_to_phys(addr) virt_to_machine(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/kvm_host.h +++ linux-ec2-2.6.32/arch/x86/include/asm/kvm_host.h @@ -256,7 +256,8 @@ void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -412,6 +413,7 @@ unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; u64 vm_init_tsc; + s64 kvmclock_offset; }; struct kvm_vm_stat { @@ -600,8 +602,7 @@ unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -644,6 +645,10 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -657,6 +662,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/mmu.h +++ linux-ec2-2.6.32/arch/x86/include/asm/mmu.h @@ -7,15 +7,25 @@ /* * The x86 doesn't have a mmu context, but * we put the segment information here. + * + * exec_limit is used to track the range PROT_EXEC + * mappings span. 
*/ typedef struct { void *ldt; int size; +#ifdef CONFIG_XEN + unsigned has_foreign_mappings:1; +#endif struct mutex lock; void *vdso; +#ifdef CONFIG_X86_32 + struct desc_struct user_cs; + unsigned long exec_limit; +#endif } mm_context_t; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) void leave_mm(int cpu); #else static inline void leave_mm(int cpu) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/msr.h +++ linux-ec2-2.6.32/arch/x86/include/asm/msr.h @@ -27,6 +27,18 @@ }; }; +struct msr_info { + u32 msr_no; + struct msr reg; + struct msr *msrs; + int err; +}; + +struct msr_regs_info { + u32 *regs; + int err; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; @@ -244,11 +256,14 @@ #define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) +struct msr *msrs_alloc(void); +void msrs_free(struct msr *msrs); + #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/nmi.h +++ linux-ec2-2.6.32/arch/x86/include/asm/nmi.h @@ -5,8 +5,6 @@ #include #include -#ifdef ARCH_HAS_NMI_WATCHDOG - /** * do_nmi_callback * @@ -16,6 +14,11 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); + +extern int unknown_nmi_panic; + +#ifdef ARCH_HAS_NMI_WATCHDOG + extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); @@ -42,7 +45,6 @@ struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); -extern int unknown_nmi_panic; void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace @@ -65,7 +67,6 @@ */ return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); } -#endif void lapic_watchdog_stop(void); int lapic_watchdog_init(unsigned nmi_hz); @@ -73,6 +74,9 @@ unsigned lapic_adjust_nmi_hz(unsigned hz); void disable_lapic_nmi_watchdog(void); void enable_lapic_nmi_watchdog(void); + +#endif + void stop_nmi(void); void restart_nmi(void); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/msr-index.h +++ linux-ec2-2.6.32/arch/x86/include/asm/msr-index.h @@ -104,6 +104,8 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/page_64_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/page_64_types.h @@ -69,7 +69,15 @@ #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM +/* + * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen + * other than for hotplugged memory. 
+ */ +#ifndef CONFIG_XEN #define pfn_valid(pfn) ((pfn) < max_pfn) +#else +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/paravirt.h +++ linux-ec2-2.6.32/arch/x86/include/asm/paravirt.h @@ -289,6 +289,12 @@ { PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } +#ifdef CONFIG_X86_32 +static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) +{ + PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); +} +#endif /*CONFIG_X86_32*/ static inline void store_gdt(struct desc_ptr *dtr) { PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/percpu.h +++ linux-ec2-2.6.32/arch/x86/include/asm/percpu.h @@ -133,6 +133,38 @@ ret__; \ }) +#define percpu_xchg_op(op, var, val) \ +({ \ + typedef typeof(var) T__; \ + T__ ret__; \ + if (0) \ + ret__ = (val); \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %0,"__percpu_arg(1) \ + : "=q" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + case 2: \ + asm(op "w %0,"__percpu_arg(1) \ + : "=r" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + case 4: \ + asm(op "l %0,"__percpu_arg(1) \ + : "=r" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + case 8: \ + asm(op "q %0,"__percpu_arg(1) \ + : "=r" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + default: __bad_percpu_size(); \ + } \ + ret__; \ +}) + /* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. @@ -152,6 +184,10 @@ #define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) #define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) #define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define percpu_xchg(var, val) percpu_xchg_op("xchg", per_cpu__##var, val) +#if defined(CONFIG_X86_XADD) || defined(CONFIG_X86_64) +#define percpu_xadd(var, val) percpu_xchg_op("xadd", per_cpu__##var, val) +#endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/processor.h +++ linux-ec2-2.6.32/arch/x86/include/asm/processor.h @@ -180,7 +180,7 @@ unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm("cpuid" + asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), --- linux-ec2-2.6.32.orig/arch/x86/include/asm/pgalloc.h +++ linux-ec2-2.6.32/arch/x86/include/asm/pgalloc.h @@ -23,6 +23,11 @@ #endif /* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* * Allocate and free page tables. 
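+ *
+ * __userpte_alloc_gfp is presumably a boot-tunable gfp mask for user pte
+ * pages (e.g. whether they may come from highmem).  A rough sketch of the
+ * intended consumer, assuming the usual 2.6.32 pte_alloc_one() shape
+ * (illustrative only, not part of this hunk):
+ *
+ *	pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ *	{
+ *		struct page *pte = alloc_pages(__userpte_alloc_gfp, 0);
+ *		if (pte)
+ *			pgtable_page_ctor(pte);
+ *		return pte;
+ *	}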
*/ extern pgd_t *pgd_alloc(struct mm_struct *); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/ptrace.h +++ linux-ec2-2.6.32/arch/x86/include/asm/ptrace.h @@ -224,7 +224,9 @@ extern void user_disable_single_step(struct task_struct *); extern void user_enable_block_step(struct task_struct *); -#ifdef CONFIG_X86_DEBUGCTLMSR +#if defined(CONFIG_XEN) +#define arch_has_block_step() (0) +#elif defined(CONFIG_X86_DEBUGCTLMSR) #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/required-features.h +++ linux-ec2-2.6.32/arch/x86/include/asm/required-features.h @@ -48,7 +48,7 @@ #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT +#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN) /* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 #define NEED_PGE 0 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/scatterlist.h +++ linux-ec2-2.6.32/arch/x86/include/asm/scatterlist.h @@ -3,6 +3,10 @@ #define ISA_DMA_THRESHOLD (0x00ffffff) +#ifdef CONFIG_X86_XEN +# define sg_dma_len(sg) ((sg)->dma_length) +#endif + #include #endif /* _ASM_X86_SCATTERLIST_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/segment.h +++ linux-ec2-2.6.32/arch/x86/include/asm/segment.h @@ -186,7 +186,9 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_X86_XEN) +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) +#elif !defined(CONFIG_PARAVIRT) #define get_kernel_rpl() 0 #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/time.h +++ linux-ec2-2.6.32/arch/x86/include/asm/time.h @@ -7,4 +7,11 @@ extern void time_init(void); +#ifdef CONFIG_XEN +struct timespec; +extern int xen_independent_wallclock(void); +extern void xen_read_persistent_clock(struct timespec *); +extern int xen_update_persistent_clock(void); +#endif + #endif /* _ASM_X86_TIME_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/traps.h +++ linux-ec2-2.6.32/arch/x86/include/asm/traps.h @@ -37,6 +37,9 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +#ifdef CONFIG_X86_XEN +asmlinkage void fixup_4gb_segment(void); +#endif dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -65,6 +68,9 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); +#ifdef CONFIG_XEN +void do_fixup_4gb_segment(struct pt_regs *, long); +#endif #endif static inline int get_si_code(unsigned long condition) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/sys_ia32.h +++ linux-ec2-2.6.32/arch/x86/include/asm/sys_ia32.h @@ -62,9 +62,6 @@ asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - struct oldold_utsname; struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/syscalls.h +++ linux-ec2-2.6.32/arch/x86/include/asm/syscalls.h @@ -55,8 +55,6 @@ struct oldold_utsname; struct old_utsname; -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage int old_mmap(struct 
mmap_arg_struct __user *); asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/thread_info.h +++ linux-ec2-2.6.32/arch/x86/include/asm/thread_info.h @@ -86,7 +86,6 @@ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -96,6 +95,9 @@ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#ifdef CONFIG_X86_XEN +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */ +#endif #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -110,7 +112,6 @@ #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) @@ -119,6 +120,7 @@ #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_CSTAR (1 << TIF_CSTAR) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -145,9 +147,14 @@ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ +#ifndef CONFIG_XEN #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) +#else +#define _TIF_WORK_CTXSW (_TIF_NOTSC \ + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/) +#endif #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/topology.h +++ linux-ec2-2.6.32/arch/x86/include/asm/topology.h @@ -30,7 +30,7 @@ # define ENABLE_TOPO_DEFINES # endif #else -# ifdef CONFIG_SMP +# if defined(CONFIG_SMP) && !defined(CONFIG_XEN) # define ENABLE_TOPO_DEFINES # endif #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/vmx.h +++ linux-ec2-2.6.32/arch/x86/include/asm/vmx.h @@ -56,6 +56,7 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -144,6 +145,8 @@ VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, TPR_THRESHOLD = 0x0000401c, SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -248,6 +251,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/kvm_emulate.h +++ linux-ec2-2.6.32/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. 
+ * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -129,7 +139,7 @@ u8 seg_override; unsigned int d; unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; + unsigned long eip, eip_orig; /* modrm */ u8 modrm; u8 modrm_mod; @@ -168,6 +178,7 @@ /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/paravirt_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/paravirt_types.h @@ -118,6 +118,9 @@ void (*store_gdt)(struct desc_ptr *); void (*store_idt)(struct desc_ptr *); void (*set_ldt)(const void *desc, unsigned entries); +#ifdef CONFIG_X86_32 + void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); +#endif /*CONFIG_X86_32*/ unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); #ifdef CONFIG_X86_64 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/uv/uv_hub.h +++ linux-ec2-2.6.32/arch/x86/include/asm/uv/uv_hub.h @@ -11,7 +11,7 @@ #ifndef _ASM_X86_UV_UV_HUB_H #define _ASM_X86_UV_UV_HUB_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_UV #include #include #include @@ -31,20 +31,20 @@ * contiguous (although various IO spaces may punch holes in * it).. * - * N - Number of bits in the node portion of a socket physical - * address. + * N - Number of bits in the node portion of a socket physical + * address. * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. - * NASIDs contain up to 15 bits. + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. * * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead * of nasids. * - * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant - * of the nasid for socket usage. + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. * * * NumaLink Global Physical Address Format: @@ -71,12 +71,12 @@ * * * APICID format - * NOTE!!!!!! This is the current format of the APICID. However, code - * should assume that this will change in the future. Use functions - * in this file for all APICID bit manipulations and conversion. + * NOTE!!!!!! 
This is the current format of the APICID. However, code + * should assume that this will change in the future. Use functions + * in this file for all APICID bit manipulations and conversion. * - * 1111110000000000 - * 5432109876543210 + * 1111110000000000 + * 5432109876543210 * pppppppppplc0cch * sssssssssss * @@ -89,9 +89,9 @@ * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * - * Unless otherwise specified, all references to APICID refer to - * the FULL value contained in ACPI tables, not the subset in the - * processor APICID register. + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. */ @@ -151,16 +151,16 @@ }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* * Local & Global MMR space macros. - * Note: macros are intended to be used ONLY by inline functions - * in this file - not by other kernel code. - * n - NASID (full 15-bit global nasid) - * g - GNODE (full 15-bit global nasid, right shifted 1) - * p - PNODE (local part of nsids, right shifted 1) + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) @@ -213,8 +213,8 @@ /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. - * Note: use the standard __pa() & __va() macros for converting - * between socket virtual and socket physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. */ /* socket phys RAM --> UV global physical address */ @@ -265,21 +265,18 @@ * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr32_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr32(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { return readq(uv_global_mmr32_address(pnode, offset)); } @@ -288,25 +285,32 @@ * Access Global MMR space using the MMR space located at the top of physical * memory. 
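+ *
+ * The 8-bit accessors added below go through this global space rather
+ * than the local MMR space: a local write can only reach the current
+ * hub, while the global variant lets one cpu update, e.g., the SCIR
+ * state byte of a cpu on a remote blade (see uv_set_cpu_scir_bits()
+ * further down).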
*/ -static inline unsigned long *uv_global_mmr64_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr64_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr64(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { return readq(uv_global_mmr64_address(pnode, offset)); } +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); +} + /* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. @@ -426,11 +430,17 @@ } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/xen/hypervisor.h +++ linux-ec2-2.6.32/arch/x86/include/asm/xen/hypervisor.h @@ -43,7 +43,7 @@ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ }; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern enum xen_domain_type xen_domain_type; #else #define xen_domain_type XEN_NATIVE --- linux-ec2-2.6.32.orig/arch/x86/include/asm/xen/interface.h +++ linux-ec2-2.6.32/arch/x86/include/asm/xen/interface.h @@ -10,17 +10,20 @@ #define _ASM_X86_XEN_INTERFACE_H #ifdef __XEN__ -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #else -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef type * __guest_handle_ ## name #endif +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ + ___DEFINE_XEN_GUEST_HANDLE(name, type); \ + ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name #ifdef __XEN__ #if defined(__i386__) @@ -47,14 +50,8 @@ #endif #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. 
*/ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); +typedef unsigned long xen_pfn_t; +typedef unsigned long xen_ulong_t; #endif #ifndef HYPERVISOR_VIRT_START --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/agp.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/agp.h @@ -0,0 +1,41 @@ +#ifndef _ASM_X86_AGP_H +#define _ASM_X86_AGP_H + +#include +#include +#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) ( \ + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ + ?: set_pages_uc(page, 1)) +#define unmap_page_from_agp(page) ( \ + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_wb(page, 1)) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +#define virt_to_gart virt_to_machine + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) ({ \ + char *_t; dma_addr_t _d; \ + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ + _t; }) +#define free_gatt_pages(table, order) \ + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) + +#endif /* _ASM_X86_AGP_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/desc.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/desc.h @@ -0,0 +1,421 @@ +#ifndef _ASM_X86_DESC_H +#define _ASM_X86_DESC_H + +#include +#include +#include +#include + +static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +{ + desc->limit0 = info->limit & 0x0ffff; + desc->base0 = info->base_addr & 0x0000ffff; + + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; + desc->limit = (info->limit & 0xf0000) >> 16; + desc->avl = info->useable; + desc->d = info->seg_32bit; + desc->g = info->limit_in_pages; + desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. 
It is useless anyway + * because 64bit system calls require __USER_CS: + */ + desc->l = 0; +} + +#ifndef CONFIG_X86_NO_IDT +extern struct desc_ptr idt_descr; +extern gate_desc idt_table[]; +#endif + +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + +#ifdef CONFIG_X86_64 + +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate->offset_low = PTR_LOW(func); + gate->segment = __KERNEL_CS; + gate->ist = ist; + gate->p = 1; + gate->dpl = dpl; + gate->zero0 = 0; + gate->zero1 = 0; + gate->type = type; + gate->offset_middle = PTR_MIDDLE(func); + gate->offset_high = PTR_HIGH(func); +} + +#else +static inline void pack_gate(gate_desc *gate, unsigned char type, + unsigned long base, unsigned dpl, unsigned flags, + unsigned short seg) +{ + gate->a = (seg << 16) | (base & 0xffff); + gate->b = (base & 0xffff0000) | + (((0x80 | type | (dpl << 5)) & 0xff) << 8); +} + +#endif + +static inline int desc_empty(const void *ptr) +{ + const u32 *desc = ptr; + return !(desc[0] | desc[1]); +} + +#ifndef CONFIG_XEN +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) + +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) + +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt + +#define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) \ + native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) \ + native_write_idt_entry(dt, entry, g) + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) + +static inline void native_write_idt_entry(gate_desc *idt, int entry, + const gate_desc *gate) +{ + memcpy(&idt[entry], gate, sizeof(*gate)); +} + +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc) +{ + memcpy(&ldt[entry], desc, 8); +} + +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type) +{ + unsigned int size; + switch (type) { + case DESC_TSS: + size = sizeof(tss_desc); + break; + case DESC_LDT: + size = sizeof(ldt_desc); + break; + default: + size = sizeof(struct desc_struct); + break; + } + memcpy(&gdt[entry], desc, size); +} +#endif + +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + unsigned long limit, unsigned char type, + unsigned char flags) +{ + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); + desc->p = 1; +} + + +#ifndef CONFIG_XEN +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) +{ +#ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; + memset(desc, 0, sizeof(*desc)); + desc->limit0 = size & 0xFFFF; + 
desc->base0 = PTR_LOW(addr); + desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; + desc->base3 = PTR_HIGH(addr); +#else + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); +#endif +} + +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + tss_desc tss; + + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + + sizeof(unsigned long) - 1); + write_gdt_entry(d, entry, &tss, DESC_TSS); +} + +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + +static inline void native_set_ldt(const void *addr, unsigned int entries) +{ + if (likely(entries == 0)) + asm volatile("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + ldt_desc ldt; + + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, + entries * LDT_ENTRY_SIZE - 1); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + &ldt, DESC_LDT); + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + asm volatile("str %0":"=r" (tr)); + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} +#else +#include + +#define load_TLS(t, cpu) xen_load_tls(t, cpu) +#define set_ldt xen_set_ldt + +extern int write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc); +extern int write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type); + +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor( + arbitrary_virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) + BUG(); +} +#endif + +#define _LDT_empty(info) \ + ((info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +static inline void clear_LDT(void) +{ + set_ldt(NULL, 0); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc) +{ + set_ldt(pc->ldt, 
pc->size); +} + +static inline void load_LDT(mm_context_t *pc) +{ + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); +} + +static inline unsigned long get_desc_base(const struct desc_struct *desc) +{ + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); +} + +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + +#ifndef CONFIG_X86_NO_IDT +static inline void _set_gate(int gate, unsigned type, void *addr, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate_desc s; + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); + /* + * does not need to be atomic because it is only done once at + * setup time + */ + write_idt_entry(idt_table, gate, &s); +} + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +static inline void set_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); +} + +extern int first_system_vector; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); + if (first_system_vector > vector) + first_system_vector = vector; + } else + BUG(); +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + +/* + * This routine sets up an interrupt gate at directory privilege level 3. 
+ */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_system_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); +} + +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); +} + +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); +} + +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); +} +#endif + +#endif /* _ASM_X86_DESC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/dma-mapping.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/dma-mapping.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_DMA_MAPPING_H_ +#define _ASM_X86_DMA_MAPPING_H_ + +#define phys_to_dma _phys_to_dma_ +#define dma_to_phys _dma_to_phys_ + +#include_next + +#undef phys_to_dma +#undef dma_to_phys + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return machine_to_phys(daddr); +} + +void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); + +extern int range_straddles_page_boundary(paddr_t p, size_t size); + +#endif /* _ASM_X86_DMA_MAPPING_H_ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/fixmap.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/fixmap.h @@ -0,0 +1,214 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009 + */ + +#ifndef _ASM_X86_FIXMAP_H +#define _ASM_X86_FIXMAP_H + +#ifndef __ASSEMBLY__ +#include +#include +#include +#ifdef CONFIG_X86_32 +#include +#include +#else +#include +#endif + +/* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall + * uses fixmaps that relies on FIXADDR_TOP for proper address calculation. + * Because of this, FIXADDR_TOP x86 integration was left as later work. + */ +#ifdef CONFIG_X86_32 +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +#else +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) +#endif + + +/* + * Here we define all the compile-time 'special' virtual + * addresses. 
The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * for x86_32: We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { +#ifdef CONFIG_X86_32 + FIX_HOLE, + FIX_VDSO, +#else + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VSYSCALL_HPET, +#endif + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +#endif +#else + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ + __end_of_permanent_fixed_addresses, + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. 
+ * + * We round it up to the next 256 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif +#ifdef CONFIG_X86_32 + FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif + __end_of_fixed_addresses +}; + + +extern void reserve_top_address(unsigned long reserve); + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) + +extern int fixmaps_set; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t); + +static inline void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + xen_set_fixmap(idx, phys, flags); +} + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) + +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_FIXMAP_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/gnttab_dma.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/gnttab_dma.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Herbert Xu + * Copyright (c) 2007 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_I386_GNTTAB_DMA_H +#define _ASM_I386_GNTTAB_DMA_H + +static inline int gnttab_dma_local_pfn(struct page *page) +{ + /* Has it become a local MFN? */ + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page)))); +} + +static inline maddr_t gnttab_dma_map_page(struct page *page) +{ + __gnttab_dma_map_page(page); + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT); +} + +static inline void gnttab_dma_unmap_page(maddr_t maddr) +{ + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr))); +} + +#endif /* _ASM_I386_GNTTAB_DMA_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/highmem.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/highmem.h @@ -0,0 +1,98 @@ +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#ifndef _ASM_X86_HIGHMEM_H +#define _ASM_X86_HIGHMEM_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +/* + * Ordering is: + * + * FIXADDR_TOP + * fixed_addresses + * FIXADDR_START + * temp fixed addresses + * FIXADDR_BOOT_START + * Persistent kmap area + * PKMAP_BASE + * VMALLOC_END + * Vmalloc area + * VMALLOC_START + * high_memory + */ +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); + +void *kmap(struct page *page); +void kunmap(struct page *page); +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *kmap_atomic(struct page *page, enum km_type type); +void *kmap_atomic_pte(struct page *page, enum km_type type); +void kunmap_atomic(void *kvaddr, enum km_type type); +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +struct page *kmap_atomic_to_page(void *ptr); + +#define kmap_atomic_pte(page, type) \ + kmap_atomic_prot(page, type, \ + PagePinned(page) ? 
PAGE_KERNEL_RO : kmap_prot) + +#define flush_cache_kmaps() do { } while (0) + +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +void clear_highpage(struct page *); +static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +{ + clear_highpage(page); +} +#define __HAVE_ARCH_CLEAR_HIGHPAGE +#define clear_user_highpage clear_user_highpage +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE + +void copy_highpage(struct page *to, struct page *from); +static inline void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_highpage(to, from); +} +#define __HAVE_ARCH_COPY_HIGHPAGE +#define __HAVE_ARCH_COPY_USER_HIGHPAGE + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_HIGHMEM_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypercall.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypercall.h @@ -0,0 +1,430 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu + * Jun Nakajima + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#ifndef __HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#if CONFIG_XEN_COMPAT <= 0x030002 +# include /* memcpy() */ +#endif + +#ifdef CONFIG_XEN +#define HYPERCALL_ASM_OPERAND "%c" +#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#else +#define HYPERCALL_ASM_OPERAND "*%" +#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#endif + +#define HYPERCALL_ARG(arg, n) \ + register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg) + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "1" \ + : "=a" (__res) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, arg) \ +({ \ + type __res; \ + HYPERCALL_ARG(arg, 1); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "2" \ + : "=a" (__res), "+r" (__arg1) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "3" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "4" \ + : "=a" (__res), "+r" (__arg1), \ + "+r" (__arg2), "+r" (__arg3) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "5" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall(type, op, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call *%6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : "g" (HYPERCALL_LOCATION(op)) \ + : "memory" ); \ + __res; \ +}) + +#ifdef CONFIG_X86_32 +# include "hypercall_32.h" +#else +# include "hypercall_64.h" +#endif + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmu_update(req, count, success_count, domid); + return _hypercall4(int, mmu_update, req, count, success_count, 
domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmuext_op(op, count, success_count, domid); + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} + +struct xen_mc; +static inline int __must_check +HYPERVISOR_mca( + struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} + +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + bool fixup = false; + int rc; + + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); +#ifdef GNTTABOP_map_grant_ref + if 
(cmd == GNTTABOP_map_grant_ref) +#endif + fixup = gnttab_pre_map_adjust(cmd, uop, count); + rc = _hypercall3(int, grant_table_op, cmd, uop, count); + if (rc == 0 && fixup) + rc = gnttab_post_map_adjust(uop, count); + return rc; +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +static inline int __must_check +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + +#endif /* __HYPERCALL_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypercall_32.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypercall_32.h @@ -0,0 +1,62 @@ +#define HYPERCALL_arg1 "ebx" +#define HYPERCALL_arg2 "ecx" +#define HYPERCALL_arg3 "edx" +#define HYPERCALL_arg4 "esi" +#define HYPERCALL_arg5 "edi" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall2(long, set_timer_op, + (unsigned long)timeout, + (unsigned long)(timeout>>32)); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, + (unsigned long)ma, (unsigned long)(ma>>32), + (unsigned long)desc, (unsigned long)(desc>>32)); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; + + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + unsigned long pte_hi = 0; +#ifdef CONFIG_X86_PAE + pte_hi = 
new_val.pte_high; +#endif + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte_low, pte_hi, flags, domid); +} --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypercall_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypercall_64.h @@ -0,0 +1,54 @@ +#define HYPERCALL_arg1 "rdi" +#define HYPERCALL_arg2 "rsi" +#define HYPERCALL_arg3 "rdx" +#define HYPERCALL_arg4 "r10" +#define HYPERCALL_arg5 "r8" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall1(long, set_timer_op, timeout); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); +} + +static inline int __must_check +HYPERVISOR_set_segment_base( + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypervisor.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypervisor.h @@ -0,0 +1,388 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifndef __HYPERVISOR_H__
+#define __HYPERVISOR_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern shared_info_t *HYPERVISOR_shared_info;
+
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+DECLARE_PER_CPU(struct vcpu_info, vcpu_info);
+#define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu))
+#define current_vcpu_info() (&__get_cpu_var(vcpu_info))
+#define vcpu_info_read(fld) percpu_read(vcpu_info.fld)
+#define vcpu_info_write(fld, val) percpu_write(vcpu_info.fld, val)
+#define vcpu_info_xchg(fld, val) percpu_xchg(vcpu_info.fld, val)
+void setup_vcpu_info(unsigned int cpu);
+void adjust_boot_vcpu_info(void);
+#else
+#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
+#ifdef CONFIG_SMP
+#define current_vcpu_info() vcpu_info(smp_processor_id())
+#else
+#define current_vcpu_info() vcpu_info(0)
+#endif
+#define vcpu_info_read(fld) (current_vcpu_info()->fld)
+#define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val))
+static inline void setup_vcpu_info(unsigned int cpu) {}
+#endif
+
+#ifdef CONFIG_X86_32
+extern unsigned long hypervisor_virt_start;
+#endif
+
+/* arch/xen/i386/kernel/setup.c */
+extern start_info_t *xen_start_info;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
+#else
+#define is_initial_xendomain() 0
+#endif
+
+#define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN))
+#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
+
+/* arch/xen/kernel/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+/* arch/xen/kernel/process.c */
+void xen_cpu_idle (void);
+
+/* arch/xen/i386/kernel/hypervisor.c */
+void do_hypervisor_callback(struct pt_regs *regs);
+
+/* arch/xen/i386/mm/hypervisor.c */
+/*
+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
+ * be MACHINE addresses.
+ */
+
+void xen_pt_switch(pgd_t *);
+void xen_new_user_pt(pgd_t *); /* x86_64 only */
+void xen_load_gs(unsigned int selector); /* x86_64 only */
+void xen_tlb_flush(void);
+void xen_invlpg(unsigned long ptr);
+
+void xen_l1_entry_update(pte_t *ptr, pte_t val);
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
+void xen_pgd_pin(pgd_t *);
+void xen_pgd_unpin(pgd_t *);
+
+void xen_init_pgd_pin(void);
+
+void xen_set_ldt(const void *ptr, unsigned int ents);
+
+#ifdef CONFIG_SMP
+#include
+void xen_tlb_flush_all(void);
+void xen_invlpg_all(unsigned long ptr);
+void xen_tlb_flush_mask(const cpumask_t *mask);
+void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr);
+#else
+#define xen_tlb_flush_all xen_tlb_flush
+#define xen_invlpg_all xen_invlpg
+#endif
+
+/* Returns zero on success else negative errno. */
+int xen_create_contiguous_region(
+	unsigned long vstart, unsigned int order, unsigned int address_bits);
+void xen_destroy_contiguous_region(
+	unsigned long vstart, unsigned int order);
+int early_create_contiguous_region(unsigned long pfn, unsigned int order,
+				   unsigned int address_bits);
+
+struct page;
+
+int xen_limit_pages_to_max_mfn(
+	struct page *pages, unsigned int order, unsigned int address_bits);
+
+/* Turn jiffies into Xen system time.
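+ * (Editor's illustrative sketch, not part of the original patch: Xen
+ * system time is nanosecond-based, so jiffies-based deadlines need this
+ * conversion before they reach the hypervisor. Assuming a hypothetical
+ * event channel already bound in 'bound_port', a bounded wait could be
+ *
+ *	evtchn_port_t port = bound_port;
+ *	int rc = HYPERVISOR_poll(&port, 1, jiffies + 1);
+ *
+ * where HYPERVISOR_poll(), defined further down, feeds its jiffies
+ * argument through jiffies_to_st().)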
+ */
+u64 jiffies_to_st(unsigned long jiffies);
+
+#ifdef CONFIG_XEN_SCRUB_PAGES
+void scrub_pages(void *, unsigned int);
+#else
+#define scrub_pages(_p,_n) ((void)0)
+#endif
+
+#if defined(CONFIG_XEN) && !defined(MODULE)
+
+DECLARE_PER_CPU(bool, xen_lazy_mmu);
+
+void xen_multicall_flush(bool);
+
+int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t,
+					     unsigned long flags);
+int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count,
+				      unsigned int *success_count, domid_t);
+int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count,
+				     unsigned int *success_count, domid_t);
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+	percpu_write(xen_lazy_mmu, true);
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+	percpu_write(xen_lazy_mmu, false);
+	xen_multicall_flush(false);
+}
+
+#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu))
+
+#if 0 /* All uses are in places potentially called asynchronously, but
+       * asynchronous code should rather not make use of lazy mode at all.
+       * Therefore, all uses of this function get commented out, proper
+       * detection of asynchronous invocations is added wherever needed,
+       * and this function is disabled to catch any new (improper) uses.
+       */
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+	if (arch_use_lazy_mmu_mode())
+		xen_multicall_flush(false);
+}
+#endif
+
+#else /* !CONFIG_XEN || MODULE */
+
+static inline void xen_multicall_flush(bool ignore) {}
+#define arch_use_lazy_mmu_mode() false
+#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; })
+#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
+#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN && !MODULE */
+
+#ifdef CONFIG_XEN
+
+struct gnttab_map_grant_ref;
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
+			   unsigned int count);
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
+#else
+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
+					 unsigned int count)
+{
+	BUG();
+	return -ENOSYS;
+}
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define gnttab_pre_map_adjust(...) false
+#define gnttab_post_map_adjust(...)
({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN */ + +#if defined(CONFIG_X86_64) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +#ifdef CONFIG_XEN +#define is_running_on_xen() 1 +extern char hypercall_page[PAGE_SIZE]; +#else +extern char *hypercall_stubs; +#define is_running_on_xen() (!!hypercall_stubs) +#endif + +#include + +static inline int +HYPERVISOR_yield( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int +HYPERVISOR_block( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif + + return rc; +} + +static inline void __noreturn +HYPERVISOR_shutdown( + unsigned int reason) +{ + struct sched_shutdown sched_shutdown = { + .reason = reason + }; + + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown)); +#if CONFIG_XEN_COMPAT <= 0x030002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason)); +#endif + /* Don't recurse needlessly. */ + BUG_ON(reason != SHUTDOWN_crash); + for(;;); +} + +static inline int __must_check +HYPERVISOR_poll( + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports, + .timeout = jiffies_to_st(timeout) + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_poll_no_timeout( + evtchn_port_t *ports, unsigned int nr_ports) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +#ifdef CONFIG_XEN + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; +#endif + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; +} + +static inline void +MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req, + unsigned int count, unsigned int *success_count, + domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; +} + +static inline void +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, + void *uop, unsigned int count) +{ + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)uop; + mcl->args[2] = count; +} + +#else /* !defined(CONFIG_XEN) */ + +/* Multicalls not supported for HVM guests. 
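+ * (Editor's illustrative sketch, not part of the original patch: on PV
+ * kernels the MULTI_* helpers above each fill in one multicall_entry_t
+ * slot, so several MMU operations can be submitted with one hypercall.
+ * With caller-provided va0/va1 and pte0/pte1:
+ *
+ *	multicall_entry_t mcl[2];
+ *
+ *	MULTI_update_va_mapping(&mcl[0], va0, pte0, 0);
+ *	MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG);
+ *	if (HYPERVISOR_multicall(mcl, 2))
+ *		BUG();
+ *
+ * The stubs below compile these helpers away for HVM builds.)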
*/ +#define MULTI_update_va_mapping(a,b,c,d) ((void)0) +#define MULTI_grant_table_op(a,b,c,d) ((void)0) + +#endif + +#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI) + +#ifdef LINUX +/* drivers/staging/ use Windows-style types, including VOID */ +#undef VOID +#endif + +#endif /* __HYPERVISOR_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/io.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/io.h @@ -0,0 +1,229 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + +#define ARCH_HAS_IOREMAP_WC + +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "=q", :"memory") +build_mmio_read(readw, "w", unsigned short, "=r", :"memory") +build_mmio_read(readl, "l", unsigned int, "=r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "=q", ) +build_mmio_read(__readw, "w", unsigned short, "=r", ) +build_mmio_read(__readl, "l", unsigned int, "=r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define readl_relaxed(a) __readl(a) +#define __raw_readb __readb +#define __raw_readw __readw +#define __raw_readl __readl + +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#define mmiowb() barrier() + +#ifdef CONFIG_X86_64 + +build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_write(writeq, "q", unsigned long, "r", :"memory") + +#else + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + +#endif + +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq + +#define native_io_delay xen_io_delay + +/** + * virt_to_phys - map virtual addresses to physical + * @address: address to remap + * + * The returned physical address is the physical (CPU) mapping for + * the memory address given. It is only valid to use this function on + * addresses directly mapped or allocated via kmalloc. + * + * This function does not give bus mappings for DMA transfers. 
In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+	return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+	return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page)	(phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page)	(phys_to_machine(page_to_pseudophys(page)))
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ * However, we truncate the address to unsigned int to avoid undesirable
+ * promotions in legacy drivers.
+ */
+#define isa_virt_to_bus(_x) ({ \
+	unsigned long _va_ = (unsigned long)(_x); \
+	_va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+	? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+	: ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
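+ *
+ * (Editor's illustrative sketch, not part of the original patch: for a
+ * device whose MMIO BAR is described by hypothetical 'bar_phys' and
+ * 'bar_len',
+ *
+ *	void __iomem *regs = ioremap(bar_phys, bar_len);
+ *	u32 status;
+ *
+ *	if (!regs)
+ *		return -ENOMEM;
+ *	status = readl(regs + 0x10);
+ *	iounmap(regs);
+ *
+ * with 0x10 standing in for a device-specific register offset.)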
+ */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + + +#ifdef CONFIG_X86_32 +# include "../../asm/io_32.h" +#else +# include "../../asm/io_64.h" +#endif + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +/* We will be supplying our own /dev/mem implementation */ +#define ARCH_HAS_DEV_MEM + +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ + (unsigned long) (bv)->bv_offset) + +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \ + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \ + == bvec_to_pseudophys(vec2)) + +#undef __ISA_IO_base + +#endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val); +extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. + */ +extern void early_ioremap_init(void); +extern void early_ioremap_reset(void); +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); + +#define IO_SPACE_LIMIT 0xffff + +#endif /* _ASM_X86_IO_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/ipi.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/ipi.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_IPI_H +#define _ASM_X86_IPI_H + +#include +#include + +void xen_send_IPI_mask(const struct cpumask *, int vector); +void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector); +void xen_send_IPI_allbutself(int vector); +void xen_send_IPI_all(int vector); +void xen_send_IPI_self(int vector); + +#endif /* _ASM_X86_IPI_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/irq_vectors.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/irq_vectors.h @@ -0,0 +1,94 @@ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#define MCE_VECTOR 0x12 + +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 +#else +# define IA32_SYSCALL_VECTOR 0x80 +#endif + +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define CALL_FUNC_SINGLE_VECTOR 2 +#define SPIN_UNLOCK_VECTOR 3 +#define REBOOT_VECTOR 4 +#define NR_IPIS 5 + +/* + * The maximum number of vectors supported by i386 processors + * is limited to 256. For processors other than i386, NR_VECTORS + * should be changed accordingly. + */ +#define NR_VECTORS 256 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +#ifndef __ASSEMBLY__ +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} +#endif + +/* + * Size the maximum number of interrupts. 
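+ * (Editor's worked example of the sizing rules spelled out below: with
+ * NR_CPUS = 8 and MAX_IO_APICS = 64, CPU_VECTOR_LIMIT is 8 * 8 = 64 and
+ * IO_APIC_VECTOR_LIMIT is 32 * 64 = 2048, so with CONFIG_SPARSE_IRQ set
+ * NR_PIRQS evaluates to NR_VECTORS + 2048 = 2304.)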
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
+
+#define NR_IRQS_LEGACY			16
+
+/*
+ * The flat IRQ space is divided into two regions:
+ *  1. A one-to-one mapping of real physical IRQs. This space is only used
+ *     if we have physical device-access privilege. This region is at the
+ *     start of the IRQ space so that existing device drivers do not need
+ *     to be modified to translate physical IRQ numbers into our IRQ space.
+ *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ *     are bound using the provided bind/unbind functions.
+ */
+#define PIRQ_BASE			0
+
+#define CPU_VECTOR_LIMIT		( 8 * NR_CPUS )
+#define IO_APIC_VECTOR_LIMIT		( 32 * MAX_IO_APICS )
+
+#ifdef CONFIG_X86_IO_APIC
+# if !defined(NR_CPUS) || !defined(MAX_IO_APICS)
+/* nothing */
+# elif defined(CONFIG_SPARSE_IRQ)
+# define NR_PIRQS \
+	(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
+		(NR_VECTORS + CPU_VECTOR_LIMIT) : \
+		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+# elif NR_CPUS < MAX_IO_APICS
+# define NR_PIRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
+# else
+# define NR_PIRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+# endif
+#else /* !CONFIG_X86_IO_APIC: */
+# define NR_PIRQS NR_IRQS_LEGACY
+#endif
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ)
+extern int nr_pirqs;
+#else
+# define nr_pirqs NR_PIRQS
+#endif
+#endif
+
+#define DYNIRQ_BASE			(PIRQ_BASE + nr_pirqs)
+#define NR_DYNIRQS			(64 + CONFIG_XEN_NR_GUEST_DEVICES)
+
+#define NR_IRQS				(NR_PIRQS + NR_DYNIRQS)
+
+#endif /* _ASM_X86_IRQ_VECTORS_H */
--- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/irqflags.h
+++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/irqflags.h
@@ -0,0 +1,223 @@
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include
+
+#ifndef __ASSEMBLY__
+/*
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operations must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
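+ *
+ * (Editor's note with a sketch, not from the original patch: under Xen
+ * the virtual "interrupt flag" is the per-vcpu evtchn_upcall_mask byte,
+ * so the usual pattern
+ *
+ *	unsigned long flags;
+ *
+ *	local_irq_save(flags);
+ *	shared_counter++;
+ *	local_irq_restore(flags);
+ *
+ * expands via the macros below into reads and writes of that mask
+ * rather than pushf/cli/popf; 'shared_counter' is just a stand-in for
+ * the caller's critical-section work.)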
+ */ + +#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask) + +#define xen_restore_fl(f) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ + barrier(); /* unmask then check (avoid races) */\ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define xen_irq_disable() \ +do { \ + vcpu_info_write(evtchn_upcall_mask, 1); \ + barrier(); \ +} while (0) + +#define xen_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +void xen_safe_halt(void); + +void xen_halt(void); + +#define __raw_local_save_flags() xen_save_fl() + +#define raw_local_irq_restore(flags) xen_restore_fl(flags) + +#define raw_local_irq_disable() xen_irq_disable() + +#define raw_local_irq_enable() xen_irq_enable() + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static inline void raw_safe_halt(void) +{ + xen_safe_halt(); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static inline void halt(void) +{ + xen_halt(); +} + +/* + * For spinlocks, etc: + */ +#define __raw_local_irq_save() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_local_irq_disable(); \ + \ + flags; \ +}) +#else + +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#ifdef CONFIG_X86_64 +# define __REG_si %rsi +# define __CPU_num PER_CPU_VAR(cpu_number) +#else +# define __REG_si %esi +# define __CPU_num TI_cpu(%ebp) +#endif + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + +#define GET_VCPU_INFO PER_CPU(vcpu_info, __REG_si) +#define __DISABLE_INTERRUPTS movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __ENABLE_INTERRUPTS movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __TEST_PENDING cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0) +#define DISABLE_INTERRUPTS(clb) __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 8 +#define __SIZEOF_TEST_PENDING 8 + +#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl __CPU_num,%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + add HYPERVISOR_shared_info,__REG_si +#else +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si) +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 4 +#define __SIZEOF_TEST_PENDING 3 + +#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#ifndef CONFIG_X86_64 +#define INTERRUPT_RETURN iret +#define ENABLE_INTERRUPTS_SYSEXIT \ + movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */ ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */ ; \ + jnz 14f /* process more events if necessary... 
*/ ; \ + movl PT_ESI(%esp), %esi ; \ + sysexit ; \ +14: movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */ ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + mov $__KERNEL_PERCPU, %ecx ; \ + push %esp ; \ + mov %ecx, %fs ; \ + SET_KERNEL_GS %ecx ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#endif + + +#endif /* __ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_irqs_disabled_flags(flags); \ +}) + +#else + +#ifdef CONFIG_X86_64 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + ENABLE_INTERRUPTS(CLBR_NONE); \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + __DISABLE_INTERRUPTS; \ + TRACE_IRQS_OFF; + +#else +#define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT_IRQ +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif + +#endif /* __ASSEMBLY__ */ +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/mach_traps.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/mach_traps.h @@ -0,0 +1,33 @@ +/* + * include/asm-xen/asm-i386/mach-xen/mach_traps.h + * + * Machine specific NMI handling for Xen + */ +#ifndef _MACH_TRAPS_H +#define _MACH_TRAPS_H + +#include +#include + +static inline void clear_mem_error(unsigned char reason) {} +static inline void clear_io_check_error(unsigned char reason) {} + +static inline unsigned char get_nmi_reason(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned char reason = 0; + + /* construct a value which looks like it came from + * port 0x61. + */ + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason)) + reason |= 0x40; + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason)) + reason |= 0x80; + + return reason; +} + +static inline void reassert_nmi(void) {} + +#endif /* !_MACH_TRAPS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/maddr.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/maddr.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "maddr_32.h" +#else +# include "maddr_64.h" +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/maddr_32.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/maddr_32.h @@ -0,0 +1,182 @@ +#ifndef _I386_MADDR_H +#define _I386_MADDR_H + +#include +#include +#include + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. 
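+ * (Editor's note, not from the original patch: FOREIGN_FRAME() above
+ * tags an MFN mapped from another domain by setting bit 31 of the p2m
+ * entry, and lookups strip the tag again, so for a local pfn in a
+ * non-autotranslated guest
+ *
+ *	set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+ *	BUG_ON(pfn_to_mfn(pfn) != mfn);
+ *
+ * holds because pfn_to_mfn() masks with ~FOREIGN_FRAME_BIT.)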
+ */
+#ifdef CONFIG_X86_PAE
+typedef unsigned long long paddr_t;
+typedef unsigned long long maddr_t;
+#else
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+#endif
+
+#ifdef CONFIG_XEN
+
+extern unsigned long *phys_to_machine_mapping;
+extern unsigned long max_mapnr;
+
+#undef machine_to_phys_mapping
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return pfn;
+	if (likely(max_mapnr))
+		BUG_ON(pfn >= max_mapnr);
+	return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return 1;
+	if (likely(max_mapnr))
+		BUG_ON(pfn >= max_mapnr);
+	return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+	unsigned long pfn;
+
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return mfn;
+
+	if (unlikely((mfn >> machine_to_phys_order) != 0))
+		return max_mapnr;
+
+	/* The array access can fail (e.g., device space beyond end of RAM). */
+	asm (
+		"1:	movl %1,%0\n"
+		"2:\n"
+		".section .fixup,\"ax\"\n"
+		"3:	movl %2,%0\n"
+		"	jmp  2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b,3b\n"
+		".previous"
+		: "=r" (pfn)
+		: "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+	return pfn;
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
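+ *
+ * (Editor's sketch of the ownership test described above, assuming a
+ * non-autotranslated guest:
+ *
+ *	unsigned long pfn = mfn_to_pfn(mfn);
+ *	bool ours = pfn < max_mapnr && pfn_to_mfn(pfn) == mfn;
+ *
+ * which is essentially the check mfn_to_local_pfn() below performs
+ * before it lets pfn_valid() succeed.)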
+ */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if (likely(pfn < max_mapnr) + && likely(!xen_feature(XENFEAT_auto_translated_physmap)) + && unlikely(phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +#ifdef CONFIG_X86_PAE +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to pfn_to_mfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} +#else +#define pte_phys_to_machine phys_to_machine +#define pte_machine_to_phys machine_to_phys +#endif + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#endif /* _I386_MADDR_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/maddr_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/maddr_64.h @@ -0,0 +1,163 @@ +#ifndef _X86_64_MADDR_H +#define _X86_64_MADDR_H + +#include +#include +#include + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<63) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. 
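+ * (Editor's note, not from the original patch: only the frame number is
+ * translated, the sub-page offset is preserved, so for any valid local
+ * pfn and an offset below PAGE_SIZE
+ *
+ *	paddr_t phys = ((paddr_t)pfn << PAGE_SHIFT) | offset;
+ *
+ * round-trips as machine_to_phys(phys_to_machine(phys)) == phys.)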
+ */
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+
+#ifdef CONFIG_XEN
+
+extern unsigned long *phys_to_machine_mapping;
+extern unsigned long max_mapnr;
+
+#undef machine_to_phys_mapping
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return pfn;
+	if (likely(max_mapnr))
+		BUG_ON(pfn >= max_mapnr);
+	return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return 1;
+	if (likely(max_mapnr))
+		BUG_ON(pfn >= max_mapnr);
+	return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+	unsigned long pfn;
+
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return mfn;
+
+	if (unlikely((mfn >> machine_to_phys_order) != 0))
+		return max_mapnr;
+
+	/* The array access can fail (e.g., device space beyond end of RAM). */
+	asm (
+		"1:	movq %1,%0\n"
+		"2:\n"
+		".section .fixup,\"ax\"\n"
+		"3:	movq %2,%0\n"
+		"	jmp  2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 8\n"
+		"	.quad 1b,3b\n"
+		".previous"
+		: "=r" (pfn)
+		: "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+	return pfn;
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if (likely(pfn < max_mapnr) + && likely(!xen_feature(XENFEAT_auto_translated_physmap)) + && unlikely(phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + maddr_t machine; + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + paddr_t phys; + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#endif /* _X86_64_MADDR_H */ + --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/mmu_context.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/mmu_context.h @@ -0,0 +1,164 @@ +#ifndef _ASM_X86_MMU_CONTEXT_H +#define _ASM_X86_MMU_CONTEXT_H + +#include +#include +#include +#include + +void arch_exit_mmap(struct mm_struct *mm); +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); + +void mm_pin(struct mm_struct *mm); +void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void xen_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + if (!PagePinned(virt_to_page(next->pgd))) + mm_pin(next); +} + +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ +#ifdef CONFIG_X86_32 + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. 
+ */ + lazy_save_gs(current->thread.gs); + lazy_load_gs(__KERNEL_STACK_CANARY); +#else + /* + * Save away %es, %ds, %fs and %gs. Must happen before reload + * of cr3/ldt (i.e., not in __switch_to). + */ + __asm__ __volatile__ ( + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" + : "=m" (current->thread.es), + "=m" (current->thread.ds), + "=m" (current->thread.fsindex), + "=m" (current->thread.gsindex) ); + + if (current->thread.ds) + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); + + if (current->thread.es) + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); + + if (current->thread.fsindex) { + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); + current->thread.fs = 0; + } + + if (current->thread.gsindex) { + load_gs_index(0); + current->thread.gs = 0; + } +#endif +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; +#ifdef CONFIG_X86_64 + pgd_t *upgd; +#endif + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !PagePinned(virt_to_page(next->pgd))); + + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* Re-load page tables: load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = virt_to_mfn(next->pgd); + op++; + + /* xen_new_user_pt(next->pgd) */ +#ifdef CONFIG_X86_64 + op->cmd = MMUEXT_NEW_USER_BASEPTR; + upgd = __user_pgd(next->pgd); + op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0; + op++; +#endif + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) { + /* load_LDT_nolock(&next->context) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + } +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + else { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. 
+ */ + load_cr3(next->pgd); + xen_new_user_pt(next->pgd); + load_LDT_nolock(&next->context); + } + } +#endif +} + +#define activate_mm(prev, next) \ +do { \ + xen_activate_mm(prev, next); \ + switch_mm((prev), (next), NULL); \ +} while (0); + +#ifdef CONFIG_X86_32 +#define deactivate_mm(tsk, mm) \ +do { \ + lazy_load_gs(0); \ +} while (0) +#else +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + loadsegment(fs, 0); \ +} while (0) +#endif + +#endif /* _ASM_X86_MMU_CONTEXT_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pci.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pci.h @@ -0,0 +1,162 @@ +#ifndef _ASM_X86_PCI_H +#define _ASM_X86_PCI_H + +#include /* for struct page */ +#include +#include +#include +#include +#include + +#ifdef __KERNEL__ + +struct pci_sysdata { + int domain; /* PCI domain */ + int node; /* NUMA node */ +#ifdef CONFIG_X86_64 + void *iommu; /* IOMMU private data */ +#endif +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + struct pcifront_device *pdev; +#endif +}; + +extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; + +/* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); +extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); + +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + return sd->domain; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} + + +/* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ + +#ifdef CONFIG_PCI +extern unsigned int pcibios_assign_all_busses(void); +#else +#define pcibios_assign_all_busses() 0 +#endif + +#include +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + +void pcibios_config_init(void); +struct pci_bus *pcibios_scan_root(int bus); + +void pcibios_set_master(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq, int active); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine); + + +#ifdef CONFIG_PCI +extern void early_quirks(void); +static inline void pci_dma_burst_advice(struct pci_dev *pdev, + enum pci_dma_burst_strategy *strat, + unsigned long *strategy_parameter) +{ + *strat = PCI_DMA_BURST_INFINITY; + *strategy_parameter = ~0UL; +} +#else +static inline void early_quirks(void) { } +#endif + +extern void pci_iommu_alloc(void); + +/* MSI arch hooks */ +#define arch_setup_msi_irqs arch_setup_msi_irqs +#define arch_teardown_msi_irqs arch_teardown_msi_irqs + +#define PCI_DMA_BUS_IS_PHYS 0 + +#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) \ + || defined(CONFIG_SWIOTLB) + +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ + dma_addr_t ADDR_NAME; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ + __u32 LEN_NAME; +#define pci_unmap_addr(PTR, ADDR_NAME) \ + ((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + (((PTR)->ADDR_NAME) = (VAL)) +#define pci_unmap_len(PTR, LEN_NAME) \ + ((PTR)->LEN_NAME) +#define 
pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + (((PTR)->LEN_NAME) = (VAL)) + +#else + +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + do { break; } while (pci_unmap_len(PTR, LEN_NAME)) + +#endif + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +#include "../../asm/pci_64.h" +#endif + +/* implement the pci_ DMA API in terms of the generic device dma_ one */ +#include + +/* generic pci stuff */ +#include +#define PCIBIOS_MAX_MEM_32 0xffffffff + +#ifdef CONFIG_NUMA +/* Returns the node based on pci bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + const struct pci_sysdata *sd = bus->sysdata; + + return sd->node; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? cpu_online_mask : + cpumask_of_node(node); +} +#endif + +#endif /* _ASM_X86_PCI_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgalloc.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgalloc.h @@ -0,0 +1,154 @@ +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include +#include /* for struct page */ +#include + +#include /* for phys_to_virt and page_to_pseudophys */ + +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} + +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) {} +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_release_pte(unsigned long pfn) {} +static inline void paravirt_release_pmd(unsigned long pfn) {} +static inline void paravirt_release_pud(unsigned long pfn) {} + +#ifdef CONFIG_X86_64 +void early_make_page_readonly(void *va, unsigned int feature); +pmd_t *early_get_pmd(unsigned long va); +#define make_lowmem_page_readonly make_page_readonly +#define make_lowmem_page_writable make_page_writable +#endif + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. 
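+   (Editor's sketch of that idea, purely hypothetical and implemented
+   nowhere below: each sharer of a page-table page would take a
+   reference, and the last put would free it, roughly
+
+	if (put_page_testzero(page))
+		__free_page(page);
+
+   The comment above records this as possible future work only.)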
*/ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __pte_free(pte); +} + +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); + + paravirt_alloc_pte(mm, pfn); + if (PagePinned(virt_to_page(pmd))) { +#ifndef CONFIG_HIGHPTE + BUG_ON(PageHighMem(pte)); +#endif + set_pmd(pmd, ent); + } else + *pmd = ent; +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); +extern void __pmd_free(pgtable_t); + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pmd)); +} + +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} + +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_t ent = __pud(_PAGE_TABLE | __pa(pmd)); + + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + if (PagePinned(virt_to_page(pud))) + set_pud(pud, ent); + else + *pud = ent; +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud)); + + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + if (unlikely(PagePinned(virt_to_page(pgd)))) + xen_l4_entry_update(pgd, ent); + else + *__user_pgd(pgd) = *pgd = ent; +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)pmd_alloc_one(mm, addr); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pud)); +} + +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#endif /* _ASM_X86_PGALLOC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable-3level.h @@ -0,0 +1,137 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_H +#define _ASM_X86_PGTABLE_3LEVEL_H + +/* + * Intel Physical Address Extension (PAE) Mode - three-level page + * tables on PPro+ CPUs. 
+ * + * Copyright (C) 1999 Ingo Molnar + */ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), \ + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) + +/* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is + * not possible, use pte_get_and_clear to obtain the old pte + * value and then use set_pte to update it. -ben + */ + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((unsigned long long *)(ptep), __pte_val(pte)); +} + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +/* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +static inline void __xen_pte_clear(pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, __pmd(0)) \ + : (void)(*__pmdp = __pmd(0)); \ +}) + +static inline void __xen_pud_clear(pud_t *pudp) +{ + pgdval_t pgd; + + set_pud(pudp, __pud(0)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + * + * Make sure the pud entry we're updating is within the + * current pgd to avoid unnecessary TLB flushes. + */ + pgd = read_cr3(); + if (__pa(pudp) >= pgd && __pa(pudp) < + (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + xen_tlb_flush(); +} + +#define xen_pud_clear(pudp) \ +({ \ + pud_t *__pudp = (pudp); \ + PagePinned(virt_to_page(__pudp)) \ + ? __xen_pud_clear(__pudp) \ + : (void)(*__pudp = __pud(0)); \ +}) + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + uint64_t val = __pte_val(res); + if (__cmpxchg64(ptep, val, 0) != val) { + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low = xchg(&ptep->pte_low, 0); + res.pte_high = ptep->pte_high; + ptep->pte_high = 0; + } + return res; +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ + ((_pte).pte_high << (32-PAGE_SHIFT))) + +/* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. 
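+ *
+ * (Editor's illustrative round trip for the encoding below, given a
+ * file offset 'off' that fits in 32 bits:
+ *
+ *	pte_t pte = pgoff_to_pte(off);
+ *
+ *	BUG_ON(!pte_file(pte));
+ *	BUG_ON(pte_to_pgoff(pte) != off);
+ *
+ * since _PAGE_FILE lives in pte_low while pte_high holds the offset.)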
+ */ +#define pte_to_pgoff(pte) ((pte).pte_high) +#define pgoff_to_pte(off) \ + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) +#define PTE_FILE_MAX_BITS 32 + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) +#define __swp_type(x) (((x).val) & 0x1f) +#define __swp_offset(x) ((x).val >> 5) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable-3level_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable-3level_types.h @@ -0,0 +1,44 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +typedef u64 pteval_t; +typedef u64 pmdval_t; +typedef u64 pudval_t; +typedef u64 pgdval_t; +typedef u64 pgprotval_t; + +typedef union { + struct { + unsigned long pte_low, pte_high; + }; + pteval_t pte; +} pte_t; +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 + +#define PAGETABLE_LEVELS 3 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + + +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable.h @@ -0,0 +1,742 @@ +#ifndef _ASM_X86_PGTABLE_H +#define _ASM_X86_PGTABLE_H + +#include +#include + +#include + +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + +#ifndef __ASSEMBLY__ + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. 
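+ * (Editor's note: on x86 the mapping does not depend on the address
+ * argument, so
+ *
+ *	struct page *zp = ZERO_PAGE(vaddr);
+ *
+ * yields the same permanently zeroed page for every 'vaddr'.)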
+ */ +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +#ifdef CONFIG_PARAVIRT +#include +#else /* !CONFIG_PARAVIRT */ +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) + +#define set_pte_atomic(ptep, pte) \ + xen_set_pte_atomic(ptep, pte) + +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) xen_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) xen_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) + +#define pgd_val(x) xen_pgd_val(x) +#define __pgd(x) xen_make_pgd(x) + +#ifndef __PAGETABLE_PUD_FOLDED +#define pud_val(x) xen_pud_val(x) +#define __pud(x) xen_make_pud(x) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pmd_val(x) xen_pmd_val(x) +#define __pmd(x) xen_make_pmd(x) +#endif + +#define pte_val(x) xen_pte_val(x) +#define __pte(x) xen_make_pte(x) + +#define arch_end_context_switch(prev) do {} while(0) + +#endif /* CONFIG_PARAVIRT */ + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline int pte_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_DIRTY; +} + +static inline int pte_young(pte_t pte) +{ + return pte_flags(pte) & _PAGE_ACCESSED; +} + +static inline int pte_write(pte_t pte) +{ + return pte_flags(pte) & _PAGE_RW; +} + +static inline int pte_file(pte_t pte) +{ + return pte_flags(pte) & _PAGE_FILE; +} + +static inline int pte_huge(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PSE; +} + +static inline int pte_global(pte_t pte) +{ + return 0; +} + +static inline int pte_exec(pte_t pte) +{ + return !(pte_flags(pte) & _PAGE_NX); +} + +static inline int pte_special(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SPECIAL; +} + +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \ + (_pte).pte_low & _PAGE_PRESENT ? 
\ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline int pmd_large(pmd_t pte) +{ + return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline pte_t pte_set_flags(pte_t pte, pteval_t set) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v | set); +} + +static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v & ~clear); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_NX); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_clrhuge(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_mkglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_clrglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SPECIAL); +} + +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. + */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) +{ + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; + + return protval; +} + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK; + + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; + + return __pte(val); +} + +/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +#define pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; + pgprotval_t addbits = pgprot_val(newprot); + return __pgprot(preservebits | addbits); +} + +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) + +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) + +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) +{ + /* + * PAT type is always WB for ISA. So no need to check. 
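Stepping back to the pte_set_flags()/pte_clear_flags() pair above: every pte_mk*/pte_clr* helper is just one OR or one AND-NOT on the raw value, so they compose freely and never disturb the PFN bits. A userspace sketch of the pattern, with the two bit positions taken from pgtable_types.h:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t pteval_t;

    #define _PAGE_RW        (1ULL << 1)
    #define _PAGE_DIRTY     (1ULL << 6)

    static pteval_t pte_set_flags(pteval_t v, pteval_t set)   { return v | set; }
    static pteval_t pte_clear_flags(pteval_t v, pteval_t clr) { return v & ~clr; }

    int main(void)
    {
            pteval_t pte = _PAGE_RW | _PAGE_DIRTY;

            pte = pte_clear_flags(pte, _PAGE_RW);   /* pte_wrprotect() */
            assert(pte == _PAGE_DIRTY);
            pte = pte_set_flags(pte, _PAGE_RW);     /* pte_mkwrite() */
            assert(pte == (_PAGE_RW | _PAGE_DIRTY));
            return 0;
    }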
+ */ + if (is_ISA_range(paddr, paddr + size - 1)) + return 1; + + /* + * Certain new memtypes are not allowed with certain + * requested memtype: + * - request is uncached, return cannot be write-back + * - request is write-combine, return cannot be write-back + */ + if ((flags == _PAGE_CACHE_UC_MINUS && + new_flags == _PAGE_CACHE_WB) || + (flags == _PAGE_CACHE_WC && + new_flags == _PAGE_CACHE_WB)) { + return 0; + } + + return 1; +} + +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" +#else +# include "pgtable_64.h" +#endif + +#ifndef __ASSEMBLY__ +#include + +static inline int pte_none(pte_t pte) +{ + return !pte.pte; +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte == b.pte; +} + +static inline int pte_present(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + +static inline int pmd_present(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ + return __pmd_val(pmd) != 0; +#else + return pmd_flags(pmd) & _PAGE_PRESENT; +#endif +} + +static inline int pmd_none(pmd_t pmd) +{ + /* Only check low word on 32-bit platforms, since it might be + out of sync with upper half. */ + return (unsigned long)__pmd_val(pmd) == 0; +} + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) + +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +static inline unsigned long pmd_index(unsigned long address) +{ + return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); +} + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. 
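As an illustration of that conversion: ignoring the phys-to-machine translation that __pte() adds on Xen, pfn_pte() above is a shift plus an OR with the (massaged) protection bits, and pte_pfn()/pte_flags() undo it. A userspace sketch for 4K pages, with illustrative constants:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    static uint64_t pfn_pte(uint64_t pfn, uint64_t prot)
    {
            return (pfn << PAGE_SHIFT) | prot;
    }

    int main(void)
    {
            uint64_t pte = pfn_pte(0xabcd, 0x67);   /* PRESENT|RW|USER|ACCESSED|DIRTY */

            assert((pte >> PAGE_SHIFT) == 0xabcd);  /* pte_pfn()   */
            assert((pte & 0xfff) == 0x67);          /* pte_flags() */
            return 0;
    }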
+ * + * (Currently stuck as a macro because of indirect forward reference + * to linux/mm.h:page_to_nid()) + */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this function returns the index of the entry in the pte page which would + * control the given virtual address + */ +static inline unsigned long pte_index(unsigned long address) +{ + return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); +} + +static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) +{ + return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); +} + +static inline int pmd_bad(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 + return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT) + != (_KERNPG_TABLE & ~_PAGE_PRESENT); +#else + return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; +#endif +} + +static inline unsigned long pages_to_mb(unsigned long npg) +{ + return npg >> (20 - PAGE_SHIFT); +} + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) + +#if PAGETABLE_LEVELS > 2 +static inline int pud_none(pud_t pud) +{ + return __pud_val(pud) == 0; +} + +static inline int pud_present(pud_t pud) +{ + return pud_flags(pud) & _PAGE_PRESENT; +} + +static inline unsigned long pud_page_vaddr(pud_t pud) +{ + return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) + +/* Find an entry in the second-level page table.. */ +static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +{ + return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); +} + +static inline int pud_large(pud_t pud) +{ + return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#else +static inline int pud_large(pud_t pud) +{ + return 0; +} +#endif /* PAGETABLE_LEVELS > 2 */ + +#if PAGETABLE_LEVELS > 3 +static inline int pgd_present(pgd_t pgd) +{ + return pgd_flags(pgd) & _PAGE_PRESENT; +} + +static inline unsigned long pgd_page_vaddr(pgd_t pgd) +{ + return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +/* to find an entry in a page-table-directory. 
*/ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +{ + return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); +} + +static inline int pgd_bad(pgd_t pgd) +{ + return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; +} + +static inline int pgd_none(pgd_t pgd) +{ + return !__pgd_val(pgd); +} +#endif /* PAGETABLE_LEVELS > 3 */ + +#endif /* __ASSEMBLY__ */ + +/* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) + + +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + +#ifndef __ASSEMBLY__ + +#define direct_gbpages 0 + +/* local pte updates need not use xchg for locking */ +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + xen_set_pte(ptep, __pte(0)); + return res; +} + +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_set_pte(ptep, pte); +} + +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + __xen_pte_clear(ptep); +} + +#ifndef CONFIG_PARAVIRT +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to assure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#endif + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. 
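pgd_index() above, like pud_index(), pmd_index() and pte_index() before it, peels nine bits off the virtual address per level (512 entries each); together with the 12-bit page offset the four indices reassemble the canonical part of the address. A userspace sketch with the x86-64 shift values:

    #include <assert.h>

    #define PAGE_SHIFT      12
    #define PMD_SHIFT       21
    #define PUD_SHIFT       30
    #define PGDIR_SHIFT     39

    static unsigned long long idx(unsigned long long va, int shift)
    {
            return (va >> shift) & 511;     /* 512 entries per level */
    }

    int main(void)
    {
            unsigned long long va = 0xffff880000a01000ULL;

            /* nine bits per level plus the page offset give the address back */
            unsigned long long back =
                    (idx(va, PGDIR_SHIFT) << PGDIR_SHIFT) |
                    (idx(va, PUD_SHIFT)   << PUD_SHIFT)   |
                    (idx(va, PMD_SHIFT)   << PMD_SHIFT)   |
                    (idx(va, PAGE_SHIFT)  << PAGE_SHIFT)  |
                    (va & 0xfff);

            assert(back == (va & ((1ULL << 48) - 1)));
            return 0;
    }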
+ */
+struct vm_area_struct;
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+				 unsigned long address, pte_t *ptep,
+				 pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+				     unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+				  unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+#define ptep_clear_flush(vma, addr, ptep)			\
+({								\
+	pte_t *__ptep = (ptep);					\
+	pte_t __res = *__ptep;					\
+	if (!pte_none(__res) &&					\
+	    ((vma)->vm_mm != current->mm ||			\
+	     HYPERVISOR_update_va_mapping(addr, __pte(0),	\
+		uvm_multi(mm_cpumask((vma)->vm_mm)) |		\
+		UVMF_INVLPG))) {				\
+		__xen_pte_clear(__ptep);			\
+		flush_tlb_page(vma, addr);			\
+	}							\
+	__res;							\
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+				       pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (!pte_none(pte)
+	    && (mm != &init_mm
+		|| HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+		pte = xen_ptep_get_and_clear(ptep, pte);
+		pte_update(mm, addr, ptep);
+	}
+	return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define ptep_get_and_clear_full(mm, addr, ptep, full)		\
+	((full) ? ({						\
+		pte_t *__ptep = (ptep);				\
+		pte_t __res = *__ptep;				\
+		if (!PagePinned(virt_to_page((mm)->pgd)))	\
+			__xen_pte_clear(__ptep);		\
+		else if (!pte_none(__res))			\
+			xen_l1_entry_update(__ptep, __pte(0));	\
+		__res;						\
+	}) :							\
+	ptep_get_and_clear(mm, addr, ptep))
+
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (pte_write(pte))
+		set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+}
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ *  dst - pointer to pgd range anywhere on a pgd page
+ *  src - ""
+ *  count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
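The ptep_get_and_clear_full() dispatch above hinges on PagePinned(): until a page table is pinned (registered read-only with the hypervisor), plain stores into it are fine, and afterwards every update has to go through a hypercall. A rough userspace model of that split; all names here are local stand-ins, not the real Xen API:

    #include <assert.h>
    #include <stdbool.h>

    static bool pinned;                     /* stand-in for PagePinned() */
    static int hypercalls;
    static unsigned long pte_slot = 0x1234; /* the live PTE */

    static void l1_entry_update(unsigned long v)    /* xen_l1_entry_update() */
    {
            hypercalls++;
            pte_slot = v;
    }

    static void clear_pte_full(void)
    {
            if (!pinned)
                    pte_slot = 0;           /* fast path: direct store */
            else
                    l1_entry_update(0);     /* pinned: hypervisor must do it */
    }

    int main(void)
    {
            clear_pte_full();
            assert(pte_slot == 0 && hypercalls == 0);

            pinned = true;
            pte_slot = 0x5678;
            clear_pte_full();
            assert(pte_slot == 0 && hypercalls == 1);
            return 0;
    }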
+ */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + +#define arbitrary_virt_to_mfn(va) \ +({ \ + unsigned int __lvl; \ + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ + pte_mfn(*__ptep); \ +}) + +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + +#ifdef CONFIG_HIGHPTE +#include +struct page *kmap_atomic_to_page(void *); +#define ptep_to_machine(ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + page_to_phys(kmap_atomic_to_page(__ptep)) \ + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ +}) +#else +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) + return ptep_get_and_clear(mm, addr, ptep); +#endif + return *ptep; +} + +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + mmu_update_t u; + +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) { + set_pte_at(mm, addr, ptep, pte); + return; + } +#endif + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD; + u.val = __pte_val(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF)) + BUG(); +} + +#include + +#include +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +struct vm_area_struct; + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_32.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_32.h @@ -0,0 +1,99 @@ +#ifndef _ASM_X86_PGTABLE_32_H +#define _ASM_X86_PGTABLE_32_H + +#include + +/* + * The Linux memory management assumes a three-level page table setup. On + * the i386, we use that, but "fold" the mid level into the top-level page + * table, so that we physically have the same two-level page table as the + * i386 mmu expects. + * + * This file contains the functions and defines necessary to modify and use + * the i386 page table tree. 
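The ptep_modify_prot_start()/_commit() pair above exists so that a protection change cannot lose accessed/dirty bits the CPU sets in between; with XENFEAT_mmu_pt_update_preserve_ad the commit is an mmu_update hypercall flagged MMU_PT_UPDATE_PRESERVE_AD rather than a clear-and-rewrite. A userspace model of why the preserve matters, every name local to the sketch:

    #include <assert.h>

    typedef unsigned long pteval_t;

    #define _PAGE_RW        (1UL << 1)
    #define _PAGE_ACCESSED  (1UL << 5)

    static pteval_t pte_slot = _PAGE_RW;    /* the live PTE */

    static pteval_t modify_prot_start(void) /* snapshot; A/D may still change */
    {
            return pte_slot;
    }

    static void modify_prot_commit(pteval_t new)
    {
            /* stand-in for the PRESERVE_AD hypercall: keep bits the
             * hardware set after the snapshot was taken */
            pte_slot = new | (pte_slot & _PAGE_ACCESSED);
    }

    int main(void)
    {
            pteval_t old = modify_prot_start();

            pte_slot |= _PAGE_ACCESSED;             /* hardware races with us */
            modify_prot_commit(old & ~_PAGE_RW);    /* write-protect the page */

            assert(!(pte_slot & _PAGE_RW));
            assert(pte_slot & _PAGE_ACCESSED);      /* not lost by the commit */
            return 0;
    }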
+ */ +#ifndef __ASSEMBLY__ +#include +#include +#include + +#include +#include +#include +#include +#include + +struct vm_area_struct; + +extern pgd_t *swapper_pg_dir; + +static inline void pgtable_cache_init(void) { } +static inline void check_pgt_cache(void) { } +void paging_init(void); + +extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); + + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK + +#ifdef CONFIG_X86_PAE +# include +#else +# include +#endif + +#if defined(CONFIG_HIGHPTE) +#define __KM_PTE \ + (in_nmi() ? KM_NMI_PTE : \ + in_irq() ? KM_IRQ_PTE : \ + KM_PTE0) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + pte_index((address))) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + pte_index((address))) +#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) +#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#endif + +/* Clear a kernel PTE and flush it from the TLB */ +#define kpte_clear_flush(ptep, vaddr) \ +do { \ + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \ + BUG(); \ +} while (0) + +/* + * The i386 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +#define update_mmu_cache(vma, address, pte) do { } while (0) + +void make_lowmem_page_readonly(void *va, unsigned int feature); +void make_lowmem_page_writable(void *va, unsigned int feature); + +#endif /* !__ASSEMBLY__ */ + +/* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM + */ +#ifdef CONFIG_FLATMEM +#define kern_addr_valid(addr) (1) +#else +#define kern_addr_valid(kaddr) (0) +#endif + +#endif /* _ASM_X86_PGTABLE_32_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_64.h @@ -0,0 +1,197 @@ +#ifndef _ASM_X86_PGTABLE_64_H +#define _ASM_X86_PGTABLE_64_H + +#include +#include + +#ifndef __ASSEMBLY__ + +/* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. 
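With CONFIG_HIGHPTE the PTE page can live in highmem, so the pte_offset_map() variants above really do a kmap_atomic() that pte_unmap() must undo; without it both collapse to pointer arithmetic and a no-op. A toy model of the pairing discipline the macros impose:

    #include <assert.h>

    static int atomic_kmaps;        /* stand-in for the kmap_atomic slot */

    static unsigned long *pte_map(unsigned long *pte_page, unsigned long idx)
    {
            atomic_kmaps++;         /* kmap_atomic() in the HIGHPTE case */
            return &pte_page[idx];
    }

    static void pte_release(void)
    {
            atomic_kmaps--;         /* kunmap_atomic() */
    }

    int main(void)
    {
            unsigned long pte_page[512] = { 0 };
            unsigned long *pte = pte_map(pte_page, 7);

            *pte = 0x1234;                  /* touch it only while mapped */
            pte_release();
            assert(atomic_kmaps == 0);      /* every map has its unmap */
            return 0;
    }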
+ */ +#include +#include +#include +#include + +#ifdef CONFIG_XEN +extern pud_t level3_user_pgt[512]; + +extern void xen_init_pt(void); +extern void xen_switch_pt(void); +#endif + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pmd_t level2_fixmap_pgt[512]; +extern pmd_t level2_ident_pgt[512]; +extern pgd_t init_level4_pgt[]; + +#define swapper_pg_dir init_level4_pgt + +extern void paging_init(void); + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pud_val(e), \ + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +struct mm_struct; + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); + + +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0)) + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + xen_set_pte(ptep, pte); +} + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret) +{ + return __pte_ma(xchg(&xp->pte, 0)); +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, xen_make_pmd(0)) \ + : (void)(*__pmdp = xen_make_pmd(0)); \ +}) + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +#define xen_pud_clear(pud) \ +({ \ + pud_t *__pudp = (pud); \ + PagePinned(virt_to_page(__pudp)) \ + ? set_pud(__pudp, xen_make_pud(0)) \ + : (void)(*__pudp = xen_make_pud(0)); \ +}) + +static inline pgd_t *__user_pgd(pgd_t *pgd) +{ + if (unlikely(((unsigned long)pgd & PAGE_MASK) + == (unsigned long)init_level4_pgt)) + return NULL; + return (pgd_t *)(virt_to_page(pgd)->index + + ((unsigned long)pgd & ~PAGE_MASK)); +} + +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + xen_l4_entry_update(pgdp, pgd); +} + +#define xen_pgd_clear(pgd) \ +({ \ + pgd_t *__pgdp = (pgd); \ + PagePinned(virt_to_page(__pgdp)) \ + ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \ + : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \ +}) + +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) + +extern unsigned long early_arbitrary_virt_to_mfn(void *va); + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +/* + * Level 4 access. + */ +static inline int pgd_large(pgd_t pgd) { return 0; } +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + +/* PUD - Level3 access */ + +/* PMD - Level 2 access */ +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT + +/* PTE - Level 1 access. 
*/ + +/* x86-64 always has all page tables mapped. */ +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) +#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define update_mmu_cache(vma, address, pte) do { } while (0) + +/* Encode and de-code a swap entry */ +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + +extern int kern_addr_valid(unsigned long addr); +extern void cleanup_highmap(void); + +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +#define pgtable_cache_init() do { } while (0) +#define check_pgt_cache() do { } while (0) + +#define PAGE_AGP PAGE_KERNEL_NOCACHE +#define HAVE_PAGE_AGP 1 + +/* fs/proc/kcore.c */ +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) +#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) + +#define __HAVE_ARCH_PTE_SAME +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_64_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_64_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_64_types.h @@ -0,0 +1,64 @@ +#ifndef _ASM_X86_PGTABLE_64_DEFS_H +#define _ASM_X86_PGTABLE_64_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +/* + * These are used to make use of C type-checking.. + */ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef union { pteval_t pte; unsigned int pte_low; } pte_t; + +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 4 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
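With the bit numbers from pgtable_types.h (_PAGE_BIT_PRESENT is 0, _PAGE_BIT_FILE aliases DIRTY at 6, _PAGE_BIT_PROTNONE aliases GLOBAL at 8), the swap encoding above works out to SWP_TYPE_BITS == 5 and SWP_OFFSET_SHIFT == 9: the type sits in bits 1..5, the offset from bit 9 up, and bits 0, 6 and 8 stay clear, so a swap PTE can never look present, file-backed or PROT_NONE. A userspace sketch of those numbers:

    #include <assert.h>

    #define SWP_TYPE_BITS           5
    #define SWP_OFFSET_SHIFT        9

    static unsigned long swp_entry(unsigned long type, unsigned long off)
    {
            return (type << 1) | (off << SWP_OFFSET_SHIFT);
    }

    int main(void)
    {
            unsigned long e = swp_entry(3, 0x1234);

            assert(((e >> 1) & ((1UL << SWP_TYPE_BITS) - 1)) == 3);
            assert((e >> SWP_OFFSET_SHIFT) == 0x1234);
            assert((e & ((1UL << 0) | (1UL << 6) | (1UL << 8))) == 0);
            return 0;
    }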
*/ +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_types.h @@ -0,0 +1,385 @@ +#ifndef _ASM_X86_PGTABLE_DEFS_H +#define _ASM_X86_PGTABLE_DEFS_H + +#include +#include + +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define __HAVE_ARCH_PTE_SPECIAL + +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#else +#define _PAGE_NX (_AT(pteval_t, 0)) +#endif + +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define 
_KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | __kernel_page_user) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + +/* + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). + */ +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WT (_PAGE_PWT) +#define _PAGE_CACHE_WC (_PAGE_PAT) +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC 
__pgprot(__PAGE_KERNEL_IO_WC) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +/* + * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection + * bits are combined, this will alow user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + +#ifdef CONFIG_X86_32 +# include +#else +# include "pgtable_64_types.h" +#endif + +#ifndef __ASSEMBLY__ + +#include + +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; + +#include + +typedef struct { pgdval_t pgd; } pgd_t; + +#define __pgd_ma(x) ((pgd_t) { (x) } ) +static inline pgd_t xen_make_pgd(pgdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pgd_t) { val }; +} + +#define __pgd_val(x) ((x).pgd) +static inline pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = __pgd_val(pgd); +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} + +static inline pgdval_t pgd_flags(pgd_t pgd) +{ + return __pgd_val(pgd) & PTE_FLAGS_MASK; +} + +#if PAGETABLE_LEVELS > 3 +typedef struct { pudval_t pud; } pud_t; + +#define __pud_ma(x) ((pud_t) { (x) } ) +static inline pud_t xen_make_pud(pudval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pud_t) { val }; +} + +#define __pud_val(x) ((x).pud) +static inline pudval_t xen_pud_val(pud_t pud) +{ + pudval_t ret = __pud_val(pud); + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} +#else +#include + +#define __pud_val(x) __pgd_val((x).pgd) +static inline pudval_t xen_pud_val(pud_t pud) +{ + return xen_pgd_val(pud.pgd); +} +#endif + +#if PAGETABLE_LEVELS > 2 +typedef struct { pmdval_t pmd; } pmd_t; + +#define __pmd_ma(x) ((pmd_t) { (x) } ) +static inline pmd_t xen_make_pmd(pmdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pmd_t) { val }; +} + +#define __pmd_val(x) ((x).pmd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = __pmd_val(pmd); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} +#else +#include + +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } ) +#define __pmd_val(x) __pgd_val((x).pud.pgd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + return 
xen_pgd_val(pmd.pud.pgd); +} +#endif + +static inline pudval_t pud_flags(pud_t pud) +{ + return __pud_val(pud) & PTE_FLAGS_MASK; +} + +static inline pmdval_t pmd_flags(pmd_t pmd) +{ + return __pmd_val(pmd) & PTE_FLAGS_MASK; +} + +#define __pte_ma(x) ((pte_t) { .pte = (x) } ) +static inline pte_t xen_make_pte(pteval_t val) +{ + if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pte_t) { .pte = val }; +} + +#define __pte_val(x) ((x).pte) +static inline pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = __pte_val(pte); + if (likely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + return __pte_val(pte) & PTE_FLAGS_MASK; +} + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + + +typedef struct page *pgtable_t; + +extern pteval_t __supported_pte_mask; +extern void set_nx(void); +extern int nx_enabled; + +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); + +/* Install a pte for a particular vaddr in kernel space. */ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + +enum { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_NUM +}; + +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. + */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/processor.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/processor.h @@ -0,0 +1,1002 @@ +#ifndef _ASM_X86_PROCESSOR_H +#define _ASM_X86_PROCESSOR_H + +#include + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +static inline void *current_text_addr(void) +{ + void *pc; + + asm volatile("mov $1f, %0; 1:":"=r" (pc)); + + return pc; +} + +#ifdef CONFIG_X86_VSMP +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. 
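The per-CPU structure introduced here is padded to SMP_CACHE_BYTES so that two CPUs' entries never share (and bounce) a cache line. A small GCC sketch of the same idiom, assuming 64-byte lines:

    #include <assert.h>
    #include <stdint.h>

    struct percpu_stat {
            unsigned long counter;
    } __attribute__((__aligned__(64)));

    int main(void)
    {
            struct percpu_stat stats[2];

            assert(sizeof(struct percpu_stat) == 64);
            assert((uintptr_t)&stats[1] - (uintptr_t)&stats[0] == 64);
            return 0;
    }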
+ * Members of this structure are referenced in head.S, so think twice + * before touching them. [mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ + char hlt_works_ok; + char hard_math; + char rfu; + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; +#else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; +#endif + __u8 x86_virt_bits; + __u8 x86_phys_bits; + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) + /* cpus sharing the last level cache: */ + cpumask_var_t llc_shared_map; +#endif + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; + u16 x86_clflush_size; +#ifdef CONFIG_SMP + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Core id: */ + u16 cpu_core_id; + /* Index into per_cpu list: */ + u16 cpu_index; +#endif + unsigned int x86_hyper_vendor; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 + +#define X86_VENDOR_UNKNOWN 0xff + +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 +#define X86_HYPER_VENDOR_XEN 'X' + +/* + * capabilities of CPUs + */ +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; + +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; + +#ifdef CONFIG_SMP +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#define current_cpu_data __get_cpu_var(cpu_info) +#else +#define cpu_data(cpu) boot_cpu_data +#define current_cpu_data boot_cpu_data +#endif + +extern const struct seq_operations cpuinfo_op; + +static inline int hlt_works(int cpu) +{ +#ifdef CONFIG_X86_32 + return cpu_data(cpu).hlt_works_ok; +#else + return 1; +#endif +} + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern void cpu_detect(struct cpuinfo_x86 *c); + +extern struct pt_regs *idle_regs(struct pt_regs *); + +extern void early_cpu_init(void); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +extern void detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); + +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. 
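Because ecx is an input, sub-leaved CPUID queries have to seed it explicitly, which is what cpuid_count() further down provides and what plain cpuid() (it zeroes ecx) cannot do. A standalone sketch using GCC's cpuid.h helper against Intel's deterministic-cache leaf 4, purely for illustration:

    #include <stdio.h>
    #include <cpuid.h>      /* GCC helper: __cpuid_count(leaf, count, a, b, c, d) */

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx, i;

            for (i = 0; i < 8; i++) {
                    __cpuid_count(4, i, eax, ebx, ecx, edx);
                    if ((eax & 0x1f) == 0)  /* cache type 0: no more caches */
                            break;
                    printf("index %u: level %u, type %u\n",
                           i, (eax >> 5) & 0x7, eax & 0x1f);
            }
            return 0;
    }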
*/ + asm(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline void load_cr3(pgd_t *pgdir) +{ + write_cr3(__pa(pgdir)); +} + +#ifndef CONFIG_X86_NO_TSS +#ifdef CONFIG_X86_32 +/* This is the TSS defined by the hardware. */ +struct x86_hw_tss { + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + /* ss1 caches MSR_IA32_SYSENTER_CS: */ + unsigned short ss1, __ss1h; + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long bx; + unsigned long sp; + unsigned long bp; + unsigned long si; + unsigned long di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace; + unsigned short io_bitmap_base; + +} __attribute__((packed)); +extern struct tss_struct doublefault_tss; +#else +struct x86_hw_tss { + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + +} __attribute__((packed)) ____cacheline_aligned; +#endif +#endif /* CONFIG_X86_NO_TSS */ + +/* + * IO-bitmap sizes: + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + /* + * The hardware state: + */ + struct x86_hw_tss x86_tss; + + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + + /* + * .. 
and then another 0x100 bytes for the emergency kernel stack: + */ + unsigned long stack[64]; + +} ____cacheline_aligned; + +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); + +/* + * Save the original ist values for checking stack pointers during debugging + */ +struct orig_ist { + unsigned long ist[7]; +}; +#endif /* CONFIG_X86_NO_TSS */ + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; +}; + +struct i387_fxsave_struct { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +struct ymmh_struct { + /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ + u32 ymmh_space[64]; +}; + +struct xsave_hdr_struct { + u64 xstate_bv; + u64 reserved1[2]; + u64 reserved2[5]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + +union thread_xstate { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; + struct xsave_struct xsave; +}; + +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_TSS +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +union irq_stack_union { + char irq_stack[IRQ_STACK_SIZE]; + /* + * GCC hardcodes the stack canary as %gs:40. Since the + * irq_stack is the object at %gs:0, we reserve the bottom + * 48 bytes of the irq stack for the canary. + */ + struct { + char gs_base[40]; + unsigned long stack_canary; + }; +}; + +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_INIT_PER_CPU(irq_stack_union); + +DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(unsigned int, irq_count); +extern unsigned long kernel_eflags; +extern asmlinkage void ignore_sysret(void); +#else /* X86_64 */ +#ifdef CONFIG_CC_STACKPROTECTOR +/* + * Make sure stack canary segment base is cached-aligned: + * "For Intel Atom processors, avoid non zero segment base address + * that is not aligned to cache line boundary at all cost." + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) 
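The canary placement spelled out above is an ABI contract with the compiler: %gs:40 on 64-bit (right behind the 40-byte gs_base), %gs:20 on 32-bit. A compile-time shape check of both layouts, with IRQ_STACK_SIZE taken as 16K purely for the sketch:

    #include <assert.h>
    #include <stddef.h>

    union irq_stack_union {
            char irq_stack[16384];          /* IRQ_STACK_SIZE, assumed */
            struct {
                    char gs_base[40];
                    unsigned long stack_canary;
            };
    };

    struct stack_canary {
            char __pad[20];
            unsigned long canary;
    };

    int main(void)
    {
            assert(offsetof(union irq_stack_union, stack_canary) == 40);
            assert(offsetof(struct stack_canary, canary) == 20);
            return 0;
    }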
+ */ +struct stack_canary { + char __pad[20]; /* canary at %gs:20 */ + unsigned long canary; +}; +DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif +#endif /* X86_64 */ + +extern unsigned int xstate_size; +extern void free_thread_xstate(struct task_struct *); +extern struct kmem_cache *task_xstate_cachep; + +struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; +#ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +#else + unsigned long usersp; /* Copy from PDA */ + unsigned short es; + unsigned short ds; + unsigned short fsindex; + unsigned short gsindex; +#endif +#ifdef CONFIG_X86_32 + unsigned long ip; +#endif +#ifdef CONFIG_X86_64 + unsigned long fs; +#endif + unsigned long gs; + /* Hardware debugging registers: */ + unsigned long debugreg0; + unsigned long debugreg1; + unsigned long debugreg2; + unsigned long debugreg3; + unsigned long debugreg6; + unsigned long debugreg7; + /* Fault info: */ + unsigned long cr2; + unsigned long trap_no; + unsigned long error_code; + /* floating point and extended processor state */ + union thread_xstate *xstate; +#ifdef CONFIG_X86_32 + /* Virtual 86 mode info */ + struct vm86_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, saved_sp0; + unsigned int saved_fs, saved_gs; +#endif + /* IO permissions: */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; + /* Max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ + unsigned long debugctlmsr; + /* Debug Store context; see asm/ds.h */ + struct ds_context *ds_ctx; +}; + +static inline unsigned long xen_get_debugreg(int regno) +{ + return HYPERVISOR_get_debugreg(regno); +} + +static inline void xen_set_debugreg(int regno, unsigned long value) +{ + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); +} + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); +} + +#ifndef CONFIG_X86_NO_TSS +static inline void +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->x86_tss.sp0 = thread->sp0; +#ifdef CONFIG_X86_32 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +#endif +} +#else +#define xen_load_sp0(tss, thread) do { \ + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \ + BUG(); \ +} while (0) +#endif + +#define __cpuid xen_cpuid +#define paravirt_enabled() 0 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = xen_get_debugreg(register) +#define set_debugreg(value, register) \ + xen_set_debugreg(register, value) + +#define load_sp0 xen_load_sp0 + +#define set_iopl_mask xen_set_iopl_mask + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. 
+ */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4(unsigned long mask) +{ + unsigned cr4; + + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4(unsigned long mask) +{ + unsigned cr4; + + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +typedef struct { + unsigned long seg; +} mm_segment_t; + + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy state */ +extern void prepare_to_copy(struct task_struct *tsk); + +unsigned long get_wchan(struct task_struct *p); + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static inline void cpu_relax(void) +{ + rep_nop(); +} + +/* Stop speculative execution and prefetching of modified code. */ +static inline void sync_core(void) +{ + int tmp; + +#if defined(CONFIG_M386) || defined(CONFIG_M486) + if (boot_cpu_data.x86 < 5) + /* There is no speculative execution. + * jmp is a barrier to prefetching. */ + asm volatile("jmp 1f\n1:\n" ::: "memory"); + else +#endif + /* cpuid is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. 
*/ + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + +extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void init_c1e_mask(void); + +extern unsigned long boot_option_idle_override; +extern unsigned long idle_halt; +extern unsigned long idle_nomwait; + +#ifndef CONFIG_XEN +/* + * on systems with caches, caches must be flashed as the absolute + * last instruction before going into a suspended halt. Otherwise, + * dirty data can linger in the cache and become stale on resume, + * leading to strange errors. + * + * perform a variety of operations to guarantee that the compiler + * will not reorder instructions. wbinvd itself is serializing + * so the processor will not reorder. + * + * Systems without cache can just go into halt. + */ +static inline void wbinvd_halt(void) +{ + mb(); + /* check for clflush to determine if wbinvd is legal */ + if (cpu_has_clflush) + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); + else + while (1) + halt(); +} +#endif + +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +/* Defined in head.S */ +extern struct desc_ptr early_gdt_descr; + +extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); +extern void cpu_init(void); + +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + +static inline void update_debugctlmsr_on_cpu(int cpu, + unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + +/* + * from system description table in BIOS. 
Mostly for MCA use, but + * others may find it useful: + */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; + +/* Boot loader type from the setup header: */ +extern int bootloader_type; +extern int bootloader_version; + +extern char ignore_fpu_irq; + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +#ifdef CONFIG_X86_32 +# define BASE_PREFETCH ASM_NOP4 +# define ARCH_HAS_PREFETCH +#else +# define BASE_PREFETCH "prefetcht0 (%1)" +#endif + +/* + * Prefetch instructions for Pentium III (+) and AMD Athlon (+) + * + * It's not worth to care about 3dnow prefetches for the K6 + * because they are microcoded there and very slow. + */ +static inline void prefetch(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +/* + * 3dnow prefetch to get an exclusive cache line. + * Useful for spinlocks to avoid one state transition in the + * cache coherency protocol: + */ +static inline void prefetchw(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +static inline void spin_lock_prefetch(const void *x) +{ + prefetchw(x); +} + +#ifdef CONFIG_X86_32 +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ +} + +/* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .x86_tss = { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +extern unsigned long thread_saved_pc(struct task_struct *tsk); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessable even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the ss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + +#else +/* + * User space process size. 47bits minus one guard page. + */ +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
\ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE_MAX + +#define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define INIT_TSS { \ + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) + +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ +#endif /* CONFIG_X86_64 */ + +extern void start_thread(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp); + +/* + * This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define KSTK_EIP(task) (task_pt_regs(task)->ip) + +/* Get/set a process' ability to use the timestamp counter instruction */ +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + +extern int get_tsc_mode(unsigned long adr); +extern int set_tsc_mode(unsigned int val); + +extern int amd_get_nb_id(int cpu); + +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + +#endif /* _ASM_X86_PROCESSOR_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/setup.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/setup.h @@ -0,0 +1,8 @@ +#ifndef __ASSEMBLY__ + +void xen_start_kernel(void); +void xen_arch_setup(void); + +#endif + +#include_next --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/smp-processor-id.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/smp-processor-id.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_SMP_PROCESSOR_ID_H +#define _ASM_X86_SMP_PROCESSOR_ID_H + +#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__) + +#include + +DECLARE_PER_CPU(int, cpu_number); + +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. 
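+ *
+ * A usage sketch (illustrative only): raw_smp_processor_id() is the
+ * form to use where preemption is not yet, or cannot be, accounted
+ * for, e.g. very early bring-up code:
+ *
+ *	int cpu = raw_smp_processor_id();
+ *	printk(KERN_INFO "booting on CPU%d\n", cpu);
+ *
+ * Elsewhere, prefer smp_processor_id() so that CONFIG_DEBUG_PREEMPT
+ * can flag calls made from preemptible context.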
+ */ +#define raw_smp_processor_id() percpu_read(cpu_number) +#define safe_smp_processor_id() smp_processor_id() + +#ifdef CONFIG_X86_64_SMP +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() +#else +# define smp_processor_id() raw_smp_processor_id() +#endif + +#endif /* SMP && !__ASSEMBLY__ */ + +#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/smp.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/smp.h @@ -0,0 +1,196 @@ +#ifndef _ASM_X86_SMP_H +#define _ASM_X86_SMP_H +#ifndef __ASSEMBLY__ +#include +#include +#include + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifdef CONFIG_X86_LOCAL_APIC +# include +# include +# ifdef CONFIG_X86_IO_APIC +# include +# endif +#endif +#include +#include + +extern int smp_num_siblings; +extern unsigned int num_processors; + +DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +DECLARE_PER_CPU(u16, cpu_llc_id); +DECLARE_PER_CPU(int, cpu_number); + +static inline struct cpumask *cpu_sibling_mask(int cpu) +{ + return per_cpu(cpu_sibling_map, cpu); +} + +static inline struct cpumask *cpu_core_mask(int cpu) +{ + return per_cpu(cpu_core_map, cpu); +} + +DECLARE_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); + +#ifdef CONFIG_SMP + +#ifndef CONFIG_XEN + +/* Static state in head.S used to set up a CPU */ +extern struct { + void *sp; + unsigned short ss; +} stack_start; + +struct smp_ops { + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*smp_send_stop)(void); + void (*smp_send_reschedule)(int cpu); + + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + + void (*send_call_func_ipi)(const struct cpumask *mask); + void (*send_call_func_single_ipi)(int cpu); +}; + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + +#ifndef CONFIG_PARAVIRT +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0) +#endif +extern struct smp_ops smp_ops; + +static inline void smp_send_stop(void) +{ + smp_ops.smp_send_stop(); +} + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} + +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void __cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} + +static inline void arch_send_call_function_single_ipi(int cpu) +{ + smp_ops.send_call_func_single_ipi(cpu); +} + +static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + smp_ops.send_call_func_ipi(mask); +} + +void cpu_disable_common(void); +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +void 
native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum);
+int native_cpu_disable(void);
+void native_cpu_die(unsigned int cpu);
+void native_play_dead(void);
+void play_dead_common(void);
+
+#else /* CONFIG_XEN */
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+void xen_send_call_func_ipi(const struct cpumask *mask);
+void xen_send_call_func_single_ipi(int cpu);
+
+#define smp_send_stop		xen_smp_send_stop
+#define smp_send_reschedule	xen_smp_send_reschedule
+#define arch_send_call_function_single_ipi	xen_send_call_func_single_ipi
+#define arch_send_call_function_ipi_mask	xen_send_call_func_ipi
+
+void play_dead(void);
+
+#endif /* CONFIG_XEN */
+
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)
+
+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
+static inline int num_booting_cpus(void)
+{
+	return cpumask_weight(cpu_callout_mask);
+}
+#endif /* CONFIG_SMP */
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#include
+
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+
+#ifndef CONFIG_X86_64
+static inline int logical_smp_processor_id(void)
+{
+	/* we don't want to mark this access volatile - bad code generation */
+	return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+}
+
+#endif
+
+extern int hard_smp_processor_id(void);
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+# define hard_smp_processor_id()	0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SMP_H */
--- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/spinlock.h
+++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/spinlock.h
@@ -0,0 +1,377 @@
+#ifndef _ASM_X86_SPINLOCK_H
+#define _ASM_X86_SPINLOCK_H
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations.  There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks, which are currently limited to 256
+ * CPUs.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifdef CONFIG_X86_32
+# define LOCK_PTR_REG "a"
+# define REG_PTR_MODE "k"
+#else
+# define LOCK_PTR_REG "D"
+# define REG_PTR_MODE "q"
+#endif
+
+#if defined(CONFIG_X86_32) && \
+	(defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
+/*
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+int xen_spinlock_init(unsigned int cpu);
+void xen_spinlock_cleanup(unsigned int cpu);
+extern int xen_spin_wait(raw_spinlock_t *, unsigned int token);
+extern int xen_spin_wait_flags(raw_spinlock_t *, unsigned int *token,
+			       unsigned int flags);
+extern unsigned int xen_spin_adjust(raw_spinlock_t *, unsigned int token);
+extern void xen_spin_kick(raw_spinlock_t *, unsigned int token);
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourselves to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
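+ *
+ * In rough pseudo-C (illustrative only -- the real fast path below is
+ * a single LOCK XADD, and the Xen slow path blocks in xen_spin_wait()
+ * instead of spinning forever):
+ *
+ *	my_ticket = lock->tail++;	(atomically)
+ *	while (lock->head != my_ticket)
+ *		cpu_relax();
+ *	... critical section ...
+ *	lock->head++;			(unlock; may kick a waiter)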
+ * + * We use an xadd covering *both* parts of the lock, to increment the tail and + * also load the position of the head, which takes care of memory ordering + * issues and should be optimal for the uncontended case. Note the tail must be + * in the high part, because a wide xadd increment of the low part would carry + * up and contaminate the high part. + * + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to + * save some instructions and make the code more elegant. There really isn't + * much between them in performance though, especially as locks are out of line. + */ +#if (NR_CPUS < 256) +#define TICKET_SHIFT 8 +#define __ticket_spin_lock_preamble \ + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \ + "cmpb %h0, %b0\n\t" \ + "sete %1" \ + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \ + : "0" (0x0100) \ + : "memory", "cc") +#define __ticket_spin_lock_body \ + asm("1:\t" \ + "cmpb %h0, %b0\n\t" \ + "je 2f\n\t" \ + "decl %1\n\t" \ + "jz 2f\n\t" \ + "rep ; nop\n\t" \ + "movb %2, %b0\n\t" \ + /* don't need lfence here, because loads are in-order */ \ + "jmp 1b\n" \ + "2:" \ + : "+Q" (token), "+g" (count) \ + : "m" (lock->slock) \ + : "memory", "cc") + + +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +{ + int tmp, new; + + asm("movzwl %2, %0\n\t" + "cmpb %h0, %b0\n\t" + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgw %w1, %2\n\t" + "1:\t" + "sete %b1\n\t" + "movzbl %b1, %0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + return tmp; +} + +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +{ + unsigned int token; + unsigned char kick; + + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" + "movzwl %2, %0\n\t" + "cmpb %h0, %b0\n\t" + "setne %1" + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) + : + : "memory", "cc"); + if (kick) + xen_spin_kick(lock, token); +} +#else +#define TICKET_SHIFT 16 +#define __ticket_spin_lock_preamble \ + do { \ + unsigned int tmp; \ + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \ + "shldl $16, %0, %3\n\t" \ + "cmpw %w3, %w0\n\t" \ + "sete %1" \ + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \ + "=&g" (tmp) \ + : "0" (0x00010000) \ + : "memory", "cc"); \ + } while (0) +#define __ticket_spin_lock_body \ + do { \ + unsigned int tmp; \ + asm("shldl $16, %0, %2\n" \ + "1:\t" \ + "cmpw %w2, %w0\n\t" \ + "je 2f\n\t" \ + "decl %1\n\t" \ + "jz 2f\n\t" \ + "rep ; nop\n\t" \ + "movw %3, %w0\n\t" \ + /* don't need lfence here, because loads are in-order */ \ + "jmp 1b\n" \ + "2:" \ + : "+r" (token), "+g" (count), "=&g" (tmp) \ + : "m" (lock->slock) \ + : "memory", "cc"); \ + } while (0) + +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +{ + int tmp; + int new; + + asm("movl %2, %0\n\t" + "movl %0, %1\n\t" + "roll $16, %0\n\t" + "cmpl %0, %1\n\t" + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgl %1, %2\n" + "1:\t" + "sete %b1\n\t" + "movzbl %b1, %0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + return tmp; +} + +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +{ + unsigned int token, tmp; + bool kick; + + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" + "movl %2, %0\n\t" + "shldl $16, %0, %3\n\t" + "cmpw %w3, %w0\n\t" + "setne %1" + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), "=&r" (tmp) + : + : "memory", "cc"); + if (kick) + xen_spin_kick(lock, token); +} +#endif + +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +{ + int tmp = 
ACCESS_ONCE(lock->slock); + + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); +} + +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; +} + +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +{ + unsigned int token, count; + bool free; + + __ticket_spin_lock_preamble; + if (likely(free)) + return; + token = xen_spin_adjust(lock, token); + do { + count = 1 << 10; + __ticket_spin_lock_body; + } while (unlikely(!count) && !xen_spin_wait(lock, token)); +} + +static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + unsigned int token, count; + bool free; + + __ticket_spin_lock_preamble; + if (likely(free)) + return; + token = xen_spin_adjust(lock, token); + do { + count = 1 << 10; + __ticket_spin_lock_body; + } while (unlikely(!count) && !xen_spin_wait_flags(lock, &token, flags)); +} + +#ifndef CONFIG_PARAVIRT_SPINLOCKS + +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + return __ticket_spin_is_locked(lock); +} + +static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +{ + return __ticket_spin_is_contended(lock); +} +#define __raw_spin_is_contended __raw_spin_is_contended + +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +{ + __ticket_spin_lock(lock); +} + +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +{ + return __ticket_spin_trylock(lock); +} + +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +{ + __ticket_spin_unlock(lock); +} + +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + __ticket_spin_lock_flags(lock, flags); +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +{ + while (__raw_spin_is_locked(lock)) + cpu_relax(); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + * + * On x86, we implement read-write locks as a 32-bit counter + * with the high bit (sign) being the "contended" bit. + */ + +/** + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question. + */ +static inline int __raw_read_can_lock(raw_rwlock_t *lock) +{ + return (int)(lock)->lock > 0; +} + +/** + * write_can_lock - would write_trylock() succeed? + * @lock: the rwlock in question. 
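+ *
+ * Illustrative arithmetic (assuming the usual RW_LOCK_BIAS of
+ * 0x01000000, matching the lock/unlock code below): each reader
+ * subtracts 1, a writer subtracts the whole bias, so
+ *
+ *	lock == RW_LOCK_BIAS		free; write_trylock() succeeds
+ *	0 < lock < RW_LOCK_BIAS		held by readers
+ *	lock <= 0			a writer holds it or is queued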
+ */ +static inline int __raw_write_can_lock(raw_rwlock_t *lock) +{ + return (lock)->lock == RW_LOCK_BIAS; +} + +static inline void __raw_read_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw) : "memory"); +} + +static inline void __raw_write_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); +} + +static inline int __raw_read_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + if (atomic_dec_return(count) >= 0) + return 1; + atomic_inc(count); + return 0; +} + +static inline int __raw_write_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + if (atomic_sub_and_test(RW_LOCK_BIAS, count)) + return 1; + atomic_add(RW_LOCK_BIAS, count); + return 0; +} + +static inline void __raw_read_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); +} + +static inline void __raw_write_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "addl %1, %0" + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); +} + +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + +#define _raw_spin_relax(lock) cpu_relax() +#define _raw_read_relax(lock) cpu_relax() +#define _raw_write_relax(lock) cpu_relax() + +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + +#endif /* _ASM_X86_SPINLOCK_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/swiotlb.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/swiotlb.h @@ -0,0 +1,4 @@ +#include_next + +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, + int dir); --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/synch_bitops.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/synch_bitops.h @@ -0,0 +1,126 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. 
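+ *
+ * A hedged usage sketch (field names illustrative only): these are
+ * meant for bits in memory shared with the hypervisor or with another
+ * domain, e.g.
+ *
+ *	if (synch_test_and_set_bit(port, shared->evtchn_pending))
+ *		return;		(bit was already pending)
+ *
+ * Plain test_and_set_bit() is not a safe substitute here: on !SMP
+ * builds its LOCK prefix is compiled out, which is fine within one
+ * guest but not for memory another domain can touch concurrently.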
+ */ + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include +#endif + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#ifdef CONFIG_X86_64 + case 4: + __asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#else + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#endif + } + return old; +} + +#define synch_test_bit test_bit + +#define synch_cmpxchg_subword synch_cmpxchg + +#endif /* __XEN_SYNCH_BITOPS_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/system.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/system.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_SYSTEM_H +#define _ASM_X86_SYSTEM_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , 
[stack_canary] "=m" (per_cpu_var(stack_canary.canary)) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#else +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + ".globl thread_return\n" \ + "thread_return:\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" 
(offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (per_cpu_var(current_task)) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) +#endif + +#ifdef __KERNEL__ + +extern void xen_load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg, value) \ + asm volatile("\n" \ + "1:\t" \ + "movl %k0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "movl %k1, %%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b,3b) \ + : :"r" (value), "r" (0) : "memory") + + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +static inline void xen_clts(void) +{ + HYPERVISOR_fpu_taskswitch(0); +} + +static inline void xen_stts(void) +{ + HYPERVISOR_fpu_taskswitch(1); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. 
Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long xen_read_cr0(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+	return val;
+}
+
+static inline void xen_write_cr0(unsigned long val)
+{
+	asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+#define xen_read_cr2() vcpu_info_read(arch.cr2)
+#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val)
+
+static inline unsigned long xen_read_cr3(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+#ifdef CONFIG_X86_32
+	return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
+#else
+	return machine_to_phys(val);
+#endif
+}
+
+static inline void xen_write_cr3(unsigned long val)
+{
+#ifdef CONFIG_X86_32
+	val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
+#else
+	val = phys_to_machine(val);
+#endif
+	asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long xen_read_cr4(void)
+{
+	unsigned long val;
+	asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+	return val;
+}
+
+#define xen_read_cr4_safe() xen_read_cr4()
+
+static inline void xen_write_cr4(unsigned long val)
+{
+	asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+	return 0;
+}
+
+static inline void xen_write_cr8(unsigned long val)
+{
+	BUG_ON(val);
+}
+#endif
+
+static inline void xen_wbinvd(void)
+{
+	asm volatile("wbinvd": : :"memory");
+}
+
+#define read_cr0()	(xen_read_cr0())
+#define write_cr0(x)	(xen_write_cr0(x))
+#define read_cr2()	(xen_read_cr2())
+#define write_cr2(x)	(xen_write_cr2(x))
+#define read_cr3()	(xen_read_cr3())
+#define write_cr3(x)	(xen_write_cr3(x))
+#define read_cr4()	(xen_read_cr4())
+#define read_cr4_safe()	(xen_read_cr4_safe())
+#define write_cr4(x)	(xen_write_cr4(x))
+#define wbinvd()	(xen_wbinvd())
+#ifdef CONFIG_X86_64
+#define read_cr8()	(xen_read_cr8())
+#define write_cr8(x)	(xen_write_cr8(x))
+#define load_gs_index	xen_load_gs_index
+#endif
+
+/* Clear the 'TS' bit */
+#define clts()		(xen_clts())
+#define stts()		(xen_stts())
+
+#endif /* __KERNEL__ */
+
+static inline void clflush(volatile void *__p)
+{
+	asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void xen_idle(void);
+
+void stop_this_cpu(void *dummy);
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb()	asm volatile("mfence":::"memory")
+#define rmb()	asm volatile("lfence":::"memory")
+#define wmb()	asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequent reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier.  All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data returned by
+ * any of the preceding reads.  This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies.  See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ *	CPU 0				CPU 1
+ *
+ *	b = 2;
+ *	memory_barrier();
+ *	p = &b;				q = p;
+ *					read_barrier_depends();
+ *					d = *q;
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends().  However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ *	CPU 0				CPU 1
+ *
+ *	a = 2;
+ *	memory_barrier();
+ *	b = 3;				y = b;
+ *					read_barrier_depends();
+ *					x = a;
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends()	do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb()	rmb()
+#else
+# define smp_rmb()	barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb()	wmb()
+#else
+# define smp_wmb()	barrier()
+#endif
+#define smp_read_barrier_depends()	read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#define smp_read_barrier_depends()	do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three-way for this if there was one.)
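+ *
+ * A minimal usage sketch (illustrative only):
+ *
+ *	rdtsc_barrier();
+ *	t1 = get_cycles();
+ *	... section being timed ...
+ *	rdtsc_barrier();
+ *	t2 = get_cycles();
+ *
+ * so neither TSC read can be speculated into, or out of, the timed
+ * region.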
+ */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_SYSTEM_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/system_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/system_64.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_SYSTEM_64_H +#define _ASM_X86_SYSTEM_64_H + +#include +#include + + +static inline unsigned long read_cr8(void) +{ + return 0; +} + +static inline void write_cr8(unsigned long val) +{ + BUG_ON(val); +} + +#include + +#endif /* _ASM_X86_SYSTEM_64_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/tlbflush.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/tlbflush.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_TLBFLUSH_H +#define _ASM_X86_TLBFLUSH_H + +#include +#include + +#include +#include + +#define __flush_tlb() xen_tlb_flush() +#define __flush_tlb_global() xen_tlb_flush() +#define __flush_tlb_single(addr) xen_invlpg(addr) +#define __flush_tlb_all() xen_tlb_flush() +#define __flush_tlb_one(addr) xen_invlpg(addr) + +#ifdef CONFIG_X86_32 +# define TLB_FLUSH_ALL 0xffffffff +#else +# define TLB_FLUSH_ALL -1ULL +#endif + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular flushes are available only on i486 and up. + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. 
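+ *
+ * On this Xen port each of these ends up as a hypercall rather than a
+ * native INVLPG or CR3 write; e.g. the SMP define below expands
+ *
+ *	flush_tlb_page(vma, va)
+ *		-> xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va)
+ *
+ * so the single-page-vs-full-flush tuning discussed above is largely
+ * delegated to the hypervisor.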
+ */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +static inline void reset_lazy_tlbstate(void) +{ +} + +#else /* SMP */ + +#include + +#define local_flush_tlb() __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm)) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm)) +#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#ifndef CONFIG_XEN +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +struct tlb_state { + struct mm_struct *active_mm; + int state; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +static inline void reset_lazy_tlbstate(void) +{ + percpu_write(cpu_tlbstate.state, 0); + percpu_write(cpu_tlbstate.active_mm, &init_mm); +} +#endif + +#endif /* SMP */ + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + +extern void zap_low_mappings(bool early); + +#endif /* _ASM_X86_TLBFLUSH_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/vga.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/vga.h @@ -0,0 +1,20 @@ +/* + * Access to VGA videoram + * + * (c) 1998 Martin Mares + */ + +#ifndef _ASM_X86_VGA_H +#define _ASM_X86_VGA_H + +/* + * On the PC, we can just recalculate addresses and then + * access the videoram directly without any black magic. + */ + +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x) + +#define vga_readb(x) (*(x)) +#define vga_writeb(x, y) (*(y) = (x)) + +#endif /* _ASM_X86_VGA_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/xenoprof.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/xenoprof.h @@ -0,0 +1,48 @@ +/****************************************************************************** + * asm-i386/mach-xen/asm/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef __ASM_XENOPROF_H__ +#define __ASM_XENOPROF_H__ +#ifdef CONFIG_XEN + +struct super_block; +struct dentry; +int xenoprof_create_files(struct super_block * sb, struct dentry * root); +#define HAVE_XENOPROF_CREATE_FILES + +struct xenoprof_init; +void xenoprof_arch_init_counter(struct xenoprof_init *init); +void xenoprof_arch_counter(void); +void xenoprof_arch_start(void); +void xenoprof_arch_stop(void); + +struct xenoprof_arch_shared_buffer { + /* nothing */ +}; +struct xenoprof_shared_buffer; +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); +struct xenoprof_get_buffer; +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); +struct xenoprof_passive; +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); + +#endif /* CONFIG_XEN */ +#endif /* __ASM_XENOPROF_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/xor.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/xor.h @@ -0,0 +1,8 @@ +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include +#elif defined(CONFIG_X86_32) +# include "../../asm/xor_32.h" +#else +# include "xor_64.h" +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/xor_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/xor_64.h @@ -0,0 +1,337 @@ +#ifndef _ASM_X86_XOR_64_H +#define _ASM_X86_XOR_64_H + +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +typedef struct { + unsigned long a, b; +} __attribute__((aligned(16))) xmm_store_t; + +/* Doesn't use gcc to save the XMM registers, because there is no easy way to + tell it to do a clts before the register saving. 
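+
+   Rough usage shape (illustrative only) -- each xor_sse_*() routine
+   below brackets its SSE work with the pair, declaring:
+
+	unsigned long cr0;
+	xmm_store_t xmm_save[4];
+
+	XMMS_SAVE;
+	... xorps-based loop ...
+	XMMS_RESTORE;
+
+   clts() clears CR0.TS first (unless the FPU is already in use) so
+   that touching the XMM registers does not raise a device-not-available
+   fault while preemption is disabled.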
*/ +#define XMMS_SAVE \ +do { \ + preempt_disable(); \ + if (!(current_thread_info()->status & TS_USEDFPU)) \ + clts(); \ + asm volatile( \ + "movups %%xmm0,(%1) ;\n\t" \ + "movups %%xmm1,0x10(%1) ;\n\t" \ + "movups %%xmm2,0x20(%1) ;\n\t" \ + "movups %%xmm3,0x30(%1) ;\n\t" \ + : "=&r" (cr0) \ + : "r" (xmm_save) \ + : "memory"); \ +} while (0) + +#define XMMS_RESTORE \ +do { \ + asm volatile( \ + "sfence ;\n\t" \ + "movups (%1),%%xmm0 ;\n\t" \ + "movups 0x10(%1),%%xmm1 ;\n\t" \ + "movups 0x20(%1),%%xmm2 ;\n\t" \ + "movups 0x30(%1),%%xmm3 ;\n\t" \ + : \ + : "r" (cr0), "r" (xmm_save) \ + : "memory"); \ + if (!(current_thread_info()->status & TS_USEDFPU)) \ + stts(); \ + preempt_enable(); \ +} while (0) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned int lines = bytes >> 8; + unsigned long cr0; + xmm_store_t xmm_save[4]; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " decl %[cnt] ; jnz 1b" + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] "r" (256UL) + : "memory"); + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ 
+ PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] "r" (256UL) + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p5] "+r" (p5) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_sse = { + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_sse); \ +} while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. 
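+
+   For comparison, the generic path would benchmark every template
+   registered through XOR_TRY_TEMPLATES at boot (via xor_speed()) and
+   keep the fastest; defining XOR_SELECT_TEMPLATE below bypasses that
+   measurement and always picks the SSE implementation.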
*/ +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) + +#endif /* _ASM_X86_XOR_64_H */ --- linux-ec2-2.6.32.orig/arch/x86/kernel/Makefile +++ linux-ec2-2.6.32/arch/x86/kernel/Makefile @@ -117,6 +117,8 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_X86_XEN) += fixup.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -131,3 +133,8 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif + +disabled-obj-$(CONFIG_XEN) := %_uv.o crash.o early-quirks.o hpet.o i8253.o \ + i8259.o irqinit.o pci-swiotlb.o reboot.o smpboot.o tsc.o tsc_sync.o \ + uv_%.o vsmp_64.o +disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o --- linux-ec2-2.6.32.orig/arch/x86/kernel/amd_iommu.c +++ linux-ec2-2.6.32/arch/x86/kernel/amd_iommu.c @@ -540,7 +540,7 @@ static void flush_devices_by_domain(struct protection_domain *domain) { struct amd_iommu *iommu; - int i; + unsigned long i; for (i = 0; i <= amd_iommu_last_bdf; ++i) { if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || @@ -1230,9 +1230,10 @@ /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through) { + if (iommu_pass_through && domain != pt_domain) { struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; __attach_device(iommu, pt_domain, devid); } @@ -2047,10 +2048,10 @@ struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - u16 devid; + u16 devid, __devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); + __devid = devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -2065,6 +2066,10 @@ init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(iommu, &dma_dom->domain, devid); + if (__devid != devid) + attach_device(iommu, &dma_dom->domain, __devid); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -2079,6 +2084,11 @@ .dma_supported = amd_iommu_dma_supported, }; +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + /* * The function which clues the AMD IOMMU driver into dma_ops. 
 */
@@ -2120,8 +2130,6 @@
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
 
-	register_iommu(&amd_iommu_ops);
-
 	bus_register_notifier(&pci_bus_type, &device_nb);
 
 	amd_iommu_stats_init();
--- linux-ec2-2.6.32.orig/arch/x86/kernel/amd_iommu_init.c
+++ linux-ec2-2.6.32/arch/x86/kernel/amd_iommu_init.c
@@ -136,6 +136,11 @@
 					   system */
 
 /*
+ * Set to true if ACPI table parsing and hardware initialization went properly
+ */
+static bool amd_iommu_initialized;
+
+/*
  * Pointer to the device table which is shared by all AMD IOMMUs
  * it is indexed by the PCI device id or the HT unit id and contains
  * information about the domain the device belongs to as well as the
@@ -913,6 +918,8 @@
 	}
 	WARN_ON(p != end);
 
+	amd_iommu_initialized = true;
+
 	return 0;
 }
 
@@ -925,7 +932,7 @@
  *
 ****************************************************************************/
 
-static int __init iommu_setup_msi(struct amd_iommu *iommu)
+static int iommu_setup_msi(struct amd_iommu *iommu)
 {
 	int r;
 
@@ -1263,6 +1270,9 @@
 	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
 		goto free;
 
+	if (!amd_iommu_initialized)
+		goto free;
+
 	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
 		goto free;
 
@@ -1278,9 +1288,12 @@
 		ret = amd_iommu_init_passthrough();
 	else
 		ret = amd_iommu_init_dma_ops();
+
 	if (ret)
 		goto free;
 
+	amd_iommu_init_api();
+
 	enable_iommus();
 
 	if (iommu_pass_through)
--- linux-ec2-2.6.32.orig/arch/x86/kernel/asm-offsets_32.c
+++ linux-ec2-2.6.32/arch/x86/kernel/asm-offsets_32.c
@@ -20,10 +20,14 @@
 #include
 #include
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
 #include
+#endif
+#ifdef CONFIG_LGUEST_GUEST
 #include
 #include "../../../drivers/lguest/lg.h"
+#endif
 
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
@@ -55,6 +59,7 @@
 	OFFSET(TI_exec_domain, thread_info, exec_domain);
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
+	OFFSET(TI_cpu, thread_info, cpu);
 	OFFSET(TI_preempt_count, thread_info, preempt_count);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
@@ -93,9 +98,14 @@
 	OFFSET(pbe_orig_address, pbe, orig_address);
 	OFFSET(pbe_next, pbe, next);
 
+#ifndef CONFIG_X86_NO_TSS
 	/* Offset from the sysenter stack to tss.sp0 */
-	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
+	DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
 	       sizeof(struct tss_struct));
+#else
+	/* sysenter stack points directly to sp0 */
+	DEFINE(SYSENTER_stack_sp0, 0);
+#endif
 
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
 	DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
@@ -105,6 +115,11 @@
 
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_START_mfn_list, start_info, mfn_list);
+#endif
+
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
@@ -117,7 +132,7 @@
 	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
 
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
 	BLANK();
 	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
 	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
--- linux-ec2-2.6.32.orig/arch/x86/kernel/asm-offsets_64.c
+++ linux-ec2-2.6.32/arch/x86/kernel/asm-offsets_64.c
@@ -115,8 +115,10 @@
 	ENTRY(cr8);
 	BLANK();
 #undef ENTRY
+#ifndef CONFIG_X86_NO_TSS
 	DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
 	BLANK();
+#endif
 	DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
 	BLANK();
 	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
@@ -130,7 +132,7 @@
 	BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); --- linux-ec2-2.6.32.orig/arch/x86/kernel/cpuid.c +++ linux-ec2-2.6.32/arch/x86/kernel/cpuid.c @@ -192,7 +192,8 @@ int i, err = 0; i = 0; - if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); err = -EBUSY; @@ -221,7 +222,7 @@ } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); out: return err; } @@ -233,7 +234,7 @@ for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } --- linux-ec2-2.6.32.orig/arch/x86/kernel/dumpstack_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/dumpstack_64.c @@ -19,6 +19,7 @@ #include "dumpstack.h" +#ifndef CONFIG_X86_NO_TSS static char x86_stack_ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", @@ -30,15 +31,21 @@ N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; +#endif int x86_is_stack_id(int id, char *name) { +#ifndef CONFIG_X86_NO_TSS return x86_stack_ids[id - 1] == name; +#else + return 0; +#endif } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { +#ifndef CONFIG_X86_NO_TSS unsigned k; /* @@ -98,6 +105,7 @@ } #endif } +#endif /* CONFIG_X86_NO_TSS */ return NULL; } --- linux-ec2-2.6.32.orig/arch/x86/kernel/e820.c +++ linux-ec2-2.6.32/arch/x86/kernel/e820.c @@ -79,7 +79,7 @@ * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(u64 start, u64 end, unsigned type) +int e820_all_mapped(u64 start, u64 end, unsigned type) { int i; @@ -106,6 +106,7 @@ } return 0; } +EXPORT_SYMBOL_GPL(e820_all_mapped); /* * Add a memory region to the kernel e820 map. --- linux-ec2-2.6.32.orig/arch/x86/kernel/entry_32.S +++ linux-ec2-2.6.32/arch/x86/kernel/entry_32.S @@ -393,7 +393,7 @@ CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp + movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* * Interrupts are disabled here, but we can't trace it until @@ -995,7 +995,7 @@ CFI_ENDPROC ENDPROC(kernel_thread_helper) -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. */ ENTRY(xen_sysenter_target) @@ -1088,7 +1088,7 @@ .previous ENDPROC(xen_failsafe_callback) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -1270,7 +1270,7 @@ * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. 
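 *
 * (Three words at 4 bytes each is where the "+12" comes from: the
 * handler must read the saved tss.sp0 from 12 bytes further up the
 * stack than a clean sysenter entry would need.)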
* @@ -1282,7 +1282,7 @@ cmpw $__KERNEL_CS, 4(%esp) jne \ok \label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp + movl SYSENTER_stack_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip pushfl --- linux-ec2-2.6.32.orig/arch/x86/kernel/entry_64.S +++ linux-ec2-2.6.32/arch/x86/kernel/entry_64.S @@ -1267,7 +1267,7 @@ CFI_ENDPROC END(call_softirq) -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* @@ -1364,7 +1364,7 @@ CFI_ENDPROC END(xen_failsafe_callback) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ /* * Some functions should be protected against kprobes @@ -1374,7 +1374,7 @@ paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 errorentry xen_stack_segment do_stack_segment --- linux-ec2-2.6.32.orig/arch/x86/kernel/hpet.c +++ linux-ec2-2.6.32/arch/x86/kernel/hpet.c @@ -33,6 +33,8 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -584,6 +586,9 @@ unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -911,6 +916,9 @@ hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } --- linux-ec2-2.6.32.orig/arch/x86/kernel/init_task.c +++ linux-ec2-2.6.32/arch/x86/kernel/init_task.c @@ -31,6 +31,7 @@ struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); +#ifndef CONFIG_X86_NO_TSS /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned @@ -39,4 +40,4 @@ * on exact cacheline boundaries, to eliminate cacheline ping-pong. 
*/ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/msr.c +++ linux-ec2-2.6.32/arch/x86/kernel/msr.c @@ -251,7 +251,7 @@ int i, err = 0; i = 0; - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { printk(KERN_ERR "msr: unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; @@ -279,7 +279,7 @@ msr_device_destroy(i); class_destroy(msr_class); out_chrdev: - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); out: return err; } @@ -290,7 +290,7 @@ for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } --- linux-ec2-2.6.32.orig/arch/x86/kernel/machine_kexec_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/machine_kexec_32.c @@ -26,47 +26,9 @@ #include #include -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - -static void set_gdt(void *newgdt, __u16 limit) -{ - struct desc_ptr curgdt; - - /* ia32 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - load_gdt(&curgdt); -} - -static void load_segments(void) -{ -#define __STR(X) #X -#define STR(X) __STR(X) - - __asm__ __volatile__ ( - "\tljmp $"STR(__KERNEL_CS)",$1f\n" - "\t1:\n" - "\tmovl $"STR(__KERNEL_DS)",%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss\n" - : : : "eax", "memory"); -#undef STR -#undef __STR -} +#ifdef CONFIG_XEN +#include +#endif static void machine_kexec_free_page_tables(struct kimage *image) { @@ -138,6 +100,51 @@ __pa(control_page), __pa(control_page)); } +#ifdef CONFIG_XEN + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_PGD] = __ma(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +int __init machine_kexec_setup_resources(struct resource *hypervisor, + struct resource *phys_cpus, + int nr_phys_cpus) +{ + int k; + + /* The per-cpu crash note resources belong to the hypervisor resource */ + for (k = 0; k < nr_phys_cpus; k++) + request_resource(hypervisor, phys_cpus + k); + + return 0; +} + +void machine_kexec_register_resources(struct resource *res) { ; } + +#endif /* CONFIG_XEN */ + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -177,6 +184,7 @@ machine_kexec_free_page_tables(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. 
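The cpuid.c and msr.c hunks above replace register_chrdev(), which grabs all 256 minors of a major, with __register_chrdev(), which reserves only the [baseminor, baseminor + count) window the per-CPU device nodes actually need. A hypothetical module (invented major and device name, empty fops) using the narrower registration the same way:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/threads.h>

#define DEMO_MAJOR 240          /* hypothetical major, for illustration only */

static const struct file_operations demo_fops = {
        .owner = THIS_MODULE,
};

static int __init demo_init(void)
{
        /* reserve only minors 0..NR_CPUS-1; the rest stay available */
        if (__register_chrdev(DEMO_MAJOR, 0, NR_CPUS, "cpu/demo", &demo_fops))
                return -EBUSY;
        return 0;
}

static void __exit demo_exit(void)
{
        __unregister_chrdev(DEMO_MAJOR, 0, NR_CPUS, "cpu/demo");
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");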
@@ -228,24 +236,6 @@ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, (unsigned long)page_list, @@ -259,6 +249,7 @@ __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { --- linux-ec2-2.6.32.orig/arch/x86/kernel/machine_kexec_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/machine_kexec_64.c @@ -19,6 +19,112 @@ #include #include +#ifdef CONFIG_XEN + +/* In the case of Xen, override hypervisor functions to be able to create + * a regular identity mapping page table... + */ + +#include +#include + +#define x__pmd(x) ((pmd_t) { (x) } ) +#define x__pud(x) ((pud_t) { (x) } ) +#define x__pgd(x) ((pgd_t) { (x) } ) + +#define x_pmd_val(x) ((x).pmd) +#define x_pud_val(x) ((x).pud) +#define x_pgd_val(x) ((x).pgd) + +static inline void x_set_pmd(pmd_t *dst, pmd_t val) +{ + x_pmd_val(*dst) = x_pmd_val(val); +} + +static inline void x_set_pud(pud_t *dst, pud_t val) +{ + x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); +} + +static inline void x_pud_clear (pud_t *pud) +{ + x_pud_val(*pud) = 0; +} + +static inline void x_set_pgd(pgd_t *dst, pgd_t val) +{ + x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); +} + +static inline void x_pgd_clear (pgd_t * pgd) +{ + x_pgd_val(*pgd) = 0; +} + +#define X__PAGE_KERNEL_LARGE_EXEC \ + _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE +#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + void *table_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + table_page = page_address(image->control_code_page); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_TABLE_PAGE] = __ma(table_page); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +int __init machine_kexec_setup_resources(struct resource *hypervisor, + struct resource *phys_cpus, + int nr_phys_cpus) +{ + int k; + + /* The per-cpu crash note resources belong to the hypervisor resource */ + for (k = 0; k < nr_phys_cpus; k++) + request_resource(hypervisor, phys_cpus + k); + + return 0; +} + +#else /* CONFIG_XEN */ + +#define x__pmd(x) __pmd(x) +#define x__pud(x) __pud(x) +#define x__pgd(x) __pgd(x) + +#define x_set_pmd(x, y) set_pmd(x, y) +#define x_set_pud(x, y) set_pud(x, y) +#define x_set_pgd(x, y) set_pgd(x, y) + +#define 
x_pud_clear(x) pud_clear(x) +#define x_pgd_clear(x) pgd_clear(x) + +#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#define X_KERNPG_TABLE _KERNPG_TABLE + +#endif /* CONFIG_XEN */ + static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) { @@ -48,7 +154,7 @@ } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); result = 0; out: return result; @@ -61,7 +167,7 @@ addr &= PAGE_MASK; end_addr = addr + PUD_SIZE; while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); addr += PMD_SIZE; } } @@ -86,12 +192,12 @@ } level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pud_clear(level3p++); + x_pud_clear(level3p++); addr += PUD_SIZE; } out: @@ -121,12 +227,12 @@ result = init_level3_page(image, level3p, addr, last_addr); if (result) goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE)); addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pgd_clear(level4p++); + x_pgd_clear(level4p++); addr += PGDIR_SIZE; } out: @@ -187,8 +293,14 @@ { pgd_t *level4p; int result; + unsigned long x_max_pfn = max_pfn; + +#ifdef CONFIG_XEN + x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); +#endif + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT); if (result) return result; /* @@ -201,47 +313,6 @@ return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl %0,%%ds\n" - "\tmovl %0,%%es\n" - "\tmovl %0,%%ss\n" - "\tmovl %0,%%fs\n" - "\tmovl %0,%%gs\n" - : : "a" (__KERNEL_DS) : "memory" - ); -} - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -263,6 +334,7 @@ free_transition_pgtable(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -308,24 +380,6 @@ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. 
- */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, @@ -339,10 +393,13 @@ __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ VMCOREINFO_SYMBOL(phys_base); +#endif VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA --- linux-ec2-2.6.32.orig/arch/x86/kernel/mpparse.c +++ linux-ec2-2.6.32/arch/x86/kernel/mpparse.c @@ -359,13 +359,6 @@ x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; --- linux-ec2-2.6.32.orig/arch/x86/kernel/mmconf-fam10h_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/mmconf-fam10h_64.c @@ -218,6 +218,16 @@ val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | FAM10H_MMIO_CONF_ENABLE; wrmsrl(address, val); + +#ifdef CONFIG_XEN + { + u64 val2; + + rdmsrl(address, val2); + if (val2 != val) + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; + } +#endif } static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) --- linux-ec2-2.6.32.orig/arch/x86/kernel/paravirt.c +++ linux-ec2-2.6.32/arch/x86/kernel/paravirt.c @@ -345,6 +345,9 @@ .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = native_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = native_load_gdt, .load_idt = native_load_idt, .store_gdt = native_store_gdt, --- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-calgary_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/pci-calgary_64.c @@ -318,13 +318,15 @@ pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? 
Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); --- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-dma.c +++ linux-ec2-2.6.32/arch/x86/kernel/pci-dma.c @@ -214,7 +214,7 @@ if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; --- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-gart_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/pci-gart_64.c @@ -856,7 +856,7 @@ #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; --- linux-ec2-2.6.32.orig/arch/x86/kernel/process.c +++ linux-ec2-2.6.32/arch/x86/kernel/process.c @@ -91,18 +91,6 @@ { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); tsk->thread.debugreg0 = 0; @@ -451,21 +439,37 @@ } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. + * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; @@ -607,6 +611,16 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) { unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + unsigned long bump = 0; +#ifdef CONFIG_X86_32 + /* in the case of NX emulation, shove the brk segment way out of the + way of the exec randomization area, since it can collide with + future allocations if not. */ + if ( (mm->get_unmapped_exec_area == arch_get_unmapped_exec_area) && + (mm->brk < 0x08000000) ) { + bump = (TASK_SIZE/6); + } +#endif + return bump + (randomize_range(mm->brk, range_end, 0) ? : mm->brk); } --- linux-ec2-2.6.32.orig/arch/x86/kernel/pcspeaker.c +++ linux-ec2-2.6.32/arch/x86/kernel/pcspeaker.c @@ -6,6 +6,11 @@ { struct platform_device *pd; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return 0; +#endif + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); return IS_ERR(pd) ? 
PTR_ERR(pd) : 0; --- linux-ec2-2.6.32.orig/arch/x86/kernel/probe_roms_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/probe_roms_32.c @@ -131,7 +131,7 @@ upper = system_rom_resource.start; /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start); if (romsignature(rom)) { length = extension_rom_resource.end - extension_rom_resource.start + 1; if (romchecksum(rom, length)) { --- linux-ec2-2.6.32.orig/arch/x86/kernel/process_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/process_32.c @@ -296,6 +296,8 @@ void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { + int cpu; + set_user_gs(regs, 0); regs->fs = 0; set_fs(USER_DS); @@ -305,6 +307,11 @@ regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; + + cpu = get_cpu(); + load_user_cs_desc(cpu, current->mm); + put_cpu(); + /* * Free the old FP and other extended state */ @@ -359,6 +366,8 @@ preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; __unlazy_fpu(prev_p); + if (next_p->mm) + load_user_cs_desc(cpu, next_p->mm); /* we're going to use this soon, after a few expensive things */ if (preload_fpu) @@ -497,3 +506,40 @@ return 0; } +static void modify_cs(struct mm_struct *mm, unsigned long limit) +{ + mm->context.exec_limit = limit; + set_user_cs(&mm->context.user_cs, limit); + if (mm == current->mm) { + int cpu; + + cpu = get_cpu(); + load_user_cs_desc(cpu, mm); + put_cpu(); + } +} + +void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) +{ + if (limit > mm->context.exec_limit) + modify_cs(mm, limit); +} + +void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) +{ + struct vm_area_struct *vma; + unsigned long limit = PAGE_SIZE; + + if (old_end == mm->context.exec_limit) { + for (vma = mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + modify_cs(mm, limit); + } +} + +void arch_flush_exec_range(struct mm_struct *mm) +{ + mm->context.exec_limit = 0; + set_user_cs(&mm->context.user_cs, 0); +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/process_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/process_64.c @@ -540,6 +540,18 @@ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + unsigned long get_wchan(struct task_struct *p) { unsigned long stack; --- linux-ec2-2.6.32.orig/arch/x86/kernel/ptrace.c +++ linux-ec2-2.6.32/arch/x86/kernel/ptrace.c @@ -408,14 +408,14 @@ { if (kbuf) { unsigned long *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { *k++ = getreg(target, pos); count -= sizeof(*k); pos += sizeof(*k); } } else { unsigned long __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { if (__put_user(getreg(target, pos), u++)) return -EFAULT; count -= sizeof(*u); @@ -434,14 +434,14 @@ int ret = 0; if (kbuf) { const unsigned long *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) @@ -1219,14 +1219,14 
@@ { if (kbuf) { compat_ulong_t *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { getreg32(target, pos, k++); count -= sizeof(*k); pos += sizeof(*k); } } else { compat_ulong_t __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { compat_ulong_t word; getreg32(target, pos, &word); if (__put_user(word, u++)) @@ -1247,14 +1247,14 @@ int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) --- linux-ec2-2.6.32.orig/arch/x86/kernel/quirks.c +++ linux-ec2-2.6.32/arch/x86/kernel/quirks.c @@ -491,6 +491,19 @@ break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) --- linux-ec2-2.6.32.orig/arch/x86/kernel/reboot.c +++ linux-ec2-2.6.32/arch/x86/kernel/reboot.c @@ -203,6 +203,15 @@ DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", @@ -259,6 +268,14 @@ DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; @@ -444,6 +461,14 @@ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, { } }; --- linux-ec2-2.6.32.orig/arch/x86/kernel/setup.c +++ linux-ec2-2.6.32/arch/x86/kernel/setup.c @@ -109,6 +109,7 @@ #ifdef CONFIG_X86_64 #include #endif +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -666,19 +667,27 @@ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. 
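The ptrace.c loops above used to run while count > 0, so a byte count that was not a multiple of the word size copied a trailing partial word beyond the caller's request; requiring count >= sizeof(*k) leaves a short tail uncopied instead. A userspace model of the fixed loop (the register file and the getreg() stand-in are invented):

#include <stdio.h>
#include <string.h>

static unsigned long regs[17];                  /* pretend register file */

static unsigned long getreg(unsigned int pos)
{
        return regs[pos / sizeof(unsigned long)];
}

static void copy_regs(unsigned long *k, unsigned int pos, unsigned int count)
{
        while (count >= sizeof(*k)) {           /* was: count > 0 */
                *k++ = getreg(pos);
                count -= sizeof(*k);
                pos += sizeof(*k);
        }                                       /* leftover bytes ignored */
}

int main(void)
{
        unsigned long out[17];

        memset(regs, 0xab, sizeof(regs));
        copy_regs(out, 0, sizeof(out) - 3);     /* ragged tail: no overrun */
        printf("done\n");
        return 0;
}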
*/ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; @@ -1031,6 +1040,8 @@ #endif #endif x86_init.oem.banner(); + + mcheck_intel_therm_init(); } #ifdef CONFIG_X86_32 --- linux-ec2-2.6.32.orig/arch/x86/kernel/relocate_kernel_32.S +++ linux-ec2-2.6.32/arch/x86/kernel/relocate_kernel_32.S @@ -87,14 +87,32 @@ movl PTR(PA_PGD)(%ebp), %eax movl %eax, %cr3 + /* setup idt */ + lidtl idt_48 - relocate_kernel(%edi) + + /* setup gdt */ + leal gdt - relocate_kernel(%edi), %eax + movl %eax, (gdt_48 - relocate_kernel) + 2(%edi) + lgdtl gdt_48 - relocate_kernel(%edi) + + /* setup data segment registers */ + mov $(gdt_ds - gdt), %eax + mov %eax, %ds + mov %eax, %es + mov %eax, %fs + mov %eax, %gs + mov %eax, %ss + /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%edi), %esp - /* jump to identity mapped page */ + /* load new code segment and jump to identity mapped page */ + pushl $0 + pushl $(gdt_cs - gdt) movl %edi, %eax addl $(identity_mapped - relocate_kernel), %eax pushl %eax - ret + iretl identity_mapped: /* store the start address on the stack */ @@ -271,5 +289,22 @@ popl %ebp ret + .align 16 +gdt: + .quad 0x0000000000000000 /* NULL descriptor */ +gdt_cs: + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ +gdt_ds: + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +gdt_end: + +gdt_48: + .word gdt_end - gdt - 1 /* limit */ + .long 0 /* base - filled in by code above */ + +idt_48: + .word 0 /* limit */ + .long 0 /* base */ + .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel --- linux-ec2-2.6.32.orig/arch/x86/kernel/relocate_kernel_64.S +++ linux-ec2-2.6.32/arch/x86/kernel/relocate_kernel_64.S @@ -91,13 +91,30 @@ /* Switch to the identity mapped page tables */ movq %r9, %cr3 + /* setup idt */ + lidtq idt_80 - relocate_kernel(%r8) + + /* setup gdt */ + leaq gdt - relocate_kernel(%r8), %rax + movq %rax, (gdt_80 - relocate_kernel) + 2(%r8) + lgdtq gdt_80 - relocate_kernel(%r8) + + /* setup data segment registers */ + xorl %eax, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp - /* jump to identity mapped page */ + /* load new code segment and jump to identity mapped page */ addq $(identity_mapped - relocate_kernel), %r8 + pushq $(gdt_cs - gdt) pushq %r8 - ret + lretq identity_mapped: /* store the start address on the stack */ @@ -262,5 +279,20 @@ 3: ret + .align 16 +gdt: + .quad 0x0000000000000000 /* NULL descriptor */ +gdt_cs: + .quad 0x00af9a000000ffff +gdt_end: + +gdt_80: + .word gdt_end - gdt - 1 /* limit */ + .quad 0 /* base - filled in by code above */ + +idt_80: + .word 0 /* limit */ + .quad 0 /* base */ + .globl kexec_control_code_size .set kexec_control_code_size, . 
- relocate_kernel --- linux-ec2-2.6.32.orig/arch/x86/kernel/rtc.c +++ linux-ec2-2.6.32/arch/x86/kernel/rtc.c @@ -171,6 +171,11 @@ unsigned long flags; int retval; +#ifdef CONFIG_XEN + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock()) + return 0; +#endif + spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.set_wallclock(now.tv_sec); spin_unlock_irqrestore(&rtc_lock, flags); @@ -183,6 +188,12 @@ { unsigned long retval, flags; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) { + xen_read_persistent_clock(ts); + return; + } +#endif spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); --- linux-ec2-2.6.32.orig/arch/x86/kernel/setup_percpu.c +++ linux-ec2-2.6.32/arch/x86/kernel/setup_percpu.c @@ -224,7 +224,7 @@ * are zeroed indicating that the static arrays are * gone. */ -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = @@ -248,7 +248,7 @@ } /* indicate the early static arrays will soon be gone */ -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/traps.c +++ linux-ec2-2.6.32/arch/x86/kernel/traps.c @@ -115,6 +115,67 @@ if (!user_mode_vm(regs)) die(str, regs, err); } + +static inline int +__compare_user_cs_desc(const struct desc_struct *desc1, + const struct desc_struct *desc2) +{ + return ((desc1->limit0 != desc2->limit0) || + (desc1->limit != desc2->limit) || + (desc1->base0 != desc2->base0) || + (desc1->base1 != desc2->base1) || + (desc1->base2 != desc2->base2)); +} + +/* + * lazy-check for CS validity on exec-shield binaries: + * + * the original non-exec stack patch was written by + * Solar Designer . Thanks! + */ +static int +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) +{ + struct desc_struct *desc1, *desc2; + struct vm_area_struct *vma; + unsigned long limit; + + if (current->mm == NULL) + return 0; + + limit = -1UL; + if (current->mm->context.exec_limit != -1UL) { + limit = PAGE_SIZE; + spin_lock(¤t->mm->page_table_lock); + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + vma = get_gate_vma(current); + if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + spin_unlock(¤t->mm->page_table_lock); + if (limit >= TASK_SIZE) + limit = -1UL; + current->mm->context.exec_limit = limit; + } + set_user_cs(¤t->mm->context.user_cs, limit); + + desc1 = ¤t->mm->context.user_cs; + desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; + + if (__compare_user_cs_desc(desc1, desc2)) { + /* + * The CS was not in sync - reload it and retry the + * instruction. If the instruction still faults then + * we won't hit this branch next time around. 
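check_lazy_exec_limit() above maintains a per-mm user code segment whose limit tracks the highest executable mapping, reloading the GDT entry when the cached descriptor goes stale. The heart of it is how the exec-shield set_user_cs() helper packs a byte limit, in 4 KiB pages, into the two words of a descriptor; a standalone sketch of that packing (0x00c0fb00 is the usual flat DPL-3 code-segment attribute word, the sample limit is arbitrary):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

static void encode_user_cs(unsigned long limit, uint32_t *lo, uint32_t *hi)
{
        unsigned long pages = (limit - 1) / PAGE_SIZE;

        *lo = pages & 0xffff;                   /* limit bits 15..0, base 0 */
        *hi = (pages & 0xf0000) | 0x00c0fb00;   /* limit 19..16 | G/D/P/DPL3 */
}

int main(void)
{
        uint32_t lo, hi;

        encode_user_cs(0x08049000UL, &lo, &hi);
        printf("descriptor words: %08x %08x\n", hi, lo);
        return 0;
}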
+ */ + load_user_cs_desc(cpu, current->mm); + + return 1; + } + + return 0; +} #endif static void __kprobes @@ -273,6 +334,20 @@ if (!user_mode(regs)) goto gp_in_kernel; +#ifdef CONFIG_X86_32 +{ + int cpu; + int ok; + + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + + if (ok) + return; +} +#endif + tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; @@ -881,11 +956,29 @@ } #ifdef CONFIG_X86_32 +/* + * The fixup code for errors in iret jumps to here (iret_exc). It loses + * the original trap number and error code. The bogus trap 32 and error + * code 0 are what the vanilla kernel delivers via: + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) + * + * NOTE: Because of the final "1" in the macro we need to enable interrupts. + * + * In case of a general protection fault in the iret instruction, we + * need to check for a lazy CS update for exec-shield. + */ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; + int ok; + int cpu; local_irq_enable(); + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + if (ok) return; + info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_BADSTK; --- linux-ec2-2.6.32.orig/arch/x86/kernel/smpboot.c +++ linux-ec2-2.6.32/arch/x86/kernel/smpboot.c @@ -1066,9 +1066,7 @@ set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); --- linux-ec2-2.6.32.orig/arch/x86/kernel/sys_i386_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/sys_i386_32.c @@ -24,31 +24,6 @@ #include -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls.
Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; --- linux-ec2-2.6.32.orig/arch/x86/kernel/sys_x86_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/sys_x86_64.c @@ -23,26 +23,11 @@ unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } --- linux-ec2-2.6.32.orig/arch/x86/kernel/syscall_table_32.S +++ linux-ec2-2.6.32/arch/x86/kernel/syscall_table_32.S @@ -191,7 +191,7 @@ .long sys_ni_syscall /* reserved for streams2 */ .long ptregs_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ --- linux-ec2-2.6.32.orig/arch/x86/kernel/tlb_uv.c +++ linux-ec2-2.6.32/arch/x86/kernel/tlb_uv.c @@ -817,10 +817,8 @@ */ apicid = blade_to_first_apicid(blade); pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, ((apicid << 32) | UV_BAU_MESSAGE)); - } return 0; } --- linux-ec2-2.6.32.orig/arch/x86/kernel/tsc.c +++ linux-ec2-2.6.32/arch/x86/kernel/tsc.c @@ -763,6 +763,7 @@ { if (!tsc_unstable) { tsc_unstable = 1; + sched_clock_stable = 0; printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) --- linux-ec2-2.6.32.orig/arch/x86/kernel/vm86_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/vm86_32.c @@ -125,7 +125,9 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) { +#ifndef CONFIG_X86_NO_TSS struct tss_struct *tss; +#endif struct pt_regs *ret; unsigned long tmp; @@ -148,12 +150,16 @@ do_exit(SIGSEGV); } +#ifndef CONFIG_X86_NO_TSS tss = &per_cpu(init_tss, get_cpu()); +#endif current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, ¤t->thread); current->thread.saved_sp0 = 0; +#ifndef CONFIG_X86_NO_TSS put_cpu(); +#endif ret = KVM86->regs32; @@ -280,7 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { +#ifndef CONFIG_X86_NO_TSS struct tss_struct *tss; +#endif /* * make sure the vm86() system call doesn't try to do anything silly */ @@ -324,12 +332,16 @@ tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); +#ifndef CONFIG_X86_NO_TSS tss = &per_cpu(init_tss, get_cpu()); +#endif tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); +#ifndef CONFIG_X86_NO_TSS put_cpu(); +#endif tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) --- linux-ec2-2.6.32.orig/arch/x86/kernel/vmlinux.lds.S +++ linux-ec2-2.6.32/arch/x86/kernel/vmlinux.lds.S @@ -16,8 +16,10 @@ #ifdef CONFIG_X86_32 #define LOAD_OFFSET __PAGE_OFFSET -#else +#elif !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002 
#define LOAD_OFFSET __START_KERNEL_map +#else +#define LOAD_OFFSET 0 #endif #include @@ -58,6 +60,10 @@ { #ifdef CONFIG_X86_32 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; +#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002 +#undef LOAD_OFFSET +#define LOAD_OFFSET 0 +#endif phys_startup_32 = startup_32 - LOAD_OFFSET; #else . = __START_KERNEL; @@ -308,8 +314,10 @@ /* * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: */ +#if !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002 . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +#endif #else /* * Per-cpu symbols which need to be offset from __per_cpu_load --- linux-ec2-2.6.32.orig/arch/x86/kernel/e820-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/e820-xen.c @@ -0,0 +1,1642 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * Venkatesh Pallipadi + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ +struct e820map e820; +#ifndef CONFIG_XEN +struct e820map e820_saved; +#else +static struct e820map machine_e820; +#define e820_saved machine_e820 +#endif + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range is mapped with type. 
+ * + * Note: this function only works correctly if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region? */ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) +{ + int x = e820x->nr_map; + + if (x >= ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); +} + +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "(usable)"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "(unusable)"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } +} + +static void __init _e820_print_map(const struct e820map *e820, const char *who) +{ + int i; + + for (i = 0; i < e820->nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820->map[i].addr, + (unsigned long long) + (e820->map[i].addr + e820->map[i].size)); + e820_print_type(e820->map[i].type); + printk(KERN_CONT "\n"); + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. + * + * The input parameter biosmap points to an array of 'struct + * e820entry' which on entry has elements in the range [0, *pnr_map) + * valid, and which has space for up to max_nr_map entries. + * On return, the resulting sanitized e820 map entries will be + * overwritten in the same location, starting at biosmap. + * + * The integer pointed to by pnr_map must be valid on entry (the + * current number of valid entries located at biosmap) and will + * be updated on return, with the new number of valid entries + * (something no more than max_nr_map.) + * + * The return value from sanitize_e820_map() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.)
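e820_all_mapped() above depends on the map being sorted and non-overlapping: start is pushed forward through every matching region, and the query range counts as fully mapped once start reaches end. A self-contained model of that walk over an invented two-entry map:

#include <stdio.h>
#include <stdint.h>

struct region { uint64_t addr; uint64_t size; int type; };

static int all_mapped(const struct region *map, int n,
                      uint64_t start, uint64_t end, int type)
{
        int i;

        for (i = 0; i < n; i++) {
                if (map[i].type != type)
                        continue;
                if (map[i].addr >= end || map[i].addr + map[i].size <= start)
                        continue;               /* no overlap with the query */
                if (map[i].addr <= start)
                        start = map[i].addr + map[i].size;
                if (start >= end)
                        return 1;               /* covered all the way */
        }
        return 0;
}

int main(void)
{
        const struct region map[] = {           /* note the hole below 1 MiB */
                { 0x00000000, 0x0009f000, 1 },
                { 0x00100000, 0x3ff00000, 1 },
        };

        printf("%d\n", all_mapped(map, 2, 0x00100000, 0x00200000, 1)); /* 1 */
        printf("%d\n", all_mapped(map, 2, 0x0009f000, 0x00101000, 1)); /* 0 */
        return 0;
}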
+ * + * Visually we're performing the following + * (1,2,3,4 = memory types)... + * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ + */ + +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, + u32 *pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820_X_MAX] __initdata; + static struct change_member *change_point[2*E820_X_MAX] __initdata; + static struct e820entry *overlap_list[E820_X_MAX] __initdata; + static struct e820entry new_bios[E820_X_MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* if there's only one memory region, don't bother */ +#ifdef CONFIG_XEN + if (*pnr_map == 1) + return 0; +#endif + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + BUG_ON(old_nr > max_nr_map); + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing = 1; + } + } + } + + /* create a new bios memory map, removing 
overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + + /* loop through change-points, determining effect on the new bios map */ + for (chgidx = 0; chgidx < chg_nr; chgidx++) { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* + * continue building up new bios map based on this + * information + */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* + * move forward only if the new size + * was non-zero + */ + if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ + if (++new_bios_entry >= max_nr_map) + break; + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr = change_point[chgidx]->addr; + } + last_type = current_type; + } + } + /* retain count for new bios entries */ + new_nr = new_bios_entry; + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +{ + while (nr_map) { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + e820_add_region(start, size, type); + + biosmap++; + nr_map--; + } + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +{ +#ifndef CONFIG_XEN + /* Only one memory region (or negative)?
Ignore it */ + if (nr_map < 2) + return -1; +#else + BUG_ON(nr_map < 1); +#endif + + return __append_e820_map(biosmap, nr_map); +} + +static u64 __init __e820_update_range(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) +{ + u64 end; + unsigned int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT " ==> "); + e820_print_type(new_type); + printk(KERN_CONT "\n"); + + for (i = 0; i < e820x->nr_map; i++) { + struct e820entry *ei = &e820x->map[i]; + u64 final_start, final_end; + u64 ei_end; + + if (ei->type != old_type) + continue; + + ei_end = ei->addr + ei->size; + /* totally covered by new range? */ + if (ei->addr >= start && ei_end <= end) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); + + real_updated_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_updated_size; +} + +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return __e820_update_range(&e820, start, size, old_type, new_type); +} + +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ +#ifdef CONFIG_XEN + if (is_initial_xendomain()) + return __e820_update_range(&machine_e820, + phys_to_machine(start), size, + old_type, new_type); +#endif + return __e820_update_range(&e820_saved, start, size, old_type, + new_type); +} + +/* make e820 not cover the range */ +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype) +{ + int i; + u64 real_removed_size = 0; + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + + if (checktype && ei->type != old_type) + continue; + /* totally covered? 
*/ + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { + real_removed_size += ei->size; + memset(ei, 0, sizeof(struct e820entry)); + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + real_removed_size += final_end - final_start; + + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_removed_size; +} + +void __init update_e820(void) +{ + u32 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + _e820_print_map(&e820, "modified"); +} +static void __init update_e820_saved(void) +{ + u32 nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + +#define MAX_GAP_END 0x100000000ull +/* + * Search for a gap in the e820 memory space from start_addr to end_addr. + */ +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, + unsigned long start_addr, unsigned long long end_addr) +{ + unsigned long long last; + int i = e820.nr_map; + int found = 0; + + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; +#ifdef CONFIG_X86_64 + if (start_addr >= MAX_GAP_END) + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits); +#endif + + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + if (end < start_addr) + continue; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap >= *gapsize) { + *gapsize = gap; + *gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + return found; +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices. + * Hopefully the BIOS left enough space. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize; + int found; + + gapstart = 0x10000000; + gapsize = 0x400000; + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); + +#ifdef CONFIG_X86_64 + if (!found) { + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0); + WARN_ON(!found); + } +#endif + + /* + * e820_reserve_resources_late already protects stolen RAM + */ + pci_mem_start = gapstart; + + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} + +#undef e820 + +#ifndef CONFIG_XEN +/** + * Because of the size limitation of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via + * boot_params.e820_map; the others are passed via the SETUP_E820_EXT node of + * a linked list of struct setup_data, which is parsed here.
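e820_search_gap() above walks the sorted map from the top entry down, remembering the lowest region start seen so far and recording the largest hole underneath it; e820_setup_gap() then publishes that hole as pci_mem_start for MMIO assignment. A simplified standalone model (sample map invented, the start_addr/end_addr filtering omitted):

#include <stdio.h>
#include <stdint.h>

struct region { uint64_t addr; uint64_t size; };

#define MAX_GAP_END 0x100000000ULL

static int search_gap(const struct region *map, int n,
                      uint64_t *gapstart, uint64_t *gapsize)
{
        uint64_t last = MAX_GAP_END;
        int i = n, found = 0;

        while (--i >= 0) {                      /* highest entry first */
                uint64_t start = map[i].addr;
                uint64_t end = start + map[i].size;

                if (last > end && last - end >= *gapsize) {
                        *gapsize = last - end;  /* biggest hole so far */
                        *gapstart = end;
                        found = 1;
                }
                if (start < last)
                        last = start;
        }
        return found;
}

int main(void)
{
        const struct region map[] = {           /* sorted by address */
                { 0x00000000, 0xa0000000 },     /* 2.5 GiB of RAM */
                { 0xfec00000, 0x01400000 },     /* high firmware/MMIO */
        };
        uint64_t gapstart = 0x10000000, gapsize = 0x400000;

        if (search_gap(map, 2, &gapstart, &gapsize))
                printf("gap at %#llx, size %#llx\n",
                       (unsigned long long)gapstart,
                       (unsigned long long)gapsize);
        return 0;
}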
+ */ +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) +{ + u32 map_len; + int entries; + struct e820entry *extmap; + + entries = sdata->len / sizeof(struct e820entry); + map_len = sdata->len + sizeof(struct setup_data); + if (map_len > PAGE_SIZE) + sdata = early_ioremap(pa_data, map_len); + extmap = (struct e820entry *)(sdata->data); + __append_e820_map(extmap, entries); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + if (map_len > PAGE_SIZE) + early_iounmap(sdata, map_len); + printk(KERN_INFO "extended physical RAM map:\n"); + _e820_print_map(&e820, "extended"); +} + +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +/** + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} +#endif + +#ifdef CONFIG_HIBERNATION +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + hibernate_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif +#endif + +/* + * Early reserved memory areas. + */ +#define MAX_EARLY_RES 20 + +struct early_res { + u64 start, end; + char name[16]; + char overlap_ok; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { +#ifndef CONFIG_XEN + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ +#endif + {} +}; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. 
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+	int i;
+	struct early_res *r;
+	u64 lower_start, lower_end;
+	u64 upper_start, upper_end;
+	char name[16];
+
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+
+		/* Continue past non-overlapping ranges */
+		if (end <= r->start || start >= r->end)
+			continue;
+
+		/*
+		 * Leave non-ok overlaps as is; let the caller
+		 * panic "Overlapping early reservations"
+		 * when it hits this overlap.
+		 */
+		if (!r->overlap_ok)
+			return;
+
+		/*
+		 * We have an ok overlap. We will drop it from the early
+		 * reservation map, and add back in any non-overlapping
+		 * portions (lower or upper) as separate, overlap_ok,
+		 * non-overlapping ranges.
+		 */
+
+		/* 1. Note any non-overlapping (lower or upper) ranges. */
+		strncpy(name, r->name, sizeof(name) - 1);
+
+		lower_start = lower_end = 0;
+		upper_start = upper_end = 0;
+		if (r->start < start) {
+			lower_start = r->start;
+			lower_end = start;
+		}
+		if (r->end > end) {
+			upper_start = end;
+			upper_end = r->end;
+		}
+
+		/* 2. Drop the original ok overlapping range */
+		drop_range(i);
+
+		i--;	/* resume for-loop on copied down entry */
+
+		/* 3. Add back in any non-overlapping ranges. */
+		if (lower_end)
+			reserve_early_overlap_ok(lower_start, lower_end, name);
+		if (upper_end)
+			reserve_early_overlap_ok(upper_start, upper_end, name);
+	}
+}
+
+static void __init __reserve_early(u64 start, u64 end, char *name,
+						int overlap_ok)
+{
+	int i;
+	struct early_res *r;
+
+	i = find_overlapped_early(start, end);
+	if (i >= MAX_EARLY_RES)
+		panic("Too many early reservations");
+	r = &early_res[i];
+	if (r->end)
+		panic("Overlapping early reservations "
+		      "%llx-%llx %s to %llx-%llx %s\n",
+		      start, end - 1, name?name:"", r->start,
+		      r->end - 1, r->name);
+	r->start = start;
+	r->end = end;
+	r->overlap_ok = overlap_ok;
+	if (name)
+		strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+/*
+ * A few early reservations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first. We don't have any such as of this
+ * writing, but might as well tolerate such if it happens in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+	drop_overlaps_that_are_ok(start, end);
+	__reserve_early(start, end, name, 1);
+}
+
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panicking on an overlapping overlap_ok
+ * early reservation.
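+ *
+ * Typical usage sketch (the range and label here are hypothetical):
+ *
+ *	reserve_early(table_pa, table_pa + table_len, "ACPI TABLE");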
+ */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= MAX_EARLY_RES || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + + count = 0; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) + count++; + + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count, start, end); + for (i = 0; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } +} + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < MAX_EARLY_RES && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. 
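+ *
+ * Returns -1ULL when nothing fits, so a caller looks roughly like
+ * this (bounds, length and label are illustrative):
+ *
+ *	u64 len = 0x2000, addr;
+ *
+ *	addr = find_e820_area(0x10000, 0x100000, len, PAGE_SIZE);
+ *	if (addr == -1ULL)
+ *		panic("cannot find a free area");
+ *	reserve_early(addr, addr + len, "EXAMPLE");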
+ */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + continue; + if (last > end) + continue; + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + + return -1ULL; +} + +/* + * pre allocated 4k and reserved it in e820 + */ +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +{ + u64 size = 0; + u64 addr; + u64 start; +#ifdef CONFIG_XEN + unsigned int order = get_order(sizet); + + if (is_initial_xendomain()) { + sizet = PAGE_SIZE << order; + if (align < PAGE_SIZE) + align = PAGE_SIZE; + } +#endif + for (start = startt; ; start += size) { + start = find_e820_area_size(start, &size, align); + if (!(start + 1)) + return 0; + if (size >= sizet) + break; + } + +#ifdef CONFIG_X86_32 + if (start >= MAXMEM) + return 0; + if (start + size > MAXMEM) + size = MAXMEM - start; +#endif +#ifdef CONFIG_XEN + if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages) + return 0; + if (PFN_UP(start + size) > xen_start_info->nr_pages) + size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start; +#endif + + addr = round_down(start + size - sizet, align); + if (addr < start) + return 0; +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + int rc; + unsigned long max_initmap_pfn; + + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base)) + + xen_start_info->nr_pt_frames + + 1 + (1 << (19 - PAGE_SHIFT)), + 1UL << (22 - PAGE_SHIFT)); +#ifdef CONFIG_X86_32 + if ((addr >> PAGE_SHIFT) + < max(max_initmap_pfn, max_pfn_mapped)) + rc = xen_create_contiguous_region((unsigned long) + __va(addr), + order, 32); +#else + if ((addr >> PAGE_SHIFT) < max_pfn_mapped) + rc = xen_create_contiguous_region((unsigned long) + __va(addr), + order, 32); + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn) + rc = xen_create_contiguous_region(__START_KERNEL_map + + addr, + order, 32); +#endif + else + rc = early_create_contiguous_region(addr >> PAGE_SHIFT, + order, 32); + if (rc) + return 0; + } +#endif + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820 for early_reserve_e820\n"); + update_e820(); + update_e820_saved(); + + return addr; +} + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT +#endif + +/* + * Find the highest page frame number we have available + */ +static unsigned long __init 
e820_end_pfn(unsigned long limit_pfn, unsigned type) +{ + int i; + unsigned long last_pfn = 0; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (ei->type != type) + continue; + + start_pfn = ei->addr >> PAGE_SHIFT; + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); + return last_pfn; +} +unsigned long __init e820_end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); +} + +unsigned long __init e820_end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); +} +/* + * Finds an active region in the address range from start_pfn to last_pfn and + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. + */ +int __init e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long last_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) +{ + u64 align = PAGE_SIZE; + +#ifdef CONFIG_XEN + if (last_pfn > xen_start_info->nr_pages) + last_pfn = xen_start_info->nr_pages; +#endif + + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; + + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || + *ei_startpfn >= last_pfn) + return 0; + + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > last_pfn) + *ei_endpfn = last_pfn; + + return 1; +} + +/* Walk the e820 map and register active regions within a node */ +void __init e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long last_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + int i; + + for (i = 0; i < e820.nr_map; i++) + if (e820_find_active_region(&e820.map[i], + start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); +#ifdef CONFIG_XEN + BUG_ON(nid); + add_active_range(nid, last_pfn, last_pfn); +#endif +} + +/* + * Find the hole size (in bytes) in the memory range. + * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +u64 __init e820_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn, ei_endpfn, ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + if (e820_find_active_region(&e820.map[i], + start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + } + return end - start - ((u64)ram << PAGE_SHIFT); +} + +static void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +static int userdef __initdata; + +/* "mem=nopentium" disables the 4MB page tables. 
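+ * Plain "mem=<size>" instead clamps usable RAM: for example
+ * "mem=512M" removes every E820_RAM byte above 0x20000000 and, if
+ * the map ended below that mark, extends the final RAM entry up
+ * to it.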
*/ +static int __init parse_memopt(char *p) +{ + u64 mem_size, current_end; + unsigned int i; + + if (!p) + return -EINVAL; + +#ifdef CONFIG_X86_32 + if (!strcmp(p, "nopentium")) { + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; + } +#endif + + userdef = 1; + mem_size = memparse(p, &p); +#ifdef CONFIG_XEN + /* + * A little less than 2% of available memory are needed for page + * tables, p2m map, and mem_map. Hence the maximum amount of memory + * we can potentially balloon up to can in no case exceed about 50 + * times of what we've been given initially. Since even with that we + * won't be able to boot (due to various calculations done based on + * the total number of pages) we further restrict this to factor 32. + */ + if ((mem_size >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) { + u64 size = (u64)xen_start_info->nr_pages << 5; + + printk(KERN_WARNING "mem=%Luk is invalid for an initial" + " allocation of %luk, using %Luk\n", + (unsigned long long)mem_size >> 10, + xen_start_info->nr_pages << (PAGE_SHIFT - 10), + (unsigned long long)size << (PAGE_SHIFT - 10)); + mem_size = size << PAGE_SHIFT; + } +#endif + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + i = e820.nr_map - 1; + current_end = e820.map[i].addr + e820.map[i].size; + if (current_end < mem_size) { + /* + * The e820 map ends before our requested size so + * extend the final entry to the requested address. + */ + if (e820.map[i].type == E820_RAM) + e820.map[i].size = mem_size - e820.map[i].addr; + else + e820_add_region(current_end, mem_size - current_end, E820_RAM); + } + + return 0; +} +early_param("mem", parse_memopt); + +#ifndef CONFIG_XEN +static int __init parse_memmap_opt(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is + * reset. + */ + saved_max_pfn = e820_end_of_ram_pfn(); +#endif + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RESERVED); + } else + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); +#endif + +void __init finish_e820_parsing(void) +{ + if (userdef) { + u32 nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + + printk(KERN_INFO "user-defined physical RAM map:\n"); + _e820_print_map(&e820, "user"); + } +} + +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; + default: return "reserved"; + } +} + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + +/* + * Mark e820 reserved areas as busy for the resource manager. 
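+ *
+ * Registration happens in two phases: most regions are inserted
+ * here with IORESOURCE_BUSY, while E820_RESERVED regions at or
+ * above 1MB are left for e820_reserve_resources_late() so that
+ * they cannot collide with PCI BAR resources assigned in between.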
+ */ +static struct resource __initdata *e820_res; +void __init e820_reserve_resources(void) +{ + int i; + struct resource *res; + u64 end; + + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + e820_res = res; + for (i = 0; i < e820.nr_map; i++) { + end = e820.map[i].addr + e820.map[i].size - 1; + if (end != (resource_size_t)end) { + res++; + continue; + } + res->name = e820_type_to_string(e820.map[i].type); + res->start = e820.map[i].addr; + res->end = end; + + res->flags = IORESOURCE_MEM; + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } + res++; + } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size - 1, + e820_type_to_string(entry->type)); + } +} + +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 64MB for anything above that */ + return 64*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820.map[i]; + u64 start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); + } +} + +#undef e820 + +char *__init default_machine_specific_memory_setup(void) +{ + int rc, nr_map; + unsigned long long maxmem; + struct xen_memory_map memmap; + /* + * This is rather large for a stack variable but this early in + * the boot process we know we have plenty slack space. + */ + struct e820entry map[E820MAX]; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages); + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + nr_map = memmap.nr_entries; + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map); + + if (append_e820_map(map, nr_map) < 0) + BUG(); + +#ifdef CONFIG_XEN + /* See the comment in parse_memopt(). 
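+	 * The same factor-32 clamp applies here: with an initial
+	 * allocation of, say, 65536 pages (256MiB with 4k pages), the
+	 * RAM admitted from the Xen-provided map is capped at
+	 * 65536 << 5 = 2097152 pages, i.e. 8GiB.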
*/ + for (maxmem = rc = 0; rc < e820.nr_map; ++rc) + if (e820.map[rc].type == E820_RAM) + maxmem += e820.map[rc].size; + if ((maxmem >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) { + unsigned long long size = (u64)xen_start_info->nr_pages << 5; + + printk(KERN_WARNING "maxmem of %LuM is invalid for an initial" + " allocation of %luM, using %LuM\n", + maxmem >> 20, + xen_start_info->nr_pages >> (20 - PAGE_SHIFT), + size >> (20 - PAGE_SHIFT)); + size <<= PAGE_SHIFT; + e820_remove_range(size, ULLONG_MAX - size, E820_RAM, 1); + } + + if (is_initial_xendomain()) { + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; + } +#endif + + return "Xen"; +} + +void __init setup_memory_map(void) +{ + char *who; + + who = x86_init.resources.memory_setup(); +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + printk(KERN_INFO "Xen-provided machine memory map:\n"); + _e820_print_map(&machine_e820, "BIOS"); + } else +#endif + memcpy(&e820_saved, &e820, sizeof(struct e820map)); + printk(KERN_INFO "Xen-provided physical RAM map:\n"); + _e820_print_map(&e820, who); +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/entry_32-xen.S +++ linux-ec2-2.6.32/arch/x86/kernel/entry_32-xen.S @@ -0,0 +1,1739 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#define nr_syscalls ((syscall_table_size)/4) + +/* Pseudo-eflags. 
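+ *
+ * Bit 31 is always zero in a real EFLAGS image, so this Xen port
+ * borrows it in the saved PT_EFLAGS word as an "NMI in progress"
+ * flag (set in the nmi handler, tested and cleared on the
+ * hypervisor_iret path).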
*/ +NMI_MASK = 0x80000000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl %gs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0;*/ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0;*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0;*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi + popl %ebp + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebp + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE ds;*/ +2: popl %es + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE 
es;*/ +3: popl %fs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE fs;*/ + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.section __ex_table, "a" + .align 4 + .long 1b, 4b + .long 2b, 5b + .long 3b, 6b +.popsection + POP_GS_EX +.endm + +.macro RING0_INT_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 3*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_EC_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 4*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_PTREGS_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ + CFI_OFFSET eip, PT_EIP-PT_OLDESP + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ + CFI_OFFSET eax, PT_EAX-PT_OLDESP + CFI_OFFSET ebp, PT_EBP-PT_OLDESP + CFI_OFFSET edi, PT_EDI-PT_OLDESP + CFI_OFFSET esi, PT_ESI-PT_OLDESP + CFI_OFFSET edx, PT_EDX-PT_OLDESP + CFI_OFFSET ecx, PT_ECX-PT_OLDESP + CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 + jmp syscall_exit + CFI_ENDPROC +END(ret_from_fork) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_all +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl SYSENTER_stack_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. 
+ */ + pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 + pushfl + orl $X86_EFLAGS_IF, (%esp) + CFI_ADJUST_CFA_OFFSET 4 + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp + movl %ebp,PT_EBP(%esp) +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + GET_VCPU_INFO +#endif + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + + CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # pv sysenter call handler stub +ENTRY(ia32pv_sysenter_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,12(%esp) + movl $__USER_CS,4(%esp) + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
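+ *
+ * The check below ("cmpl $__PAGE_OFFSET-3, %ebp; jae syscall_fault")
+ * rejects any %ebp whose 4-byte load could touch kernel space: the
+ * last byte fetched is %ebp+3, which must stay below __PAGE_OFFSET.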
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + /* fall through */ + CFI_ENDPROC +ENDPROC(ia32pv_sysenter_target) + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifndef CONFIG_XEN + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: +#else +restore_nocheck: + movl PT_EFLAGS(%esp), %eax + testl $(X86_EFLAGS_VM|NMI_MASK), %eax + CFI_REMEMBER_STATE + jnz hypervisor_iret + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + CFI_REMEMBER_STATE + jnz restore_all_enable_events # != 0 => enable event delivery +#endif + RESTORE_REGS 4 # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long irq_return,iret_exc +.previous + + CFI_RESTORE_STATE +#ifndef CONFIG_XEN +ldt_ss: + larl PT_OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. 
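+ *
+ * Worked example (hypothetical values): with kernel %esp =
+ * 0xc1234abc and userspace %esp = 0x00005678, %eax below becomes
+ * 0x00004abc (user high word, kernel low word) and the ESPFIX base
+ * becomes 0xc1234abc - 0x00004abc = 0xc1230000.  The "lss" then
+ * still addresses the same kernel stack, while the high word that
+ * iret will not restore is already the userspace one.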
+ */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + pushl $__ESPFIX_SS + CFI_ADJUST_CFA_OFFSET 4 + push %eax /* new kernel esp */ + CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck +#else + ALIGN +restore_all_enable_events: + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +scrit: /**** START OF CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + RESTORE_REGS 4 +1: INTERRUPT_RETURN +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +ecrit: /**** END OF CRITICAL REGION ****/ + jmp .Ldo_upcall + + CFI_RESTORE_STATE +hypervisor_iret: + andl $~NMI_MASK, PT_EFLAGS(%esp) + RESTORE_REGS 4 + jmp hypercall_page + (__HYPERVISOR_iret * 32) +#endif + CFI_ENDPROC +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp +#else + movl %esp, %eax +#endif + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,PT_EAX(%esp) + jmp resume_userspace +END(syscall_badsys) + CFI_ENDPROC + +/* + * System calls that need a pt_regs pointer. 
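+ *
+ * Each PTREGSCALL(name) below expands to a tiny stub; for "fork":
+ *
+ *	ptregs_fork:
+ *		leal 4(%esp), %eax	# pt_regs starts past the return address
+ *		jmp sys_fork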
+ */
+#define PTREGSCALL(name) \
+	ALIGN; \
+ptregs_##name: \
+	leal 4(%esp),%eax; \
+	jmp sys_##name;
+
+PTREGSCALL(iopl)
+PTREGSCALL(fork)
+PTREGSCALL(clone)
+PTREGSCALL(vfork)
+PTREGSCALL(execve)
+PTREGSCALL(sigaltstack)
+PTREGSCALL(sigreturn)
+PTREGSCALL(rt_sigreturn)
+PTREGSCALL(vm86)
+PTREGSCALL(vm86old)
+
+#ifndef CONFIG_XEN
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back from the ESPFIX stack to the normal zero-based stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and switches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+	/* fixup the stack */
+	PER_CPU(gdt_page, %ebx)
+	mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
+	mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+	shl $16, %eax
+	addl %esp, %eax			/* the adjusted stack pointer */
+	pushl $__KERNEL_DS
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	lss (%esp), %esp		/* switch to the normal stack segment */
+	CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+	movl %ss, %eax
+	/* see if on espfix stack */
+	cmpw $__ESPFIX_SS, %ax
+	jne 27f
+	movl $__KERNEL_DS, %eax
+	movl %eax, %ds
+	movl %eax, %es
+	/* switch to normal stack */
+	FIXUP_ESPFIX_STACK
+27:
+.endm
+
+/*
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
+ */
+.section .init.rodata,"a"
+ENTRY(interrupt)
+.text
+	.p2align 5
+	.p2align CONFIG_X86_L1_CACHE_SHIFT
+ENTRY(irq_entries_start)
+	RING0_INT_FRAME
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+	.balign 32
+  .rept	7
+    .if vector < NR_VECTORS
+      .if vector <> FIRST_EXTERNAL_VECTOR
+	CFI_ADJUST_CFA_OFFSET -4
+      .endif
+1:	pushl $(~vector+0x80)	/* Note: always in signed byte range */
+	CFI_ADJUST_CFA_OFFSET 4
+      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+	jmp 2f
+      .endif
+      .previous
+	.long 1b
+      .text
+vector=vector+1
+    .endif
+  .endr
+2:	jmp common_interrupt
+.endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+	.p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	movl %esp,%eax
+	call do_IRQ
+	jmp ret_from_intr
+ENDPROC(common_interrupt)
+	CFI_ENDPROC
+
+#define BUILD_INTERRUPT3(name, nr, fn)	\
+ENTRY(name)				\
+	RING0_INT_FRAME;		\
+	pushl $~(nr);			\
+	CFI_ADJUST_CFA_OFFSET 4;	\
+	SAVE_ALL;			\
+	TRACE_IRQS_OFF			\
+	movl %esp,%eax;			\
+	call fn;			\
+	jmp ret_from_intr;		\
+	CFI_ENDPROC;			\
+ENDPROC(name)
+
+#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include
+
+#else
+#define UNWIND_ESPFIX_STACK
+
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
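+# (Concretely: an event that fires after events are re-enabled but
+# before the final iret would start a second activation on top of the
+# half-popped frame of the first.)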
+# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +# +# The sysexit critical region is slightly different. sysexit +# atomically removes the entire stack frame. If we interrupt in the +# critical region we know that the entire frame is present and correct +# so we can simply throw away the new one. +ENTRY(hypervisor_callback) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + movl PT_CS(%esp),%ecx + movl PT_EIP(%esp),%eax + andl $SEGMENT_RPL_MASK,%ecx + cmpl $USER_RPL,%ecx + jae .Ldo_upcall + cmpl $scrit,%eax + jb 0f + cmpl $ecrit,%eax + jb critical_region_fixup +0: +#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL + cmpl $sysexit_scrit,%eax + jb .Ldo_upcall + cmpl $sysexit_ecrit,%eax + ja .Ldo_upcall + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. +#endif +.Ldo_upcall: + push %esp + CFI_ADJUST_CFA_OFFSET 4 + call evtchn_do_upcall + add $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_intr + CFI_ENDPROC + +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped + testl %ecx,%ecx + leal (%esp,%ecx,4),%esi # %esi points at end of src region + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region + jle 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp .Ldo_upcall + +.section .rodata,"a" +critical_fixup_table: + .rept __SIZEOF_TEST_PENDING + .byte -1 + .endr + .byte -1,-1 # jnz 14f + .byte 0 # pop %ebx + .byte 1 # pop %ecx + .byte 2 # pop %edx + .byte 3 # pop %esi + .byte 4 # pop %edi + .byte 5 # pop %ebp + .byte 6 # pop %eax + .byte 7 # pop %ds + .byte 8 # pop %es + .byte 9,9 # pop %fs +#ifndef CONFIG_X86_32_LAZY_GS + .byte 10,10 # pop %gs + .byte 11,11,11 # add $4,%esp +#else + .byte 10,10,10 # add $8,%esp +#endif + .byte 12 # iret + .rept __SIZEOF_DISABLE_INTERRUPTS + .byte -1 + .endr +.previous + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. 
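+#
+# Hypothetical Category 1 walk-through: if the saved %ds selector has
+# become invalid, the reload at label 1 below faults; the fixup at
+# label 6 zeroes both the saved slot and EAX and retries, so the zero
+# left in EAX routes us to the "Bad segment" path at label 5.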
+ENTRY(failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + RING0_INT_FRAME + pushl $0 + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: xorl %eax,%eax; \ + movl %eax,4(%esp); \ + jmp 1b; \ +7: xorl %eax,%eax; \ + movl %eax,8(%esp); \ + jmp 2b; \ +8: xorl %eax,%eax; \ + movl %eax,12(%esp); \ + jmp 3b; \ +9: xorl %eax,%eax; \ + movl %eax,16(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous +#endif + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(simd_coprocessor_error) + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_device_not_available + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret +.section __ex_table,"a" + .align 4 + .long native_iret, iret_exc +.previous +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(overflow) + +ENTRY(bounds) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(bounds) + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(invalid_TSS) + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(segment_not_present) + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(stack_segment) + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(alignment_check) + +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(machine_check) +#endif + +#ifndef CONFIG_XEN +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 + jmp 
error_code + CFI_ENDPROC +#endif /* !CONFIG_XEN */ + +ENTRY(fixup_4gb_segment) + RING0_EC_FRAME + pushl $do_fixup_4gb_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(spurious_interrupt_bug) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, PT_EBX(%edx) + xorl %ebx, %ebx + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl $__KERNEL_PERCPU, PT_FS(%edx) + movl $__KERNEL_STACK_CANARY, PT_GS(%edx) + movl %eax, PT_OLDESP(%edx) + movl 16(%esp), %eax + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, PT_CS(%edx) + movl %eax, 12(%esp) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl %ebx, PT_EFLAGS(%edx) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + ud2 # padding for call trace + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + +#include + + # pv syscall call handler stub +ENTRY(ia32pv_cstar_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,%ecx + movl $__USER_CS,4(%esp) + movl 12(%esp),%ebp + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
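+ *
+ * Same bound as the sysenter path, just phrased with "ja": %ebp may
+ * be at most __PAGE_OFFSET-4, so the 4-byte load ending at %ebp+3
+ * never reaches a kernel address.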
+ */ + cmpl $__PAGE_OFFSET-4,%ebp + CFI_REMEMBER_STATE + ja cstar_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,cstar_fault +.previous + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz cstar_trace_entry + cmpl $nr_syscalls,%eax + jae cstar_badsys +.Lcstar_call: + btl %eax,cstar_special + jc .Lcstar_special + call *cstar_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +.Lcstar_exit: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_exit +.Lcstar_special: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_call +cstar_set_tif: + movl $cstar_clear_tif,(%esp) # replace return address + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + jmp *sys_call_table(,%eax,4) +cstar_clear_tif: + movl %eax,PT_EAX(%esp) # store the return value + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp .Lcstar_exit +cstar_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + cmpl $nr_syscalls,%eax + jae 1f + btl %eax,cstar_special + jc .Lcstar_trace_special +1: movl %esp,%eax + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + call syscall_trace_enter + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + /* What it returned is what we'll actually use. */ + cmpl $nr_syscalls,%eax + jb .Lcstar_call + jmp .Lcstar_exit +.Lcstar_trace_special: + movl PT_ECX(%esp),%ecx + movl %esp,%eax + movl %ecx,PT_EBP(%esp) # put user EBP back in place + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $nr_syscalls,%eax + jb syscall_call + jmp syscall_exit +cstar_badsys: + movl $-ENOSYS,PT_EAX(%esp) +.Lcstar_resume: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp resume_userspace + CFI_RESTORE_STATE +cstar_fault: + movl $-EFAULT,%eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + jmp .Lcstar_resume + CFI_ENDPROC +ENDPROC(ia32pv_cstar_target) + +ENTRY(cstar_ret_from_fork) + CFI_STARTPROC + movl PT_ECX(%esp),%ecx + GET_THREAD_INFO(%ebp) + movl %ecx,PT_EBP(%esp) # put user EBP back in place + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp ret_from_fork + CFI_ENDPROC +END(ret_from_fork) + +.section .rodata,"a" +#include "syscall_table_32.S" + +syscall_table_size=(.-sys_call_table) + +#include +cstar_special: +nr=0 +mask=0 +.rept nr_syscalls+31 + .irp n, __NR_sigreturn, __NR_rt_sigreturn + .if nr == \n + mask = mask | (1 << (\n & 31)) + .endif + .endr + nr = nr + 1 + .if (nr & 31) == 0 + .long mask + mask = 0 + .endif +.endr +#define sys_call_table cstar_call_table +#define sys_fork cstar_set_tif +#define sys_clone cstar_set_tif +#define sys_vfork cstar_set_tif +#include "syscall_table_32.S" +#undef sys_call_table +#undef sys_fork +#undef sys_clone +#undef sys_vfork + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + 
CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + CFI_DEF_CFA esp, 0 + CFI_UNDEFINED eip + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $__KERNEL_CS + CFI_ADJUST_CFA_OFFSET 4 + pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 +.endm +#endif /* CONFIG_XEN */ + +ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 +#ifndef CONFIG_XEN + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. 
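+ * The THREAD_SIZE masking below keeps the probe within the current
+ * kernel stack page; %eax is only a scratch copy of %esp here.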
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_all_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK 12, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK 24, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addl $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+ jmp irq_return
+#else
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ orl $NMI_MASK, PT_EFLAGS(%esp)
+ jmp restore_all
+#endif
+ CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(general_protection)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
--- linux-ec2-2.6.32.orig/arch/x86/kernel/sfi.c
+++ linux-ec2-2.6.32/arch/x86/kernel/sfi.c
@@ -31,7 +31,7 @@
 #include
 #include
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
 static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 
 void __init mp_sfi_register_lapic_address(unsigned long address)
@@ -99,9 +99,12 @@
 		pentry++;
 	}
 
+#ifndef CONFIG_XEN
 	WARN(pic_mode, KERN_WARNING
 		"SFI: pic_mode shouldn't be 1 when IOAPIC table is present\n");
 	pic_mode = 0;
+#endif
+
 	return 0;
 }
 #endif /* CONFIG_X86_IO_APIC */
@@ -111,7 +114,7 @@
  */
 int __init sfi_platform_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
 	mp_sfi_register_lapic_address(sfi_lapic_addr);
 	sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
 #endif
--- linux-ec2-2.6.32.orig/arch/x86/kernel/early_printk-xen.c
+++ linux-ec2-2.6.32/arch/x86/kernel/early_printk-xen.c
@@ -0,0 +1,271 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef CONFIG_XEN
+/* Simple VGA output */
+#define VGABASE (__ISA_IO_base + 0xb8000)
+
+static int max_ypos = 25, max_xpos = 80;
+static int current_ypos = 25, current_xpos;
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+ char c;
+ int i, k, j;
+
+ while ((c = *str++) != '\0' && n-- > 0) {
+ if (current_ypos >= max_ypos) {
+ /* scroll 1 line up */
+ for (k = 1, j = 0; k < 
max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s, "0x", 2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s, "ttyS", 4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +#else /* CONFIG_XEN */ + +static void +early_serial_write(struct console *con, const char *s, unsigned count) +{ + int n; + + while (count > 0) { + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); + if (n <= 0) + break; + count -= n; + s += n; + } +} + +static __init void early_serial_init(char *s) +{ +} + +/* + * No early VGA console on Xen, as we do not have convenient ISA-space + * mappings. Someone should fix this for domain 0. For now, use fake serial. 
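+ * (The "serial" variant above is already hypervisor-backed: it loops over
+ * HYPERVISOR_console_io(CONSOLEIO_write, ...) until the buffer is drained.)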
+ */
+#define early_vga_console early_serial_console
+
+#endif
+
+static struct console early_serial_console = {
+ .name = "earlyser",
+ .write = early_serial_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Direct interface for emergencies */
+static struct console *early_console = &early_vga_console;
+static int __initdata early_console_initialized;
+
+asmlinkage void early_printk(const char *fmt, ...)
+{
+ char buf[512];
+ int n;
+ va_list ap;
+
+ va_start(ap, fmt);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
+ early_console->write(early_console, buf, n);
+ va_end(ap);
+}
+
+static inline void early_console_register(struct console *con, int keep_early)
+{
+ if (early_console->index != -1) {
+ printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
+ con->name);
+ return;
+ }
+ early_console = con;
+ if (keep_early)
+ early_console->flags &= ~CON_BOOT;
+ else
+ early_console->flags |= CON_BOOT;
+ register_console(early_console);
+}
+
+static int __init setup_early_printk(char *buf)
+{
+ int keep;
+
+ if (!buf)
+ return 0;
+
+ if (early_console_initialized)
+ return 0;
+ early_console_initialized = 1;
+
+ keep = (strstr(buf, "keep") != NULL);
+
+ while (*buf != '\0') {
+ if (!strncmp(buf, "serial", 6)) {
+ buf += 6;
+ early_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ if (!strncmp(buf, ",ttyS", 5))
+ buf += 5;
+ }
+ if (!strncmp(buf, "ttyS", 4)) {
+ early_serial_init(buf + 4);
+ early_console_register(&early_serial_console, keep);
+ }
+#ifndef CONFIG_XEN
+ if (!strncmp(buf, "vga", 3) &&
+ boot_params.screen_info.orig_video_isVGA == 1) {
+ max_xpos = boot_params.screen_info.orig_video_cols;
+ max_ypos = boot_params.screen_info.orig_video_lines;
+ current_ypos = boot_params.screen_info.orig_y;
+#else
+ if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) {
+#endif
+ early_console_register(&early_vga_console, keep);
+ }
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+ if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
+ early_console_register(&early_dbgp_console, keep);
+#endif
+#ifdef CONFIG_HVC_XEN
+ if (!strncmp(buf, "xen", 3))
+ early_console_register(&xenboot_console, keep);
+#endif
+ buf++;
+ }
+ return 0;
+}
+
+early_param("earlyprintk", setup_early_printk);
--- linux-ec2-2.6.32.orig/arch/x86/kernel/entry_64-xen.S
+++ linux-ec2-2.6.32/arch/x86/kernel/entry_64-xen.S
@@ -0,0 +1,1440 @@
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek
+ * Jun Nakajima
+ * Asit Mallick
+ * Modified for Xen
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
+ * only done for syscall tracing, signals or fork/exec et al.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ * - partial stack frame: partially saved registers up to R11.
+ * - full stack frame: Like partial stack frame, but all registers saved.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers are
+ * not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Avoid __ASSEMBLER__'ifying just for this. */
+#include
+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE 0x40000000
+
+ .code64
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+ retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ MCOUNT_SAVE_FRAME
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ MCOUNT_RESTORE_FRAME
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ MCOUNT_SAVE_FRAME
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ MCOUNT_SAVE_FRAME
+
+ leaq 8(%rbp), %rdi
+ movq 0x38(%rsp), %rsi
+ movq (%rbp), %rdx
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+GLOBAL(return_to_handler)
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, 16(%rsp)
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $16, %rsp
+ retq
+#endif
+
+
+#ifndef CONFIG_PREEMPT
+#define retint_kernel retint_restore_args
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret64)
+ swapgs
+ sysretq
+ENDPROC(native_usergs_sysret64)
+#endif /* CONFIG_PARAVIRT */
+
+
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+NMI_MASK = 0x80000000
+
+/*
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with a pt_regs argument is called from the SYSCALL based
+ * fast path FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation. 
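+ * In this Xen flavor a full iret frame is always present, so
+ * RESTORE_TOP_OF_STACK below is empty and FIXUP_TOP_OF_STACK only has
+ * to fake up the saved CS and RCX.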
+ */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp offset=0 + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorl %eax, %eax + pushq $__KERNEL_DS /* ss */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET ss,0*/ + pushq %rax /* rsp */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rsp,0 + pushq $X86_EFLAGS_IF /* eflags - interrupts on */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET rflags,0*/ + pushq $__KERNEL_CS /* cs */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET cs,0*/ + pushq \child_rip /* rip */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip,0 + pushq %rax /* orig rax */ + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME offset=0 + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,\offset + .endm + +/* + * initial frame state for syscall + */ + .macro BASIC_FRAME start=1 offset=0 + .if \start + EMPTY_FRAME __stringify(SS+8+\offset-RIP) + .else + CFI_DEF_CFA_OFFSET SS+8+\offset-RIP + .endif + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + .if \start == 1 + BASIC_FRAME 1, \offset+2*8 + CFI_REL_OFFSET rcx, 0+\offset + CFI_REL_OFFSET r11, 8+\offset + .else + BASIC_FRAME \start, \offset + .endif + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, __stringify(RIP+\offset-ORIG_RAX) + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + .if \start >= 0 + XCPT_FRAME 2*\start, __stringify(ORIG_RAX+\offset-ARGOFFSET) + .endif + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + .if \start >= -1 + PARTIAL_FRAME \start, __stringify(R11+\offset-R15) + .endif + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm + + /* + * Must be consistent with the definition in arch-x86/xen-x86_64.h: + * struct iret_context { + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + * }; + * with rax, r11, and rcx being taken care of in the hypercall stub. + */ + .macro HYPERVISOR_IRET flag + testb $3,1*8(%rsp) + jnz 2f + testl $NMI_MASK,2*8(%rsp) + jnz 2f + + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip) + jne 1f + + /* Direct iret to kernel space. Correct CS and SS. */ + orl $3,1*8(%rsp) + orl $3,4*8(%rsp) +1: iretq + +2: /* Slow iret via hypervisor. 
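+	   Clear NMI_MASK in the saved flags, push the caller's flag word
+	   (e.g. VGCF_IN_SYSCALL) and enter Xen through the iret hypercall
+	   stub in the hypercall page. 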
*/
+ andl $~NMI_MASK, 2*8(%rsp)
+ pushq $\flag
+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
+ .endm
+
+#ifndef CONFIG_XEN
+/* save partial stack frame */
+ENTRY(save_args)
+ XCPT_FRAME offset=__stringify(ORIG_RAX-ARGOFFSET+16)
+ cld
+ movq %rdi, RDI+16-ARGOFFSET(%rsp)
+ movq %rsi, RSI+16-ARGOFFSET(%rsp)
+ movq %rdx, RDX+16-ARGOFFSET(%rsp)
+ movq %rcx, RCX+16-ARGOFFSET(%rsp)
+ movq_cfi rax, __stringify(RAX+16-ARGOFFSET)
+ movq %r8, R8+16-ARGOFFSET(%rsp)
+ movq %r9, R9+16-ARGOFFSET(%rsp)
+ movq %r10, R10+16-ARGOFFSET(%rsp)
+ movq_cfi r11, __stringify(R11+16-ARGOFFSET)
+
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq_cfi rbp, 8 /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %rbp */
+ CFI_DEF_CFA_REGISTER rbp
+ CFI_ADJUST_CFA_OFFSET -8
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl PER_CPU_VAR(irq_count)
+ jne 2f
+ popq %rax /* move return address... */
+ mov PER_CPU_VAR(irq_stack_ptr),%rsp
+ pushq %rbp /* backlink for unwinder */
+ pushq %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+#endif
+
+ENTRY(save_rest)
+ CFI_STARTPROC
+ movq 5*8+16(%rsp), %r11 /* save return address */
+ movq %rbx, RBX+16(%rsp)
+ movq %rbp, RBP+16(%rsp)
+ movq %r12, R12+16(%rsp)
+ movq %r13, R13+16(%rsp)
+ movq %r14, R14+16(%rsp)
+ movq %r15, R15+16(%rsp)
+ movq %r11, 8(%rsp) /* return address */
+ FIXUP_TOP_OF_STACK %r11, 16
+ ret
+ CFI_ENDPROC
+END(save_rest)
+
+#ifndef CONFIG_XEN
+/* save complete stack frame */
+ .pushsection .kprobes.text, "ax"
+ENTRY(save_paranoid)
+ XCPT_FRAME offset=__stringify(ORIG_RAX-R15+8)
+ cld
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
+ movq_cfi rdx, __stringify(RDX+8)
+ movq_cfi rcx, __stringify(RCX+8)
+ movq_cfi rax, __stringify(RAX+8)
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
+ movq_cfi rbx, __stringify(RBX+8)
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(save_paranoid)
+ .popsection
+#endif
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+ DEFAULT_FRAME
+
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+ push kernel_eflags(%rip)
+ CFI_ADJUST_CFA_OFFSET 8
+ popf # reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET -8
+
+ call schedule_tail # rdi: 'prev' task parameter
+
+ GET_THREAD_INFO(%rcx)
+
+ RESTORE_REST
+
+ testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
+ je int_ret_from_sys_call
+
+ testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
+ jnz int_ret_from_sys_call
+
+ RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+ jmp ret_from_sys_call # go to the SYSRET fastpath
+
+ CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * System call entry. Up to 6 arguments in registers are supported.
+ *
+ * SYSCALL does not save anything on the stack and does not change the
+ * stack pointer. 
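+ * Under Xen the hypervisor intercepts SYSCALL and hands us an iret frame
+ * with RCX/R11 already pushed, which is why system_call below starts with
+ * INTR_FRAME start=2 offset=2*8 instead of the native frame setup.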
+ */
+
+/*
+ * Register setup:
+ * rax system call number
+ * rdi arg0
+ * rcx return address for syscall/sysret, C arg3
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (--> moved to rcx for C)
+ * r8 arg4
+ * r9 arg5
+ * r11 eflags for syscall/sysret, temporary for C
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
+ * Interrupts are enabled on entry.
+ * Only called from user space.
+ *
+ * XXX if we had a free scratch register we could save the RSP into the stack frame
+ * and report it properly in ps. Unfortunately we don't have one.
+ *
+ * When the user can change the frames, always force IRET. That is because
+ * it deals with non-canonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
+ENTRY(system_call)
+ INTR_FRAME start=2 offset=2*8
+ SAVE_ARGS -8,0
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ jnz tracesys
+system_call_fastpath:
+ cmpq $__NR_syscall_max,%rax
+ ja badsys
+ movq %r10,%rcx
+ call *sys_call_table(,%rax,8) # XXX: rip relative
+ movq %rax,RAX-ARGOFFSET(%rsp)
+/*
+ * Syscall return path ending with SYSRET (fast path)
+ * Has incomplete stack frame and undefined top of stack.
+ */
+ret_from_sys_call:
+ movl $_TIF_ALLWORK_MASK,%edi
+ /* edi: flagmask */
+sysret_check:
+ LOCKDEP_SYS_EXIT
+ GET_THREAD_INFO(%rcx)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ movl TI_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz sysret_careful
+ CFI_REMEMBER_STATE
+ /*
+ * sysretq will re-enable interrupts:
+ */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET VGCF_IN_SYSCALL
+
+ CFI_RESTORE_STATE
+ /* Handle reschedules */
+ /* edx: work, edi: workmask */
+sysret_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc sysret_signal
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ jmp sysret_check
+
+ /* Handle a signal */
+sysret_signal:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+#ifdef CONFIG_AUDITSYSCALL
+ bt $TIF_SYSCALL_AUDIT,%edx
+ jc sysret_audit
+#endif
+ /*
+ * We have a signal, or exit tracing or single-step.
+ * These all wind up with the iret return path anyway,
+ * so just join that path right now.
+ */
+ FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+ jmp int_check_syscall_exit_work
+
+badsys:
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+ jmp ret_from_sys_call
+
+#ifdef CONFIG_AUDITSYSCALL
+ /*
+ * Fast path for syscall audit without full syscall trace.
+ * We just call audit_syscall_entry() directly, and then
+ * jump back to the normal fast path.
+ */
+auditsys:
+ movq %r10,%r9 /* 6th arg: 4th syscall arg */
+ movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
+ movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
+ movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
+ movq %rax,%rsi /* 2nd arg: syscall number */
+ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
+ call audit_syscall_entry
+ LOAD_ARGS 0 /* reload call-clobbered registers */
+ jmp system_call_fastpath
+
+ /*
+ * Return fast path for syscall audit. Call audit_syscall_exit()
+ * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
+ * masked off.
+ */
+sysret_audit:
+ movq %rax,%rsi /* second arg, syscall return value */
+ cmpq $0,%rax /* is it < 0? */
+ setl %al /* 1 if so, 0 if not */
+ movzbl %al,%edi /* zero-extend that into %edi */
+ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+ call audit_syscall_exit
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+ jmp sysret_check
+#endif /* CONFIG_AUDITSYSCALL */
+
+ /* Do syscall tracing */
+tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ jz auditsys
+#endif
+ SAVE_REST
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+ FIXUP_TOP_OF_STACK %rdi
+ movq %rsp,%rdi
+ call syscall_trace_enter
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * We don't reload %rax because syscall_trace_enter() returned
+ * the value it wants us to use in the table lookup.
+ */
+ LOAD_ARGS ARGOFFSET, 1
+ RESTORE_REST
+ cmpq $__NR_syscall_max,%rax
+ ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
+ movq %r10,%rcx /* fixup for C */
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ /* Use IRET because user could have changed frame */
+
+/*
+ * Syscall return path ending with IRET.
+ * Has correct top of stack, but partial stack frame.
+ */
+GLOBAL(int_ret_from_sys_call)
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testb $3,CS-ARGOFFSET(%rsp)
+ jnz 1f
+ /* Need to set the proper %ss (not NULL) for ring 3 iretq */
+ movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
+ jmp retint_restore_args # return from ring3 kernel
+1:
+ movl $_TIF_ALLWORK_MASK,%edi
+ /* edi: mask to check */
+GLOBAL(int_with_check)
+ LOCKDEP_SYS_EXIT_IRQ
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz int_careful
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp retint_restore_args
+
+ /* Either reschedule or signal or syscall exit tracking needed. */
+ /* First do a reschedule test. */
+ /* edx: work, edi: workmask */
+int_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc int_very_careful
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ call schedule
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+
+ /* handle signals and tracing -- both require a full stack frame */
+int_very_careful:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+int_check_syscall_exit_work:
+ SAVE_REST
+ /* Check for syscall exit trace */
+ testl $_TIF_WORK_SYSCALL_EXIT,%edx
+ jz int_signal
+ pushq %rdi
+ CFI_ADJUST_CFA_OFFSET 8
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
+ call syscall_trace_leave
+ popq %rdi
+ CFI_ADJUST_CFA_OFFSET -8
+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
+ jmp int_restore_rest
+
+int_signal:
+ testl $_TIF_DO_NOTIFY_MASK,%edx
+ jz 1f
+ movq %rsp,%rdi # &ptregs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+ call do_notify_resume
+1: movl $_TIF_WORK_MASK,%edi
+int_restore_rest:
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check
+ CFI_ENDPROC
+END(system_call)
+
+/*
+ * Certain special system calls that need to save a complete stack frame. 
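+ * (fork/clone/vfork, sigaltstack and iopl; see the PTREGSCALL instances
+ * below. Each stub extends the partial frame via save_rest and hands the
+ * C handler a pt_regs pointer in the designated argument register.)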
+ */ + .macro PTREGSCALL label,func,arg +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME -2 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC +END(\label) + .endm + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore __stringify(R15+8), r15 + movq_cfi_restore __stringify(R14+8), r14 + movq_cfi_restore __stringify(R13+8), r13 + movq_cfi_restore __stringify(R12+8), r12 + movq_cfi_restore __stringify(RBP+8), rbp + movq_cfi_restore __stringify(RBX+8), rbx + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(ptregscall_common) + +ENTRY(stub_execve) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execve) + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_rt_sigreturn) + +/* + * Interrupt exit. + */ + +retint_with_reschedule: + PARTIAL_FRAME + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz retint_careful +retint_restore_args: /* return to kernel space */ + movl EFLAGS-REST_SKIP(%rsp), %eax + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%rsi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + jnz restore_all_enable_events # != 0 => enable event delivery + + RESTORE_ARGS 0,8,0 + HYPERVISOR_IRET 0 + + /* edi: workmask, edx: work */ +retint_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_restore_args + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retint_kernel) + cmpl $0,TI_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,TI_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? 
*/ + jnc retint_restore_args + call preempt_schedule_irq + jmp retint_kernel /* check again */ +#endif + + CFI_ENDPROC +END(retint_check) + +#ifndef CONFIG_XEN +/* + * APIC interrupts. + */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) + INTR_FRAME + pushq $~(\num) + CFI_ADJUST_CFA_OFFSET 8 + interrupt \do_sym + jmp error_entry + CFI_ENDPROC +END(\sym) +.endm + +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ + generic_interrupt smp_generic_interrupt + +#ifdef CONFIG_SMP +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ + invalidate_interrupt0 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ + invalidate_interrupt1 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ + invalidate_interrupt2 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ + invalidate_interrupt3 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ + invalidate_interrupt4 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ + invalidate_interrupt5 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ + invalidate_interrupt6 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ + invalidate_interrupt7 smp_invalidate_interrupt +#endif + +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_X86_MCE +apicinterrupt MCE_SELF_VECTOR \ + mce_self_interrupt smp_mce_self_interrupt +#endif + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_PERF_EVENTS +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt +#endif +#endif /* !CONFIG_XEN */ + +/* + * Exception entry points. 
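+ * Unlike the native kernel, the zeroentry/errorentry macros below first
+ * reload the RCX/R11 pair that Xen saves on every exception frame before
+ * laying out the usual pt_regs area.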
+ */
+.macro zeroentry sym do_sym
+ENTRY(\sym)
+ INTR_FRAME
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15-8, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-8
+ call error_entry
+ DEFAULT_FRAME -1
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
+END(\sym)
+.endm
+
+.macro paranoidzeroentry sym do_sym
+ zeroentry \sym \do_sym
+.endm
+
+.macro paranoidzeroentry_ist sym do_sym ist
+ zeroentry \sym \do_sym
+.endm
+
+.macro errorentry sym do_sym
+ENTRY(\sym)
+ XCPT_FRAME
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ subq $ORIG_RAX-R15-2*8, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-2*8
+ call error_entry
+ DEFAULT_FRAME -1
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
+END(\sym)
+.endm
+
+ /* error code is on the stack already */
+.macro paranoiderrorentry sym do_sym
+ errorentry \sym \do_sym
+.endm
+
+/*
+ * Copied from arch/xen/i386/kernel/entry.S
+ */
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
+ CFI_STARTPROC
+# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
+# see the correct pointer to the pt_regs
+ movq %rdi, %rsp # we don't return, adjust the stack frame
+ CFI_ENDPROC
+ DEFAULT_FRAME
+11: incl PER_CPU_VAR(irq_count)
+ movq %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ pushq %rbp # backlink for old unwinder
+ call evtchn_do_upcall
+ popq %rsp
+ CFI_DEF_CFA_REGISTER rsp
+ decl PER_CPU_VAR(irq_count)
+ jmp error_exit
+ CFI_ENDPROC
+END(do_hypervisor_callback)
+
+ ALIGN
+restore_all_enable_events:
+ PARTIAL_FRAME
+ TRACE_IRQS_ON
+ __ENABLE_INTERRUPTS
+
+scrit: /**** START OF CRITICAL REGION ****/
+ __TEST_PENDING
+ CFI_REMEMBER_STATE
+ jnz 14f # process more events if necessary...
+ RESTORE_ARGS 0,8,0
+ HYPERVISOR_IRET 0
+
+ CFI_RESTORE_STATE
+14: __DISABLE_INTERRUPTS
+ SAVE_REST
+ movq %rsp,%rdi # set the argument again
+ jmp 11b
+ CFI_ENDPROC
+ecrit: /**** END OF CRITICAL REGION ****/
+# At this point, unlike on x86-32, we don't do the fixup to simplify the
+# code and the stack frame is more complex on x86-64.
+# When the kernel is interrupted in the critical section, the kernel
+# will do IRET in that case, and everything will be restored at that point,
+# i.e. it just resumes from the next instruction interrupted with the same context.
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we are in category 1.
+ENTRY(failsafe_callback)
+ INTR_FRAME offset=4*8
+ movw %ds,%cx
+ cmpw %cx,0x10(%rsp)
+ CFI_REMEMBER_STATE
+ jne 1f
+ movw %es,%cx
+ cmpw %cx,0x18(%rsp)
+ jne 1f
+ movw %fs,%cx
+ cmpw %cx,0x20(%rsp)
+ jne 1f
+ movw %gs,%cx
+ cmpw %cx,0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ movq $11,%rdi /* SIGSEGV */
+ jmp do_exit
+ CFI_RESTORE_STATE
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ SAVE_ALL
+ jmp error_exit
+ CFI_ENDPROC
+
+zeroentry divide_error do_divide_error
+zeroentry overflow do_overflow
+zeroentry bounds do_bounds
+zeroentry invalid_op do_invalid_op
+zeroentry device_not_available do_device_not_available
+zeroentry hypervisor_callback do_hypervisor_callback
+zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+errorentry invalid_TSS do_invalid_TSS
+errorentry segment_not_present do_segment_not_present
+zeroentry coprocessor_error do_coprocessor_error
+errorentry alignment_check do_alignment_check
+zeroentry simd_coprocessor_error do_simd_coprocessor_error
+
+/*
+ * Create a kernel thread.
+ *
+ * C extern interface:
+ * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ *
+ * asm input arguments:
+ * rdi: fn, rsi: arg, rdx: flags
+ */
+ENTRY(kernel_thread)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $child_rip
+ SAVE_ALL
+
+ # rdi: flags, rsi: usp, rdx: will be &pt_regs
+ movq %rdx,%rdi
+ orq kernel_thread_flags(%rip),%rdi
+ movq $-1, %rsi
+ movq %rsp, %rdx
+
+ xorl %r8d,%r8d
+ xorl %r9d,%r9d
+
+ # clone now
+ call do_fork
+ movq %rax,RAX(%rsp)
+ xorl %edi,%edi
+
+ /*
+ * It isn't worth checking for reschedule here,
+ * so internally to the x86_64 port you can rely on kernel_thread()
+ * not to reschedule the child before returning, this avoids the need
+ * of hacks for example to fork off the per-CPU idle tasks.
+ * [Hopefully no generic code relies on the reschedule -AK]
+ */
+ RESTORE_ALL
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+END(kernel_thread)
+
+ENTRY(child_rip)
+ pushq $0 # fake return address
+ CFI_STARTPROC
+ /*
+ * Here we are in the child and the registers are set as they were
+ * at kernel_thread() invocation in the parent.
+ */
+ movq %rdi, %rax
+ movq %rsi, %rdi
+ call *%rax
+ # exit
+ mov %eax, %edi
+ call do_exit
+ ud2 # padding for call trace
+ CFI_ENDPROC
+END(child_rip)
+
+/*
+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
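+ * (The SYSRET fast path restores only a partial frame, while a successful
+ * execve must set up every register for the new image.)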
+ *
+ * C extern interface:
+ * extern long execve(char *name, char **argv, char **envp)
+ *
+ * asm input arguments:
+ * rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fall back into:
+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ *
+ * do_sys_execve asm fallback arguments:
+ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
+ */
+ENTRY(kernel_execve)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $0
+ SAVE_ALL
+ movq %rsp,%rcx
+ call sys_execve
+ movq %rax, RAX(%rsp)
+ RESTORE_REST
+ testq %rax,%rax
+ jne 1f
+ jmp int_ret_from_sys_call
+1: RESTORE_ARGS
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+END(kernel_execve)
+
+/* Call softirq on interrupt stack. Interrupts are off. */
+ENTRY(call_softirq)
+ CFI_STARTPROC
+ push %rbp
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbp,0
+ mov %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr),%rsp
+ push %rbp # backlink for old unwinder
+ call __do_softirq
+ leaveq
+ CFI_DEF_CFA_REGISTER rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ decl PER_CPU_VAR(irq_count)
+ ret
+ CFI_ENDPROC
+END(call_softirq)
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movq %r15, R15(%rdi)
+ movq %r14, R14(%rdi)
+ xchgq %rsi, %rdx
+ movq %r13, R13(%rdi)
+ movq %r12, R12(%rdi)
+ xorl %eax, %eax
+ movq %rbp, RBP(%rdi)
+ movq %rbx, RBX(%rdi)
+ movq (%rsp), %r9
+ xchgq %rdx, %rcx
+ movq %rax, R11(%rdi)
+ movq %rax, R10(%rdi)
+ movq %rax, R9(%rdi)
+ movq %rax, R8(%rdi)
+ movq %rax, RAX(%rdi)
+ movq %rax, RCX(%rdi)
+ movq %rax, RDX(%rdi)
+ movq %rax, RSI(%rdi)
+ movq %rax, RDI(%rdi)
+ movq %rax, ORIG_RAX(%rdi)
+ movq %r9, RIP(%rdi)
+ leaq 8(%rsp), %r9
+ movq $__KERNEL_CS, CS(%rdi)
+ movq %rax, EFLAGS(%rdi)
+ movq %r9, RSP(%rdi)
+ movq $__KERNEL_DS, SS(%rdi)
+ jmpq *%rcx
+ CFI_ENDPROC
+END(arch_unwind_init_running)
+#endif
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+zeroentry nmi do_nmi_callback
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection
+errorentry page_fault do_page_fault
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check *machine_check_vector(%rip)
+#endif
+
+#ifndef CONFIG_XEN
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * "trace" is 0 for the NMI handler only, because irq-tracing
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+
+ /* ebx: no swapgs flag */
+ENTRY(paranoid_exit)
+ DEFAULT_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+paranoid_swapgs:
+ TRACE_IRQS_IRETQ 0
+ SWAPGS_UNSAFE_STACK
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_restore:
+ TRACE_IRQS_IRETQ 0
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz paranoid_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+paranoid_schedule:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+ CFI_ENDPROC
+END(paranoid_exit)
+#endif
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * Returns the "no swapgs" flag in %ebx.
+ */
+ENTRY(error_entry)
+ XCPT_FRAME start=2 offset=__stringify(ORIG_RAX-R15+8)
+ /* oldrax contains error code */
+ cld
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
+ movq %rdx, RDX+8(%rsp)
+ movq %rcx, RCX+8(%rsp)
+ movq %rax, RAX+8(%rsp)
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
+ movq_cfi rbx, __stringify(RBX+8)
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
+#ifndef CONFIG_XEN
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+#endif
+ TRACE_IRQS_OFF
+ ret
+
+#ifndef CONFIG_XEN
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report a truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too. 
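+ * The comparisons in error_kernelspace below check the saved RIP against
+ * irq_return and gs_change (including a zero-extended compare to catch
+ * the truncated-RIP case).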
+ */ +error_kernelspace: + CFI_REL_OFFSET rcx, RCX+8 + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +#endif + CFI_ENDPROC +END(error_entry) + + +ENTRY(error_exit) + DEFAULT_FRAME + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testb $3,CS-ARGOFFSET(%rsp) + jz retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_restore_args + CFI_ENDPROC +END(error_exit) + + +do_nmi_callback: + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + DEFAULT_FRAME + call do_nmi + orl $NMI_MASK,EFLAGS(%rsp) + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC +END(do_nmi_callback) + + +#ifndef CONFIG_IA32_EMULATION +ENTRY(ignore_sysret) + INTR_FRAME + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 + mov $-ENOSYS,%eax + HYPERVISOR_IRET 0 + CFI_ENDPROC +END(ignore_sysret) +#endif + +/* + * End of kprobes section + */ + .popsection --- linux-ec2-2.6.32.orig/arch/x86/kernel/fixup.c +++ linux-ec2-2.6.32/arch/x86/kernel/fixup.c @@ -0,0 +1,89 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) + +dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + /* Ignore statically-linked init. */ + if (current->tgid == 1) + return; + + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, + VMASST_TYPE_4gb_segments_notify)); + + if (test_and_set_bit(0, &printed)) + return; + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls glibc libraries. The emulation is **"); + DP("** slow. 
To ensure full performance you should **"); + DP("** install a 'xen-friendly' (nosegneg) version of **"); + DP("** the library, or disable tls support by executing **"); + DP("** the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for (i = 5; i > 0; i--) { + touch_softlockup_watchdog(); + printk("Pausing... %d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + + printk("Continuing...\n\n"); +} + +static int __init fixup_init(void) +{ + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments_notify)); + return 0; +} +__initcall(fixup_init); --- linux-ec2-2.6.32.orig/arch/x86/kernel/head-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/head-xen.c @@ -0,0 +1,222 @@ +#include +#include + +#include +#ifndef CONFIG_XEN +#include + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... 
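+	   a reported value of zero or of 1MB and above is clamped back to
+	   0x9f000 just below. 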
*/ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); +} +#else /* CONFIG_XEN */ +#include +#include +#include +#include +#include +#include + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void nmi(void); + +#ifdef CONFIG_X86_64 +#include +#define CALLBACK_ADDR(fn) ((unsigned long)(fn)) +#else +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) } +#endif + +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned int machine_to_phys_order; +EXPORT_SYMBOL(machine_to_phys_order); + +void __init xen_start_kernel(void) +{ + unsigned int i; + struct xen_machphys_mapping mapping; + unsigned long machine_to_phys_nr_ents; +#ifdef CONFIG_X86_32 + struct xen_platform_parameters pp; + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; + unsigned long addr; +#endif + + xen_setup_features(); + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr_ents = mapping.max_mfn + 1; + } else + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) + machine_to_phys_order++; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = + (unsigned long *)xen_start_info->mfn_list; + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + reserve_early(ALIGN(__pa_symbol(&_end), PAGE_SIZE), + __pa(xen_start_info->pt_base) + + (xen_start_info->nr_pt_frames << PAGE_SHIFT), + "Xen provided"); + +#ifdef CONFIG_X86_32 + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments)); + + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { + hypervisor_virt_start = pp.virt_start; + reserve_top_address(0UL - pp.virt_start); + } + + BUG_ON(pte_index(hypervisor_virt_start)); + + /* Do an early initialization of the fixmap area */ + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); + addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); + set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr), + addr), + addr), + __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); +#else + check_efer(); + xen_init_pt(); +#endif + +#define __FIXADDR_TOP (-PAGE_SIZE) +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \ + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE))) + FIX_BUG_ON(SHARED_INFO); + FIX_BUG_ON(ISAMAP_BEGIN); + FIX_BUG_ON(ISAMAP_END); +#undef pmd_index +#undef __FIXADDR_TOP + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + memset(empty_zero_page, 0, sizeof(empty_zero_page)); + + setup_vcpu_info(0); + + /* Set up mapping of lowest 1MB of physical memory. 
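+	   In the initial domain the ISA range is mapped 1:1 through the
+	   fixmap; other domains get the empty zero page mapped read-only
+	   instead. 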
*/ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (is_initial_xendomain()) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); + +} + +void __init xen_arch_setup(void) +{ + int ret; + static const struct callback_register __initconst event = { + .type = CALLBACKTYPE_event, + .address = CALLBACK_ADDR(hypervisor_callback) + }; + static const struct callback_register __initconst failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = CALLBACK_ADDR(failsafe_callback) + }; +#ifdef CONFIG_X86_64 + static const struct callback_register __initconst syscall = { + .type = CALLBACKTYPE_syscall, + .address = CALLBACK_ADDR(system_call) + }; +#endif + static const struct callback_register __initconst nmi_cb = { + .type = CALLBACKTYPE_nmi, + .address = CALLBACK_ADDR(nmi) + }; + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#ifdef CONFIG_X86_64 + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); +#endif +#if CONFIG_XEN_COMPAT <= 0x030002 +#ifdef CONFIG_X86_32 + if (ret == -ENOSYS) + ret = HYPERVISOR_set_callbacks( + event.address.cs, event.address.eip, + failsafe.address.cs, failsafe.address.eip); +#else + ret = HYPERVISOR_set_callbacks( + event.address, + failsafe.address, + syscall.address); +#endif +#endif + BUG_ON(ret); + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) { + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; + + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); + } +#endif +} +#endif /* CONFIG_XEN */ --- linux-ec2-2.6.32.orig/arch/x86/kernel/head32-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/head32-xen.c @@ -0,0 +1,80 @@ +/* + * linux/arch/i386/kernel/head32.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2007 Eric Biederman + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + if (is_initial_xendomain()) + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; +#ifndef CONFIG_XEN + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +#endif +} + +void __init i386_start_kernel(void) +{ + reserve_trampoline_memory(); + + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + +#ifndef CONFIG_XEN +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + } +#endif + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } +#else + { + int max_cmdline; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); + boot_command_line[max_cmdline-1] = '\0'; + } + + i386_default_early_setup(); + 
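	/* Hand over to the common Xen early setup (head-xen.c above). */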
+	xen_start_kernel();
+#endif
+
+	/*
+	 * At this point everything still needed from the boot loader
+	 * or BIOS or kernel text should be early reserved or marked not
+	 * RAM in e820. All other memory is free game.
+	 */
+
+	start_kernel();
+}
--- linux-ec2-2.6.32.orig/arch/x86/kernel/head64-xen.c
+++ linux-ec2-2.6.32/arch/x86/kernel/head64-xen.c
@@ -0,0 +1,141 @@
+/*
+ *  prepare to run common code
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli SuSE
+ *
+ *  Jun Nakajima
+ *	Modified for Xen.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef CONFIG_XEN
+static void __init zap_identity_mappings(void)
+{
+	pgd_t *pgd = pgd_offset_k(0UL);
+	pgd_clear(pgd);
+	__flush_tlb_all();
+}
+
+/* Don't add a printk in there. printk relies on the PDA which is not initialized
+   yet. */
+static void __init clear_bss(void)
+{
+	memset(__bss_start, 0,
+	       (unsigned long) __bss_stop - (unsigned long) __bss_start);
+}
+#endif
+
+static void __init copy_bootdata(char *real_mode_data)
+{
+#ifndef CONFIG_XEN
+	char * command_line;
+
+	memcpy(&boot_params, real_mode_data, sizeof boot_params);
+	if (boot_params.hdr.cmd_line_ptr) {
+		command_line = __va(boot_params.hdr.cmd_line_ptr);
+		memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
+	}
+#else
+	int max_cmdline;
+
+	if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+		max_cmdline = COMMAND_LINE_SIZE;
+	memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
+	boot_command_line[max_cmdline-1] = '\0';
+#endif
+}

+#include
+
+void __init x86_64_start_kernel(char * real_mode_data)
+{
+#ifndef CONFIG_XEN
+	int i;
+#endif
+
+	/*
+	 * Build-time sanity checks on the kernel image and module
+	 * area mappings. (these are purely build-time and produce no code)
+	 */
+	BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
+	BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+	BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+	BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+	BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+			(__START_KERNEL & PGDIR_MASK)));
+	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+
+	xen_start_info = (struct start_info *)real_mode_data;
+	xen_start_kernel();
+
+#ifndef CONFIG_XEN
+	/* clear bss before set_intr_gate with early_idt_handler */
+	clear_bss();
+
+	/* Make NULL pointers segfault */
+	zap_identity_mappings();
+
+	/* Cleanup the over mapped high alias */
+	cleanup_highmap();
+
+	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
+#ifdef CONFIG_EARLY_PRINTK
+		set_intr_gate(i, &early_idt_handlers[i]);
+#else
+		set_intr_gate(i, early_idt_handler);
+#endif
+	}
+	load_idt((const struct desc_ptr *)&idt_descr);
+#endif
+
+	if (console_loglevel == 10)
+		early_printk("Kernel alive\n");
+
+	xen_switch_pt();
+
+	x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
+	copy_bootdata(__va(real_mode_data));
+
+	reserve_trampoline_memory();
+
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		xen_start_info->mfn_list = ~0UL;
+	else if (xen_start_info->mfn_list < __START_KERNEL_map)
+		reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT,
+			      (xen_start_info->first_p2m_pfn +
+			       xen_start_info->nr_p2m_frames) << PAGE_SHIFT,
+			      "INITP2M");
+
+	/*
+	 * At this point everything still needed from the boot loader
+	 * or BIOS or kernel text should be early reserved or marked not
+	 * RAM in e820. All other memory is free game.
+	 */
+
+	start_kernel();
+}
--- linux-ec2-2.6.32.orig/arch/x86/kernel/head_32-xen.S
+++ linux-ec2-2.6.32/arch/x86/kernel/head_32-xen.S
@@ -0,0 +1,198 @@
+
+
+.text
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * References to members of the new_cpu_data structure.
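+ * (new_cpu_data is a struct cpuinfo_x86; the CPUINFO_* byte offsets
+ * used below are generated at build time by asm-offsets.c)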
+ */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +__HEAD +#define VIRT_ENTRY_OFFSET 0x0 +.org VIRT_ENTRY_OFFSET +ENTRY(startup_32) + movl %esi,xen_start_info + cld + + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + XEN_CPUID + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + XEN_CPUID + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + + movb $1,X86_HARD_MATH + +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * The linker can't handle this by relocation. Manually set + * base address in stack canary segment descriptor. + */ + movl $per_cpu__gdt_page,%eax + movl $per_cpu__stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + # %esi still points to start_info, and no registers + # need to be preserved. + + movl XEN_START_mfn_list(%esi), %ebx + movl $(per_cpu__gdt_page - __PAGE_OFFSET), %eax + shrl $PAGE_SHIFT, %eax + movl (%ebx,%eax,4), %ecx + pushl %ecx # frame number for set_gdt below + + xorl %esi, %esi + xorl %edx, %edx + shldl $PAGE_SHIFT, %ecx, %edx + shll $PAGE_SHIFT, %ecx + orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx + movl $per_cpu__gdt_page, %ebx + movl $__HYPERVISOR_update_va_mapping, %eax + int $0x82 + + movl $(PAGE_SIZE_asm / 8), %ecx + movl %esp, %ebx + movl $__HYPERVISOR_set_gdt, %eax + int $0x82 + + popl %ecx + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs + + cld # gcc2 wants the direction flag cleared at all times + + pushl $0 # fake return address for unwinder + jmp i386_start_kernel + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + CFI_STARTPROC +.skip 0x1000 + CFI_ENDPROC + +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PAGE_SIZE_asm +ENTRY(swapper_pg_fixmap) + .fill 1024,4,0 +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. 
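+ * (the utoa macro defined just below renders a constant as uppercase
+ * hex ASCII for the legacy __xen_guest note; e.g. 0xC0000000 emits
+ * the bytes "C0000000")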
+ */ +.data + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoa value + .if (\value) < 0 || (\value) >= 0x10 + utoa (((\value)>>4)&0x0fffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoa __PAGE_OFFSET + .ascii ",ELF_PADDR_OFFSET=0x" + utoa __PAGE_OFFSET + .ascii ",VIRT_ENTRY=0x" + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) + .ascii ",HYPERCALL_PAGE=0x" + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" + .ascii "|supervisor_mode_kernel" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes[extended-cr3]" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_32) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long _PAGE_PRESENT, _PAGE_PRESENT) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) --- linux-ec2-2.6.32.orig/arch/x86/kernel/head_64-xen.S +++ linux-ec2-2.6.32/arch/x86/kernel/head_64-xen.S @@ -0,0 +1,151 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + * Jun Nakajima + * Modified for Xen + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + __HEAD + .code64 + .globl startup_64 +startup_64: + movq $(init_thread_union+THREAD_SIZE-8),%rsp + + /* rsi is pointer to startup info structure. + pass it to C */ + movq %rsi,%rdi + + /* Set up %gs. + * + * The base of %gs always points to the bottom of the irqstack + * union. If the stack protector canary is enabled, it is + * located at %gs:40. Note that, on SMP, the boot cpu uses + * init data section till per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + pushq $0 # fake return address + jmp x86_64_start_kernel + +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ + phys_##name = . 
- .head.text; \ +ENTRY(name) + +NEXT_PAGE(init_level4_pgt) + .fill 512,8,0 + +NEXT_PAGE(level3_kernel_pgt) + .fill 512,8,0 + + /* + * This is used for vsyscall area mapping as we have a different + * level4 page table for user. + */ +NEXT_PAGE(level3_user_pgt) + .fill 512,8,0 + +NEXT_PAGE(level2_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(hypercall_page) + CFI_STARTPROC + .rept 0x1000 / 0x20 + .skip 1 /* push %rcx */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 2 /* push %r11 */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 5 /* mov $#,%eax */ + .skip 2 /* syscall */ + .skip 2 /* pop %r11 */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 + .skip 1 /* pop %rcx */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + .align 0x20,0 /* ret */ + .endr + CFI_ENDPROC + +#undef NEXT_PAGE + + __PAGE_ALIGNED_BSS + .align PAGE_SIZE +ENTRY(empty_zero_page) + .skip PAGE_SIZE + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoh value + i = 64 + .rept 16 + i = i - 4 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) + .endr +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoh __START_KERNEL_map + .ascii ",ELF_PADDR_OFFSET=0x" + utoh __START_KERNEL_map + .ascii ",VIRT_ENTRY=0x" + utoh (__START_KERNEL_map + __PHYSICAL_START) + .ascii ",HYPERCALL_PAGE=0x" + utoh (phys_hypercall_page >> PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|supervisor_mode_kernel" + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad __START_KERNEL_map) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) --- linux-ec2-2.6.32.orig/arch/x86/kernel/ioport-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/ioport-xen.c @@ -0,0 +1,112 @@ +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, + unsigned int extent, int new_value) +{ + unsigned int i; + + for (i = base; i < base + extent; i++) { + if (new_value) + __set_bit(i, bitmap); + else + __clear_bit(i, bitmap); + } +} + +/* + * this changes the io permissions bitmap in the current task. 
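+ *
+ * Illustration only (not part of this file): a root-privileged process
+ * exercises this path through glibc's sys/io.h wrappers, roughly as
+ *
+ *	if (ioperm(0x378, 3, 1) == 0)	-- grant ports 0x378..0x37a
+ *		outb(0x00, 0x378);	-- now permitted without iopl(3)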
+ */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct *t = ¤t->thread; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + */ +static int do_iopl(unsigned int level, struct thread_struct *t) +{ + unsigned int old = t->iopl >> 12; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + + return 0; +} + +#ifdef CONFIG_X86_32 +long sys_iopl(struct pt_regs *regs) +{ + unsigned int level = regs->bx; +#else +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ +#endif + struct thread_struct *t = ¤t->thread; + int rc; + + rc = do_iopl(level, t); + if (rc < 0) + goto out; + + t->iopl = level << 12; + set_iopl_mask(t->iopl); +out: + return rc; +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/irq-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/irq-xen.c @@ -0,0 +1,290 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +atomic_t irq_err_count; +#endif + +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); + +#ifndef CONFIG_XEN + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. 
+ * But only ack when the APIC is enabled -AK + */ + ack_APIC_irq(); +#endif +} + +#define irq_stats(x) (&per_cpu(irq_stat, x)) +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p, int prec) +{ + int j; + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance monitoring interrupts\n"); + seq_printf(p, "%*s: ", prec, "PND"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); +#endif + if (generic_interrupt_extension) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, " Platform interrupts\n"); + } +#ifdef CONFIG_SMP + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); +#endif +#ifndef CONFIG_XEN + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#endif + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j, prec; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + if (i == nr_irqs) + return show_other_interrupts(p, prec); + + /* print header */ + if (i == 0) { + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + spin_lock_irqsave(&desc->lock, flags); + 
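	/* snapshot the per-CPU counts under the descriptor lock */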
for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; +#endif + if (generic_interrupt_extension) + sum += irq_stats(cpu)->generic_irqs; +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#ifndef CONFIG_XEN + sum += irq_stats(cpu)->irq_tlb_count; +#endif +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + sum += irq_stats(cpu)->irq_thermal_count; +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ +#ifndef CONFIG_XEN + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +#else + return 0; +#endif +} + + +#ifndef CONFIG_XEN +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + unsigned irq; + + exit_idle(); + irq_enter(); + + irq = __get_cpu_var(vector_irq)[vector]; + + if (!handle_irq(irq, regs)) { + ack_APIC_irq(); + + if (printk_ratelimit()) + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); + } + + irq_exit(); + + set_irq_regs(old_regs); + return 1; +} + +/* + * Handler for GENERIC_INTERRUPT_VECTOR. + */ +void smp_generic_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + exit_idle(); + + irq_enter(); + + inc_irq_stat(generic_irqs); + + if (generic_interrupt_extension) + generic_interrupt_extension(); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/irq_32-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/irq_32-xen.c @@ -0,0 +1,260 @@ +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* Debugging check for stack overflow: is there less than 1KB free? 
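+ * (the test masks %esp with THREAD_SIZE-1 to get the offset within the
+ * current thread stack, and warns once it drops into the thread_info +
+ * STACK_WARN region at the bottom)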
*/ +static int check_stack_overflow(void) +{ + long sp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (sp) : "0" (THREAD_SIZE - 1)); + + return sp < (sizeof(struct thread_info) + STACK_WARN); +} + +static void print_stack_overflow(void) +{ + printk(KERN_WARNING "low stack detected by irq handler\n"); + dump_stack(); +} + +#else +static inline int check_stack_overflow(void) { return 0; } +static inline void print_stack_overflow(void) { } +#endif + +#ifdef CONFIG_4KSTACKS +/* + * per-CPU IRQ handling contexts (thread information and stack) + */ +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +} __attribute__((aligned(PAGE_SIZE))); + +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); + +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); + +static void call_on_stack(void *func, void *stack) +{ + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), + "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); +} + +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) +{ + union irq_ctx *curctx, *irqctx; + u32 *isp, arg1, arg2; + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = __get_cpu_var(hardirq_ctx); + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (unlikely(curctx == irqctx)) + return 0; + + /* build the stack frame on the IRQ stack */ + isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. 
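+	 * (so in_softirq() still reports the interrupted softirq even
+	 * though execution has moved to the per-CPU hardirq stack)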
+	 */
+	irqctx->tinfo.preempt_count =
+		(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+		(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+	if (unlikely(overflow))
+		call_on_stack(print_stack_overflow, isp);
+
+	asm volatile("xchgl	%%ebx,%%esp	\n"
+		     "call	*%%edi		\n"
+		     "movl	%%ebx,%%esp	\n"
+		     : "=a" (arg1), "=d" (arg2), "=b" (isp)
+		     : "0" (irq), "1" (desc), "2" (isp),
+		       "D" (desc->handle_irq)
+		     : "memory", "cc", "ecx");
+	return 1;
+}
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void __cpuinit irq_ctx_init(int cpu)
+{
+	union irq_ctx *irqctx;
+
+	if (per_cpu(hardirq_ctx, cpu))
+		return;
+
+	irqctx = &per_cpu(hardirq_stack, cpu);
+	irqctx->tinfo.task		= NULL;
+	irqctx->tinfo.exec_domain	= NULL;
+	irqctx->tinfo.cpu		= cpu;
+	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
+	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
+
+	per_cpu(hardirq_ctx, cpu) = irqctx;
+
+	irqctx = &per_cpu(softirq_stack, cpu);
+	irqctx->tinfo.task		= NULL;
+	irqctx->tinfo.exec_domain	= NULL;
+	irqctx->tinfo.cpu		= cpu;
+	irqctx->tinfo.preempt_count	= 0;
+	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
+
+	per_cpu(softirq_ctx, cpu) = irqctx;
+
+	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+	       cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
+}
+
+void irq_ctx_exit(int cpu)
+{
+	per_cpu(hardirq_ctx, cpu) = NULL;
+}
+
+asmlinkage void do_softirq(void)
+{
+	unsigned long flags;
+	struct thread_info *curctx;
+	union irq_ctx *irqctx;
+	u32 *isp;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	if (local_softirq_pending()) {
+		curctx = current_thread_info();
+		irqctx = __get_cpu_var(softirq_ctx);
+		irqctx->tinfo.task = curctx->task;
+		irqctx->tinfo.previous_esp = current_stack_pointer;
+
+		/* build the stack frame on the softirq stack */
+		isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
+
+		call_on_stack(__do_softirq, isp);
+		/*
+		 * Shouldn't happen, we returned above if in_interrupt():
+		 */
+		WARN_ON_ONCE(softirq_count());
+	}
+
+	local_irq_restore(flags);
+}
+
+#else
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
+#endif
+
+bool handle_irq(unsigned irq, struct pt_regs *regs)
+{
+	struct irq_desc *desc;
+	int overflow;
+
+	overflow = check_stack_overflow();
+
+	desc = irq_to_desc(irq);
+	if (unlikely(!desc))
+		return false;
+
+	if (!execute_on_irq_stack(overflow, desc, irq)) {
+		if (unlikely(overflow))
+			print_stack_overflow();
+		desc->handle_irq(irq, desc);
+	}
+
+	return true;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
+{
+	unsigned int irq;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(irq, desc) {
+		const struct cpumask *affinity;
+
+		if (!desc)
+			continue;
+		if (irq == 2)
+			continue;
+		if (desc->status & IRQ_PER_CPU)
+			continue;
+
+		affinity = desc->affinity;
+		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+			/*printk("Breaking affinity for irq %i\n", irq);*/
+			affinity = cpu_all_mask;
+		}
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, affinity);
+		else if (desc->action)
+			printk_once("Cannot set affinity for irq %i\n", irq);
+	}
+
+#if 0
+	barrier();
+	/* Ingo Molnar says: "after the IO-APIC masks have been redirected
+	   [note the nop - the interrupt-enable boundary on x86 is two
+	   instructions from sti] - to flush out pending hardirqs and
+	   IPIs. After this point nothing is supposed to reach this CPU."
*/ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + --- linux-ec2-2.6.32.orig/arch/x86/kernel/irq_64-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/irq_64-xen.c @@ -0,0 +1,143 @@ +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. + */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ +#ifdef CONFIG_DEBUG_STACKOVERFLOW + u64 curbase = (u64)task_stack_page(current); + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); +#endif +} + +bool handle_irq(unsigned irq, struct pt_regs *regs) +{ + struct irq_desc *desc; + + stack_overflow_check(regs); + + desc = irq_to_desc(irq); + if (unlikely(!desc)) + return false; + + generic_handle_irq_desc(irq, desc); + return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + (desc->status & IRQ_PER_CPU) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + /*printk("Broke affinity for irq %i\n", irq)*/; + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. 
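+	   (this gives any interrupt already in flight a chance to fire on
+	   this CPU before it is cleared for offlining)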
*/ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) { + call_softirq(); + WARN_ON_ONCE(softirq_count()); + } + local_irq_restore(flags); +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/ldt-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/ldt-xen.c @@ -0,0 +1,271 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP +static void flush_ldt(void *current_mm) +{ + if (current->active_mm == current_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt, *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + else + newldt = (void *)__get_free_page(GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + preempt_disable(); +#endif + make_pages_readonly(newldt, + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) + smp_call_function(flush_ldt, current->mm, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable(oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + put_page(virt_to_page(oldldt)); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + make_pages_readonly(new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct *old_mm; + int retval = 0; + + memset(&mm->context, 0, sizeof(mm->context)); + mutex_init(&mm->context.lock); + old_mm = current->mm; + if (old_mm) + mm->context.vdso = old_mm->context.vdso; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. 
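+ *
+ * Illustration only (not part of this file): userspace reaches this LDT
+ * machinery through the modify_ldt(2) system call, e.g., assuming
+ * sys/syscall.h and asm/ldt.h:
+ *
+ *	struct user_desc d = { .entry_number = 0, .base_addr = 0x1000,
+ *			       .limit = 0xfff, .seg_32bit = 1 };
+ *	syscall(SYS_modify_ldt, 1, &d, sizeof(d));	-- write one entry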
+ */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + /* CHECKME: Can this ever happen ? */ + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable(mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + put_page(virt_to_page(mm->context.ldt)); + mm->context.size = 0; + } +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct *mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + mutex_lock(&mm->context.lock); + size = mm->context.size * LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + mutex_unlock(&mm->context.lock); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr + size, bytecount - size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct desc_struct ldt; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + mutex_lock(&mm->context.lock); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + memset(&ldt, 0, sizeof(ldt)); + goto install; + } + } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + + /* Install the new entry ... 
 */
+install:
+	error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+
+out_unlock:
+	mutex_unlock(&mm->context.lock);
+out:
+	return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+			      unsigned long bytecount)
+{
+	int ret = -ENOSYS;
+
+	switch (func) {
+	case 0:
+		ret = read_ldt(ptr, bytecount);
+		break;
+	case 1:
+		ret = write_ldt(ptr, bytecount, 1);
+		break;
+	case 2:
+		ret = read_default_ldt(ptr, bytecount);
+		break;
+	case 0x11:
+		ret = write_ldt(ptr, bytecount, 0);
+		break;
+	}
+	return ret;
+}
--- linux-ec2-2.6.32.orig/arch/x86/kernel/microcode_core-xen.c
+++ linux-ec2-2.6.32/arch/x86/kernel/microcode_core-xen.c
@@ -0,0 +1,224 @@
+/*
+ *	Intel CPU Microcode Update Driver for Linux
+ *
+ *	Copyright (C) 2000-2006 Tigran Aivazian
+ *		      2006	Shaohua Li
+ *
+ *	This driver allows upgrading the microcode on Intel processors
+ *	belonging to the IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ *	For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian ");
+MODULE_LICENSE("GPL");
+
+static int verbose;
+module_param(verbose, int, 0644);
+
+#define MICROCODE_VERSION	"2.00-xen"
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *ubuf, size_t len)
+{
+	int err;
+	void *kbuf;
+
+	kbuf = vmalloc(len);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, ubuf, len) == 0) {
+		struct xen_platform_op op;
+
+		op.cmd = XENPF_microcode_update;
+		set_xen_guest_handle(op.u.microcode.data, kbuf);
+		op.u.microcode.length = len;
+		err = HYPERVISOR_platform_op(&op);
+	} else
+		err = -EFAULT;
+
+	vfree(kbuf);
+
+	return err;
+}
+
+static int microcode_open(struct inode *unused1, struct file *unused2)
+{
+	cycle_kernel_lock();
+	return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret = -EINVAL; + + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + return ret; + } + + mutex_lock(µcode_mutex); + + if (do_microcode_update(buf, len) == 0) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .nodename = "cpu/microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int request_microcode(const char *name) +{ + const struct firmware *firmware; + int error; + struct xen_platform_op op; + + error = request_firmware(&firmware, name, µcode_pdev->dev); + if (error) { + pr_debug("microcode: data file %s load failed\n", name); + return error; + } + + op.cmd = XENPF_microcode_update; + set_xen_guest_handle(op.u.microcode.data, firmware->data); + op.u.microcode.length = firmware->size; + error = HYPERVISOR_platform_op(&op); + + release_firmware(firmware); + + if (error) + pr_debug("ucode load failed\n"); + + return error; +} + +static int __init microcode_init(void) +{ + const struct cpuinfo_x86 *c = &boot_cpu_data; + char buf[32]; + const char *fw_name = buf; + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + sprintf(buf, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + else if (c->x86_vendor == X86_VENDOR_AMD) + fw_name = "amd-ucode/microcode_amd.bin"; + else { + pr_err("microcode: no support for this CPU vendor\n"); + return -ENODEV; + } + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + return PTR_ERR(microcode_pdev); + } + + error = microcode_dev_init(); + if (error) + return error; + + request_microcode(fw_name); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION + " ," + " Peter Oruba\n"); + + return 0; +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); --- linux-ec2-2.6.32.orig/arch/x86/kernel/mpparse-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/mpparse-xen.c @@ -0,0 +1,1088 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void *_bus_to_virt(unsigned long ma) +{ + return is_ISA_range(ma, ma) ? 
isa_bus_to_virt(ma) : bus_to_virt(ma); +} + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +#ifndef CONFIG_XEN +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} +#endif + +static void __init MP_processor_info(struct mpc_cpu *m) +{ +#ifndef CONFIG_XEN + int apicid; + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + apicid = x86_init.mpparse.mpc_apic_id(m); + + if (m->cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); +#else /* CONFIG_XEN */ + num_processors++; +#endif +} + +#ifdef CONFIG_X86_IO_APIC +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { + set_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); + + clear_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_MCA; +#endif + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + if (!(m->flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", + m->apicid, m->apicver, m->apicaddr); + + if (bad_ioapic(m->apicaddr)) + return; + + mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; + mp_ioapics[nr_ioapics].apicid = m->apicid; + mp_ioapics[nr_ioapics].type = m->type; + mp_ioapics[nr_ioapics].apicver = m->apicver; + mp_ioapics[nr_ioapics].flags = m->flags; + nr_ioapics++; +} + +static void print_MP_intsrc_info(struct mpc_intsrc *m) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ 
%02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +static void __init assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + mp_irq->dstapic = m->dstapic; + mp_irq->type = m->type; + mp_irq->irqtype = m->irqtype; + mp_irq->irqflag = m->irqflag; + mp_irq->srcbus = m->srcbus; + mp_irq->srcbusirq = m->srcbusirq; + mp_irq->dstirq = m->dstirq; +} + +static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + m->dstapic = mp_irq->dstapic; + m->type = mp_irq->type; + m->irqtype = mp_irq->irqtype; + m->irqflag = mp_irq->irqflag; + m->srcbus = mp_irq->srcbus; + m->srcbusirq = mp_irq->srcbusirq; + m->dstirq = mp_irq->dstirq; +} + +static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + if (mp_irq->dstapic != m->dstapic) + return 1; + if (mp_irq->type != m->type) + return 2; + if (mp_irq->irqtype != m->irqtype) + return 3; + if (mp_irq->irqflag != m->irqflag) + return 4; + if (mp_irq->srcbus != m->srcbus) + return 5; + if (mp_irq->srcbusirq != m->srcbusirq) + return 6; + if (mp_irq->dstirq != m->dstirq) + return 7; + + return 0; +} + +static void __init MP_intsrc_info(struct mpc_intsrc *m) +{ + int i; + + print_MP_intsrc_info(m); + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} +#endif /* CONFIG_X86_IO_APIC */ + + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { + printk(KERN_ERR "MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->spec); + return 0; + } + if (!mpc->lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + +#ifndef CONFIG_XEN + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); +#endif + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +void __init 
default_smp_read_mpc_oem(struct mpc_table *mpc) { } + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_32 + generic_mps_oem_check(mpc, oem, str); +#endif + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->lapic; +#endif + + if (early) + return 1; + + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); + + /* + * Now process the configuration blocks. + */ + x86_init.mpparse.mpc_record(0); + + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + MP_intsrc_info((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + x86_init.mpparse.mpc_record(1); + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); + + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mp_ioapics[0].apicid; + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " + "falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... " + "not using ELCR\n"); + else { + printk(KERN_INFO + "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). 
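+			 * (hence irqflag = 13 below: polarity bits 01b =
+			 * active high, trigger bits 11b = level sensitive)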
+ */ + if (ELCR_trigger(i)) + intsrc.irqflag = 13; + else + intsrc.irqflag = 0; + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + +#ifndef CONFIG_XEN + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +#endif + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. 
(tell your hw vendor)\n");
+		early_iounmap(mpc, size);
+		return -1;
+	}
+	early_iounmap(mpc, size);
+
+	if (early)
+		return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * If there are no explicit MP IRQ entries, then we are
+	 * broken. We set up most of the low 16 IO-APIC pins to
+	 * ISA defaults and hope it will work.
+	 */
+	if (!mp_irq_entries) {
+		struct mpc_bus bus;
+
+		printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+		       "using default mptable. (tell your hw vendor)\n");
+
+		bus.type = MP_BUS;
+		bus.busid = 0;
+		memcpy(bus.bustype, "ISA   ", 6);
+		MP_bus_info(&bus);
+
+		construct_default_ioirq_mptable(0);
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+void __init default_get_smp_config(unsigned int early)
+{
+	struct mpf_intel *mpf = mpf_found;
+
+	if (!mpf)
+		return;
+
+	if (acpi_lapic && early)
+		return;
+
+	/*
+	 * MPS doesn't support hyperthreading; the MPS table only
+	 * lists the thread 0 APIC IDs.
+	 */
+	if (acpi_lapic && acpi_ioapic)
+		return;
+
+	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
+	       mpf->specification);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+	if (mpf->feature2 & (1 << 7)) {
+		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+		pic_mode = 1;
+	} else {
+		printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+		pic_mode = 0;
+	}
+#endif
+	/*
+	 * Now see if we need to read further.
+	 */
+	if (mpf->feature1 != 0) {
+		if (early) {
+#ifndef CONFIG_XEN
+			/*
+			 * local APIC has default address
+			 */
+			mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+#endif
+			return;
+		}
+
+		printk(KERN_INFO "Default MP configuration #%d\n",
+		       mpf->feature1);
+		construct_default_ISA_mptable(mpf->feature1);
+
+	} else if (mpf->physptr) {
+		if (check_physptr(mpf, early))
+			return;
+	} else
+		BUG();
+
+	if (!early)
+		printk(KERN_INFO "Processors: %d\n", num_processors);
+	/*
+	 * Only use the first configuration found.
+	 */
+}
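+
+/*
+ * Worked example (an editor's illustration, not part of the original
+ * patch): a BIOS that reports mpf->feature1 == 5 with no physptr gets
+ * no table parse at all; construct_default_ISA_mptable(5) above then
+ * fabricates two CPUs, an ISA bus 0 plus a PCI bus 1, one IO-APIC at
+ * the default address, and falls back to the ELCR to classify the 16
+ * legacy IRQs.
+ */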
+
+#ifndef CONFIG_XEN
+static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+	unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+	/*
+	 * We cannot access the MPC table to compute its size yet, as
+	 * only a few megabytes from the bottom of memory are mapped at
+	 * this point. The PC-9800 places its MPC table at the very end
+	 * of physical memory, so simply reserving PAGE_SIZE from
+	 * mpf->physptr would trigger a BUG() in reserve_bootmem.
+	 * We also need to make sure physptr is below max_low_pfn; the
+	 * area above max_low_pfn does not need to be reserved.
+	 */
+	unsigned long end = max_low_pfn * PAGE_SIZE;
+
+	if (mpf->physptr < end) {
+		if (mpf->physptr + size > end)
+			size = end - mpf->physptr;
+		reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+	}
+#else
+	reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+#endif
+
+static int __init smp_scan_config(unsigned long base, unsigned long length,
+				  unsigned reserve)
+{
+	unsigned int *bp = _bus_to_virt(base);
+	struct mpf_intel *mpf;
+
+	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
+		    bp, length);
+	BUILD_BUG_ON(sizeof(*mpf) != 16);
+
+	while (length > 0) {
+		mpf = (struct mpf_intel *)bp;
+		if ((*bp == SMP_MAGIC_IDENT) &&
+		    (mpf->length == 1) &&
+		    !mpf_checksum((unsigned char *)bp, 16) &&
+		    ((mpf->specification == 1)
+		     || (mpf->specification == 4))) {
+#ifdef CONFIG_X86_LOCAL_APIC
+			smp_found_config = 1;
+#endif
+			mpf_found = mpf;
+
+#ifndef CONFIG_XEN
+			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
+			       mpf, (u64)virt_to_phys(mpf));
+
+			if (!reserve)
+				return 1;
+			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
+						BOOTMEM_DEFAULT);
+			if (mpf->physptr)
+				smp_reserve_bootmem(mpf);
+#else
+			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+			       mpf, ((void *)bp - _bus_to_virt(base)) + base);
+#endif
+			return 1;
+		}
+		bp += 4;
+		length -= 16;
+	}
+	return 0;
+}
+
+void __init default_find_smp_config(unsigned int reserve)
+{
+#ifndef CONFIG_XEN
+	unsigned int address;
+#endif
+
+	/*
+	 * FIXME: Linux assumes you have 640K of base ram..
+	 * this continues the error...
+	 *
+	 * 1) Scan the bottom 1K for a signature
+	 * 2) Scan the top 1K of base RAM
+	 * 3) Scan the 64K of bios
+	 */
+	if (smp_scan_config(0x0, 0x400, reserve) ||
+	    smp_scan_config(639 * 0x400, 0x400, reserve) ||
+	    smp_scan_config(0xF0000, 0x10000, reserve))
+		return;
+	/*
+	 * If it is an SMP machine we should know now, unless the
+	 * configuration is in an EISA/MCA bus machine with an
+	 * extended bios data area.
+	 *
+	 * there is a real-mode segmented pointer pointing to the
+	 * 4K EBDA area at 0x40E, calculate and scan it here.
+	 *
+	 * NOTE! There are Linux loaders that will corrupt the EBDA
+	 * area, and as such this kind of SMP config may be less
+	 * trustworthy, simply because the SMP table may have been
+	 * stomped on during early boot. These loaders are buggy and
+	 * should be fixed.
+	 *
+	 * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+ */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400, reserve); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_MP_intsrc_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + assign_to_mpc_intsrc(&mp_irqs[i], m); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return ret; +} + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) + goto out; + assign_to_mpc_intsrc(&mp_irqs[i], m); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= 
mpf_checksum((unsigned char *)mpc, mpc->length);
+
+	return 0;
+}
+
+int enable_update_mptable;
+
+static int __init update_mptable_setup(char *str)
+{
+	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
+	return 0;
+}
+early_param("update_mptable", update_mptable_setup);
+
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
+
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
+	alloc_mptable = 1;
+	if (!p)
+		return 0;
+	mpc_new_length = memparse(p, &p);
+	return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
+
+void __init early_reserve_e820_mpc_new(void)
+{
+	if (enable_update_mptable && alloc_mptable) {
+		u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+		startt = TRAMPOLINE_BASE;
+#endif
+		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
+	}
+}
+
+static int __init update_mp_table(void)
+{
+	char str[16];
+	char oem[10];
+	struct mpf_intel *mpf;
+	struct mpc_table *mpc, *mpc_new;
+
+	if (!enable_update_mptable)
+		return 0;
+
+	mpf = mpf_found;
+	if (!mpf)
+		return 0;
+
+	/*
+	 * Now see if we need to go further.
+	 */
+	if (mpf->feature1 != 0)
+		return 0;
+
+	if (!mpf->physptr)
+		return 0;
+
+	mpc = _bus_to_virt(mpf->physptr);
+
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
+
+	printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf));
+	printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+
+	if (mpc_new_phys && mpc->length > mpc_new_length) {
+		mpc_new_phys = 0;
+		printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+		       mpc_new_length);
+	}
+
+	if (!mpc_new_phys) {
+		unsigned char old, new;
+		/* check if we can change the position */
+		mpc->checksum = 0;
+		old = mpf_checksum((unsigned char *)mpc, mpc->length);
+		mpc->checksum = 0xff;
+		new = mpf_checksum((unsigned char *)mpc, mpc->length);
+		if (old == new) {
+			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+			return 0;
+		}
+		printk(KERN_INFO "use in-position replacing\n");
+	} else {
+		maddr_t mpc_new_bus;
+
+		mpc_new_bus = phys_to_machine(mpc_new_phys);
+		mpf->physptr = mpc_new_bus;
+		mpc_new = phys_to_virt(mpc_new_phys);
+		memcpy(mpc_new, mpc, mpc->length);
+		mpc = mpc_new;
+		/* check if we can modify that */
+		if (mpc_new_bus - mpf->physptr) {
+			struct mpf_intel *mpf_new;
+			/* steal 16 bytes from [0, 1k) */
+			printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+			mpf_new = isa_bus_to_virt(0x400 - 16);
+			memcpy(mpf_new, mpf, 16);
+			mpf = mpf_new;
+			mpf->physptr = mpc_new_bus;
+		}
+		mpf->checksum = 0;
+		mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+		printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+	}
+
+	/*
+	 * Only replace entries with mp_INT and
+	 * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW that are
+	 * already in mp_irqs, stored by ... and mp_config_acpi_gsi;
+	 * may need pci=routeirq for full coverage.
+	 */
+	replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+	return 0;
+}
+
+late_initcall(update_mp_table);
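+
+/*
+ * Usage sketch (an illustration added in editing, not part of the
+ * original patch): booting with "alloc_mptable=4k" enables the update
+ * path and makes early_reserve_e820_mpc_new() carve 4k out of e820;
+ * update_mp_table() then copies the MPC there at late_initcall time,
+ * repoints mpf->physptr at the copy (a machine address under Xen),
+ * refreshes the mpf checksum, and lets replace_intsrc_all() rewrite
+ * the interrupt entries and the table checksum.
+ */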
--- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-dma-xen.c
+++ linux-ec2-2.6.32/arch/x86/kernel/pci-dma-xen.c
@@ -0,0 +1,408 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+static int forbid_dac __read_mostly;
+
+struct dma_map_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+static int iommu_sac_force __read_mostly;
+
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly = 0;
+#endif
+
+int iommu_merge __read_mostly = 0;
+
+int no_iommu __read_mostly;
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA translation for
+ * devices and allow every device to access the whole physical memory. This
+ * is useful if a user wants to use an IOMMU only for KVM device assignment
+ * to guests and not for driver dma translation.
+ */
+int iommu_pass_through __read_mostly;
+
+dma_addr_t bad_dma_address __read_mostly = 0;
+EXPORT_SYMBOL(bad_dma_address);
+
+/*
+ * Dummy device used for NULL arguments (normally ISA). A smaller DMA
+ * mask would probably be better, but this is bug-to-bug compatible
+ * with older i386.
+ */
+struct device x86_dma_fallback_dev = {
+	.init_name = "fallback device",
+	.coherent_dma_mask = DMA_BIT_MASK(32),
+	.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
+};
+EXPORT_SYMBOL(x86_dma_fallback_dev);
+
+/* Number of entries preallocated for DMA-API debugging */
+#define PREALLOC_DMA_DEBUG_ENTRIES       32768
+
+int dma_set_mask(struct device *dev, u64 mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+
+	*dev->dma_mask = mask;
+
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+static __initdata void *dma32_bootmem_ptr;
+static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
+
+static int __init parse_dma32_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	dma32_bootmem_size = memparse(p, &p);
+	return 0;
+}
+early_param("dma32_size", parse_dma32_size_opt);
+
+void __init dma32_reserve_bootmem(void)
+{
+	unsigned long size, align;
+	if (max_pfn <= MAX_DMA32_PFN)
+		return;
+
+	/*
+	 * check aperture_64.c allocate_aperture() for reason about
+	 * using 512M as goal
+	 */
+	align = 64ULL<<20;
+	size = roundup(dma32_bootmem_size, align);
+	dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
+				 512ULL<<20);
+	/*
+	 * Kmemleak should not scan this block as it may not be mapped via the
+	 * kernel direct mapping.
+ */ + kmemleak_ignore(dma32_bootmem_ptr); + if (dma32_bootmem_ptr) + dma32_bootmem_size = size; + else + dma32_bootmem_size = 0; +} +static void __init dma32_free_bootmem(void) +{ + + if (max_pfn <= MAX_DMA32_PFN) + return; + + if (!dma32_bootmem_ptr) + return; + + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size); + + dma32_bootmem_ptr = NULL; + dma32_bootmem_size = 0; +} +#endif + +static struct dma_map_ops swiotlb_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .mapping_error = swiotlb_dma_mapping_error, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, + .sync_single_range_for_device = swiotlb_sync_single_range_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .dma_supported = swiotlb_dma_supported +}; + +void __init pci_iommu_alloc(void) +{ +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); +#endif + + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ + gart_iommu_hole_init(); + + detect_calgary(); + + detect_intel_iommu(); + + amd_iommu_detect(); + + swiotlb_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + dma_ops = &swiotlb_dma_ops; + } +} + +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; +#ifndef CONFIG_XEN + dma_addr_t addr; +#else + void *memory; +#endif + unsigned int order = get_order(size); + + dma_mask = dma_alloc_coherent_mask(dev, flag); + +#ifndef CONFIG_XEN + flag |= __GFP_ZERO; +again: +#else + flag &= ~(__GFP_DMA | __GFP_DMA32); +#endif + page = alloc_pages_node(dev_to_node(dev), flag, order); + if (!page) + return NULL; + +#ifndef CONFIG_XEN + addr = page_to_phys(page); + if (addr + size > dma_mask) { + __free_pages(page, order); + + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +#else + memory = page_address(page); + if (xen_create_contiguous_region((unsigned long)memory, order, + fls64(dma_mask))) { + __free_pages(page, order); + return NULL; + } + + *dma_addr = virt_to_bus(memory); + return memset(memory, 0, size); +#endif +} + +#ifdef CONFIG_XEN +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + unsigned int order = get_order(size); + unsigned long va = (unsigned long)vaddr; + + xen_destroy_contiguous_region(va, order); + free_pages(va, order); +} +#endif + +/* + * See for the iommu kernel parameter + * documentation. 
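+ *
+ * Usage sketch (an editor's illustration, not from the original
+ * source): "iommu=soft,panic" selects the SWIOTLB bounce buffers and
+ * panics on mapping overflow; "iommu=pt" sets iommu_pass_through so a
+ * hardware IOMMU does no translation for host devices.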
+ */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = -1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif + if (!strncmp(p, "pt", 2)) + iommu_pass_through = 1; + + gart_parse_options(p); + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +int range_straddles_page_boundary(paddr_t p, size_t size) +{ + unsigned long pfn = p >> PAGE_SHIFT; + unsigned int offset = p & ~PAGE_MASK; + + return ((offset + size > PAGE_SIZE) && + !check_pages_physically_contiguous(pfn, offset, size)); +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + dev_info(dev, "PCI: Disallowing DAC for device\n"); + return 0; + } +#endif + + if (ops->dma_supported) + return ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. 
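+	 *
+	 * Worked example (an editor's illustration, not from the
+	 * original source): with iommu_sac_force set, a device asking
+	 * for a 64-bit mask fails this check (mask >= DMA_BIT_MASK(40))
+	 * and is expected to retry with a 32-bit mask, while an aic79xx
+	 * asking for 39 bits passes, since a 39-bit mask is below
+	 * DMA_BIT_MASK(40).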
*/ + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { + dev_info(dev, "Force SAC with mask %Lx\n", mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +static int __init pci_iommu_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif + + calgary_iommu_init(); + + intel_iommu_init(); + + amd_iommu_init(); + + gart_iommu_init(); + + no_iommu_init(); + return 0; +} + +void pci_iommu_shutdown(void) +{ + gart_iommu_shutdown(); + + amd_iommu_shutdown(); +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-nommu-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/pci-nommu-xen.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! " \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +static int +gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + unsigned int i; + struct scatterlist *sg; + + WARN_ON(nents == 0 || sgl->length == 0); + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg->dma_address = + gnttab_dma_map_page(sg_page(sg)) + sg->offset; + sg->dma_length = sg->length; + IOMMU_BUG_ON(!dma_capable( + hwdev, sg->dma_address, sg->length)); + IOMMU_BUG_ON(range_straddles_page_boundary( + page_to_pseudophys(sg_page(sg)) + sg->offset, + sg->length)); + } + + return nents; +} + +static void +gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + unsigned int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) + gnttab_dma_unmap_page(sg->dma_address); +} + +static dma_addr_t +gnttab_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + dma_addr_t dma; + + WARN_ON(size == 0); + + dma = gnttab_dma_map_page(page) + offset; + IOMMU_BUG_ON(range_straddles_page_boundary(offset, size)); + IOMMU_BUG_ON(!dma_capable(dev, dma, size)); + + return dma; +} + +static void +gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + gnttab_dma_unmap_page(dma_addr); +} + +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + +struct dma_map_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .map_page = gnttab_map_page, + .unmap_page = gnttab_unmap_page, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = 
nommu_sync_sg_for_device, + .dma_supported = swiotlb_dma_supported, +}; + +void __init no_iommu_init(void) +{ + if (dma_ops) + return; + + force_iommu = 0; /* no HW IOMMU */ + dma_ops = &nommu_dma_ops; +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/process-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/process-xen.c @@ -0,0 +1,593 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long idle_halt; +EXPORT_SYMBOL(idle_halt); +unsigned long idle_nomwait; +EXPORT_SYMBOL(idle_nomwait); + +struct kmem_cache *task_xstate_cachep; + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + *dst = *src; + if (src->thread.xstate) { + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, + GFP_KERNEL); + if (!dst->thread.xstate) + return -ENOMEM; + WARN_ON((unsigned long)dst->thread.xstate & 15); + memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + } + return 0; +} + +void free_thread_xstate(struct task_struct *tsk) +{ + if (tsk->thread.xstate) { + kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); + tsk->thread.xstate = NULL; + } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); +} + +void free_thread_info(struct thread_info *ti) +{ + free_thread_xstate(ti->task); + free_pages((unsigned long)ti, get_order(THREAD_SIZE)); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC | SLAB_NOTRACK, NULL); +} + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { + struct physdev_set_iobitmap set_iobitmap; + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(&set_iobitmap, 0, sizeof(set_iobitmap)); + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + t->io_bitmap_max = 0; + kfree(bp); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + clear_tsk_thread_flag(tsk, TIF_DEBUG); + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ +#ifdef CONFIG_SECCOMP_DISABLE_TSC + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +#endif +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
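+		 *
+		 * (Editor's note, an inference from the surrounding
+		 * code: together with set_tsc_mode() below this backs
+		 * prctl(PR_SET_TSC); e.g. prctl(PR_SET_TSC,
+		 * PR_TSC_SIGSEGV) makes a later rdtsc in the task
+		 * raise SIGSEGV.)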
+ */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread; + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) + update_debugctlmsr(next->debugctlmsr); + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); + } + + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } +} + +int sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +int sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, + NULL, NULL); +} + + +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); + +#ifdef CONFIG_X86_32 +/* + * This halt magic was a workaround for ancient floppy DMA + * wreckage. It should be safe to remove. + */ +static int hlt_counter; +void disable_hlt(void) +{ + hlt_counter++; +} +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + hlt_counter--; +} +EXPORT_SYMBOL(enable_hlt); + +static inline int hlt_use_halt(void) +{ + return (!hlt_counter && boot_cpu_data.hlt_works_ok); +} +#else +static inline int hlt_use_halt(void) +{ + return 1; +} +#endif + +/* + * We use this if we don't have any better + * idle routine.. 
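+ *
+ * (Editor's note, hedged: on this Xen tree the safe_halt() used below
+ * is expected to block the vCPU via a hypercall rather than execute a
+ * native hlt, so "halting" yields the CPU to the hypervisor.)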
+ */
+void xen_idle(void)
+{
+	trace_power_start(POWER_CSTATE, 1);
+	current_thread_info()->status &= ~TS_POLLING;
+	/*
+	 * TS_POLLING-cleared state must be visible before we
+	 * test NEED_RESCHED:
+	 */
+	smp_mb();
+
+	if (!need_resched())
+		safe_halt();	/* enables interrupts racelessly */
+	else
+		local_irq_enable();
+	current_thread_info()->status |= TS_POLLING;
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
+void stop_this_cpu(void *dummy)
+{
+	local_irq_disable();
+	/*
+	 * Remove this CPU:
+	 */
+	set_cpu_online(smp_processor_id(), false);
+	disable_all_local_evtchn();
+
+	for (;;) {
+		if (hlt_works(smp_processor_id()))
+			halt();
+	}
+}
+
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
+void cpu_idle_wait(void)
+{
+	smp_mb();
+	/* kick all the CPUs so that they exit out of pm_idle */
+	smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+#ifndef CONFIG_XEN
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+	trace_power_start(POWER_CSTATE, (ax>>4)+1);
+	if (!need_resched()) {
+		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__mwait(ax, cx);
+	}
+}
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+	if (!need_resched()) {
+		trace_power_start(POWER_CSTATE, 1);
+		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__sti_mwait(0, 0);
+		else
+			local_irq_enable();
+	} else
+		local_irq_enable();
+}
+#endif
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+	trace_power_start(POWER_CSTATE, 0);
+	local_irq_enable();
+	while (!need_resched())
+		cpu_relax();
+	trace_power_end(0);
+}
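+
+/*
+ * Usage sketch for the pm_idle switch protocol (an editor's
+ * illustration, not part of the original patch):
+ *
+ *	void (*old_idle)(void) = pm_idle;
+ *	pm_idle = poll_idle;
+ *	cpu_idle_wait();	-- after this, no CPU still runs old_idle
+ */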
+
+#ifndef CONFIG_XEN
+/*
+ * mwait selection logic:
+ *
+ * It depends on the CPU. For AMD CPUs that support MWAIT this is
+ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
+ * then depend on a clock divisor and current Pstate of the core. If
+ * all cores of a processor are in halt state (C1) the processor can
+ * enter the C1E (C1 enhanced) state. If mwait is used this will never
+ * happen.
+ *
+ * idle=mwait overrides this decision and forces the usage of mwait.
+ */
+static int __cpuinitdata force_mwait;
+
+#define MWAIT_INFO			0x05
+#define MWAIT_ECX_EXTENDED_INFO		0x01
+#define MWAIT_EDX_C1			0xf0
+
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+	u32 eax, ebx, ecx, edx;
+
+	if (force_mwait)
+		return 1;
+
+	if (c->cpuid_level < MWAIT_INFO)
+		return 0;
+
+	cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+	/* Check whether EDX has extended info about MWAIT */
+	if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+		return 1;
+
+	/*
+	 * EDX enumerates the MONITOR/MWAIT extensions. Check whether
+	 * C1 supports MWAIT.
+	 */
+	return (edx & MWAIT_EDX_C1);
+}
+
+/*
+ * Check for AMD CPUs, which potentially have C1E support
+ */
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		return 0;
+
+	if (c->x86 < 0x0F)
+		return 0;
+
+	/* Family 0x0f models < rev F do not have C1E */
+	if (c->x86 == 0x0f && c->x86_model < 0x40)
+		return 0;
+
+	return 1;
+}
+
+static cpumask_var_t c1e_mask;
+static int c1e_detected;
+
+void c1e_remove_cpu(int cpu)
+{
+	if (c1e_mask != NULL)
+		cpumask_clear_cpu(cpu, c1e_mask);
+}
+
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
+{
+	if (need_resched())
+		return;
+
+	if (!c1e_detected) {
+		u32 lo, hi;
+
+		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+			c1e_detected = 1;
+			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+				mark_tsc_unstable("TSC halt in AMD C1E");
+			printk(KERN_INFO "System has AMD C1E enabled\n");
+			set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
+		}
+	}
+
+	if (c1e_detected) {
+		int cpu = smp_processor_id();
+
+		if (!cpumask_test_cpu(cpu, c1e_mask)) {
+			cpumask_set_cpu(cpu, c1e_mask);
+			/*
+			 * Force broadcast so ACPI can not interfere.
+			 */
+			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+					   &cpu);
+			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+			       cpu);
+		}
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+		default_idle();
+
+		/*
+		 * The switch back from broadcast mode needs to be
+		 * called with interrupts disabled.
+		 */
+		local_irq_disable();
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		local_irq_enable();
+	} else
+		default_idle();
+}
+#endif
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+#ifndef CONFIG_XEN
+#ifdef CONFIG_SMP
+	if (pm_idle == poll_idle && smp_num_siblings > 1) {
+		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+			" performance may degrade.\n");
+	}
+#endif
+	if (pm_idle)
+		return;
+
+	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
+		/*
+		 * One CPU supports mwait => All CPUs support mwait
+		 */
+		printk(KERN_INFO "using mwait in idle threads.\n");
+		pm_idle = mwait_idle;
+	} else if (check_c1e_idle(c)) {
+		printk(KERN_INFO "using C1E aware idle routine\n");
+		pm_idle = c1e_idle;
+	} else
+		pm_idle = default_idle;
+#endif
+}
+
+void __init init_c1e_mask(void)
+{
+#ifndef CONFIG_XEN
+	/* If we're using c1e_idle, we need to allocate c1e_mask.
*/ + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); +#endif +} + +static int __init idle_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; +#ifndef CONFIG_XEN + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + pm_idle = default_idle; + idle_halt = 1; + return 0; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + idle_nomwait = 1; + return 0; +#endif + } else + return -1; + + boot_option_idle_override = 1; + return 0; +} +early_param("idle", idle_setup); + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + --- linux-ec2-2.6.32.orig/arch/x86/kernel/process_32-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/process_32-xen.c @@ -0,0 +1,568 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MATH_EMULATION +#include +#endif + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.sp)[3]; +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + int cpu = smp_processor_id(); + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). 
+	 */
+	boot_init_stack_canary();
+
+	current_thread_info()->status |= TS_POLLING;
+
+	/* endless idle loop with no priority at all */
+	while (1) {
+		tick_nohz_stop_sched_tick(1);
+		while (!need_resched()) {
+
+			check_pgt_cache();
+			rmb();
+
+			if (cpu_is_offline(cpu))
+				play_dead();
+
+			local_irq_disable();
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
+			xen_idle();
+			start_critical_timings();
+		}
+		tick_nohz_restart_sched_tick();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+	}
+}
+
+void __show_regs(struct pt_regs *regs, int all)
+{
+	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+	unsigned long d0, d1, d2, d3, d6, d7;
+	unsigned long sp;
+	unsigned short ss, gs;
+	const char *board;
+
+	if (user_mode_vm(regs)) {
+		sp = regs->sp;
+		ss = regs->ss & 0xffff;
+		gs = get_user_gs(regs);
+	} else {
+		sp = (unsigned long) (&regs->sp);
+		savesegment(ss, ss);
+		savesegment(gs, gs);
+	}
+
+	printk("\n");
+
+	board = dmi_get_system_info(DMI_PRODUCT_NAME);
+	if (!board)
+		board = "";
+	printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
+			task_pid_nr(current), current->comm,
+			print_tainted(), init_utsname()->release,
+			(int)strcspn(init_utsname()->version, " "),
+			init_utsname()->version, board);
+
+	printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+			(u16)regs->cs, regs->ip, regs->flags,
+			smp_processor_id());
+	print_symbol("EIP is at %s\n", regs->ip);
+
+	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+		regs->ax, regs->bx, regs->cx, regs->dx);
+	printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+		regs->si, regs->di, regs->bp, sp);
+	printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+	       (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+
+	if (!all)
+		return;
+
+	cr0 = read_cr0();
+	cr2 = read_cr2();
+	cr3 = read_cr3();
+	cr4 = read_cr4_safe();
+	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+			cr0, cr2, cr3, cr4);
+
+	get_debugreg(d0, 0);
+	get_debugreg(d1, 1);
+	get_debugreg(d2, 2);
+	get_debugreg(d3, 3);
+	printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+			d0, d1, d2, d3);
+
+	get_debugreg(d6, 6);
+	get_debugreg(d7, 7);
+	printk("DR6: %08lx DR7: %08lx\n",
+			d6, d7);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+	__show_regs(regs, 1);
+	show_trace(NULL, regs, &regs->sp, regs->bp);
+}
+
+/*
+ * This gets run with %bx containing the
+ * function to call, and %dx containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+	struct pt_regs regs;
+
+	memset(&regs, 0, sizeof(regs));
+
+	regs.bx = (unsigned long) fn;
+	regs.dx = (unsigned long) arg;
+
+	regs.ds = __USER_DS;
+	regs.es = __USER_DS;
+	regs.fs = __KERNEL_PERCPU;
+	regs.gs = __KERNEL_STACK_CANARY;
+	regs.orig_ax = -1;
+	regs.ip = (unsigned long) kernel_thread_helper;
+	regs.cs = __KERNEL_CS | get_kernel_rpl();
+	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+	/* Ok, create the new process.. */
+	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(kernel_thread);
+
+void release_thread(struct task_struct *dead_task)
+{
+	BUG_ON(dead_task->mm);
+	release_vm86_irqs(dead_task);
+}
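+
+/*
+ * Usage sketch (an editor's illustration, not part of the original
+ * patch): a caller typically starts a kernel thread as
+ *
+ *	static int worker(void *data)
+ *	{
+ *		...
+ *		return 0;
+ *	}
+ *
+ *	kernel_thread(worker, NULL, CLONE_FS | CLONE_FILES);
+ *
+ * kernel_thread() above fakes a pt_regs frame so the child enters
+ * kernel_thread_helper with %bx = fn and %dx = arg.
+ */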
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+	unlazy_fpu(tsk);
+}
+
+int copy_thread(unsigned long clone_flags, unsigned long sp,
+	unsigned long unused,
+	struct task_struct *p, struct pt_regs *regs)
+{
+	struct pt_regs *childregs;
+	struct task_struct *tsk;
+	int err;
+
+	childregs = task_pt_regs(p);
+	*childregs = *regs;
+	childregs->ax = 0;
+	childregs->sp = sp;
+
+	p->thread.sp = (unsigned long) childregs;
+	p->thread.sp0 = (unsigned long) (childregs+1);
+
+	p->thread.ip = (unsigned long) ret_from_fork;
+
+	task_user_gs(p) = get_user_gs(regs);
+
+	tsk = current;
+	if (test_tsk_thread_flag(tsk, TIF_CSTAR))
+		p->thread.ip = (unsigned long) cstar_ret_from_fork;
+	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+						  IO_BITMAP_BYTES, GFP_KERNEL);
+		if (!p->thread.io_bitmap_ptr) {
+			p->thread.io_bitmap_max = 0;
+			return -ENOMEM;
+		}
+		set_tsk_thread_flag(p, TIF_IO_BITMAP);
+	}
+
+	err = 0;
+
+	/*
+	 * Set a new TLS for the child thread?
+	 */
+	if (clone_flags & CLONE_SETTLS)
+		err = do_set_thread_area(p, -1,
+			(struct user_desc __user *)childregs->si, 0);
+
+	p->thread.iopl = current->thread.iopl;
+
+	if (err && p->thread.io_bitmap_ptr) {
+		kfree(p->thread.io_bitmap_ptr);
+		p->thread.io_bitmap_max = 0;
+	}
+
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
+
+	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+	p->thread.debugctlmsr = 0;
+
+	return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+	set_user_gs(regs, 0);
+	regs->fs = 0;
+	set_fs(USER_DS);
+	regs->ds = __USER_DS;
+	regs->es = __USER_DS;
+	regs->ss = __USER_DS;
+	regs->cs = __USER_CS;
+	regs->ip = new_ip;
+	regs->sp = new_sp;
+	/*
+	 * Free the old FP and other extended state
+	 */
+	free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPUs, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %ax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
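+ *
+ * (Editor's note on the Xen variant below: rather than poking the TSS,
+ * GDT and I/O-port state directly, the stack, TLS, FPU and iopl/iobitmap
+ * updates are queued into the _mcl[] multicall array and flushed with a
+ * single HYPERVISOR_multicall_check(), so one hypervisor trap covers the
+ * whole batch.)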
+ */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + bool preload_fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + + /* + * This is basically '__unlazy_fpu', except that we queue a + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + */ + if (task_thread_info(prev_p)->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } + } +#if 0 /* lazy fpu sanity check */ + else BUG_ON(!(read_cr0() & 8)); +#endif + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? 
IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* we're going to use this soon, after a few expensive things */ + if (preload_fpu) + prefetch(next->xstate); + + /* + * Now maybe handle debug registers + */ + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_end_context_switch(next_p); + + if (preload_fpu) + __math_state_restore(); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + lazy_load_gs(next->gs); + + percpu_write(current_task, next_p); + + return prev_p; +} + +int sys_clone(struct pt_regs *regs) +{ + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + + clone_flags = regs->bx; + newsp = regs->cx; + parent_tidptr = (int __user *)regs->dx; + child_tidptr = (int __user *)regs->di; + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); +} + +/* + * sys_execve() executes a new program. + */ +int sys_execve(struct pt_regs *regs) +{ + int error; + char *filename; + + filename = getname((char __user *) regs->bx); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + goto out; + error = do_execve(filename, + (char __user * __user *) regs->cx, + (char __user * __user *) regs->dx, + regs); + if (error == 0) { + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } + putname(filename); +out: + return error; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long bp, sp, ip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)task_stack_page(p); + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; + do { + if (bp < stack_page || bp > top_ebp+stack_page) + return 0; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; + } while (count++ < 16); + return 0; +} + --- linux-ec2-2.6.32.orig/arch/x86/kernel/process_64-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/process_64-xen.c @@ -0,0 +1,754 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + * + * Jun Nakajima + * Modified for Xen + */ + +/* + * This file handles the architecture-dependent parts of process handling.. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +asmlinkage extern void ret_from_fork(void); + +DEFINE_PER_CPU(unsigned long, old_rsp); +static DEFINE_PER_CPU(unsigned char, is_idle); + +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + current_thread_info()->status |= TS_POLLING; + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_stop_sched_tick(1); + while (!need_resched()) { + + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); + enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + xen_idle(); + start_critical_timings(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. 
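+			 *
+			 * (Editor's note: the IDLE_START/IDLE_END
+			 * transitions around this block are observable
+			 * through idle_notifier_register() above, which
+			 * is why __exit_idle() must stay idempotent via
+			 * the is_idle flag.)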
*/ + __exit_idle(); + } + + tick_nohz_restart_sched_tick(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + const char *board; + + printk("\n"); + print_modules(); + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board); + printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->ax, regs->bx, regs->cx); + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->dx, regs->si, regs->di); + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + regs->bp, regs->r8, regs->r9); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("mov %%fs,%0" : "=r" (fsindex)); + asm("mov %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); +} + +void show_regs(struct pt_regs *regs) +{ + printk(KERN_INFO "CPU %d:", smp_processor_id()); + __show_regs(regs, 1); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); +} + +void xen_load_gs_index(unsigned gs) +{ + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct desc_struct *desc = t->thread.tls_array; + desc += tls; + fill_ldt(desc, &ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + return get_desc_base(&t->thread.tls_array[tls]); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. 
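+ * unlazy_fpu() flushes any FPU state that is still live in the
+ * registers back into the task_struct, so the copy sees current
+ * math state.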
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+	unlazy_fpu(tsk);
+}
+
+int copy_thread(unsigned long clone_flags, unsigned long sp,
+		unsigned long unused,
+		struct task_struct *p, struct pt_regs *regs)
+{
+	int err;
+	struct pt_regs *childregs;
+	struct task_struct *me = current;
+
+	childregs = ((struct pt_regs *)
+			(THREAD_SIZE + task_stack_page(p))) - 1;
+	*childregs = *regs;
+
+	childregs->ax = 0;
+	childregs->sp = sp;
+	if (sp == ~0UL)
+		childregs->sp = (unsigned long)childregs;
+
+	p->thread.sp = (unsigned long) childregs;
+	p->thread.sp0 = (unsigned long) (childregs+1);
+	p->thread.usersp = me->thread.usersp;
+
+	set_tsk_thread_flag(p, TIF_FORK);
+
+	p->thread.fs = me->thread.fs;
+	p->thread.gs = me->thread.gs;
+
+	savesegment(gs, p->thread.gsindex);
+	savesegment(fs, p->thread.fsindex);
+	savesegment(es, p->thread.es);
+	savesegment(ds, p->thread.ds);
+
+	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+		if (!p->thread.io_bitmap_ptr) {
+			p->thread.io_bitmap_max = 0;
+			return -ENOMEM;
+		}
+		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+			IO_BITMAP_BYTES);
+		set_tsk_thread_flag(p, TIF_IO_BITMAP);
+	}
+
+	/*
+	 * Set a new TLS for the child thread?
+	 */
+	if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+		if (test_thread_flag(TIF_IA32))
+			err = do_set_thread_area(p, -1,
+				(struct user_desc __user *)childregs->si, 0);
+		else
+#endif
+			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+		if (err)
+			goto out;
+	}
+	p->thread.iopl = current->thread.iopl;
+
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
+
+	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+	p->thread.debugctlmsr = 0;
+
+	err = 0;
+out:
+	if (err && p->thread.io_bitmap_ptr) {
+		kfree(p->thread.io_bitmap_ptr);
+		p->thread.io_bitmap_max = 0;
+	}
+	return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+	loadsegment(fs, 0);
+	loadsegment(es, 0);
+	loadsegment(ds, 0);
+	load_gs_index(0);
+	regs->ip = new_ip;
+	regs->sp = new_sp;
+	percpu_write(old_rsp, new_sp);
+	regs->cs = __USER_CS;
+	regs->ss = __USER_DS;
+	regs->flags = 0x200;
+	set_fs(USER_DS);
+	/*
+	 * Free the old FP and other extended state
+	 */
+	free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported either.
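+ * (hence the __notrace_funcgraph annotation on the definition below).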
+ */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + bool preload_fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + + /* we're going to use this soon, after a few expensive things */ + if (preload_fpu) + prefetch(next->xstate); + + /* + * This is basically '__unlazy_fpu', except that we queue a + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + * The AMD workaround requires it to be after DS reload, or + * after DS has been cleared, which we do in __prepare_arch_switch. + */ + if (task_thread_info(prev_p)->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } + } else + prev_p->fpu_counter = 0; + + /* Make sure cpu is ready for new context */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Switch DS and ES. 
+	 * This won't pick up thread selector changes, but I guess that is ok.
+	 */
+	if (unlikely(next->es))
+		loadsegment(es, next->es);
+
+	if (unlikely(next->ds))
+		loadsegment(ds, next->ds);
+
+	/*
+	 * Leave lazy mode, flushing any hypercalls made here.
+	 * This must be done before restoring TLS segments so
+	 * the GDT and LDT are properly updated, and must be
+	 * done before math_state_restore, so the TS bit is up
+	 * to date.
+	 */
+	arch_end_context_switch(next_p);
+
+	/*
+	 * Switch FS and GS.
+	 *
+	 * Segment register != 0 always requires a reload. Also
+	 * reload when it has changed. When the previous process used
+	 * a 64bit base, always reload to avoid an information leak.
+	 */
+	if (unlikely(next->fsindex))
+		loadsegment(fs, next->fsindex);
+
+	if (next->fs)
+		WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
+
+	if (unlikely(next->gsindex))
+		load_gs_index(next->gsindex);
+
+	if (next->gs)
+		WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
+
+	/*
+	 * Switch the PDA context.
+	 */
+	prev->usersp = percpu_read(old_rsp);
+	percpu_write(old_rsp, next->usersp);
+	percpu_write(current_task, next_p);
+
+	percpu_write(kernel_stack,
+		     (unsigned long)task_stack_page(next_p) +
+		     THREAD_SIZE - KERNEL_STACK_OFFSET);
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
+		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
+		__switch_to_xtra(prev_p, next_p);
+
+	/*
+	 * Preload the FPU context, now that we've determined that the
+	 * task is likely to be using it.
+	 */
+	if (preload_fpu)
+		__math_state_restore();
+	return prev_p;
+}
+
+/*
+ * sys_execve() executes a new program.
+ */
+asmlinkage
+long sys_execve(char __user *name, char __user * __user *argv,
+		char __user * __user *envp, struct pt_regs *regs)
+{
+	long error;
+	char *filename;
+
+	filename = getname(name);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename))
+		return error;
+	error = do_execve(filename, argv, envp, regs);
+	putname(filename);
+	return error;
+}
+
+void set_personality_64bit(void)
+{
+	/* inherit personality from parent */
+
+	/* Make sure to be in 64bit mode */
+	clear_thread_flag(TIF_IA32);
+
+	/* TBD: overwrites user setup. Should have two bits.
+	   But 64bit processes have always behaved this way,
+	   so it's not too bad. The main problem is just that
+	   32bit children are affected again.
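+
+	   A process can observe the inherited flag through the
+	   documented personality(2) query interface; a minimal
+	   userspace sketch (illustrative only, not part of this
+	   patch):
+
+		#include <stdio.h>
+		#include <sys/personality.h>
+
+		int main(void)
+		{
+			/* 0xffffffff queries without changing anything */
+			int p = personality(0xffffffff);
+			printf("READ_IMPLIES_EXEC is %s\n",
+			       (p & READ_IMPLIES_EXEC) ? "set" : "clear");
+			return 0;
+		}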
*/ + current->personality &= ~READ_IMPLIES_EXEC; +} + +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.sp); + do { + if (fp < (unsigned long)stack || + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = HYPERVISOR_set_segment_base( + SEGBASE_GS_USER, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + loadsegment(fs, FS_TLS_SEL); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + loadsegment(fs, 0); + ret = HYPERVISOR_set_segment_base(SEGBASE_FS, + addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) + rdmsrl(MSR_FS_BASE, base); + else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + unsigned gsindex; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + savesegment(gs, gsindex); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + --- linux-ec2-2.6.32.orig/arch/x86/kernel/quirks-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/quirks-xen.c @@ -0,0 +1,533 @@ +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. 
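+ * Most of them are applied as PCI fixups while the affected device
+ * is being enumerated.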
+ */
+#include
+#include
+
+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
+
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+{
+	u8 config, rev;
+	u16 word;
+
+	/* BIOS may enable hardware IRQ balancing for
+	 * E7520/E7320/E7525 (revision ID 0x9 and below)
+	 * based platforms.
+	 * Disable SW irqbalance/affinity on those platforms.
+	 */
+	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+	if (rev > 0x9)
+		return;
+
+	/* enable access to config space */
+	pci_read_config_byte(dev, 0xf4, &config);
+	pci_write_config_byte(dev, 0xf4, config|0x2);
+
+	/*
+	 * read xTPR register. We may not have a pci_dev for device 8
+	 * because it might be hidden until the above write.
+	 */
+	pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
+
+	if (!(word & (1 << 13))) {
+		struct xen_platform_op op;
+
+		dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
+			"disabling irq balancing and affinity\n");
+		op.cmd = XENPF_platform_quirk;
+		op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
+		WARN_ON(HYPERVISOR_platform_op(&op));
+	}
+
+	/* put back the original value for config space */
+	if (!(config & 0x2))
+		pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH,
+			quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH,
+			quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
+			quirk_intel_irqbalance);
+#endif
+
+#if defined(CONFIG_HPET_TIMER)
+#include
+
+unsigned long force_hpet_address;
+
+static enum {
+	NONE_FORCE_HPET_RESUME,
+	OLD_ICH_FORCE_HPET_RESUME,
+	ICH_FORCE_HPET_RESUME,
+	VT8237_FORCE_HPET_RESUME,
+	NVIDIA_FORCE_HPET_RESUME,
+	ATI_FORCE_HPET_RESUME,
+} force_hpet_resume_type;
+
+static void __iomem *rcba_base;
+
+static void ich_force_hpet_resume(void)
+{
+	u32 val;
+
+	if (!force_hpet_address)
+		return;
+
+	BUG_ON(rcba_base == NULL);
+
+	/* read the Function Disable register, dword mode only */
+	val = readl(rcba_base + 0x3404);
+	if (!(val & 0x80)) {
+		/* HPET disabled in HPTC. Trying to enable */
+		writel(val | 0x80, rcba_base + 0x3404);
+	}
+
+	val = readl(rcba_base + 0x3404);
+	if (!(val & 0x80))
+		BUG();
+	else
+		printk(KERN_DEBUG "Force enabled HPET at resume\n");
+
+	return;
+}
+
+static void ich_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 val;
+	u32 uninitialized_var(rcba);
+	int err = 0;
+
+	if (hpet_address || force_hpet_address)
+		return;
+
+	pci_read_config_dword(dev, 0xF0, &rcba);
+	rcba &= 0xFFFFC000;
+	if (rcba == 0) {
+		dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
+			"cannot force enable HPET\n");
+		return;
+	}
+
+	/* use bits 31:14, 16 kB aligned */
+	rcba_base = ioremap_nocache(rcba, 0x4000);
+	if (rcba_base == NULL) {
+		dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
+			"cannot force enable HPET\n");
+		return;
+	}
+
+	/* read the Function Disable register, dword mode only */
+	val = readl(rcba_base + 0x3404);
+
+	if (val & 0x80) {
+		/* HPET is enabled in HPTC. Just not reported by BIOS */
+		val = val & 0x3;
+		force_hpet_address = 0xFED00000 | (val << 12);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			"0x%lx\n", force_hpet_address);
+		iounmap(rcba_base);
+		return;
+	}
+
+	/* HPET disabled in HPTC.
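+	   (Bit 7 of HPTC enables the timer; bits 1:0 select which
+	   4 KiB page at 0xFED0x000 it decodes.)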
+	   Trying to enable. */
+	writel(val | 0x80, rcba_base + 0x3404);
+
+	val = readl(rcba_base + 0x3404);
+	if (!(val & 0x80)) {
+		err = 1;
+	} else {
+		val = val & 0x3;
+		force_hpet_address = 0xFED00000 | (val << 12);
+	}
+
+	if (err) {
+		force_hpet_address = 0;
+		iounmap(rcba_base);
+		dev_printk(KERN_DEBUG, &dev->dev,
+			"Failed to force enable HPET\n");
+	} else {
+		force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			"0x%lx\n", force_hpet_address);
+	}
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
+			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16,	/* ICH10 */
+			 ich_force_enable_hpet);
+
+static struct pci_dev *cached_dev;
+
+static void hpet_print_force_info(void)
+{
+	printk(KERN_INFO "HPET not enabled in BIOS. "
+	       "You might try the hpet=force boot option\n");
+}
+
+static void old_ich_force_hpet_resume(void)
+{
+	u32 val;
+	u32 uninitialized_var(gen_cntl);
+
+	if (!force_hpet_address || !cached_dev)
+		return;
+
+	pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+	gen_cntl &= (~(0x7 << 15));
+	gen_cntl |= (0x4 << 15);
+
+	pci_write_config_dword(cached_dev, 0xD0, gen_cntl);
+	pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+	val = gen_cntl >> 15;
+	val &= 0x7;
+	if (val == 0x4)
+		printk(KERN_DEBUG "Force enabled HPET at resume\n");
+	else
+		BUG();
+}
+
+static void old_ich_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 val;
+	u32 uninitialized_var(gen_cntl);
+
+	if (hpet_address || force_hpet_address)
+		return;
+
+	pci_read_config_dword(dev, 0xD0, &gen_cntl);
+	/*
+	 * Bit 17 is the HPET enable bit.
+	 * Bits 16:15 control the HPET base address.
+	 */
+	val = gen_cntl >> 15;
+	val &= 0x7;
+	if (val & 0x4) {
+		val &= 0x3;
+		force_hpet_address = 0xFED00000 | (val << 12);
+		dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+			   force_hpet_address);
+		return;
+	}
+
+	/*
+	 * HPET is disabled. Try enabling it at FED00000 and check
+	 * whether it sticks.
+	 */
+	gen_cntl &= (~(0x7 << 15));
+	gen_cntl |= (0x4 << 15);
+	pci_write_config_dword(dev, 0xD0, gen_cntl);
+
+	pci_read_config_dword(dev, 0xD0, &gen_cntl);
+
+	val = gen_cntl >> 15;
+	val &= 0x7;
+	if (val & 0x4) {
+		/* HPET is enabled in HPTC. Just not reported by BIOS */
+		val &= 0x3;
+		force_hpet_address = 0xFED00000 | (val << 12);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			   "0x%lx\n", force_hpet_address);
+		cached_dev = dev;
+		force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
+		return;
+	}
+
+	dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+/*
+ * Undocumented chipset features. Make sure that the user enforced
+ * this.
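+ * (i.e. booted with hpet=force, which sets hpet_force_user).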
+ */
+static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
+{
+	if (hpet_force_user)
+		old_ich_force_enable_hpet(dev);
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
+			 old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
+			 old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
+			 old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0,
+			 old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12,
+			 old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0,
+			 old_ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12,
+			 old_ich_force_enable_hpet);
+
+
+static void vt8237_force_hpet_resume(void)
+{
+	u32 val;
+
+	if (!force_hpet_address || !cached_dev)
+		return;
+
+	val = 0xfed00000 | 0x80;
+	pci_write_config_dword(cached_dev, 0x68, val);
+
+	pci_read_config_dword(cached_dev, 0x68, &val);
+	if (val & 0x80)
+		printk(KERN_DEBUG "Force enabled HPET at resume\n");
+	else
+		BUG();
+}
+
+static void vt8237_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 uninitialized_var(val);
+
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
+		return;
+	}
+
+	pci_read_config_dword(dev, 0x68, &val);
+	/*
+	 * Bit 7 is the HPET enable bit.
+	 * Bits 31:10 are the HPET base address (contrary to what the
+	 * datasheet claims).
+	 */
+	if (val & 0x80) {
+		force_hpet_address = (val & ~0x3ff);
+		dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+			   force_hpet_address);
+		return;
+	}
+
+	/*
+	 * HPET is disabled.
+	 * Try enabling it at FED00000 and check
+	 * whether it sticks.
+	 */
+	val = 0xfed00000 | 0x80;
+	pci_write_config_dword(dev, 0x68, val);
+
+	pci_read_config_dword(dev, 0x68, &val);
+	if (val & 0x80) {
+		force_hpet_address = (val & ~0x3ff);
+		dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+			   "0x%lx\n", force_hpet_address);
+		cached_dev = dev;
+		force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
+		return;
+	}
+
+	dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
+			 vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
+			 vt8237_force_enable_hpet);
+
+static void ati_force_hpet_resume(void)
+{
+	pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
+	printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static u32 ati_ixp4x0_rev(struct pci_dev *dev)
+{
+	u32 d;
+	u8 b;
+
+	pci_read_config_byte(dev, 0xac, &b);
+	b &= ~(1<<5);
+	pci_write_config_byte(dev, 0xac, b);
+	pci_read_config_dword(dev, 0x70, &d);
+	d |= 1<<8;
+	pci_write_config_dword(dev, 0x70, d);
+	pci_read_config_dword(dev, 0x8, &d);
+	d &= 0xff;
+	dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+	return d;
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 d, val;
+	u8 b;
+
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
+		return;
+	}
+
+	d = ati_ixp4x0_rev(dev);
+	if (d < 0x82)
+		return;
+
+	/* base address */
+	pci_write_config_dword(dev, 0x14, 0xfed00000);
+	pci_read_config_dword(dev, 0x14, &val);
+
+	/* enable interrupt */
+	outb(0x72, 0xcd6); b = inb(0xcd7);
+	b |= 0x1;
+	outb(0x72, 0xcd6); outb(b, 0xcd7);
+	outb(0x72, 0xcd6); b = inb(0xcd7);
+	if (!(b & 0x1))
+		return;
+	pci_read_config_dword(dev, 0x64, &d);
+	d |= (1<<10);
+	pci_write_config_dword(dev, 0x64, d);
+	pci_read_config_dword(dev, 0x64, &d);
+	if (!(d & (1<<10)))
+		return;
+
+	force_hpet_address = val;
+	force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+	dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+		   force_hpet_address);
+	cached_dev = dev;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+			 ati_force_enable_hpet);
+
+/*
+ * Undocumented chipset feature taken from LinuxBIOS.
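+ * (The LinuxBIOS project is nowadays known as coreboot.)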
+ */ +static void nvidia_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x44, 0xfed00001); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void nvidia_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_write_config_dword(dev, 0x44, 0xfed00001); + pci_read_config_dword(dev, 0x44, &val); + force_hpet_address = val & 0xfffffffe; + force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; + return; +} + +/* ISA Bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051, + nvidia_force_enable_hpet); + +/* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, + nvidia_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; + default: + break; + } +} +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(nb_ht); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/setup-xen.c +++ 
linux-ec2-2.6.32/arch/x86/kernel/setup-xen.c @@ -0,0 +1,1365 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons , July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle , February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel , March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include