--- linux-ec2-2.6.32.orig/MAINTAINERS +++ linux-ec2-2.6.32/MAINTAINERS @@ -1974,6 +1974,12 @@ S: Maintained F: drivers/platform/x86/eeepc-laptop.c +EFIFB FRAMEBUFFER DRIVER +L: linux-fbdev@vger.kernel.org +M: Peter Jones +S: Maintained +F: drivers/video/efifb.c + EFS FILESYSTEM W: http://aeschi.ch.eu.org/efs/ S: Orphan @@ -4373,7 +4379,7 @@ F: drivers/block/brd.c RANDOM NUMBER DRIVER -M: Matt Mackall +M: Theodore Ts'o" S: Maintained F: drivers/char/random.c @@ -5004,8 +5010,7 @@ STABLE BRANCH M: Greg Kroah-Hartman -M: Chris Wright -L: stable@kernel.org +L: stable@vger.kernel.org S: Maintained STAGING SUBSYSTEM @@ -5594,9 +5599,11 @@ F: drivers/net/wireless/rndis_wlan.c USB XHCI DRIVER -M: Sarah Sharp +M: Sarah Sharp L: linux-usb@vger.kernel.org S: Supported +F: drivers/usb/host/xhci* +F: drivers/usb/host/pci-quirks* USB ZC0301 DRIVER M: Luca Risolia @@ -5718,6 +5725,14 @@ S: Maintained F: drivers/net/vmxnet3/ +VMware PVSCSI driver +M: Alok Kataria +M: VMware PV-Drivers +L: linux-scsi@vger.kernel.org +S: Maintained +F: drivers/scsi/vmw_pvscsi.c +F: drivers/scsi/vmw_pvscsi.h + VOLTAGE AND CURRENT REGULATOR FRAMEWORK M: Liam Girdwood M: Mark Brown --- linux-ec2-2.6.32.orig/Makefile +++ linux-ec2-2.6.32/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 32 -EXTRAVERSION = +EXTRAVERSION = .63+drm33.26 NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* @@ -331,14 +331,23 @@ AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage +# Prefer linux-backports-modules +ifneq ($(KBUILD_SRC),) +ifneq ($(shell if test -e $(KBUILD_OUTPUT)/ubuntu-build; then echo yes; fi),yes) +UBUNTUINCLUDE := -I/usr/src/linux-headers-lbm-$(KERNELRELEASE) +endif +endif # Use LINUXINCLUDE when you must reference the include/ directory. # Needed to be compatible with the O= option -LINUXINCLUDE := -Iinclude \ +LINUXINCLUDE := $(UBUNTUINCLUDE) -Iinclude \ $(if $(KBUILD_SRC),-Iinclude2 -I$(srctree)/include) \ -I$(srctree)/arch/$(hdr-arch)/include \ -include include/linux/autoconf.h +# UBUNTU: Include our third party driver stuff too +LINUXINCLUDE += -Iubuntu/include $(if $(KBUILD_SRC),-I$(srctree)/ubuntu/include) + KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ @@ -464,12 +473,12 @@ # Carefully list dependencies so we do not try to build scripts twice # in parallel PHONY += scripts -scripts: scripts_basic include/config/auto.conf +scripts: scripts_basic include/config/auto.conf include/config/tristate.conf $(Q)$(MAKE) $(build)=$(@) # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ firmware/ +drivers-y := drivers/ sound/ firmware/ ubuntu/ net-y := net/ libs-y := lib/ core-y := usr/ @@ -491,7 +500,7 @@ # with it and forgot to run make oldconfig. # if auto.conf.cmd is missing then we are probably in a cleaned tree so # we execute the config step to be sure to catch updated Kconfig files -include/config/auto.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd +include/config/%.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd $(Q)$(MAKE) -f $(srctree)/Makefile silentoldconfig else # external modules needs include/linux/autoconf.h and include/config/auto.conf @@ -537,6 +546,9 @@ KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) endif +# This warning generated too much noise in a regular build. +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) + ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else @@ -565,7 +577,7 @@ KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,) # disable pointer signed / unsigned warnings in gcc 4.0 -KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,) +KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) @@ -876,6 +888,9 @@ PHONY += $(vmlinux-dirs) $(vmlinux-dirs): prepare scripts $(Q)$(MAKE) $(build)=$@ +ifdef CONFIG_MODULES + $(Q)$(MAKE) $(modbuiltin)=$@ +endif # Build the kernel release string # @@ -1126,6 +1141,7 @@ PHONY += modules modules: $(vmlinux-dirs) $(if $(KBUILD_BUILTIN),vmlinux) $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.order) > $(objtree)/modules.order + $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.builtin) > $(objtree)/modules.builtin @$(kecho) ' Building modules, stage 2.'; $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modbuild @@ -1155,6 +1171,7 @@ ln -s $(objtree) $(MODLIB)/build ; \ fi @cp -f $(objtree)/modules.order $(MODLIB)/ + @cp -f $(objtree)/modules.builtin $(MODLIB)/ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst # This depmod is only for convenience to give the initial @@ -1218,6 +1235,7 @@ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name 'Module.markers' -o -name '.tmp_*.o.*' \ + -o -name 'modules.builtin' \ -o -name '*.gcno' \) -type f -print | xargs rm -f # mrproper - Delete all generated files, including .config @@ -1416,7 +1434,8 @@ clean: rm-dirs := $(MODVERDIR) clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers \ $(KBUILD_EXTMOD)/Module.markers \ - $(KBUILD_EXTMOD)/modules.order + $(KBUILD_EXTMOD)/modules.order \ + $(KBUILD_EXTMOD)/modules.builtin clean: $(clean-dirs) $(call cmd,rmdirs) $(call cmd,rmfiles) --- linux-ec2-2.6.32.orig/include/Kbuild +++ linux-ec2-2.6.32/include/Kbuild @@ -9,4 +9,3 @@ header-y += video/ header-y += drm/ header-y += xen/ -header-y += scsi/ --- linux-ec2-2.6.32.orig/include/math-emu/op-common.h +++ linux-ec2-2.6.32/include/math-emu/op-common.h @@ -799,7 +799,7 @@ X##_e -= (_FP_W_TYPE_SIZE - rsize); \ X##_e = rsize - X##_e - 1; \ \ - if (_FP_FRACBITS_##fs < rsize && _FP_WFRACBITS_##fs < X##_e) \ + if (_FP_FRACBITS_##fs < rsize && _FP_WFRACBITS_##fs <= X##_e) \ __FP_FRAC_SRS_1(ur_, (X##_e - _FP_WFRACBITS_##fs + 1), rsize);\ _FP_FRAC_DISASSEMBLE_##wc(X, ur_, rsize); \ if ((_FP_WFRACBITS_##fs - X##_e - 1) > 0) \ --- linux-ec2-2.6.32.orig/include/crypto/cryptd.h +++ linux-ec2-2.6.32/include/crypto/cryptd.h @@ -39,6 +39,7 @@ struct cryptd_ahash *cryptd_alloc_ahash(const char *alg_name, u32 type, u32 mask); struct crypto_shash *cryptd_ahash_child(struct cryptd_ahash *tfm); +struct shash_desc *cryptd_shash_desc(struct ahash_request *req); void cryptd_free_ahash(struct cryptd_ahash *tfm); #endif --- linux-ec2-2.6.32.orig/include/net/inet_sock.h +++ linux-ec2-2.6.32/include/net/inet_sock.h @@ -56,7 +56,15 @@ unsigned char __data[0]; }; -#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) +struct ip_options_rcu { + struct rcu_head rcu; + struct ip_options opt; +}; + +struct ip_options_data { + struct ip_options_rcu opt; + char data[40]; +}; struct inet_request_sock { struct request_sock req; @@ -77,7 +85,7 @@ acked : 1, no_srccheck: 1; kmemcheck_bitfield_end(flags); - struct ip_options *opt; + struct ip_options_rcu *opt; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) @@ -122,7 +130,7 @@ __be32 saddr; __s16 uc_ttl; __u16 cmsg_flags; - struct ip_options *opt; + struct ip_options_rcu *inet_opt; __be16 sport; __u16 id; __u8 tos; --- linux-ec2-2.6.32.orig/include/net/dst.h +++ linux-ec2-2.6.32/include/net/dst.h @@ -286,11 +286,22 @@ { return 0; } +static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) +{ + return NULL; +} + #else extern int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags); extern int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags); + +/* skb attached with this dst needs transformation if dst->xfrm is valid */ +static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) +{ + return dst->xfrm; +} #endif #endif --- linux-ec2-2.6.32.orig/include/net/mac80211.h +++ linux-ec2-2.6.32/include/net/mac80211.h @@ -908,6 +908,9 @@ * @IEEE80211_HW_BEACON_FILTER: * Hardware supports dropping of irrelevant beacon frames to * avoid waking up cpu. + * @IEEE80211_HW_REPORTS_TX_ACK_STATUS: + * Hardware can provide ack status reports of Tx frames to + * the stack. */ enum ieee80211_hw_flags { IEEE80211_HW_RX_INCLUDES_FCS = 1<<1, @@ -924,6 +927,7 @@ IEEE80211_HW_SUPPORTS_DYNAMIC_PS = 1<<12, IEEE80211_HW_MFP_CAPABLE = 1<<13, IEEE80211_HW_BEACON_FILTER = 1<<14, + IEEE80211_HW_REPORTS_TX_ACK_STATUS = 1<<15, }; /** @@ -1696,6 +1700,12 @@ */ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); +/* + * The TX headroom reserved by mac80211 for its own tx_status functions. + * This is enough for the radiotap header. + */ +#define IEEE80211_TX_STATUS_HEADROOM 13 + /** * ieee80211_tx_status - transmit status callback * --- linux-ec2-2.6.32.orig/include/net/secure_seq.h +++ linux-ec2-2.6.32/include/net/secure_seq.h @@ -0,0 +1,20 @@ +#ifndef _NET_SECURE_SEQ +#define _NET_SECURE_SEQ + +#include + +extern __u32 secure_ip_id(__be32 daddr); +extern __u32 secure_ipv6_id(const __be32 daddr[4]); +extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); +extern u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport); +extern __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport); +extern __u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport); +extern u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport); +extern u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport); + +#endif /* _NET_SECURE_SEQ */ --- linux-ec2-2.6.32.orig/include/net/inet_ecn.h +++ linux-ec2-2.6.32/include/net/inet_ecn.h @@ -38,9 +38,19 @@ return outer; } -#define INET_ECN_xmit(sk) do { inet_sk(sk)->tos |= INET_ECN_ECT_0; } while (0) -#define INET_ECN_dontxmit(sk) \ - do { inet_sk(sk)->tos &= ~INET_ECN_MASK; } while (0) +static inline void INET_ECN_xmit(struct sock *sk) +{ + inet_sk(sk)->tos |= INET_ECN_ECT_0; + if (inet6_sk(sk) != NULL) + inet6_sk(sk)->tclass |= INET_ECN_ECT_0; +} + +static inline void INET_ECN_dontxmit(struct sock *sk) +{ + inet_sk(sk)->tos &= ~INET_ECN_MASK; + if (inet6_sk(sk) != NULL) + inet6_sk(sk)->tclass &= ~INET_ECN_MASK; +} #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ --- linux-ec2-2.6.32.orig/include/net/ipv6.h +++ linux-ec2-2.6.32/include/net/ipv6.h @@ -354,8 +354,16 @@ struct inet_frag_queue; +enum ip6_defrag_users { + IP6_DEFRAG_LOCAL_DELIVER, + IP6_DEFRAG_CONNTRACK_IN, + IP6_DEFRAG_CONNTRACK_OUT, + IP6_DEFRAG_CONNTRACK_BRIDGE_IN, +}; + struct ip6_create_arg { __be32 id; + u32 user; struct in6_addr *src; struct in6_addr *dst; }; @@ -441,17 +449,7 @@ return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } -static __inline__ void ipv6_select_ident(struct frag_hdr *fhdr) -{ - static u32 ipv6_fragmentation_id = 1; - static DEFINE_SPINLOCK(ip6_id_lock); - - spin_lock_bh(&ip6_id_lock); - fhdr->identification = htonl(ipv6_fragmentation_id); - if (++ipv6_fragmentation_id == 0) - ipv6_fragmentation_id = 1; - spin_unlock_bh(&ip6_id_lock); -} +extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); /* * Prototypes exported by ipv6 @@ -569,7 +567,8 @@ extern int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); -extern int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len); +extern int ipv6_recv_error(struct sock *sk, struct msghdr *msg, + int len, int *addr_len); extern void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); extern void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info); --- linux-ec2-2.6.32.orig/include/net/x25.h +++ linux-ec2-2.6.32/include/net/x25.h @@ -182,6 +182,10 @@ extern int sysctl_x25_ack_holdback_timeout; extern int sysctl_x25_forward; +extern int x25_parse_address_block(struct sk_buff *skb, + struct x25_address *called_addr, + struct x25_address *calling_addr); + extern int x25_addr_ntoa(unsigned char *, struct x25_address *, struct x25_address *); extern int x25_addr_aton(unsigned char *, struct x25_address *, --- linux-ec2-2.6.32.orig/include/net/udp.h +++ linux-ec2-2.6.32/include/net/udp.h @@ -134,6 +134,7 @@ extern int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); +extern int udp_push_pending_frames(struct sock *sk); extern void udp_flush_pending_frames(struct sock *sk); extern int udp_rcv(struct sk_buff *skb); --- linux-ec2-2.6.32.orig/include/net/transp_v6.h +++ linux-ec2-2.6.32/include/net/transp_v6.h @@ -16,6 +16,8 @@ struct flowi; +extern void initialize_hashidentrnd(void); + /* extention headers */ extern int ipv6_exthdrs_init(void); extern void ipv6_exthdrs_exit(void); --- linux-ec2-2.6.32.orig/include/net/af_unix.h +++ linux-ec2-2.6.32/include/net/af_unix.h @@ -10,6 +10,7 @@ extern void unix_notinflight(struct file *fp); extern void unix_gc(void); extern void wait_for_unix_gc(void); +extern struct sock *unix_get_socket(struct file *filp); #define UNIX_HASH_SIZE 256 @@ -56,6 +57,7 @@ spinlock_t lock; unsigned int gc_candidate : 1; unsigned int gc_maybe_cycle : 1; + unsigned char recursion_level; wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) --- linux-ec2-2.6.32.orig/include/net/netrom.h +++ linux-ec2-2.6.32/include/net/netrom.h @@ -132,6 +132,8 @@ static __inline__ void nr_neigh_put(struct nr_neigh *nr_neigh) { if (atomic_dec_and_test(&nr_neigh->refcount)) { + if (nr_neigh->ax25) + ax25_cb_put(nr_neigh->ax25); kfree(nr_neigh->digipeat); kfree(nr_neigh); } --- linux-ec2-2.6.32.orig/include/net/tcp.h +++ linux-ec2-2.6.32/include/net/tcp.h @@ -259,11 +259,21 @@ return seq3 - seq2 >= seq1 - seq2; } -static inline int tcp_too_many_orphans(struct sock *sk, int num) +static inline bool tcp_too_many_orphans(struct sock *sk, int shift) { - return (num > sysctl_tcp_max_orphans) || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]); + struct percpu_counter *ocp = sk->sk_prot->orphan_count; + int orphans = percpu_counter_read_positive(ocp); + + if (orphans << shift > sysctl_tcp_max_orphans) { + orphans = percpu_counter_sum_positive(ocp); + if (orphans << shift > sysctl_tcp_max_orphans) + return true; + } + + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) + return true; + return false; } /* syncookies: remember time of last synqueue overflow */ @@ -501,8 +511,22 @@ /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { - if (tp->max_window && pktsize > (tp->max_window >> 1)) - return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + int cutoff; + + /* When peer uses tiny windows, there is no use in packetizing + * to sub-MSS pieces for the sake of SWS or making sure there + * are enough packets in the pipe for fast recovery. + * + * On the other hand, for extremely large MSS devices, handling + * smaller than MSS windows in this way does make sense. + */ + if (tp->max_window >= 512) + cutoff = (tp->max_window >> 1); + else + cutoff = tp->max_window; + + if (cutoff && pktsize > cutoff) + return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } @@ -1263,14 +1287,20 @@ * TCP connection after "boundary" unsucessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN. */ -static inline bool retransmits_timed_out(const struct sock *sk, +static inline bool retransmits_timed_out(struct sock *sk, unsigned int boundary) { unsigned int timeout, linear_backoff_thresh; + unsigned int start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; + if (unlikely(!tcp_sk(sk)->retrans_stamp)) + start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; + else + start_ts = tcp_sk(sk)->retrans_stamp; + linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); if (boundary <= linear_backoff_thresh) @@ -1279,7 +1309,7 @@ timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; - return (tcp_time_stamp - tcp_sk(sk)->retrans_stamp) >= timeout; + return (tcp_time_stamp - start_ts) >= timeout; } static inline struct sk_buff *tcp_send_head(struct sock *sk) --- linux-ec2-2.6.32.orig/include/net/scm.h +++ linux-ec2-2.6.32/include/net/scm.h @@ -10,12 +10,13 @@ /* Well, we should have at least one descriptor open * to accept passed FDs 8) */ -#define SCM_MAX_FD 255 +#define SCM_MAX_FD 253 struct scm_fp_list { struct list_head list; - int count; + short count; + short max; struct file *fp[SCM_MAX_FD]; }; --- linux-ec2-2.6.32.orig/include/net/ip.h +++ linux-ec2-2.6.32/include/net/ip.h @@ -54,7 +54,7 @@ { __be32 addr; int oif; - struct ip_options *opt; + struct ip_options_rcu *opt; union skb_shared_tx shtx; }; @@ -92,7 +92,7 @@ extern int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, - struct ip_options *opt); + struct ip_options_rcu *opt); extern int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); extern int ip_local_deliver(struct sk_buff *skb); @@ -342,6 +342,7 @@ IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, IP_DEFRAG_CONNTRACK_OUT, + IP_DEFRAG_CONNTRACK_BRIDGE_IN, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD @@ -361,14 +362,15 @@ * Functions provided by ip_options.c */ -extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); +extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, + __be32 daddr, struct rtable *rt, int is_frag); extern int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); extern void ip_options_fragment(struct sk_buff *skb); extern int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); -extern int ip_options_get(struct net *net, struct ip_options **optp, +extern int ip_options_get(struct net *net, struct ip_options_rcu **optp, unsigned char *data, int optlen); -extern int ip_options_get_from_user(struct net *net, struct ip_options **optp, +extern int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, unsigned char __user *data, int optlen); extern void ip_options_undo(struct ip_options * opt); extern void ip_forward_options(struct sk_buff *skb); @@ -389,7 +391,7 @@ int optname, char __user *optval, int __user *optlen); extern int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); -extern int ip_recv_error(struct sock *sk, struct msghdr *msg, int len); +extern int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); extern void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); extern void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, --- linux-ec2-2.6.32.orig/include/net/rose.h +++ linux-ec2-2.6.32/include/net/rose.h @@ -14,6 +14,12 @@ #define ROSE_MIN_LEN 3 +#define ROSE_CALL_REQ_ADDR_LEN_OFF 3 +#define ROSE_CALL_REQ_ADDR_LEN_VAL 0xAA /* each address is 10 digits */ +#define ROSE_CALL_REQ_DEST_ADDR_OFF 4 +#define ROSE_CALL_REQ_SRC_ADDR_OFF 9 +#define ROSE_CALL_REQ_FACILITIES_OFF 14 + #define ROSE_GFI 0x10 #define ROSE_Q_BIT 0x80 #define ROSE_D_BIT 0x40 @@ -214,7 +220,7 @@ extern int rose_validate_nr(struct sock *, unsigned short); extern void rose_write_internal(struct sock *, int); extern int rose_decode(struct sk_buff *, int *, int *, int *, int *, int *); -extern int rose_parse_facilities(unsigned char *, struct rose_facilities_struct *); +extern int rose_parse_facilities(unsigned char *, unsigned int, struct rose_facilities_struct *); extern void rose_disconnect(struct sock *, int, int, int); /* rose_timer.c */ --- linux-ec2-2.6.32.orig/include/net/netlink.h +++ linux-ec2-2.6.32/include/net/netlink.h @@ -384,7 +384,7 @@ * * Returns the first attribute which matches the specified type. */ -static inline struct nlattr *nlmsg_find_attr(struct nlmsghdr *nlh, +static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) { return nla_find(nlmsg_attrdata(nlh, hdrlen), --- linux-ec2-2.6.32.orig/include/net/sock.h +++ linux-ec2-2.6.32/include/net/sock.h @@ -242,6 +242,7 @@ struct { struct sk_buff *head; struct sk_buff *tail; + int len; } sk_backlog; wait_queue_head_t *sk_sleep; struct dst_entry *sk_dst_cache; @@ -561,8 +562,8 @@ return sk->sk_wmem_queued < sk->sk_sndbuf; } -/* The per-socket spinlock must be held here. */ -static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) +/* OOB backlog add */ +static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { if (!sk->sk_backlog.tail) { sk->sk_backlog.head = sk->sk_backlog.tail = skb; @@ -573,6 +574,27 @@ skb->next = NULL; } +/* + * Take into account size of receive queue and backlog queue + */ +static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb) +{ + unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); + + return qsize + skb->truesize > sk->sk_rcvbuf; +} + +/* The per-socket spinlock must be held here. */ +static inline int sk_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + if (sk_rcvqueues_full(sk, skb)) + return -ENOBUFS; + + __sk_add_backlog(sk, skb); + sk->sk_backlog.len += skb->truesize; + return 0; +} + static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { return sk->sk_backlog_rcv(sk, skb); @@ -1354,20 +1376,7 @@ extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) -{ - /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces - number of warnings when compiling with -W --ANK - */ - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned)sk->sk_rcvbuf) - return -ENOMEM; - skb_set_owner_r(skb, sk); - skb_queue_tail(&sk->sk_error_queue, skb); - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); - return 0; -} +extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); /* * Recover an error report and clear atomically @@ -1598,4 +1607,8 @@ extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +#ifdef CONFIG_NET_NS +extern int max_netns_count; +#endif + #endif /* _SOCK_H */ --- linux-ec2-2.6.32.orig/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ linux-ec2-2.6.32/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -9,7 +9,7 @@ extern int nf_ct_frag6_init(void); extern void nf_ct_frag6_cleanup(void); -extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb); +extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user); extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, struct net_device *in, struct net_device *out, --- linux-ec2-2.6.32.orig/include/net/sctp/command.h +++ linux-ec2-2.6.32/include/net/sctp/command.h @@ -107,6 +107,8 @@ SCTP_CMD_T1_RETRAN, /* Mark for retransmission after T1 timeout */ SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */ SCTP_CMD_SEND_MSG, /* Send the whole use message */ + SCTP_CMD_SEND_NEXT_ASCONF, /* Send the next ASCONF after ACK */ + SCTP_CMD_SET_ASOC, /* Restore association context */ SCTP_CMD_LAST } sctp_verb_t; --- linux-ec2-2.6.32.orig/include/net/sctp/structs.h +++ linux-ec2-2.6.32/include/net/sctp/structs.h @@ -772,6 +772,7 @@ struct iovec *data); void sctp_chunk_free(struct sctp_chunk *); void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data); +void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len, const void *data); struct sctp_chunk *sctp_chunkify(struct sk_buff *, const struct sctp_association *, struct sock *); @@ -1007,6 +1008,9 @@ /* Heartbeat timer is per destination. */ struct timer_list hb_timer; + /* Timer to handle ICMP proto unreachable envets */ + struct timer_list proto_unreach_timer; + /* Since we're using per-destination retransmission timers * (see above), we're also using per-destination "transmitted" * queues. This probably ought to be a private struct --- linux-ec2-2.6.32.orig/include/net/sctp/sm.h +++ linux-ec2-2.6.32/include/net/sctp/sm.h @@ -251,9 +251,9 @@ int, __be16); struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr); -int sctp_verify_asconf(const struct sctp_association *asoc, - struct sctp_paramhdr *param_hdr, void *chunk_end, - struct sctp_paramhdr **errp); +bool sctp_verify_asconf(const struct sctp_association *asoc, + struct sctp_chunk *chunk, bool addr_param_needed, + struct sctp_paramhdr **errp); struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf); int sctp_process_asconf_ack(struct sctp_association *asoc, @@ -278,6 +278,7 @@ /* 2nd level prototypes */ void sctp_generate_t3_rtx_event(unsigned long peer); void sctp_generate_heartbeat_event(unsigned long peer); +void sctp_generate_proto_unreach_event(unsigned long peer); void sctp_ootb_pkt_free(struct sctp_packet *); --- linux-ec2-2.6.32.orig/include/net/sctp/sctp.h +++ linux-ec2-2.6.32/include/net/sctp/sctp.h @@ -509,6 +509,11 @@ asoc->pmtu_pending = 0; } +static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) +{ + return !list_empty(&chunk->list); +} + /* Walk through a list of TLV parameters. Don't trust the * individual parameter lengths and instead depend on * the chunk length to indicate when to stop. Make sure --- linux-ec2-2.6.32.orig/include/net/netns/conntrack.h +++ linux-ec2-2.6.32/include/net/netns/conntrack.h @@ -11,6 +11,8 @@ struct netns_ct { atomic_t count; unsigned int expect_count; + unsigned int htable_size; + struct kmem_cache *nf_conntrack_cachep; struct hlist_nulls_head *hash; struct hlist_head *expect_hash; struct hlist_nulls_head unconfirmed; @@ -28,5 +30,6 @@ #endif int hash_vmalloc; int expect_vmalloc; + char *slabname; }; #endif --- linux-ec2-2.6.32.orig/include/net/netns/ipv4.h +++ linux-ec2-2.6.32/include/net/netns/ipv4.h @@ -40,6 +40,7 @@ struct xt_table *iptable_security; struct xt_table *nat_table; struct hlist_head *nat_bysource; + unsigned int nat_htable_size; int nat_vmalloced; #endif --- linux-ec2-2.6.32.orig/include/drm/drm_pciids.h +++ linux-ec2-2.6.32/include/drm/drm_pciids.h @@ -6,6 +6,7 @@ {0x1002, 0x3150, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY}, \ {0x1002, 0x3152, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x3154, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x3155, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x3E50, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_NEW_MEMMAP}, \ {0x1002, 0x3E54, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_NEW_MEMMAP}, \ {0x1002, 0x4136, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS100|RADEON_IS_IGP}, \ @@ -27,7 +28,6 @@ {0x1002, 0x4156, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV350}, \ {0x1002, 0x4237, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS200|RADEON_IS_IGP}, \ {0x1002, 0x4242, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R200}, \ - {0x1002, 0x4243, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R200}, \ {0x1002, 0x4336, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS100|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ {0x1002, 0x4337, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS200|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ {0x1002, 0x4437, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS200|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ @@ -84,7 +84,6 @@ {0x1002, 0x5460, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY}, \ {0x1002, 0x5462, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY}, \ {0x1002, 0x5464, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_IS_MOBILITY}, \ - {0x1002, 0x5657, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV380|RADEON_NEW_MEMMAP}, \ {0x1002, 0x5548, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R423|RADEON_NEW_MEMMAP}, \ {0x1002, 0x5549, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R423|RADEON_NEW_MEMMAP}, \ {0x1002, 0x554A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R423|RADEON_NEW_MEMMAP}, \ @@ -102,6 +101,7 @@ {0x1002, 0x564F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV410|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x5652, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV410|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x5653, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV410|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x5657, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV410|RADEON_NEW_MEMMAP}, \ {0x1002, 0x5834, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS300|RADEON_IS_IGP}, \ {0x1002, 0x5835, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS300|RADEON_IS_IGP|RADEON_IS_MOBILITY}, \ {0x1002, 0x5954, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS480|RADEON_IS_IGP|RADEON_IS_MOBILITY|RADEON_IS_IGPGART}, \ @@ -375,6 +375,7 @@ {0x1002, 0x9712, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9713, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9714, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9715, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0, 0, 0} #define r128_PCI_IDS \ --- linux-ec2-2.6.32.orig/include/drm/radeon_drm.h +++ linux-ec2-2.6.32/include/drm/radeon_drm.h @@ -33,7 +33,7 @@ #ifndef __RADEON_DRM_H__ #define __RADEON_DRM_H__ -#include +#include "drm.h" /* WARNING: If you change any of these defines, make sure to change the * defines in the X server file (radeon_sarea.h) --- linux-ec2-2.6.32.orig/include/drm/drm_mode.h +++ linux-ec2-2.6.32/include/drm/drm_mode.h @@ -27,9 +27,6 @@ #ifndef _DRM_MODE_H #define _DRM_MODE_H -#include -#include - #define DRM_DISPLAY_INFO_LEN 32 #define DRM_CONNECTOR_NAME_LEN 32 #define DRM_DISPLAY_MODE_LEN 32 @@ -78,12 +75,17 @@ #define DRM_MODE_DITHERING_OFF 0 #define DRM_MODE_DITHERING_ON 1 +/* Dirty info options */ +#define DRM_MODE_DIRTY_OFF 0 +#define DRM_MODE_DIRTY_ON 1 +#define DRM_MODE_DIRTY_ANNOTATE 2 + struct drm_mode_modeinfo { __u32 clock; __u16 hdisplay, hsync_start, hsync_end, htotal, hskew; __u16 vdisplay, vsync_start, vsync_end, vtotal, vscan; - __u32 vrefresh; /* vertical refresh * 1000 */ + __u32 vrefresh; __u32 flags; __u32 type; @@ -158,6 +160,7 @@ #define DRM_MODE_CONNECTOR_HDMIA 11 #define DRM_MODE_CONNECTOR_HDMIB 12 #define DRM_MODE_CONNECTOR_TV 13 +#define DRM_MODE_CONNECTOR_eDP 14 struct drm_mode_get_connector { @@ -225,6 +228,47 @@ __u32 handle; }; +#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 +#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 +#define DRM_MODE_FB_DIRTY_FLAGS 0x03 + +#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 + +/* + * Mark a region of a framebuffer as dirty. + * + * Some hardware does not automatically update display contents + * as a hardware or software draw to a framebuffer. This ioctl + * allows userspace to tell the kernel and the hardware what + * regions of the framebuffer have changed. + * + * The kernel or hardware is free to update more then just the + * region specified by the clip rects. The kernel or hardware + * may also delay and/or coalesce several calls to dirty into a + * single update. + * + * Userspace may annotate the updates, the annotates are a + * promise made by the caller that the change is either a copy + * of pixels or a fill of a single color in the region specified. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then + * the number of updated regions are half of num_clips given, + * where the clip rects are paired in src and dst. The width and + * height of each one of the pairs must match. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller + * promises that the region specified of the clip rects is filled + * completely with a single color as given in the color argument. + */ + +struct drm_mode_fb_dirty_cmd { + __u32 fb_id; + __u32 flags; + __u32 color; + __u32 num_clips; + __u64 clips_ptr; +}; + struct drm_mode_mode_cmd { __u32 connector_id; struct drm_mode_modeinfo mode; @@ -268,4 +312,37 @@ __u64 blue; }; +#define DRM_MODE_PAGE_FLIP_EVENT 0x01 +#define DRM_MODE_PAGE_FLIP_FLAGS DRM_MODE_PAGE_FLIP_EVENT + +/* + * Request a page flip on the specified crtc. + * + * This ioctl will ask KMS to schedule a page flip for the specified + * crtc. Once any pending rendering targeting the specified fb (as of + * ioctl time) has completed, the crtc will be reprogrammed to display + * that fb after the next vertical refresh. The ioctl returns + * immediately, but subsequent rendering to the current fb will block + * in the execbuffer ioctl until the page flip happens. If a page + * flip is already pending as the ioctl is called, EBUSY will be + * returned. + * + * The ioctl supports one flag, DRM_MODE_PAGE_FLIP_EVENT, which will + * request that drm sends back a vblank event (see drm.h: struct + * drm_event_vblank) when the page flip is done. The user_data field + * passed in with this ioctl will be returned as the user_data field + * in the vblank event struct. + * + * The reserved field must be zero until we figure out something + * clever to use it for. + */ + +struct drm_mode_crtc_page_flip { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 reserved; + __u64 user_data; +}; + #endif --- linux-ec2-2.6.32.orig/include/drm/drm.h +++ linux-ec2-2.6.32/include/drm/drm.h @@ -36,17 +36,27 @@ #ifndef _DRM_H_ #define _DRM_H_ +#if defined(__linux__) + #include -#include /* For _IO* macros */ -#define DRM_IOCTL_NR(n) _IOC_NR(n) -#define DRM_IOC_VOID _IOC_NONE -#define DRM_IOC_READ _IOC_READ -#define DRM_IOC_WRITE _IOC_WRITE -#define DRM_IOC_READWRITE _IOC_READ|_IOC_WRITE -#define DRM_IOC(dir, group, nr, size) _IOC(dir, group, nr, size) +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ -#define DRM_MAJOR 226 -#define DRM_MAX_MINOR 15 +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef unsigned long drm_handle_t; + +#endif #define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ #define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ @@ -59,7 +69,6 @@ #define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) #define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) -typedef unsigned int drm_handle_t; typedef unsigned int drm_context_t; typedef unsigned int drm_drawable_t; typedef unsigned int drm_magic_t; @@ -454,6 +463,7 @@ enum drm_vblank_seq_type { _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ @@ -461,8 +471,8 @@ }; #define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) -#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_SIGNAL | _DRM_VBLANK_SECONDARY | \ - _DRM_VBLANK_NEXTONMISS) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) struct drm_wait_vblank_request { enum drm_vblank_seq_type type; @@ -686,6 +696,8 @@ #define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) #define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) #define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) /** * Device specific ioctls should only be in their respective headers @@ -698,6 +710,35 @@ #define DRM_COMMAND_BASE 0x40 #define DRM_COMMAND_END 0xA0 +/** + * Header for events written back to userspace on the drm fd. The + * type defines the type of event, the length specifies the total + * length of the event (including the header), and user_data is + * typically a 64 bit value passed with the ioctl that triggered the + * event. A read on the drm fd will always only return complete + * events, that is, if for example the read buffer is 100 bytes, and + * there are two 64 byte events pending, only one will be returned. + * + * Event types 0 - 0x7fffffff are generic drm events, 0x80000000 and + * up are chipset specific. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +#define DRM_EVENT_VBLANK 0x01 +#define DRM_EVENT_FLIP_COMPLETE 0x02 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 reserved; +}; + /* typedef area */ #ifndef __KERNEL__ typedef struct drm_clip_rect drm_clip_rect_t; --- linux-ec2-2.6.32.orig/include/drm/drm_edid.h +++ linux-ec2-2.6.32/include/drm/drm_edid.h @@ -106,6 +106,10 @@ u8 wpindex2[3]; } __attribute__((packed)); +struct cvt_timing { + u8 code[3]; +} __attribute__((packed)); + struct detailed_non_pixel { u8 pad1; u8 type; /* ff=serial, fe=string, fd=monitor range, fc=monitor name @@ -117,9 +121,13 @@ struct detailed_data_monitor_range range; struct detailed_data_wpindex color; struct std_timing timings[5]; + struct cvt_timing cvt[4]; } data; } __attribute__((packed)); +#define EDID_DETAIL_EST_TIMINGS 0xf7 +#define EDID_DETAIL_CVT_3BYTE 0xf8 +#define EDID_DETAIL_COLOR_MGMT_DATA 0xf9 #define EDID_DETAIL_STD_MODES 0xfa #define EDID_DETAIL_MONITOR_CPDATA 0xfb #define EDID_DETAIL_MONITOR_NAME 0xfc --- linux-ec2-2.6.32.orig/include/drm/mga_drm.h +++ linux-ec2-2.6.32/include/drm/mga_drm.h @@ -35,7 +35,7 @@ #ifndef __MGA_DRM_H__ #define __MGA_DRM_H__ -#include +#include "drm.h" /* WARNING: If you change any of these defines, make sure to change the * defines in the Xserver file (mga_sarea.h) --- linux-ec2-2.6.32.orig/include/drm/drm_crtc.h +++ linux-ec2-2.6.32/include/drm/drm_crtc.h @@ -123,7 +123,7 @@ int type; /* Proposed mode values */ - int clock; + int clock; /* in kHz */ int hdisplay; int hsync_start; int hsync_end; @@ -164,8 +164,8 @@ int *private; int private_flags; - int vrefresh; - float hsync; + int vrefresh; /* in Hz */ + int hsync; /* in kHz */ }; enum drm_connector_status { @@ -242,6 +242,21 @@ int (*create_handle)(struct drm_framebuffer *fb, struct drm_file *file_priv, unsigned int *handle); + /** + * Optinal callback for the dirty fb ioctl. + * + * Userspace can notify the driver via this callback + * that a area of the framebuffer has changed and should + * be flushed to the display hardware. + * + * See documentation in drm_mode.h for the struct + * drm_mode_fb_dirty_cmd for more information as all + * the semantics and arguments have a one to one mapping + * on this function. + */ + int (*dirty)(struct drm_framebuffer *framebuffer, unsigned flags, + unsigned color, struct drm_clip_rect *clips, + unsigned num_clips); }; struct drm_framebuffer { @@ -256,7 +271,7 @@ unsigned int depth; int bits_per_pixel; int flags; - void *fbdev; + struct fb_info *fbdev; u32 pseudo_palette[17]; struct list_head filp_head; /* if you are using the helper */ @@ -290,6 +305,7 @@ struct drm_crtc; struct drm_connector; struct drm_encoder; +struct drm_pending_vblank_event; /** * drm_crtc_funcs - control CRTCs for a given device @@ -333,6 +349,19 @@ void (*destroy)(struct drm_crtc *crtc); int (*set_config)(struct drm_mode_set *set); + + /* + * Flip to the given framebuffer. This implements the page + * flip ioctl descibed in drm_mode.h, specifically, the + * implementation must return immediately and block all + * rendering to the current fb until the flip has completed. + * If userspace set the event flag in the ioctl, the event + * argument will point to an event to send back when the flip + * completes, otherwise it will be NULL. + */ + int (*page_flip)(struct drm_crtc *crtc, + struct drm_framebuffer *fb, + struct drm_pending_vblank_event *event); }; /** @@ -596,6 +625,7 @@ /* Optional properties */ struct drm_property *scaling_mode_property; struct drm_property *dithering_mode_property; + struct drm_property *dirty_info_property; }; #define obj_to_crtc(x) container_of(x, struct drm_crtc, base) @@ -667,6 +697,7 @@ extern void drm_mode_prune_invalid(struct drm_device *dev, struct list_head *mode_list, bool verbose); extern void drm_mode_sort(struct list_head *mode_list); +extern int drm_mode_hsync(struct drm_display_mode *mode); extern int drm_mode_vrefresh(struct drm_display_mode *mode); extern void drm_mode_set_crtcinfo(struct drm_display_mode *p, int adjust_flags); @@ -703,6 +734,7 @@ char *formats[]); extern int drm_mode_create_scaling_mode_property(struct drm_device *dev); extern int drm_mode_create_dithering_property(struct drm_device *dev); +extern int drm_mode_create_dirty_info_property(struct drm_device *dev); extern char *drm_get_encoder_name(struct drm_encoder *encoder); extern int drm_mode_connector_attach_encoder(struct drm_connector *connector, @@ -711,7 +743,8 @@ struct drm_encoder *encoder); extern bool drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc, int gamma_size); -extern void *drm_mode_object_find(struct drm_device *dev, uint32_t id, uint32_t type); +extern struct drm_mode_object *drm_mode_object_find(struct drm_device *dev, + uint32_t id, uint32_t type); /* IOCTLs */ extern int drm_mode_getresources(struct drm_device *dev, void *data, struct drm_file *file_priv); @@ -730,6 +763,8 @@ void *data, struct drm_file *file_priv); extern int drm_mode_getfb(struct drm_device *dev, void *data, struct drm_file *file_priv); +extern int drm_mode_dirtyfb_ioctl(struct drm_device *dev, + void *data, struct drm_file *file_priv); extern int drm_mode_addmode_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_mode_rmmode_ioctl(struct drm_device *dev, @@ -756,6 +791,8 @@ extern int drm_mode_gamma_set_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern bool drm_detect_hdmi_monitor(struct edid *edid); +extern int drm_mode_page_flip_ioctl(struct drm_device *dev, + void *data, struct drm_file *file_priv); extern struct drm_display_mode *drm_cvt_mode(struct drm_device *dev, int hdisplay, int vdisplay, int vrefresh, bool reduced, bool interlaced, bool margins); --- linux-ec2-2.6.32.orig/include/drm/Kbuild +++ linux-ec2-2.6.32/include/drm/Kbuild @@ -7,4 +7,6 @@ unifdef-y += radeon_drm.h unifdef-y += sis_drm.h unifdef-y += savage_drm.h +unifdef-y += vmwgfx_drm.h unifdef-y += via_drm.h +unifdef-y += nouveau_drm.h --- linux-ec2-2.6.32.orig/include/drm/drm_mm.h +++ linux-ec2-2.6.32/include/drm/drm_mm.h @@ -44,7 +44,10 @@ struct drm_mm_node { struct list_head fl_entry; struct list_head ml_entry; - int free; + unsigned free : 1; + unsigned scanned_block : 1; + unsigned scanned_prev_free : 1; + unsigned scanned_next_free : 1; unsigned long start; unsigned long size; struct drm_mm *mm; @@ -57,6 +60,11 @@ struct list_head unused_nodes; int num_unused; spinlock_t unused_lock; + unsigned scan_alignment; + unsigned long scan_size; + unsigned long scan_hit_start; + unsigned scan_hit_size; + unsigned scanned_blocks; }; /* @@ -66,6 +74,13 @@ unsigned long size, unsigned alignment, int atomic); +extern struct drm_mm_node *drm_mm_get_block_range_generic( + struct drm_mm_node *node, + unsigned long size, + unsigned alignment, + unsigned long start, + unsigned long end, + int atomic); static inline struct drm_mm_node *drm_mm_get_block(struct drm_mm_node *parent, unsigned long size, unsigned alignment) @@ -78,11 +93,38 @@ { return drm_mm_get_block_generic(parent, size, alignment, 1); } +static inline struct drm_mm_node *drm_mm_get_block_range( + struct drm_mm_node *parent, + unsigned long size, + unsigned alignment, + unsigned long start, + unsigned long end) +{ + return drm_mm_get_block_range_generic(parent, size, alignment, + start, end, 0); +} +static inline struct drm_mm_node *drm_mm_get_block_atomic_range( + struct drm_mm_node *parent, + unsigned long size, + unsigned alignment, + unsigned long start, + unsigned long end) +{ + return drm_mm_get_block_range_generic(parent, size, alignment, + start, end, 1); +} extern void drm_mm_put_block(struct drm_mm_node *cur); extern struct drm_mm_node *drm_mm_search_free(const struct drm_mm *mm, unsigned long size, unsigned alignment, int best_match); +extern struct drm_mm_node *drm_mm_search_free_in_range( + const struct drm_mm *mm, + unsigned long size, + unsigned alignment, + unsigned long start, + unsigned long end, + int best_match); extern int drm_mm_init(struct drm_mm *mm, unsigned long start, unsigned long size); extern void drm_mm_takedown(struct drm_mm *mm); @@ -99,6 +141,12 @@ return block->mm; } +void drm_mm_init_scan(struct drm_mm *mm, unsigned long size, + unsigned alignment); +int drm_mm_scan_add_block(struct drm_mm_node *node); +int drm_mm_scan_remove_block(struct drm_mm_node *node); + +extern void drm_mm_debug_table(struct drm_mm *mm, const char *prefix); #ifdef CONFIG_DEBUG_FS int drm_mm_dump_table(struct seq_file *m, struct drm_mm *mm); #endif --- linux-ec2-2.6.32.orig/include/drm/vmwgfx_drm.h +++ linux-ec2-2.6.32/include/drm/vmwgfx_drm.h @@ -0,0 +1,588 @@ +/************************************************************************** + * + * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef __VMWGFX_DRM_H__ +#define __VMWGFX_DRM_H__ + +#define DRM_VMW_MAX_SURFACE_FACES 6 +#define DRM_VMW_MAX_MIP_LEVELS 24 + +#define DRM_VMW_EXT_NAME_LEN 128 + +#define DRM_VMW_GET_PARAM 0 +#define DRM_VMW_ALLOC_DMABUF 1 +#define DRM_VMW_UNREF_DMABUF 2 +#define DRM_VMW_CURSOR_BYPASS 3 +/* guarded by DRM_VMW_PARAM_NUM_STREAMS != 0*/ +#define DRM_VMW_CONTROL_STREAM 4 +#define DRM_VMW_CLAIM_STREAM 5 +#define DRM_VMW_UNREF_STREAM 6 +/* guarded by DRM_VMW_PARAM_3D == 1 */ +#define DRM_VMW_CREATE_CONTEXT 7 +#define DRM_VMW_UNREF_CONTEXT 8 +#define DRM_VMW_CREATE_SURFACE 9 +#define DRM_VMW_UNREF_SURFACE 10 +#define DRM_VMW_REF_SURFACE 11 +#define DRM_VMW_EXECBUF 12 +#define DRM_VMW_FIFO_DEBUG 13 +#define DRM_VMW_FENCE_WAIT 14 + + +/*************************************************************************/ +/** + * DRM_VMW_GET_PARAM - get device information. + * + * DRM_VMW_PARAM_FIFO_OFFSET: + * Offset to use to map the first page of the FIFO read-only. + * The fifo is mapped using the mmap() system call on the drm device. + * + * DRM_VMW_PARAM_OVERLAY_IOCTL: + * Does the driver support the overlay ioctl. + */ + +#define DRM_VMW_PARAM_NUM_STREAMS 0 +#define DRM_VMW_PARAM_NUM_FREE_STREAMS 1 +#define DRM_VMW_PARAM_3D 2 +#define DRM_VMW_PARAM_FIFO_OFFSET 3 +#define DRM_VMW_PARAM_HW_CAPS 4 +#define DRM_VMW_PARAM_FIFO_CAPS 5 + +/** + * struct drm_vmw_getparam_arg + * + * @value: Returned value. //Out + * @param: Parameter to query. //In. + * + * Argument to the DRM_VMW_GET_PARAM Ioctl. + */ + +struct drm_vmw_getparam_arg { + uint64_t value; + uint32_t param; + uint32_t pad64; +}; + +/*************************************************************************/ +/** + * DRM_VMW_EXTENSION - Query device extensions. + */ + +/** + * struct drm_vmw_extension_rep + * + * @exists: The queried extension exists. + * @driver_ioctl_offset: Ioctl number of the first ioctl in the extension. + * @driver_sarea_offset: Offset to any space in the DRI SAREA + * used by the extension. + * @major: Major version number of the extension. + * @minor: Minor version number of the extension. + * @pl: Patch level version number of the extension. + * + * Output argument to the DRM_VMW_EXTENSION Ioctl. + */ + +struct drm_vmw_extension_rep { + int32_t exists; + uint32_t driver_ioctl_offset; + uint32_t driver_sarea_offset; + uint32_t major; + uint32_t minor; + uint32_t pl; + uint32_t pad64; +}; + +/** + * union drm_vmw_extension_arg + * + * @extension - Ascii name of the extension to be queried. //In + * @rep - Reply as defined above. //Out + * + * Argument to the DRM_VMW_EXTENSION Ioctl. + */ + +union drm_vmw_extension_arg { + char extension[DRM_VMW_EXT_NAME_LEN]; + struct drm_vmw_extension_rep rep; +}; + +/*************************************************************************/ +/** + * DRM_VMW_CREATE_CONTEXT - Create a host context. + * + * Allocates a device unique context id, and queues a create context command + * for the host. Does not wait for host completion. + */ + +/** + * struct drm_vmw_context_arg + * + * @cid: Device unique context ID. + * + * Output argument to the DRM_VMW_CREATE_CONTEXT Ioctl. + * Input argument to the DRM_VMW_UNREF_CONTEXT Ioctl. + */ + +struct drm_vmw_context_arg { + int32_t cid; + uint32_t pad64; +}; + +/*************************************************************************/ +/** + * DRM_VMW_UNREF_CONTEXT - Create a host context. + * + * Frees a global context id, and queues a destroy host command for the host. + * Does not wait for host completion. The context ID can be used directly + * in the command stream and shows up as the same context ID on the host. + */ + +/*************************************************************************/ +/** + * DRM_VMW_CREATE_SURFACE - Create a host suface. + * + * Allocates a device unique surface id, and queues a create surface command + * for the host. Does not wait for host completion. The surface ID can be + * used directly in the command stream and shows up as the same surface + * ID on the host. + */ + +/** + * struct drm_wmv_surface_create_req + * + * @flags: Surface flags as understood by the host. + * @format: Surface format as understood by the host. + * @mip_levels: Number of mip levels for each face. + * An unused face should have 0 encoded. + * @size_addr: Address of a user-space array of sruct drm_vmw_size + * cast to an uint64_t for 32-64 bit compatibility. + * The size of the array should equal the total number of mipmap levels. + * @shareable: Boolean whether other clients (as identified by file descriptors) + * may reference this surface. + * @scanout: Boolean whether the surface is intended to be used as a + * scanout. + * + * Input data to the DRM_VMW_CREATE_SURFACE Ioctl. + * Output data from the DRM_VMW_REF_SURFACE Ioctl. + */ + +struct drm_vmw_surface_create_req { + uint32_t flags; + uint32_t format; + uint32_t mip_levels[DRM_VMW_MAX_SURFACE_FACES]; + uint64_t size_addr; + int32_t shareable; + int32_t scanout; +}; + +/** + * struct drm_wmv_surface_arg + * + * @sid: Surface id of created surface or surface to destroy or reference. + * + * Output data from the DRM_VMW_CREATE_SURFACE Ioctl. + * Input argument to the DRM_VMW_UNREF_SURFACE Ioctl. + * Input argument to the DRM_VMW_REF_SURFACE Ioctl. + */ + +struct drm_vmw_surface_arg { + int32_t sid; + uint32_t pad64; +}; + +/** + * struct drm_vmw_size ioctl. + * + * @width - mip level width + * @height - mip level height + * @depth - mip level depth + * + * Description of a mip level. + * Input data to the DRM_WMW_CREATE_SURFACE Ioctl. + */ + +struct drm_vmw_size { + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t pad64; +}; + +/** + * union drm_vmw_surface_create_arg + * + * @rep: Output data as described above. + * @req: Input data as described above. + * + * Argument to the DRM_VMW_CREATE_SURFACE Ioctl. + */ + +union drm_vmw_surface_create_arg { + struct drm_vmw_surface_arg rep; + struct drm_vmw_surface_create_req req; +}; + +/*************************************************************************/ +/** + * DRM_VMW_REF_SURFACE - Reference a host surface. + * + * Puts a reference on a host surface with a give sid, as previously + * returned by the DRM_VMW_CREATE_SURFACE ioctl. + * A reference will make sure the surface isn't destroyed while we hold + * it and will allow the calling client to use the surface ID in the command + * stream. + * + * On successful return, the Ioctl returns the surface information given + * in the DRM_VMW_CREATE_SURFACE ioctl. + */ + +/** + * union drm_vmw_surface_reference_arg + * + * @rep: Output data as described above. + * @req: Input data as described above. + * + * Argument to the DRM_VMW_REF_SURFACE Ioctl. + */ + +union drm_vmw_surface_reference_arg { + struct drm_vmw_surface_create_req rep; + struct drm_vmw_surface_arg req; +}; + +/*************************************************************************/ +/** + * DRM_VMW_UNREF_SURFACE - Unreference a host surface. + * + * Clear a reference previously put on a host surface. + * When all references are gone, including the one implicitly placed + * on creation, + * a destroy surface command will be queued for the host. + * Does not wait for completion. + */ + +/*************************************************************************/ +/** + * DRM_VMW_EXECBUF + * + * Submit a command buffer for execution on the host, and return a + * fence sequence that when signaled, indicates that the command buffer has + * executed. + */ + +/** + * struct drm_vmw_execbuf_arg + * + * @commands: User-space address of a command buffer cast to an uint64_t. + * @command-size: Size in bytes of the command buffer. + * @throttle-us: Sleep until software is less than @throttle_us + * microseconds ahead of hardware. The driver may round this value + * to the nearest kernel tick. + * @fence_rep: User-space address of a struct drm_vmw_fence_rep cast to an + * uint64_t. + * @version: Allows expanding the execbuf ioctl parameters without breaking + * backwards compatibility, since user-space will always tell the kernel + * which version it uses. + * @flags: Execbuf flags. None currently. + * + * Argument to the DRM_VMW_EXECBUF Ioctl. + */ + +#define DRM_VMW_EXECBUF_VERSION 0 + +struct drm_vmw_execbuf_arg { + uint64_t commands; + uint32_t command_size; + uint32_t throttle_us; + uint64_t fence_rep; + uint32_t version; + uint32_t flags; +}; + +/** + * struct drm_vmw_fence_rep + * + * @fence_seq: Fence sequence associated with a command submission. + * @error: This member should've been set to -EFAULT on submission. + * The following actions should be take on completion: + * error == -EFAULT: Fence communication failed. The host is synchronized. + * Use the last fence id read from the FIFO fence register. + * error != 0 && error != -EFAULT: + * Fence submission failed. The host is synchronized. Use the fence_seq member. + * error == 0: All is OK, The host may not be synchronized. + * Use the fence_seq member. + * + * Input / Output data to the DRM_VMW_EXECBUF Ioctl. + */ + +struct drm_vmw_fence_rep { + uint64_t fence_seq; + int32_t error; + uint32_t pad64; +}; + +/*************************************************************************/ +/** + * DRM_VMW_ALLOC_DMABUF + * + * Allocate a DMA buffer that is visible also to the host. + * NOTE: The buffer is + * identified by a handle and an offset, which are private to the guest, but + * useable in the command stream. The guest kernel may translate these + * and patch up the command stream accordingly. In the future, the offset may + * be zero at all times, or it may disappear from the interface before it is + * fixed. + * + * The DMA buffer may stay user-space mapped in the guest at all times, + * and is thus suitable for sub-allocation. + * + * DMA buffers are mapped using the mmap() syscall on the drm device. + */ + +/** + * struct drm_vmw_alloc_dmabuf_req + * + * @size: Required minimum size of the buffer. + * + * Input data to the DRM_VMW_ALLOC_DMABUF Ioctl. + */ + +struct drm_vmw_alloc_dmabuf_req { + uint32_t size; + uint32_t pad64; +}; + +/** + * struct drm_vmw_dmabuf_rep + * + * @map_handle: Offset to use in the mmap() call used to map the buffer. + * @handle: Handle unique to this buffer. Used for unreferencing. + * @cur_gmr_id: GMR id to use in the command stream when this buffer is + * referenced. See not above. + * @cur_gmr_offset: Offset to use in the command stream when this buffer is + * referenced. See note above. + * + * Output data from the DRM_VMW_ALLOC_DMABUF Ioctl. + */ + +struct drm_vmw_dmabuf_rep { + uint64_t map_handle; + uint32_t handle; + uint32_t cur_gmr_id; + uint32_t cur_gmr_offset; + uint32_t pad64; +}; + +/** + * union drm_vmw_dmabuf_arg + * + * @req: Input data as described above. + * @rep: Output data as described above. + * + * Argument to the DRM_VMW_ALLOC_DMABUF Ioctl. + */ + +union drm_vmw_alloc_dmabuf_arg { + struct drm_vmw_alloc_dmabuf_req req; + struct drm_vmw_dmabuf_rep rep; +}; + +/*************************************************************************/ +/** + * DRM_VMW_UNREF_DMABUF - Free a DMA buffer. + * + */ + +/** + * struct drm_vmw_unref_dmabuf_arg + * + * @handle: Handle indicating what buffer to free. Obtained from the + * DRM_VMW_ALLOC_DMABUF Ioctl. + * + * Argument to the DRM_VMW_UNREF_DMABUF Ioctl. + */ + +struct drm_vmw_unref_dmabuf_arg { + uint32_t handle; + uint32_t pad64; +}; + +/*************************************************************************/ +/** + * DRM_VMW_FIFO_DEBUG - Get last FIFO submission. + * + * This IOCTL copies the last FIFO submission directly out of the FIFO buffer. + */ + +/** + * struct drm_vmw_fifo_debug_arg + * + * @debug_buffer: User space address of a debug_buffer cast to an uint64_t //In + * @debug_buffer_size: Size in bytes of debug buffer //In + * @used_size: Number of bytes copied to the buffer // Out + * @did_not_fit: Boolean indicating that the fifo contents did not fit. //Out + * + * Argument to the DRM_VMW_FIFO_DEBUG Ioctl. + */ + +struct drm_vmw_fifo_debug_arg { + uint64_t debug_buffer; + uint32_t debug_buffer_size; + uint32_t used_size; + int32_t did_not_fit; + uint32_t pad64; +}; + +struct drm_vmw_fence_wait_arg { + uint64_t sequence; + uint64_t kernel_cookie; + int32_t cookie_valid; + int32_t pad64; +}; + +/*************************************************************************/ +/** + * DRM_VMW_CONTROL_STREAM - Control overlays, aka streams. + * + * This IOCTL controls the overlay units of the svga device. + * The SVGA overlay units does not work like regular hardware units in + * that they do not automaticaly read back the contents of the given dma + * buffer. But instead only read back for each call to this ioctl, and + * at any point between this call being made and a following call that + * either changes the buffer or disables the stream. + */ + +/** + * struct drm_vmw_rect + * + * Defines a rectangle. Used in the overlay ioctl to define + * source and destination rectangle. + */ + +struct drm_vmw_rect { + int32_t x; + int32_t y; + uint32_t w; + uint32_t h; +}; + +/** + * struct drm_vmw_control_stream_arg + * + * @stream_id: Stearm to control + * @enabled: If false all following arguments are ignored. + * @handle: Handle to buffer for getting data from. + * @format: Format of the overlay as understood by the host. + * @width: Width of the overlay. + * @height: Height of the overlay. + * @size: Size of the overlay in bytes. + * @pitch: Array of pitches, the two last are only used for YUV12 formats. + * @offset: Offset from start of dma buffer to overlay. + * @src: Source rect, must be within the defined area above. + * @dst: Destination rect, x and y may be negative. + * + * Argument to the DRM_VMW_CONTROL_STREAM Ioctl. + */ + +struct drm_vmw_control_stream_arg { + uint32_t stream_id; + uint32_t enabled; + + uint32_t flags; + uint32_t color_key; + + uint32_t handle; + uint32_t offset; + int32_t format; + uint32_t size; + uint32_t width; + uint32_t height; + uint32_t pitch[3]; + + uint32_t pad64; + struct drm_vmw_rect src; + struct drm_vmw_rect dst; +}; + +/*************************************************************************/ +/** + * DRM_VMW_CURSOR_BYPASS - Give extra information about cursor bypass. + * + */ + +#define DRM_VMW_CURSOR_BYPASS_ALL (1 << 0) +#define DRM_VMW_CURSOR_BYPASS_FLAGS (1) + +/** + * struct drm_vmw_cursor_bypass_arg + * + * @flags: Flags. + * @crtc_id: Crtc id, only used if DMR_CURSOR_BYPASS_ALL isn't passed. + * @xpos: X position of cursor. + * @ypos: Y position of cursor. + * @xhot: X hotspot. + * @yhot: Y hotspot. + * + * Argument to the DRM_VMW_CURSOR_BYPASS Ioctl. + */ + +struct drm_vmw_cursor_bypass_arg { + uint32_t flags; + uint32_t crtc_id; + int32_t xpos; + int32_t ypos; + int32_t xhot; + int32_t yhot; +}; + +/*************************************************************************/ +/** + * DRM_VMW_CLAIM_STREAM - Claim a single stream. + */ + +/** + * struct drm_vmw_context_arg + * + * @stream_id: Device unique context ID. + * + * Output argument to the DRM_VMW_CREATE_CONTEXT Ioctl. + * Input argument to the DRM_VMW_UNREF_CONTEXT Ioctl. + */ + +struct drm_vmw_stream_arg { + uint32_t stream_id; + uint32_t pad64; +}; + +/*************************************************************************/ +/** + * DRM_VMW_UNREF_STREAM - Unclaim a stream. + * + * Return a single stream that was claimed by this process. Also makes + * sure that the stream has been stopped. + */ + +#endif --- linux-ec2-2.6.32.orig/include/drm/drmP.h +++ linux-ec2-2.6.32/include/drm/drmP.h @@ -245,16 +245,6 @@ #endif -#define DRM_PROC_LIMIT (PAGE_SIZE-80) - -#define DRM_PROC_PRINT(fmt, arg...) \ - len += sprintf(&buf[len], fmt , ##arg); \ - if (len > DRM_PROC_LIMIT) { *eof = 1; return len - offset; } - -#define DRM_PROC_PRINT_RET(ret, fmt, arg...) \ - len += sprintf(&buf[len], fmt , ##arg); \ - if (len > DRM_PROC_LIMIT) { ret; *eof = 1; return len - offset; } - /*@}*/ /***********************************************************************/ @@ -265,19 +255,8 @@ #define DRM_LEFTCOUNT(x) (((x)->rp + (x)->count - (x)->wp) % ((x)->count + 1)) #define DRM_BUFCOUNT(x) ((x)->count - DRM_LEFTCOUNT(x)) -#define DRM_WAITCOUNT(dev,idx) DRM_BUFCOUNT(&dev->queuelist[idx]->waitlist) #define DRM_IF_VERSION(maj, min) (maj << 16 | min) -/** - * Get the private SAREA mapping. - * - * \param _dev DRM device. - * \param _ctx context number. - * \param _map output mapping. - */ -#define DRM_GET_PRIV_SAREA(_dev, _ctx, _map) do { \ - (_map) = (_dev)->context_sareas[_ctx]; \ -} while(0) /** * Test that the hardware lock is held by the caller, returning otherwise. @@ -297,18 +276,6 @@ } while (0) /** - * Copy and IOCTL return string to user space - */ -#define DRM_COPY( name, value ) \ - len = strlen( value ); \ - if ( len > name##_len ) len = name##_len; \ - name##_len = strlen( value ); \ - if ( len && name ) { \ - if ( copy_to_user( name, value, len ) ) \ - return -EFAULT; \ - } - -/** * Ioctl function type. * * \param inode device inode. @@ -322,10 +289,14 @@ typedef int drm_ioctl_compat_t(struct file *filp, unsigned int cmd, unsigned long arg); +#define DRM_IOCTL_NR(n) _IOC_NR(n) +#define DRM_MAJOR 226 + #define DRM_AUTH 0x1 #define DRM_MASTER 0x2 #define DRM_ROOT_ONLY 0x4 #define DRM_CONTROL_ALLOW 0x8 +#define DRM_UNLOCKED 0x10 struct drm_ioctl_desc { unsigned int cmd; @@ -426,6 +397,14 @@ struct drm_freelist freelist; }; +/* Event queued up for userspace to read */ +struct drm_pending_event { + struct drm_event *event; + struct list_head link; + struct drm_file *file_priv; + void (*destroy)(struct drm_pending_event *event); +}; + /** File private data */ struct drm_file { int authenticated; @@ -449,6 +428,10 @@ struct drm_master *master; /* master this node is currently associated with N.B. not always minor->master */ struct list_head fbs; + + wait_queue_head_t event_wait; + struct list_head event_list; + int event_space; }; /** Wait queue */ @@ -795,6 +778,15 @@ /* Master routines */ int (*master_create)(struct drm_device *dev, struct drm_master *master); void (*master_destroy)(struct drm_device *dev, struct drm_master *master); + /** + * master_set is called whenever the minor master is set. + * master_drop is called whenever the minor master is dropped. + */ + + int (*master_set)(struct drm_device *dev, struct drm_file *file_priv, + bool from_open); + void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv, + bool from_release); int (*proc_init)(struct drm_minor *minor); void (*proc_cleanup)(struct drm_minor *minor); @@ -900,6 +892,12 @@ struct drm_mode_group mode_group; }; +struct drm_pending_vblank_event { + struct drm_pending_event base; + int pipe; + struct drm_event_vblank event; +}; + /** * DRM device structure. This structure represent a complete card that * may contain multiple heads. @@ -999,6 +997,12 @@ u32 max_vblank_count; /**< size of vblank counter register */ + /** + * List of events + */ + struct list_head vblank_event_list; + spinlock_t event_lock; + /*@} */ cycles_t ctx_start; cycles_t lck_start; @@ -1016,7 +1020,7 @@ struct pci_controller *hose; #endif struct drm_sg_mem *sg; /**< Scatter gather memory */ - int num_crtcs; /**< Number of CRTCs on this device */ + unsigned int num_crtcs; /**< Number of CRTCs on this device */ void *dev_private; /**< device private data */ void *mm_private; struct address_space *dev_mapping; @@ -1125,8 +1129,8 @@ /* Driver support (drm_drv.h) */ extern int drm_init(struct drm_driver *driver); extern void drm_exit(struct drm_driver *driver); -extern int drm_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); +extern long drm_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg); extern long drm_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern int drm_lastclose(struct drm_device *dev); @@ -1135,6 +1139,8 @@ extern int drm_open(struct inode *inode, struct file *filp); extern int drm_stub_open(struct inode *inode, struct file *filp); extern int drm_fasync(int fd, struct file *filp, int on); +extern ssize_t drm_read(struct file *filp, char __user *buffer, + size_t count, loff_t *offset); extern int drm_release(struct inode *inode, struct file *filp); /* Mapping support (drm_vm.h) */ @@ -1221,6 +1227,7 @@ struct drm_file *file_priv); extern int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); +extern int drm_remove_magic(struct drm_master *master, drm_magic_t magic); /* Cache management (drm_cache.c) */ void drm_clflush_pages(struct page *pages[], unsigned long num_pages); @@ -1295,6 +1302,7 @@ extern void drm_handle_vblank(struct drm_device *dev, int crtc); extern int drm_vblank_get(struct drm_device *dev, int crtc); extern void drm_vblank_put(struct drm_device *dev, int crtc); +extern void drm_vblank_off(struct drm_device *dev, int crtc); extern void drm_vblank_cleanup(struct drm_device *dev); /* Modesetting support */ extern void drm_vblank_pre_modeset(struct drm_device *dev, int crtc); @@ -1401,7 +1409,7 @@ struct drm_ati_pcigart_info * gart_info); extern drm_dma_handle_t *drm_pci_alloc(struct drm_device *dev, size_t size, - size_t align, dma_addr_t maxaddr); + size_t align); extern void __drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); extern void drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); @@ -1519,14 +1527,27 @@ static __inline__ void *drm_calloc_large(size_t nmemb, size_t size) { + if (size != 0 && nmemb > ULONG_MAX / size) + return NULL; + if (size * nmemb <= PAGE_SIZE) return kcalloc(nmemb, size, GFP_KERNEL); + return __vmalloc(size * nmemb, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); +} + +/* Modeled after cairo's malloc_ab, it's like calloc but without the zeroing. */ +static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size) +{ if (size != 0 && nmemb > ULONG_MAX / size) return NULL; + if (size * nmemb <= PAGE_SIZE) + return kmalloc(nmemb * size, GFP_KERNEL); + return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); } static __inline void drm_free_large(void *ptr) --- linux-ec2-2.6.32.orig/include/drm/drm_os_linux.h +++ linux-ec2-2.6.32/include/drm/drm_os_linux.h @@ -123,5 +123,5 @@ remove_wait_queue(&(queue), &entry); \ } while (0) -#define DRM_WAKEUP( queue ) wake_up_interruptible( queue ) +#define DRM_WAKEUP( queue ) wake_up( queue ) #define DRM_INIT_WAITQUEUE( queue ) init_waitqueue_head( queue ) --- linux-ec2-2.6.32.orig/include/drm/i915_drm.h +++ linux-ec2-2.6.32/include/drm/i915_drm.h @@ -27,11 +27,11 @@ #ifndef _I915_DRM_H_ #define _I915_DRM_H_ +#include "drm.h" + /* Please note that modifications to all structs defined here are * subject to backwards-compatibility constraints. */ -#include -#include "drm.h" /* Each region is a minimum of 16k, and there are at most 255 of them. */ @@ -186,6 +186,9 @@ #define DRM_I915_GEM_MMAP_GTT 0x24 #define DRM_I915_GET_PIPE_FROM_CRTC_ID 0x25 #define DRM_I915_GEM_MADVISE 0x26 +#define DRM_I915_OVERLAY_PUT_IMAGE 0x27 +#define DRM_I915_OVERLAY_ATTRS 0x28 +#define DRM_I915_GEM_EXECBUFFER2 0x29 #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) @@ -205,6 +208,7 @@ #define DRM_IOCTL_I915_VBLANK_SWAP DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_VBLANK_SWAP, drm_i915_vblank_swap_t) #define DRM_IOCTL_I915_GEM_INIT DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init) #define DRM_IOCTL_I915_GEM_EXECBUFFER DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer) +#define DRM_IOCTL_I915_GEM_EXECBUFFER2 DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2) #define DRM_IOCTL_I915_GEM_PIN DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin) #define DRM_IOCTL_I915_GEM_UNPIN DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin) #define DRM_IOCTL_I915_GEM_BUSY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy) @@ -221,8 +225,10 @@ #define DRM_IOCTL_I915_GEM_SET_TILING DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_SET_TILING, struct drm_i915_gem_set_tiling) #define DRM_IOCTL_I915_GEM_GET_TILING DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_GET_TILING, struct drm_i915_gem_get_tiling) #define DRM_IOCTL_I915_GEM_GET_APERTURE DRM_IOR (DRM_COMMAND_BASE + DRM_I915_GEM_GET_APERTURE, struct drm_i915_gem_get_aperture) -#define DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_PIPE_FROM_CRTC_ID, struct drm_intel_get_pipe_from_crtc_id) +#define DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_PIPE_FROM_CRTC_ID, struct drm_i915_get_pipe_from_crtc_id) #define DRM_IOCTL_I915_GEM_MADVISE DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise) +#define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE DRM_IOW(DRM_COMMAND_BASE + DRM_IOCTL_I915_OVERLAY_ATTRS, struct drm_intel_overlay_put_image) +#define DRM_IOCTL_I915_OVERLAY_ATTRS DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs) /* Allow drivers to submit batchbuffers directly to hardware, relying * on the security mechanisms provided by hardware. @@ -266,6 +272,9 @@ #define I915_PARAM_CHIPSET_ID 4 #define I915_PARAM_HAS_GEM 5 #define I915_PARAM_NUM_FENCES_AVAIL 6 +#define I915_PARAM_HAS_OVERLAY 7 +#define I915_PARAM_HAS_PAGEFLIPPING 8 +#define I915_PARAM_HAS_EXECBUF2 9 typedef struct drm_i915_getparam { int param; @@ -561,6 +570,57 @@ __u64 cliprects_ptr; }; +struct drm_i915_gem_exec_object2 { + /** + * User's handle for a buffer to be bound into the GTT for this + * operation. + */ + __u32 handle; + + /** Number of relocations to be performed on this buffer */ + __u32 relocation_count; + /** + * Pointer to array of struct drm_i915_gem_relocation_entry containing + * the relocations to be performed in this buffer. + */ + __u64 relocs_ptr; + + /** Required alignment in graphics aperture */ + __u64 alignment; + + /** + * Returned value of the updated offset of the object, for future + * presumed_offset writes. + */ + __u64 offset; + +#define EXEC_OBJECT_NEEDS_FENCE (1<<0) + __u64 flags; + __u64 rsvd1; + __u64 rsvd2; +}; + +struct drm_i915_gem_execbuffer2 { + /** + * List of gem_exec_object2 structs + */ + __u64 buffers_ptr; + __u32 buffer_count; + + /** Offset in the batchbuffer to start execution from. */ + __u32 batch_start_offset; + /** Bytes used in batchbuffer from batch_start_offset */ + __u32 batch_len; + __u32 DR1; + __u32 DR4; + __u32 num_cliprects; + /** This is a struct drm_clip_rect *cliprects */ + __u64 cliprects_ptr; + __u64 flags; /* currently unused */ + __u64 rsvd1; + __u64 rsvd2; +}; + struct drm_i915_gem_pin { /** Handle of the buffer to be pinned. */ __u32 handle; @@ -686,4 +746,70 @@ __u32 retained; }; +/* flags */ +#define I915_OVERLAY_TYPE_MASK 0xff +#define I915_OVERLAY_YUV_PLANAR 0x01 +#define I915_OVERLAY_YUV_PACKED 0x02 +#define I915_OVERLAY_RGB 0x03 + +#define I915_OVERLAY_DEPTH_MASK 0xff00 +#define I915_OVERLAY_RGB24 0x1000 +#define I915_OVERLAY_RGB16 0x2000 +#define I915_OVERLAY_RGB15 0x3000 +#define I915_OVERLAY_YUV422 0x0100 +#define I915_OVERLAY_YUV411 0x0200 +#define I915_OVERLAY_YUV420 0x0300 +#define I915_OVERLAY_YUV410 0x0400 + +#define I915_OVERLAY_SWAP_MASK 0xff0000 +#define I915_OVERLAY_NO_SWAP 0x000000 +#define I915_OVERLAY_UV_SWAP 0x010000 +#define I915_OVERLAY_Y_SWAP 0x020000 +#define I915_OVERLAY_Y_AND_UV_SWAP 0x030000 + +#define I915_OVERLAY_FLAGS_MASK 0xff000000 +#define I915_OVERLAY_ENABLE 0x01000000 + +struct drm_intel_overlay_put_image { + /* various flags and src format description */ + __u32 flags; + /* source picture description */ + __u32 bo_handle; + /* stride values and offsets are in bytes, buffer relative */ + __u16 stride_Y; /* stride for packed formats */ + __u16 stride_UV; + __u32 offset_Y; /* offset for packet formats */ + __u32 offset_U; + __u32 offset_V; + /* in pixels */ + __u16 src_width; + __u16 src_height; + /* to compensate the scaling factors for partially covered surfaces */ + __u16 src_scan_width; + __u16 src_scan_height; + /* output crtc description */ + __u32 crtc_id; + __u16 dst_x; + __u16 dst_y; + __u16 dst_width; + __u16 dst_height; +}; + +/* flags */ +#define I915_OVERLAY_UPDATE_ATTRS (1<<0) +#define I915_OVERLAY_UPDATE_GAMMA (1<<1) +struct drm_intel_overlay_attrs { + __u32 flags; + __u32 color_key; + __s32 brightness; + __u32 contrast; + __u32 saturation; + __u32 gamma0; + __u32 gamma1; + __u32 gamma2; + __u32 gamma3; + __u32 gamma4; + __u32 gamma5; +}; + #endif /* _I915_DRM_H_ */ --- linux-ec2-2.6.32.orig/include/drm/via_drm.h +++ linux-ec2-2.6.32/include/drm/via_drm.h @@ -24,7 +24,7 @@ #ifndef _VIA_DRM_H_ #define _VIA_DRM_H_ -#include +#include "drm.h" /* WARNING: These defines must be the same as what the Xserver uses. * if you change them, you must change the defines in the Xserver. --- linux-ec2-2.6.32.orig/include/drm/nouveau_drm.h +++ linux-ec2-2.6.32/include/drm/nouveau_drm.h @@ -0,0 +1,221 @@ +/* + * Copyright 2005 Stephane Marchesin. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __NOUVEAU_DRM_H__ +#define __NOUVEAU_DRM_H__ + +#define NOUVEAU_DRM_HEADER_PATCHLEVEL 15 + +struct drm_nouveau_channel_alloc { + uint32_t fb_ctxdma_handle; + uint32_t tt_ctxdma_handle; + + int channel; + + /* Notifier memory */ + uint32_t notifier_handle; + + /* DRM-enforced subchannel assignments */ + struct { + uint32_t handle; + uint32_t grclass; + } subchan[8]; + uint32_t nr_subchan; +}; + +struct drm_nouveau_channel_free { + int channel; +}; + +struct drm_nouveau_grobj_alloc { + int channel; + uint32_t handle; + int class; +}; + +struct drm_nouveau_notifierobj_alloc { + uint32_t channel; + uint32_t handle; + uint32_t size; + uint32_t offset; +}; + +struct drm_nouveau_gpuobj_free { + int channel; + uint32_t handle; +}; + +/* FIXME : maybe unify {GET,SET}PARAMs */ +#define NOUVEAU_GETPARAM_PCI_VENDOR 3 +#define NOUVEAU_GETPARAM_PCI_DEVICE 4 +#define NOUVEAU_GETPARAM_BUS_TYPE 5 +#define NOUVEAU_GETPARAM_FB_PHYSICAL 6 +#define NOUVEAU_GETPARAM_AGP_PHYSICAL 7 +#define NOUVEAU_GETPARAM_FB_SIZE 8 +#define NOUVEAU_GETPARAM_AGP_SIZE 9 +#define NOUVEAU_GETPARAM_PCI_PHYSICAL 10 +#define NOUVEAU_GETPARAM_CHIPSET_ID 11 +#define NOUVEAU_GETPARAM_VM_VRAM_BASE 12 +#define NOUVEAU_GETPARAM_GRAPH_UNITS 13 +struct drm_nouveau_getparam { + uint64_t param; + uint64_t value; +}; + +struct drm_nouveau_setparam { + uint64_t param; + uint64_t value; +}; + +#define NOUVEAU_GEM_DOMAIN_CPU (1 << 0) +#define NOUVEAU_GEM_DOMAIN_VRAM (1 << 1) +#define NOUVEAU_GEM_DOMAIN_GART (1 << 2) +#define NOUVEAU_GEM_DOMAIN_MAPPABLE (1 << 3) + +struct drm_nouveau_gem_info { + uint32_t handle; + uint32_t domain; + uint64_t size; + uint64_t offset; + uint64_t map_handle; + uint32_t tile_mode; + uint32_t tile_flags; +}; + +struct drm_nouveau_gem_new { + struct drm_nouveau_gem_info info; + uint32_t channel_hint; + uint32_t align; +}; + +struct drm_nouveau_gem_pushbuf_bo { + uint64_t user_priv; + uint32_t handle; + uint32_t read_domains; + uint32_t write_domains; + uint32_t valid_domains; + uint32_t presumed_ok; + uint32_t presumed_domain; + uint64_t presumed_offset; +}; + +#define NOUVEAU_GEM_RELOC_LOW (1 << 0) +#define NOUVEAU_GEM_RELOC_HIGH (1 << 1) +#define NOUVEAU_GEM_RELOC_OR (1 << 2) +struct drm_nouveau_gem_pushbuf_reloc { + uint32_t bo_index; + uint32_t reloc_index; + uint32_t flags; + uint32_t data; + uint32_t vor; + uint32_t tor; +}; + +#define NOUVEAU_GEM_MAX_BUFFERS 1024 +#define NOUVEAU_GEM_MAX_RELOCS 1024 + +struct drm_nouveau_gem_pushbuf { + uint32_t channel; + uint32_t nr_dwords; + uint32_t nr_buffers; + uint32_t nr_relocs; + uint64_t dwords; + uint64_t buffers; + uint64_t relocs; +}; + +struct drm_nouveau_gem_pushbuf_call { + uint32_t channel; + uint32_t handle; + uint32_t offset; + uint32_t nr_buffers; + uint32_t nr_relocs; + uint32_t nr_dwords; + uint64_t buffers; + uint64_t relocs; + uint32_t suffix0; + uint32_t suffix1; + /* below only accessed for CALL2 */ + uint64_t vram_available; + uint64_t gart_available; +}; + +struct drm_nouveau_gem_pin { + uint32_t handle; + uint32_t domain; + uint64_t offset; +}; + +struct drm_nouveau_gem_unpin { + uint32_t handle; +}; + +#define NOUVEAU_GEM_CPU_PREP_NOWAIT 0x00000001 +#define NOUVEAU_GEM_CPU_PREP_NOBLOCK 0x00000002 +#define NOUVEAU_GEM_CPU_PREP_WRITE 0x00000004 +struct drm_nouveau_gem_cpu_prep { + uint32_t handle; + uint32_t flags; +}; + +struct drm_nouveau_gem_cpu_fini { + uint32_t handle; +}; + +struct drm_nouveau_gem_tile { + uint32_t handle; + uint32_t offset; + uint32_t size; + uint32_t tile_mode; + uint32_t tile_flags; +}; + +enum nouveau_bus_type { + NV_AGP = 0, + NV_PCI = 1, + NV_PCIE = 2, +}; + +struct drm_nouveau_sarea { +}; + +#define DRM_NOUVEAU_CARD_INIT 0x00 +#define DRM_NOUVEAU_GETPARAM 0x01 +#define DRM_NOUVEAU_SETPARAM 0x02 +#define DRM_NOUVEAU_CHANNEL_ALLOC 0x03 +#define DRM_NOUVEAU_CHANNEL_FREE 0x04 +#define DRM_NOUVEAU_GROBJ_ALLOC 0x05 +#define DRM_NOUVEAU_NOTIFIEROBJ_ALLOC 0x06 +#define DRM_NOUVEAU_GPUOBJ_FREE 0x07 +#define DRM_NOUVEAU_GEM_NEW 0x40 +#define DRM_NOUVEAU_GEM_PUSHBUF 0x41 +#define DRM_NOUVEAU_GEM_PUSHBUF_CALL 0x42 +#define DRM_NOUVEAU_GEM_PIN 0x43 /* !KMS only */ +#define DRM_NOUVEAU_GEM_UNPIN 0x44 /* !KMS only */ +#define DRM_NOUVEAU_GEM_CPU_PREP 0x45 +#define DRM_NOUVEAU_GEM_CPU_FINI 0x46 +#define DRM_NOUVEAU_GEM_INFO 0x47 +#define DRM_NOUVEAU_GEM_PUSHBUF_CALL2 0x48 + +#endif /* __NOUVEAU_DRM_H__ */ --- linux-ec2-2.6.32.orig/include/drm/drm_dp_helper.h +++ linux-ec2-2.6.32/include/drm/drm_dp_helper.h @@ -0,0 +1,180 @@ +/* + * Copyright © 2008 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +#ifndef _DRM_DP_HELPER_H_ +#define _DRM_DP_HELPER_H_ + +/* From the VESA DisplayPort spec */ + +#define AUX_NATIVE_WRITE 0x8 +#define AUX_NATIVE_READ 0x9 +#define AUX_I2C_WRITE 0x0 +#define AUX_I2C_READ 0x1 +#define AUX_I2C_STATUS 0x2 +#define AUX_I2C_MOT 0x4 + +#define AUX_NATIVE_REPLY_ACK (0x0 << 4) +#define AUX_NATIVE_REPLY_NACK (0x1 << 4) +#define AUX_NATIVE_REPLY_DEFER (0x2 << 4) +#define AUX_NATIVE_REPLY_MASK (0x3 << 4) + +#define AUX_I2C_REPLY_ACK (0x0 << 6) +#define AUX_I2C_REPLY_NACK (0x1 << 6) +#define AUX_I2C_REPLY_DEFER (0x2 << 6) +#define AUX_I2C_REPLY_MASK (0x3 << 6) + +/* AUX CH addresses */ +/* DPCD */ +#define DP_DPCD_REV 0x000 + +#define DP_MAX_LINK_RATE 0x001 + +#define DP_MAX_LANE_COUNT 0x002 +# define DP_MAX_LANE_COUNT_MASK 0x1f +# define DP_ENHANCED_FRAME_CAP (1 << 7) + +#define DP_MAX_DOWNSPREAD 0x003 +# define DP_NO_AUX_HANDSHAKE_LINK_TRAINING (1 << 6) + +#define DP_NORP 0x004 + +#define DP_DOWNSTREAMPORT_PRESENT 0x005 +# define DP_DWN_STRM_PORT_PRESENT (1 << 0) +# define DP_DWN_STRM_PORT_TYPE_MASK 0x06 +/* 00b = DisplayPort */ +/* 01b = Analog */ +/* 10b = TMDS or HDMI */ +/* 11b = Other */ +# define DP_FORMAT_CONVERSION (1 << 3) + +#define DP_MAIN_LINK_CHANNEL_CODING 0x006 + +/* link configuration */ +#define DP_LINK_BW_SET 0x100 +# define DP_LINK_BW_1_62 0x06 +# define DP_LINK_BW_2_7 0x0a + +#define DP_LANE_COUNT_SET 0x101 +# define DP_LANE_COUNT_MASK 0x0f +# define DP_LANE_COUNT_ENHANCED_FRAME_EN (1 << 7) + +#define DP_TRAINING_PATTERN_SET 0x102 +# define DP_TRAINING_PATTERN_DISABLE 0 +# define DP_TRAINING_PATTERN_1 1 +# define DP_TRAINING_PATTERN_2 2 +# define DP_TRAINING_PATTERN_MASK 0x3 + +# define DP_LINK_QUAL_PATTERN_DISABLE (0 << 2) +# define DP_LINK_QUAL_PATTERN_D10_2 (1 << 2) +# define DP_LINK_QUAL_PATTERN_ERROR_RATE (2 << 2) +# define DP_LINK_QUAL_PATTERN_PRBS7 (3 << 2) +# define DP_LINK_QUAL_PATTERN_MASK (3 << 2) + +# define DP_RECOVERED_CLOCK_OUT_EN (1 << 4) +# define DP_LINK_SCRAMBLING_DISABLE (1 << 5) + +# define DP_SYMBOL_ERROR_COUNT_BOTH (0 << 6) +# define DP_SYMBOL_ERROR_COUNT_DISPARITY (1 << 6) +# define DP_SYMBOL_ERROR_COUNT_SYMBOL (2 << 6) +# define DP_SYMBOL_ERROR_COUNT_MASK (3 << 6) + +#define DP_TRAINING_LANE0_SET 0x103 +#define DP_TRAINING_LANE1_SET 0x104 +#define DP_TRAINING_LANE2_SET 0x105 +#define DP_TRAINING_LANE3_SET 0x106 + +# define DP_TRAIN_VOLTAGE_SWING_MASK 0x3 +# define DP_TRAIN_VOLTAGE_SWING_SHIFT 0 +# define DP_TRAIN_MAX_SWING_REACHED (1 << 2) +# define DP_TRAIN_VOLTAGE_SWING_400 (0 << 0) +# define DP_TRAIN_VOLTAGE_SWING_600 (1 << 0) +# define DP_TRAIN_VOLTAGE_SWING_800 (2 << 0) +# define DP_TRAIN_VOLTAGE_SWING_1200 (3 << 0) + +# define DP_TRAIN_PRE_EMPHASIS_MASK (3 << 3) +# define DP_TRAIN_PRE_EMPHASIS_0 (0 << 3) +# define DP_TRAIN_PRE_EMPHASIS_3_5 (1 << 3) +# define DP_TRAIN_PRE_EMPHASIS_6 (2 << 3) +# define DP_TRAIN_PRE_EMPHASIS_9_5 (3 << 3) + +# define DP_TRAIN_PRE_EMPHASIS_SHIFT 3 +# define DP_TRAIN_MAX_PRE_EMPHASIS_REACHED (1 << 5) + +#define DP_DOWNSPREAD_CTRL 0x107 +# define DP_SPREAD_AMP_0_5 (1 << 4) + +#define DP_MAIN_LINK_CHANNEL_CODING_SET 0x108 +# define DP_SET_ANSI_8B10B (1 << 0) + +#define DP_LANE0_1_STATUS 0x202 +#define DP_LANE2_3_STATUS 0x203 +# define DP_LANE_CR_DONE (1 << 0) +# define DP_LANE_CHANNEL_EQ_DONE (1 << 1) +# define DP_LANE_SYMBOL_LOCKED (1 << 2) + +#define DP_CHANNEL_EQ_BITS (DP_LANE_CR_DONE | \ + DP_LANE_CHANNEL_EQ_DONE | \ + DP_LANE_SYMBOL_LOCKED) + +#define DP_LANE_ALIGN_STATUS_UPDATED 0x204 + +#define DP_INTERLANE_ALIGN_DONE (1 << 0) +#define DP_DOWNSTREAM_PORT_STATUS_CHANGED (1 << 6) +#define DP_LINK_STATUS_UPDATED (1 << 7) + +#define DP_SINK_STATUS 0x205 + +#define DP_RECEIVE_PORT_0_STATUS (1 << 0) +#define DP_RECEIVE_PORT_1_STATUS (1 << 1) + +#define DP_ADJUST_REQUEST_LANE0_1 0x206 +#define DP_ADJUST_REQUEST_LANE2_3 0x207 +# define DP_ADJUST_VOLTAGE_SWING_LANE0_MASK 0x03 +# define DP_ADJUST_VOLTAGE_SWING_LANE0_SHIFT 0 +# define DP_ADJUST_PRE_EMPHASIS_LANE0_MASK 0x0c +# define DP_ADJUST_PRE_EMPHASIS_LANE0_SHIFT 2 +# define DP_ADJUST_VOLTAGE_SWING_LANE1_MASK 0x30 +# define DP_ADJUST_VOLTAGE_SWING_LANE1_SHIFT 4 +# define DP_ADJUST_PRE_EMPHASIS_LANE1_MASK 0xc0 +# define DP_ADJUST_PRE_EMPHASIS_LANE1_SHIFT 6 + +#define DP_SET_POWER 0x600 +# define DP_SET_POWER_D0 0x1 +# define DP_SET_POWER_D3 0x2 + +#define MODE_I2C_START 1 +#define MODE_I2C_WRITE 2 +#define MODE_I2C_READ 4 +#define MODE_I2C_STOP 8 + +struct i2c_algo_dp_aux_data { + bool running; + u16 address; + int (*aux_ch) (struct i2c_adapter *adapter, + int mode, uint8_t write_byte, + uint8_t *read_byte); +}; + +int +i2c_dp_aux_add_bus(struct i2c_adapter *adapter); + +#endif /* _DRM_DP_HELPER_H_ */ --- linux-ec2-2.6.32.orig/include/drm/ttm/ttm_lock.h +++ linux-ec2-2.6.32/include/drm/ttm/ttm_lock.h @@ -0,0 +1,247 @@ +/************************************************************************** + * + * Copyright (c) 2007-2009 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +/* + * Authors: Thomas Hellstrom + */ + +/** @file ttm_lock.h + * This file implements a simple replacement for the buffer manager use + * of the DRM heavyweight hardware lock. + * The lock is a read-write lock. Taking it in read mode and write mode + * is relatively fast, and intended for in-kernel use only. + * + * The vt mode is used only when there is a need to block all + * user-space processes from validating buffers. + * It's allowed to leave kernel space with the vt lock held. + * If a user-space process dies while having the vt-lock, + * it will be released during the file descriptor release. The vt lock + * excludes write lock and read lock. + * + * The suspend mode is used to lock out all TTM users when preparing for + * and executing suspend operations. + * + */ + +#ifndef _TTM_LOCK_H_ +#define _TTM_LOCK_H_ + +#include "ttm/ttm_object.h" +#include +#include + +/** + * struct ttm_lock + * + * @base: ttm base object used solely to release the lock if the client + * holding the lock dies. + * @queue: Queue for processes waiting for lock change-of-status. + * @lock: Spinlock protecting some lock members. + * @rw: Read-write lock counter. Protected by @lock. + * @flags: Lock state. Protected by @lock. + * @kill_takers: Boolean whether to kill takers of the lock. + * @signal: Signal to send when kill_takers is true. + */ + +struct ttm_lock { + struct ttm_base_object base; + wait_queue_head_t queue; + spinlock_t lock; + int32_t rw; + uint32_t flags; + bool kill_takers; + int signal; + struct ttm_object_file *vt_holder; +}; + + +/** + * ttm_lock_init + * + * @lock: Pointer to a struct ttm_lock + * Initializes the lock. + */ +extern void ttm_lock_init(struct ttm_lock *lock); + +/** + * ttm_read_unlock + * + * @lock: Pointer to a struct ttm_lock + * + * Releases a read lock. + */ +extern void ttm_read_unlock(struct ttm_lock *lock); + +/** + * ttm_read_lock + * + * @lock: Pointer to a struct ttm_lock + * @interruptible: Interruptible sleeping while waiting for a lock. + * + * Takes the lock in read mode. + * Returns: + * -ERESTARTSYS If interrupted by a signal and interruptible is true. + */ +extern int ttm_read_lock(struct ttm_lock *lock, bool interruptible); + +/** + * ttm_read_trylock + * + * @lock: Pointer to a struct ttm_lock + * @interruptible: Interruptible sleeping while waiting for a lock. + * + * Tries to take the lock in read mode. If the lock is already held + * in write mode, the function will return -EBUSY. If the lock is held + * in vt or suspend mode, the function will sleep until these modes + * are unlocked. + * + * Returns: + * -EBUSY The lock was already held in write mode. + * -ERESTARTSYS If interrupted by a signal and interruptible is true. + */ +extern int ttm_read_trylock(struct ttm_lock *lock, bool interruptible); + +/** + * ttm_write_unlock + * + * @lock: Pointer to a struct ttm_lock + * + * Releases a write lock. + */ +extern void ttm_write_unlock(struct ttm_lock *lock); + +/** + * ttm_write_lock + * + * @lock: Pointer to a struct ttm_lock + * @interruptible: Interruptible sleeping while waiting for a lock. + * + * Takes the lock in write mode. + * Returns: + * -ERESTARTSYS If interrupted by a signal and interruptible is true. + */ +extern int ttm_write_lock(struct ttm_lock *lock, bool interruptible); + +/** + * ttm_lock_downgrade + * + * @lock: Pointer to a struct ttm_lock + * + * Downgrades a write lock to a read lock. + */ +extern void ttm_lock_downgrade(struct ttm_lock *lock); + +/** + * ttm_suspend_lock + * + * @lock: Pointer to a struct ttm_lock + * + * Takes the lock in suspend mode. Excludes read and write mode. + */ +extern void ttm_suspend_lock(struct ttm_lock *lock); + +/** + * ttm_suspend_unlock + * + * @lock: Pointer to a struct ttm_lock + * + * Releases a suspend lock + */ +extern void ttm_suspend_unlock(struct ttm_lock *lock); + +/** + * ttm_vt_lock + * + * @lock: Pointer to a struct ttm_lock + * @interruptible: Interruptible sleeping while waiting for a lock. + * @tfile: Pointer to a struct ttm_object_file to register the lock with. + * + * Takes the lock in vt mode. + * Returns: + * -ERESTARTSYS If interrupted by a signal and interruptible is true. + * -ENOMEM: Out of memory when locking. + */ +extern int ttm_vt_lock(struct ttm_lock *lock, bool interruptible, + struct ttm_object_file *tfile); + +/** + * ttm_vt_unlock + * + * @lock: Pointer to a struct ttm_lock + * + * Releases a vt lock. + * Returns: + * -EINVAL If the lock was not held. + */ +extern int ttm_vt_unlock(struct ttm_lock *lock); + +/** + * ttm_write_unlock + * + * @lock: Pointer to a struct ttm_lock + * + * Releases a write lock. + */ +extern void ttm_write_unlock(struct ttm_lock *lock); + +/** + * ttm_write_lock + * + * @lock: Pointer to a struct ttm_lock + * @interruptible: Interruptible sleeping while waiting for a lock. + * + * Takes the lock in write mode. + * Returns: + * -ERESTARTSYS If interrupted by a signal and interruptible is true. + */ +extern int ttm_write_lock(struct ttm_lock *lock, bool interruptible); + +/** + * ttm_lock_set_kill + * + * @lock: Pointer to a struct ttm_lock + * @val: Boolean whether to kill processes taking the lock. + * @signal: Signal to send to the process taking the lock. + * + * The kill-when-taking-lock functionality is used to kill processes that keep + * on using the TTM functionality when its resources has been taken down, for + * example when the X server exits. A typical sequence would look like this: + * - X server takes lock in write mode. + * - ttm_lock_set_kill() is called with @val set to true. + * - As part of X server exit, TTM resources are taken down. + * - X server releases the lock on file release. + * - Another dri client wants to render, takes the lock and is killed. + * + */ +static inline void ttm_lock_set_kill(struct ttm_lock *lock, bool val, + int signal) +{ + lock->kill_takers = val; + if (val) + lock->signal = signal; +} + +#endif --- linux-ec2-2.6.32.orig/include/drm/ttm/ttm_bo_driver.h +++ linux-ec2-2.6.32/include/drm/ttm/ttm_bo_driver.h @@ -242,12 +242,6 @@ /** * struct ttm_bo_driver * - * @mem_type_prio: Priority array of memory types to place a buffer object in - * if it fits without evicting buffers from any of these memory types. - * @mem_busy_prio: Priority array of memory types to place a buffer object in - * if it needs to evict buffers to make room. - * @num_mem_type_prio: Number of elements in the @mem_type_prio array. - * @num_mem_busy_prio: Number of elements in the @num_mem_busy_prio array. * @create_ttm_backend_entry: Callback to create a struct ttm_backend. * @invalidate_caches: Callback to invalidate read caches when a buffer object * has been evicted. @@ -265,11 +259,6 @@ */ struct ttm_bo_driver { - const uint32_t *mem_type_prio; - const uint32_t *mem_busy_prio; - uint32_t num_mem_type_prio; - uint32_t num_mem_busy_prio; - /** * struct ttm_bo_driver member create_ttm_backend_entry * @@ -306,7 +295,8 @@ * finished, they'll end up in bo->mem.flags */ - uint32_t(*evict_flags) (struct ttm_buffer_object *bo); + void(*evict_flags) (struct ttm_buffer_object *bo, + struct ttm_placement *placement); /** * struct ttm_bo_driver member move: * @@ -363,6 +353,11 @@ /* notify the driver we are taking a fault on this BO * and have reserved it */ void (*fault_reserve_notify)(struct ttm_buffer_object *bo); + + /** + * notify the driver that we're about to swap out this bo + */ + void (*swap_notify) (struct ttm_buffer_object *bo); }; /** @@ -545,6 +540,15 @@ extern int ttm_tt_bind(struct ttm_tt *ttm, struct ttm_mem_reg *bo_mem); /** + * ttm_tt_populate: + * + * @ttm: The struct ttm_tt to contain the backing pages. + * + * Add backing pages to all of @ttm + */ +extern int ttm_tt_populate(struct ttm_tt *ttm); + +/** * ttm_ttm_destroy: * * @ttm: The struct ttm_tt. @@ -639,12 +643,12 @@ * -EBUSY: No space available (only if no_wait == 1). * -ENOMEM: Could not allocate memory for the buffer object, either due to * fragmentation or concurrent allocators. - * -ERESTART: An interruptible sleep was interrupted by a signal. + * -ERESTARTSYS: An interruptible sleep was interrupted by a signal. */ extern int ttm_bo_mem_space(struct ttm_buffer_object *bo, - uint32_t proposed_placement, - struct ttm_mem_reg *mem, - bool interruptible, bool no_wait); + struct ttm_placement *placement, + struct ttm_mem_reg *mem, + bool interruptible, bool no_wait); /** * ttm_bo_wait_for_cpu * @@ -654,7 +658,7 @@ * Wait until a buffer object is no longer sync'ed for CPU access. * Returns: * -EBUSY: Buffer object was sync'ed for CPU access. (only if no_wait == 1). - * -ERESTART: An interruptible sleep was interrupted by a signal. + * -ERESTARTSYS: An interruptible sleep was interrupted by a signal. */ extern int ttm_bo_wait_cpu(struct ttm_buffer_object *bo, bool no_wait); @@ -758,7 +762,7 @@ * -EAGAIN: The reservation may cause a deadlock. * Release all buffer reservations, wait for @bo to become unreserved and * try again. (only if use_sequence == 1). - * -ERESTART: A wait for the buffer to become unreserved was interrupted by + * -ERESTARTSYS: A wait for the buffer to become unreserved was interrupted by * a signal. Release all buffer reservations and return to user-space. */ extern int ttm_bo_reserve(struct ttm_buffer_object *bo, @@ -799,7 +803,7 @@ * * Returns: * -EBUSY: If no_wait == 1 and the buffer is already reserved. - * -ERESTART: If interruptible == 1 and the process received a signal + * -ERESTARTSYS: If interruptible == 1 and the process received a signal * while sleeping. */ extern int ttm_bo_block_reservation(struct ttm_buffer_object *bo, --- linux-ec2-2.6.32.orig/include/drm/ttm/ttm_bo_api.h +++ linux-ec2-2.6.32/include/drm/ttm/ttm_bo_api.h @@ -44,6 +44,29 @@ struct drm_mm_node; + +/** + * struct ttm_placement + * + * @fpfn: first valid page frame number to put the object + * @lpfn: last valid page frame number to put the object + * @num_placement: number of prefered placements + * @placement: prefered placements + * @num_busy_placement: number of prefered placements when need to evict buffer + * @busy_placement: prefered placements when need to evict buffer + * + * Structure indicating the placement you request for an object. + */ +struct ttm_placement { + unsigned fpfn; + unsigned lpfn; + unsigned num_placement; + const uint32_t *placement; + unsigned num_busy_placement; + const uint32_t *busy_placement; +}; + + /** * struct ttm_mem_reg * @@ -109,10 +132,6 @@ * the object is destroyed. * @event_queue: Queue for processes waiting on buffer object status change. * @lock: spinlock protecting mostly synchronization members. - * @proposed_placement: Proposed placement for the buffer. Changed only by the - * creator prior to validation as opposed to bo->mem.proposed_flags which is - * changed by the implementation prior to a buffer move if it wants to outsmart - * the buffer creator / user. This latter happens, for example, at eviction. * @mem: structure describing current placement. * @persistant_swap_storage: Usually the swap storage is deleted for buffers * pinned in physical memory. If this behaviour is not desired, this member @@ -177,7 +196,6 @@ * Members protected by the bo::reserved lock. */ - uint32_t proposed_placement; struct ttm_mem_reg mem; struct file *persistant_swap_storage; struct ttm_tt *ttm; @@ -206,9 +224,11 @@ atomic_t reserved; - /** * Members protected by the bo::lock + * In addition, setting sync_obj to anything else + * than NULL requires bo::reserved to be held. This allows for + * checking NULL while reserved but not holding bo::lock. */ void *sync_obj_arg; @@ -285,29 +305,30 @@ * Note: It might be necessary to block validations before the * wait by reserving the buffer. * Returns -EBUSY if no_wait is true and the buffer is busy. - * Returns -ERESTART if interrupted by a signal. + * Returns -ERESTARTSYS if interrupted by a signal. */ extern int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy, bool interruptible, bool no_wait); /** - * ttm_buffer_object_validate + * ttm_bo_validate * * @bo: The buffer object. - * @proposed_placement: Proposed_placement for the buffer object. + * @placement: Proposed placement for the buffer object. * @interruptible: Sleep interruptible if sleeping. * @no_wait: Return immediately if the buffer is busy. * * Changes placement and caching policy of the buffer object - * according to bo::proposed_flags. + * according proposed placement. * Returns - * -EINVAL on invalid proposed_flags. + * -EINVAL on invalid proposed placement. * -ENOMEM on out-of-memory condition. * -EBUSY if no_wait is true and buffer busy. - * -ERESTART if interrupted by a signal. + * -ERESTARTSYS if interrupted by a signal. */ -extern int ttm_buffer_object_validate(struct ttm_buffer_object *bo, - uint32_t proposed_placement, - bool interruptible, bool no_wait); +extern int ttm_bo_validate(struct ttm_buffer_object *bo, + struct ttm_placement *placement, + bool interruptible, bool no_wait); + /** * ttm_bo_unref * @@ -328,7 +349,7 @@ * waiting for buffer idle. This lock is recursive. * Returns * -EBUSY if the buffer is busy and no_wait is true. - * -ERESTART if interrupted by a signal. + * -ERESTARTSYS if interrupted by a signal. */ extern int @@ -343,7 +364,7 @@ extern void ttm_bo_synccpu_write_release(struct ttm_buffer_object *bo); /** - * ttm_buffer_object_init + * ttm_bo_init * * @bdev: Pointer to a ttm_bo_device struct. * @bo: Pointer to a ttm_buffer_object to be initialized. @@ -371,20 +392,20 @@ * Returns * -ENOMEM: Out of memory. * -EINVAL: Invalid placement flags. - * -ERESTART: Interrupted by signal while sleeping waiting for resources. + * -ERESTARTSYS: Interrupted by signal while sleeping waiting for resources. */ -extern int ttm_buffer_object_init(struct ttm_bo_device *bdev, - struct ttm_buffer_object *bo, - unsigned long size, - enum ttm_bo_type type, - uint32_t flags, - uint32_t page_alignment, - unsigned long buffer_start, - bool interrubtible, - struct file *persistant_swap_storage, - size_t acc_size, - void (*destroy) (struct ttm_buffer_object *)); +extern int ttm_bo_init(struct ttm_bo_device *bdev, + struct ttm_buffer_object *bo, + unsigned long size, + enum ttm_bo_type type, + struct ttm_placement *placement, + uint32_t page_alignment, + unsigned long buffer_start, + bool interrubtible, + struct file *persistant_swap_storage, + size_t acc_size, + void (*destroy) (struct ttm_buffer_object *)); /** * ttm_bo_synccpu_object_init * @@ -405,47 +426,43 @@ * GEM user interface. * @p_bo: On successful completion *p_bo points to the created object. * - * This function allocates a ttm_buffer_object, and then calls - * ttm_buffer_object_init on that object. - * The destroy function is set to kfree(). + * This function allocates a ttm_buffer_object, and then calls ttm_bo_init + * on that object. The destroy function is set to kfree(). * Returns * -ENOMEM: Out of memory. * -EINVAL: Invalid placement flags. - * -ERESTART: Interrupted by signal while waiting for resources. + * -ERESTARTSYS: Interrupted by signal while waiting for resources. */ -extern int ttm_buffer_object_create(struct ttm_bo_device *bdev, - unsigned long size, - enum ttm_bo_type type, - uint32_t flags, - uint32_t page_alignment, - unsigned long buffer_start, - bool interruptible, - struct file *persistant_swap_storage, - struct ttm_buffer_object **p_bo); +extern int ttm_bo_create(struct ttm_bo_device *bdev, + unsigned long size, + enum ttm_bo_type type, + struct ttm_placement *placement, + uint32_t page_alignment, + unsigned long buffer_start, + bool interruptible, + struct file *persistant_swap_storage, + struct ttm_buffer_object **p_bo); /** * ttm_bo_check_placement * - * @bo: the buffer object. - * @set_flags: placement flags to set. - * @clr_flags: placement flags to clear. + * @bo: the buffer object. + * @placement: placements * * Performs minimal validity checking on an intended change of * placement flags. * Returns * -EINVAL: Intended change is invalid or not allowed. */ - extern int ttm_bo_check_placement(struct ttm_buffer_object *bo, - uint32_t set_flags, uint32_t clr_flags); + struct ttm_placement *placement); /** * ttm_bo_init_mm * * @bdev: Pointer to a ttm_bo_device struct. * @mem_type: The memory type. - * @p_offset: offset for managed area in pages. * @p_size: size managed area in pages. * * Initialize a manager for a given memory type. @@ -458,7 +475,7 @@ */ extern int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type, - unsigned long p_offset, unsigned long p_size); + unsigned long p_size); /** * ttm_bo_clean_mm * @@ -503,7 +520,7 @@ * * Returns: * -EINVAL: Invalid or uninitialized memory type. - * -ERESTART: The call was interrupted by a signal while waiting to + * -ERESTARTSYS: The call was interrupted by a signal while waiting to * evict a buffer. */ @@ -606,7 +623,7 @@ * be called from the fops::read and fops::write method. * Returns: * See man (2) write, man(2) read. In particular, - * the function may return -EINTR if + * the function may return -ERESTARTSYS if * interrupted by a signal. */ --- linux-ec2-2.6.32.orig/include/drm/ttm/ttm_execbuf_util.h +++ linux-ec2-2.6.32/include/drm/ttm/ttm_execbuf_util.h @@ -0,0 +1,107 @@ +/************************************************************************** + * + * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +/* + * Authors: Thomas Hellstrom + */ + +#ifndef _TTM_EXECBUF_UTIL_H_ +#define _TTM_EXECBUF_UTIL_H_ + +#include "ttm/ttm_bo_api.h" +#include + +/** + * struct ttm_validate_buffer + * + * @head: list head for thread-private list. + * @bo: refcounted buffer object pointer. + * @new_sync_obj_arg: New sync_obj_arg for @bo, to be used once + * adding a new sync object. + * @reservied: Indicates whether @bo has been reserved for validation. + */ + +struct ttm_validate_buffer { + struct list_head head; + struct ttm_buffer_object *bo; + void *new_sync_obj_arg; + bool reserved; +}; + +/** + * function ttm_eu_backoff_reservation + * + * @list: thread private list of ttm_validate_buffer structs. + * + * Undoes all buffer validation reservations for bos pointed to by + * the list entries. + */ + +extern void ttm_eu_backoff_reservation(struct list_head *list); + +/** + * function ttm_eu_reserve_buffers + * + * @list: thread private list of ttm_validate_buffer structs. + * @val_seq: A unique sequence number. + * + * Tries to reserve bos pointed to by the list entries for validation. + * If the function returns 0, all buffers are marked as "unfenced", + * taken off the lru lists and are not synced for write CPU usage. + * + * If the function detects a deadlock due to multiple threads trying to + * reserve the same buffers in reverse order, all threads except one will + * back off and retry. This function may sleep while waiting for + * CPU write reservations to be cleared, and for other threads to + * unreserve their buffers. + * + * This function may return -ERESTART or -EAGAIN if the calling process + * receives a signal while waiting. In that case, no buffers on the list + * will be reserved upon return. + * + * Buffers reserved by this function should be unreserved by + * a call to either ttm_eu_backoff_reservation() or + * ttm_eu_fence_buffer_objects() when command submission is complete or + * has failed. + */ + +extern int ttm_eu_reserve_buffers(struct list_head *list, uint32_t val_seq); + +/** + * function ttm_eu_fence_buffer_objects. + * + * @list: thread private list of ttm_validate_buffer structs. + * @sync_obj: The new sync object for the buffers. + * + * This function should be called when command submission is complete, and + * it will add a new sync object to bos pointed to by entries on @list. + * It also unreserves all buffers, putting them on lru lists. + * + */ + +extern void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj); + +#endif --- linux-ec2-2.6.32.orig/include/drm/ttm/ttm_object.h +++ linux-ec2-2.6.32/include/drm/ttm/ttm_object.h @@ -0,0 +1,271 @@ +/************************************************************************** + * + * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +/* + * Authors: Thomas Hellstrom + */ +/** @file ttm_object.h + * + * Base- and reference object implementation for the various + * ttm objects. Implements reference counting, minimal security checks + * and release on file close. + */ + +#ifndef _TTM_OBJECT_H_ +#define _TTM_OBJECT_H_ + +#include +#include "drm_hashtab.h" +#include +#include + +/** + * enum ttm_ref_type + * + * Describes what type of reference a ref object holds. + * + * TTM_REF_USAGE is a simple refcount on a base object. + * + * TTM_REF_SYNCCPU_READ is a SYNCCPU_READ reference on a + * buffer object. + * + * TTM_REF_SYNCCPU_WRITE is a SYNCCPU_WRITE reference on a + * buffer object. + * + */ + +enum ttm_ref_type { + TTM_REF_USAGE, + TTM_REF_SYNCCPU_READ, + TTM_REF_SYNCCPU_WRITE, + TTM_REF_NUM +}; + +/** + * enum ttm_object_type + * + * One entry per ttm object type. + * Device-specific types should use the + * ttm_driver_typex types. + */ + +enum ttm_object_type { + ttm_fence_type, + ttm_buffer_type, + ttm_lock_type, + ttm_driver_type0 = 256, + ttm_driver_type1, + ttm_driver_type2, + ttm_driver_type3, + ttm_driver_type4, + ttm_driver_type5 +}; + +struct ttm_object_file; +struct ttm_object_device; + +/** + * struct ttm_base_object + * + * @hash: hash entry for the per-device object hash. + * @type: derived type this object is base class for. + * @shareable: Other ttm_object_files can access this object. + * + * @tfile: Pointer to ttm_object_file of the creator. + * NULL if the object was not created by a user request. + * (kernel object). + * + * @refcount: Number of references to this object, not + * including the hash entry. A reference to a base object can + * only be held by a ref object. + * + * @refcount_release: A function to be called when there are + * no more references to this object. This function should + * destroy the object (or make sure destruction eventually happens), + * and when it is called, the object has + * already been taken out of the per-device hash. The parameter + * "base" should be set to NULL by the function. + * + * @ref_obj_release: A function to be called when a reference object + * with another ttm_ref_type than TTM_REF_USAGE is deleted. + * this function may, for example, release a lock held by a user-space + * process. + * + * This struct is intended to be used as a base struct for objects that + * are visible to user-space. It provides a global name, race-safe + * access and refcounting, minimal access contol and hooks for unref actions. + */ + +struct ttm_base_object { + struct drm_hash_item hash; + enum ttm_object_type object_type; + bool shareable; + struct ttm_object_file *tfile; + struct kref refcount; + void (*refcount_release) (struct ttm_base_object **base); + void (*ref_obj_release) (struct ttm_base_object *base, + enum ttm_ref_type ref_type); +}; + +/** + * ttm_base_object_init + * + * @tfile: Pointer to a struct ttm_object_file. + * @base: The struct ttm_base_object to initialize. + * @shareable: This object is shareable with other applcations. + * (different @tfile pointers.) + * @type: The object type. + * @refcount_release: See the struct ttm_base_object description. + * @ref_obj_release: See the struct ttm_base_object description. + * + * Initializes a struct ttm_base_object. + */ + +extern int ttm_base_object_init(struct ttm_object_file *tfile, + struct ttm_base_object *base, + bool shareable, + enum ttm_object_type type, + void (*refcount_release) (struct ttm_base_object + **), + void (*ref_obj_release) (struct ttm_base_object + *, + enum ttm_ref_type + ref_type)); + +/** + * ttm_base_object_lookup + * + * @tfile: Pointer to a struct ttm_object_file. + * @key: Hash key + * + * Looks up a struct ttm_base_object with the key @key. + * Also verifies that the object is visible to the application, by + * comparing the @tfile argument and checking the object shareable flag. + */ + +extern struct ttm_base_object *ttm_base_object_lookup(struct ttm_object_file + *tfile, uint32_t key); + +/** + * ttm_base_object_unref + * + * @p_base: Pointer to a pointer referncing a struct ttm_base_object. + * + * Decrements the base object refcount and clears the pointer pointed to by + * p_base. + */ + +extern void ttm_base_object_unref(struct ttm_base_object **p_base); + +/** + * ttm_ref_object_add. + * + * @tfile: A struct ttm_object_file representing the application owning the + * ref_object. + * @base: The base object to reference. + * @ref_type: The type of reference. + * @existed: Upon completion, indicates that an identical reference object + * already existed, and the refcount was upped on that object instead. + * + * Adding a ref object to a base object is basically like referencing the + * base object, but a user-space application holds the reference. When the + * file corresponding to @tfile is closed, all its reference objects are + * deleted. A reference object can have different types depending on what + * it's intended for. It can be refcounting to prevent object destruction, + * When user-space takes a lock, it can add a ref object to that lock to + * make sure the lock is released if the application dies. A ref object + * will hold a single reference on a base object. + */ +extern int ttm_ref_object_add(struct ttm_object_file *tfile, + struct ttm_base_object *base, + enum ttm_ref_type ref_type, bool *existed); +/** + * ttm_ref_object_base_unref + * + * @key: Key representing the base object. + * @ref_type: Ref type of the ref object to be dereferenced. + * + * Unreference a ref object with type @ref_type + * on the base object identified by @key. If there are no duplicate + * references, the ref object will be destroyed and the base object + * will be unreferenced. + */ +extern int ttm_ref_object_base_unref(struct ttm_object_file *tfile, + unsigned long key, + enum ttm_ref_type ref_type); + +/** + * ttm_object_file_init - initialize a struct ttm_object file + * + * @tdev: A struct ttm_object device this file is initialized on. + * @hash_order: Order of the hash table used to hold the reference objects. + * + * This is typically called by the file_ops::open function. + */ + +extern struct ttm_object_file *ttm_object_file_init(struct ttm_object_device + *tdev, + unsigned int hash_order); + +/** + * ttm_object_file_release - release data held by a ttm_object_file + * + * @p_tfile: Pointer to pointer to the ttm_object_file object to release. + * *p_tfile will be set to NULL by this function. + * + * Releases all data associated by a ttm_object_file. + * Typically called from file_ops::release. The caller must + * ensure that there are no concurrent users of tfile. + */ + +extern void ttm_object_file_release(struct ttm_object_file **p_tfile); + +/** + * ttm_object device init - initialize a struct ttm_object_device + * + * @hash_order: Order of hash table used to hash the base objects. + * + * This function is typically called on device initialization to prepare + * data structures needed for ttm base and ref objects. + */ + +extern struct ttm_object_device *ttm_object_device_init + (struct ttm_mem_global *mem_glob, unsigned int hash_order); + +/** + * ttm_object_device_release - release data held by a ttm_object_device + * + * @p_tdev: Pointer to pointer to the ttm_object_device object to release. + * *p_tdev will be set to NULL by this function. + * + * Releases all data associated by a ttm_object_device. + * Typically called from driver::unload before the destruction of the + * device private data structure. + */ + +extern void ttm_object_device_release(struct ttm_object_device **p_tdev); + +#endif --- linux-ec2-2.6.32.orig/include/drm/ttm/ttm_memory.h +++ linux-ec2-2.6.32/include/drm/ttm/ttm_memory.h @@ -33,6 +33,7 @@ #include #include #include +#include /** * struct ttm_mem_shrink - callback to shrink TTM memory usage. --- linux-ec2-2.6.32.orig/include/drm/i2c/ch7006.h +++ linux-ec2-2.6.32/include/drm/i2c/ch7006.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2009 Francisco Jerez. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __DRM_I2C_CH7006_H__ +#define __DRM_I2C_CH7006_H__ + +/** + * struct ch7006_encoder_params + * + * Describes how the ch7006 is wired up with the GPU. It should be + * used as the @params parameter of its @set_config method. + * + * See "http://www.chrontel.com/pdf/7006.pdf" for their precise + * meaning. + */ +struct ch7006_encoder_params { + enum { + CH7006_FORMAT_RGB16 = 0, + CH7006_FORMAT_YCrCb24m16, + CH7006_FORMAT_RGB24m16, + CH7006_FORMAT_RGB15, + CH7006_FORMAT_RGB24m12C, + CH7006_FORMAT_RGB24m12I, + CH7006_FORMAT_RGB24m8, + CH7006_FORMAT_RGB16m8, + CH7006_FORMAT_RGB15m8, + CH7006_FORMAT_YCrCb24m8, + } input_format; + + enum { + CH7006_CLOCK_SLAVE = 0, + CH7006_CLOCK_MASTER, + } clock_mode; + + enum { + CH7006_CLOCK_EDGE_NEG = 0, + CH7006_CLOCK_EDGE_POS, + } clock_edge; + + int xcm, pcm; + + enum { + CH7006_SYNC_SLAVE = 0, + CH7006_SYNC_MASTER, + } sync_direction; + + enum { + CH7006_SYNC_SEPARATED = 0, + CH7006_SYNC_EMBEDDED, + } sync_encoding; + + enum { + CH7006_POUT_1_8V = 0, + CH7006_POUT_3_3V, + } pout_level; + + enum { + CH7006_ACTIVE_HSYNC = 0, + CH7006_ACTIVE_DSTART, + } active_detect; +}; + +#endif --- linux-ec2-2.6.32.orig/include/xen/evtchn.h +++ linux-ec2-2.6.32/include/xen/evtchn.h @@ -1,7 +1,8 @@ /****************************************************************************** * evtchn.h * - * Interface to /dev/xen/evtchn. + * Communication via Xen event channels. + * Also definitions for the device that demuxes notifications to userspace. * * Copyright (c) 2003-2005, K A Fraser * @@ -30,6 +31,207 @@ * IN THE SOFTWARE. */ +#if !defined(__ASM_EVTCHN_H__) && defined(__KERNEL__) && !defined(CONFIG_PARAVIRT_XEN) +#define __ASM_EVTCHN_H__ + +#include +#include +#include +#include +#include +#include + +/* + * LOW-LEVEL DEFINITIONS + */ +struct irq_cfg { + u32 info; + union { + int bindcount; /* for dynamic IRQs */ +#ifdef CONFIG_X86_IO_APIC + u8 vector; /* for physical IRQs */ +#endif + }; +}; + +int assign_irq_vector(int irq, struct irq_cfg *, const struct cpumask *); + +/* + * Dynamically bind an event source to an IRQ-like callback handler. + * On some platforms this may not be implemented via the Linux IRQ subsystem. + * The IRQ argument passed to the callback handler is the same as returned + * from the bind call. It may not correspond to a Linux IRQ number. + * Returns IRQ or negative errno. + */ +int bind_caller_port_to_irqhandler( + unsigned int caller_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id); +int bind_listening_port_to_irqhandler( + unsigned int remote_domain, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id); +int bind_interdomain_evtchn_to_irqhandler( + unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id); +int bind_virq_to_irqhandler( + unsigned int virq, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id); +#if defined(CONFIG_SMP) && defined(CONFIG_XEN) && defined(CONFIG_X86) +int bind_virq_to_irqaction( + unsigned int virq, + unsigned int cpu, + struct irqaction *action); +#else +#define bind_virq_to_irqaction(virq, cpu, action) \ + bind_virq_to_irqhandler(virq, cpu, (action)->handler, \ + (action)->flags | IRQF_NOBALANCING, \ + (action)->name, action) +#endif +#if defined(CONFIG_SMP) && !defined(MODULE) +#ifndef CONFIG_X86 +int bind_ipi_to_irqhandler( + unsigned int ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id); +#else +int bind_ipi_to_irqaction( + unsigned int ipi, + unsigned int cpu, + struct irqaction *action); +#endif +#endif + +/* + * Common unbind function for all event sources. Takes IRQ to unbind from. + * Automatically closes the underlying event channel (except for bindings + * made with bind_caller_port_to_irqhandler()). + */ +void unbind_from_irqhandler(unsigned int irq, void *dev_id); + +#if defined(CONFIG_SMP) && defined(CONFIG_XEN) && defined(CONFIG_X86) +/* Specialized unbind function for per-CPU IRQs. */ +void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu, + struct irqaction *); +#else +#define unbind_from_per_cpu_irq(irq, cpu, action) \ + unbind_from_irqhandler(irq, action) +#endif + +#ifndef CONFIG_XEN +void irq_resume(void); +#endif + +/* Entry point for notifications into Linux subsystems. */ +asmlinkage void evtchn_do_upcall(struct pt_regs *regs); + +/* Entry point for notifications into the userland character device. */ +void evtchn_device_upcall(int port); + +/* Mark a PIRQ as unavailable for dynamic allocation. */ +void evtchn_register_pirq(int irq); +/* Map a Xen-supplied PIRQ to a dynamically allocated one. */ +int evtchn_map_pirq(int irq, int xen_pirq); +/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */ +int evtchn_get_xen_pirq(int irq); + +void mask_evtchn(int port); +void disable_all_local_evtchn(void); +void unmask_evtchn(int port); + +#ifdef CONFIG_SMP +void rebind_evtchn_to_cpu(int port, unsigned int cpu); +#else +#define rebind_evtchn_to_cpu(port, cpu) ((void)0) +#endif + +static inline int test_and_set_evtchn_mask(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + return synch_test_and_set_bit(port, s->evtchn_mask); +} + +static inline void clear_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_clear_bit(port, s->evtchn_pending); +} + +static inline void set_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_set_bit(port, s->evtchn_pending); +} + +static inline int test_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + return synch_test_bit(port, s->evtchn_pending); +} + +static inline void notify_remote_via_evtchn(int port) +{ + struct evtchn_send send = { .port = port }; + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send)); +} + +static inline void +multi_notify_remote_via_evtchn(multicall_entry_t *mcl, int port) +{ + struct evtchn_send *send = (void *)(mcl->args + 2); + + BUILD_BUG_ON(sizeof(*send) > sizeof(mcl->args) - 2 * sizeof(*mcl->args)); + send->port = port; + mcl->op = __HYPERVISOR_event_channel_op; + mcl->args[0] = EVTCHNOP_send; + mcl->args[1] = (unsigned long)send; +} + +/* Clear an irq's pending state, in preparation for polling on it. */ +void xen_clear_irq_pending(int irq); + +/* Set an irq's pending state, to avoid blocking on it. */ +void xen_set_irq_pending(int irq); + +/* Test an irq's pending state. */ +int xen_test_irq_pending(int irq); + +/* Poll waiting for an irq to become pending. In the usual case, the + irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq); + +/* + * Use these to access the event channel underlying the IRQ handle returned + * by bind_*_to_irqhandler(). + */ +void notify_remote_via_irq(int irq); +int multi_notify_remote_via_irq(multicall_entry_t *, int irq); +int irq_to_evtchn_port(int irq); + +#if defined(CONFIG_SMP) && !defined(MODULE) && defined(CONFIG_X86) +void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu); +#endif + +#endif /* __ASM_EVTCHN_H__ */ + +/* + * Interface to /dev/xen/evtchn. + */ #ifndef __LINUX_PUBLIC_EVTCHN_H__ #define __LINUX_PUBLIC_EVTCHN_H__ --- linux-ec2-2.6.32.orig/include/xen/balloon.h +++ linux-ec2-2.6.32/include/xen/balloon.h @@ -0,0 +1,65 @@ +/****************************************************************************** + * balloon.h + * + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BALLOON_H__ +#define __XEN_BALLOON_H__ + +#include + +#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) +/* + * Inform the balloon driver that it should allow some slop for device-driver + * memory activities. + */ +void balloon_update_driver_allowance(long delta); + +/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */ +struct page **alloc_empty_pages_and_pagevec(int nr_pages); +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages); + +/* Free an empty page range (not allocated through + alloc_empty_pages_and_pagevec), adding to the balloon. */ +void free_empty_pages(struct page **pagevec, int nr_pages); + +void balloon_release_driver_page(struct page *page); + +/* + * Prevent the balloon driver from changing the memory reservation during + * a driver critical region. + */ +extern spinlock_t balloon_lock; +#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags) +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags) +#endif + +#endif /* __XEN_BALLOON_H__ */ --- linux-ec2-2.6.32.orig/include/xen/gnttab.h +++ linux-ec2-2.6.32/include/xen/gnttab.h @@ -0,0 +1,165 @@ +/****************************************************************************** + * gnttab.h + * + * Two sets of functionality: + * 1. Granting foreign access to our memory reservation. + * 2. Accessing others' memory reservations via grant references. + * (i.e., mechanisms for both sender and recipient of grant references) + * + * Copyright (c) 2004-2005, K A Fraser + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __ASM_GNTTAB_H__ +#define __ASM_GNTTAB_H__ + +#include +#include /* maddr_t */ +#include +#include +#include + +struct gnttab_free_callback { + struct gnttab_free_callback *next; + void (*fn)(void *); + void *arg; + u16 count; + u8 queued; +}; + +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + int flags); + +/* + * End access through the given grant reference, iff the grant entry is no + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in + * use. + */ +int gnttab_end_foreign_access_ref(grant_ref_t ref); + +/* + * Eventually end access through the given grant reference, and once that + * access has been ended, free the given page too. Access will be ended + * immediately iff the grant entry is not in use, otherwise it will happen + * some time later. page may be 0, in which case no freeing will occur. + */ +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page); + +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); + +int gnttab_query_foreign_access(grant_ref_t ref); + +/* + * operations on reserved batches of grant references + */ +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); + +void gnttab_free_grant_reference(grant_ref_t ref); + +void gnttab_free_grant_references(grant_ref_t head); + +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); + +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); + +void gnttab_release_grant_reference(grant_ref_t *private_head, + grant_ref_t release); + +void gnttab_request_free_callback(struct gnttab_free_callback *callback, + void (*fn)(void *), void *arg, u16 count); +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); + +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags); + +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, + unsigned long pfn); + +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep); +void __gnttab_dma_map_page(struct page *page); +static inline void __gnttab_dma_unmap_page(struct page *page) +{ +} + +void gnttab_reset_grant_page(struct page *page); + +#ifndef CONFIG_XEN +int gnttab_resume(void); +#endif + +void *arch_gnttab_alloc_shared(unsigned long *frames); + +static inline void +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr, + uint32_t flags, grant_ref_t ref, domid_t domid) +{ + if (flags & GNTMAP_contains_pte) + map->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + map->host_addr = __pa(addr); + else + map->host_addr = addr; + + map->flags = flags; + map->ref = ref; + map->dom = domid; +} + +static inline void +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr, + uint32_t flags, grant_handle_t handle) +{ + if (flags & GNTMAP_contains_pte) + unmap->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + unmap->host_addr = __pa(addr); + else + unmap->host_addr = addr; + + unmap->handle = handle; + unmap->dev_bus_addr = 0; +} + +static inline void +gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr, + maddr_t new_addr, grant_handle_t handle) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unmap->host_addr = __pa(addr); + unmap->new_addr = __pa(new_addr); + } else { + unmap->host_addr = addr; + unmap->new_addr = new_addr; + } + + unmap->handle = handle; +} + +#endif /* __ASM_GNTTAB_H__ */ --- linux-ec2-2.6.32.orig/include/xen/hypercall.h +++ linux-ec2-2.6.32/include/xen/hypercall.h @@ -0,0 +1,30 @@ +#ifndef __XEN_HYPERCALL_H__ +#define __XEN_HYPERCALL_H__ + +#include + +static inline int __must_check +HYPERVISOR_multicall_check( + multicall_entry_t *call_list, unsigned int nr_calls, + const unsigned long *rc_list) +{ + int rc = HYPERVISOR_multicall(call_list, nr_calls); + + if (unlikely(rc < 0)) + return rc; + BUG_ON(rc); + BUG_ON((int)nr_calls < 0); + + for ( ; nr_calls > 0; --nr_calls, ++call_list) + if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0))) + return nr_calls; + + return 0; +} + +/* A construct to ignore the return value of hypercall wrappers in a few + * exceptional cases (simply casting the function result to void doesn't + * avoid the compiler warning): */ +#define VOID(expr) ((void)((expr)?:0)) + +#endif /* __XEN_HYPERCALL_H__ */ --- linux-ec2-2.6.32.orig/include/xen/xenbus.h +++ linux-ec2-2.6.32/include/xen/xenbus.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -55,8 +56,21 @@ /* Callback (executed in a process context with no locks held). */ void (*callback)(struct xenbus_watch *, const char **vec, unsigned int len); + +#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) + /* See XBWF_ definitions below. */ + unsigned long flags; +#endif }; +#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) +/* + * Execute callback in its own kthread. Useful if the callback is long + * running or heavily serialised, to avoid taking out the main xenwatch thread + * for a long period of time (or even unwittingly causing a deadlock). + */ +#define XBWF_new_thread 1 +#endif /* A xenbus device. */ struct xenbus_device { @@ -83,17 +97,21 @@ /* A xenbus driver. */ struct xenbus_driver { - char *name; - struct module *owner; + const char *name; const struct xenbus_device_id *ids; int (*probe)(struct xenbus_device *dev, const struct xenbus_device_id *id); void (*otherend_changed)(struct xenbus_device *dev, enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); +#if !defined(CONFIG_XEN) && !defined(MODULE) int (*suspend)(struct xenbus_device *dev, pm_message_t state); +#else + int (*suspend)(struct xenbus_device *dev); + int (*suspend_cancel)(struct xenbus_device *dev); +#endif int (*resume)(struct xenbus_device *dev); - int (*uevent)(struct xenbus_device *, char **, int, char *, int); + int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); struct device_driver driver; int (*read_otherend_details)(struct xenbus_device *dev); int (*is_ready)(struct xenbus_device *dev); @@ -111,7 +129,6 @@ static inline int __must_check xenbus_register_frontend(struct xenbus_driver *drv) { - WARN_ON(drv->owner != THIS_MODULE); return __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME); } @@ -121,7 +138,6 @@ static inline int __must_check xenbus_register_backend(struct xenbus_driver *drv) { - WARN_ON(drv->owner != THIS_MODULE); return __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME); } @@ -164,7 +180,6 @@ int xenbus_gather(struct xenbus_transaction t, const char *dir, ...); /* notifer routines for when the xenstore comes up */ -extern int xenstored_ready; int register_xenstore_notifier(struct notifier_block *nb); void unregister_xenstore_notifier(struct notifier_block *nb); @@ -177,12 +192,9 @@ /* Used by xenbus_dev to borrow kernel's store connection. */ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg); -struct work_struct; - /* Prepare for domain suspend: then resume or cancel the suspend. */ void xenbus_suspend(void); void xenbus_resume(void); -void xenbus_probe(struct work_struct *); void xenbus_suspend_cancel(void); #define XENBUS_IS_ERR_READ(str) ({ \ @@ -195,38 +207,134 @@ #define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE) + +/** + * Register a watch on the given path, using the given xenbus_watch structure + * for storage, and the given callback function as the callback. Return 0 on + * success, or -errno on error. On success, the given path will be saved as + * watch->node, and remains the caller's to free. On error, watch->node will + * be NULL, the device will switch to XenbusStateClosing, and the error will + * be saved in the store. + */ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, const char **, unsigned int)); + + +#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) +/** + * Register a watch on the given path/path2, using the given xenbus_watch + * structure for storage, and the given callback function as the callback. + * Return 0 on success, or -errno on error. On success, the watched path + * (path/path2) will be saved as watch->node, and becomes the caller's to + * kfree(). On error, watch->node will be NULL, so the caller has nothing to + * free, the device will switch to XenbusStateClosing, and the error will be + * saved in the store. + */ +int xenbus_watch_path2(struct xenbus_device *dev, const char *path, + const char *path2, struct xenbus_watch *watch, + void (*callback)(struct xenbus_watch *, + const char **, unsigned int)); +#else int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, const char **, unsigned int), const char *pathfmt, ...) __attribute__ ((format (printf, 4, 5))); +#endif +/** + * Advertise in the store a change of the given driver to the given new_state. + * Return 0 on success, or -errno on error. On error, the device will switch + * to XenbusStateClosing, and the error will be saved in the store. + */ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); + + +/** + * Grant access to the given ring_mfn to the peer of the given device. Return + * 0 on success, or -errno on error. On error, the device will switch to + * XenbusStateClosing, and the error will be saved in the store. + */ int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn); -int xenbus_map_ring_valloc(struct xenbus_device *dev, - int gnt_ref, void **vaddr); + + +/** + * Map a page of memory into this domain from another domain's grant table. + * xenbus_map_ring_valloc allocates a page of virtual address space, maps the + * page to that address, and sets *vaddr to that address. + * xenbus_map_ring does not allocate the virtual address space (you must do + * this yourself!). It only maps in the page to the specified address. + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) + * or -ENOMEM on error. If an error is returned, device will switch to + * XenbusStateClosing and the error message will be saved in XenStore. + */ +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, + int gnt_ref); int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, grant_handle_t *handle, void *vaddr); -int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr); + +/** + * Unmap a page of memory in this domain that was imported from another domain. + * Use xenbus_unmap_ring_vfree if you mapped in your memory with + * xenbus_map_ring_valloc (it will free the virtual address space). + * Returns 0 on success and returns GNTST_* on error + * (see xen/include/interface/grant_table.h). + */ +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *); int xenbus_unmap_ring(struct xenbus_device *dev, grant_handle_t handle, void *vaddr); + +/** + * Allocate an event channel for the given xenbus_device, assigning the newly + * created local port to *port. Return 0 on success, or -errno on error. On + * error, the device will switch to XenbusStateClosing, and the error will be + * saved in the store. + */ int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port); -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port); + + +/** + * Free an existing event channel. Returns 0 on success or -errno on error. + */ int xenbus_free_evtchn(struct xenbus_device *dev, int port); + +/** + * Return the state of the driver rooted at the given store path, or + * XenbusStateUnknown if no state can be read. + */ enum xenbus_state xenbus_read_driver_state(const char *path); -void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...); -void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...); + +/*** + * Report the given negative errno into the store, along with the given + * formatted message. + */ +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, + ...); + + +/*** + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by + * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly + * closedown of this driver and its peer. + */ +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, + ...); + +#if defined(CONFIG_XEN) || defined(MODULE) +int xenbus_dev_init(void); +#endif const char *xenbus_strstate(enum xenbus_state state); int xenbus_dev_is_online(struct xenbus_device *dev); int xenbus_frontend_closed(struct xenbus_device *dev); +int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *)); +int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *)); + #endif /* _XEN_XENBUS_H */ --- linux-ec2-2.6.32.orig/include/xen/cpu_hotplug.h +++ linux-ec2-2.6.32/include/xen/cpu_hotplug.h @@ -0,0 +1,39 @@ +#ifndef __XEN_CPU_HOTPLUG_H__ +#define __XEN_CPU_HOTPLUG_H__ + +#include +#include + +#if defined(CONFIG_X86) && defined(CONFIG_SMP) +extern cpumask_var_t vcpu_initialized_mask; +#endif + +#if defined(CONFIG_HOTPLUG_CPU) + +int cpu_up_check(unsigned int cpu); +void init_xenbus_allowed_cpumask(void); +int smp_suspend(void); +void smp_resume(void); + +#else /* !defined(CONFIG_HOTPLUG_CPU) */ + +#define cpu_up_check(cpu) (0) +#define init_xenbus_allowed_cpumask() ((void)0) + +static inline int smp_suspend(void) +{ + if (num_online_cpus() > 1) { + printk(KERN_WARNING "Can't suspend SMP guests " + "without CONFIG_HOTPLUG_CPU\n"); + return -EOPNOTSUPP; + } + return 0; +} + +static inline void smp_resume(void) +{ +} + +#endif /* !defined(CONFIG_HOTPLUG_CPU) */ + +#endif /* __XEN_CPU_HOTPLUG_H__ */ --- linux-ec2-2.6.32.orig/include/xen/xenoprof.h +++ linux-ec2-2.6.32/include/xen/xenoprof.h @@ -0,0 +1,42 @@ +/****************************************************************************** + * xen/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef __XEN_XENOPROF_H__ +#define __XEN_XENOPROF_H__ +#ifdef CONFIG_XEN + +#include + +struct oprofile_operations; +int xenoprofile_init(struct oprofile_operations * ops); +void xenoprofile_exit(void); + +struct xenoprof_shared_buffer { + char *buffer; + struct xenoprof_arch_shared_buffer arch; +}; +#else +#define xenoprofile_init(ops) (-ENOSYS) +#define xenoprofile_exit() do { } while (0) + +#endif /* CONFIG_XEN */ +#endif /* __XEN_XENOPROF_H__ */ --- linux-ec2-2.6.32.orig/include/xen/firmware.h +++ linux-ec2-2.6.32/include/xen/firmware.h @@ -0,0 +1,14 @@ +#ifndef __XEN_FIRMWARE_H__ +#define __XEN_FIRMWARE_H__ + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +void copy_edd(void); +#endif + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +void copy_edid(void); +#else +static inline void copy_edid(void) {} +#endif + +#endif /* __XEN_FIRMWARE_H__ */ --- linux-ec2-2.6.32.orig/include/xen/blkif.h +++ linux-ec2-2.6.32/include/xen/blkif.h @@ -0,0 +1,123 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_BLKIF_H__ +#define __XEN_BLKIF_H__ + +#include +#include +#include + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_32_request blkif_x86_32_request_t; +typedef struct blkif_x86_32_response blkif_x86_32_response_t; +#pragma pack(pop) + +/* x86_64 protocol version */ +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t __attribute__((__aligned__(8))) id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_64_request blkif_x86_64_request_t; +typedef struct blkif_x86_64_response blkif_x86_64_response_t; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); + +union blkif_back_rings { + blkif_back_ring_t native; + blkif_common_back_ring_t common; + blkif_x86_32_back_ring_t x86_32; + blkif_x86_64_back_ring_t x86_64; +}; +typedef union blkif_back_rings blkif_back_rings_t; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + +static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +#endif /* __XEN_BLKIF_H__ */ --- linux-ec2-2.6.32.orig/include/xen/sysctl.h +++ linux-ec2-2.6.32/include/xen/sysctl.h @@ -0,0 +1,11 @@ +#ifndef _XEN_SYSCTL_H +#define _XEN_SYSCTL_H + +/* CTL_XEN names: */ +enum +{ + CTL_XEN_INDEPENDENT_WALLCLOCK=1, + CTL_XEN_PERMITTED_CLOCK_JITTER=2, +}; + +#endif /* _XEN_SYSCTL_H */ --- linux-ec2-2.6.32.orig/include/xen/events.h +++ linux-ec2-2.6.32/include/xen/events.h @@ -44,15 +44,6 @@ extern void xen_irq_resume(void); -/* Clear an irq's pending state, in preparation for polling on it */ -void xen_clear_irq_pending(int irq); -void xen_set_irq_pending(int irq); -bool xen_test_irq_pending(int irq); - -/* Poll waiting for an irq to become pending. In the usual case, the - irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq(int irq); - /* Determine the IRQ which is bound to an event channel */ unsigned irq_from_evtchn(unsigned int evtchn); --- linux-ec2-2.6.32.orig/include/xen/xen_proc.h +++ linux-ec2-2.6.32/include/xen/xen_proc.h @@ -0,0 +1,12 @@ + +#ifndef __ASM_XEN_PROC_H__ +#define __ASM_XEN_PROC_H__ + +#include + +extern struct proc_dir_entry *create_xen_proc_entry( + const char *name, mode_t mode); +extern void remove_xen_proc_entry( + const char *name); + +#endif /* __ASM_XEN_PROC_H__ */ --- linux-ec2-2.6.32.orig/include/xen/features.h +++ linux-ec2-2.6.32/include/xen/features.h @@ -10,6 +10,7 @@ #define __XEN_FEATURES_H__ #include +#include void xen_setup_features(void); @@ -20,4 +21,4 @@ return xen_features[flag]; } -#endif /* __ASM_XEN_FEATURES_H__ */ +#endif /* __XEN_FEATURES_H__ */ --- linux-ec2-2.6.32.orig/include/xen/hvm.h +++ linux-ec2-2.6.32/include/xen/hvm.h @@ -0,0 +1,23 @@ +/* Simple wrappers around HVM functions */ +#ifndef XEN_HVM_H__ +#define XEN_HVM_H__ + +#include + +static inline unsigned long hvm_get_parameter(int idx) +{ + struct xen_hvm_param xhv; + int r; + + xhv.domid = DOMID_SELF; + xhv.index = idx; + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); + if (r < 0) { + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n", + idx, r); + return 0; + } + return xhv.value; +} + +#endif /* XEN_HVM_H__ */ --- linux-ec2-2.6.32.orig/include/xen/pcifront.h +++ linux-ec2-2.6.32/include/xen/pcifront.h @@ -0,0 +1,69 @@ +/* + * PCI Frontend - arch-dependendent declarations + * + * Author: Ryan Wilson + */ +#ifndef __XEN_ASM_PCIFRONT_H__ +#define __XEN_ASM_PCIFRONT_H__ + +#include + +#ifdef __KERNEL__ + +#ifndef __ia64__ + +#include + +struct pcifront_device; +struct pci_bus; +#define pcifront_sd pci_sysdata + +static inline struct pcifront_device * +pcifront_get_pdev(struct pcifront_sd *sd) +{ + return sd->pdev; +} + +static inline void pcifront_init_sd(struct pcifront_sd *sd, + unsigned int domain, unsigned int bus, + struct pcifront_device *pdev) +{ + sd->domain = domain; + sd->pdev = pdev; +} + +static inline void pcifront_setup_root_resources(struct pci_bus *bus, + struct pcifront_sd *sd) +{ +} + +#else /* __ia64__ */ + +#include +#include +#define pcifront_sd pci_controller + +extern void xen_add_resource(struct pci_controller *, unsigned int, + unsigned int, struct acpi_resource *); +extern void xen_pcibios_setup_root_windows(struct pci_bus *, + struct pci_controller *); + +static inline struct pcifront_device * +pcifront_get_pdev(struct pcifront_sd *sd) +{ + return (struct pcifront_device *)sd->platform_data; +} + +static inline void pcifront_setup_root_resources(struct pci_bus *bus, + struct pcifront_sd *sd) +{ + xen_pcibios_setup_root_windows(bus, sd); +} + +#endif /* __ia64__ */ + +extern struct rw_semaphore pci_bus_sem; + +#endif /* __KERNEL__ */ + +#endif /* __XEN_ASM_PCIFRONT_H__ */ --- linux-ec2-2.6.32.orig/include/xen/xencons.h +++ linux-ec2-2.6.32/include/xen/xencons.h @@ -0,0 +1,19 @@ +#ifndef __ASM_XENCONS_H__ +#define __ASM_XENCONS_H__ + +struct dom0_vga_console_info; +void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t); + +void xencons_force_flush(void); +void xencons_resume(void); + +/* Interrupt work hooks. Receive data, or kick data out. */ +void xencons_rx(char *buf, unsigned len); +void xencons_tx(void); + +int xencons_ring_init(void); +int xencons_ring_send(const char *data, unsigned len); + +void xencons_early_setup(void); + +#endif /* __ASM_XENCONS_H__ */ --- linux-ec2-2.6.32.orig/include/xen/compat_ioctl.h +++ linux-ec2-2.6.32/include/xen/compat_ioctl.h @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2007 + * + * Authors: Jimi Xenidis + * Hollis Blanchard + */ + +#ifndef __LINUX_XEN_COMPAT_H__ +#define __LINUX_XEN_COMPAT_H__ + +#include + +extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg); +struct privcmd_mmap_32 { + int num; + domid_t dom; + compat_uptr_t entry; +}; + +struct privcmd_mmapbatch_32 { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + compat_uptr_t arr; /* array of mfns - top nibble set on err */ +}; +#define IOCTL_PRIVCMD_MMAP_32 \ + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32)) +#define IOCTL_PRIVCMD_MMAPBATCH_32 \ + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32)) + +#endif /* __LINUX_XEN_COMPAT_H__ */ --- linux-ec2-2.6.32.orig/include/xen/hypervisor_sysfs.h +++ linux-ec2-2.6.32/include/xen/hypervisor_sysfs.h @@ -0,0 +1,30 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _HYP_SYSFS_H_ +#define _HYP_SYSFS_H_ + +#include +#include + +#define HYPERVISOR_ATTR_RO(_name) \ +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) + +#define HYPERVISOR_ATTR_RW(_name) \ +static struct hyp_sysfs_attr _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +struct hyp_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct hyp_sysfs_attr *, char *); + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); + void *hyp_attr_data; +}; + +#endif /* _HYP_SYSFS_H_ */ --- linux-ec2-2.6.32.orig/include/xen/driver_util.h +++ linux-ec2-2.6.32/include/xen/driver_util.h @@ -0,0 +1,10 @@ + +#ifndef __ASM_XEN_DRIVER_UTIL_H__ +#define __ASM_XEN_DRIVER_UTIL_H__ + +#include +#include + +extern struct class *get_xen_class(void); + +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/physdev.h +++ linux-ec2-2.6.32/include/xen/interface/physdev.h @@ -21,10 +21,12 @@ #ifndef __XEN_PUBLIC_PHYSDEV_H__ #define __XEN_PUBLIC_PHYSDEV_H__ +#include "xen.h" + /* * Prototype for this hypercall is: * int physdev_op(int cmd, void *args) - * @cmd == PHYSDEVOP_??? (physdev operation). + * @cmd == PHYSDEVOP_??? (physdev operation). * @args == Operation-specific extra arguments (NULL if none). */ @@ -32,114 +34,218 @@ * Notify end-of-interrupt (EOI) for the specified IRQ. * @arg == pointer to physdev_eoi structure. */ -#define PHYSDEVOP_eoi 12 +#define PHYSDEVOP_eoi 12 struct physdev_eoi { - /* IN */ - uint32_t irq; + /* IN */ + uint32_t irq; }; +typedef struct physdev_eoi physdev_eoi_t; +DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t); + +/* + * Register a shared page for the hypervisor to indicate whether the guest + * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly + * once the guest used this function in that the associated event channel + * will automatically get unmasked. The page registered is used as a bit + * array indexed by Xen's PIRQ value. + */ +#define PHYSDEVOP_pirq_eoi_gmfn 17 +struct physdev_pirq_eoi_gmfn { + /* IN */ + xen_pfn_t gmfn; +}; +typedef struct physdev_pirq_eoi_gmfn physdev_pirq_eoi_gmfn_t; +DEFINE_XEN_GUEST_HANDLE(physdev_pirq_eoi_gmfn_t); /* * Query the status of an IRQ line. * @arg == pointer to physdev_irq_status_query structure. */ -#define PHYSDEVOP_irq_status_query 5 +#define PHYSDEVOP_irq_status_query 5 struct physdev_irq_status_query { - /* IN */ - uint32_t irq; - /* OUT */ - uint32_t flags; /* XENIRQSTAT_* */ + /* IN */ + uint32_t irq; + /* OUT */ + uint32_t flags; /* XENIRQSTAT_* */ }; +typedef struct physdev_irq_status_query physdev_irq_status_query_t; +DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t); /* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */ -#define _XENIRQSTAT_needs_eoi (0) -#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi) +#define _XENIRQSTAT_needs_eoi (0) +#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi) /* IRQ shared by multiple guests? */ -#define _XENIRQSTAT_shared (1) -#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared) +#define _XENIRQSTAT_shared (1) +#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared) /* * Set the current VCPU's I/O privilege level. * @arg == pointer to physdev_set_iopl structure. */ -#define PHYSDEVOP_set_iopl 6 +#define PHYSDEVOP_set_iopl 6 struct physdev_set_iopl { - /* IN */ - uint32_t iopl; + /* IN */ + uint32_t iopl; }; +typedef struct physdev_set_iopl physdev_set_iopl_t; +DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t); /* * Set the current VCPU's I/O-port permissions bitmap. * @arg == pointer to physdev_set_iobitmap structure. */ -#define PHYSDEVOP_set_iobitmap 7 +#define PHYSDEVOP_set_iobitmap 7 struct physdev_set_iobitmap { - /* IN */ - uint8_t * bitmap; - uint32_t nr_ports; + /* IN */ +#if __XEN_INTERFACE_VERSION__ >= 0x00030205 + XEN_GUEST_HANDLE(uint8) bitmap; +#else + uint8_t *bitmap; +#endif + uint32_t nr_ports; }; +typedef struct physdev_set_iobitmap physdev_set_iobitmap_t; +DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t); /* * Read or write an IO-APIC register. * @arg == pointer to physdev_apic structure. */ -#define PHYSDEVOP_apic_read 8 -#define PHYSDEVOP_apic_write 9 +#define PHYSDEVOP_apic_read 8 +#define PHYSDEVOP_apic_write 9 struct physdev_apic { - /* IN */ - unsigned long apic_physbase; - uint32_t reg; - /* IN or OUT */ - uint32_t value; + /* IN */ + unsigned long apic_physbase; + uint32_t reg; + /* IN or OUT */ + uint32_t value; }; +typedef struct physdev_apic physdev_apic_t; +DEFINE_XEN_GUEST_HANDLE(physdev_apic_t); /* * Allocate or free a physical upcall vector for the specified IRQ line. * @arg == pointer to physdev_irq structure. */ -#define PHYSDEVOP_alloc_irq_vector 10 -#define PHYSDEVOP_free_irq_vector 11 +#define PHYSDEVOP_alloc_irq_vector 10 +#define PHYSDEVOP_free_irq_vector 11 struct physdev_irq { - /* IN */ - uint32_t irq; - /* IN or OUT */ - uint32_t vector; + /* IN */ + uint32_t irq; + /* IN or OUT */ + uint32_t vector; +}; +typedef struct physdev_irq physdev_irq_t; +DEFINE_XEN_GUEST_HANDLE(physdev_irq_t); + +#define MAP_PIRQ_TYPE_MSI 0x0 +#define MAP_PIRQ_TYPE_GSI 0x1 +#define MAP_PIRQ_TYPE_UNKNOWN 0x2 + +#define PHYSDEVOP_map_pirq 13 +struct physdev_map_pirq { + domid_t domid; + /* IN */ + int type; + /* IN */ + int index; + /* IN or OUT */ + int pirq; + /* IN */ + int bus; + /* IN */ + int devfn; + /* IN */ + int entry_nr; + /* IN */ + uint64_t table_base; +}; +typedef struct physdev_map_pirq physdev_map_pirq_t; +DEFINE_XEN_GUEST_HANDLE(physdev_map_pirq_t); + +#define PHYSDEVOP_unmap_pirq 14 +struct physdev_unmap_pirq { + domid_t domid; + /* IN */ + int pirq; +}; + +typedef struct physdev_unmap_pirq physdev_unmap_pirq_t; +DEFINE_XEN_GUEST_HANDLE(physdev_unmap_pirq_t); + +#define PHYSDEVOP_manage_pci_add 15 +#define PHYSDEVOP_manage_pci_remove 16 +struct physdev_manage_pci { + /* IN */ + uint8_t bus; + uint8_t devfn; +}; + +typedef struct physdev_manage_pci physdev_manage_pci_t; +DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t); + +#define PHYSDEVOP_restore_msi 19 +struct physdev_restore_msi { + /* IN */ + uint8_t bus; + uint8_t devfn; +}; +typedef struct physdev_restore_msi physdev_restore_msi_t; +DEFINE_XEN_GUEST_HANDLE(physdev_restore_msi_t); + +#define PHYSDEVOP_manage_pci_add_ext 20 +struct physdev_manage_pci_ext { + /* IN */ + uint8_t bus; + uint8_t devfn; + unsigned is_extfn; + unsigned is_virtfn; + struct { + uint8_t bus; + uint8_t devfn; + } physfn; }; +typedef struct physdev_manage_pci_ext physdev_manage_pci_ext_t; +DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_ext_t); + /* * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op() * hypercall since 0x00030202. */ struct physdev_op { - uint32_t cmd; - union { - struct physdev_irq_status_query irq_status_query; - struct physdev_set_iopl set_iopl; - struct physdev_set_iobitmap set_iobitmap; - struct physdev_apic apic_op; - struct physdev_irq irq_op; - } u; + uint32_t cmd; + union { + struct physdev_irq_status_query irq_status_query; + struct physdev_set_iopl set_iopl; + struct physdev_set_iobitmap set_iobitmap; + struct physdev_apic apic_op; + struct physdev_irq irq_op; + } u; }; +typedef struct physdev_op physdev_op_t; +DEFINE_XEN_GUEST_HANDLE(physdev_op_t); /* * Notify that some PIRQ-bound event channels have been unmasked. * ** This command is obsolete since interface version 0x00030202 and is ** - * ** unsupported by newer versions of Xen. ** + * ** unsupported by newer versions of Xen. ** */ -#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4 +#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4 /* * These all-capitals physdev operation names are superceded by the new names * (defined above) since interface version 0x00030202. */ -#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query -#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl -#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap -#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read -#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write -#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector -#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector +#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query +#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl +#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap +#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read +#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write +#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector +#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector #define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi -#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared +#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared #endif /* __XEN_PUBLIC_PHYSDEV_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/memory.h +++ linux-ec2-2.6.32/include/xen/interface/memory.h @@ -3,20 +3,57 @@ * * Memory reservation and information. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_MEMORY_H__ #define __XEN_PUBLIC_MEMORY_H__ +#include "xen.h" + /* - * Increase or decrease the specified domain's memory reservation. Returns a - * -ve errcode on failure, or the # extents successfully allocated or freed. + * Increase or decrease the specified domain's memory reservation. Returns the + * number of extents successfully allocated or freed. * arg == addr of struct xen_memory_reservation. */ #define XENMEM_increase_reservation 0 #define XENMEM_decrease_reservation 1 #define XENMEM_populate_physmap 6 + +#if __XEN_INTERFACE_VERSION__ >= 0x00030209 +/* + * Maximum # bits addressable by the user of the allocated region (e.g., I/O + * devices often have a 32-bit limitation even in 64-bit systems). If zero + * then the user has no addressing restriction. This field is not used by + * XENMEM_decrease_reservation. + */ +#define XENMEMF_address_bits(x) (x) +#define XENMEMF_get_address_bits(x) ((x) & 0xffu) +/* NUMA node to allocate from. */ +#define XENMEMF_node(x) (((x) + 1) << 8) +#define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu) +/* Flag to populate physmap with populate-on-demand entries */ +#define XENMEMF_populate_on_demand (1<<16) +#endif + struct xen_memory_reservation { /* @@ -29,28 +66,71 @@ * OUT: GMFN bases of extents that were allocated * (NB. This command also updates the mach_to_phys translation table) */ - GUEST_HANDLE(ulong) extent_start; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start; /* Number of extents, and size/alignment of each (2^extent_order pages). */ - unsigned long nr_extents; + xen_ulong_t nr_extents; unsigned int extent_order; - /* - * Maximum # bits addressable by the user of the allocated region (e.g., - * I/O devices often have a 32-bit limitation even in 64-bit systems). If - * zero then the user has no addressing restriction. - * This field is not used by XENMEM_decrease_reservation. - */ +#if __XEN_INTERFACE_VERSION__ >= 0x00030209 + /* XENMEMF flags. */ + unsigned int mem_flags; +#else unsigned int address_bits; +#endif /* * Domain whose reservation is being changed. * Unprivileged domains can specify only DOMID_SELF. */ domid_t domid; - }; DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation); +typedef struct xen_memory_reservation xen_memory_reservation_t; +DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t); + +/* + * An atomic exchange of memory pages. If return code is zero then + * @out.extent_list provides GMFNs of the newly-allocated memory. + * Returns zero on complete success, otherwise a negative error code. + * On complete success then always @nr_exchanged == @in.nr_extents. + * On partial success @nr_exchanged indicates how much work was done. + */ +#define XENMEM_exchange 11 +struct xen_memory_exchange { + /* + * [IN] Details of memory extents to be exchanged (GMFN bases). + * Note that @in.address_bits is ignored and unused. + */ + struct xen_memory_reservation in; + + /* + * [IN/OUT] Details of new memory extents. + * We require that: + * 1. @in.domid == @out.domid + * 2. @in.nr_extents << @in.extent_order == + * @out.nr_extents << @out.extent_order + * 3. @in.extent_start and @out.extent_start lists must not overlap + * 4. @out.extent_start lists GPFN bases to be populated + * 5. @out.extent_start is overwritten with allocated GMFN bases + */ + struct xen_memory_reservation out; + + /* + * [OUT] Number of input extents that were successfully exchanged: + * 1. The first @nr_exchanged input extents were successfully + * deallocated. + * 2. The corresponding first entries in the output extent list correctly + * indicate the GMFNs that were successfully exchanged. + * 3. All other input and output extents are untouched. + * 4. If not all input exents are exchanged then the return code of this + * command will be non-zero. + * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER! + */ + xen_ulong_t nr_exchanged; +}; +typedef struct xen_memory_exchange xen_memory_exchange_t; +DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t); /* * Returns the maximum machine frame number of mapped RAM in this system. @@ -68,6 +148,11 @@ #define XENMEM_maximum_reservation 4 /* + * Returns the maximum GPFN in use by the guest, or -ve errcode on failure. + */ +#define XENMEM_maximum_gpfn 14 + +/* * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys * mapping table. Architectures which do not have a m2p table do not implement * this command. @@ -86,7 +171,7 @@ * any large discontiguities in the machine address space, 2MB gaps in * the machphys table will be represented by an MFN base of zero. */ - GUEST_HANDLE(ulong) extent_start; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start; /* * Number of extents written to the above array. This will be smaller @@ -95,6 +180,22 @@ unsigned int nr_extents; }; DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list); +typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t; +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t); + +/* + * Returns the location in virtual address space of the machine_to_phys + * mapping table. Architectures which do not have a m2p table, or which do not + * map it by default into guest address space, do not implement this command. + * arg == addr of xen_machphys_mapping_t. + */ +#define XENMEM_machphys_mapping 12 +struct xen_machphys_mapping { + xen_ulong_t v_start, v_end; /* Start and end virtual addresses. */ + xen_ulong_t max_mfn; /* Maximum MFN that can be looked up. */ +}; +typedef struct xen_machphys_mapping xen_machphys_mapping_t; +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t); /* * Sets the GPFN at which a particular page appears in the specified guest's @@ -109,37 +210,76 @@ /* Source mapping space. */ #define XENMAPSPACE_shared_info 0 /* shared info page */ #define XENMAPSPACE_grant_table 1 /* grant table page */ +#define XENMAPSPACE_gmfn 2 /* GMFN */ unsigned int space; /* Index into source mapping space. */ - unsigned long idx; + xen_ulong_t idx; /* GPFN where the source mapping page should appear. */ - unsigned long gpfn; + xen_pfn_t gpfn; }; DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap); +typedef struct xen_add_to_physmap xen_add_to_physmap_t; +DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t); + +/*** REMOVED ***/ +/*#define XENMEM_translate_gpfn_list 8*/ /* - * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error - * code on failure. This call only works for auto-translated guests. + * Returns the pseudo-physical memory map as it was when the domain + * was started (specified by XENMEM_set_memory_map). + * arg == addr of xen_memory_map_t. */ -#define XENMEM_translate_gpfn_list 8 -struct xen_translate_gpfn_list { - /* Which domain to translate for? */ - domid_t domid; - - /* Length of list. */ - unsigned long nr_gpfns; - - /* List of GPFNs to translate. */ - GUEST_HANDLE(ulong) gpfn_list; +#define XENMEM_memory_map 9 +struct xen_memory_map { + /* + * On call the number of entries which can be stored in buffer. On + * return the number of entries which have been stored in + * buffer. + */ + unsigned int nr_entries; /* - * Output list to contain MFN translations. May be the same as the input - * list (in which case each input GPFN is overwritten with the output MFN). + * Entries in the buffer are in the same format as returned by the + * BIOS INT 0x15 EAX=0xE820 call. */ - GUEST_HANDLE(ulong) mfn_list; + XEN_GUEST_HANDLE(void) buffer; }; -DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list); +typedef struct xen_memory_map xen_memory_map_t; +DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t); + +/* + * Returns the real physical memory map. Passes the same structure as + * XENMEM_memory_map. + * arg == addr of xen_memory_map_t. + */ +#define XENMEM_machine_memory_map 10 +/* + * Set the pseudo-physical memory map of a domain, as returned by + * XENMEM_memory_map. + * arg == addr of xen_foreign_memory_map_t. + */ +#define XENMEM_set_memory_map 13 +struct xen_foreign_memory_map { + domid_t domid; + struct xen_memory_map map; +}; +typedef struct xen_foreign_memory_map xen_foreign_memory_map_t; +DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t); + +#define XENMEM_set_pod_target 16 +#define XENMEM_get_pod_target 17 +struct xen_pod_target { + /* IN */ + uint64_t target_pages; + /* OUT */ + uint64_t tot_pages; + uint64_t pod_cache_pages; + uint64_t pod_entries; + /* IN */ + domid_t domid; +}; +typedef struct xen_pod_target xen_pod_target_t; #endif /* __XEN_PUBLIC_MEMORY_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/callback.h +++ linux-ec2-2.6.32/include/xen/interface/callback.h @@ -86,6 +86,8 @@ uint16_t flags; xen_callback_t address; }; +typedef struct callback_register callback_register_t; +DEFINE_XEN_GUEST_HANDLE(callback_register_t); /* * Unregister a callback. @@ -98,5 +100,12 @@ uint16_t type; uint16_t _unused; }; +typedef struct callback_unregister callback_unregister_t; +DEFINE_XEN_GUEST_HANDLE(callback_unregister_t); + +#if __XEN_INTERFACE_VERSION__ < 0x00030207 +#undef CALLBACKTYPE_sysenter +#define CALLBACKTYPE_sysenter CALLBACKTYPE_sysenter_deprecated +#endif #endif /* __XEN_PUBLIC_CALLBACK_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/xen.h +++ linux-ec2-2.6.32/include/xen/interface/xen.h @@ -3,35 +3,70 @@ * * Guest OS interface to Xen. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2004, K A Fraser */ #ifndef __XEN_PUBLIC_XEN_H__ #define __XEN_PUBLIC_XEN_H__ -#include +#include "xen-compat.h" +#ifdef CONFIG_PARAVIRT_XEN #include +#endif -/* - * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). - */ +#if defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H) +#include +#elif defined(__i386__) || defined(__x86_64__) +#include "arch-x86/xen.h" +#elif defined(__ia64__) +#include "arch-ia64.h" +#else +#error "Unsupported architecture" +#endif + +#ifndef __ASSEMBLY__ +/* Guest handles for primitive C types. */ +DEFINE_XEN_GUEST_HANDLE(char); +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char); +DEFINE_XEN_GUEST_HANDLE(int); +__DEFINE_XEN_GUEST_HANDLE(uint, unsigned int); +DEFINE_XEN_GUEST_HANDLE(long); +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long); +DEFINE_XEN_GUEST_HANDLE(void); + +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); +#endif /* - * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5. - * EAX = return value - * (argument registers may be clobbered on return) - * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. - * RAX = return value - * (argument registers not clobbered on return; RCX, R11 are) + * HYPERCALLS */ + #define __HYPERVISOR_set_trap_table 0 #define __HYPERVISOR_mmu_update 1 #define __HYPERVISOR_set_gdt 2 #define __HYPERVISOR_stack_switch 3 #define __HYPERVISOR_set_callbacks 4 #define __HYPERVISOR_fpu_taskswitch 5 -#define __HYPERVISOR_sched_op 6 -#define __HYPERVISOR_dom0_op 7 +#define __HYPERVISOR_sched_op_compat 6 /* compat since 0x00030101 */ +#define __HYPERVISOR_platform_op 7 #define __HYPERVISOR_set_debugreg 8 #define __HYPERVISOR_get_debugreg 9 #define __HYPERVISOR_update_descriptor 10 @@ -39,10 +74,10 @@ #define __HYPERVISOR_multicall 13 #define __HYPERVISOR_update_va_mapping 14 #define __HYPERVISOR_set_timer_op 15 -#define __HYPERVISOR_event_channel_op_compat 16 +#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */ #define __HYPERVISOR_xen_version 17 #define __HYPERVISOR_console_io 18 -#define __HYPERVISOR_physdev_op_compat 19 +#define __HYPERVISOR_physdev_op_compat 19 /* compat since 0x00030202 */ #define __HYPERVISOR_grant_table_op 20 #define __HYPERVISOR_vm_assist 21 #define __HYPERVISOR_update_va_mapping_otherdomain 22 @@ -50,7 +85,7 @@ #define __HYPERVISOR_vcpu_op 24 #define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ #define __HYPERVISOR_mmuext_op 26 -#define __HYPERVISOR_acm_op 27 +#define __HYPERVISOR_xsm_op 27 #define __HYPERVISOR_nmi_op 28 #define __HYPERVISOR_sched_op_new 29 #define __HYPERVISOR_callback_op 30 @@ -58,6 +93,10 @@ #define __HYPERVISOR_event_channel_op 32 #define __HYPERVISOR_physdev_op 33 #define __HYPERVISOR_hvm_op 34 +#define __HYPERVISOR_sysctl 35 +#define __HYPERVISOR_domctl 36 +#define __HYPERVISOR_kexec_op 37 +#define __HYPERVISOR_tmem_op 38 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 @@ -70,15 +109,48 @@ #define __HYPERVISOR_arch_7 55 /* + * HYPERCALL COMPATIBILITY. + */ + +/* New sched_op hypercall introduced in 0x00030101. */ +#if __XEN_INTERFACE_VERSION__ < 0x00030101 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)) +#undef __HYPERVISOR_sched_op +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat +#else +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_new +#endif + +/* New event-channel and physdev hypercalls introduced in 0x00030202. */ +#if __XEN_INTERFACE_VERSION__ < 0x00030202 +#undef __HYPERVISOR_event_channel_op +#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat +#undef __HYPERVISOR_physdev_op +#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat +#endif + +/* New platform_op hypercall introduced in 0x00030204. */ +#if __XEN_INTERFACE_VERSION__ < 0x00030204 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)) +#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op +#endif + +/* * VIRTUAL INTERRUPTS * * Virtual interrupts that a guest OS may receive from Xen. - */ -#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */ -#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */ -#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ -#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ -#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ + * + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound. + * The latter can be allocated only once per guest: they must initially be + * allocated to VCPU0 but can subsequently be re-bound. + */ +#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */ +#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */ +#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */ +#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */ +#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */ +#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */ +#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */ +#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -91,6 +163,7 @@ #define VIRQ_ARCH_7 23 #define NR_VIRQS 24 + /* * MMU-UPDATE REQUESTS * @@ -119,8 +192,8 @@ * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed * with those in @val. */ -#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ -#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ +#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ #define MMU_PT_UPDATE_PRESERVE_AD 2 /* atomically: *ptr = val | (*ptr&(A|D)) */ /* @@ -163,9 +236,20 @@ * cmd: MMUEXT_FLUSH_CACHE * No additional arguments. Writes back and flushes cache contents. * + * cmd: MMUEXT_FLUSH_CACHE_GLOBAL + * No additional arguments. Writes back and flushes cache contents + * on all CPUs in the system. + * * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. + * + * cmd: MMUEXT_CLEAR_PAGE + * mfn: Machine frame number to be cleared. + * + * cmd: MMUEXT_COPY_PAGE + * mfn: Machine frame number of the destination page. + * src_mfn: Machine frame number of the source page. */ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 @@ -182,24 +266,36 @@ #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 +#define MMUEXT_CLEAR_PAGE 16 +#define MMUEXT_COPY_PAGE 17 +#define MMUEXT_FLUSH_CACHE_GLOBAL 18 #ifndef __ASSEMBLY__ struct mmuext_op { - unsigned int cmd; - union { - /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ - unsigned long mfn; - /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ - unsigned long linear_addr; - } arg1; - union { - /* SET_LDT */ - unsigned int nr_ents; - /* TLB_FLUSH_MULTI, INVLPG_MULTI */ - void *vcpumask; - } arg2; + unsigned int cmd; + union { + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR + * CLEAR_PAGE, COPY_PAGE */ + xen_pfn_t mfn; + /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ + unsigned long linear_addr; + } arg1; + union { + /* SET_LDT */ + unsigned int nr_ents; + /* TLB_FLUSH_MULTI, INVLPG_MULTI */ +#if __XEN_INTERFACE_VERSION__ >= 0x00030205 + XEN_GUEST_HANDLE(const_void) vcpumask; +#else + const void *vcpumask; +#endif + /* COPY_PAGE */ + xen_pfn_t src_mfn; + } arg2; }; DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); +typedef struct mmuext_op mmuext_op_t; +DEFINE_XEN_GUEST_HANDLE(mmuext_op_t); #endif /* These are passed as 'flags' to update_va_mapping. They can be ORed. */ @@ -224,11 +320,24 @@ */ #define VMASST_CMD_enable 0 #define VMASST_CMD_disable 1 + +/* x86/32 guests: simulate full 4GB segment limits. */ #define VMASST_TYPE_4gb_segments 0 + +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */ #define VMASST_TYPE_4gb_segments_notify 1 + +/* + * x86 guests: support writes to bottom-level PTEs. + * NB1. Page-directory entries cannot be written. + * NB2. Guest must continue to remove all writable mappings of PTEs. + */ #define VMASST_TYPE_writable_pagetables 2 + +/* x86/PAE guests: support PDPTs above 4GB. */ #define VMASST_TYPE_pae_extended_cr3 3 -#define MAX_VMASST_TYPE 3 + +#define MAX_VMASST_TYPE 3 #ifndef __ASSEMBLY__ @@ -259,6 +368,9 @@ */ #define DOMID_XEN (0x7FF2U) +/* DOMID_INVALID is used to identity invalid domid */ +#define DOMID_INVALID (0x7FFFU) + /* * Send an array of these to HYPERVISOR_mmu_update(). * NB. The fields are natural pointer/address size for this architecture. @@ -268,6 +380,8 @@ uint64_t val; /* New contents of PTE. */ }; DEFINE_GUEST_HANDLE_STRUCT(mmu_update); +typedef struct mmu_update mmu_update_t; +DEFINE_XEN_GUEST_HANDLE(mmu_update_t); /* * Send an array of these to HYPERVISOR_multicall(). @@ -275,10 +389,16 @@ */ struct multicall_entry { unsigned long op; +#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) + unsigned long result; +#else long result; +#endif unsigned long args[6]; }; DEFINE_GUEST_HANDLE_STRUCT(multicall_entry); +typedef struct multicall_entry multicall_entry_t; +DEFINE_XEN_GUEST_HANDLE(multicall_entry_t); /* * Event channel endpoints per domain: @@ -287,173 +407,244 @@ #define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64) struct vcpu_time_info { - /* - * Updates to the following values are preceded and followed - * by an increment of 'version'. The guest can therefore - * detect updates by looking for changes to 'version'. If the - * least-significant bit of the version number is set then an - * update is in progress and the guest must wait to read a - * consistent set of values. The correct way to interact with - * the version number is similar to Linux's seqlock: see the - * implementations of read_seqbegin/read_seqretry. - */ - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; /* TSC at last update of time vals. */ - uint64_t system_time; /* Time, in nanosecs, since boot. */ - /* - * Current system time: - * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul - * CPU frequency (Hz): - * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift - */ - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - int8_t pad1[3]; + /* + * Updates to the following values are preceded and followed by an + * increment of 'version'. The guest can therefore detect updates by + * looking for changes to 'version'. If the least-significant bit of + * the version number is set then an update is in progress and the guest + * must wait to read a consistent set of values. + * The correct way to interact with the version number is similar to + * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry. + */ + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; /* TSC at last update of time vals. */ + uint64_t system_time; /* Time, in nanosecs, since boot. */ + /* + * Current system time: + * system_time + + * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32) + * CPU frequency (Hz): + * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift + */ + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + int8_t pad1[3]; }; /* 32 bytes */ +typedef struct vcpu_time_info vcpu_time_info_t; struct vcpu_info { - /* - * 'evtchn_upcall_pending' is written non-zero by Xen to indicate - * a pending notification for a particular VCPU. It is then cleared - * by the guest OS /before/ checking for pending work, thus avoiding - * a set-and-check race. Note that the mask is only accessed by Xen - * on the CPU that is currently hosting the VCPU. This means that the - * pending and mask flags can be updated by the guest without special - * synchronisation (i.e., no need for the x86 LOCK prefix). - * This may seem suboptimal because if the pending flag is set by - * a different CPU then an IPI may be scheduled even when the mask - * is set. However, note: - * 1. The task of 'interrupt holdoff' is covered by the per-event- - * channel mask bits. A 'noisy' event that is continually being - * triggered can be masked at source at this very precise - * granularity. - * 2. The main purpose of the per-VCPU mask is therefore to restrict - * reentrant execution: whether for concurrency control, or to - * prevent unbounded stack usage. Whatever the purpose, we expect - * that the mask will be asserted only for short periods at a time, - * and so the likelihood of a 'spurious' IPI is suitably small. - * The mask is read before making an event upcall to the guest: a - * non-zero mask therefore guarantees that the VCPU will not receive - * an upcall activation. The mask is cleared when the VCPU requests - * to block: this avoids wakeup-waiting races. - */ - uint8_t evtchn_upcall_pending; - uint8_t evtchn_upcall_mask; - unsigned long evtchn_pending_sel; - struct arch_vcpu_info arch; - struct pvclock_vcpu_time_info time; + /* + * 'evtchn_upcall_pending' is written non-zero by Xen to indicate + * a pending notification for a particular VCPU. It is then cleared + * by the guest OS /before/ checking for pending work, thus avoiding + * a set-and-check race. Note that the mask is only accessed by Xen + * on the CPU that is currently hosting the VCPU. This means that the + * pending and mask flags can be updated by the guest without special + * synchronisation (i.e., no need for the x86 LOCK prefix). + * This may seem suboptimal because if the pending flag is set by + * a different CPU then an IPI may be scheduled even when the mask + * is set. However, note: + * 1. The task of 'interrupt holdoff' is covered by the per-event- + * channel mask bits. A 'noisy' event that is continually being + * triggered can be masked at source at this very precise + * granularity. + * 2. The main purpose of the per-VCPU mask is therefore to restrict + * reentrant execution: whether for concurrency control, or to + * prevent unbounded stack usage. Whatever the purpose, we expect + * that the mask will be asserted only for short periods at a time, + * and so the likelihood of a 'spurious' IPI is suitably small. + * The mask is read before making an event upcall to the guest: a + * non-zero mask therefore guarantees that the VCPU will not receive + * an upcall activation. The mask is cleared when the VCPU requests + * to block: this avoids wakeup-waiting races. + */ + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; +#ifdef CONFIG_PARAVIRT_XEN + struct pvclock_vcpu_time_info time; +#else + struct vcpu_time_info time; +#endif }; /* 64 bytes (x86) */ +#ifndef __XEN__ +typedef struct vcpu_info vcpu_info_t; +#endif /* * Xen/kernel shared data -- pointer provided in start_info. - * NB. We expect that this struct is smaller than a page. + * + * This structure is defined to be both smaller than a page, and the + * only data on the shared page, but may vary in actual size even within + * compatible Xen versions; guests should not rely on the size + * of this structure remaining constant. */ struct shared_info { - struct vcpu_info vcpu_info[MAX_VIRT_CPUS]; + struct vcpu_info vcpu_info[MAX_VIRT_CPUS]; - /* - * A domain can create "event channels" on which it can send and receive - * asynchronous event notifications. There are three classes of event that - * are delivered by this mechanism: - * 1. Bi-directional inter- and intra-domain connections. Domains must - * arrange out-of-band to set up a connection (usually by allocating - * an unbound 'listener' port and avertising that via a storage service - * such as xenstore). - * 2. Physical interrupts. A domain with suitable hardware-access - * privileges can bind an event-channel port to a physical interrupt - * source. - * 3. Virtual interrupts ('events'). A domain can bind an event-channel - * port to a virtual interrupt source, such as the virtual-timer - * device or the emergency console. - * - * Event channels are addressed by a "port index". Each channel is - * associated with two bits of information: - * 1. PENDING -- notifies the domain that there is a pending notification - * to be processed. This bit is cleared by the guest. - * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING - * will cause an asynchronous upcall to be scheduled. This bit is only - * updated by the guest. It is read-only within Xen. If a channel - * becomes pending while the channel is masked then the 'edge' is lost - * (i.e., when the channel is unmasked, the guest must manually handle - * pending notifications as no upcall will be scheduled by Xen). - * - * To expedite scanning of pending notifications, any 0->1 pending - * transition on an unmasked channel causes a corresponding bit in a - * per-vcpu selector word to be set. Each bit in the selector covers a - * 'C long' in the PENDING bitfield array. - */ - unsigned long evtchn_pending[sizeof(unsigned long) * 8]; - unsigned long evtchn_mask[sizeof(unsigned long) * 8]; - - /* - * Wallclock time: updated only by control software. Guests should base - * their gettimeofday() syscall on this wallclock-base value. - */ - struct pvclock_wall_clock wc; + /* + * A domain can create "event channels" on which it can send and receive + * asynchronous event notifications. There are three classes of event that + * are delivered by this mechanism: + * 1. Bi-directional inter- and intra-domain connections. Domains must + * arrange out-of-band to set up a connection (usually by allocating + * an unbound 'listener' port and avertising that via a storage service + * such as xenstore). + * 2. Physical interrupts. A domain with suitable hardware-access + * privileges can bind an event-channel port to a physical interrupt + * source. + * 3. Virtual interrupts ('events'). A domain can bind an event-channel + * port to a virtual interrupt source, such as the virtual-timer + * device or the emergency console. + * + * Event channels are addressed by a "port index". Each channel is + * associated with two bits of information: + * 1. PENDING -- notifies the domain that there is a pending notification + * to be processed. This bit is cleared by the guest. + * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING + * will cause an asynchronous upcall to be scheduled. This bit is only + * updated by the guest. It is read-only within Xen. If a channel + * becomes pending while the channel is masked then the 'edge' is lost + * (i.e., when the channel is unmasked, the guest must manually handle + * pending notifications as no upcall will be scheduled by Xen). + * + * To expedite scanning of pending notifications, any 0->1 pending + * transition on an unmasked channel causes a corresponding bit in a + * per-vcpu selector word to be set. Each bit in the selector covers a + * 'C long' in the PENDING bitfield array. + */ + unsigned long evtchn_pending[sizeof(unsigned long) * 8]; + unsigned long evtchn_mask[sizeof(unsigned long) * 8]; + + /* + * Wallclock time: updated only by control software. Guests should base + * their gettimeofday() syscall on this wallclock-base value. + */ +#ifdef CONFIG_PARAVIRT_XEN + struct pvclock_wall_clock wc; +#else + uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ + uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ + uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ +#endif - struct arch_shared_info arch; + struct arch_shared_info arch; }; +#ifndef __XEN__ +typedef struct shared_info shared_info_t; +#endif /* - * Start-of-day memory layout for the initial domain (DOM0): + * Start-of-day memory layout: * 1. The domain is started within contiguous virtual-memory region. - * 2. The contiguous region begins and ends on an aligned 4MB boundary. - * 3. The region start corresponds to the load address of the OS image. - * If the load address is not 4MB aligned then the address is rounded down. - * 4. This the order of bootstrap elements in the initial virtual region: + * 2. The contiguous region ends on an aligned 4MB boundary. + * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] * c. list of allocated page frames [mfn_list, nr_pages] + * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] * e. bootstrap page tables [pt_base, CR3 (x86)] * f. bootstrap stack [register ESP (x86)] - * 5. Bootstrap elements are packed together, but each is 4kB-aligned. - * 6. The initial ram disk may be omitted. - * 7. The list of page frames forms a contiguous 'pseudo-physical' memory + * 4. Bootstrap elements are packed together, but each is 4kB-aligned. + * 5. The initial ram disk may be omitted. + * 6. The list of page frames forms a contiguous 'pseudo-physical' memory * layout for the domain. In particular, the bootstrap virtual-memory * region is a 1:1 mapping to the first section of the pseudo-physical map. - * 8. All bootstrap elements are mapped read-writable for the guest OS. The + * 7. All bootstrap elements are mapped read-writable for the guest OS. The * only exception is the bootstrap page table, which is mapped read-only. - * 9. There is guaranteed to be at least 512kB padding after the final + * 8. There is guaranteed to be at least 512kB padding after the final * bootstrap element. If necessary, the bootstrap virtual region is * extended by an extra 4MB to ensure this. */ #define MAX_GUEST_CMDLINE 1024 struct start_info { - /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ - char magic[32]; /* "xen--". */ - unsigned long nr_pages; /* Total pages allocated to this domain. */ - unsigned long shared_info; /* MACHINE address of shared info struct. */ - uint32_t flags; /* SIF_xxx flags. */ - unsigned long store_mfn; /* MACHINE page number of shared page. */ - uint32_t store_evtchn; /* Event channel for store communication. */ - union { - struct { - unsigned long mfn; /* MACHINE page number of console page. */ - uint32_t evtchn; /* Event channel for console page. */ - } domU; - struct { - uint32_t info_off; /* Offset of console_info struct. */ - uint32_t info_size; /* Size of console_info struct from start.*/ - } dom0; - } console; - /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ - unsigned long pt_base; /* VIRTUAL address of page directory. */ - unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */ - unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ - unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ - unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ - int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ + char magic[32]; /* "xen--". */ + unsigned long nr_pages; /* Total pages allocated to this domain. */ + unsigned long shared_info; /* MACHINE address of shared info struct. */ + uint32_t flags; /* SIF_xxx flags. */ + xen_pfn_t store_mfn; /* MACHINE page number of shared page. */ + uint32_t store_evtchn; /* Event channel for store communication. */ + union { + struct { + xen_pfn_t mfn; /* MACHINE page number of console page. */ + uint32_t evtchn; /* Event channel for console page. */ + } domU; + struct { + uint32_t info_off; /* Offset of console_info struct. */ + uint32_t info_size; /* Size of console_info struct from start.*/ + } dom0; + } console; + /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ + unsigned long pt_base; /* VIRTUAL address of page directory. */ + unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */ + unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ + unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ + unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ + int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* The pfn range here covers both page table and p->m table frames. */ + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ }; +typedef struct start_info start_info_t; + +/* New console union for dom0 introduced in 0x00030203. */ +#if __XEN_INTERFACE_VERSION__ < 0x00030203 +#define console_mfn console.domU.mfn +#define console_evtchn console.domU.evtchn +#endif /* These flags are passed in the 'flags' field of start_info_t. */ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ -typedef uint64_t cpumap_t; +typedef struct dom0_vga_console_info { + uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */ +#define XEN_VGATYPE_TEXT_MODE_3 0x03 +#define XEN_VGATYPE_VESA_LFB 0x23 + + union { + struct { + /* Font height, in pixels. */ + uint16_t font_height; + /* Cursor location (column, row). */ + uint16_t cursor_x, cursor_y; + /* Number of rows and columns (dimensions in characters). */ + uint16_t rows, columns; + } text_mode_3; + + struct { + /* Width and height, in pixels. */ + uint16_t width, height; + /* Bytes per scan line. */ + uint16_t bytes_per_line; + /* Bits per pixel. */ + uint16_t bits_per_pixel; + /* LFB physical address, and size (in units of 64kB). */ + uint32_t lfb_base; + uint32_t lfb_size; + /* RGB mask offsets and sizes, as defined by VBE 1.2+ */ + uint8_t red_pos, red_size; + uint8_t green_pos, green_size; + uint8_t blue_pos, blue_size; + uint8_t rsvd_pos, rsvd_size; +#if __XEN_INTERFACE_VERSION__ >= 0x00030206 + /* VESA capabilities (offset 0xa, VESA command 0x4f00). */ + uint32_t gbl_caps; + /* Mode attributes (offset 0x0, VESA command 0x4f01). */ + uint16_t mode_attrs; +#endif + } vesa_lfb; + } u; +} dom0_vga_console_info_t; +#define xen_vga_console_info dom0_vga_console_info +#define xen_vga_console_info_t dom0_vga_console_info_t typedef uint8_t xen_domain_handle_t[16]; @@ -461,6 +652,11 @@ #define __mk_unsigned_long(x) x ## UL #define mk_unsigned_long(x) __mk_unsigned_long(x) +__DEFINE_XEN_GUEST_HANDLE(uint8, uint8_t); +__DEFINE_XEN_GUEST_HANDLE(uint16, uint16_t); +__DEFINE_XEN_GUEST_HANDLE(uint32, uint32_t); +__DEFINE_XEN_GUEST_HANDLE(uint64, uint64_t); + #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ @@ -468,4 +664,14 @@ #endif /* !__ASSEMBLY__ */ +/* Default definitions for macros used by domctl/sysctl. */ +#if defined(__XEN__) || defined(__XEN_TOOLS__) +#ifndef uint64_aligned_t +#define uint64_aligned_t uint64_t +#endif +#ifndef XEN_GUEST_HANDLE_64 +#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) +#endif +#endif + #endif /* __XEN_PUBLIC_XEN_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/COPYING +++ linux-ec2-2.6.32/include/xen/interface/COPYING @@ -0,0 +1,38 @@ +XEN NOTICE +========== + +This copyright applies to all files within this subdirectory and its +subdirectories: + include/public/*.h + include/public/hvm/*.h + include/public/io/*.h + +The intention is that these files can be freely copied into the source +tree of an operating system when porting that OS to run on Xen. Doing +so does *not* cause the OS to become subject to the terms of the GPL. + +All other files in the Xen source distribution are covered by version +2 of the GNU General Public License except where explicitly stated +otherwise within individual source files. + + -- Keir Fraser (on behalf of the Xen team) + +===================================================================== + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. --- linux-ec2-2.6.32.orig/include/xen/interface/vcpu.h +++ linux-ec2-2.6.32/include/xen/interface/vcpu.h @@ -27,11 +27,13 @@ #ifndef __XEN_PUBLIC_VCPU_H__ #define __XEN_PUBLIC_VCPU_H__ +#include "xen.h" + /* * Prototype for this hypercall is: - * int vcpu_op(int cmd, int vcpuid, void *extra_args) - * @cmd == VCPUOP_??? (VCPU operation). - * @vcpuid == VCPU to operate on. + * int vcpu_op(int cmd, int vcpuid, void *extra_args) + * @cmd == VCPUOP_??? (VCPU operation). + * @vcpuid == VCPU to operate on. * @extra_args == Operation-specific extra arguments (NULL if none). */ @@ -40,52 +42,54 @@ * newly-initialised VCPU will not run until it is brought up by VCPUOP_up. * * @extra_arg == pointer to vcpu_guest_context structure containing initial - * state for the VCPU. + * state for the VCPU. */ -#define VCPUOP_initialise 0 +#define VCPUOP_initialise 0 /* * Bring up a VCPU. This makes the VCPU runnable. This operation will fail * if the VCPU has not been initialised (VCPUOP_initialise). */ -#define VCPUOP_up 1 +#define VCPUOP_up 1 /* * Bring down a VCPU (i.e., make it non-runnable). * There are a few caveats that callers should observe: - * 1. This operation may return, and VCPU_is_up may return false, before the - * VCPU stops running (i.e., the command is asynchronous). It is a good - * idea to ensure that the VCPU has entered a non-critical loop before - * bringing it down. Alternatively, this operation is guaranteed - * synchronous if invoked by the VCPU itself. - * 2. After a VCPU is initialised, there is currently no way to drop all its - * references to domain memory. Even a VCPU that is down still holds - * memory references via its pagetable base pointer and GDT. It is good - * practise to move a VCPU onto an 'idle' or default page table, LDT and - * GDT before bringing it down. + * 1. This operation may return, and VCPU_is_up may return false, before the + * VCPU stops running (i.e., the command is asynchronous). It is a good + * idea to ensure that the VCPU has entered a non-critical loop before + * bringing it down. Alternatively, this operation is guaranteed + * synchronous if invoked by the VCPU itself. + * 2. After a VCPU is initialised, there is currently no way to drop all its + * references to domain memory. Even a VCPU that is down still holds + * memory references via its pagetable base pointer and GDT. It is good + * practise to move a VCPU onto an 'idle' or default page table, LDT and + * GDT before bringing it down. */ -#define VCPUOP_down 2 +#define VCPUOP_down 2 /* Returns 1 if the given VCPU is up. */ -#define VCPUOP_is_up 3 +#define VCPUOP_is_up 3 /* * Return information about the state and running time of a VCPU. * @extra_arg == pointer to vcpu_runstate_info structure. */ -#define VCPUOP_get_runstate_info 4 +#define VCPUOP_get_runstate_info 4 struct vcpu_runstate_info { - /* VCPU's current state (RUNSTATE_*). */ - int state; - /* When was current state entered (system time, ns)? */ - uint64_t state_entry_time; - /* - * Time spent in each RUNSTATE_* (ns). The sum of these times is - * guaranteed not to drift from system time. - */ - uint64_t time[4]; + /* VCPU's current state (RUNSTATE_*). */ + int state; + /* When was current state entered (system time, ns)? */ + uint64_t state_entry_time; + /* + * Time spent in each RUNSTATE_* (ns). The sum of these times is + * guaranteed not to drift from system time. + */ + uint64_t time[4]; }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info); +typedef struct vcpu_runstate_info vcpu_runstate_info_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t); /* VCPU is currently running on a physical CPU. */ #define RUNSTATE_running 0 @@ -108,47 +112,54 @@ * Register a shared memory area from which the guest may obtain its own * runstate information without needing to execute a hypercall. * Notes: - * 1. The registered address may be virtual or physical, depending on the - * platform. The virtual address should be registered on x86 systems. - * 2. Only one shared area may be registered per VCPU. The shared area is - * updated by the hypervisor each time the VCPU is scheduled. Thus - * runstate.state will always be RUNSTATE_running and - * runstate.state_entry_time will indicate the system time at which the - * VCPU was last scheduled to run. + * 1. The registered address may be virtual or physical or guest handle, + * depending on the platform. Virtual address or guest handle should be + * registered on x86 systems. + * 2. Only one shared area may be registered per VCPU. The shared area is + * updated by the hypervisor each time the VCPU is scheduled. Thus + * runstate.state will always be RUNSTATE_running and + * runstate.state_entry_time will indicate the system time at which the + * VCPU was last scheduled to run. * @extra_arg == pointer to vcpu_register_runstate_memory_area structure. */ #define VCPUOP_register_runstate_memory_area 5 struct vcpu_register_runstate_memory_area { - union { - GUEST_HANDLE(vcpu_runstate_info) h; - struct vcpu_runstate_info *v; - uint64_t p; - } addr; + union { + XEN_GUEST_HANDLE(vcpu_runstate_info_t) h; + struct vcpu_runstate_info *v; + uint64_t p; + } addr; }; +typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_register_runstate_memory_area_t); /* * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer * which can be set via these commands. Periods smaller than one millisecond * may not be supported. */ -#define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */ -#define VCPUOP_stop_periodic_timer 7 /* arg == NULL */ +#define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */ +#define VCPUOP_stop_periodic_timer 7 /* arg == NULL */ struct vcpu_set_periodic_timer { - uint64_t period_ns; + uint64_t period_ns; }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer); +typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t); /* * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot * timer which can be set via these commands. */ -#define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */ +#define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */ #define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */ struct vcpu_set_singleshot_timer { - uint64_t timeout_abs_ns; - uint32_t flags; /* VCPU_SSHOTTMR_??? */ + uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */ + uint32_t flags; /* VCPU_SSHOTTMR_??? */ }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer); +typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t); /* Flags to VCPUOP_set_singleshot_timer. */ /* Require the timeout to be in the future (return -ETIME if it's passed). */ @@ -161,13 +172,38 @@ * structure in a convenient place, such as in a per-cpu data area. * The pointer need not be page aligned, but the structure must not * cross a page boundary. + * + * This may be called only once per vcpu. */ -#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */ +#define VCPUOP_register_vcpu_info 10 /* arg == vcpu_register_vcpu_info_t */ struct vcpu_register_vcpu_info { uint64_t mfn; /* mfn of page to place vcpu_info */ uint32_t offset; /* offset within page */ uint32_t rsvd; /* unused */ }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info); +typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t); + +/* Send an NMI to the specified VCPU. @extra_arg == NULL. */ +#define VCPUOP_send_nmi 11 + +/* + * Get the physical ID information for a pinned vcpu's underlying physical + * processor. The physical ID informmation is architecture-specific. + * On x86: id[31:0]=apic_id, id[63:32]=acpi_id, and all values 0xff and + * greater are reserved. + * This command returns -EINVAL if it is not a valid operation for this VCPU. + */ +#define VCPUOP_get_physid 12 /* arg == vcpu_get_physid_t */ +struct vcpu_get_physid { + uint64_t phys_id; +}; +typedef struct vcpu_get_physid vcpu_get_physid_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t); +#define xen_vcpu_physid_to_x86_apicid(physid) \ + ((((uint32_t)(physid)) >= 0xff) ? 0xff : ((uint8_t)(physid))) +#define xen_vcpu_physid_to_x86_acpiid(physid) \ + ((((uint32_t)((physid)>>32)) >= 0xff) ? 0xff : ((uint8_t)((physid)>>32))) #endif /* __XEN_PUBLIC_VCPU_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/xenoprof.h +++ linux-ec2-2.6.32/include/xen/interface/xenoprof.h @@ -0,0 +1,138 @@ +/****************************************************************************** + * xenoprof.h + * + * Interface for enabling system wide profiling based on hardware performance + * counters + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2005 Hewlett-Packard Co. + * Written by Aravind Menon & Jose Renato Santos + */ + +#ifndef __XEN_PUBLIC_XENOPROF_H__ +#define __XEN_PUBLIC_XENOPROF_H__ + +#include "xen.h" + +/* + * Commands to HYPERVISOR_xenoprof_op(). + */ +#define XENOPROF_init 0 +#define XENOPROF_reset_active_list 1 +#define XENOPROF_reset_passive_list 2 +#define XENOPROF_set_active 3 +#define XENOPROF_set_passive 4 +#define XENOPROF_reserve_counters 5 +#define XENOPROF_counter 6 +#define XENOPROF_setup_events 7 +#define XENOPROF_enable_virq 8 +#define XENOPROF_start 9 +#define XENOPROF_stop 10 +#define XENOPROF_disable_virq 11 +#define XENOPROF_release_counters 12 +#define XENOPROF_shutdown 13 +#define XENOPROF_get_buffer 14 +#define XENOPROF_set_backtrace 15 +#define XENOPROF_last_op 15 + +#define MAX_OPROF_EVENTS 32 +#define MAX_OPROF_DOMAINS 25 +#define XENOPROF_CPU_TYPE_SIZE 64 + +/* Xenoprof performance events (not Xen events) */ +struct event_log { + uint64_t eip; + uint8_t mode; + uint8_t event; +}; + +/* PC value that indicates a special code */ +#define XENOPROF_ESCAPE_CODE ~0UL +/* Transient events for the xenoprof->oprofile cpu buf */ +#define XENOPROF_TRACE_BEGIN 1 + +/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */ +struct xenoprof_buf { + uint32_t event_head; + uint32_t event_tail; + uint32_t event_size; + uint32_t vcpu_id; + uint64_t xen_samples; + uint64_t kernel_samples; + uint64_t user_samples; + uint64_t lost_samples; + struct event_log event_log[1]; +}; +#ifndef __XEN__ +typedef struct xenoprof_buf xenoprof_buf_t; +DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t); +#endif + +struct xenoprof_init { + int32_t num_events; + int32_t is_primary; + char cpu_type[XENOPROF_CPU_TYPE_SIZE]; +}; +typedef struct xenoprof_init xenoprof_init_t; +DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t); + +struct xenoprof_get_buffer { + int32_t max_samples; + int32_t nbuf; + int32_t bufsize; + uint64_t buf_gmaddr; +}; +typedef struct xenoprof_get_buffer xenoprof_get_buffer_t; +DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t); + +struct xenoprof_counter { + uint32_t ind; + uint64_t count; + uint32_t enabled; + uint32_t event; + uint32_t hypervisor; + uint32_t kernel; + uint32_t user; + uint64_t unit_mask; +}; +typedef struct xenoprof_counter xenoprof_counter_t; +DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t); + +typedef struct xenoprof_passive { + uint16_t domain_id; + int32_t max_samples; + int32_t nbuf; + int32_t bufsize; + uint64_t buf_gmaddr; +} xenoprof_passive_t; +DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t); + + +#endif /* __XEN_PUBLIC_XENOPROF_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/trace.h +++ linux-ec2-2.6.32/include/xen/interface/trace.h @@ -0,0 +1,208 @@ +/****************************************************************************** + * include/public/trace.h + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Mark Williamson, (C) 2004 Intel Research Cambridge + * Copyright (C) 2005 Bin Ren + */ + +#ifndef __XEN_PUBLIC_TRACE_H__ +#define __XEN_PUBLIC_TRACE_H__ + +#define TRACE_EXTRA_MAX 7 +#define TRACE_EXTRA_SHIFT 28 + +/* Trace classes */ +#define TRC_CLS_SHIFT 16 +#define TRC_GEN 0x0001f000 /* General trace */ +#define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */ +#define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */ +#define TRC_HVM 0x0008f000 /* Xen HVM trace */ +#define TRC_MEM 0x0010f000 /* Xen memory trace */ +#define TRC_PV 0x0020f000 /* Xen PV traces */ +#define TRC_SHADOW 0x0040f000 /* Xen shadow tracing */ +#define TRC_PM 0x0080f000 /* Xen power management trace */ +#define TRC_ALL 0x0ffff000 +#define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff) +#define TRC_HD_CYCLE_FLAG (1UL<<31) +#define TRC_HD_INCLUDES_CYCLE_COUNT(x) ( !!( (x) & TRC_HD_CYCLE_FLAG ) ) +#define TRC_HD_EXTRA(x) (((x)>>TRACE_EXTRA_SHIFT)&TRACE_EXTRA_MAX) + +/* Trace subclasses */ +#define TRC_SUBCLS_SHIFT 12 + +/* trace subclasses for SVM */ +#define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */ +#define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */ + +#define TRC_SCHED_MIN 0x00021000 /* Just runstate changes */ +#define TRC_SCHED_VERBOSE 0x00028000 /* More inclusive scheduling */ + +/* Trace events per class */ +#define TRC_LOST_RECORDS (TRC_GEN + 1) +#define TRC_TRACE_WRAP_BUFFER (TRC_GEN + 2) +#define TRC_TRACE_CPU_CHANGE (TRC_GEN + 3) +#define TRC_TRACE_IRQ (TRC_GEN + 4) + +#define TRC_SCHED_RUNSTATE_CHANGE (TRC_SCHED_MIN + 1) +#define TRC_SCHED_CONTINUE_RUNNING (TRC_SCHED_MIN + 2) +#define TRC_SCHED_DOM_ADD (TRC_SCHED_VERBOSE + 1) +#define TRC_SCHED_DOM_REM (TRC_SCHED_VERBOSE + 2) +#define TRC_SCHED_SLEEP (TRC_SCHED_VERBOSE + 3) +#define TRC_SCHED_WAKE (TRC_SCHED_VERBOSE + 4) +#define TRC_SCHED_YIELD (TRC_SCHED_VERBOSE + 5) +#define TRC_SCHED_BLOCK (TRC_SCHED_VERBOSE + 6) +#define TRC_SCHED_SHUTDOWN (TRC_SCHED_VERBOSE + 7) +#define TRC_SCHED_CTL (TRC_SCHED_VERBOSE + 8) +#define TRC_SCHED_ADJDOM (TRC_SCHED_VERBOSE + 9) +#define TRC_SCHED_SWITCH (TRC_SCHED_VERBOSE + 10) +#define TRC_SCHED_S_TIMER_FN (TRC_SCHED_VERBOSE + 11) +#define TRC_SCHED_T_TIMER_FN (TRC_SCHED_VERBOSE + 12) +#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED_VERBOSE + 13) +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED_VERBOSE + 14) +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15) + +#define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1) +#define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2) +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3) + +#define TRC_PV_HYPERCALL (TRC_PV + 1) +#define TRC_PV_TRAP (TRC_PV + 3) +#define TRC_PV_PAGE_FAULT (TRC_PV + 4) +#define TRC_PV_FORCED_INVALID_OP (TRC_PV + 5) +#define TRC_PV_EMULATE_PRIVOP (TRC_PV + 6) +#define TRC_PV_EMULATE_4GB (TRC_PV + 7) +#define TRC_PV_MATH_STATE_RESTORE (TRC_PV + 8) +#define TRC_PV_PAGING_FIXUP (TRC_PV + 9) +#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10) +#define TRC_PV_PTWR_EMULATION (TRC_PV + 11) +#define TRC_PV_PTWR_EMULATION_PAE (TRC_PV + 12) + /* Indicates that addresses in trace record are 64 bits */ +#define TRC_64_FLAG (0x100) + +#define TRC_SHADOW_NOT_SHADOW (TRC_SHADOW + 1) +#define TRC_SHADOW_FAST_PROPAGATE (TRC_SHADOW + 2) +#define TRC_SHADOW_FAST_MMIO (TRC_SHADOW + 3) +#define TRC_SHADOW_FALSE_FAST_PATH (TRC_SHADOW + 4) +#define TRC_SHADOW_MMIO (TRC_SHADOW + 5) +#define TRC_SHADOW_FIXUP (TRC_SHADOW + 6) +#define TRC_SHADOW_DOMF_DYING (TRC_SHADOW + 7) +#define TRC_SHADOW_EMULATE (TRC_SHADOW + 8) +#define TRC_SHADOW_EMULATE_UNSHADOW_USER (TRC_SHADOW + 9) +#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ (TRC_SHADOW + 10) +#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11) +#define TRC_SHADOW_WRMAP_BF (TRC_SHADOW + 12) +#define TRC_SHADOW_PREALLOC_UNPIN (TRC_SHADOW + 13) +#define TRC_SHADOW_RESYNC_FULL (TRC_SHADOW + 14) +#define TRC_SHADOW_RESYNC_ONLY (TRC_SHADOW + 15) + +/* trace events per subclass */ +#define TRC_HVM_VMENTRY (TRC_HVM_ENTRYEXIT + 0x01) +#define TRC_HVM_VMEXIT (TRC_HVM_ENTRYEXIT + 0x02) +#define TRC_HVM_VMEXIT64 (TRC_HVM_ENTRYEXIT + TRC_64_FLAG + 0x02) +#define TRC_HVM_PF_XEN (TRC_HVM_HANDLER + 0x01) +#define TRC_HVM_PF_XEN64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x01) +#define TRC_HVM_PF_INJECT (TRC_HVM_HANDLER + 0x02) +#define TRC_HVM_PF_INJECT64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x02) +#define TRC_HVM_INJ_EXC (TRC_HVM_HANDLER + 0x03) +#define TRC_HVM_INJ_VIRQ (TRC_HVM_HANDLER + 0x04) +#define TRC_HVM_REINJ_VIRQ (TRC_HVM_HANDLER + 0x05) +#define TRC_HVM_IO_READ (TRC_HVM_HANDLER + 0x06) +#define TRC_HVM_IO_WRITE (TRC_HVM_HANDLER + 0x07) +#define TRC_HVM_CR_READ (TRC_HVM_HANDLER + 0x08) +#define TRC_HVM_CR_READ64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x08) +#define TRC_HVM_CR_WRITE (TRC_HVM_HANDLER + 0x09) +#define TRC_HVM_CR_WRITE64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x09) +#define TRC_HVM_DR_READ (TRC_HVM_HANDLER + 0x0A) +#define TRC_HVM_DR_WRITE (TRC_HVM_HANDLER + 0x0B) +#define TRC_HVM_MSR_READ (TRC_HVM_HANDLER + 0x0C) +#define TRC_HVM_MSR_WRITE (TRC_HVM_HANDLER + 0x0D) +#define TRC_HVM_CPUID (TRC_HVM_HANDLER + 0x0E) +#define TRC_HVM_INTR (TRC_HVM_HANDLER + 0x0F) +#define TRC_HVM_NMI (TRC_HVM_HANDLER + 0x10) +#define TRC_HVM_SMI (TRC_HVM_HANDLER + 0x11) +#define TRC_HVM_VMMCALL (TRC_HVM_HANDLER + 0x12) +#define TRC_HVM_HLT (TRC_HVM_HANDLER + 0x13) +#define TRC_HVM_INVLPG (TRC_HVM_HANDLER + 0x14) +#define TRC_HVM_INVLPG64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14) +#define TRC_HVM_MCE (TRC_HVM_HANDLER + 0x15) +#define TRC_HVM_IOPORT_READ (TRC_HVM_HANDLER + 0x16) +#define TRC_HVM_IOMEM_READ (TRC_HVM_HANDLER + 0x17) +#define TRC_HVM_CLTS (TRC_HVM_HANDLER + 0x18) +#define TRC_HVM_LMSW (TRC_HVM_HANDLER + 0x19) +#define TRC_HVM_LMSW64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19) +#define TRC_HVM_INTR_WINDOW (TRC_HVM_HANDLER + 0x20) +#define TRC_HVM_IOPORT_WRITE (TRC_HVM_HANDLER + 0x216) +#define TRC_HVM_IOMEM_WRITE (TRC_HVM_HANDLER + 0x217) + +/* trace subclasses for power management */ +#define TRC_PM_FREQ 0x00801000 /* xen cpu freq events */ +#define TRC_PM_IDLE 0x00802000 /* xen cpu idle events */ + +/* trace events for per class */ +#define TRC_PM_FREQ_CHANGE (TRC_PM_FREQ + 0x01) +#define TRC_PM_IDLE_ENTRY (TRC_PM_IDLE + 0x01) +#define TRC_PM_IDLE_EXIT (TRC_PM_IDLE + 0x02) + +/* This structure represents a single trace buffer record. */ +struct t_rec { + uint32_t event:28; + uint32_t extra_u32:3; /* # entries in trailing extra_u32[] array */ + uint32_t cycles_included:1; /* u.cycles or u.no_cycles? */ + union { + struct { + uint32_t cycles_lo, cycles_hi; /* cycle counter timestamp */ + uint32_t extra_u32[7]; /* event data items */ + } cycles; + struct { + uint32_t extra_u32[7]; /* event data items */ + } nocycles; + } u; +}; + +/* + * This structure contains the metadata for a single trace buffer. The head + * field, indexes into an array of struct t_rec's. + */ +struct t_buf { + /* Assume the data buffer size is X. X is generally not a power of 2. + * CONS and PROD are incremented modulo (2*X): + * 0 <= cons < 2*X + * 0 <= prod < 2*X + * This is done because addition modulo X breaks at 2^32 when X is not a + * power of 2: + * (((2^32 - 1) % X) + 1) % X != (2^32) % X + */ + uint32_t cons; /* Offset of next item to be consumed by control tools. */ + uint32_t prod; /* Offset of next item to be produced by Xen. */ + /* Records follow immediately after the meta-data header. */ +}; + +#endif /* __XEN_PUBLIC_TRACE_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/sysctl.h +++ linux-ec2-2.6.32/include/xen/interface/sysctl.h @@ -0,0 +1,490 @@ +/****************************************************************************** + * sysctl.h + * + * System management operations. For use by node control stack. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2002-2006, K Fraser + */ + +#ifndef __XEN_PUBLIC_SYSCTL_H__ +#define __XEN_PUBLIC_SYSCTL_H__ + +#if !defined(__XEN__) && !defined(__XEN_TOOLS__) +#error "sysctl operations are intended for use by node control tools only" +#endif + +#include "xen.h" +#include "domctl.h" + +#define XEN_SYSCTL_INTERFACE_VERSION 0x00000006 + +/* + * Read console content from Xen buffer ring. + */ +#define XEN_SYSCTL_readconsole 1 +struct xen_sysctl_readconsole { + /* IN: Non-zero -> clear after reading. */ + uint8_t clear; + /* IN: Non-zero -> start index specified by @index field. */ + uint8_t incremental; + uint8_t pad0, pad1; + /* + * IN: Start index for consuming from ring buffer (if @incremental); + * OUT: End index after consuming from ring buffer. + */ + uint32_t index; + /* IN: Virtual address to write console data. */ + XEN_GUEST_HANDLE_64(char) buffer; + /* IN: Size of buffer; OUT: Bytes written to buffer. */ + uint32_t count; +}; +typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t); + +/* Get trace buffers machine base address */ +#define XEN_SYSCTL_tbuf_op 2 +struct xen_sysctl_tbuf_op { + /* IN variables */ +#define XEN_SYSCTL_TBUFOP_get_info 0 +#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1 +#define XEN_SYSCTL_TBUFOP_set_evt_mask 2 +#define XEN_SYSCTL_TBUFOP_set_size 3 +#define XEN_SYSCTL_TBUFOP_enable 4 +#define XEN_SYSCTL_TBUFOP_disable 5 + uint32_t cmd; + /* IN/OUT variables */ + struct xenctl_cpumap cpu_mask; + uint32_t evt_mask; + /* OUT variables */ + uint64_aligned_t buffer_mfn; + uint32_t size; +}; +typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t); + +/* + * Get physical information about the host machine + */ +#define XEN_SYSCTL_physinfo 3 + /* (x86) The platform supports HVM guests. */ +#define _XEN_SYSCTL_PHYSCAP_hvm 0 +#define XEN_SYSCTL_PHYSCAP_hvm (1u<<_XEN_SYSCTL_PHYSCAP_hvm) + /* (x86) The platform supports HVM-guest direct access to I/O devices. */ +#define _XEN_SYSCTL_PHYSCAP_hvm_directio 1 +#define XEN_SYSCTL_PHYSCAP_hvm_directio (1u<<_XEN_SYSCTL_PHYSCAP_hvm_directio) +struct xen_sysctl_physinfo { + uint32_t threads_per_core; + uint32_t cores_per_socket; + uint32_t nr_cpus; + uint32_t nr_nodes; + uint32_t cpu_khz; + uint64_aligned_t total_pages; + uint64_aligned_t free_pages; + uint64_aligned_t scrub_pages; + uint32_t hw_cap[8]; + + /* + * IN: maximum addressable entry in the caller-provided cpu_to_node array. + * OUT: largest cpu identifier in the system. + * If OUT is greater than IN then the cpu_to_node array is truncated! + */ + uint32_t max_cpu_id; + /* + * If not NULL, this array is filled with node identifier for each cpu. + * If a cpu has no node information (e.g., cpu not present) then the + * sentinel value ~0u is written. + * The size of this array is specified by the caller in @max_cpu_id. + * If the actual @max_cpu_id is smaller than the array then the trailing + * elements of the array will not be written by the sysctl. + */ + XEN_GUEST_HANDLE_64(uint32) cpu_to_node; + + /* XEN_SYSCTL_PHYSCAP_??? */ + uint32_t capabilities; +}; +typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t); + +/* + * Get the ID of the current scheduler. + */ +#define XEN_SYSCTL_sched_id 4 +struct xen_sysctl_sched_id { + /* OUT variable */ + uint32_t sched_id; +}; +typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t); + +/* Interface for controlling Xen software performance counters. */ +#define XEN_SYSCTL_perfc_op 5 +/* Sub-operations: */ +#define XEN_SYSCTL_PERFCOP_reset 1 /* Reset all counters to zero. */ +#define XEN_SYSCTL_PERFCOP_query 2 /* Get perfctr information. */ +struct xen_sysctl_perfc_desc { + char name[80]; /* name of perf counter */ + uint32_t nr_vals; /* number of values for this counter */ +}; +typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t); +typedef uint32_t xen_sysctl_perfc_val_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t); + +struct xen_sysctl_perfc_op { + /* IN variables. */ + uint32_t cmd; /* XEN_SYSCTL_PERFCOP_??? */ + /* OUT variables. */ + uint32_t nr_counters; /* number of counters description */ + uint32_t nr_vals; /* number of values */ + /* counter information (or NULL) */ + XEN_GUEST_HANDLE_64(xen_sysctl_perfc_desc_t) desc; + /* counter values (or NULL) */ + XEN_GUEST_HANDLE_64(xen_sysctl_perfc_val_t) val; +}; +typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t); + +#define XEN_SYSCTL_getdomaininfolist 6 +struct xen_sysctl_getdomaininfolist { + /* IN variables. */ + domid_t first_domain; + uint32_t max_domains; + XEN_GUEST_HANDLE_64(xen_domctl_getdomaininfo_t) buffer; + /* OUT variables. */ + uint32_t num_domains; +}; +typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t); + +/* Inject debug keys into Xen. */ +#define XEN_SYSCTL_debug_keys 7 +struct xen_sysctl_debug_keys { + /* IN variables. */ + XEN_GUEST_HANDLE_64(char) keys; + uint32_t nr_keys; +}; +typedef struct xen_sysctl_debug_keys xen_sysctl_debug_keys_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_debug_keys_t); + +/* Get physical CPU information. */ +#define XEN_SYSCTL_getcpuinfo 8 +struct xen_sysctl_cpuinfo { + uint64_aligned_t idletime; +}; +typedef struct xen_sysctl_cpuinfo xen_sysctl_cpuinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpuinfo_t); +struct xen_sysctl_getcpuinfo { + /* IN variables. */ + uint32_t max_cpus; + XEN_GUEST_HANDLE_64(xen_sysctl_cpuinfo_t) info; + /* OUT variables. */ + uint32_t nr_cpus; +}; +typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t); + +#define XEN_SYSCTL_availheap 9 +struct xen_sysctl_availheap { + /* IN variables. */ + uint32_t min_bitwidth; /* Smallest address width (zero if don't care). */ + uint32_t max_bitwidth; /* Largest address width (zero if don't care). */ + int32_t node; /* NUMA node of interest (-1 for all nodes). */ + /* OUT variables. */ + uint64_aligned_t avail_bytes;/* Bytes available in the specified region. */ +}; +typedef struct xen_sysctl_availheap xen_sysctl_availheap_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t); + +#define XEN_SYSCTL_get_pmstat 10 +struct pm_px_val { + uint64_aligned_t freq; /* Px core frequency */ + uint64_aligned_t residency; /* Px residency time */ + uint64_aligned_t count; /* Px transition count */ +}; +typedef struct pm_px_val pm_px_val_t; +DEFINE_XEN_GUEST_HANDLE(pm_px_val_t); + +struct pm_px_stat { + uint8_t total; /* total Px states */ + uint8_t usable; /* usable Px states */ + uint8_t last; /* last Px state */ + uint8_t cur; /* current Px state */ + XEN_GUEST_HANDLE_64(uint64) trans_pt; /* Px transition table */ + XEN_GUEST_HANDLE_64(pm_px_val_t) pt; +}; +typedef struct pm_px_stat pm_px_stat_t; +DEFINE_XEN_GUEST_HANDLE(pm_px_stat_t); + +struct pm_cx_stat { + uint32_t nr; /* entry nr in triggers & residencies, including C0 */ + uint32_t last; /* last Cx state */ + uint64_aligned_t idle_time; /* idle time from boot */ + XEN_GUEST_HANDLE_64(uint64) triggers; /* Cx trigger counts */ + XEN_GUEST_HANDLE_64(uint64) residencies; /* Cx residencies */ +}; + +struct xen_sysctl_get_pmstat { +#define PMSTAT_CATEGORY_MASK 0xf0 +#define PMSTAT_PX 0x10 +#define PMSTAT_CX 0x20 +#define PMSTAT_get_max_px (PMSTAT_PX | 0x1) +#define PMSTAT_get_pxstat (PMSTAT_PX | 0x2) +#define PMSTAT_reset_pxstat (PMSTAT_PX | 0x3) +#define PMSTAT_get_max_cx (PMSTAT_CX | 0x1) +#define PMSTAT_get_cxstat (PMSTAT_CX | 0x2) +#define PMSTAT_reset_cxstat (PMSTAT_CX | 0x3) + uint32_t type; + uint32_t cpuid; + union { + struct pm_px_stat getpx; + struct pm_cx_stat getcx; + /* other struct for tx, etc */ + } u; +}; +typedef struct xen_sysctl_get_pmstat xen_sysctl_get_pmstat_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_get_pmstat_t); + +/* + * Status codes. Must be greater than 0 to avoid confusing + * sysctl callers that see 0 as a plain successful return. + */ +#define XEN_CPU_HOTPLUG_STATUS_OFFLINE 1 +#define XEN_CPU_HOTPLUG_STATUS_ONLINE 2 +#define XEN_CPU_HOTPLUG_STATUS_NEW 3 + +#define XEN_SYSCTL_cpu_hotplug 11 +struct xen_sysctl_cpu_hotplug { + /* IN variables */ + uint32_t cpu; /* Physical cpu. */ +#define XEN_SYSCTL_CPU_HOTPLUG_ONLINE 0 +#define XEN_SYSCTL_CPU_HOTPLUG_OFFLINE 1 +#define XEN_SYSCTL_CPU_HOTPLUG_STATUS 2 + uint32_t op; /* hotplug opcode */ +}; +typedef struct xen_sysctl_cpu_hotplug xen_sysctl_cpu_hotplug_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpu_hotplug_t); + +/* + * Get/set xen power management, include + * 1. cpufreq governors and related parameters + */ +#define XEN_SYSCTL_pm_op 12 +struct xen_userspace { + uint32_t scaling_setspeed; +}; +typedef struct xen_userspace xen_userspace_t; + +struct xen_ondemand { + uint32_t sampling_rate_max; + uint32_t sampling_rate_min; + + uint32_t sampling_rate; + uint32_t up_threshold; +}; +typedef struct xen_ondemand xen_ondemand_t; + +/* + * cpufreq para name of this structure named + * same as sysfs file name of native linux + */ +#define CPUFREQ_NAME_LEN 16 +struct xen_get_cpufreq_para { + /* IN/OUT variable */ + uint32_t cpu_num; + uint32_t freq_num; + uint32_t gov_num; + + /* for all governors */ + /* OUT variable */ + XEN_GUEST_HANDLE_64(uint32) affected_cpus; + XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies; + XEN_GUEST_HANDLE_64(char) scaling_available_governors; + char scaling_driver[CPUFREQ_NAME_LEN]; + + uint32_t cpuinfo_cur_freq; + uint32_t cpuinfo_max_freq; + uint32_t cpuinfo_min_freq; + uint32_t scaling_cur_freq; + + char scaling_governor[CPUFREQ_NAME_LEN]; + uint32_t scaling_max_freq; + uint32_t scaling_min_freq; + + /* for specific governor */ + union { + struct xen_userspace userspace; + struct xen_ondemand ondemand; + } u; +}; + +struct xen_set_cpufreq_gov { + char scaling_governor[CPUFREQ_NAME_LEN]; +}; + +struct xen_set_cpufreq_para { + #define SCALING_MAX_FREQ 1 + #define SCALING_MIN_FREQ 2 + #define SCALING_SETSPEED 3 + #define SAMPLING_RATE 4 + #define UP_THRESHOLD 5 + + uint32_t ctrl_type; + uint32_t ctrl_value; +}; + +/* Get physical CPU topology information. */ +#define INVALID_TOPOLOGY_ID (~0U) +struct xen_get_cputopo { + /* IN: maximum addressable entry in + * the caller-provided cpu_to_core/socket. + */ + uint32_t max_cpus; + XEN_GUEST_HANDLE_64(uint32) cpu_to_core; + XEN_GUEST_HANDLE_64(uint32) cpu_to_socket; + + /* OUT: number of cpus returned + * If OUT is greater than IN then the cpu_to_core/socket is truncated! + */ + uint32_t nr_cpus; +}; + +struct xen_sysctl_pm_op { + #define PM_PARA_CATEGORY_MASK 0xf0 + #define CPUFREQ_PARA 0x10 + + /* cpufreq command type */ + #define GET_CPUFREQ_PARA (CPUFREQ_PARA | 0x01) + #define SET_CPUFREQ_GOV (CPUFREQ_PARA | 0x02) + #define SET_CPUFREQ_PARA (CPUFREQ_PARA | 0x03) + #define GET_CPUFREQ_AVGFREQ (CPUFREQ_PARA | 0x04) + + /* get CPU topology */ + #define XEN_SYSCTL_pm_op_get_cputopo 0x20 + + /* set/reset scheduler power saving option */ + #define XEN_SYSCTL_pm_op_set_sched_opt_smt 0x21 + + /* cpuidle max_cstate access command */ + #define XEN_SYSCTL_pm_op_get_max_cstate 0x22 + #define XEN_SYSCTL_pm_op_set_max_cstate 0x23 + + /* set scheduler migration cost value */ + #define XEN_SYSCTL_pm_op_set_vcpu_migration_delay 0x24 + #define XEN_SYSCTL_pm_op_get_vcpu_migration_delay 0x25 + + uint32_t cmd; + uint32_t cpuid; + union { + struct xen_get_cpufreq_para get_para; + struct xen_set_cpufreq_gov set_gov; + struct xen_set_cpufreq_para set_para; + uint64_t get_avgfreq; + struct xen_get_cputopo get_topo; + uint32_t set_sched_opt_smt; + uint32_t get_max_cstate; + uint32_t set_max_cstate; + uint32_t get_vcpu_migration_delay; + uint32_t set_vcpu_migration_delay; + }; +}; + +#define XEN_SYSCTL_page_offline_op 14 +struct xen_sysctl_page_offline_op { + /* IN: range of page to be offlined */ +#define sysctl_page_offline 1 +#define sysctl_page_online 2 +#define sysctl_query_page_offline 3 + uint32_t cmd; + uint32_t start; + uint32_t end; + /* OUT: result of page offline request */ + /* + * bit 0~15: result flags + * bit 16~31: owner + */ + XEN_GUEST_HANDLE(uint32) status; +}; + +#define PG_OFFLINE_STATUS_MASK (0xFFUL) + +/* The result is invalid, i.e. HV does not handle it */ +#define PG_OFFLINE_INVALID (0x1UL << 0) + +#define PG_OFFLINE_OFFLINED (0x1UL << 1) +#define PG_OFFLINE_PENDING (0x1UL << 2) +#define PG_OFFLINE_FAILED (0x1UL << 3) + +#define PG_ONLINE_FAILED PG_OFFLINE_FAILED +#define PG_ONLINE_ONLINED PG_OFFLINE_OFFLINED + +#define PG_OFFLINE_STATUS_OFFLINED (0x1UL << 1) +#define PG_OFFLINE_STATUS_ONLINE (0x1UL << 2) +#define PG_OFFLINE_STATUS_OFFLINE_PENDING (0x1UL << 3) +#define PG_OFFLINE_STATUS_BROKEN (0x1UL << 4) + +#define PG_OFFLINE_MISC_MASK (0xFFUL << 4) + +/* only valid when PG_OFFLINE_FAILED */ +#define PG_OFFLINE_XENPAGE (0x1UL << 8) +#define PG_OFFLINE_DOM0PAGE (0x1UL << 9) +#define PG_OFFLINE_ANONYMOUS (0x1UL << 10) +#define PG_OFFLINE_NOT_CONV_RAM (0x1UL << 11) +#define PG_OFFLINE_OWNED (0x1UL << 12) + +#define PG_OFFLINE_BROKEN (0x1UL << 13) +#define PG_ONLINE_BROKEN PG_OFFLINE_BROKEN + +#define PG_OFFLINE_OWNER_SHIFT 16 + +struct xen_sysctl { + uint32_t cmd; + uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */ + union { + struct xen_sysctl_readconsole readconsole; + struct xen_sysctl_tbuf_op tbuf_op; + struct xen_sysctl_physinfo physinfo; + struct xen_sysctl_sched_id sched_id; + struct xen_sysctl_perfc_op perfc_op; + struct xen_sysctl_getdomaininfolist getdomaininfolist; + struct xen_sysctl_debug_keys debug_keys; + struct xen_sysctl_getcpuinfo getcpuinfo; + struct xen_sysctl_availheap availheap; + struct xen_sysctl_get_pmstat get_pmstat; + struct xen_sysctl_cpu_hotplug cpu_hotplug; + struct xen_sysctl_pm_op pm_op; + struct xen_sysctl_page_offline_op page_offline; + uint8_t pad[128]; + } u; +}; +typedef struct xen_sysctl xen_sysctl_t; +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t); + +#endif /* __XEN_PUBLIC_SYSCTL_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/xen-compat.h +++ linux-ec2-2.6.32/include/xen/interface/xen-compat.h @@ -0,0 +1,44 @@ +/****************************************************************************** + * xen-compat.h + * + * Guest OS interface to Xen. Compatibility layer. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2006, Christian Limpach + */ + +#ifndef __XEN_PUBLIC_XEN_COMPAT_H__ +#define __XEN_PUBLIC_XEN_COMPAT_H__ + +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030209 + +#if defined(__XEN__) || defined(__XEN_TOOLS__) +/* Xen is built with matching headers and implements the latest interface. */ +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__ +#elif !defined(__XEN_INTERFACE_VERSION__) +/* Guests which do not specify a version get the legacy interface. */ +#define __XEN_INTERFACE_VERSION__ 0x00000000 +#endif + +#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__ +#error "These header files do not support the requested interface version." +#endif + +#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/features.h +++ linux-ec2-2.6.32/include/xen/interface/features.h @@ -3,6 +3,24 @@ * * Feature flags, reported by XENVER_get_features. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2006, Keir Fraser */ @@ -41,6 +59,15 @@ /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ #define XENFEAT_mmu_pt_update_preserve_ad 5 +/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */ +#define XENFEAT_highmem_assist 6 + +/* + * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel + * available pte bits. + */ +#define XENFEAT_gnttab_map_avail_bits 7 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/sched.h +++ linux-ec2-2.6.32/include/xen/interface/sched.h @@ -3,6 +3,24 @@ * * Scheduler state interactions * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2005, Keir Fraser */ @@ -13,17 +31,17 @@ /* * The prototype for this hypercall is: - * long sched_op_new(int cmd, void *arg) + * long sched_op(int cmd, void *arg) * @cmd == SCHEDOP_??? (scheduler operation). * @arg == Operation-specific extra argument(s), as described below. * - * **NOTE**: - * Versions of Xen prior to 3.0.2 provide only the following legacy version + * Versions of Xen prior to 3.0.2 provided only the following legacy version * of this hypercall, supporting only the commands yield, block and shutdown: * long sched_op(int cmd, unsigned long arg) * @cmd == SCHEDOP_??? (scheduler operation). * @arg == 0 (SCHEDOP_yield and SCHEDOP_block) * == SHUTDOWN_* code (SCHEDOP_shutdown) + * This legacy version is available to new guests as sched_op_compat(). */ /* @@ -50,6 +68,8 @@ unsigned int reason; /* SHUTDOWN_* */ }; DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown); +typedef struct sched_shutdown sched_shutdown_t; +DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t); /* * Poll a set of event-channel ports. Return when one or more are pending. An @@ -58,11 +78,27 @@ */ #define SCHEDOP_poll 3 struct sched_poll { - GUEST_HANDLE(evtchn_port_t) ports; + XEN_GUEST_HANDLE(evtchn_port_t) ports; unsigned int nr_ports; uint64_t timeout; }; DEFINE_GUEST_HANDLE_STRUCT(sched_poll); +typedef struct sched_poll sched_poll_t; +DEFINE_XEN_GUEST_HANDLE(sched_poll_t); + +/* + * Declare a shutdown for another domain. The main use of this function is + * in interpreting shutdown requests and reasons for fully-virtualized + * domains. A para-virtualized domain may use SCHEDOP_shutdown directly. + * @arg == pointer to sched_remote_shutdown structure. + */ +#define SCHEDOP_remote_shutdown 4 +struct sched_remote_shutdown { + domid_t domain_id; /* Remote domain ID */ + unsigned int reason; /* SHUTDOWN_xxx reason */ +}; +typedef struct sched_remote_shutdown sched_remote_shutdown_t; +DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t); /* * Reason codes for SCHEDOP_shutdown. These may be interpreted by control --- linux-ec2-2.6.32.orig/include/xen/interface/dom0_ops.h +++ linux-ec2-2.6.32/include/xen/interface/dom0_ops.h @@ -0,0 +1,120 @@ +/****************************************************************************** + * dom0_ops.h + * + * Process command requests from domain-0 guest OS. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2002-2003, B Dragovic + * Copyright (c) 2002-2006, K Fraser + */ + +#ifndef __XEN_PUBLIC_DOM0_OPS_H__ +#define __XEN_PUBLIC_DOM0_OPS_H__ + +#include "xen.h" +#include "platform.h" + +#if __XEN_INTERFACE_VERSION__ >= 0x00030204 +#error "dom0_ops.h is a compatibility interface only" +#endif + +#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION + +#define DOM0_SETTIME XENPF_settime +#define dom0_settime xenpf_settime +#define dom0_settime_t xenpf_settime_t + +#define DOM0_ADD_MEMTYPE XENPF_add_memtype +#define dom0_add_memtype xenpf_add_memtype +#define dom0_add_memtype_t xenpf_add_memtype_t + +#define DOM0_DEL_MEMTYPE XENPF_del_memtype +#define dom0_del_memtype xenpf_del_memtype +#define dom0_del_memtype_t xenpf_del_memtype_t + +#define DOM0_READ_MEMTYPE XENPF_read_memtype +#define dom0_read_memtype xenpf_read_memtype +#define dom0_read_memtype_t xenpf_read_memtype_t + +#define DOM0_MICROCODE XENPF_microcode_update +#define dom0_microcode xenpf_microcode_update +#define dom0_microcode_t xenpf_microcode_update_t + +#define DOM0_PLATFORM_QUIRK XENPF_platform_quirk +#define dom0_platform_quirk xenpf_platform_quirk +#define dom0_platform_quirk_t xenpf_platform_quirk_t + +typedef uint64_t cpumap_t; + +/* Unsupported legacy operation -- defined for API compatibility. */ +#define DOM0_MSR 15 +struct dom0_msr { + /* IN variables. */ + uint32_t write; + cpumap_t cpu_mask; + uint32_t msr; + uint32_t in1; + uint32_t in2; + /* OUT variables. */ + uint32_t out1; + uint32_t out2; +}; +typedef struct dom0_msr dom0_msr_t; +DEFINE_XEN_GUEST_HANDLE(dom0_msr_t); + +/* Unsupported legacy operation -- defined for API compatibility. */ +#define DOM0_PHYSICAL_MEMORY_MAP 40 +struct dom0_memory_map_entry { + uint64_t start, end; + uint32_t flags; /* reserved */ + uint8_t is_ram; +}; +typedef struct dom0_memory_map_entry dom0_memory_map_entry_t; +DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t); + +struct dom0_op { + uint32_t cmd; + uint32_t interface_version; /* DOM0_INTERFACE_VERSION */ + union { + struct dom0_msr msr; + struct dom0_settime settime; + struct dom0_add_memtype add_memtype; + struct dom0_del_memtype del_memtype; + struct dom0_read_memtype read_memtype; + struct dom0_microcode microcode; + struct dom0_platform_quirk platform_quirk; + struct dom0_memory_map_entry physical_memory_map; + uint8_t pad[128]; + } u; +}; +typedef struct dom0_op dom0_op_t; +DEFINE_XEN_GUEST_HANDLE(dom0_op_t); + +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/kexec.h +++ linux-ec2-2.6.32/include/xen/interface/kexec.h @@ -0,0 +1,168 @@ +/****************************************************************************** + * kexec.h - Public portion + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Xen port written by: + * - Simon 'Horms' Horman + * - Magnus Damm + */ + +#ifndef _XEN_PUBLIC_KEXEC_H +#define _XEN_PUBLIC_KEXEC_H + + +/* This file describes the Kexec / Kdump hypercall interface for Xen. + * + * Kexec under vanilla Linux allows a user to reboot the physical machine + * into a new user-specified kernel. The Xen port extends this idea + * to allow rebooting of the machine from dom0. When kexec for dom0 + * is used to reboot, both the hypervisor and the domains get replaced + * with some other kernel. It is possible to kexec between vanilla + * Linux and Xen and back again. Xen to Xen works well too. + * + * The hypercall interface for kexec can be divided into three main + * types of hypercall operations: + * + * 1) Range information: + * This is used by the dom0 kernel to ask the hypervisor about various + * address information. This information is needed to allow kexec-tools + * to fill in the ELF headers for /proc/vmcore properly. + * + * 2) Load and unload of images: + * There are no big surprises here, the kexec binary from kexec-tools + * runs in userspace in dom0. The tool loads/unloads data into the + * dom0 kernel such as new kernel, initramfs and hypervisor. When + * loaded the dom0 kernel performs a load hypercall operation, and + * before releasing all page references the dom0 kernel calls unload. + * + * 3) Kexec operation: + * This is used to start a previously loaded kernel. + */ + +#include "xen.h" + +#if defined(__i386__) || defined(__x86_64__) +#define KEXEC_XEN_NO_PAGES 17 +#endif + +/* + * Prototype for this hypercall is: + * int kexec_op(int cmd, void *args) + * @cmd == KEXEC_CMD_... + * KEXEC operation to perform + * @args == Operation-specific extra arguments (NULL if none). + */ + +/* + * Kexec supports two types of operation: + * - kexec into a regular kernel, very similar to a standard reboot + * - KEXEC_TYPE_DEFAULT is used to specify this type + * - kexec into a special "crash kernel", aka kexec-on-panic + * - KEXEC_TYPE_CRASH is used to specify this type + * - parts of our system may be broken at kexec-on-panic time + * - the code should be kept as simple and self-contained as possible + */ + +#define KEXEC_TYPE_DEFAULT 0 +#define KEXEC_TYPE_CRASH 1 + + +/* The kexec implementation for Xen allows the user to load two + * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH. + * All data needed for a kexec reboot is kept in one xen_kexec_image_t + * per "instance". The data mainly consists of machine address lists to pages + * together with destination addresses. The data in xen_kexec_image_t + * is passed to the "code page" which is one page of code that performs + * the final relocations before jumping to the new kernel. + */ + +typedef struct xen_kexec_image { +#if defined(__i386__) || defined(__x86_64__) + unsigned long page_list[KEXEC_XEN_NO_PAGES]; +#endif +#if defined(__ia64__) + unsigned long reboot_code_buffer; +#endif + unsigned long indirection_page; + unsigned long start_address; +} xen_kexec_image_t; + +/* + * Perform kexec having previously loaded a kexec or kdump kernel + * as appropriate. + * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in] + */ +#define KEXEC_CMD_kexec 0 +typedef struct xen_kexec_exec { + int type; +} xen_kexec_exec_t; + +/* + * Load/Unload kernel image for kexec or kdump. + * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in] + * image == relocation information for kexec (ignored for unload) [in] + */ +#define KEXEC_CMD_kexec_load 1 +#define KEXEC_CMD_kexec_unload 2 +typedef struct xen_kexec_load { + int type; + xen_kexec_image_t image; +} xen_kexec_load_t; + +#define KEXEC_RANGE_MA_CRASH 0 /* machine address and size of crash area */ +#define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */ +#define KEXEC_RANGE_MA_CPU 2 /* machine address and size of a CPU note */ +#define KEXEC_RANGE_MA_XENHEAP 3 /* machine address and size of xenheap + * Note that although this is adjacent + * to Xen it exists in a separate EFI + * region on ia64, and thus needs to be + * inserted into iomem_machine separately */ +#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of + * the ia64_boot_param */ +#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of + * of the EFI Memory Map */ +#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo */ + +/* + * Find the address and size of certain memory areas + * range == KEXEC_RANGE_... [in] + * nr == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in] + * size == number of bytes reserved in window [out] + * start == address of the first byte in the window [out] + */ +#define KEXEC_CMD_kexec_get_range 3 +typedef struct xen_kexec_range { + int range; + int nr; + unsigned long size; + unsigned long start; +} xen_kexec_range_t; + +#endif /* _XEN_PUBLIC_KEXEC_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86_32.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86_32.h @@ -0,0 +1,27 @@ +/****************************************************************************** + * arch-x86_32.h + * + * Guest OS interface to x86 32-bit Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser + */ + +#include "arch-x86/xen.h" --- linux-ec2-2.6.32.orig/include/xen/interface/nmi.h +++ linux-ec2-2.6.32/include/xen/interface/nmi.h @@ -0,0 +1,80 @@ +/****************************************************************************** + * nmi.h + * + * NMI callback registration and reason codes. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2005, Keir Fraser + */ + +#ifndef __XEN_PUBLIC_NMI_H__ +#define __XEN_PUBLIC_NMI_H__ + +#include "xen.h" + +/* + * NMI reason codes: + * Currently these are x86-specific, stored in arch_shared_info.nmi_reason. + */ + /* I/O-check error reported via ISA port 0x61, bit 6. */ +#define _XEN_NMIREASON_io_error 0 +#define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error) + /* Parity error reported via ISA port 0x61, bit 7. */ +#define _XEN_NMIREASON_parity_error 1 +#define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error) + /* Unknown hardware-generated NMI. */ +#define _XEN_NMIREASON_unknown 2 +#define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown) + +/* + * long nmi_op(unsigned int cmd, void *arg) + * NB. All ops return zero on success, else a negative error code. + */ + +/* + * Register NMI callback for this (calling) VCPU. Currently this only makes + * sense for domain 0, vcpu 0. All other callers will be returned EINVAL. + * arg == pointer to xennmi_callback structure. + */ +#define XENNMI_register_callback 0 +struct xennmi_callback { + unsigned long handler_address; + unsigned long pad; +}; +typedef struct xennmi_callback xennmi_callback_t; +DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t); + +/* + * Deregister NMI callback for this (calling) VCPU. + * arg == NULL. + */ +#define XENNMI_unregister_callback 1 + +#endif /* __XEN_PUBLIC_NMI_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/platform.h +++ linux-ec2-2.6.32/include/xen/interface/platform.h @@ -0,0 +1,357 @@ +/****************************************************************************** + * platform.h + * + * Hardware platform operations. Intended for use by domain-0 kernel. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2002-2006, K Fraser + */ + +#ifndef __XEN_PUBLIC_PLATFORM_H__ +#define __XEN_PUBLIC_PLATFORM_H__ + +#include "xen.h" + +#define XENPF_INTERFACE_VERSION 0x03000001 + +/* + * Set clock such that it would read after 00:00:00 UTC, + * 1 January, 1970 if the current system time was . + */ +#define XENPF_settime 17 +struct xenpf_settime { + /* IN variables. */ + uint32_t secs; + uint32_t nsecs; + uint64_t system_time; +}; +typedef struct xenpf_settime xenpf_settime_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t); + +/* + * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type. + * On x86, @type is an architecture-defined MTRR memory type. + * On success, returns the MTRR that was used (@reg) and a handle that can + * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting. + * (x86-specific). + */ +#define XENPF_add_memtype 31 +struct xenpf_add_memtype { + /* IN variables. */ + xen_pfn_t mfn; + uint64_t nr_mfns; + uint32_t type; + /* OUT variables. */ + uint32_t handle; + uint32_t reg; +}; +typedef struct xenpf_add_memtype xenpf_add_memtype_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t); + +/* + * Tear down an existing memory-range type. If @handle is remembered then it + * should be passed in to accurately tear down the correct setting (in case + * of overlapping memory regions with differing types). If it is not known + * then @handle should be set to zero. In all cases @reg must be set. + * (x86-specific). + */ +#define XENPF_del_memtype 32 +struct xenpf_del_memtype { + /* IN variables. */ + uint32_t handle; + uint32_t reg; +}; +typedef struct xenpf_del_memtype xenpf_del_memtype_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t); + +/* Read current type of an MTRR (x86-specific). */ +#define XENPF_read_memtype 33 +struct xenpf_read_memtype { + /* IN variables. */ + uint32_t reg; + /* OUT variables. */ + xen_pfn_t mfn; + uint64_t nr_mfns; + uint32_t type; +}; +typedef struct xenpf_read_memtype xenpf_read_memtype_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t); + +#define XENPF_microcode_update 35 +struct xenpf_microcode_update { + /* IN variables. */ + XEN_GUEST_HANDLE(const_void) data;/* Pointer to microcode data */ + uint32_t length; /* Length of microcode data. */ +}; +typedef struct xenpf_microcode_update xenpf_microcode_update_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t); + +#define XENPF_platform_quirk 39 +#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */ +#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */ +#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */ +struct xenpf_platform_quirk { + /* IN variables. */ + uint32_t quirk_id; +}; +typedef struct xenpf_platform_quirk xenpf_platform_quirk_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t); + +#define XENPF_firmware_info 50 +#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */ +#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */ +#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */ +struct xenpf_firmware_info { + /* IN variables. */ + uint32_t type; + uint32_t index; + /* OUT variables. */ + union { + struct { + /* Int13, Fn48: Check Extensions Present. */ + uint8_t device; /* %dl: bios device number */ + uint8_t version; /* %ah: major version */ + uint16_t interface_support; /* %cx: support bitmap */ + /* Int13, Fn08: Legacy Get Device Parameters. */ + uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */ + uint8_t legacy_max_head; /* %dh: max head # */ + uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */ + /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */ + /* NB. First uint16_t of buffer must be set to buffer size. */ + XEN_GUEST_HANDLE(void) edd_params; + } disk_info; /* XEN_FW_DISK_INFO */ + struct { + uint8_t device; /* bios device number */ + uint32_t mbr_signature; /* offset 0x1b8 in mbr */ + } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */ + struct { + /* Int10, AX=4F15: Get EDID info. */ + uint8_t capabilities; + uint8_t edid_transfer_time; + /* must refer to 128-byte buffer */ + XEN_GUEST_HANDLE(uint8) edid; + } vbeddc_info; /* XEN_FW_VBEDDC_INFO */ + } u; +}; +typedef struct xenpf_firmware_info xenpf_firmware_info_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t); + +#define XENPF_enter_acpi_sleep 51 +struct xenpf_enter_acpi_sleep { + /* IN variables */ + uint16_t pm1a_cnt_val; /* PM1a control value. */ + uint16_t pm1b_cnt_val; /* PM1b control value. */ + uint32_t sleep_state; /* Which state to enter (Sn). */ + uint32_t flags; /* Must be zero. */ +}; +typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t); + +#define XENPF_change_freq 52 +struct xenpf_change_freq { + /* IN variables */ + uint32_t flags; /* Must be zero. */ + uint32_t cpu; /* Physical cpu. */ + uint64_t freq; /* New frequency (Hz). */ +}; +typedef struct xenpf_change_freq xenpf_change_freq_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t); + +/* + * Get idle times (nanoseconds since boot) for physical CPUs specified in the + * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is + * indexed by CPU number; only entries with the corresponding @cpumap_bitmap + * bit set are written to. On return, @cpumap_bitmap is modified so that any + * non-existent CPUs are cleared. Such CPUs have their @idletime array entry + * cleared. + */ +#define XENPF_getidletime 53 +struct xenpf_getidletime { + /* IN/OUT variables */ + /* IN: CPUs to interrogate; OUT: subset of IN which are present */ + XEN_GUEST_HANDLE(uint8) cpumap_bitmap; + /* IN variables */ + /* Size of cpumap bitmap. */ + uint32_t cpumap_nr_cpus; + /* Must be indexable for every cpu in cpumap_bitmap. */ + XEN_GUEST_HANDLE(uint64) idletime; + /* OUT variables */ + /* System time when the idletime snapshots were taken. */ + uint64_t now; +}; +typedef struct xenpf_getidletime xenpf_getidletime_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t); + +#define XENPF_set_processor_pminfo 54 + +/* ability bits */ +#define XEN_PROCESSOR_PM_CX 1 +#define XEN_PROCESSOR_PM_PX 2 +#define XEN_PROCESSOR_PM_TX 4 + +/* cmd type */ +#define XEN_PM_CX 0 +#define XEN_PM_PX 1 +#define XEN_PM_TX 2 + +/* Px sub info type */ +#define XEN_PX_PCT 1 +#define XEN_PX_PSS 2 +#define XEN_PX_PPC 4 +#define XEN_PX_PSD 8 + +struct xen_power_register { + uint32_t space_id; + uint32_t bit_width; + uint32_t bit_offset; + uint32_t access_size; + uint64_t address; +}; + +struct xen_processor_csd { + uint32_t domain; /* domain number of one dependent group */ + uint32_t coord_type; /* coordination type */ + uint32_t num; /* number of processors in same domain */ +}; +typedef struct xen_processor_csd xen_processor_csd_t; +DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t); + +struct xen_processor_cx { + struct xen_power_register reg; /* GAS for Cx trigger register */ + uint8_t type; /* cstate value, c0: 0, c1: 1, ... */ + uint32_t latency; /* worst latency (ms) to enter/exit this cstate */ + uint32_t power; /* average power consumption(mW) */ + uint32_t dpcnt; /* number of dependency entries */ + XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */ +}; +typedef struct xen_processor_cx xen_processor_cx_t; +DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t); + +struct xen_processor_flags { + uint32_t bm_control:1; + uint32_t bm_check:1; + uint32_t has_cst:1; + uint32_t power_setup_done:1; + uint32_t bm_rld_set:1; +}; + +struct xen_processor_power { + uint32_t count; /* number of C state entries in array below */ + struct xen_processor_flags flags; /* global flags of this processor */ + XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */ +}; + +struct xen_pct_register { + uint8_t descriptor; + uint16_t length; + uint8_t space_id; + uint8_t bit_width; + uint8_t bit_offset; + uint8_t reserved; + uint64_t address; +}; + +struct xen_processor_px { + uint64_t core_frequency; /* megahertz */ + uint64_t power; /* milliWatts */ + uint64_t transition_latency; /* microseconds */ + uint64_t bus_master_latency; /* microseconds */ + uint64_t control; /* control value */ + uint64_t status; /* success indicator */ +}; +typedef struct xen_processor_px xen_processor_px_t; +DEFINE_XEN_GUEST_HANDLE(xen_processor_px_t); + +struct xen_psd_package { + uint64_t num_entries; + uint64_t revision; + uint64_t domain; + uint64_t coord_type; + uint64_t num_processors; +}; + +struct xen_processor_performance { + uint32_t flags; /* flag for Px sub info type */ + uint32_t platform_limit; /* Platform limitation on freq usage */ + struct xen_pct_register control_register; + struct xen_pct_register status_register; + uint32_t state_count; /* total available performance states */ + XEN_GUEST_HANDLE(xen_processor_px_t) states; + struct xen_psd_package domain_info; + uint32_t shared_type; /* coordination type of this processor */ +}; +typedef struct xen_processor_performance xen_processor_performance_t; +DEFINE_XEN_GUEST_HANDLE(xen_processor_performance_t); + +struct xenpf_set_processor_pminfo { + /* IN variables */ + uint32_t id; /* ACPI CPU ID */ + uint32_t type; /* {XEN_PM_CX, XEN_PM_PX} */ + union { + struct xen_processor_power power;/* Cx: _CST/_CSD */ + struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */ + } u; +}; +typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t); + +#define XENPF_get_cpu_freq ('N' << 24) +struct xenpf_get_cpu_freq { + /* IN variables */ + uint32_t vcpu; + /* OUT variables */ + uint32_t freq; /* in kHz */ +}; +typedef struct xenpf_get_cpu_freq xenpf_get_cpu_freq_t; +DEFINE_XEN_GUEST_HANDLE(xenpf_get_cpu_freq_t); + +struct xen_platform_op { + uint32_t cmd; + uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ + union { + struct xenpf_settime settime; + struct xenpf_add_memtype add_memtype; + struct xenpf_del_memtype del_memtype; + struct xenpf_read_memtype read_memtype; + struct xenpf_microcode_update microcode; + struct xenpf_platform_quirk platform_quirk; + struct xenpf_firmware_info firmware_info; + struct xenpf_enter_acpi_sleep enter_acpi_sleep; + struct xenpf_change_freq change_freq; + struct xenpf_getidletime getidletime; + struct xenpf_set_processor_pminfo set_pminfo; + struct xenpf_get_cpu_freq get_cpu_freq; + uint8_t pad[128]; + } u; +}; +typedef struct xen_platform_op xen_platform_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t); + +#endif /* __XEN_PUBLIC_PLATFORM_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/version.h +++ linux-ec2-2.6.32/include/xen/interface/version.h @@ -3,6 +3,24 @@ * * Xen version, type, and compile information. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2005, Nguyen Anh Quynh * Copyright (c) 2005, Keir Fraser */ @@ -10,17 +28,18 @@ #ifndef __XEN_PUBLIC_VERSION_H__ #define __XEN_PUBLIC_VERSION_H__ -/* NB. All ops return zero on success, except XENVER_version. */ +/* NB. All ops return zero on success, except XENVER_{version,pagesize} */ /* arg == NULL; returns major:minor (16:16). */ #define XENVER_version 0 /* arg == xen_extraversion_t. */ #define XENVER_extraversion 1 +typedef char xen_extraversion_t[16]; struct xen_extraversion { - char extraversion[16]; + xen_extraversion_t extraversion; }; -#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion)) +#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t)) /* arg == xen_compile_info_t. */ #define XENVER_compile_info 2 @@ -30,29 +49,34 @@ char compile_domain[32]; char compile_date[32]; }; +typedef struct xen_compile_info xen_compile_info_t; #define XENVER_capabilities 3 +typedef char xen_capabilities_info_t[1024]; struct xen_capabilities_info { - char info[1024]; + xen_capabilities_info_t info; }; -#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info)) +#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t)) #define XENVER_changeset 4 +typedef char xen_changeset_info_t[64]; struct xen_changeset_info { - char info[64]; + xen_changeset_info_t info; }; -#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info)) +#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t)) #define XENVER_platform_parameters 5 struct xen_platform_parameters { unsigned long virt_start; }; +typedef struct xen_platform_parameters xen_platform_parameters_t; #define XENVER_get_features 6 struct xen_feature_info { unsigned int submap_idx; /* IN: which 32-bit submap to return */ uint32_t submap; /* OUT: 32-bit submap */ }; +typedef struct xen_feature_info xen_feature_info_t; /* Declares the features reported by XENVER_get_features. */ #include "features.h" @@ -60,4 +84,10 @@ /* arg == NULL; returns host memory page size. */ #define XENVER_pagesize 7 +/* arg == xen_domain_handle_t. */ +#define XENVER_guest_handle 8 + +#define XENVER_commandline 9 +typedef char xen_commandline_t[1024]; + #endif /* __XEN_PUBLIC_VERSION_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86_64.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86_64.h @@ -0,0 +1,27 @@ +/****************************************************************************** + * arch-x86_64.h + * + * Guest OS interface to x86 64-bit Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser + */ + +#include "arch-x86/xen.h" --- linux-ec2-2.6.32.orig/include/xen/interface/elfnote.h +++ linux-ec2-2.6.32/include/xen/interface/elfnote.h @@ -3,6 +3,24 @@ * * Definitions used for the Xen ELF notes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2006, Ian Campbell, XenSource Ltd. */ @@ -10,7 +28,7 @@ #define __XEN_PUBLIC_ELFNOTE_H__ /* - * The notes should live in a SHT_NOTE segment and have "Xen" in the + * The notes should live in a PT_NOTE segment and have "Xen" in the * name field. * * Numeric types are either 4 or 8 bytes depending on the content of @@ -22,8 +40,6 @@ /* * NAME=VALUE pair (string). - * - * LEGACY: FEATURES and PAE */ #define XEN_ELFNOTE_INFO 0 @@ -90,7 +106,12 @@ #define XEN_ELFNOTE_LOADER 8 /* - * The kernel supports PAE (x86/32 only, string = "yes" or "no"). + * The kernel supports PAE (x86/32 only, string = "yes", "no" or + * "bimodal"). + * + * For compatibility with Xen 3.0.3 and earlier the "bimodal" setting + * may be given as "yes,bimodal" which will cause older Xen to treat + * this kernel as PAE. * * LEGACY: PAE (n.b. The legacy interface included a provision to * indicate 'extended-cr3' support allowing L3 page tables to be @@ -140,6 +161,76 @@ */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 +/* + * The (non-default) location the initial phys-to-machine map should be + * placed at by the hypervisor (Dom0) or the tools (DomU). + * The kernel must be prepared for this mapping to be established using + * large pages, despite such otherwise not being available to guests. + * The kernel must also be able to handle the page table pages used for + * this mapping not being accessible through the initial mapping. + * (Only x86-64 supports this at present.) + */ +#define XEN_ELFNOTE_INIT_P2M 15 + +/* + * The number of the highest elfnote defined. + */ +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M + +/* + * System information exported through crash notes. + * + * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO + * note in case of a system crash. This note will contain various + * information about the system, see xen/include/xen/elfcore.h. + */ +#define XEN_ELFNOTE_CRASH_INFO 0x1000001 + +/* + * System registers exported through crash notes. + * + * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS + * note per cpu in case of a system crash. This note is architecture + * specific and will contain registers not saved in the "CORE" note. + * See xen/include/xen/elfcore.h for more information. + */ +#define XEN_ELFNOTE_CRASH_REGS 0x1000002 + + +/* + * xen dump-core none note. + * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_NONE + * in its dump file to indicate that the file is xen dump-core + * file. This note doesn't have any other information. + * See tools/libxc/xc_core.h for more information. + */ +#define XEN_ELFNOTE_DUMPCORE_NONE 0x2000000 + +/* + * xen dump-core header note. + * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_HEADER + * in its dump file. + * See tools/libxc/xc_core.h for more information. + */ +#define XEN_ELFNOTE_DUMPCORE_HEADER 0x2000001 + +/* + * xen dump-core xen version note. + * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_XEN_VERSION + * in its dump file. It contains the xen version obtained via the + * XENVER hypercall. + * See tools/libxc/xc_core.h for more information. + */ +#define XEN_ELFNOTE_DUMPCORE_XEN_VERSION 0x2000002 + +/* + * xen dump-core format version note. + * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION + * in its dump file. It contains a format version identifier. + * See tools/libxc/xc_core.h for more information. + */ +#define XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION 0x2000003 + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* --- linux-ec2-2.6.32.orig/include/xen/interface/grant_table.h +++ linux-ec2-2.6.32/include/xen/interface/grant_table.h @@ -28,6 +28,7 @@ #ifndef __XEN_PUBLIC_GRANT_TABLE_H__ #define __XEN_PUBLIC_GRANT_TABLE_H__ +#include "xen.h" /*********************************** * GRANT TABLE REPRESENTATION @@ -100,6 +101,7 @@ */ uint32_t frame; }; +typedef struct grant_entry grant_entry_t; /* * Type of grant entry. @@ -118,6 +120,7 @@ * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST] * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN] * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN] + * GTF_PAT, GTF_PWT, GTF_PCD: (x86) cache attribute flags for the grant [GST] */ #define _GTF_readonly (2) #define GTF_readonly (1U<<_GTF_readonly) @@ -125,6 +128,12 @@ #define GTF_reading (1U<<_GTF_reading) #define _GTF_writing (4) #define GTF_writing (1U<<_GTF_writing) +#define _GTF_PWT (5) +#define GTF_PWT (1U<<_GTF_PWT) +#define _GTF_PCD (6) +#define GTF_PCD (1U<<_GTF_PCD) +#define _GTF_PAT (7) +#define GTF_PAT (1U<<_GTF_PAT) /* * Subflags for GTF_accept_transfer: @@ -186,6 +195,8 @@ uint64_t dev_bus_addr; }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref); +typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t); /* * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings @@ -208,6 +219,8 @@ int16_t status; /* GNTST_* */ }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref); +typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t); /* * GNTTABOP_setup_table: Set up a grant table for comprising at least @@ -225,9 +238,11 @@ uint32_t nr_frames; /* OUT parameters. */ int16_t status; /* GNTST_* */ - GUEST_HANDLE(ulong) frame_list; + XEN_GUEST_HANDLE(ulong) frame_list; }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table); +typedef struct gnttab_setup_table gnttab_setup_table_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t); /* * GNTTABOP_dump_table: Dump the contents of the grant table to the @@ -241,6 +256,8 @@ int16_t status; /* GNTST_* */ }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table); +typedef struct gnttab_dump_table gnttab_dump_table_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t); /* * GNTTABOP_transfer_grant_ref: Transfer to a foreign domain. The @@ -253,13 +270,16 @@ #define GNTTABOP_transfer 4 struct gnttab_transfer { /* IN parameters. */ - unsigned long mfn; + xen_pfn_t mfn; domid_t domid; grant_ref_t ref; /* OUT parameters. */ int16_t status; }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer); +typedef struct gnttab_transfer gnttab_transfer_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t); + /* * GNTTABOP_copy: Hypervisor based copy @@ -285,22 +305,23 @@ #define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref) #define GNTTABOP_copy 5 -struct gnttab_copy { - /* IN parameters. */ - struct { - union { - grant_ref_t ref; - unsigned long gmfn; - } u; - domid_t domid; - uint16_t offset; - } source, dest; - uint16_t len; - uint16_t flags; /* GNTCOPY_* */ - /* OUT parameters. */ - int16_t status; -}; +typedef struct gnttab_copy { + /* IN parameters. */ + struct { + union { + grant_ref_t ref; + xen_pfn_t gmfn; + } u; + domid_t domid; + uint16_t offset; + } source, dest; + uint16_t len; + uint16_t flags; /* GNTCOPY_* */ + /* OUT parameters. */ + int16_t status; +} gnttab_copy_t; DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy); +DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t); /* * GNTTABOP_query_size: Query the current and maximum sizes of the shared @@ -319,9 +340,35 @@ int16_t status; /* GNTST_* */ }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size); +typedef struct gnttab_query_size gnttab_query_size_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t); + +/* + * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings + * tracked by but atomically replace the page table entry with one + * pointing to the machine address under . will be + * redirected to the null entry. + * NOTES: + * 1. The call may fail in an undefined manner if either mapping is not + * tracked by . + * 2. After executing a batch of unmaps, it is guaranteed that no stale + * mappings will remain in the device or host TLBs. + */ +#define GNTTABOP_unmap_and_replace 7 +struct gnttab_unmap_and_replace { + /* IN parameters. */ + uint64_t host_addr; + uint64_t new_addr; + grant_handle_t handle; + /* OUT parameters. */ + int16_t status; /* GNTST_* */ +}; +typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t; +DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t); + /* - * Bitfield values for update_pin_status.flags. + * Bitfield values for gnttab_map_grant_ref.flags. */ /* Map the grant entry for access by I/O devices. */ #define _GNTMAP_device_map (0) @@ -349,6 +396,13 @@ #define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte) /* + * Bits to be placed in guest kernel available PTE bits (architecture + * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set). + */ +#define _GNTMAP_guest_avail0 (16) +#define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0) + +/* * Values for error status returns. All errors are -ve. */ #define GNTST_okay (0) /* Normal return. */ @@ -361,7 +415,8 @@ #define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */ #define GNTST_permission_denied (-8) /* Not enough privilege for operation. */ #define GNTST_bad_page (-9) /* Specified page was invalid for op. */ -#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */ +#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary. */ +#define GNTST_address_too_big (-11) /* transfer page address too large. */ #define GNTTABOP_error_msgs { \ "okay", \ @@ -374,7 +429,8 @@ "no spare translation slot in the I/O MMU", \ "permission denied", \ "bad page", \ - "copy arguments cross page boundary" \ + "copy arguments cross page boundary", \ + "page address size too large" \ } #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/tmem.h +++ linux-ec2-2.6.32/include/xen/interface/tmem.h @@ -0,0 +1,111 @@ +/****************************************************************************** + * tmem.h + * + * Guest OS interface to Xen Transcendent Memory. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_TMEM_H__ +#define __XEN_PUBLIC_TMEM_H__ + +#include "xen.h" + +/* Commands to HYPERVISOR_tmem_op() */ +#define TMEM_CONTROL 0 +#define TMEM_NEW_POOL 1 +#define TMEM_DESTROY_POOL 2 +#define TMEM_NEW_PAGE 3 +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_READ 8 +#define TMEM_WRITE 9 +#define TMEM_XCHG 10 + +/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */ +#define TMEMC_THAW 0 +#define TMEMC_FREEZE 1 +#define TMEMC_FLUSH 2 +#define TMEMC_DESTROY 3 +#define TMEMC_LIST 4 +#define TMEMC_SET_WEIGHT 5 +#define TMEMC_SET_CAP 6 +#define TMEMC_SET_COMPRESS 7 + +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_POOL_PAGESIZE_MASK 0xf +#define TMEM_POOL_VERSION_SHIFT 24 +#define TMEM_POOL_VERSION_MASK 0xff + +/* Special errno values */ +#define EFROZEN 1000 +#define EEMPTY 1001 + + +#ifndef __ASSEMBLY__ +typedef xen_pfn_t tmem_cli_mfn_t; +typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t; +struct tmem_op { + uint32_t cmd; + int32_t pool_id; /* private > 0; shared < 0; 0 is invalid */ + union { + struct { /* for cmd == TMEM_NEW_POOL */ + uint64_t uuid[2]; + uint32_t flags; + } new; + struct { /* for cmd == TMEM_CONTROL */ + uint32_t subop; + uint32_t cli_id; + uint32_t arg1; + uint32_t arg2; + tmem_cli_va_t buf; + } ctrl; + struct { + uint64_t object; + uint32_t index; + uint32_t tmem_offset; + uint32_t pfn_offset; + uint32_t len; + tmem_cli_mfn_t cmfn; /* client machine page frame */ + } gen; + } u; +}; +typedef struct tmem_op tmem_op_t; +DEFINE_XEN_GUEST_HANDLE(tmem_op_t); + +#endif + +#endif /* __XEN_PUBLIC_TMEM_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/event_channel.h +++ linux-ec2-2.6.32/include/xen/interface/event_channel.h @@ -3,6 +3,24 @@ * * Event channels between domains. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2003-2004, K A Fraser. */ @@ -11,8 +29,15 @@ #include +/* + * Prototype for this hypercall is: + * int event_channel_op(int cmd, void *args) + * @cmd == EVTCHNOP_??? (event-channel operation). + * @args == Operation-specific extra arguments (NULL if none). + */ + typedef uint32_t evtchn_port_t; -DEFINE_GUEST_HANDLE(evtchn_port_t); +DEFINE_XEN_GUEST_HANDLE(evtchn_port_t); /* * EVTCHNOP_alloc_unbound: Allocate a port in domain and mark as @@ -22,13 +47,14 @@ * 1. If the caller is unprivileged then must be DOMID_SELF. * 2. may be DOMID_SELF, allowing loopback connections. */ -#define EVTCHNOP_alloc_unbound 6 +#define EVTCHNOP_alloc_unbound 6 struct evtchn_alloc_unbound { - /* IN parameters */ - domid_t dom, remote_dom; - /* OUT parameters */ - evtchn_port_t port; + /* IN parameters */ + domid_t dom, remote_dom; + /* OUT parameters */ + evtchn_port_t port; }; +typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t; /* * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between @@ -41,29 +67,35 @@ */ #define EVTCHNOP_bind_interdomain 0 struct evtchn_bind_interdomain { - /* IN parameters. */ - domid_t remote_dom; - evtchn_port_t remote_port; - /* OUT parameters. */ - evtchn_port_t local_port; + /* IN parameters. */ + domid_t remote_dom; + evtchn_port_t remote_port; + /* OUT parameters. */ + evtchn_port_t local_port; }; +typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t; /* * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ on specified * vcpu. * NOTES: - * 1. A virtual IRQ may be bound to at most one event channel per vcpu. - * 2. The allocated event channel is bound to the specified vcpu. The binding - * may not be changed. + * 1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list + * in xen.h for the classification of each VIRQ. + * 2. Global VIRQs must be allocated on VCPU0 but can subsequently be + * re-bound via EVTCHNOP_bind_vcpu. + * 3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu. + * The allocated event channel is bound to the specified vcpu and the + * binding cannot be changed. */ -#define EVTCHNOP_bind_virq 1 +#define EVTCHNOP_bind_virq 1 struct evtchn_bind_virq { - /* IN parameters. */ - uint32_t virq; - uint32_t vcpu; - /* OUT parameters. */ - evtchn_port_t port; + /* IN parameters. */ + uint32_t virq; + uint32_t vcpu; + /* OUT parameters. */ + evtchn_port_t port; }; +typedef struct evtchn_bind_virq evtchn_bind_virq_t; /* * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ . @@ -71,15 +103,16 @@ * 1. A physical IRQ may be bound to at most one event channel per domain. * 2. Only a sufficiently-privileged domain may bind to a physical IRQ. */ -#define EVTCHNOP_bind_pirq 2 +#define EVTCHNOP_bind_pirq 2 struct evtchn_bind_pirq { - /* IN parameters. */ - uint32_t pirq; + /* IN parameters. */ + uint32_t pirq; #define BIND_PIRQ__WILL_SHARE 1 - uint32_t flags; /* BIND_PIRQ__* */ - /* OUT parameters. */ - evtchn_port_t port; + uint32_t flags; /* BIND_PIRQ__* */ + /* OUT parameters. */ + evtchn_port_t port; }; +typedef struct evtchn_bind_pirq evtchn_bind_pirq_t; /* * EVTCHNOP_bind_ipi: Bind a local event channel to receive events. @@ -87,33 +120,36 @@ * 1. The allocated event channel is bound to the specified vcpu. The binding * may not be changed. */ -#define EVTCHNOP_bind_ipi 7 +#define EVTCHNOP_bind_ipi 7 struct evtchn_bind_ipi { - uint32_t vcpu; - /* OUT parameters. */ - evtchn_port_t port; + uint32_t vcpu; + /* OUT parameters. */ + evtchn_port_t port; }; +typedef struct evtchn_bind_ipi evtchn_bind_ipi_t; /* * EVTCHNOP_close: Close a local event channel . If the channel is * interdomain then the remote end is placed in the unbound state * (EVTCHNSTAT_unbound), awaiting a new connection. */ -#define EVTCHNOP_close 3 +#define EVTCHNOP_close 3 struct evtchn_close { - /* IN parameters. */ - evtchn_port_t port; + /* IN parameters. */ + evtchn_port_t port; }; +typedef struct evtchn_close evtchn_close_t; /* * EVTCHNOP_send: Send an event to the remote end of the channel whose local * endpoint is . */ -#define EVTCHNOP_send 4 +#define EVTCHNOP_send 4 struct evtchn_send { - /* IN parameters. */ - evtchn_port_t port; + /* IN parameters. */ + evtchn_port_t port; }; +typedef struct evtchn_send evtchn_send_t; /* * EVTCHNOP_status: Get the current status of the communication channel which @@ -123,75 +159,99 @@ * 2. Only a sufficiently-privileged domain may obtain the status of an event * channel for which is not DOMID_SELF. */ -#define EVTCHNOP_status 5 +#define EVTCHNOP_status 5 struct evtchn_status { - /* IN parameters */ - domid_t dom; - evtchn_port_t port; - /* OUT parameters */ -#define EVTCHNSTAT_closed 0 /* Channel is not in use. */ -#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/ -#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */ -#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */ -#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */ -#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */ - uint32_t status; - uint32_t vcpu; /* VCPU to which this channel is bound. */ - union { - struct { - domid_t dom; - } unbound; /* EVTCHNSTAT_unbound */ - struct { - domid_t dom; - evtchn_port_t port; - } interdomain; /* EVTCHNSTAT_interdomain */ - uint32_t pirq; /* EVTCHNSTAT_pirq */ - uint32_t virq; /* EVTCHNSTAT_virq */ - } u; + /* IN parameters */ + domid_t dom; + evtchn_port_t port; + /* OUT parameters */ +#define EVTCHNSTAT_closed 0 /* Channel is not in use. */ +#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/ +#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */ +#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */ +#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */ +#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */ + uint32_t status; + uint32_t vcpu; /* VCPU to which this channel is bound. */ + union { + struct { + domid_t dom; + } unbound; /* EVTCHNSTAT_unbound */ + struct { + domid_t dom; + evtchn_port_t port; + } interdomain; /* EVTCHNSTAT_interdomain */ + uint32_t pirq; /* EVTCHNSTAT_pirq */ + uint32_t virq; /* EVTCHNSTAT_virq */ + } u; }; +typedef struct evtchn_status evtchn_status_t; /* * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an * event is pending. * NOTES: - * 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised - * the binding. This binding cannot be changed. - * 2. All other channels notify vcpu0 by default. This default is set when + * 1. IPI-bound channels always notify the vcpu specified at bind time. + * This binding cannot be changed. + * 2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time. + * This binding cannot be changed. + * 3. All other channels notify vcpu0 by default. This default is set when * the channel is allocated (a port that is freed and subsequently reused * has its binding reset to vcpu0). */ -#define EVTCHNOP_bind_vcpu 8 +#define EVTCHNOP_bind_vcpu 8 struct evtchn_bind_vcpu { - /* IN parameters. */ - evtchn_port_t port; - uint32_t vcpu; + /* IN parameters. */ + evtchn_port_t port; + uint32_t vcpu; }; +typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t; /* * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver * a notification to the appropriate VCPU if an event is pending. */ -#define EVTCHNOP_unmask 9 +#define EVTCHNOP_unmask 9 struct evtchn_unmask { - /* IN parameters. */ - evtchn_port_t port; + /* IN parameters. */ + evtchn_port_t port; }; +typedef struct evtchn_unmask evtchn_unmask_t; +/* + * EVTCHNOP_reset: Close all event channels associated with specified domain. + * NOTES: + * 1. may be specified as DOMID_SELF. + * 2. Only a sufficiently-privileged domain may specify other than DOMID_SELF. + */ +#define EVTCHNOP_reset 10 +struct evtchn_reset { + /* IN parameters. */ + domid_t dom; +}; +typedef struct evtchn_reset evtchn_reset_t; + +/* + * Argument to event_channel_op_compat() hypercall. Superceded by new + * event_channel_op() hypercall since 0x00030202. + */ struct evtchn_op { - uint32_t cmd; /* EVTCHNOP_* */ - union { - struct evtchn_alloc_unbound alloc_unbound; - struct evtchn_bind_interdomain bind_interdomain; - struct evtchn_bind_virq bind_virq; - struct evtchn_bind_pirq bind_pirq; - struct evtchn_bind_ipi bind_ipi; - struct evtchn_close close; - struct evtchn_send send; - struct evtchn_status status; - struct evtchn_bind_vcpu bind_vcpu; - struct evtchn_unmask unmask; - } u; + uint32_t cmd; /* EVTCHNOP_* */ + union { + struct evtchn_alloc_unbound alloc_unbound; + struct evtchn_bind_interdomain bind_interdomain; + struct evtchn_bind_virq bind_virq; + struct evtchn_bind_pirq bind_pirq; + struct evtchn_bind_ipi bind_ipi; + struct evtchn_close close; + struct evtchn_send send; + struct evtchn_status status; + struct evtchn_bind_vcpu bind_vcpu; + struct evtchn_unmask unmask; + } u; }; DEFINE_GUEST_HANDLE_STRUCT(evtchn_op); +typedef struct evtchn_op evtchn_op_t; +DEFINE_XEN_GUEST_HANDLE(evtchn_op_t); #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/domctl.h +++ linux-ec2-2.6.32/include/xen/interface/domctl.h @@ -0,0 +1,710 @@ +/****************************************************************************** + * domctl.h + * + * Domain management operations. For use by node control stack. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2002-2003, B Dragovic + * Copyright (c) 2002-2006, K Fraser + */ + +#ifndef __XEN_PUBLIC_DOMCTL_H__ +#define __XEN_PUBLIC_DOMCTL_H__ + +#if !defined(__XEN__) && !defined(__XEN_TOOLS__) +#error "domctl operations are intended for use by node control tools only" +#endif + +#include "xen.h" + +#define XEN_DOMCTL_INTERFACE_VERSION 0x00000005 + +struct xenctl_cpumap { + XEN_GUEST_HANDLE_64(uint8) bitmap; + uint32_t nr_cpus; +}; + +/* + * NB. xen_domctl.domain is an IN/OUT parameter for this operation. + * If it is specified as zero, an id is auto-allocated and returned. + */ +#define XEN_DOMCTL_createdomain 1 +struct xen_domctl_createdomain { + /* IN parameters */ + uint32_t ssidref; + xen_domain_handle_t handle; + /* Is this an HVM guest (as opposed to a PV guest)? */ +#define _XEN_DOMCTL_CDF_hvm_guest 0 +#define XEN_DOMCTL_CDF_hvm_guest (1U<<_XEN_DOMCTL_CDF_hvm_guest) + /* Use hardware-assisted paging if available? */ +#define _XEN_DOMCTL_CDF_hap 1 +#define XEN_DOMCTL_CDF_hap (1U<<_XEN_DOMCTL_CDF_hap) + /* Should domain memory integrity be verifed by tboot during Sx? */ +#define _XEN_DOMCTL_CDF_s3_integrity 2 +#define XEN_DOMCTL_CDF_s3_integrity (1U<<_XEN_DOMCTL_CDF_s3_integrity) + uint32_t flags; +}; +typedef struct xen_domctl_createdomain xen_domctl_createdomain_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t); + +#define XEN_DOMCTL_destroydomain 2 +#define XEN_DOMCTL_pausedomain 3 +#define XEN_DOMCTL_unpausedomain 4 +#define XEN_DOMCTL_resumedomain 27 + +#define XEN_DOMCTL_getdomaininfo 5 +struct xen_domctl_getdomaininfo { + /* OUT variables. */ + domid_t domain; /* Also echoed in domctl.domain */ + /* Domain is scheduled to die. */ +#define _XEN_DOMINF_dying 0 +#define XEN_DOMINF_dying (1U<<_XEN_DOMINF_dying) + /* Domain is an HVM guest (as opposed to a PV guest). */ +#define _XEN_DOMINF_hvm_guest 1 +#define XEN_DOMINF_hvm_guest (1U<<_XEN_DOMINF_hvm_guest) + /* The guest OS has shut down. */ +#define _XEN_DOMINF_shutdown 2 +#define XEN_DOMINF_shutdown (1U<<_XEN_DOMINF_shutdown) + /* Currently paused by control software. */ +#define _XEN_DOMINF_paused 3 +#define XEN_DOMINF_paused (1U<<_XEN_DOMINF_paused) + /* Currently blocked pending an event. */ +#define _XEN_DOMINF_blocked 4 +#define XEN_DOMINF_blocked (1U<<_XEN_DOMINF_blocked) + /* Domain is currently running. */ +#define _XEN_DOMINF_running 5 +#define XEN_DOMINF_running (1U<<_XEN_DOMINF_running) + /* Being debugged. */ +#define _XEN_DOMINF_debugged 6 +#define XEN_DOMINF_debugged (1U<<_XEN_DOMINF_debugged) + /* CPU to which this domain is bound. */ +#define XEN_DOMINF_cpumask 255 +#define XEN_DOMINF_cpushift 8 + /* XEN_DOMINF_shutdown guest-supplied code. */ +#define XEN_DOMINF_shutdownmask 255 +#define XEN_DOMINF_shutdownshift 16 + uint32_t flags; /* XEN_DOMINF_* */ + uint64_aligned_t tot_pages; + uint64_aligned_t max_pages; + uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */ + uint64_aligned_t cpu_time; + uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */ + uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. */ + uint32_t ssidref; + xen_domain_handle_t handle; +}; +typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t); + + +#define XEN_DOMCTL_getmemlist 6 +struct xen_domctl_getmemlist { + /* IN variables. */ + /* Max entries to write to output buffer. */ + uint64_aligned_t max_pfns; + /* Start index in guest's page list. */ + uint64_aligned_t start_pfn; + XEN_GUEST_HANDLE_64(uint64) buffer; + /* OUT variables. */ + uint64_aligned_t num_pfns; +}; +typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t); + + +#define XEN_DOMCTL_getpageframeinfo 7 + +#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28 +#define XEN_DOMCTL_PFINFO_NOTAB (0x0U<<28) +#define XEN_DOMCTL_PFINFO_L1TAB (0x1U<<28) +#define XEN_DOMCTL_PFINFO_L2TAB (0x2U<<28) +#define XEN_DOMCTL_PFINFO_L3TAB (0x3U<<28) +#define XEN_DOMCTL_PFINFO_L4TAB (0x4U<<28) +#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7U<<28) +#define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31) +#define XEN_DOMCTL_PFINFO_XTAB (0xfU<<28) /* invalid page */ +#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28) + +struct xen_domctl_getpageframeinfo { + /* IN variables. */ + uint64_aligned_t gmfn; /* GMFN to query */ + /* OUT variables. */ + /* Is the page PINNED to a type? */ + uint32_t type; /* see above type defs */ +}; +typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t); + + +#define XEN_DOMCTL_getpageframeinfo2 8 +struct xen_domctl_getpageframeinfo2 { + /* IN variables. */ + uint64_aligned_t num; + /* IN/OUT variables. */ + XEN_GUEST_HANDLE_64(uint32) array; +}; +typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t); + + +/* + * Control shadow pagetables operation + */ +#define XEN_DOMCTL_shadow_op 10 + +/* Disable shadow mode. */ +#define XEN_DOMCTL_SHADOW_OP_OFF 0 + +/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */ +#define XEN_DOMCTL_SHADOW_OP_ENABLE 32 + +/* Log-dirty bitmap operations. */ + /* Return the bitmap and clean internal copy for next round. */ +#define XEN_DOMCTL_SHADOW_OP_CLEAN 11 + /* Return the bitmap but do not modify internal copy. */ +#define XEN_DOMCTL_SHADOW_OP_PEEK 12 + +/* Memory allocation accessors. */ +#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION 30 +#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION 31 + +/* Legacy enable operations. */ + /* Equiv. to ENABLE with no mode flags. */ +#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST 1 + /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */ +#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY 2 + /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */ +#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE 3 + +/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */ + /* + * Shadow pagetables are refcounted: guest does not use explicit mmu + * operations nor write-protect its pagetables. + */ +#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT (1 << 1) + /* + * Log pages in a bitmap as they are dirtied. + * Used for live relocation to determine which pages must be re-sent. + */ +#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2) + /* + * Automatically translate GPFNs into MFNs. + */ +#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3) + /* + * Xen does not steal virtual address space from the guest. + * Requires HVM support. + */ +#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL (1 << 4) + +struct xen_domctl_shadow_op_stats { + uint32_t fault_count; + uint32_t dirty_count; +}; +typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t); + +struct xen_domctl_shadow_op { + /* IN variables. */ + uint32_t op; /* XEN_DOMCTL_SHADOW_OP_* */ + + /* OP_ENABLE */ + uint32_t mode; /* XEN_DOMCTL_SHADOW_ENABLE_* */ + + /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */ + uint32_t mb; /* Shadow memory allocation in MB */ + + /* OP_PEEK / OP_CLEAN */ + XEN_GUEST_HANDLE_64(uint8) dirty_bitmap; + uint64_aligned_t pages; /* Size of buffer. Updated with actual size. */ + struct xen_domctl_shadow_op_stats stats; +}; +typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t); + + +#define XEN_DOMCTL_max_mem 11 +struct xen_domctl_max_mem { + /* IN variables. */ + uint64_aligned_t max_memkb; +}; +typedef struct xen_domctl_max_mem xen_domctl_max_mem_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t); + + +#define XEN_DOMCTL_setvcpucontext 12 +#define XEN_DOMCTL_getvcpucontext 13 +struct xen_domctl_vcpucontext { + uint32_t vcpu; /* IN */ + XEN_GUEST_HANDLE_64(vcpu_guest_context_t) ctxt; /* IN/OUT */ +}; +typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t); + + +#define XEN_DOMCTL_getvcpuinfo 14 +struct xen_domctl_getvcpuinfo { + /* IN variables. */ + uint32_t vcpu; + /* OUT variables. */ + uint8_t online; /* currently online (not hotplugged)? */ + uint8_t blocked; /* blocked waiting for an event? */ + uint8_t running; /* currently scheduled on its CPU? */ + uint64_aligned_t cpu_time; /* total cpu time consumed (ns) */ + uint32_t cpu; /* current mapping */ +}; +typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t); + + +/* Get/set which physical cpus a vcpu can execute on. */ +#define XEN_DOMCTL_setvcpuaffinity 9 +#define XEN_DOMCTL_getvcpuaffinity 25 +struct xen_domctl_vcpuaffinity { + uint32_t vcpu; /* IN */ + struct xenctl_cpumap cpumap; /* IN/OUT */ +}; +typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t); + + +#define XEN_DOMCTL_max_vcpus 15 +struct xen_domctl_max_vcpus { + uint32_t max; /* maximum number of vcpus */ +}; +typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t); + + +#define XEN_DOMCTL_scheduler_op 16 +/* Scheduler types. */ +#define XEN_SCHEDULER_SEDF 4 +#define XEN_SCHEDULER_CREDIT 5 +/* Set or get info? */ +#define XEN_DOMCTL_SCHEDOP_putinfo 0 +#define XEN_DOMCTL_SCHEDOP_getinfo 1 +struct xen_domctl_scheduler_op { + uint32_t sched_id; /* XEN_SCHEDULER_* */ + uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */ + union { + struct xen_domctl_sched_sedf { + uint64_aligned_t period; + uint64_aligned_t slice; + uint64_aligned_t latency; + uint32_t extratime; + uint32_t weight; + } sedf; + struct xen_domctl_sched_credit { + uint16_t weight; + uint16_t cap; + } credit; + } u; +}; +typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t); + + +#define XEN_DOMCTL_setdomainhandle 17 +struct xen_domctl_setdomainhandle { + xen_domain_handle_t handle; +}; +typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t); + + +#define XEN_DOMCTL_setdebugging 18 +struct xen_domctl_setdebugging { + uint8_t enable; +}; +typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t); + + +#define XEN_DOMCTL_irq_permission 19 +struct xen_domctl_irq_permission { + uint8_t pirq; + uint8_t allow_access; /* flag to specify enable/disable of IRQ access */ +}; +typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t); + + +#define XEN_DOMCTL_iomem_permission 20 +struct xen_domctl_iomem_permission { + uint64_aligned_t first_mfn;/* first page (physical page number) in range */ + uint64_aligned_t nr_mfns; /* number of pages in range (>0) */ + uint8_t allow_access; /* allow (!0) or deny (0) access to range? */ +}; +typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t); + + +#define XEN_DOMCTL_ioport_permission 21 +struct xen_domctl_ioport_permission { + uint32_t first_port; /* first port int range */ + uint32_t nr_ports; /* size of port range */ + uint8_t allow_access; /* allow or deny access to range? */ +}; +typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t); + + +#define XEN_DOMCTL_hypercall_init 22 +struct xen_domctl_hypercall_init { + uint64_aligned_t gmfn; /* GMFN to be initialised */ +}; +typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t); + + +#define XEN_DOMCTL_arch_setup 23 +#define _XEN_DOMAINSETUP_hvm_guest 0 +#define XEN_DOMAINSETUP_hvm_guest (1UL<<_XEN_DOMAINSETUP_hvm_guest) +#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save) */ +#define XEN_DOMAINSETUP_query (1UL<<_XEN_DOMAINSETUP_query) +#define _XEN_DOMAINSETUP_sioemu_guest 2 +#define XEN_DOMAINSETUP_sioemu_guest (1UL<<_XEN_DOMAINSETUP_sioemu_guest) +typedef struct xen_domctl_arch_setup { + uint64_aligned_t flags; /* XEN_DOMAINSETUP_* */ +#ifdef __ia64__ + uint64_aligned_t bp; /* mpaddr of boot param area */ + uint64_aligned_t maxmem; /* Highest memory address for MDT. */ + uint64_aligned_t xsi_va; /* Xen shared_info area virtual address. */ + uint32_t hypercall_imm; /* Break imm for Xen hypercalls. */ + int8_t vhpt_size_log2; /* Log2 of VHPT size. */ +#endif +} xen_domctl_arch_setup_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t); + + +#define XEN_DOMCTL_settimeoffset 24 +struct xen_domctl_settimeoffset { + int32_t time_offset_seconds; /* applied to domain wallclock time */ +}; +typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t); + + +#define XEN_DOMCTL_gethvmcontext 33 +#define XEN_DOMCTL_sethvmcontext 34 +typedef struct xen_domctl_hvmcontext { + uint32_t size; /* IN/OUT: size of buffer / bytes filled */ + XEN_GUEST_HANDLE_64(uint8) buffer; /* IN/OUT: data, or call + * gethvmcontext with NULL + * buffer to get size req'd */ +} xen_domctl_hvmcontext_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t); + + +#define XEN_DOMCTL_set_address_size 35 +#define XEN_DOMCTL_get_address_size 36 +typedef struct xen_domctl_address_size { + uint32_t size; +} xen_domctl_address_size_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t); + + +#define XEN_DOMCTL_real_mode_area 26 +struct xen_domctl_real_mode_area { + uint32_t log; /* log2 of Real Mode Area size */ +}; +typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t); + + +#define XEN_DOMCTL_sendtrigger 28 +#define XEN_DOMCTL_SENDTRIGGER_NMI 0 +#define XEN_DOMCTL_SENDTRIGGER_RESET 1 +#define XEN_DOMCTL_SENDTRIGGER_INIT 2 +#define XEN_DOMCTL_SENDTRIGGER_POWER 3 +struct xen_domctl_sendtrigger { + uint32_t trigger; /* IN */ + uint32_t vcpu; /* IN */ +}; +typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t); + + +/* Assign PCI device to HVM guest. Sets up IOMMU structures. */ +#define XEN_DOMCTL_assign_device 37 +#define XEN_DOMCTL_test_assign_device 45 +#define XEN_DOMCTL_deassign_device 47 +struct xen_domctl_assign_device { + uint32_t machine_bdf; /* machine PCI ID of assigned device */ +}; +typedef struct xen_domctl_assign_device xen_domctl_assign_device_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t); + +/* Retrieve sibling devices infomation of machine_bdf */ +#define XEN_DOMCTL_get_device_group 50 +struct xen_domctl_get_device_group { + uint32_t machine_bdf; /* IN */ + uint32_t max_sdevs; /* IN */ + uint32_t num_sdevs; /* OUT */ + XEN_GUEST_HANDLE_64(uint32) sdev_array; /* OUT */ +}; +typedef struct xen_domctl_get_device_group xen_domctl_get_device_group_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_get_device_group_t); + +/* Pass-through interrupts: bind real irq -> hvm devfn. */ +#define XEN_DOMCTL_bind_pt_irq 38 +#define XEN_DOMCTL_unbind_pt_irq 48 +typedef enum pt_irq_type_e { + PT_IRQ_TYPE_PCI, + PT_IRQ_TYPE_ISA, + PT_IRQ_TYPE_MSI, + PT_IRQ_TYPE_MSI_TRANSLATE, +} pt_irq_type_t; +struct xen_domctl_bind_pt_irq { + uint32_t machine_irq; + pt_irq_type_t irq_type; + uint32_t hvm_domid; + + union { + struct { + uint8_t isa_irq; + } isa; + struct { + uint8_t bus; + uint8_t device; + uint8_t intx; + } pci; + struct { + uint8_t gvec; + uint32_t gflags; + uint64_aligned_t gtable; + } msi; + } u; +}; +typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t); + + +/* Bind machine I/O address range -> HVM address range. */ +#define XEN_DOMCTL_memory_mapping 39 +#define DPCI_ADD_MAPPING 1 +#define DPCI_REMOVE_MAPPING 0 +struct xen_domctl_memory_mapping { + uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */ + uint64_aligned_t first_mfn; /* first page (machine page) in range */ + uint64_aligned_t nr_mfns; /* number of pages in range (>0) */ + uint32_t add_mapping; /* add or remove mapping */ + uint32_t padding; /* padding for 64-bit aligned structure */ +}; +typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t); + + +/* Bind machine I/O port range -> HVM I/O port range. */ +#define XEN_DOMCTL_ioport_mapping 40 +struct xen_domctl_ioport_mapping { + uint32_t first_gport; /* first guest IO port*/ + uint32_t first_mport; /* first machine IO port */ + uint32_t nr_ports; /* size of port range */ + uint32_t add_mapping; /* add or remove mapping */ +}; +typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t); + + +/* + * Pin caching type of RAM space for x86 HVM domU. + */ +#define XEN_DOMCTL_pin_mem_cacheattr 41 +/* Caching types: these happen to be the same as x86 MTRR/PAT type codes. */ +#define XEN_DOMCTL_MEM_CACHEATTR_UC 0 +#define XEN_DOMCTL_MEM_CACHEATTR_WC 1 +#define XEN_DOMCTL_MEM_CACHEATTR_WT 4 +#define XEN_DOMCTL_MEM_CACHEATTR_WP 5 +#define XEN_DOMCTL_MEM_CACHEATTR_WB 6 +#define XEN_DOMCTL_MEM_CACHEATTR_UCM 7 +struct xen_domctl_pin_mem_cacheattr { + uint64_aligned_t start, end; + unsigned int type; /* XEN_DOMCTL_MEM_CACHEATTR_* */ +}; +typedef struct xen_domctl_pin_mem_cacheattr xen_domctl_pin_mem_cacheattr_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t); + + +#define XEN_DOMCTL_set_ext_vcpucontext 42 +#define XEN_DOMCTL_get_ext_vcpucontext 43 +struct xen_domctl_ext_vcpucontext { + /* IN: VCPU that this call applies to. */ + uint32_t vcpu; + /* + * SET: Size of struct (IN) + * GET: Size of struct (OUT) + */ + uint32_t size; +#if defined(__i386__) || defined(__x86_64__) + /* SYSCALL from 32-bit mode and SYSENTER callback information. */ + /* NB. SYSCALL from 64-bit mode is contained in vcpu_guest_context_t */ + uint64_aligned_t syscall32_callback_eip; + uint64_aligned_t sysenter_callback_eip; + uint16_t syscall32_callback_cs; + uint16_t sysenter_callback_cs; + uint8_t syscall32_disables_events; + uint8_t sysenter_disables_events; +#endif +}; +typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t); + +/* + * Set optimizaton features for a domain + */ +#define XEN_DOMCTL_set_opt_feature 44 +struct xen_domctl_set_opt_feature { +#if defined(__ia64__) + struct xen_ia64_opt_feature optf; +#else + /* Make struct non-empty: do not depend on this field name! */ + uint64_t dummy; +#endif +}; +typedef struct xen_domctl_set_opt_feature xen_domctl_set_opt_feature_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_opt_feature_t); + +/* + * Set the target domain for a domain + */ +#define XEN_DOMCTL_set_target 46 +struct xen_domctl_set_target { + domid_t target; +}; +typedef struct xen_domctl_set_target xen_domctl_set_target_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_target_t); + +#if defined(__i386__) || defined(__x86_64__) +# define XEN_CPUID_INPUT_UNUSED 0xFFFFFFFF +# define XEN_DOMCTL_set_cpuid 49 +struct xen_domctl_cpuid { + unsigned int input[2]; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +}; +typedef struct xen_domctl_cpuid xen_domctl_cpuid_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t); +#endif + +#define XEN_DOMCTL_subscribe 29 +struct xen_domctl_subscribe { + uint32_t port; /* IN */ +}; +typedef struct xen_domctl_subscribe xen_domctl_subscribe_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_subscribe_t); + +/* + * Define the maximum machine address size which should be allocated + * to a guest. + */ +#define XEN_DOMCTL_set_machine_address_size 51 +#define XEN_DOMCTL_get_machine_address_size 52 + +/* + * Do not inject spurious page faults into this domain. + */ +#define XEN_DOMCTL_suppress_spurious_page_faults 53 + +#define XEN_DOMCTL_debug_op 54 +#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF 0 +#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON 1 +struct xen_domctl_debug_op { + uint32_t op; /* IN */ + uint32_t vcpu; /* IN */ +}; +typedef struct xen_domctl_debug_op xen_domctl_debug_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_debug_op_t); + +/* + * Request a particular record from the HVM context + */ +#define XEN_DOMCTL_gethvmcontext_partial 55 +typedef struct xen_domctl_hvmcontext_partial { + uint32_t type; /* IN: Type of record required */ + uint32_t instance; /* IN: Instance of that type */ + XEN_GUEST_HANDLE_64(uint8) buffer; /* OUT: buffer to write record into */ +} xen_domctl_hvmcontext_partial_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t); + + +struct xen_domctl { + uint32_t cmd; + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ + domid_t domain; + union { + struct xen_domctl_createdomain createdomain; + struct xen_domctl_getdomaininfo getdomaininfo; + struct xen_domctl_getmemlist getmemlist; + struct xen_domctl_getpageframeinfo getpageframeinfo; + struct xen_domctl_getpageframeinfo2 getpageframeinfo2; + struct xen_domctl_vcpuaffinity vcpuaffinity; + struct xen_domctl_shadow_op shadow_op; + struct xen_domctl_max_mem max_mem; + struct xen_domctl_vcpucontext vcpucontext; + struct xen_domctl_getvcpuinfo getvcpuinfo; + struct xen_domctl_max_vcpus max_vcpus; + struct xen_domctl_scheduler_op scheduler_op; + struct xen_domctl_setdomainhandle setdomainhandle; + struct xen_domctl_setdebugging setdebugging; + struct xen_domctl_irq_permission irq_permission; + struct xen_domctl_iomem_permission iomem_permission; + struct xen_domctl_ioport_permission ioport_permission; + struct xen_domctl_hypercall_init hypercall_init; + struct xen_domctl_arch_setup arch_setup; + struct xen_domctl_settimeoffset settimeoffset; + struct xen_domctl_real_mode_area real_mode_area; + struct xen_domctl_hvmcontext hvmcontext; + struct xen_domctl_hvmcontext_partial hvmcontext_partial; + struct xen_domctl_address_size address_size; + struct xen_domctl_sendtrigger sendtrigger; + struct xen_domctl_get_device_group get_device_group; + struct xen_domctl_assign_device assign_device; + struct xen_domctl_bind_pt_irq bind_pt_irq; + struct xen_domctl_memory_mapping memory_mapping; + struct xen_domctl_ioport_mapping ioport_mapping; + struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr; + struct xen_domctl_ext_vcpucontext ext_vcpucontext; + struct xen_domctl_set_opt_feature set_opt_feature; + struct xen_domctl_set_target set_target; + struct xen_domctl_subscribe subscribe; + struct xen_domctl_debug_op debug_op; +#if defined(__i386__) || defined(__x86_64__) + struct xen_domctl_cpuid cpuid; +#endif + uint8_t pad[128]; + } u; +}; +typedef struct xen_domctl xen_domctl_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_t); + +#endif /* __XEN_PUBLIC_DOMCTL_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/xsm/acm.h +++ linux-ec2-2.6.32/include/xen/interface/xsm/acm.h @@ -0,0 +1,235 @@ +/* + * acm.h: Xen access control module interface defintions + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Reiner Sailer + * Copyright (c) 2005, International Business Machines Corporation. + */ + +#ifndef _XEN_PUBLIC_ACM_H +#define _XEN_PUBLIC_ACM_H + +#include "../xen.h" + +/* if ACM_DEBUG defined, all hooks should + * print a short trace message (comment it out + * when not in testing mode ) + */ +/* #define ACM_DEBUG */ + +#ifdef ACM_DEBUG +# define printkd(fmt, args...) printk(fmt,## args) +#else +# define printkd(fmt, args...) +#endif + +/* default ssid reference value if not supplied */ +#define ACM_DEFAULT_SSID 0x0 +#define ACM_DEFAULT_LOCAL_SSID 0x0 + +/* Internal ACM ERROR types */ +#define ACM_OK 0 +#define ACM_UNDEF -1 +#define ACM_INIT_SSID_ERROR -2 +#define ACM_INIT_SOID_ERROR -3 +#define ACM_ERROR -4 + +/* External ACCESS DECISIONS */ +#define ACM_ACCESS_PERMITTED 0 +#define ACM_ACCESS_DENIED -111 +#define ACM_NULL_POINTER_ERROR -200 + +/* + Error codes reported in when trying to test for a new policy + These error codes are reported in an array of tuples where + each error code is followed by a parameter describing the error + more closely, such as a domain id. +*/ +#define ACM_EVTCHN_SHARING_VIOLATION 0x100 +#define ACM_GNTTAB_SHARING_VIOLATION 0x101 +#define ACM_DOMAIN_LOOKUP 0x102 +#define ACM_CHWALL_CONFLICT 0x103 +#define ACM_SSIDREF_IN_USE 0x104 + + +/* primary policy in lower 4 bits */ +#define ACM_NULL_POLICY 0 +#define ACM_CHINESE_WALL_POLICY 1 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2 +#define ACM_POLICY_UNDEFINED 15 + +/* combinations have secondary policy component in higher 4bit */ +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \ + ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY) + +/* policy: */ +#define ACM_POLICY_NAME(X) \ + ((X) == (ACM_NULL_POLICY)) ? "NULL" : \ + ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" : \ + ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \ + ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \ + "UNDEFINED" + +/* the following policy versions must be increased + * whenever the interpretation of the related + * policy's data structure changes + */ +#define ACM_POLICY_VERSION 4 +#define ACM_CHWALL_VERSION 1 +#define ACM_STE_VERSION 1 + +/* defines a ssid reference used by xen */ +typedef uint32_t ssidref_t; + +/* hooks that are known to domains */ +#define ACMHOOK_none 0 +#define ACMHOOK_sharing 1 +#define ACMHOOK_authorization 2 +#define ACMHOOK_conflictset 3 + +/* -------security policy relevant type definitions-------- */ + +/* type identifier; compares to "equal" or "not equal" */ +typedef uint16_t domaintype_t; + +/* CHINESE WALL POLICY DATA STRUCTURES + * + * current accumulated conflict type set: + * When a domain is started and has a type that is in + * a conflict set, the conflicting types are incremented in + * the aggregate set. When a domain is destroyed, the + * conflicting types to its type are decremented. + * If a domain has multiple types, this procedure works over + * all those types. + * + * conflict_aggregate_set[i] holds the number of + * running domains that have a conflict with type i. + * + * running_types[i] holds the number of running domains + * that include type i in their ssidref-referenced type set + * + * conflict_sets[i][j] is "0" if type j has no conflict + * with type i and is "1" otherwise. + */ +/* high-16 = version, low-16 = check magic */ +#define ACM_MAGIC 0x0001debc + +/* size of the SHA1 hash identifying the XML policy from which the + binary policy was created */ +#define ACM_SHA1_HASH_SIZE 20 + +/* each offset in bytes from start of the struct they + * are part of */ + +/* V3 of the policy buffer aded a version structure */ +struct acm_policy_version +{ + uint32_t major; + uint32_t minor; +}; + + +/* each buffer consists of all policy information for + * the respective policy given in the policy code + * + * acm_policy_buffer, acm_chwall_policy_buffer, + * and acm_ste_policy_buffer need to stay 32-bit aligned + * because we create binary policies also with external + * tools that assume packed representations (e.g. the java tool) + */ +struct acm_policy_buffer { + uint32_t magic; + uint32_t policy_version; /* ACM_POLICY_VERSION */ + uint32_t len; + uint32_t policy_reference_offset; + uint32_t primary_policy_code; + uint32_t primary_buffer_offset; + uint32_t secondary_policy_code; + uint32_t secondary_buffer_offset; + struct acm_policy_version xml_pol_version; /* add in V3 */ + uint8_t xml_policy_hash[ACM_SHA1_HASH_SIZE]; /* added in V4 */ +}; + + +struct acm_policy_reference_buffer { + uint32_t len; +}; + +struct acm_chwall_policy_buffer { + uint32_t policy_version; /* ACM_CHWALL_VERSION */ + uint32_t policy_code; + uint32_t chwall_max_types; + uint32_t chwall_max_ssidrefs; + uint32_t chwall_max_conflictsets; + uint32_t chwall_ssid_offset; + uint32_t chwall_conflict_sets_offset; + uint32_t chwall_running_types_offset; + uint32_t chwall_conflict_aggregate_offset; +}; + +struct acm_ste_policy_buffer { + uint32_t policy_version; /* ACM_STE_VERSION */ + uint32_t policy_code; + uint32_t ste_max_types; + uint32_t ste_max_ssidrefs; + uint32_t ste_ssid_offset; +}; + +struct acm_stats_buffer { + uint32_t magic; + uint32_t len; + uint32_t primary_policy_code; + uint32_t primary_stats_offset; + uint32_t secondary_policy_code; + uint32_t secondary_stats_offset; +}; + +struct acm_ste_stats_buffer { + uint32_t ec_eval_count; + uint32_t gt_eval_count; + uint32_t ec_denied_count; + uint32_t gt_denied_count; + uint32_t ec_cachehit_count; + uint32_t gt_cachehit_count; +}; + +struct acm_ssid_buffer { + uint32_t len; + ssidref_t ssidref; + uint32_t policy_reference_offset; + uint32_t primary_policy_code; + uint32_t primary_max_types; + uint32_t primary_types_offset; + uint32_t secondary_policy_code; + uint32_t secondary_max_types; + uint32_t secondary_types_offset; +}; + +#endif + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/xsm/acm_ops.h +++ linux-ec2-2.6.32/include/xen/interface/xsm/acm_ops.h @@ -0,0 +1,159 @@ +/* + * acm_ops.h: Xen access control module hypervisor commands + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Reiner Sailer + * Copyright (c) 2005,2006 International Business Machines Corporation. + */ + +#ifndef __XEN_PUBLIC_ACM_OPS_H__ +#define __XEN_PUBLIC_ACM_OPS_H__ + +#include "../xen.h" +#include "acm.h" + +/* + * Make sure you increment the interface version whenever you modify this file! + * This makes sure that old versions of acm tools will stop working in a + * well-defined way (rather than crashing the machine, for instance). + */ +#define ACM_INTERFACE_VERSION 0xAAAA000A + +/************************************************************************/ + +/* + * Prototype for this hypercall is: + * int acm_op(int cmd, void *args) + * @cmd == ACMOP_??? (access control module operation). + * @args == Operation-specific extra arguments (NULL if none). + */ + + +#define ACMOP_setpolicy 1 +struct acm_setpolicy { + /* IN */ + XEN_GUEST_HANDLE_64(void) pushcache; + uint32_t pushcache_size; +}; + + +#define ACMOP_getpolicy 2 +struct acm_getpolicy { + /* IN */ + XEN_GUEST_HANDLE_64(void) pullcache; + uint32_t pullcache_size; +}; + + +#define ACMOP_dumpstats 3 +struct acm_dumpstats { + /* IN */ + XEN_GUEST_HANDLE_64(void) pullcache; + uint32_t pullcache_size; +}; + + +#define ACMOP_getssid 4 +#define ACM_GETBY_ssidref 1 +#define ACM_GETBY_domainid 2 +struct acm_getssid { + /* IN */ + uint32_t get_ssid_by; /* ACM_GETBY_* */ + union { + domaintype_t domainid; + ssidref_t ssidref; + } id; + XEN_GUEST_HANDLE_64(void) ssidbuf; + uint32_t ssidbuf_size; +}; + +#define ACMOP_getdecision 5 +struct acm_getdecision { + /* IN */ + uint32_t get_decision_by1; /* ACM_GETBY_* */ + uint32_t get_decision_by2; /* ACM_GETBY_* */ + union { + domaintype_t domainid; + ssidref_t ssidref; + } id1; + union { + domaintype_t domainid; + ssidref_t ssidref; + } id2; + uint32_t hook; + /* OUT */ + uint32_t acm_decision; +}; + + +#define ACMOP_chgpolicy 6 +struct acm_change_policy { + /* IN */ + XEN_GUEST_HANDLE_64(void) policy_pushcache; + uint32_t policy_pushcache_size; + XEN_GUEST_HANDLE_64(void) del_array; + uint32_t delarray_size; + XEN_GUEST_HANDLE_64(void) chg_array; + uint32_t chgarray_size; + /* OUT */ + /* array with error code */ + XEN_GUEST_HANDLE_64(void) err_array; + uint32_t errarray_size; +}; + +#define ACMOP_relabeldoms 7 +struct acm_relabel_doms { + /* IN */ + XEN_GUEST_HANDLE_64(void) relabel_map; + uint32_t relabel_map_size; + /* OUT */ + XEN_GUEST_HANDLE_64(void) err_array; + uint32_t errarray_size; +}; + +/* future interface to Xen */ +struct xen_acmctl { + uint32_t cmd; + uint32_t interface_version; + union { + struct acm_setpolicy setpolicy; + struct acm_getpolicy getpolicy; + struct acm_dumpstats dumpstats; + struct acm_getssid getssid; + struct acm_getdecision getdecision; + struct acm_change_policy change_policy; + struct acm_relabel_doms relabel_doms; + } u; +}; + +typedef struct xen_acmctl xen_acmctl_t; +DEFINE_XEN_GUEST_HANDLE(xen_acmctl_t); + +#endif /* __XEN_PUBLIC_ACM_OPS_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/xsm/flask_op.h +++ linux-ec2-2.6.32/include/xen/interface/xsm/flask_op.h @@ -0,0 +1,45 @@ +/* + * This file contains the flask_op hypercall commands and definitions. + * + * Author: George Coker, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ + +#ifndef __FLASK_OP_H__ +#define __FLASK_OP_H__ + +#define FLASK_LOAD 1 +#define FLASK_GETENFORCE 2 +#define FLASK_SETENFORCE 3 +#define FLASK_CONTEXT_TO_SID 4 +#define FLASK_SID_TO_CONTEXT 5 +#define FLASK_ACCESS 6 +#define FLASK_CREATE 7 +#define FLASK_RELABEL 8 +#define FLASK_USER 9 +#define FLASK_POLICYVERS 10 +#define FLASK_GETBOOL 11 +#define FLASK_SETBOOL 12 +#define FLASK_COMMITBOOLS 13 +#define FLASK_MLS 14 +#define FLASK_DISABLE 15 +#define FLASK_GETAVC_THRESHOLD 16 +#define FLASK_SETAVC_THRESHOLD 17 +#define FLASK_AVC_HASHSTATS 18 +#define FLASK_AVC_CACHESTATS 19 +#define FLASK_MEMBER 20 + +#define FLASK_LAST FLASK_MEMBER + +typedef struct flask_op { + uint32_t cmd; + uint32_t size; + char *buf; +} flask_op_t; + +DEFINE_XEN_GUEST_HANDLE(flask_op_t); + +#endif --- linux-ec2-2.6.32.orig/include/xen/interface/io/xs_wire.h +++ linux-ec2-2.6.32/include/xen/interface/io/xs_wire.h @@ -1,6 +1,25 @@ /* * Details of the "wire" protocol between Xen Store Daemon and client * library or guest kernel. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (C) 2005 Rusty Russell IBM Corporation */ @@ -26,7 +45,9 @@ XS_SET_PERMS, XS_WATCH_EVENT, XS_ERROR, - XS_IS_DOMAIN_INTRODUCED + XS_IS_DOMAIN_INTRODUCED, + XS_RESUME, + XS_SET_TARGET }; #define XS_WRITE_NONE "NONE" @@ -39,8 +60,14 @@ int errnum; const char *errstring; }; +#ifdef EINVAL #define XSD_ERROR(x) { x, #x } -static struct xsd_errors xsd_errors[] __attribute__((unused)) = { +/* LINTED: static unused */ +static struct xsd_errors xsd_errors[] +#if defined(__GNUC__) +__attribute__((unused)) +#endif + = { XSD_ERROR(EINVAL), XSD_ERROR(EACCES), XSD_ERROR(EEXIST), @@ -56,6 +83,7 @@ XSD_ERROR(EAGAIN), XSD_ERROR(EISCONN) }; +#endif struct xsd_sockmsg { @@ -84,4 +112,11 @@ XENSTORE_RING_IDX rsp_cons, rsp_prod; }; +/* Violating this is very bad. See docs/misc/xenstore.txt. */ +#define XENSTORE_PAYLOAD_MAX 4096 + +/* Violating these just gets you an error back */ +#define XENSTORE_ABS_PATH_MAX 3072 +#define XENSTORE_REL_PATH_MAX 2048 + #endif /* _XS_WIRE_H */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/kbdif.h +++ linux-ec2-2.6.32/include/xen/interface/io/kbdif.h @@ -45,34 +45,38 @@ */ #define XENKBD_TYPE_POS 4 -struct xenkbd_motion { - uint8_t type; /* XENKBD_TYPE_MOTION */ - int32_t rel_x; /* relative X motion */ - int32_t rel_y; /* relative Y motion */ - int32_t rel_z; /* relative Z motion (wheel) */ -}; - -struct xenkbd_key { - uint8_t type; /* XENKBD_TYPE_KEY */ - uint8_t pressed; /* 1 if pressed; 0 otherwise */ - uint32_t keycode; /* KEY_* from linux/input.h */ -}; - -struct xenkbd_position { - uint8_t type; /* XENKBD_TYPE_POS */ - int32_t abs_x; /* absolute X position (in FB pixels) */ - int32_t abs_y; /* absolute Y position (in FB pixels) */ - int32_t rel_z; /* relative Z motion (wheel) */ +struct xenkbd_motion +{ + uint8_t type; /* XENKBD_TYPE_MOTION */ + int32_t rel_x; /* relative X motion */ + int32_t rel_y; /* relative Y motion */ + int32_t rel_z; /* relative Z motion (wheel) */ +}; + +struct xenkbd_key +{ + uint8_t type; /* XENKBD_TYPE_KEY */ + uint8_t pressed; /* 1 if pressed; 0 otherwise */ + uint32_t keycode; /* KEY_* from linux/input.h */ +}; + +struct xenkbd_position +{ + uint8_t type; /* XENKBD_TYPE_POS */ + int32_t abs_x; /* absolute X position (in FB pixels) */ + int32_t abs_y; /* absolute Y position (in FB pixels) */ + int32_t rel_z; /* relative Z motion (wheel) */ }; #define XENKBD_IN_EVENT_SIZE 40 -union xenkbd_in_event { - uint8_t type; - struct xenkbd_motion motion; - struct xenkbd_key key; - struct xenkbd_position pos; - char pad[XENKBD_IN_EVENT_SIZE]; +union xenkbd_in_event +{ + uint8_t type; + struct xenkbd_motion motion; + struct xenkbd_key key; + struct xenkbd_position pos; + char pad[XENKBD_IN_EVENT_SIZE]; }; /* Out events (frontend -> backend) */ @@ -85,9 +89,10 @@ #define XENKBD_OUT_EVENT_SIZE 40 -union xenkbd_out_event { - uint8_t type; - char pad[XENKBD_OUT_EVENT_SIZE]; +union xenkbd_out_event +{ + uint8_t type; + char pad[XENKBD_OUT_EVENT_SIZE]; }; /* shared page */ @@ -96,21 +101,22 @@ #define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE) #define XENKBD_IN_RING_OFFS 1024 #define XENKBD_IN_RING(page) \ - ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS)) + ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS)) #define XENKBD_IN_RING_REF(page, idx) \ - (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN]) + (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN]) #define XENKBD_OUT_RING_SIZE 1024 #define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE) #define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE) #define XENKBD_OUT_RING(page) \ - ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS)) + ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS)) #define XENKBD_OUT_RING_REF(page, idx) \ - (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN]) + (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN]) -struct xenkbd_page { - uint32_t in_cons, in_prod; - uint32_t out_cons, out_prod; +struct xenkbd_page +{ + uint32_t in_cons, in_prod; + uint32_t out_cons, out_prod; }; #endif --- linux-ec2-2.6.32.orig/include/xen/interface/io/xenbus.h +++ linux-ec2-2.6.32/include/xen/interface/io/xenbus.h @@ -3,42 +3,68 @@ * * Xenbus protocol details. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (C) 2005 XenSource Ltd. */ #ifndef _XEN_PUBLIC_IO_XENBUS_H #define _XEN_PUBLIC_IO_XENBUS_H -/* The state of either end of the Xenbus, i.e. the current communication - status of initialisation across the bus. States here imply nothing about - the state of the connection between the driver and the kernel's device - layers. */ -enum xenbus_state -{ - XenbusStateUnknown = 0, - XenbusStateInitialising = 1, - XenbusStateInitWait = 2, /* Finished early - initialisation, but waiting - for information from the peer - or hotplug scripts. */ - XenbusStateInitialised = 3, /* Initialised and waiting for a - connection from the peer. */ - XenbusStateConnected = 4, - XenbusStateClosing = 5, /* The device is being closed - due to an error or an unplug - event. */ - XenbusStateClosed = 6 +/* + * The state of either end of the Xenbus, i.e. the current communication + * status of initialisation across the bus. States here imply nothing about + * the state of the connection between the driver and the kernel's device + * layers. + */ +enum xenbus_state { + XenbusStateUnknown = 0, + + XenbusStateInitialising = 1, + + /* + * InitWait: Finished early initialisation but waiting for information + * from the peer or hotplug scripts. + */ + XenbusStateInitWait = 2, + + /* + * Initialised: Waiting for a connection from the peer. + */ + XenbusStateInitialised = 3, + XenbusStateConnected = 4, + + /* + * Closing: The device is being closed due to an error or an unplug event. + */ + XenbusStateClosing = 5, + + XenbusStateClosed = 6, + + /* + * Reconfiguring: The device is being reconfigured. + */ + XenbusStateReconfiguring = 7, + + XenbusStateReconfigured = 8 }; +typedef enum xenbus_state XenbusState; #endif /* _XEN_PUBLIC_IO_XENBUS_H */ - -/* - * Local variables: - * c-file-style: "linux" - * indent-tabs-mode: t - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/tpmif.h +++ linux-ec2-2.6.32/include/xen/interface/io/tpmif.h @@ -0,0 +1,77 @@ +/****************************************************************************** + * tpmif.h + * + * TPM I/O interface for Xen guest OSes. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2005, IBM Corporation + * + * Author: Stefan Berger, stefanb@us.ibm.com + * Grant table support: Mahadevan Gomathisankaran + * + * This code has been derived from tools/libxc/xen/io/netif.h + * + * Copyright (c) 2003-2004, Keir Fraser + */ + +#ifndef __XEN_PUBLIC_IO_TPMIF_H__ +#define __XEN_PUBLIC_IO_TPMIF_H__ + +#include "../grant_table.h" + +struct tpmif_tx_request { + unsigned long addr; /* Machine address of packet. */ + grant_ref_t ref; /* grant table access reference */ + uint16_t unused; + uint16_t size; /* Packet size in bytes. */ +}; +typedef struct tpmif_tx_request tpmif_tx_request_t; + +/* + * The TPMIF_TX_RING_SIZE defines the number of pages the + * front-end and backend can exchange (= size of array). + */ +typedef uint32_t TPMIF_RING_IDX; + +#define TPMIF_TX_RING_SIZE 1 + +/* This structure must fit in a memory page. */ + +struct tpmif_ring { + struct tpmif_tx_request req; +}; +typedef struct tpmif_ring tpmif_ring_t; + +struct tpmif_tx_interface { + struct tpmif_ring ring[TPMIF_TX_RING_SIZE]; +}; +typedef struct tpmif_tx_interface tpmif_tx_interface_t; + +#endif + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/pciif.h +++ linux-ec2-2.6.32/include/xen/interface/io/pciif.h @@ -0,0 +1,124 @@ +/* + * PCI Backend/Frontend Common Data Structures & Macros + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Ryan Wilson + */ +#ifndef __XEN_PCI_COMMON_H__ +#define __XEN_PCI_COMMON_H__ + +/* Be sure to bump this number if you change this file */ +#define XEN_PCI_MAGIC "7" + +/* xen_pci_sharedinfo flags */ +#define _XEN_PCIF_active (0) +#define XEN_PCIF_active (1<<_XEN_PCIF_active) +#define _XEN_PCIB_AERHANDLER (1) +#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER) +#define _XEN_PCIB_active (2) +#define XEN_PCIB_active (1<<_XEN_PCIB_active) + +/* xen_pci_op commands */ +#define XEN_PCI_OP_conf_read (0) +#define XEN_PCI_OP_conf_write (1) +#define XEN_PCI_OP_enable_msi (2) +#define XEN_PCI_OP_disable_msi (3) +#define XEN_PCI_OP_enable_msix (4) +#define XEN_PCI_OP_disable_msix (5) +#define XEN_PCI_OP_aer_detected (6) +#define XEN_PCI_OP_aer_resume (7) +#define XEN_PCI_OP_aer_mmio (8) +#define XEN_PCI_OP_aer_slotreset (9) + +/* xen_pci_op error numbers */ +#define XEN_PCI_ERR_success (0) +#define XEN_PCI_ERR_dev_not_found (-1) +#define XEN_PCI_ERR_invalid_offset (-2) +#define XEN_PCI_ERR_access_denied (-3) +#define XEN_PCI_ERR_not_implemented (-4) +/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */ +#define XEN_PCI_ERR_op_failed (-5) + +/* + * it should be PAGE_SIZE-sizeof(struct xen_pci_op))/sizeof(struct msix_entry)) + * Should not exceed 128 + */ +#define SH_INFO_MAX_VEC 128 + +struct xen_msix_entry { + uint16_t vector; + uint16_t entry; +}; +struct xen_pci_op { + /* IN: what action to perform: XEN_PCI_OP_* */ + uint32_t cmd; + + /* OUT: will contain an error number (if any) from errno.h */ + int32_t err; + + /* IN: which device to touch */ + uint32_t domain; /* PCI Domain/Segment */ + uint32_t bus; + uint32_t devfn; + + /* IN: which configuration registers to touch */ + int32_t offset; + int32_t size; + + /* IN/OUT: Contains the result after a READ or the value to WRITE */ + uint32_t value; + /* IN: Contains extra infor for this operation */ + uint32_t info; + /*IN: param for msi-x */ + struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC]; +}; + +/*used for pcie aer handling*/ +struct xen_pcie_aer_op +{ + + /* IN: what action to perform: XEN_PCI_OP_* */ + uint32_t cmd; + /*IN/OUT: return aer_op result or carry error_detected state as input*/ + int32_t err; + + /* IN: which device to touch */ + uint32_t domain; /* PCI Domain/Segment*/ + uint32_t bus; + uint32_t devfn; +}; +struct xen_pci_sharedinfo { + /* flags - XEN_PCIF_* */ + uint32_t flags; + struct xen_pci_op op; + struct xen_pcie_aer_op aer_op; +}; + +#endif /* __XEN_PCI_COMMON_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/blkif.h +++ linux-ec2-2.6.32/include/xen/interface/io/blkif.h @@ -3,6 +3,24 @@ * * Unified block-device I/O interface for Xen guest OSes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2003-2004, Keir Fraser */ @@ -24,8 +42,10 @@ * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). */ -typedef uint16_t blkif_vdev_t; -typedef uint64_t blkif_sector_t; +#ifndef blkif_vdev_t +#define blkif_vdev_t uint16_t +#endif +#define blkif_sector_t uint64_t /* * REQUEST CODES. @@ -34,7 +54,7 @@ #define BLKIF_OP_WRITE 1 /* * Recognised only if "feature-barrier" is present in backend xenbus info. - * The "feature_barrier" node contains a boolean indicating whether barrier + * The "feature-barrier" node contains a boolean indicating whether barrier * requests are likely to succeed or fail. Either way, a barrier request * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by * the underlying block-device hardware. The boolean simply indicates whether @@ -43,33 +63,61 @@ * create the "feature-barrier" node! */ #define BLKIF_OP_WRITE_BARRIER 2 +/* + * Recognised if "feature-flush-cache" is present in backend xenbus + * info. A flush will ask the underlying storage hardware to flush its + * non-volatile caches as appropriate. The "feature-flush-cache" node + * contains a boolean indicating whether flush requests are likely to + * succeed or fail. Either way, a flush request may fail at any time + * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying + * block-device hardware. The boolean simply indicates whether or not it + * is worthwhile for the frontend to attempt flushes. If a backend does + * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the + * "feature-flush-cache" node! + */ +#define BLKIF_OP_FLUSH_DISKCACHE 3 +/* + * Device specific command packet contained within the request + */ +#define BLKIF_OP_PACKET 4 /* * Maximum scatter/gather segments per request. - * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 +/* + * NB. first_sect and last_sect in blkif_request_segment, as well as + * sector_number in blkif_request, are always expressed in 512-byte units. + * However they must be properly aligned to the real sector size of the + * physical disk, which is reported in the "sector-size" node in the backend + * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units. + */ +struct blkif_request_segment { + grant_ref_t gref; /* reference to I/O buffer frame */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + uint8_t first_sect, last_sect; +}; + struct blkif_request { - uint8_t operation; /* BLKIF_OP_??? */ - uint8_t nr_segments; /* number of segments */ - blkif_vdev_t handle; /* only for read/write requests */ - uint64_t id; /* private guest value, echoed in resp */ - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment { - grant_ref_t gref; /* reference to I/O buffer frame */ - /* @first_sect: first sector in frame to transfer (inclusive). */ - /* @last_sect: last sector in frame to transfer (inclusive). */ - uint8_t first_sect, last_sect; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; +typedef struct blkif_request blkif_request_t; struct blkif_response { - uint64_t id; /* copied from request */ - uint8_t operation; /* copied from request */ - int16_t status; /* BLKIF_RSP_??? */ + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ }; +typedef struct blkif_response blkif_response_t; /* * STATUS RETURN CODES. --- linux-ec2-2.6.32.orig/include/xen/interface/io/netif.h +++ linux-ec2-2.6.32/include/xen/interface/io/netif.h @@ -3,6 +3,24 @@ * * Unified network-device I/O interface for Xen guest OSes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2003-2004, Keir Fraser */ @@ -47,18 +65,21 @@ #define _NETTXF_extra_info (3) #define NETTXF_extra_info (1U<<_NETTXF_extra_info) -struct xen_netif_tx_request { +struct netif_tx_request { grant_ref_t gref; /* Reference to buffer page */ uint16_t offset; /* Offset within buffer page */ uint16_t flags; /* NETTXF_* */ uint16_t id; /* Echoed in response message. */ uint16_t size; /* Packet size in bytes. */ }; +typedef struct netif_tx_request netif_tx_request_t; /* Types of netif_extra_info descriptors. */ -#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */ -#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */ -#define XEN_NETIF_EXTRA_TYPE_MAX (2) +#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */ +#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */ +#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2) /* u.mcast */ +#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */ +#define XEN_NETIF_EXTRA_TYPE_MAX (4) /* netif_extra_info flags. */ #define _XEN_NETIF_EXTRA_FLAG_MORE (0) @@ -71,49 +92,68 @@ * This structure needs to fit within both netif_tx_request and * netif_rx_response for compatibility. */ -struct xen_netif_extra_info { - uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */ - uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */ - - union { - struct { - /* - * Maximum payload size of each segment. For - * example, for TCP this is just the path MSS. - */ - uint16_t size; - - /* - * GSO type. This determines the protocol of - * the packet and any extra features required - * to segment the packet properly. - */ - uint8_t type; /* XEN_NETIF_GSO_TYPE_* */ - - /* Future expansion. */ - uint8_t pad; - - /* - * GSO features. This specifies any extra GSO - * features required to process this packet, - * such as ECN support for TCPv4. - */ - uint16_t features; /* XEN_NETIF_GSO_FEAT_* */ - } gso; +struct netif_extra_info { + uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */ + uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */ + + union { + /* + * XEN_NETIF_EXTRA_TYPE_GSO: + */ + struct { + /* + * Maximum payload size of each segment. For example, for TCP this + * is just the path MSS. + */ + uint16_t size; + + /* + * GSO type. This determines the protocol of the packet and any + * extra features required to segment the packet properly. + */ + uint8_t type; /* XEN_NETIF_GSO_TYPE_* */ + + /* Future expansion. */ + uint8_t pad; + + /* + * GSO features. This specifies any extra GSO features required + * to process this packet, such as ECN support for TCPv4. + */ + uint16_t features; /* XEN_NETIF_GSO_FEAT_* */ + } gso; + + /* + * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}: + * Backend advertises availability via 'feature-multicast-control' + * xenbus node containing value '1'. + * Frontend requests this feature by advertising + * 'request-multicast-control' xenbus node containing value '1'. + * If multicast control is requested then multicast flooding is + * disabled and the frontend must explicitly register its interest + * in multicast groups using dummy transmit requests containing + * MCAST_{ADD,DEL} extra-info fragments. + */ + struct { + uint8_t addr[6]; /* Address to add/remove. */ + } mcast; - uint16_t pad[3]; - } u; + uint16_t pad[3]; + } u; }; +typedef struct netif_extra_info netif_extra_info_t; -struct xen_netif_tx_response { - uint16_t id; - int16_t status; /* NETIF_RSP_* */ +struct netif_tx_response { + uint16_t id; + int16_t status; /* NETIF_RSP_* */ }; +typedef struct netif_tx_response netif_tx_response_t; -struct xen_netif_rx_request { - uint16_t id; /* Echoed in response message. */ - grant_ref_t gref; /* Reference to incoming granted frame */ +struct netif_rx_request { + uint16_t id; /* Echoed in response message. */ + grant_ref_t gref; /* Reference to incoming granted frame */ }; +typedef struct netif_rx_request netif_rx_request_t; /* Packet data has been validated against protocol checksum. */ #define _NETRXF_data_validated (0) @@ -131,23 +171,34 @@ #define _NETRXF_extra_info (3) #define NETRXF_extra_info (1U<<_NETRXF_extra_info) -struct xen_netif_rx_response { +struct netif_rx_response { uint16_t id; uint16_t offset; /* Offset in page of start of received packet */ uint16_t flags; /* NETRXF_* */ int16_t status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */ }; +typedef struct netif_rx_response netif_rx_response_t; /* * Generate netif ring structures and types. */ +#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H) +DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response); +DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response); +#else +#define xen_netif_tx_request netif_tx_request +#define xen_netif_rx_request netif_rx_request +#define xen_netif_tx_response netif_tx_response +#define xen_netif_rx_response netif_rx_response DEFINE_RING_TYPES(xen_netif_tx, struct xen_netif_tx_request, struct xen_netif_tx_response); DEFINE_RING_TYPES(xen_netif_rx, struct xen_netif_rx_request, struct xen_netif_rx_response); +#define xen_netif_extra_info netif_extra_info +#endif #define NETIF_RSP_DROPPED -2 #define NETIF_RSP_ERROR -1 --- linux-ec2-2.6.32.orig/include/xen/interface/io/cdromif.h +++ linux-ec2-2.6.32/include/xen/interface/io/cdromif.h @@ -0,0 +1,120 @@ +/****************************************************************************** + * cdromif.h + * + * Shared definitions between backend driver and Xen guest Virtual CDROM + * block device. + * + * Copyright (c) 2008, Pat Campell plc@novell.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_IO_CDROMIF_H__ +#define __XEN_PUBLIC_IO_CDROMIF_H__ + +/* + * Queries backend for CDROM support + */ +#define XEN_TYPE_CDROM_SUPPORT _IO('c', 1) + +struct xen_cdrom_support +{ + uint32_t type; + int8_t ret; /* returned, 0 succeded, -1 error */ + int8_t err; /* returned, backend errno */ + int8_t supported; /* returned, 1 supported */ +}; + +/* + * Opens backend device, returns drive geometry or + * any encountered errors + */ +#define XEN_TYPE_CDROM_OPEN _IO('c', 2) + +struct xen_cdrom_open +{ + uint32_t type; + int8_t ret; + int8_t err; + int8_t pad; + int8_t media_present; /* returned */ + uint32_t sectors; /* returned */ + uint32_t sector_size; /* returned */ + int32_t payload_offset; /* offset to backend node name payload */ +}; + +/* + * Queries backend for media changed status + */ +#define XEN_TYPE_CDROM_MEDIA_CHANGED _IO('c', 3) + +struct xen_cdrom_media_changed +{ + uint32_t type; + int8_t ret; + int8_t err; + int8_t media_changed; /* returned */ +}; + +/* + * Sends vcd generic CDROM packet to backend, followed + * immediately by the vcd_generic_command payload + */ +#define XEN_TYPE_CDROM_PACKET _IO('c', 4) + +struct xen_cdrom_packet +{ + uint32_t type; + int8_t ret; + int8_t err; + int8_t pad[2]; + int32_t payload_offset; /* offset to vcd_generic_command payload */ +}; + +/* CDROM_PACKET_COMMAND, payload for XEN_TYPE_CDROM_PACKET */ +struct vcd_generic_command +{ + uint8_t cmd[CDROM_PACKET_SIZE]; + uint8_t pad[4]; + uint32_t buffer_offset; + uint32_t buflen; + int32_t stat; + uint32_t sense_offset; + uint8_t data_direction; + uint8_t pad1[3]; + int32_t quiet; + int32_t timeout; +}; + +union xen_block_packet +{ + uint32_t type; + struct xen_cdrom_support xcs; + struct xen_cdrom_open xco; + struct xen_cdrom_media_changed xcmc; + struct xen_cdrom_packet xcp; +}; + +#define PACKET_PAYLOAD_OFFSET (sizeof(struct xen_cdrom_packet)) +#define PACKET_SENSE_OFFSET (PACKET_PAYLOAD_OFFSET + sizeof(struct vcd_generic_command)) +#define PACKET_BUFFER_OFFSET (PACKET_SENSE_OFFSET + sizeof(struct request_sense)) +#define MAX_PACKET_DATA (PAGE_SIZE - sizeof(struct xen_cdrom_packet) - \ + sizeof(struct vcd_generic_command) - sizeof(struct request_sense)) + +#endif --- linux-ec2-2.6.32.orig/include/xen/interface/io/fbif.h +++ linux-ec2-2.6.32/include/xen/interface/io/fbif.h @@ -41,12 +41,13 @@ */ #define XENFB_TYPE_UPDATE 2 -struct xenfb_update { - uint8_t type; /* XENFB_TYPE_UPDATE */ - int32_t x; /* source x */ - int32_t y; /* source y */ - int32_t width; /* rect width */ - int32_t height; /* rect height */ +struct xenfb_update +{ + uint8_t type; /* XENFB_TYPE_UPDATE */ + int32_t x; /* source x */ + int32_t y; /* source y */ + int32_t width; /* rect width */ + int32_t height; /* rect height */ }; /* @@ -55,36 +56,58 @@ */ #define XENFB_TYPE_RESIZE 3 -struct xenfb_resize { - uint8_t type; /* XENFB_TYPE_RESIZE */ - int32_t width; /* width in pixels */ - int32_t height; /* height in pixels */ - int32_t stride; /* stride in bytes */ - int32_t depth; /* depth in bits */ - int32_t offset; /* start offset within framebuffer */ +struct xenfb_resize +{ + uint8_t type; /* XENFB_TYPE_RESIZE */ + int32_t width; /* width in pixels */ + int32_t height; /* height in pixels */ + int32_t stride; /* stride in bytes */ + int32_t depth; /* depth in bits */ + int32_t offset; /* offset of the framebuffer in bytes */ }; #define XENFB_OUT_EVENT_SIZE 40 -union xenfb_out_event { - uint8_t type; - struct xenfb_update update; - struct xenfb_resize resize; - char pad[XENFB_OUT_EVENT_SIZE]; +union xenfb_out_event +{ + uint8_t type; + struct xenfb_update update; + struct xenfb_resize resize; + char pad[XENFB_OUT_EVENT_SIZE]; }; /* In events (backend -> frontend) */ /* * Frontends should ignore unknown in events. - * No in events currently defined. */ +/* + * Framebuffer refresh period advice + * Backend sends it to advise the frontend their preferred period of + * refresh. Frontends that keep the framebuffer constantly up-to-date + * just ignore it. Frontends that use the advice should immediately + * refresh the framebuffer (and send an update notification event if + * those have been requested), then use the update frequency to guide + * their periodical refreshs. + */ +#define XENFB_TYPE_REFRESH_PERIOD 1 +#define XENFB_NO_REFRESH 0 + +struct xenfb_refresh_period +{ + uint8_t type; /* XENFB_TYPE_UPDATE_PERIOD */ + uint32_t period; /* period of refresh, in ms, + * XENFB_NO_REFRESH if no refresh is needed */ +}; + #define XENFB_IN_EVENT_SIZE 40 -union xenfb_in_event { - uint8_t type; - char pad[XENFB_IN_EVENT_SIZE]; +union xenfb_in_event +{ + uint8_t type; + struct xenfb_refresh_period refresh_period; + char pad[XENFB_IN_EVENT_SIZE]; }; /* shared page */ @@ -93,41 +116,46 @@ #define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE) #define XENFB_IN_RING_OFFS 1024 #define XENFB_IN_RING(page) \ - ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS)) + ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS)) #define XENFB_IN_RING_REF(page, idx) \ - (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN]) + (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN]) #define XENFB_OUT_RING_SIZE 2048 #define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE) #define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE) #define XENFB_OUT_RING(page) \ - ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS)) + ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS)) #define XENFB_OUT_RING_REF(page, idx) \ - (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN]) + (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN]) -struct xenfb_page { - uint32_t in_cons, in_prod; - uint32_t out_cons, out_prod; - - int32_t width; /* width of the framebuffer (in pixels) */ - int32_t height; /* height of the framebuffer (in pixels) */ - uint32_t line_length; /* length of a row of pixels (in bytes) */ - uint32_t mem_length; /* length of the framebuffer (in bytes) */ - uint8_t depth; /* depth of a pixel (in bits) */ - - /* - * Framebuffer page directory - * - * Each directory page holds PAGE_SIZE / sizeof(*pd) - * framebuffer pages, and can thus map up to PAGE_SIZE * - * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and - * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2 - * Megs 64 bit. 256 directories give enough room for a 512 - * Meg framebuffer with a max resolution of 12,800x10,240. - * Should be enough for a while with room leftover for - * expansion. - */ - unsigned long pd[256]; +struct xenfb_page +{ + uint32_t in_cons, in_prod; + uint32_t out_cons, out_prod; + + int32_t width; /* the width of the framebuffer (in pixels) */ + int32_t height; /* the height of the framebuffer (in pixels) */ + uint32_t line_length; /* the length of a row of pixels (in bytes) */ + uint32_t mem_length; /* the length of the framebuffer (in bytes) */ + uint8_t depth; /* the depth of a pixel (in bits) */ + + /* + * Framebuffer page directory + * + * Each directory page holds PAGE_SIZE / sizeof(*pd) + * framebuffer pages, and can thus map up to PAGE_SIZE * + * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and + * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2 Megs + * 64 bit. 256 directories give enough room for a 512 Meg + * framebuffer with a max resolution of 12,800x10,240. Should + * be enough for a while with room leftover for expansion. + */ +#ifndef CONFIG_PARAVIRT_XEN + unsigned long pd[256]; +#else + /* Two directory pages should be enough for a while. */ + unsigned long pd[2]; +#endif }; /* --- linux-ec2-2.6.32.orig/include/xen/interface/io/usbif.h +++ linux-ec2-2.6.32/include/xen/interface/io/usbif.h @@ -0,0 +1,151 @@ +/* + * usbif.h + * + * USB I/O interface for Xen guest OSes. + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_IO_USBIF_H__ +#define __XEN_PUBLIC_IO_USBIF_H__ + +#include "ring.h" +#include "../grant_table.h" + +enum usb_spec_version { + USB_VER_UNKNOWN = 0, + USB_VER_USB11, + USB_VER_USB20, + USB_VER_USB30, /* not supported yet */ +}; + +/* + * USB pipe in usbif_request + * + * bits 0-5 are specific bits for virtual USB driver. + * bits 7-31 are standard urb pipe. + * + * - port number(NEW): bits 0-4 + * (USB_MAXCHILDREN is 31) + * + * - operation flag(NEW): bit 5 + * (0 = submit urb, + * 1 = unlink urb) + * + * - direction: bit 7 + * (0 = Host-to-Device [Out] + * 1 = Device-to-Host [In]) + * + * - device address: bits 8-14 + * + * - endpoint: bits 15-18 + * + * - pipe type: bits 30-31 + * (00 = isochronous, 01 = interrupt, + * 10 = control, 11 = bulk) + */ +#define usbif_pipeportnum(pipe) ((pipe) & 0x1f) +#define usbif_setportnum_pipe(pipe, portnum) \ + ((pipe)|(portnum)) + +#define usbif_pipeunlink(pipe) ((pipe) & 0x20) +#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe)) +#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20)) + +#define USBIF_BACK_MAX_PENDING_REQS (128) +#define USBIF_MAX_SEGMENTS_PER_REQUEST (16) + +/* + * RING for transferring urbs. + */ +struct usbif_request_segment { + grant_ref_t gref; + uint16_t offset; + uint16_t length; +}; + +struct usbif_urb_request { + uint16_t id; /* request id */ + uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */ + + /* basic urb parameter */ + uint32_t pipe; + uint16_t transfer_flags; + uint16_t buffer_length; + union { + uint8_t ctrl[8]; /* setup_packet (Ctrl) */ + + struct { + uint16_t interval; /* maximum (1024*8) in usb core */ + uint16_t start_frame; /* start frame */ + uint16_t number_of_packets; /* number of ISO packet */ + uint16_t nr_frame_desc_segs; /* number of iso_frame_desc segments */ + } isoc; + + struct { + uint16_t interval; /* maximum (1024*8) in usb core */ + uint16_t pad[3]; + } intr; + + struct { + uint16_t unlink_id; /* unlink request id */ + uint16_t pad[3]; + } unlink; + + } u; + + /* urb data segments */ + struct usbif_request_segment seg[USBIF_MAX_SEGMENTS_PER_REQUEST]; +}; +typedef struct usbif_urb_request usbif_urb_request_t; + +struct usbif_urb_response { + uint16_t id; /* request id */ + uint16_t start_frame; /* start frame (ISO) */ + int32_t status; /* status (non-ISO) */ + int32_t actual_length; /* actual transfer length */ + int32_t error_count; /* number of ISO errors */ +}; +typedef struct usbif_urb_response usbif_urb_response_t; + +DEFINE_RING_TYPES(usbif_urb, struct usbif_urb_request, struct usbif_urb_response); +#define USB_URB_RING_SIZE __RING_SIZE((struct usbif_urb_sring *)0, PAGE_SIZE) + +/* + * RING for notifying connect/disconnect events to frontend + */ +struct usbif_conn_request { + uint16_t id; +}; +typedef struct usbif_conn_request usbif_conn_request_t; + +struct usbif_conn_response { + uint16_t id; /* request id */ + uint8_t portnum; /* port number */ + uint8_t speed; /* usb_device_speed */ +}; +typedef struct usbif_conn_response usbif_conn_response_t; + +DEFINE_RING_TYPES(usbif_conn, struct usbif_conn_request, struct usbif_conn_response); +#define USB_CONN_RING_SIZE __RING_SIZE((struct usbif_conn_sring *)0, PAGE_SIZE) + +#endif /* __XEN_PUBLIC_IO_USBIF_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/ring.h +++ linux-ec2-2.6.32/include/xen/interface/io/ring.h @@ -3,16 +3,42 @@ * * Shared producer-consumer ring macros. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Tim Deegan and Andrew Warfield November 2004. */ #ifndef __XEN_PUBLIC_IO_RING_H__ #define __XEN_PUBLIC_IO_RING_H__ +#include "../xen-compat.h" + +#if __XEN_INTERFACE_VERSION__ < 0x00030208 +#define xen_mb() mb() +#define xen_rmb() rmb() +#define xen_wmb() wmb() +#endif + typedef unsigned int RING_IDX; /* Round a 32-bit unsigned constant down to the nearest power of two. */ -#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) +#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) #define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x)) #define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x)) #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x)) @@ -25,73 +51,77 @@ * power of two (so we can mask with (size-1) to loop around). */ #define __RING_SIZE(_s, _sz) \ - (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) + (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) /* * Macros to make the correct C datatypes for a new kind of ring. * * To make a new ring datatype, you need to have two message structures, - * let's say struct request, and struct response already defined. + * let's say request_t, and response_t already defined. * * In a header where you want the ring datatype declared, you then do: * - * DEFINE_RING_TYPES(mytag, struct request, struct response); + * DEFINE_RING_TYPES(mytag, request_t, response_t); * * These expand out to give you a set of types, as you can see below. * The most important of these are: * - * struct mytag_sring - The shared ring. - * struct mytag_front_ring - The 'front' half of the ring. - * struct mytag_back_ring - The 'back' half of the ring. + * mytag_sring_t - The shared ring. + * mytag_front_ring_t - The 'front' half of the ring. + * mytag_back_ring_t - The 'back' half of the ring. * * To initialize a ring in your code you need to know the location and size * of the shared memory area (PAGE_SIZE, for instance). To initialise * the front half: * - * struct mytag_front_ring front_ring; - * SHARED_RING_INIT((struct mytag_sring *)shared_page); - * FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page, - * PAGE_SIZE); + * mytag_front_ring_t front_ring; + * SHARED_RING_INIT((mytag_sring_t *)shared_page); + * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); * * Initializing the back follows similarly (note that only the front * initializes the shared ring): * - * struct mytag_back_ring back_ring; - * BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page, - * PAGE_SIZE); + * mytag_back_ring_t back_ring; + * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); */ -#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ - \ -/* Shared ring entry */ \ -union __name##_sring_entry { \ - __req_t req; \ - __rsp_t rsp; \ -}; \ - \ -/* Shared ring page */ \ -struct __name##_sring { \ - RING_IDX req_prod, req_event; \ - RING_IDX rsp_prod, rsp_event; \ - uint8_t pad[48]; \ - union __name##_sring_entry ring[1]; /* variable-length */ \ -}; \ - \ -/* "Front" end's private variables */ \ -struct __name##_front_ring { \ - RING_IDX req_prod_pvt; \ - RING_IDX rsp_cons; \ - unsigned int nr_ents; \ - struct __name##_sring *sring; \ -}; \ - \ -/* "Back" end's private variables */ \ -struct __name##_back_ring { \ - RING_IDX rsp_prod_pvt; \ - RING_IDX req_cons; \ - unsigned int nr_ents; \ - struct __name##_sring *sring; \ -}; +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ + \ +/* Shared ring entry */ \ +union __name##_sring_entry { \ + __req_t req; \ + __rsp_t rsp; \ +}; \ + \ +/* Shared ring page */ \ +struct __name##_sring { \ + RING_IDX req_prod, req_event; \ + RING_IDX rsp_prod, rsp_event; \ + uint8_t netfront_smartpoll_active; \ + uint8_t pad[47]; \ + union __name##_sring_entry ring[1]; /* variable-length */ \ +}; \ + \ +/* "Front" end's private variables */ \ +struct __name##_front_ring { \ + RING_IDX req_prod_pvt; \ + RING_IDX rsp_cons; \ + unsigned int nr_ents; \ + struct __name##_sring *sring; \ +}; \ + \ +/* "Back" end's private variables */ \ +struct __name##_back_ring { \ + RING_IDX rsp_prod_pvt; \ + RING_IDX req_cons; \ + unsigned int nr_ents; \ + struct __name##_sring *sring; \ +}; \ + \ +/* Syntactic sugar */ \ +typedef struct __name##_sring __name##_sring_t; \ +typedef struct __name##_front_ring __name##_front_ring_t; \ +typedef struct __name##_back_ring __name##_back_ring_t /* * Macros for manipulating rings. @@ -109,86 +139,94 @@ */ /* Initialising empty rings */ -#define SHARED_RING_INIT(_s) do { \ - (_s)->req_prod = (_s)->rsp_prod = 0; \ - (_s)->req_event = (_s)->rsp_event = 1; \ - memset((_s)->pad, 0, sizeof((_s)->pad)); \ +#define SHARED_RING_INIT(_s) do { \ + (_s)->req_prod = (_s)->rsp_prod = 0; \ + (_s)->req_event = (_s)->rsp_event = 1; \ + (void)memset((_s)->pad, 0, sizeof((_s)->pad)); \ } while(0) -#define FRONT_RING_INIT(_r, _s, __size) do { \ - (_r)->req_prod_pvt = 0; \ - (_r)->rsp_cons = 0; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ - (_r)->sring = (_s); \ +#define FRONT_RING_INIT(_r, _s, __size) do { \ + (_r)->req_prod_pvt = 0; \ + (_r)->rsp_cons = 0; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ + (_r)->sring = (_s); \ } while (0) -#define BACK_RING_INIT(_r, _s, __size) do { \ - (_r)->rsp_prod_pvt = 0; \ - (_r)->req_cons = 0; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ - (_r)->sring = (_s); \ +#define BACK_RING_INIT(_r, _s, __size) do { \ + (_r)->rsp_prod_pvt = 0; \ + (_r)->req_cons = 0; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ + (_r)->sring = (_s); \ } while (0) /* Initialize to existing shared indexes -- for recovery */ -#define FRONT_RING_ATTACH(_r, _s, __size) do { \ - (_r)->sring = (_s); \ - (_r)->req_prod_pvt = (_s)->req_prod; \ - (_r)->rsp_cons = (_s)->rsp_prod; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ +#define FRONT_RING_ATTACH(_r, _s, __size) do { \ + (_r)->sring = (_s); \ + (_r)->req_prod_pvt = (_s)->req_prod; \ + (_r)->rsp_cons = (_s)->rsp_prod; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ } while (0) -#define BACK_RING_ATTACH(_r, _s, __size) do { \ - (_r)->sring = (_s); \ - (_r)->rsp_prod_pvt = (_s)->rsp_prod; \ - (_r)->req_cons = (_s)->req_prod; \ - (_r)->nr_ents = __RING_SIZE(_s, __size); \ +#define BACK_RING_ATTACH(_r, _s, __size) do { \ + (_r)->sring = (_s); \ + (_r)->rsp_prod_pvt = (_s)->rsp_prod; \ + (_r)->req_cons = (_s)->req_prod; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ } while (0) /* How big is this ring? */ -#define RING_SIZE(_r) \ +#define RING_SIZE(_r) \ ((_r)->nr_ents) /* Number of free requests (for use on front side only). */ -#define RING_FREE_REQUESTS(_r) \ +#define RING_FREE_REQUESTS(_r) \ (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons)) /* Test if there is an empty slot available on the front ring. * (This is only meaningful from the front. ) */ -#define RING_FULL(_r) \ +#define RING_FULL(_r) \ (RING_FREE_REQUESTS(_r) == 0) /* Test if there are outstanding messages to be processed on a ring. */ -#define RING_HAS_UNCONSUMED_RESPONSES(_r) \ +#define RING_HAS_UNCONSUMED_RESPONSES(_r) \ ((_r)->sring->rsp_prod - (_r)->rsp_cons) -#define RING_HAS_UNCONSUMED_REQUESTS(_r) \ - ({ \ - unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ - unsigned int rsp = RING_SIZE(_r) - \ - ((_r)->req_cons - (_r)->rsp_prod_pvt); \ - req < rsp ? req : rsp; \ - }) +#ifdef __GNUC__ +#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \ + unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ + unsigned int rsp = RING_SIZE(_r) - \ + ((_r)->req_cons - (_r)->rsp_prod_pvt); \ + req < rsp ? req : rsp; \ +}) +#else +/* Same as above, but without the nice GCC ({ ... }) syntax. */ +#define RING_HAS_UNCONSUMED_REQUESTS(_r) \ + ((((_r)->sring->req_prod - (_r)->req_cons) < \ + (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ? \ + ((_r)->sring->req_prod - (_r)->req_cons) : \ + (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) +#endif /* Direct access to individual ring elements, by index. */ -#define RING_GET_REQUEST(_r, _idx) \ +#define RING_GET_REQUEST(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) -#define RING_GET_RESPONSE(_r, _idx) \ +#define RING_GET_RESPONSE(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) /* Loop termination condition: Would the specified index overflow the ring? */ -#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) -#define RING_PUSH_REQUESTS(_r) do { \ - wmb(); /* back sees requests /before/ updated producer index */ \ - (_r)->sring->req_prod = (_r)->req_prod_pvt; \ +#define RING_PUSH_REQUESTS(_r) do { \ + xen_wmb(); /* back sees requests /before/ updated producer index */ \ + (_r)->sring->req_prod = (_r)->req_prod_pvt; \ } while (0) -#define RING_PUSH_RESPONSES(_r) do { \ - wmb(); /* front sees responses /before/ updated producer index */ \ - (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ +#define RING_PUSH_RESPONSES(_r) do { \ + xen_wmb(); /* front sees resps /before/ updated producer index */ \ + (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ } while (0) /* @@ -221,40 +259,40 @@ * field appropriately. */ -#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ - RING_IDX __old = (_r)->sring->req_prod; \ - RING_IDX __new = (_r)->req_prod_pvt; \ - wmb(); /* back sees requests /before/ updated producer index */ \ - (_r)->sring->req_prod = __new; \ - mb(); /* back sees new requests /before/ we check req_event */ \ - (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ - (RING_IDX)(__new - __old)); \ +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ + RING_IDX __old = (_r)->sring->req_prod; \ + RING_IDX __new = (_r)->req_prod_pvt; \ + xen_wmb(); /* back sees requests /before/ updated producer index */ \ + (_r)->sring->req_prod = __new; \ + xen_mb(); /* back sees new requests /before/ we check req_event */ \ + (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ + (RING_IDX)(__new - __old)); \ } while (0) -#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ - RING_IDX __old = (_r)->sring->rsp_prod; \ - RING_IDX __new = (_r)->rsp_prod_pvt; \ - wmb(); /* front sees responses /before/ updated producer index */ \ - (_r)->sring->rsp_prod = __new; \ - mb(); /* front sees new responses /before/ we check rsp_event */ \ - (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ - (RING_IDX)(__new - __old)); \ +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ + RING_IDX __old = (_r)->sring->rsp_prod; \ + RING_IDX __new = (_r)->rsp_prod_pvt; \ + xen_wmb(); /* front sees resps /before/ updated producer index */ \ + (_r)->sring->rsp_prod = __new; \ + xen_mb(); /* front sees new resps /before/ we check rsp_event */ \ + (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ + (RING_IDX)(__new - __old)); \ } while (0) -#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ - (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ - if (_work_to_do) break; \ - (_r)->sring->req_event = (_r)->req_cons + 1; \ - mb(); \ - (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ + if (_work_to_do) break; \ + (_r)->sring->req_event = (_r)->req_cons + 1; \ + xen_mb(); \ + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ } while (0) -#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ - (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ - if (_work_to_do) break; \ - (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ - mb(); \ - (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ + if (_work_to_do) break; \ + (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ + xen_mb(); \ + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ } while (0) #endif /* __XEN_PUBLIC_IO_RING_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/protocols.h +++ linux-ec2-2.6.32/include/xen/interface/io/protocols.h @@ -1,10 +1,31 @@ +/****************************************************************************** + * protocols.h + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + #ifndef __XEN_PROTOCOLS_H__ #define __XEN_PROTOCOLS_H__ #define XEN_IO_PROTO_ABI_X86_32 "x86_32-abi" #define XEN_IO_PROTO_ABI_X86_64 "x86_64-abi" #define XEN_IO_PROTO_ABI_IA64 "ia64-abi" -#define XEN_IO_PROTO_ABI_POWERPC64 "powerpc64-abi" #if defined(__i386__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32 @@ -12,8 +33,6 @@ # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64 #elif defined(__ia64__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64 -#elif defined(__powerpc64__) -# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_POWERPC64 #else # error arch fixup needed here #endif --- linux-ec2-2.6.32.orig/include/xen/interface/io/vscsiif.h +++ linux-ec2-2.6.32/include/xen/interface/io/vscsiif.h @@ -0,0 +1,105 @@ +/****************************************************************************** + * vscsiif.h + * + * Based on the blkif.h code. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright(c) FUJITSU Limited 2008. + */ + +#ifndef __XEN__PUBLIC_IO_SCSI_H__ +#define __XEN__PUBLIC_IO_SCSI_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* command between backend and frontend */ +#define VSCSIIF_ACT_SCSI_CDB 1 /* SCSI CDB command */ +#define VSCSIIF_ACT_SCSI_ABORT 2 /* SCSI Device(Lun) Abort*/ +#define VSCSIIF_ACT_SCSI_RESET 3 /* SCSI Device(Lun) Reset*/ + + +#define VSCSIIF_BACK_MAX_PENDING_REQS 128 + +/* + * Maximum scatter/gather segments per request. + * + * Considering balance between allocating al least 16 "vscsiif_request" + * structures on one page (4096bytes) and number of scatter gather + * needed, we decided to use 26 as a magic number. + */ +#define VSCSIIF_SG_TABLESIZE 26 + +/* + * base on linux kernel 2.6.18 + */ +#define VSCSIIF_MAX_COMMAND_SIZE 16 +#define VSCSIIF_SENSE_BUFFERSIZE 96 + + +struct vscsiif_request { + uint16_t rqid; /* private guest value, echoed in resp */ + uint8_t act; /* command between backend and frontend */ + uint8_t cmd_len; + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; + uint16_t timeout_per_command; /* The command is issued by twice + the value in Backend. */ + uint16_t channel, id, lun; + uint16_t padding; + uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) + DMA_FROM_DEVICE(2) + DMA_NONE(3) requests */ + uint8_t nr_segments; /* Number of pieces of scatter-gather */ + + struct scsiif_request_segment { + grant_ref_t gref; + uint16_t offset; + uint16_t length; + } seg[VSCSIIF_SG_TABLESIZE]; + uint32_t reserved[3]; +}; +typedef struct vscsiif_request vscsiif_request_t; + +struct vscsiif_response { + uint16_t rqid; + uint8_t padding; + uint8_t sense_len; + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + int32_t rslt; + uint32_t residual_len; /* request bufflen - + return the value from physical device */ + uint32_t reserved[36]; +}; +typedef struct vscsiif_response vscsiif_response_t; + +DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response); + + +#endif /*__XEN__PUBLIC_IO_SCSI_H__*/ +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/console.h +++ linux-ec2-2.6.32/include/xen/interface/io/console.h @@ -3,6 +3,24 @@ * * Console I/O interface for Xen guest OSes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2005, Keir Fraser */ --- linux-ec2-2.6.32.orig/include/xen/interface/io/fsif.h +++ linux-ec2-2.6.32/include/xen/interface/io/fsif.h @@ -0,0 +1,192 @@ +/****************************************************************************** + * fsif.h + * + * Interface to FS level split device drivers. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007, Grzegorz Milos, . + */ + +#ifndef __XEN_PUBLIC_IO_FSIF_H__ +#define __XEN_PUBLIC_IO_FSIF_H__ + +#include "ring.h" +#include "../grant_table.h" + +#define REQ_FILE_OPEN 1 +#define REQ_FILE_CLOSE 2 +#define REQ_FILE_READ 3 +#define REQ_FILE_WRITE 4 +#define REQ_STAT 5 +#define REQ_FILE_TRUNCATE 6 +#define REQ_REMOVE 7 +#define REQ_RENAME 8 +#define REQ_CREATE 9 +#define REQ_DIR_LIST 10 +#define REQ_CHMOD 11 +#define REQ_FS_SPACE 12 +#define REQ_FILE_SYNC 13 + +struct fsif_open_request { + grant_ref_t gref; +}; + +struct fsif_close_request { + uint32_t fd; +}; + +struct fsif_read_request { + uint32_t fd; + int32_t pad; + uint64_t len; + uint64_t offset; + grant_ref_t grefs[1]; /* Variable length */ +}; + +struct fsif_write_request { + uint32_t fd; + int32_t pad; + uint64_t len; + uint64_t offset; + grant_ref_t grefs[1]; /* Variable length */ +}; + +struct fsif_stat_request { + uint32_t fd; +}; + +/* This structure is a copy of some fields from stat structure, returned + * via the ring. */ +struct fsif_stat_response { + int32_t stat_mode; + uint32_t stat_uid; + uint32_t stat_gid; + int32_t stat_ret; + int64_t stat_size; + int64_t stat_atime; + int64_t stat_mtime; + int64_t stat_ctime; +}; + +struct fsif_truncate_request { + uint32_t fd; + int32_t pad; + int64_t length; +}; + +struct fsif_remove_request { + grant_ref_t gref; +}; + +struct fsif_rename_request { + uint16_t old_name_offset; + uint16_t new_name_offset; + grant_ref_t gref; +}; + +struct fsif_create_request { + int8_t directory; + int8_t pad; + int16_t pad2; + int32_t mode; + grant_ref_t gref; +}; + +struct fsif_list_request { + uint32_t offset; + grant_ref_t gref; +}; + +#define NR_FILES_SHIFT 0 +#define NR_FILES_SIZE 16 /* 16 bits for the number of files mask */ +#define NR_FILES_MASK (((1ULL << NR_FILES_SIZE) - 1) << NR_FILES_SHIFT) +#define ERROR_SIZE 32 /* 32 bits for the error mask */ +#define ERROR_SHIFT (NR_FILES_SIZE + NR_FILES_SHIFT) +#define ERROR_MASK (((1ULL << ERROR_SIZE) - 1) << ERROR_SHIFT) +#define HAS_MORE_SHIFT (ERROR_SHIFT + ERROR_SIZE) +#define HAS_MORE_FLAG (1ULL << HAS_MORE_SHIFT) + +struct fsif_chmod_request { + uint32_t fd; + int32_t mode; +}; + +struct fsif_space_request { + grant_ref_t gref; +}; + +struct fsif_sync_request { + uint32_t fd; +}; + + +/* FS operation request */ +struct fsif_request { + uint8_t type; /* Type of the request */ + uint8_t pad; + uint16_t id; /* Request ID, copied to the response */ + uint32_t pad2; + union { + struct fsif_open_request fopen; + struct fsif_close_request fclose; + struct fsif_read_request fread; + struct fsif_write_request fwrite; + struct fsif_stat_request fstat; + struct fsif_truncate_request ftruncate; + struct fsif_remove_request fremove; + struct fsif_rename_request frename; + struct fsif_create_request fcreate; + struct fsif_list_request flist; + struct fsif_chmod_request fchmod; + struct fsif_space_request fspace; + struct fsif_sync_request fsync; + } u; +}; +typedef struct fsif_request fsif_request_t; + +/* FS operation response */ +struct fsif_response { + uint16_t id; + uint16_t pad1; + uint32_t pad2; + union { + uint64_t ret_val; + struct fsif_stat_response fstat; + } u; +}; + +typedef struct fsif_response fsif_response_t; + +#define FSIF_RING_ENTRY_SIZE 64 + +#define FSIF_NR_READ_GNTS ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_read_request)) / \ + sizeof(grant_ref_t) + 1) +#define FSIF_NR_WRITE_GNTS ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_write_request)) / \ + sizeof(grant_ref_t) + 1) + +DEFINE_RING_TYPES(fsif, struct fsif_request, struct fsif_response); + +#define STATE_INITIALISED "init" +#define STATE_READY "ready" +#define STATE_CLOSING "closing" +#define STATE_CLOSED "closed" + + +#endif --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86/xen.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86/xen.h @@ -0,0 +1,203 @@ +/****************************************************************************** + * arch-x86/xen.h + * + * Guest OS interface to x86 Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser + */ + +#include "../xen.h" + +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__ +#define __XEN_PUBLIC_ARCH_X86_XEN_H__ + +/* Structural guest handles introduced in 0x00030201. */ +#if __XEN_INTERFACE_VERSION__ >= 0x00030201 +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ + typedef struct { type *p; } __guest_handle_ ## name +#else +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ + typedef type * __guest_handle_ ## name +#endif + +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ + ___DEFINE_XEN_GUEST_HANDLE(name, type); \ + ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define __XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name) +#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) +#ifdef __XEN_TOOLS__ +#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) +#endif + +/* Allow co-existing Linux 2.6.23+ Xen interface definitions. */ +#define DEFINE_GUEST_HANDLE_STRUCT(name) struct name + +#if defined(__i386__) +#include "xen-x86_32.h" +#elif defined(__x86_64__) +#include "xen-x86_64.h" +#endif + +#ifndef __ASSEMBLY__ +typedef unsigned long xen_pfn_t; +#define PRI_xen_pfn "lx" +#endif + +/* + * SEGMENT DESCRIPTOR TABLES + */ +/* + * A number of GDT entries are reserved by Xen. These are not situated at the + * start of the GDT because some stupid OSes export hard-coded selector values + * in their ABI. These hard-coded values are always near the start of the GDT, + * so Xen places itself out of the way, at the far end of the GDT. + */ +#define FIRST_RESERVED_GDT_PAGE 14 +#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) + +/* Maximum number of virtual CPUs in multi-processor guests. */ +#define MAX_VIRT_CPUS 32 + +#ifndef __ASSEMBLY__ + +typedef unsigned long xen_ulong_t; + +/* + * Send an array of these to HYPERVISOR_set_trap_table(). + * The privilege level specifies which modes may enter a trap via a software + * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate + * privilege levels as follows: + * Level == 0: Noone may enter + * Level == 1: Kernel may enter + * Level == 2: Kernel may enter + * Level == 3: Everyone may enter + */ +#define TI_GET_DPL(_ti) ((_ti)->flags & 3) +#define TI_GET_IF(_ti) ((_ti)->flags & 4) +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl)) +#define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2)) +struct trap_info { + uint8_t vector; /* exception vector */ + uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ + uint16_t cs; /* code selector */ + unsigned long address; /* code offset */ +}; +typedef struct trap_info trap_info_t; +DEFINE_XEN_GUEST_HANDLE(trap_info_t); + +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ + +/* + * The following is all CPU context. Note that the fpu_ctxt block is filled + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + */ +struct vcpu_guest_context { + /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ + struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) + unsigned long flags; /* VGCF_* flags */ + struct cpu_user_regs user_regs; /* User-level CPU registers */ + struct trap_info trap_ctxt[256]; /* Virtual IDT */ + unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ + unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ + unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ + /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */ + unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ + unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ +#ifdef __i386__ + unsigned long event_callback_cs; /* CS:EIP of event callback */ + unsigned long event_callback_eip; + unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ + unsigned long failsafe_callback_eip; +#else + unsigned long event_callback_eip; + unsigned long failsafe_callback_eip; +#ifdef __XEN__ + union { + unsigned long syscall_callback_eip; + struct { + unsigned int event_callback_cs; /* compat CS of event cb */ + unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */ + }; + }; +#else + unsigned long syscall_callback_eip; +#endif +#endif + unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ +#ifdef __x86_64__ + /* Segment base addresses. */ + uint64_t fs_base; + uint64_t gs_base_kernel; + uint64_t gs_base_user; +#endif +}; +typedef struct vcpu_guest_context vcpu_guest_context_t; +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); + +struct arch_shared_info { + unsigned long max_pfn; /* max pfn that appears in table */ + /* Frame containing list of mfns containing list of mfns containing p2m. */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + uint64_t pad[32]; +}; +typedef struct arch_shared_info arch_shared_info_t; + +#endif /* !__ASSEMBLY__ */ + +/* + * Prefix forces emulation of some non-trapping instructions. + * Currently only CPUID. + */ +#ifdef __ASSEMBLY__ +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ; +#define XEN_CPUID XEN_EMULATE_PREFIX cpuid +#else +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; " +#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" +#endif + +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86/xen-mca.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86/xen-mca.h @@ -0,0 +1,422 @@ +/****************************************************************************** + * arch-x86/mca.h + * + * Contributed by Advanced Micro Devices, Inc. + * Author: Christoph Egger + * + * Guest OS machine check interface to x86 Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* Full MCA functionality has the following Usecases from the guest side: + * + * Must have's: + * 1. Dom0 and DomU register machine check trap callback handlers + * (already done via "set_trap_table" hypercall) + * 2. Dom0 registers machine check event callback handler + * (doable via EVTCHNOP_bind_virq) + * 3. Dom0 and DomU fetches machine check data + * 4. Dom0 wants Xen to notify a DomU + * 5. Dom0 gets DomU ID from physical address + * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy") + * + * Nice to have's: + * 7. Dom0 wants Xen to deactivate a physical CPU + * This is better done as separate task, physical CPU hotplugging, + * and hypercall(s) should be sysctl's + * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to + * move a DomU (or Dom0 itself) away from a malicious page + * producing correctable errors. + * 9. offlining physical page: + * Xen free's and never re-uses a certain physical page. + * 10. Testfacility: Allow Dom0 to write values into machine check MSR's + * and tell Xen to trigger a machine check + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__ +#define __XEN_PUBLIC_ARCH_X86_MCA_H__ + +/* Hypercall */ +#define __HYPERVISOR_mca __HYPERVISOR_arch_0 + +/* + * The xen-unstable repo has interface version 0x03000001; out interface + * is incompatible with that and any future minor revisions, so we + * choose a different version number range that is numerically less + * than that used in xen-unstable. + */ +#define XEN_MCA_INTERFACE_VERSION 0x01ecc003 + +/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */ +#define XEN_MC_NONURGENT 0x0001 +/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */ +#define XEN_MC_URGENT 0x0002 +/* IN: Dom0 acknowledges previosly-fetched telemetry */ +#define XEN_MC_ACK 0x0004 + +/* OUT: All is ok */ +#define XEN_MC_OK 0x0 +/* OUT: Domain could not fetch data. */ +#define XEN_MC_FETCHFAILED 0x1 +/* OUT: There was no machine check data to fetch. */ +#define XEN_MC_NODATA 0x2 +/* OUT: Between notification time and this hypercall an other + * (most likely) correctable error happened. The fetched data, + * does not match the original machine check data. */ +#define XEN_MC_NOMATCH 0x4 + +/* OUT: DomU did not register MC NMI handler. Try something else. */ +#define XEN_MC_CANNOTHANDLE 0x8 +/* OUT: Notifying DomU failed. Retry later or try something else. */ +#define XEN_MC_NOTDELIVERED 0x10 +/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */ + + +#ifndef __ASSEMBLY__ + +#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */ + +/* + * Machine Check Architecure: + * structs are read-only and used to report all kinds of + * correctable and uncorrectable errors detected by the HW. + * Dom0 and DomU: register a handler to get notified. + * Dom0 only: Correctable errors are reported via VIRQ_MCA + * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers + */ +#define MC_TYPE_GLOBAL 0 +#define MC_TYPE_BANK 1 +#define MC_TYPE_EXTENDED 2 +#define MC_TYPE_RECOVERY 3 + +struct mcinfo_common { + uint16_t type; /* structure type */ + uint16_t size; /* size of this struct in bytes */ +}; + + +#define MC_FLAG_CORRECTABLE (1 << 0) +#define MC_FLAG_UNCORRECTABLE (1 << 1) +#define MC_FLAG_RECOVERABLE (1 << 2) +#define MC_FLAG_POLLED (1 << 3) +#define MC_FLAG_RESET (1 << 4) +#define MC_FLAG_CMCI (1 << 5) +#define MC_FLAG_MCE (1 << 6) +/* contains global x86 mc information */ +struct mcinfo_global { + struct mcinfo_common common; + + /* running domain at the time in error (most likely the impacted one) */ + uint16_t mc_domid; + uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ + uint32_t mc_socketid; /* physical socket of the physical core */ + uint16_t mc_coreid; /* physical impacted core */ + uint16_t mc_core_threadid; /* core thread of physical core */ + uint32_t mc_apicid; + uint32_t mc_flags; + uint64_t mc_gstatus; /* global status */ +}; + +/* contains bank local x86 mc information */ +struct mcinfo_bank { + struct mcinfo_common common; + + uint16_t mc_bank; /* bank nr */ + uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on dom0 + * and if mc_addr is valid. Never valid on DomU. */ + uint64_t mc_status; /* bank status */ + uint64_t mc_addr; /* bank address, only valid + * if addr bit is set in mc_status */ + uint64_t mc_misc; + uint64_t mc_ctrl2; + uint64_t mc_tsc; +}; + + +struct mcinfo_msr { + uint64_t reg; /* MSR */ + uint64_t value; /* MSR value */ +}; + +/* contains mc information from other + * or additional mc MSRs */ +struct mcinfo_extended { + struct mcinfo_common common; + + /* You can fill up to five registers. + * If you need more, then use this structure + * multiple times. */ + + uint32_t mc_msrs; /* Number of msr with valid values. */ + /* + * Currently Intel extended MSR (32/64) include all gp registers + * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be + * useful at present. So expand this array to 16/32 to leave room. + */ + struct mcinfo_msr mc_msr[sizeof(void *) * 4]; +}; + +/* Recovery Action flags. Giving recovery result information to DOM0 */ + +/* Xen takes successful recovery action, the error is recovered */ +#define REC_ACTION_RECOVERED (0x1 << 0) +/* No action is performed by XEN */ +#define REC_ACTION_NONE (0x1 << 1) +/* It's possible DOM0 might take action ownership in some case */ +#define REC_ACTION_NEED_RESET (0x1 << 2) + +/* Different Recovery Action types, if the action is performed successfully, + * REC_ACTION_RECOVERED flag will be returned. + */ + +/* Page Offline Action */ +#define MC_ACTION_PAGE_OFFLINE (0x1 << 0) +/* CPU offline Action */ +#define MC_ACTION_CPU_OFFLINE (0x1 << 1) +/* L3 cache disable Action */ +#define MC_ACTION_CACHE_SHRINK (0x1 << 2) + +/* Below interface used between XEN/DOM0 for passing XEN's recovery action + * information to DOM0. + * usage Senario: After offlining broken page, XEN might pass its page offline + * recovery action result to DOM0. DOM0 will save the information in + * non-volatile memory for further proactive actions, such as offlining the + * easy broken page earlier when doing next reboot. +*/ +struct page_offline_action +{ + /* Params for passing the offlined page number to DOM0 */ + uint64_t mfn; + uint64_t status; +}; + +struct cpu_offline_action +{ + /* Params for passing the identity of the offlined CPU to DOM0 */ + uint32_t mc_socketid; + uint16_t mc_coreid; + uint16_t mc_core_threadid; +}; + +#define MAX_UNION_SIZE 16 +struct mcinfo_recovery +{ + struct mcinfo_common common; + uint16_t mc_bank; /* bank nr */ + uint8_t action_flags; + uint8_t action_types; + union { + struct page_offline_action page_retire; + struct cpu_offline_action cpu_offline; + uint8_t pad[MAX_UNION_SIZE]; + } action_info; +}; + + +#define MCINFO_HYPERCALLSIZE 1024 +#define MCINFO_MAXSIZE 768 + +struct mc_info { + /* Number of mcinfo_* entries in mi_data */ + uint32_t mi_nentries; + uint32_t _pad0; + uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8]; +}; +typedef struct mc_info mc_info_t; +DEFINE_XEN_GUEST_HANDLE(mc_info_t); + +#define __MC_MSR_ARRAYSIZE 8 +#define __MC_NMSRS 1 +#define MC_NCAPS 7 /* 7 CPU feature flag words */ +#define MC_CAPS_STD_EDX 0 /* cpuid level 0x00000001 (%edx) */ +#define MC_CAPS_AMD_EDX 1 /* cpuid level 0x80000001 (%edx) */ +#define MC_CAPS_TM 2 /* cpuid level 0x80860001 (TransMeta) */ +#define MC_CAPS_LINUX 3 /* Linux-defined */ +#define MC_CAPS_STD_ECX 4 /* cpuid level 0x00000001 (%ecx) */ +#define MC_CAPS_VIA 5 /* cpuid level 0xc0000001 */ +#define MC_CAPS_AMD_ECX 6 /* cpuid level 0x80000001 (%ecx) */ + +struct mcinfo_logical_cpu { + uint32_t mc_cpunr; + uint32_t mc_chipid; + uint16_t mc_coreid; + uint16_t mc_threadid; + uint32_t mc_apicid; + uint32_t mc_clusterid; + uint32_t mc_ncores; + uint32_t mc_ncores_active; + uint32_t mc_nthreads; + int32_t mc_cpuid_level; + uint32_t mc_family; + uint32_t mc_vendor; + uint32_t mc_model; + uint32_t mc_step; + char mc_vendorid[16]; + char mc_brandid[64]; + uint32_t mc_cpu_caps[MC_NCAPS]; + uint32_t mc_cache_size; + uint32_t mc_cache_alignment; + int32_t mc_nmsrvals; + struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE]; +}; +typedef struct mcinfo_logical_cpu xen_mc_logical_cpu_t; +DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t); + + +/* + * OS's should use these instead of writing their own lookup function + * each with its own bugs and drawbacks. + * We use macros instead of static inline functions to allow guests + * to include this header in assembly files (*.S). + */ +/* Prototype: + * uint32_t x86_mcinfo_nentries(struct mc_info *mi); + */ +#define x86_mcinfo_nentries(_mi) \ + (_mi)->mi_nentries +/* Prototype: + * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi); + */ +#define x86_mcinfo_first(_mi) \ + ((struct mcinfo_common *)(_mi)->mi_data) +/* Prototype: + * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic); + */ +#define x86_mcinfo_next(_mic) \ + ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)) + +/* Prototype: + * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type); + */ +#define x86_mcinfo_lookup(_ret, _mi, _type) \ + do { \ + uint32_t found, i; \ + struct mcinfo_common *_mic; \ + \ + found = 0; \ + (_ret) = NULL; \ + if (_mi == NULL) break; \ + _mic = x86_mcinfo_first(_mi); \ + for (i = 0; i < x86_mcinfo_nentries(_mi); i++) { \ + if (_mic->type == (_type)) { \ + found = 1; \ + break; \ + } \ + _mic = x86_mcinfo_next(_mic); \ + } \ + (_ret) = found ? _mic : NULL; \ + } while (0) + + +/* Usecase 1 + * Register machine check trap callback handler + * (already done via "set_trap_table" hypercall) + */ + +/* Usecase 2 + * Dom0 registers machine check event callback handler + * done by EVTCHNOP_bind_virq + */ + +/* Usecase 3 + * Fetch machine check data from hypervisor. + * Note, this hypercall is special, because both Dom0 and DomU must use this. + */ +#define XEN_MC_fetch 1 +struct xen_mc_fetch { + /* IN/OUT variables. */ + uint32_t flags; /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT, + XEN_MC_ACK if ack'ing an earlier fetch */ + /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, + XEN_MC_NODATA, XEN_MC_NOMATCH */ + uint32_t _pad0; + uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */ + + /* OUT variables. */ + XEN_GUEST_HANDLE(mc_info_t) data; +}; +typedef struct xen_mc_fetch xen_mc_fetch_t; +DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t); + + +/* Usecase 4 + * This tells the hypervisor to notify a DomU about the machine check error + */ +#define XEN_MC_notifydomain 2 +struct xen_mc_notifydomain { + /* IN variables. */ + uint16_t mc_domid; /* The unprivileged domain to notify. */ + uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify. + * Usually echo'd value from the fetch hypercall. */ + + /* IN/OUT variables. */ + uint32_t flags; + +/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */ +/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */ +}; +typedef struct xen_mc_notifydomain xen_mc_notifydomain_t; +DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t); + +#define XEN_MC_physcpuinfo 3 +struct xen_mc_physcpuinfo { + /* IN/OUT */ + uint32_t ncpus; + uint32_t _pad0; + /* OUT */ + XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info; +}; + +#define XEN_MC_msrinject 4 +#define MC_MSRINJ_MAXMSRS 8 +struct xen_mc_msrinject { + /* IN */ + uint32_t mcinj_cpunr; /* target processor id */ + uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */ + uint32_t mcinj_count; /* 0 .. count-1 in array are valid */ + uint32_t _pad0; + struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS]; +}; + +/* Flags for mcinj_flags above; bits 16-31 are reserved */ +#define MC_MSRINJ_F_INTERPOSE 0x1 + +#define XEN_MC_mceinject 5 +struct xen_mc_mceinject { + unsigned int mceinj_cpunr; /* target processor id */ +}; + +struct xen_mc { + uint32_t cmd; + uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */ + union { + struct xen_mc_fetch mc_fetch; + struct xen_mc_notifydomain mc_notifydomain; + struct xen_mc_physcpuinfo mc_physcpuinfo; + struct xen_mc_msrinject mc_msrinject; + struct xen_mc_mceinject mc_mceinject; + } u; +}; +typedef struct xen_mc xen_mc_t; +DEFINE_XEN_GUEST_HANDLE(xen_mc_t); + +#endif /* __ASSEMBLY__ */ + +#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86/xen-x86_64.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86/xen-x86_64.h @@ -0,0 +1,212 @@ +/****************************************************************************** + * xen-x86_64.h + * + * Guest OS interface to x86 64-bit Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ +#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ + +/* + * Hypercall interface: + * Input: %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5) + * Output: %rax + * Access is via hypercall page (set up by guest loader or via a Xen MSR): + * call hypercall_page + hypercall-number * 32 + * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi) + */ + +#if __XEN_INTERFACE_VERSION__ < 0x00030203 +/* + * Legacy hypercall interface: + * As above, except the entry sequence to the hypervisor is: + * mov $hypercall-number*32,%eax ; syscall + * Clobbered: %rcx, %r11, argument registers (as above) + */ +#define TRAP_INSTR "syscall" +#endif + +/* + * 64-bit segment selectors + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ + +#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */ +#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */ +#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_DS64 0x0000 /* NULL selector */ +#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */ + +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32 +#define FLAT_KERNEL_DS FLAT_KERNEL_DS64 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32 +#define FLAT_KERNEL_CS FLAT_KERNEL_CS64 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32 +#define FLAT_KERNEL_SS FLAT_KERNEL_SS64 + +#define FLAT_USER_DS64 FLAT_RING3_DS64 +#define FLAT_USER_DS32 FLAT_RING3_DS32 +#define FLAT_USER_DS FLAT_USER_DS64 +#define FLAT_USER_CS64 FLAT_RING3_CS64 +#define FLAT_USER_CS32 FLAT_RING3_CS32 +#define FLAT_USER_CS FLAT_USER_CS64 +#define FLAT_USER_SS64 FLAT_RING3_SS64 +#define FLAT_USER_SS32 FLAT_RING3_SS32 +#define FLAT_USER_SS FLAT_USER_SS64 + +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000 +#define __HYPERVISOR_VIRT_END 0xFFFF880000000000 +#define __MACH2PHYS_VIRT_START 0xFFFF800000000000 +#define __MACH2PHYS_VIRT_END 0xFFFF804000000000 + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) +#endif + +#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +#endif + +/* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) + * @which == SEGBASE_* ; @base == 64-bit base address + * Returns 0 on success. + */ +#define SEGBASE_FS 0 +#define SEGBASE_GS_USER 1 +#define SEGBASE_GS_KERNEL 2 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */ + +/* + * int HYPERVISOR_iret(void) + * All arguments are on the kernel stack, in the following format. + * Never returns if successful. Current kernel context is lost. + * The saved CS is mapped as follows: + * RING0 -> RING3 kernel mode. + * RING1 -> RING3 kernel mode. + * RING2 -> RING3 kernel mode. + * RING3 -> RING3 user mode. + * However RING0 indicates that the guest kernel should return to iteself + * directly with + * orb $3,1*8(%rsp) + * iretq + * If flags contains VGCF_in_syscall: + * Restore RAX, RIP, RFLAGS, RSP. + * Discard R11, RCX, CS, SS. + * Otherwise: + * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP. + * All other registers are saved on hypercall entry and restored to user. + */ +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */ +#define _VGCF_in_syscall 8 +#define VGCF_in_syscall (1<<_VGCF_in_syscall) +#define VGCF_IN_SYSCALL VGCF_in_syscall + +#ifndef __ASSEMBLY__ + +struct iret_context { + /* Top of stack (%rsp at point of hypercall). */ + uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + /* Bottom of iret stack frame. */ +}; + +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */ +#define __DECL_REG(name) union { \ + uint64_t r ## name, e ## name; \ + uint32_t _e ## name; \ +} +#else +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */ +#define __DECL_REG(name) uint64_t r ## name +#endif + +struct cpu_user_regs { + uint64_t r15; + uint64_t r14; + uint64_t r13; + uint64_t r12; + __DECL_REG(bp); + __DECL_REG(bx); + uint64_t r11; + uint64_t r10; + uint64_t r9; + uint64_t r8; + __DECL_REG(ax); + __DECL_REG(cx); + __DECL_REG(dx); + __DECL_REG(si); + __DECL_REG(di); + uint32_t error_code; /* private */ + uint32_t entry_vector; /* private */ + __DECL_REG(ip); + uint16_t cs, _pad0[1]; + uint8_t saved_upcall_mask; + uint8_t _pad1[3]; + __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */ + __DECL_REG(sp); + uint16_t ss, _pad2[3]; + uint16_t es, _pad3[3]; + uint16_t ds, _pad4[3]; + uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */ + uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */ +}; +typedef struct cpu_user_regs cpu_user_regs_t; +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); + +#undef __DECL_REG + +#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12) +#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12) + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ +}; +typedef struct arch_vcpu_info arch_vcpu_info_t; + +typedef unsigned long xen_callback_t; + +#endif /* !__ASSEMBLY__ */ + +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86/xen-x86_32.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86/xen-x86_32.h @@ -0,0 +1,180 @@ +/****************************************************************************** + * xen-x86_32.h + * + * Guest OS interface to x86 32-bit Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2007, K A Fraser + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ +#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ + +/* + * Hypercall interface: + * Input: %ebx, %ecx, %edx, %esi, %edi (arguments 1-5) + * Output: %eax + * Access is via hypercall page (set up by guest loader or via a Xen MSR): + * call hypercall_page + hypercall-number * 32 + * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx) + */ + +#if __XEN_INTERFACE_VERSION__ < 0x00030203 +/* + * Legacy hypercall interface: + * As above, except the entry sequence to the hypervisor is: + * mov $hypercall-number*32,%eax ; int $0x82 + */ +#define TRAP_INSTR "int $0x82" +#endif + +/* + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define FLAT_RING1_SS 0xe021 /* GDT index 260 */ +#define FLAT_RING3_CS 0xe02b /* GDT index 261 */ +#define FLAT_RING3_DS 0xe033 /* GDT index 262 */ +#define FLAT_RING3_SS 0xe033 /* GDT index 262 */ + +#define FLAT_KERNEL_CS FLAT_RING1_CS +#define FLAT_KERNEL_DS FLAT_RING1_DS +#define FLAT_KERNEL_SS FLAT_RING1_SS +#define FLAT_USER_CS FLAT_RING3_CS +#define FLAT_USER_DS FLAT_RING3_DS +#define FLAT_USER_SS FLAT_RING3_SS + +#define __HYPERVISOR_VIRT_START_PAE 0xF5800000 +#define __MACH2PHYS_VIRT_START_PAE 0xF5800000 +#define __MACH2PHYS_VIRT_END_PAE 0xF6800000 +#define HYPERVISOR_VIRT_START_PAE \ + mk_unsigned_long(__HYPERVISOR_VIRT_START_PAE) +#define MACH2PHYS_VIRT_START_PAE \ + mk_unsigned_long(__MACH2PHYS_VIRT_START_PAE) +#define MACH2PHYS_VIRT_END_PAE \ + mk_unsigned_long(__MACH2PHYS_VIRT_END_PAE) + +/* Non-PAE bounds are obsolete. */ +#define __HYPERVISOR_VIRT_START_NONPAE 0xFC000000 +#define __MACH2PHYS_VIRT_START_NONPAE 0xFC000000 +#define __MACH2PHYS_VIRT_END_NONPAE 0xFC400000 +#define HYPERVISOR_VIRT_START_NONPAE \ + mk_unsigned_long(__HYPERVISOR_VIRT_START_NONPAE) +#define MACH2PHYS_VIRT_START_NONPAE \ + mk_unsigned_long(__MACH2PHYS_VIRT_START_NONPAE) +#define MACH2PHYS_VIRT_END_NONPAE \ + mk_unsigned_long(__MACH2PHYS_VIRT_END_NONPAE) + +#define __HYPERVISOR_VIRT_START __HYPERVISOR_VIRT_START_PAE +#define __MACH2PHYS_VIRT_START __MACH2PHYS_VIRT_START_PAE +#define __MACH2PHYS_VIRT_END __MACH2PHYS_VIRT_END_PAE + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#endif + +#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2) +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START) +#endif + +/* 32-/64-bit invariability for control interfaces (domctl/sysctl). */ +#if defined(__XEN__) || defined(__XEN_TOOLS__) +#undef ___DEFINE_XEN_GUEST_HANDLE +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ + typedef struct { type *p; } \ + __guest_handle_ ## name; \ + typedef struct { union { type *p; uint64_aligned_t q; }; } \ + __guest_handle_64_ ## name +#undef set_xen_guest_handle +#define set_xen_guest_handle(hnd, val) \ + do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0; \ + (hnd).p = val; \ + } while ( 0 ) +#define uint64_aligned_t uint64_t __attribute__((aligned(8))) +#define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name +#define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name) +#endif + +#ifndef __ASSEMBLY__ + +struct cpu_user_regs { + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + uint32_t esi; + uint32_t edi; + uint32_t ebp; + uint32_t eax; + uint16_t error_code; /* private */ + uint16_t entry_vector; /* private */ + uint32_t eip; + uint16_t cs; + uint8_t saved_upcall_mask; + uint8_t _pad0; + uint32_t eflags; /* eflags.IF == !saved_upcall_mask */ + uint32_t esp; + uint16_t ss, _pad1; + uint16_t es, _pad2; + uint16_t ds, _pad3; + uint16_t fs, _pad4; + uint16_t gs, _pad5; +}; +typedef struct cpu_user_regs cpu_user_regs_t; +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */ +}; +typedef struct arch_vcpu_info arch_vcpu_info_t; + +struct xen_callback { + unsigned long cs; + unsigned long eip; +}; +typedef struct xen_callback xen_callback_t; + +#endif /* !__ASSEMBLY__ */ + +#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86/cpuid.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86/cpuid.h @@ -0,0 +1,68 @@ +/****************************************************************************** + * arch-x86/cpuid.h + * + * CPUID interface to Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007 Citrix Systems, Inc. + * + * Authors: + * Keir Fraser + */ + +#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__ +#define __XEN_PUBLIC_ARCH_X86_CPUID_H__ + +/* Xen identification leaves start at 0x40000000. */ +#define XEN_CPUID_FIRST_LEAF 0x40000000 +#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i)) + +/* + * Leaf 1 (0x40000000) + * EAX: Largest Xen-information leaf. All leaves up to an including @EAX + * are supported by the Xen host. + * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification + * of a Xen host. + */ +#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */ +#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */ +#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */ + +/* + * Leaf 2 (0x40000001) + * EAX[31:16]: Xen major version. + * EAX[15: 0]: Xen minor version. + * EBX-EDX: Reserved (currently all zeroes). + */ + +/* + * Leaf 3 (0x40000002) + * EAX: Number of hypercall transfer pages. This register is always guaranteed + * to specify one hypercall page. + * EBX: Base address of Xen-specific MSRs. + * ECX: Features 1. Unused bits are set to zero. + * EDX: Features 2. Unused bits are set to zero. + */ + +/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */ +#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0 +#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) + +#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/arch-x86/hvm/save.h +++ linux-ec2-2.6.32/include/xen/interface/arch-x86/hvm/save.h @@ -0,0 +1,438 @@ +/* + * Structure definitions for HVM state that is held by Xen and must + * be saved along with the domain's memory and device-model state. + * + * Copyright (c) 2007 XenSource Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_SAVE_X86_H__ +#define __XEN_PUBLIC_HVM_SAVE_X86_H__ + +/* + * Save/restore header: general info about the save file. + */ + +#define HVM_FILE_MAGIC 0x54381286 +#define HVM_FILE_VERSION 0x00000001 + +struct hvm_save_header { + uint32_t magic; /* Must be HVM_FILE_MAGIC */ + uint32_t version; /* File format version */ + uint64_t changeset; /* Version of Xen that saved this file */ + uint32_t cpuid; /* CPUID[0x01][%eax] on the saving machine */ + uint32_t pad0; +}; + +DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header); + + +/* + * Processor + */ + +struct hvm_hw_cpu { + uint8_t fpu_regs[512]; + + uint64_t rax; + uint64_t rbx; + uint64_t rcx; + uint64_t rdx; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t rsp; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + + uint64_t rip; + uint64_t rflags; + + uint64_t cr0; + uint64_t cr2; + uint64_t cr3; + uint64_t cr4; + + uint64_t dr0; + uint64_t dr1; + uint64_t dr2; + uint64_t dr3; + uint64_t dr6; + uint64_t dr7; + + uint32_t cs_sel; + uint32_t ds_sel; + uint32_t es_sel; + uint32_t fs_sel; + uint32_t gs_sel; + uint32_t ss_sel; + uint32_t tr_sel; + uint32_t ldtr_sel; + + uint32_t cs_limit; + uint32_t ds_limit; + uint32_t es_limit; + uint32_t fs_limit; + uint32_t gs_limit; + uint32_t ss_limit; + uint32_t tr_limit; + uint32_t ldtr_limit; + uint32_t idtr_limit; + uint32_t gdtr_limit; + + uint64_t cs_base; + uint64_t ds_base; + uint64_t es_base; + uint64_t fs_base; + uint64_t gs_base; + uint64_t ss_base; + uint64_t tr_base; + uint64_t ldtr_base; + uint64_t idtr_base; + uint64_t gdtr_base; + + uint32_t cs_arbytes; + uint32_t ds_arbytes; + uint32_t es_arbytes; + uint32_t fs_arbytes; + uint32_t gs_arbytes; + uint32_t ss_arbytes; + uint32_t tr_arbytes; + uint32_t ldtr_arbytes; + + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + + /* msr for em64t */ + uint64_t shadow_gs; + + /* msr content saved/restored. */ + uint64_t msr_flags; + uint64_t msr_lstar; + uint64_t msr_star; + uint64_t msr_cstar; + uint64_t msr_syscall_mask; + uint64_t msr_efer; + + /* guest's idea of what rdtsc() would return */ + uint64_t tsc; + + /* pending event, if any */ + union { + uint32_t pending_event; + struct { + uint8_t pending_vector:8; + uint8_t pending_type:3; + uint8_t pending_error_valid:1; + uint32_t pending_reserved:19; + uint8_t pending_valid:1; + }; + }; + /* error code for pending event */ + uint32_t error_code; +}; + +DECLARE_HVM_SAVE_TYPE(CPU, 2, struct hvm_hw_cpu); + + +/* + * PIC + */ + +struct hvm_hw_vpic { + /* IR line bitmasks. */ + uint8_t irr; + uint8_t imr; + uint8_t isr; + + /* Line IRx maps to IRQ irq_base+x */ + uint8_t irq_base; + + /* + * Where are we in ICW2-4 initialisation (0 means no init in progress)? + * Bits 0-1 (=x): Next write at A=1 sets ICW(x+1). + * Bit 2: ICW1.IC4 (1 == ICW4 included in init sequence) + * Bit 3: ICW1.SNGL (0 == ICW3 included in init sequence) + */ + uint8_t init_state:4; + + /* IR line with highest priority. */ + uint8_t priority_add:4; + + /* Reads from A=0 obtain ISR or IRR? */ + uint8_t readsel_isr:1; + + /* Reads perform a polling read? */ + uint8_t poll:1; + + /* Automatically clear IRQs from the ISR during INTA? */ + uint8_t auto_eoi:1; + + /* Automatically rotate IRQ priorities during AEOI? */ + uint8_t rotate_on_auto_eoi:1; + + /* Exclude slave inputs when considering in-service IRQs? */ + uint8_t special_fully_nested_mode:1; + + /* Special mask mode excludes masked IRs from AEOI and priority checks. */ + uint8_t special_mask_mode:1; + + /* Is this a master PIC or slave PIC? (NB. This is not programmable.) */ + uint8_t is_master:1; + + /* Edge/trigger selection. */ + uint8_t elcr; + + /* Virtual INT output. */ + uint8_t int_output; +}; + +DECLARE_HVM_SAVE_TYPE(PIC, 3, struct hvm_hw_vpic); + + +/* + * IO-APIC + */ + +#ifdef __ia64__ +#define VIOAPIC_IS_IOSAPIC 1 +#define VIOAPIC_NUM_PINS 24 +#else +#define VIOAPIC_NUM_PINS 48 /* 16 ISA IRQs, 32 non-legacy PCI IRQS. */ +#endif + +struct hvm_hw_vioapic { + uint64_t base_address; + uint32_t ioregsel; + uint32_t id; + union vioapic_redir_entry + { + uint64_t bits; + struct { + uint8_t vector; + uint8_t delivery_mode:3; + uint8_t dest_mode:1; + uint8_t delivery_status:1; + uint8_t polarity:1; + uint8_t remote_irr:1; + uint8_t trig_mode:1; + uint8_t mask:1; + uint8_t reserve:7; +#if !VIOAPIC_IS_IOSAPIC + uint8_t reserved[4]; + uint8_t dest_id; +#else + uint8_t reserved[3]; + uint16_t dest_id; +#endif + } fields; + } redirtbl[VIOAPIC_NUM_PINS]; +}; + +DECLARE_HVM_SAVE_TYPE(IOAPIC, 4, struct hvm_hw_vioapic); + + +/* + * LAPIC + */ + +struct hvm_hw_lapic { + uint64_t apic_base_msr; + uint32_t disabled; /* VLAPIC_xx_DISABLED */ + uint32_t timer_divisor; +}; + +DECLARE_HVM_SAVE_TYPE(LAPIC, 5, struct hvm_hw_lapic); + +struct hvm_hw_lapic_regs { + uint8_t data[1024]; +}; + +DECLARE_HVM_SAVE_TYPE(LAPIC_REGS, 6, struct hvm_hw_lapic_regs); + + +/* + * IRQs + */ + +struct hvm_hw_pci_irqs { + /* + * Virtual interrupt wires for a single PCI bus. + * Indexed by: device*4 + INTx#. + */ + union { + unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */ + uint64_t pad[2]; + }; +}; + +DECLARE_HVM_SAVE_TYPE(PCI_IRQ, 7, struct hvm_hw_pci_irqs); + +struct hvm_hw_isa_irqs { + /* + * Virtual interrupt wires for ISA devices. + * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing). + */ + union { + unsigned long i[1]; /* DECLARE_BITMAP(i, 16); */ + uint64_t pad[1]; + }; +}; + +DECLARE_HVM_SAVE_TYPE(ISA_IRQ, 8, struct hvm_hw_isa_irqs); + +struct hvm_hw_pci_link { + /* + * PCI-ISA interrupt router. + * Each PCI is 'wire-ORed' into one of four links using + * the traditional 'barber's pole' mapping ((device + INTx#) & 3). + * The router provides a programmable mapping from each link to a GSI. + */ + uint8_t route[4]; + uint8_t pad0[4]; +}; + +DECLARE_HVM_SAVE_TYPE(PCI_LINK, 9, struct hvm_hw_pci_link); + +/* + * PIT + */ + +struct hvm_hw_pit { + struct hvm_hw_pit_channel { + uint32_t count; /* can be 65536 */ + uint16_t latched_count; + uint8_t count_latched; + uint8_t status_latched; + uint8_t status; + uint8_t read_state; + uint8_t write_state; + uint8_t write_latch; + uint8_t rw_mode; + uint8_t mode; + uint8_t bcd; /* not supported */ + uint8_t gate; /* timer start */ + } channels[3]; /* 3 x 16 bytes */ + uint32_t speaker_data_on; + uint32_t pad0; +}; + +DECLARE_HVM_SAVE_TYPE(PIT, 10, struct hvm_hw_pit); + + +/* + * RTC + */ + +#define RTC_CMOS_SIZE 14 +struct hvm_hw_rtc { + /* CMOS bytes */ + uint8_t cmos_data[RTC_CMOS_SIZE]; + /* Index register for 2-part operations */ + uint8_t cmos_index; + uint8_t pad0; +}; + +DECLARE_HVM_SAVE_TYPE(RTC, 11, struct hvm_hw_rtc); + + +/* + * HPET + */ + +#define HPET_TIMER_NUM 3 /* 3 timers supported now */ +struct hvm_hw_hpet { + /* Memory-mapped, software visible registers */ + uint64_t capability; /* capabilities */ + uint64_t res0; /* reserved */ + uint64_t config; /* configuration */ + uint64_t res1; /* reserved */ + uint64_t isr; /* interrupt status reg */ + uint64_t res2[25]; /* reserved */ + uint64_t mc64; /* main counter */ + uint64_t res3; /* reserved */ + struct { /* timers */ + uint64_t config; /* configuration/cap */ + uint64_t cmp; /* comparator */ + uint64_t fsb; /* FSB route, not supported now */ + uint64_t res4; /* reserved */ + } timers[HPET_TIMER_NUM]; + uint64_t res5[4*(24-HPET_TIMER_NUM)]; /* reserved, up to 0x3ff */ + + /* Hidden register state */ + uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */ +}; + +DECLARE_HVM_SAVE_TYPE(HPET, 12, struct hvm_hw_hpet); + + +/* + * PM timer + */ + +struct hvm_hw_pmtimer { + uint32_t tmr_val; /* PM_TMR_BLK.TMR_VAL: 32bit free-running counter */ + uint16_t pm1a_sts; /* PM1a_EVT_BLK.PM1a_STS: status register */ + uint16_t pm1a_en; /* PM1a_EVT_BLK.PM1a_EN: enable register */ +}; + +DECLARE_HVM_SAVE_TYPE(PMTIMER, 13, struct hvm_hw_pmtimer); + +/* + * MTRR MSRs + */ + +struct hvm_hw_mtrr { +#define MTRR_VCNT 8 +#define NUM_FIXED_MSR 11 + uint64_t msr_pat_cr; + /* mtrr physbase & physmask msr pair*/ + uint64_t msr_mtrr_var[MTRR_VCNT*2]; + uint64_t msr_mtrr_fixed[NUM_FIXED_MSR]; + uint64_t msr_mtrr_cap; + uint64_t msr_mtrr_def_type; +}; + +DECLARE_HVM_SAVE_TYPE(MTRR, 14, struct hvm_hw_mtrr); + +/* + * Viridian hypervisor context. + */ + +struct hvm_viridian_context { + uint64_t hypercall_gpa; + uint64_t guest_os_id; +}; + +DECLARE_HVM_SAVE_TYPE(VIRIDIAN, 15, struct hvm_viridian_context); + +/* + * Largest type-code in use + */ +#define HVM_SAVE_CODE_MAX 15 + +#endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/hvm/save.h +++ linux-ec2-2.6.32/include/xen/interface/hvm/save.h @@ -0,0 +1,88 @@ +/* + * hvm/save.h + * + * Structure definitions for HVM state that is held by Xen and must + * be saved along with the domain's memory and device-model state. + * + * Copyright (c) 2007 XenSource Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_SAVE_H__ +#define __XEN_PUBLIC_HVM_SAVE_H__ + +/* + * Structures in this header *must* have the same layout in 32bit + * and 64bit environments: this means that all fields must be explicitly + * sized types and aligned to their sizes, and the structs must be + * a multiple of eight bytes long. + * + * Only the state necessary for saving and restoring (i.e. fields + * that are analogous to actual hardware state) should go in this file. + * Internal mechanisms should be kept in Xen-private headers. + */ + +#if !defined(__GNUC__) || defined(__STRICT_ANSI__) +#error "Anonymous structs/unions are a GNU extension." +#endif + +/* + * Each entry is preceded by a descriptor giving its type and length + */ +struct hvm_save_descriptor { + uint16_t typecode; /* Used to demux the various types below */ + uint16_t instance; /* Further demux within a type */ + uint32_t length; /* In bytes, *not* including this descriptor */ +}; + + +/* + * Each entry has a datatype associated with it: for example, the CPU state + * is saved as a HVM_SAVE_TYPE(CPU), which has HVM_SAVE_LENGTH(CPU), + * and is identified by a descriptor with typecode HVM_SAVE_CODE(CPU). + * DECLARE_HVM_SAVE_TYPE binds these things together with some type-system + * ugliness. + */ + +#define DECLARE_HVM_SAVE_TYPE(_x, _code, _type) \ + struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; } + +#define HVM_SAVE_TYPE(_x) typeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->t) +#define HVM_SAVE_LENGTH(_x) (sizeof (HVM_SAVE_TYPE(_x))) +#define HVM_SAVE_CODE(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->c)) + + +/* + * The series of save records is teminated by a zero-type, zero-length + * descriptor. + */ + +struct hvm_save_end {}; +DECLARE_HVM_SAVE_TYPE(END, 0, struct hvm_save_end); + +#if defined(__i386__) || defined(__x86_64__) +#include "../arch-x86/hvm/save.h" +#elif defined(__ia64__) +#include "../arch-ia64/hvm/save.h" +#else +#error "unsupported architecture" +#endif + +#endif /* __XEN_PUBLIC_HVM_SAVE_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/hvm/hvm_info_table.h +++ linux-ec2-2.6.32/include/xen/interface/hvm/hvm_info_table.h @@ -0,0 +1,69 @@ +/****************************************************************************** + * hvm/hvm_info_table.h + * + * HVM parameter and information table, written into guest memory map. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ + +#define HVM_INFO_PFN 0x09F +#define HVM_INFO_OFFSET 0x800 +#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET) + +struct hvm_info_table { + char signature[8]; /* "HVM INFO" */ + uint32_t length; + uint8_t checksum; + + /* Should firmware build ACPI tables? */ + uint8_t acpi_enabled; + + /* Should firmware build APIC descriptors (APIC MADT / MP BIOS)? */ + uint8_t apic_mode; + + /* How many CPUs does this domain have? */ + uint32_t nr_vcpus; + + /* + * MEMORY MAP provided by HVM domain builder. + * Notes: + * 1. page_to_phys(x) = x << 12 + * 2. If a field is zero, the corresponding range does not exist. + */ + /* + * 0x0 to page_to_phys(low_mem_pgend)-1: + * RAM below 4GB (except for VGA hole 0xA0000-0xBFFFF) + */ + uint32_t low_mem_pgend; + /* + * page_to_phys(reserved_mem_pgstart) to 0xFFFFFFFF: + * Reserved for special memory mappings + */ + uint32_t reserved_mem_pgstart; + /* + * 0x100000000 to page_to_phys(high_mem_pgend)-1: + * RAM above 4GB + */ + uint32_t high_mem_pgend; +}; + +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/hvm/ioreq.h +++ linux-ec2-2.6.32/include/xen/interface/hvm/ioreq.h @@ -0,0 +1,127 @@ +/* + * ioreq.h: I/O request definitions for device models + * Copyright (c) 2004, Intel Corporation. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _IOREQ_H_ +#define _IOREQ_H_ + +#define IOREQ_READ 1 +#define IOREQ_WRITE 0 + +#define STATE_IOREQ_NONE 0 +#define STATE_IOREQ_READY 1 +#define STATE_IOREQ_INPROCESS 2 +#define STATE_IORESP_READY 3 + +#define IOREQ_TYPE_PIO 0 /* pio */ +#define IOREQ_TYPE_COPY 1 /* mmio ops */ +#define IOREQ_TYPE_TIMEOFFSET 7 +#define IOREQ_TYPE_INVALIDATE 8 /* mapcache */ + +/* + * VMExit dispatcher should cooperate with instruction decoder to + * prepare this structure and notify service OS and DM by sending + * virq + */ +struct ioreq { + uint64_t addr; /* physical address */ + uint64_t size; /* size in bytes */ + uint64_t count; /* for rep prefixes */ + uint64_t data; /* data (or paddr of data) */ + uint8_t state:4; + uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr + * of the real data to use. */ + uint8_t dir:1; /* 1=read, 0=write */ + uint8_t df:1; + uint8_t pad:1; + uint8_t type; /* I/O type */ + uint8_t _pad0[6]; + uint64_t io_count; /* How many IO done on a vcpu */ +}; +typedef struct ioreq ioreq_t; + +struct vcpu_iodata { + struct ioreq vp_ioreq; + /* Event channel port, used for notifications to/from the device model. */ + uint32_t vp_eport; + uint32_t _pad0; +}; +typedef struct vcpu_iodata vcpu_iodata_t; + +struct shared_iopage { + struct vcpu_iodata vcpu_iodata[1]; +}; +typedef struct shared_iopage shared_iopage_t; + +struct buf_ioreq { + uint8_t type; /* I/O type */ + uint8_t pad:1; + uint8_t dir:1; /* 1=read, 0=write */ + uint8_t size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two buf_ioreqs */ + uint32_t addr:20;/* physical address */ + uint32_t data; /* data */ +}; +typedef struct buf_ioreq buf_ioreq_t; + +#define IOREQ_BUFFER_SLOT_NUM 511 /* 8 bytes each, plus 2 4-byte indexes */ +struct buffered_iopage { + unsigned int read_pointer; + unsigned int write_pointer; + buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM]; +}; /* NB. Size of this structure must be no greater than one page. */ +typedef struct buffered_iopage buffered_iopage_t; + +#if defined(__ia64__) +struct pio_buffer { + uint32_t page_offset; + uint32_t pointer; + uint32_t data_end; + uint32_t buf_size; + void *opaque; +}; + +#define PIO_BUFFER_IDE_PRIMARY 0 /* I/O port = 0x1F0 */ +#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */ +#define PIO_BUFFER_ENTRY_NUM 2 +struct buffered_piopage { + struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM]; + uint8_t buffer[1]; +}; +#endif /* defined(__ia64__) */ + +#define ACPI_PM1A_EVT_BLK_ADDRESS 0x0000000000001f40 +#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04) +#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08) +#define ACPI_GPE0_BLK_ADDRESS (ACPI_PM_TMR_BLK_ADDRESS + 0x20) +#define ACPI_GPE0_BLK_LEN 0x08 + +#endif /* _IOREQ_H_ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ --- linux-ec2-2.6.32.orig/include/xen/interface/hvm/e820.h +++ linux-ec2-2.6.32/include/xen/interface/hvm/e820.h @@ -0,0 +1,34 @@ + +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_E820_H__ +#define __XEN_PUBLIC_HVM_E820_H__ + +/* E820 location in HVM virtual address space. */ +#define HVM_E820_PAGE 0x00090000 +#define HVM_E820_NR_OFFSET 0x000001E8 +#define HVM_E820_OFFSET 0x000002D0 + +#define HVM_BELOW_4G_RAM_END 0xF0000000 +#define HVM_BELOW_4G_MMIO_START HVM_BELOW_4G_RAM_END +#define HVM_BELOW_4G_MMIO_LENGTH ((1ULL << 32) - HVM_BELOW_4G_MMIO_START) + +#endif /* __XEN_PUBLIC_HVM_E820_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/hvm/hvm_op.h +++ linux-ec2-2.6.32/include/xen/interface/hvm/hvm_op.h @@ -0,0 +1,133 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__ +#define __XEN_PUBLIC_HVM_HVM_OP_H__ + +#include "../xen.h" + +/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */ +#define HVMOP_set_param 0 +#define HVMOP_get_param 1 +struct xen_hvm_param { + domid_t domid; /* IN */ + uint32_t index; /* IN */ + uint64_t value; /* IN/OUT */ +}; +typedef struct xen_hvm_param xen_hvm_param_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t); + +/* Set the logical level of one of a domain's PCI INTx wires. */ +#define HVMOP_set_pci_intx_level 2 +struct xen_hvm_set_pci_intx_level { + /* Domain to be updated. */ + domid_t domid; + /* PCI INTx identification in PCI topology (domain:bus:device:intx). */ + uint8_t domain, bus, device, intx; + /* Assertion level (0 = unasserted, 1 = asserted). */ + uint8_t level; +}; +typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t); + +/* Set the logical level of one of a domain's ISA IRQ wires. */ +#define HVMOP_set_isa_irq_level 3 +struct xen_hvm_set_isa_irq_level { + /* Domain to be updated. */ + domid_t domid; + /* ISA device identification, by ISA IRQ (0-15). */ + uint8_t isa_irq; + /* Assertion level (0 = unasserted, 1 = asserted). */ + uint8_t level; +}; +typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t); + +#define HVMOP_set_pci_link_route 4 +struct xen_hvm_set_pci_link_route { + /* Domain to be updated. */ + domid_t domid; + /* PCI link identifier (0-3). */ + uint8_t link; + /* ISA IRQ (1-15), or 0 (disable link). */ + uint8_t isa_irq; +}; +typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t); + +/* Flushes all VCPU TLBs: @arg must be NULL. */ +#define HVMOP_flush_tlbs 5 + +/* Following tools-only interfaces may change in future. */ +#if defined(__XEN__) || defined(__XEN_TOOLS__) + +/* Track dirty VRAM. */ +#define HVMOP_track_dirty_vram 6 +struct xen_hvm_track_dirty_vram { + /* Domain to be tracked. */ + domid_t domid; + /* First pfn to track. */ + uint64_aligned_t first_pfn; + /* Number of pages to track. */ + uint64_aligned_t nr; + /* OUT variable. */ + /* Dirty bitmap buffer. */ + XEN_GUEST_HANDLE_64(uint8) dirty_bitmap; +}; +typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t); + +/* Notify that some pages got modified by the Device Model. */ +#define HVMOP_modified_memory 7 +struct xen_hvm_modified_memory { + /* Domain to be updated. */ + domid_t domid; + /* First pfn. */ + uint64_aligned_t first_pfn; + /* Number of pages. */ + uint64_aligned_t nr; +}; +typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t); + +#define HVMOP_set_mem_type 8 +typedef enum { + HVMMEM_ram_rw, /* Normal read/write guest RAM */ + HVMMEM_ram_ro, /* Read-only; writes are discarded */ + HVMMEM_mmio_dm, /* Reads and write go to the device model */ +} hvmmem_type_t; +/* Notify that a region of memory is to be treated in a specific way. */ +struct xen_hvm_set_mem_type { + /* Domain to be updated. */ + domid_t domid; + /* Memory type */ + hvmmem_type_t hvmmem_type; + /* First pfn. */ + uint64_aligned_t first_pfn; + /* Number of pages. */ + uint64_aligned_t nr; +}; +typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_type_t); + + +#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ + +#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ --- linux-ec2-2.6.32.orig/include/xen/interface/hvm/params.h +++ linux-ec2-2.6.32/include/xen/interface/hvm/params.h @@ -0,0 +1,111 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_PUBLIC_HVM_PARAMS_H__ +#define __XEN_PUBLIC_HVM_PARAMS_H__ + +#include "hvm_op.h" + +/* + * Parameter space for HVMOP_{set,get}_param. + */ + +/* + * How should CPU0 event-channel notifications be delivered? + * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt). + * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows: + * Domain = val[47:32], Bus = val[31:16], + * DevFn = val[15: 8], IntX = val[ 1: 0] + * If val == 0 then CPU0 event-channel notifications are not delivered. + */ +#define HVM_PARAM_CALLBACK_IRQ 0 + +/* + * These are not used by Xen. They are here for convenience of HVM-guest + * xenbus implementations. + */ +#define HVM_PARAM_STORE_PFN 1 +#define HVM_PARAM_STORE_EVTCHN 2 + +#define HVM_PARAM_PAE_ENABLED 4 + +#define HVM_PARAM_IOREQ_PFN 5 + +#define HVM_PARAM_BUFIOREQ_PFN 6 + +#ifdef __ia64__ + +#define HVM_PARAM_NVRAM_FD 7 +#define HVM_PARAM_VHPT_SIZE 8 +#define HVM_PARAM_BUFPIOREQ_PFN 9 + +#elif defined(__i386__) || defined(__x86_64__) + +/* Expose Viridian interfaces to this HVM guest? */ +#define HVM_PARAM_VIRIDIAN 9 + +#endif + +/* + * Set mode for virtual timers (currently x86 only): + * delay_for_missed_ticks (default): + * Do not advance a vcpu's time beyond the correct delivery time for + * interrupts that have been missed due to preemption. Deliver missed + * interrupts when the vcpu is rescheduled and advance the vcpu's virtual + * time stepwise for each one. + * no_delay_for_missed_ticks: + * As above, missed interrupts are delivered, but guest time always tracks + * wallclock (i.e., real) time while doing so. + * no_missed_ticks_pending: + * No missed interrupts are held pending. Instead, to ensure ticks are + * delivered at some non-zero rate, if we detect missed ticks then the + * internal tick alarm is not disabled if the VCPU is preempted during the + * next tick period. + * one_missed_tick_pending: + * Missed interrupts are collapsed together and delivered as one 'late tick'. + * Guest time always tracks wallclock (i.e., real) time. + */ +#define HVM_PARAM_TIMER_MODE 10 +#define HVMPTM_delay_for_missed_ticks 0 +#define HVMPTM_no_delay_for_missed_ticks 1 +#define HVMPTM_no_missed_ticks_pending 2 +#define HVMPTM_one_missed_tick_pending 3 + +/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */ +#define HVM_PARAM_HPET_ENABLED 11 + +/* Identity-map page directory used by Intel EPT when CR0.PG=0. */ +#define HVM_PARAM_IDENT_PT 12 + +/* Device Model domain, defaults to 0. */ +#define HVM_PARAM_DM_DOMAIN 13 + +/* ACPI S state: currently support S0 and S3 on x86. */ +#define HVM_PARAM_ACPI_S_STATE 14 + +/* TSS used on Intel when CR0.PE=0. */ +#define HVM_PARAM_VM86_TSS 15 + +/* Boolean: Enable aligning all periodic vpts to reduce interrupts */ +#define HVM_PARAM_VPT_ALIGN 16 + +#define HVM_NR_PARAMS 17 + +#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */ --- linux-ec2-2.6.32.orig/include/xen/public/evtchn.h +++ linux-ec2-2.6.32/include/xen/public/evtchn.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * evtchn.h + * + * Interface to /dev/xen/evtchn. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_EVTCHN_H__ +#define __LINUX_PUBLIC_EVTCHN_H__ + +/* + * Bind a fresh port to VIRQ @virq. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_VIRQ \ + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq)) +struct ioctl_evtchn_bind_virq { + unsigned int virq; +}; + +/* + * Bind a fresh port to remote <@remote_domain, @remote_port>. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain)) +struct ioctl_evtchn_bind_interdomain { + unsigned int remote_domain, remote_port; +}; + +/* + * Allocate a fresh port for binding to @remote_domain. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port)) +struct ioctl_evtchn_bind_unbound_port { + unsigned int remote_domain; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_UNBIND \ + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind)) +struct ioctl_evtchn_unbind { + unsigned int port; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_NOTIFY \ + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify)) +struct ioctl_evtchn_notify { + unsigned int port; +}; + +/* Clear and reinitialise the event buffer. Clear error condition. */ +#define IOCTL_EVTCHN_RESET \ + _IOC(_IOC_NONE, 'E', 5, 0) + +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */ --- linux-ec2-2.6.32.orig/include/xen/public/xenbus.h +++ linux-ec2-2.6.32/include/xen/public/xenbus.h @@ -0,0 +1,56 @@ +/****************************************************************************** + * xenbus.h + * + * Interface to /proc/xen/xenbus. + * + * Copyright (c) 2008, Diego Ongaro + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_XENBUS_H__ +#define __LINUX_PUBLIC_XENBUS_H__ + +#include + +#ifndef __user +#define __user +#endif + +typedef struct xenbus_alloc { + domid_t dom; + __u32 port; + __u32 grant_ref; +} xenbus_alloc_t; + +/* + * @cmd: IOCTL_XENBUS_ALLOC + * @arg: &xenbus_alloc_t + * Return: 0, or -1 for error + */ +#define IOCTL_XENBUS_ALLOC \ + _IOC(_IOC_NONE, 'X', 0, sizeof(xenbus_alloc_t)) + +#endif /* __LINUX_PUBLIC_XENBUS_H__ */ --- linux-ec2-2.6.32.orig/include/xen/public/gntdev.h +++ linux-ec2-2.6.32/include/xen/public/gntdev.h @@ -0,0 +1,119 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ --- linux-ec2-2.6.32.orig/include/xen/public/privcmd.h +++ linux-ec2-2.6.32/include/xen/public/privcmd.h @@ -0,0 +1,79 @@ +/****************************************************************************** + * privcmd.h + * + * Interface to /proc/xen/privcmd. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_PRIVCMD_H__ +#define __LINUX_PUBLIC_PRIVCMD_H__ + +#include + +#ifndef __user +#define __user +#endif + +typedef struct privcmd_hypercall +{ + __u64 op; + __u64 arg[5]; +} privcmd_hypercall_t; + +typedef struct privcmd_mmap_entry { + __u64 va; + __u64 mfn; + __u64 npages; +} privcmd_mmap_entry_t; + +typedef struct privcmd_mmap { + int num; + domid_t dom; /* target domain */ + privcmd_mmap_entry_t __user *entry; +} privcmd_mmap_t; + +typedef struct privcmd_mmapbatch { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ +} privcmd_mmapbatch_t; + +/* + * @cmd: IOCTL_PRIVCMD_HYPERCALL + * @arg: &privcmd_hypercall_t + * Return: Value returned from execution of the specified hypercall. + */ +#define IOCTL_PRIVCMD_HYPERCALL \ + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t)) +#define IOCTL_PRIVCMD_MMAP \ + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) + +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ --- linux-ec2-2.6.32.orig/include/keys/rxrpc-type.h +++ linux-ec2-2.6.32/include/keys/rxrpc-type.h @@ -99,7 +99,6 @@ * structure of raw payloads passed to add_key() or instantiate key */ struct rxrpc_key_data_v1 { - u32 kif_version; /* 1 */ u16 security_index; u16 ticket_length; u32 expiry; /* time_t */ --- linux-ec2-2.6.32.orig/include/linux/acpi.h +++ linux-ec2-2.6.32/include/linux/acpi.h @@ -247,12 +247,21 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n, const char *name); +int acpi_pci_get_root_seg_bbn(char *hid, char *uid, int *seg, int *bbn); + #ifdef CONFIG_PM_SLEEP void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ +struct acpi_osc_context { + char *uuid_str; /* uuid string */ + int rev; + struct acpi_buffer cap; /* arg2/arg3 */ + struct acpi_buffer ret; /* free by caller if success */ +}; + #define OSC_QUERY_TYPE 0 #define OSC_SUPPORT_TYPE 1 #define OSC_CONTROL_TYPE 2 @@ -265,6 +274,15 @@ #define OSC_INVALID_REVISION_ERROR 8 #define OSC_CAPABILITIES_MASK_ERROR 16 +acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); + +/* platform-wide _OSC bits */ +#define OSC_SB_PAD_SUPPORT 1 +#define OSC_SB_PPC_OST_SUPPORT 2 +#define OSC_SB_PR3_SUPPORT 4 +#define OSC_SB_CPUHP_OST_SUPPORT 8 +#define OSC_SB_APEI_SUPPORT 16 + /* _OSC DW1 Definition (OS Support Fields) */ #define OSC_EXT_PCI_CONFIG_SUPPORT 1 #define OSC_ACTIVE_STATE_PWR_SUPPORT 2 --- linux-ec2-2.6.32.orig/include/linux/kvm_host.h +++ linux-ec2-2.6.32/include/linux/kvm_host.h @@ -53,7 +53,7 @@ */ struct kvm_io_bus { int dev_count; -#define NR_IOBUS_DEVS 6 +#define NR_IOBUS_DEVS 200 struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; @@ -116,6 +116,11 @@ int user_alloc; }; +static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) +{ + return ALIGN(memslot->npages, BITS_PER_LONG) / 8; +} + struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -128,7 +133,17 @@ } irqchip; struct msi_msg msi; }; - struct list_head link; + struct hlist_node link; +}; + +struct kvm_irq_routing_table { + struct kvm_kernel_irq_routing_entry *rt_entries; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; }; struct kvm { @@ -166,7 +181,7 @@ struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP - struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ + struct kvm_irq_routing_table *irq_routing; struct hlist_head mask_notifier_list; #endif @@ -266,6 +281,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); @@ -390,7 +406,12 @@ struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); -int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); +#ifdef __KVM_HAVE_IOAPIC +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + unsigned long *deliver_bitmask); +#endif +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); @@ -405,6 +426,7 @@ #ifdef CONFIG_IOMMU_API int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); +void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_iommu_map_guest(struct kvm *kvm); int kvm_iommu_unmap_guest(struct kvm *kvm); int kvm_assign_device(struct kvm *kvm, @@ -419,6 +441,11 @@ return 0; } +static inline void kvm_iommu_unmap_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ +} + static inline int kvm_iommu_map_guest(struct kvm *kvm) { return -ENODEV; @@ -551,5 +578,12 @@ { return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } + +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); + +#else + +static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } + #endif #endif --- linux-ec2-2.6.32.orig/include/linux/kvm.h +++ linux-ec2-2.6.32/include/linux/kvm.h @@ -116,6 +116,11 @@ __u64 cr8; __u64 apic_base; +#ifdef __KVM_S390 + /* the processor status word for s390 */ + __u64 psw_mask; /* psw upper half */ + __u64 psw_addr; /* psw lower half */ +#endif union { /* KVM_EXIT_UNKNOWN */ struct { @@ -167,8 +172,6 @@ /* KVM_EXIT_S390_SIEIC */ struct { __u8 icptcode; - __u64 mask; /* psw upper half */ - __u64 addr; /* psw lower half */ __u16 ipa; __u32 ipb; } s390_sieic; @@ -436,6 +439,7 @@ #endif #define KVM_CAP_IOEVENTFD 36 #define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 +#define KVM_CAP_ADJUST_CLOCK 39 #ifdef KVM_CAP_IRQ_ROUTING @@ -474,6 +478,7 @@ }; #endif +#define KVM_CAP_S390_PSW 42 #ifdef KVM_CAP_MCE /* x86 MCE */ @@ -497,6 +502,12 @@ __u8 pad[20]; }; +struct kvm_clock_data { + __u64 clock; + __u32 flags; + __u32 pad[9]; +}; + /* * ioctls for VM fds */ @@ -546,6 +557,8 @@ #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) #define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) #define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd) +#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) +#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) /* * ioctls for vcpu fds --- linux-ec2-2.6.32.orig/include/linux/signalfd.h +++ linux-ec2-2.6.32/include/linux/signalfd.h @@ -60,13 +60,16 @@ wake_up(&tsk->sighand->signalfd_wqh); } +extern void signalfd_cleanup(struct sighand_struct *sighand); + #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } +static inline void signalfd_cleanup(struct sighand_struct *sighand) { } + #endif /* CONFIG_SIGNALFD */ #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNALFD_H */ - --- linux-ec2-2.6.32.orig/include/linux/clocksource.h +++ linux-ec2-2.6.32/include/linux/clocksource.h @@ -151,6 +151,7 @@ * subtraction of non 64 bit counters * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) + * @max_idle_ns: max idle time permitted by the clocksource (nsecs) * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary @@ -168,6 +169,7 @@ cycle_t mask; u32 mult; u32 shift; + u64 max_idle_ns; unsigned long flags; cycle_t (*vread)(void); void (*resume)(void); @@ -188,6 +190,7 @@ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ struct list_head wd_list; + cycle_t cs_last; cycle_t wd_last; #endif }; @@ -280,10 +283,12 @@ extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL -extern void update_vsyscall(struct timespec *ts, struct clocksource *c); +extern void +update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult); extern void update_vsyscall_tz(void); #else -static inline void update_vsyscall(struct timespec *ts, struct clocksource *c) +static inline void +update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult) { } --- linux-ec2-2.6.32.orig/include/linux/netdevice.h +++ linux-ec2-2.6.32/include/linux/netdevice.h @@ -348,13 +348,14 @@ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ }; -enum { +enum gro_result { GRO_MERGED, GRO_MERGED_FREE, GRO_HELD, GRO_NORMAL, GRO_DROP, }; +typedef enum gro_result gro_result_t; extern void __napi_schedule(struct napi_struct *n); @@ -1164,9 +1165,12 @@ static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { + if (!pskb_may_pull(skb, hlen)) + return NULL; + NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; + return skb->data + offset; } static inline void *skb_gro_mac_header(struct sk_buff *skb) @@ -1467,18 +1471,19 @@ #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); extern void napi_gro_flush(struct napi_struct *napi); -extern int dev_gro_receive(struct napi_struct *napi, +extern gro_result_t dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb); -extern int napi_skb_finish(int ret, struct sk_buff *skb); -extern int napi_gro_receive(struct napi_struct *napi, +extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); +extern gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); extern void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb); extern struct sk_buff * napi_get_frags(struct napi_struct *napi); -extern int napi_frags_finish(struct napi_struct *napi, - struct sk_buff *skb, int ret); +extern gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret); extern struct sk_buff * napi_frags_skb(struct napi_struct *napi); -extern int napi_gro_frags(struct napi_struct *napi); +extern gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) { @@ -1560,6 +1565,8 @@ extern void netif_carrier_off(struct net_device *dev); +extern void netif_notify_peers(struct net_device *dev); + /** * netif_dormant_on - mark device as dormant. * @dev: network device @@ -2013,6 +2020,10 @@ return 0; return dev->ethtool_ops->get_flags(dev); } + +#define MODULE_ALIAS_NETDEV(device) \ + MODULE_ALIAS("netdev-" device) + #endif /* __KERNEL__ */ #endif /* _LINUX_NETDEVICE_H */ --- linux-ec2-2.6.32.orig/include/linux/notifier.h +++ linux-ec2-2.6.32/include/linux/notifier.h @@ -201,6 +201,7 @@ #define NETDEV_PRE_UP 0x000D #define NETDEV_BONDING_OLDTYPE 0x000E #define NETDEV_BONDING_NEWTYPE 0x000F +#define NETDEV_NOTIFY_PEERS 0x0013 #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN --- linux-ec2-2.6.32.orig/include/linux/backing-dev.h +++ linux-ec2-2.6.32/include/linux/backing-dev.h @@ -81,8 +81,6 @@ struct bdi_writeback wb; /* default writeback info for this bdi */ spinlock_t wb_lock; /* protects update side of wb_list */ struct list_head wb_list; /* the flusher threads hanging off this bdi */ - unsigned long wb_mask; /* bitmask of registered tasks */ - unsigned int wb_cnt; /* number of registered tasks */ struct list_head work_list; @@ -101,10 +99,11 @@ const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); -void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages); +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); +void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_task(struct bdi_writeback *wb); int bdi_has_dirty_io(struct backing_dev_info *bdi); +void bdi_arm_supers_timer(void); extern spinlock_t bdi_lock; extern struct list_head bdi_list; --- linux-ec2-2.6.32.orig/include/linux/bitops.h +++ linux-ec2-2.6.32/include/linux/bitops.h @@ -46,6 +46,26 @@ } /** + * rol64 - rotate a 64-bit value left + * @word: value to rotate + * @shift: bits to roll + */ +static inline __u64 rol64(__u64 word, unsigned int shift) +{ + return (word << shift) | (word >> (64 - shift)); +} + +/** + * ror64 - rotate a 64-bit value right + * @word: value to rotate + * @shift: bits to roll + */ +static inline __u64 ror64(__u64 word, unsigned int shift) +{ + return (word >> shift) | (word << (64 - shift)); +} + +/** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll --- linux-ec2-2.6.32.orig/include/linux/input.h +++ linux-ec2-2.6.32/include/linux/input.h @@ -595,6 +595,48 @@ #define KEY_NUMERIC_STAR 0x20a #define KEY_NUMERIC_POUND 0x20b +#define BTN_TRIGGER_HAPPY 0x2c0 +#define BTN_TRIGGER_HAPPY1 0x2c0 +#define BTN_TRIGGER_HAPPY2 0x2c1 +#define BTN_TRIGGER_HAPPY3 0x2c2 +#define BTN_TRIGGER_HAPPY4 0x2c3 +#define BTN_TRIGGER_HAPPY5 0x2c4 +#define BTN_TRIGGER_HAPPY6 0x2c5 +#define BTN_TRIGGER_HAPPY7 0x2c6 +#define BTN_TRIGGER_HAPPY8 0x2c7 +#define BTN_TRIGGER_HAPPY9 0x2c8 +#define BTN_TRIGGER_HAPPY10 0x2c9 +#define BTN_TRIGGER_HAPPY11 0x2ca +#define BTN_TRIGGER_HAPPY12 0x2cb +#define BTN_TRIGGER_HAPPY13 0x2cc +#define BTN_TRIGGER_HAPPY14 0x2cd +#define BTN_TRIGGER_HAPPY15 0x2ce +#define BTN_TRIGGER_HAPPY16 0x2cf +#define BTN_TRIGGER_HAPPY17 0x2d0 +#define BTN_TRIGGER_HAPPY18 0x2d1 +#define BTN_TRIGGER_HAPPY19 0x2d2 +#define BTN_TRIGGER_HAPPY20 0x2d3 +#define BTN_TRIGGER_HAPPY21 0x2d4 +#define BTN_TRIGGER_HAPPY22 0x2d5 +#define BTN_TRIGGER_HAPPY23 0x2d6 +#define BTN_TRIGGER_HAPPY24 0x2d7 +#define BTN_TRIGGER_HAPPY25 0x2d8 +#define BTN_TRIGGER_HAPPY26 0x2d9 +#define BTN_TRIGGER_HAPPY27 0x2da +#define BTN_TRIGGER_HAPPY28 0x2db +#define BTN_TRIGGER_HAPPY29 0x2dc +#define BTN_TRIGGER_HAPPY30 0x2dd +#define BTN_TRIGGER_HAPPY31 0x2de +#define BTN_TRIGGER_HAPPY32 0x2df +#define BTN_TRIGGER_HAPPY33 0x2e0 +#define BTN_TRIGGER_HAPPY34 0x2e1 +#define BTN_TRIGGER_HAPPY35 0x2e2 +#define BTN_TRIGGER_HAPPY36 0x2e3 +#define BTN_TRIGGER_HAPPY37 0x2e4 +#define BTN_TRIGGER_HAPPY38 0x2e5 +#define BTN_TRIGGER_HAPPY39 0x2e6 +#define BTN_TRIGGER_HAPPY40 0x2e7 + /* We avoid low common keys in module aliases so they don't get huge. */ #define KEY_MIN_INTERESTING KEY_MUTE #define KEY_MAX 0x2ff @@ -658,6 +700,7 @@ #define ABS_MT_TOOL_TYPE 0x37 /* Type of touching device */ #define ABS_MT_BLOB_ID 0x38 /* Group a set of packets as a blob */ #define ABS_MT_TRACKING_ID 0x39 /* Unique ID of initiated contact */ +#define ABS_MT_PRESSURE 0x3a /* Pressure on contact area */ #define ABS_MAX 0x3f #define ABS_CNT (ABS_MAX+1) @@ -1118,6 +1161,7 @@ int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); struct input_handle *grab; + struct input_handle *filter; spinlock_t event_lock; struct mutex mutex; @@ -1218,6 +1262,7 @@ void *private; void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); + bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); @@ -1295,6 +1340,9 @@ int input_grab_device(struct input_handle *); void input_release_device(struct input_handle *); +int input_filter_device(struct input_handle *); +void input_unfilter_device(struct input_handle *); + int input_open_device(struct input_handle *); void input_close_device(struct input_handle *); --- linux-ec2-2.6.32.orig/include/linux/nfs_fs_sb.h +++ linux-ec2-2.6.32/include/linux/nfs_fs_sb.h @@ -176,6 +176,7 @@ #define NFS_CAP_ATIME (1U << 11) #define NFS_CAP_CTIME (1U << 12) #define NFS_CAP_MTIME (1U << 13) +#define NFS_CAP_POSIX_LOCK (1U << 14) /* maximum number of slots to use */ --- linux-ec2-2.6.32.orig/include/linux/pci.h +++ linux-ec2-2.6.32/include/linux/pci.h @@ -289,7 +289,7 @@ int rom_attr_enabled; /* has display of the rom attribute been enabled? */ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ -#ifdef CONFIG_PCI_MSI +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) struct list_head msi_list; #endif struct pci_vpd *vpd; @@ -564,6 +564,9 @@ resource_size_t); void pcibios_update_irq(struct pci_dev *, int irq); +/* Weak but can be overriden by arch */ +void pci_fixup_cardbus(struct pci_bus *); + /* Generic PCI functions used internally */ extern struct pci_bus *pci_find_bus(int domain, int busnr); @@ -721,6 +724,9 @@ void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_select_bars(struct pci_dev *dev, unsigned long flags); +#ifdef CONFIG_XEN +void pci_restore_bars(struct pci_dev *); +#endif /* ROM control related routines */ int pci_enable_rom(struct pci_dev *pdev); @@ -877,6 +883,11 @@ { return 0; } + +#ifdef CONFIG_XEN +#define register_msi_get_owner(func) 0 +#define unregister_msi_get_owner(func) 0 +#endif #else extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec); extern void pci_msi_shutdown(struct pci_dev *dev); @@ -889,6 +900,10 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev); extern void pci_restore_msi_state(struct pci_dev *dev); extern int pci_msi_enabled(void); +#ifdef CONFIG_XEN +extern int register_msi_get_owner(int (*func)(struct pci_dev *dev)); +extern int unregister_msi_get_owner(int (*func)(struct pci_dev *dev)); +#endif #endif #ifndef CONFIG_PCIEASPM @@ -942,6 +957,11 @@ } #endif /* CONFIG_PCI_DOMAINS */ +/* some architectures require additional setup to direct VGA traffic */ +typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode, + unsigned int command_bits, bool change_bridge); +extern void pci_register_set_vga_state(arch_set_vga_state_t func); + #else /* CONFIG_PCI is not enabled */ /* @@ -1290,5 +1310,11 @@ extern void pci_hp_remove_module_link(struct pci_slot *pci_slot); #endif +#ifdef CONFIG_PCI_GUESTDEV +int pci_is_guestdev(struct pci_dev *dev); +#else +#define pci_is_guestdev(dev) 0 +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ --- linux-ec2-2.6.32.orig/include/linux/tty_driver.h +++ linux-ec2-2.6.32/include/linux/tty_driver.h @@ -224,6 +224,12 @@ * unless the tty also has a valid tty->termiox pointer. * * Optional: Called under the termios lock + * + * int (*get_icount)(struct tty_struct *tty, struct serial_icounter *icount); + * + * Called when the device receives a TIOCGICOUNT ioctl. Passed a kernel + * structure to complete. This method is optional and will only be called + * if provided (otherwise EINVAL will be returned). */ #include @@ -232,6 +238,7 @@ struct tty_struct; struct tty_driver; +struct serial_icounter_struct; struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, @@ -268,6 +275,8 @@ unsigned int set, unsigned int clear); int (*resize)(struct tty_struct *tty, struct winsize *ws); int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew); + int (*get_icount)(struct tty_struct *tty, + struct serial_icounter_struct *icount); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct tty_driver *driver, int line, char *options); int (*poll_get_char)(struct tty_driver *driver, int line); --- linux-ec2-2.6.32.orig/include/linux/quota.h +++ linux-ec2-2.6.32/include/linux/quota.h @@ -313,8 +313,9 @@ int (*claim_space) (struct inode *, qsize_t); /* release rsved quota for delayed alloc */ void (*release_rsv) (struct inode *, qsize_t); - /* get reserved quota for delayed alloc */ - qsize_t (*get_reserved_space) (struct inode *); + /* get reserved quota for delayed alloc, value returned is managed by + * quota code only */ + qsize_t *(*get_reserved_space) (struct inode *); }; /* Operations handling requests from userspace */ --- linux-ec2-2.6.32.orig/include/linux/audit.h +++ linux-ec2-2.6.32/include/linux/audit.h @@ -33,7 +33,7 @@ * 1200 - 1299 messages internal to the audit daemon * 1300 - 1399 audit event messages * 1400 - 1499 SE Linux use - * 1500 - 1599 kernel LSPP events + * 1500 - 1599 AppArmor use * 1600 - 1699 kernel crypto events * 1700 - 1799 kernel anomaly records * 1800 - 1899 kernel integrity events @@ -122,6 +122,14 @@ #define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */ #define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */ +#define AUDIT_APPARMOR_AUDIT 1501 /* AppArmor audited grants */ +#define AUDIT_APPARMOR_ALLOWED 1502 /* Allowed Access for learning */ +#define AUDIT_APPARMOR_DENIED 1503 +#define AUDIT_APPARMOR_HINT 1504 /* Process Tracking information */ +#define AUDIT_APPARMOR_STATUS 1505 /* Changes in config */ +#define AUDIT_APPARMOR_ERROR 1506 /* Internal AppArmor Errors */ +#define AUDIT_APPARMOR_KILL 1507 /* AppArmor killing processes */ + #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 #define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */ --- linux-ec2-2.6.32.orig/include/linux/usb_usual.h +++ linux-ec2-2.6.32/include/linux/usb_usual.h @@ -56,7 +56,9 @@ US_FLAG(SANE_SENSE, 0x00008000) \ /* Sane Sense (> 18 bytes) */ \ US_FLAG(CAPACITY_OK, 0x00010000) \ - /* READ CAPACITY response is correct */ + /* READ CAPACITY response is correct */ \ + US_FLAG(BAD_SENSE, 0x00020000) \ + /* Bad Sense (never more than 18 bytes) */ #define US_FLAG(name, value) US_FL_##name = value , enum { US_DO_ALL_FLAGS }; --- linux-ec2-2.6.32.orig/include/linux/hugetlb.h +++ linux-ec2-2.6.32/include/linux/hugetlb.h @@ -12,6 +12,15 @@ #include #include +struct hugepage_subpool { + spinlock_t lock; + long count; + long max_hpages, used_hpages; +}; + +struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); +void hugepage_put_subpool(struct hugepage_subpool *spool); + int PageHuge(struct page *page); static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -138,12 +147,11 @@ }; struct hugetlbfs_sb_info { - long max_blocks; /* blocks allowed */ - long free_blocks; /* blocks free */ long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; + struct hugepage_subpool *spool; }; @@ -166,8 +174,6 @@ extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, int acct, struct user_struct **user, int creat_flags); -int hugetlb_get_quota(struct address_space *mapping, long delta); -void hugetlb_put_quota(struct address_space *mapping, long delta); static inline int is_file_hugepages(struct file *file) { --- linux-ec2-2.6.32.orig/include/linux/connector.h +++ linux-ec2-2.6.32/include/linux/connector.h @@ -24,9 +24,6 @@ #include -#define CN_IDX_CONNECTOR 0xffffffff -#define CN_VAL_CONNECTOR 0xffffffff - /* * Process Events connector unique ids -- used for message routing */ @@ -73,30 +70,6 @@ __u8 data[0]; }; -/* - * Notify structure - requests notification about - * registering/unregistering idx/val in range [first, first+range]. - */ -struct cn_notify_req { - __u32 first; - __u32 range; -}; - -/* - * Main notification control message - * *_notify_num - number of appropriate cn_notify_req structures after - * this struct. - * group - notification receiver's idx. - * len - total length of the attached data. - */ -struct cn_ctl_msg { - __u32 idx_notify_num; - __u32 val_notify_num; - __u32 group; - __u32 len; - __u8 data[0]; -}; - #ifdef __KERNEL__ #include @@ -149,11 +122,6 @@ u32 seq, group; }; -struct cn_ctl_entry { - struct list_head notify_entry; - struct cn_ctl_msg *msg; -}; - struct cn_dev { struct cb_id id; --- linux-ec2-2.6.32.orig/include/linux/libata.h +++ linux-ec2-2.6.32/include/linux/libata.h @@ -339,6 +339,7 @@ ATA_EHI_HOTPLUGGED = (1 << 0), /* could have been hotplugged */ ATA_EHI_NO_AUTOPSY = (1 << 2), /* no autopsy */ ATA_EHI_QUIET = (1 << 3), /* be quiet */ + ATA_EHI_NO_RECOVERY = (1 << 4), /* no recovery */ ATA_EHI_DID_SOFTRESET = (1 << 16), /* already soft-reset this port */ ATA_EHI_DID_HARDRESET = (1 << 17), /* already soft-reset this port */ @@ -354,6 +355,9 @@ /* max tries if error condition is still set after ->error_handler */ ATA_EH_MAX_TRIES = 5, + /* sometimes resuming a link requires several retries */ + ATA_LINK_RESUME_TRIES = 5, + /* how hard are we gonna try to probe/recover devices */ ATA_PROBE_MAX_TRIES = 3, ATA_EH_DEV_TRIES = 3, --- linux-ec2-2.6.32.orig/include/linux/if_vlan.h +++ linux-ec2-2.6.32/include/linux/if_vlan.h @@ -115,10 +115,12 @@ extern int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, u16 vlan_tci, int polling); extern int vlan_hwaccel_do_receive(struct sk_buff *skb); -extern int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, - unsigned int vlan_tci, struct sk_buff *skb); -extern int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, - unsigned int vlan_tci); +extern gro_result_t +vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb); +extern gro_result_t +vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci); #else static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) @@ -145,17 +147,18 @@ return 0; } -static inline int vlan_gro_receive(struct napi_struct *napi, - struct vlan_group *grp, - unsigned int vlan_tci, struct sk_buff *skb) +static inline gro_result_t +vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci, struct sk_buff *skb) { - return NET_RX_DROP; + return GRO_DROP; } -static inline int vlan_gro_frags(struct napi_struct *napi, - struct vlan_group *grp, unsigned int vlan_tci) +static inline gro_result_t +vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, + unsigned int vlan_tci) { - return NET_RX_DROP; + return GRO_DROP; } #endif --- linux-ec2-2.6.32.orig/include/linux/writeback.h +++ linux-ec2-2.6.32/include/linux/writeback.h @@ -27,13 +27,12 @@ * in a manner such that unspecified fields are set to zero. */ struct writeback_control { - struct backing_dev_info *bdi; /* If !NULL, only write back this - queue */ - struct super_block *sb; /* if !NULL, only write inodes from - this super_block */ enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ + unsigned long wb_start; /* Time writeback_inodes_wb was + called. This is needed to avoid + extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ @@ -69,8 +68,10 @@ struct bdi_writeback; int inode_wait(void *); void writeback_inodes_sb(struct super_block *); +int writeback_inodes_sb_if_idle(struct super_block *); void sync_inodes_sb(struct super_block *); -void writeback_inodes_wbc(struct writeback_control *wbc); +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages); --- linux-ec2-2.6.32.orig/include/linux/kmod.h +++ linux-ec2-2.6.32/include/linux/kmod.h @@ -23,6 +23,7 @@ #include #include #include +#include #define KMOD_PATH_LEN 256 @@ -44,7 +45,26 @@ struct key; struct file; -struct subprocess_info; + +#define UMH_NO_WAIT 0 /* don't wait at all */ +#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 2 /* wait for the process to complete */ +#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ + +struct subprocess_info { + struct work_struct work; + struct completion *complete; + struct cred *cred; + char *path; + char **argv; + char **envp; + int wait; + int retval; + struct file *stdin; + int (*init)(struct subprocess_info *info); + void (*cleanup)(struct subprocess_info *info); + void *data; +}; /* Allocate a subprocess_info structure */ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, @@ -55,37 +75,47 @@ struct key *session_keyring); int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, struct file **filp); -void call_usermodehelper_setcleanup(struct subprocess_info *info, - void (*cleanup)(char **argv, char **envp)); - -enum umh_wait { - UMH_NO_WAIT = -1, /* don't wait at all */ - UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ - UMH_WAIT_PROC = 1, /* wait for the process to complete */ -}; +void call_usermodehelper_setfns(struct subprocess_info *info, + int (*init)(struct subprocess_info *info), + void (*cleanup)(struct subprocess_info *info), + void *data); /* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); +int call_usermodehelper_exec(struct subprocess_info *info, int wait); /* Free the subprocess_info. This is only needed if you're not going to call call_usermodehelper_exec */ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int -call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) +call_usermodehelper_fns(char *path, char **argv, char **envp, + int wait, + int (*init)(struct subprocess_info *info), + void (*cleanup)(struct subprocess_info *), void *data) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask); + if (info == NULL) return -ENOMEM; + + call_usermodehelper_setfns(info, init, cleanup, data); + return call_usermodehelper_exec(info, wait); } static inline int +call_usermodehelper(char *path, char **argv, char **envp, int wait) +{ + return call_usermodehelper_fns(path, argv, envp, wait, + NULL, NULL, NULL); +} + +static inline int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, enum umh_wait wait) + struct key *session_keyring, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; @@ -104,7 +134,16 @@ extern int call_usermodehelper_pipe(char *path, char *argv[], char *envp[], struct file **filp); +#ifdef CONFIG_PM_SLEEP extern int usermodehelper_disable(void); extern void usermodehelper_enable(void); +extern bool usermodehelper_is_disabled(void); +extern void read_lock_usermodehelper(void); +extern void read_unlock_usermodehelper(void); +#else +static inline bool usermodehelper_is_disabled(void) { return false; } +static inline void read_lock_usermodehelper(void) {} +static inline void read_unlock_usermodehelper(void) {} +#endif #endif /* __LINUX_KMOD_H__ */ --- linux-ec2-2.6.32.orig/include/linux/mmzone.h +++ linux-ec2-2.6.32/include/linux/mmzone.h @@ -290,6 +290,13 @@ unsigned long watermark[NR_WMARK]; /* + * When free pages are below this point, additional steps are taken + * when reading the number of free pages to avoid per-cpu counter + * drift allowing watermarks to be breached + */ + unsigned long percpu_drift_mark; + + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several * GB of ram we must reserve some of the lower zone memory (otherwise we risk @@ -460,6 +467,12 @@ return test_bit(ZONE_OOM_LOCKED, &zone->flags); } +#ifdef CONFIG_SMP +unsigned long zone_nr_free_pages(struct zone *zone); +#else +#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) +#endif /* CONFIG_SMP */ + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the --- linux-ec2-2.6.32.orig/include/linux/oprofile.h +++ linux-ec2-2.6.32/include/linux/oprofile.h @@ -16,6 +16,9 @@ #include #include #include +#ifdef CONFIG_XEN +#include +#endif /* Each escaped entry is prefixed by ESCAPE_CODE * then one of the following codes, then the @@ -28,14 +31,18 @@ #define CPU_SWITCH_CODE 2 #define COOKIE_SWITCH_CODE 3 #define KERNEL_ENTER_SWITCH_CODE 4 -#define KERNEL_EXIT_SWITCH_CODE 5 +#define USER_ENTER_SWITCH_CODE 5 #define MODULE_LOADED_CODE 6 #define CTX_TGID_CODE 7 #define TRACE_BEGIN_CODE 8 #define TRACE_END_CODE 9 #define XEN_ENTER_SWITCH_CODE 10 +#ifndef CONFIG_XEN #define SPU_PROFILING_CODE 11 #define SPU_CTX_SWITCH_CODE 12 +#else +#define DOMAIN_SWITCH_CODE 11 +#endif #define IBS_FETCH_CODE 13 #define IBS_OP_CODE 14 @@ -49,6 +56,12 @@ /* create any necessary configuration files in the oprofile fs. * Optional. */ int (*create_files)(struct super_block * sb, struct dentry * root); +#ifdef CONFIG_XEN + /* setup active domains with Xen */ + int (*set_active)(int *active_domains, unsigned int adomains); + /* setup passive domains with Xen */ + int (*set_passive)(int *passive_domains, unsigned int pdomains); +#endif /* Do any necessary interrupt setup. Optional. */ int (*setup)(void); /* Do any necessary interrupt shutdown. Optional. */ @@ -107,9 +120,14 @@ * backtrace. */ void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event); +void oprofile_add_mode(int cpu_mode); + /* add a backtrace entry, to be called from the ->backtrace callback */ void oprofile_add_trace(unsigned long eip); +/* add a domain switch entry */ +int oprofile_add_domain_switch(int32_t domain_id); + /** * Create a file of the given name as a child of the given root, with --- linux-ec2-2.6.32.orig/include/linux/quotaops.h +++ linux-ec2-2.6.32/include/linux/quotaops.h @@ -26,6 +26,10 @@ sb->s_qcop->quota_sync(sb, type); } +void inode_add_rsv_space(struct inode *inode, qsize_t number); +void inode_claim_rsv_space(struct inode *inode, qsize_t number); +void inode_sub_rsv_space(struct inode *inode, qsize_t number); + int dquot_initialize(struct inode *inode, int type); int dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, unsigned int id, int type); @@ -42,7 +46,6 @@ int dquot_reserve_space(struct inode *inode, qsize_t number, int prealloc); int dquot_claim_space(struct inode *inode, qsize_t number); void dquot_release_reserved_space(struct inode *inode, qsize_t number); -qsize_t dquot_get_reserved_space(struct inode *inode); int dquot_free_space(struct inode *inode, qsize_t number); int dquot_free_inode(const struct inode *inode, qsize_t number); @@ -199,6 +202,8 @@ if (inode->i_sb->dq_op->reserve_space(inode, nr, 0) == NO_QUOTA) return 1; } + else + inode_add_rsv_space(inode, nr); return 0; } @@ -221,7 +226,7 @@ if (inode->i_sb->dq_op->claim_space(inode, nr) == NO_QUOTA) return 1; } else - inode_add_bytes(inode, nr); + inode_claim_rsv_space(inode, nr); mark_inode_dirty(inode); return 0; @@ -235,6 +240,8 @@ { if (sb_any_quota_active(inode->i_sb)) inode->i_sb->dq_op->release_rsv(inode, nr); + else + inode_sub_rsv_space(inode, nr); } static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) --- linux-ec2-2.6.32.orig/include/linux/highmem.h +++ linux-ec2-2.6.32/include/linux/highmem.h @@ -130,12 +130,14 @@ return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); } +#ifndef __HAVE_ARCH_CLEAR_HIGHPAGE static inline void clear_highpage(struct page *page) { void *kaddr = kmap_atomic(page, KM_USER0); clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); } +#endif static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, @@ -189,6 +191,8 @@ #endif +#ifndef __HAVE_ARCH_COPY_HIGHPAGE + static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; @@ -200,4 +204,6 @@ kunmap_atomic(vto, KM_USER1); } +#endif + #endif /* _LINUX_HIGHMEM_H */ --- linux-ec2-2.6.32.orig/include/linux/log2.h +++ linux-ec2-2.6.32/include/linux/log2.h @@ -185,7 +185,6 @@ #define rounddown_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ - (n == 1) ? 0 : \ (1UL << ilog2(n))) : \ __rounddown_pow_of_two(n) \ ) --- linux-ec2-2.6.32.orig/include/linux/page-flags.h +++ linux-ec2-2.6.32/include/linux/page-flags.h @@ -108,6 +108,11 @@ #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif +#ifdef CONFIG_XEN + PG_foreign, /* Page is owned by foreign allocator. */ + /* PG_netback, Page is owned by netback */ + PG_blkback, /* Page is owned by blkback */ +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -120,8 +125,15 @@ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ +#if defined(CONFIG_XEN) + PG_pinned = PG_locked, /* Cannot alias with PG_owner_priv_1 since + * bad_page() checks should include this bit. + * Should not use PG_arch_1 as that may have + * a different purpose elsewhere. */ +#elif defined(CONFIG_PARAVIRT_XEN) PG_pinned = PG_owner_priv_1, PG_savepinned = PG_dirty, +#endif /* SLOB */ PG_slob_free = PG_private, @@ -207,8 +219,12 @@ TESTCLEARFLAG(Active, active) __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, checked) /* Used by some filesystems */ +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ +#endif +#ifdef CONFIG_PARAVIRT_XEN PAGEFLAG(SavePinned, savepinned); /* Xen */ +#endif PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) @@ -329,6 +345,28 @@ CLEARPAGEFLAG(Uptodate, uptodate) +#ifdef CONFIG_XEN +TESTPAGEFLAG(Foreign, foreign) +static inline void SetPageForeign(struct page *page, + void (*dtor)(struct page *, unsigned int)) +{ + BUG_ON(!dtor); + set_bit(PG_foreign, &page->flags); + page->index = (long)dtor; +} +static inline void ClearPageForeign(struct page *page) +{ + clear_bit(PG_foreign, &page->flags); + page->index = 0; +} +static inline void PageForeignDestructor(struct page *page, unsigned int order) +{ + ((void (*)(struct page *, unsigned int))page->index)(page, order); +} +/*PAGEFLAG(Netback, netback)*/ +PAGEFLAG(Blkback, blkback) +#endif + extern void cancel_dirty_page(struct page *page, unsigned int account_size); int test_clear_page_writeback(struct page *page); @@ -362,7 +400,7 @@ * pages on the LRU and/or pagecache. */ TESTPAGEFLAG(Compound, compound) -__PAGEFLAG(Head, compound) +__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound) /* * PG_reclaim is used in combination with PG_compound to mark the @@ -374,8 +412,14 @@ * PG_compound & PG_reclaim => Tail page * PG_compound & ~PG_reclaim => Head page */ +#define PG_head_mask ((1L << PG_compound)) #define PG_head_tail_mask ((1L << PG_compound) | (1L << PG_reclaim)) +static inline int PageHead(struct page *page) +{ + return ((page->flags & PG_head_tail_mask) == PG_head_mask); +} + static inline int PageTail(struct page *page) { return ((page->flags & PG_head_tail_mask) == PG_head_tail_mask); @@ -399,6 +443,12 @@ #define __PG_MLOCKED 0 #endif +#ifndef CONFIG_XEN +# define __PG_XEN 0 +#else +# define __PG_XEN (1 << PG_foreign) +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -408,7 +458,7 @@ 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | __PG_XEN) /* * Flags checked when a page is prepped for return by the page allocator. --- linux-ec2-2.6.32.orig/include/linux/precache.h +++ linux-ec2-2.6.32/include/linux/precache.h @@ -0,0 +1,55 @@ +#ifndef _LINUX_PRECACHE_H + +#include +#include + +#ifdef CONFIG_PRECACHE +extern void precache_init(struct super_block *sb); +extern void shared_precache_init(struct super_block *sb, char *uuid); +extern int precache_get(struct address_space *mapping, unsigned long index, + struct page *empty_page); +extern int precache_put(struct address_space *mapping, unsigned long index, + struct page *page); +extern int precache_flush(struct address_space *mapping, unsigned long index); +extern int precache_flush_inode(struct address_space *mapping); +extern int precache_flush_filesystem(struct super_block *s); +#else +static inline void precache_init(struct super_block *sb) +{ +} + +static inline void shared_precache_init(struct super_block *sb, char *uuid) +{ +} + +static inline int precache_get(struct address_space *mapping, + unsigned long index, struct page *empty_page) +{ + return 0; +} + +static inline int precache_put(struct address_space *mapping, + unsigned long index, struct page *page) +{ + return 0; +} + +static inline int precache_flush(struct address_space *mapping, + unsigned long index) +{ + return 0; +} + +static inline int precache_flush_inode(struct address_space *mapping) +{ + return 0; +} + +static inline int precache_flush_filesystem(struct super_block *s) +{ + return 0; +} +#endif + +#define _LINUX_PRECACHE_H +#endif /* _LINUX_PRECACHE_H */ --- linux-ec2-2.6.32.orig/include/linux/pid.h +++ linux-ec2-2.6.32/include/linux/pid.h @@ -117,7 +117,7 @@ */ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); -int next_pidmap(struct pid_namespace *pid_ns, int last); +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); extern struct pid *alloc_pid(struct pid_namespace *ns); extern void free_pid(struct pid *pid); --- linux-ec2-2.6.32.orig/include/linux/if_packet.h +++ linux-ec2-2.6.32/include/linux/if_packet.h @@ -63,6 +63,7 @@ __u16 tp_mac; __u16 tp_net; __u16 tp_vlan_tci; + __u16 tp_padding; }; /* Rx ring - header status */ @@ -103,6 +104,7 @@ __u32 tp_sec; __u32 tp_nsec; __u16 tp_vlan_tci; + __u16 tp_padding; }; #define TPACKET2_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll)) --- linux-ec2-2.6.32.orig/include/linux/firmware.h +++ linux-ec2-2.6.32/include/linux/firmware.h @@ -11,6 +11,7 @@ struct firmware { size_t size; const u8 *data; + struct page **pages; }; struct device; --- linux-ec2-2.6.32.orig/include/linux/security.h +++ linux-ec2-2.6.32/include/linux/security.h @@ -76,7 +76,7 @@ extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); -extern int cap_syslog(int type); +extern int cap_syslog(int type, bool from_file); extern int cap_vm_enough_memory(struct mm_struct *mm, long pages); struct msghdr; @@ -95,8 +95,13 @@ extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); +#ifdef CONFIG_MMU extern unsigned long mmap_min_addr; extern unsigned long dac_mmap_min_addr; +#else +#define dac_mmap_min_addr 0UL +#endif + /* * Values used in the task_security_ops calls */ @@ -121,6 +126,7 @@ #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 +#ifdef CONFIG_MMU /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -135,6 +141,7 @@ } extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_SECURITY @@ -447,6 +454,22 @@ * @new_dir contains the path structure for parent of the new link. * @new_dentry contains the dentry structure of the new link. * Return 0 if permission is granted. + * @path_chmod: + * Check for permission to change DAC's permission of a file or directory. + * @dentry contains the dentry structure. + * @mnt contains the vfsmnt structure. + * @mode contains DAC's mode. + * Return 0 if permission is granted. + * @path_chown: + * Check for permission to change owner/group of a file or directory. + * @path contains the path structure. + * @uid contains new owner's ID. + * @gid contains new group's ID. + * Return 0 if permission is granted. + * @path_chroot: + * Check for permission to change root directory. + * @path contains the path structure. + * Return 0 if permission is granted. * @inode_readlink: * Check the permission to read the symbolic link. * @dentry contains the dentry structure for the file link. @@ -1324,6 +1347,7 @@ * logging to the console. * See the syslog(2) manual page for an explanation of the @type values. * @type contains the type of action. + * @from_file indicates the context of action (if it came from /proc). * Return 0 if permission is granted. * @settime: * Check permission to change the system time. @@ -1438,7 +1462,7 @@ int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); - int (*syslog) (int type); + int (*syslog) (int type, bool from_file); int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); @@ -1488,6 +1512,10 @@ struct dentry *new_dentry); int (*path_rename) (struct path *old_dir, struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry); + int (*path_chmod) (struct dentry *dentry, struct vfsmount *mnt, + mode_t mode); + int (*path_chown) (struct path *path, uid_t uid, gid_t gid); + int (*path_chroot) (struct path *path); #endif int (*inode_alloc_security) (struct inode *inode); @@ -1733,7 +1761,7 @@ int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); -int security_syslog(int type); +int security_syslog(int type, bool from_file); int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); @@ -1979,9 +2007,9 @@ return 0; } -static inline int security_syslog(int type) +static inline int security_syslog(int type, bool from_file) { - return cap_syslog(type); + return cap_syslog(type, from_file); } static inline int security_settime(struct timespec *ts, struct timezone *tz) @@ -2952,6 +2980,10 @@ struct dentry *new_dentry); int security_path_rename(struct path *old_dir, struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry); +int security_path_chmod(struct dentry *dentry, struct vfsmount *mnt, + mode_t mode); +int security_path_chown(struct path *path, uid_t uid, gid_t gid); +int security_path_chroot(struct path *path); #else /* CONFIG_SECURITY_PATH */ static inline int security_path_unlink(struct path *dir, struct dentry *dentry) { @@ -3001,6 +3033,23 @@ { return 0; } + +static inline int security_path_chmod(struct dentry *dentry, + struct vfsmount *mnt, + mode_t mode) +{ + return 0; +} + +static inline int security_path_chown(struct path *path, uid_t uid, gid_t gid) +{ + return 0; +} + +static inline int security_path_chroot(struct path *path) +{ + return 0; +} #endif /* CONFIG_SECURITY_PATH */ #ifdef CONFIG_KEYS --- linux-ec2-2.6.32.orig/include/linux/lzo.h +++ linux-ec2-2.6.32/include/linux/lzo.h @@ -4,28 +4,28 @@ * LZO Public Kernel Interface * A mini subset of the LZO real-time data compression library * - * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer + * Copyright (C) 1996-2012 Markus F.X.J. Oberhumer * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * - * Changed for kernel use by: + * Changed for Linux kernel use by: * Nitin Gupta * Richard Purdie */ -#define LZO1X_MEM_COMPRESS (16384 * sizeof(unsigned char *)) -#define LZO1X_1_MEM_COMPRESS LZO1X_MEM_COMPRESS +#define LZO1X_1_MEM_COMPRESS (8192 * sizeof(unsigned short)) +#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS #define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3) -/* This requires 'workmem' of size LZO1X_1_MEM_COMPRESS */ +/* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */ int lzo1x_1_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem); + unsigned char *dst, size_t *dst_len, void *wrkmem); /* safe decompression with overrun testing */ int lzo1x_decompress_safe(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len); + unsigned char *dst, size_t *dst_len); /* * Return values (< 0 = Error) @@ -40,5 +40,6 @@ #define LZO_E_EOF_NOT_FOUND (-7) #define LZO_E_INPUT_NOT_CONSUMED (-8) #define LZO_E_NOT_YET_IMPLEMENTED (-9) +#define LZO_E_INVALID_ARGUMENT (-10) #endif --- linux-ec2-2.6.32.orig/include/linux/resource.h +++ linux-ec2-2.6.32/include/linux/resource.h @@ -3,8 +3,6 @@ #include -struct task_struct; - /* * Resource control/accounting header file for linux */ @@ -55,8 +53,11 @@ /* * Limit the stack by to some sane default: root can always * increase this limit if needed.. 8MB seems reasonable. + * + * (2MB more to cover randomization effects.) */ -#define _STK_LIM (8*1024*1024) +#define _STK_LIM (10*1024*1024) +#define EXEC_STACK_BIAS (2*1024*1024) /* * GPG2 wants 64kB of mlocked memory, to make sure pass phrases @@ -70,6 +71,12 @@ */ #include +#ifdef __KERNEL__ + +struct task_struct; + int getrusage(struct task_struct *p, int who, struct rusage __user *ru); +#endif /* __KERNEL__ */ + #endif --- linux-ec2-2.6.32.orig/include/linux/pci_regs.h +++ linux-ec2-2.6.32/include/linux/pci_regs.h @@ -377,7 +377,7 @@ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ #define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ #define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ -#define PCI_EXP_TYPE_RC_EC 0x10 /* Root Complex Event Collector */ +#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ --- linux-ec2-2.6.32.orig/include/linux/vermagic.h +++ linux-ec2-2.6.32/include/linux/vermagic.h @@ -22,6 +22,11 @@ #else #define MODULE_VERMAGIC_MODVERSIONS "" #endif +#ifdef CONFIG_XEN +#define MODULE_VERMAGIC_XEN "Xen " +#else +#define MODULE_VERMAGIC_XEN +#endif #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif @@ -30,5 +35,5 @@ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC + MODULE_VERMAGIC_XEN MODULE_ARCH_VERMAGIC --- linux-ec2-2.6.32.orig/include/linux/mm_types.h +++ linux-ec2-2.6.32/include/linux/mm_types.h @@ -138,7 +138,7 @@ within vm_mm. */ /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next; + struct vm_area_struct *vm_next, *vm_prev; pgprot_t vm_page_prot; /* Access permissions of this VMA. */ unsigned long vm_flags; /* Flags, see mm.h. */ @@ -206,6 +206,9 @@ unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); + unsigned long (*get_unmapped_exec_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); void (*unmap_area) (struct mm_struct *mm, unsigned long addr); unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ --- linux-ec2-2.6.32.orig/include/linux/seqlock.h +++ linux-ec2-2.6.32/include/linux/seqlock.h @@ -88,12 +88,12 @@ unsigned ret; repeat: - ret = sl->sequence; - smp_rmb(); + ret = ACCESS_ONCE(sl->sequence); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; } + smp_rmb(); return ret; } --- linux-ec2-2.6.32.orig/include/linux/time.h +++ linux-ec2-2.6.32/include/linux/time.h @@ -91,11 +91,36 @@ return ts_delta; } +#define KTIME_MAX ((s64)~((u64)1 << 63)) +#if (BITS_PER_LONG == 64) +# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#else +# define KTIME_SEC_MAX LONG_MAX +#endif + /* * Returns true if the timespec is norm, false if denorm: */ -#define timespec_valid(ts) \ - (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} extern struct timespec xtime; extern struct timespec wall_to_monotonic; @@ -148,6 +173,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); extern void timekeeping_leap_insert(int leapsecond); --- linux-ec2-2.6.32.orig/include/linux/list_sort.h +++ linux-ec2-2.6.32/include/linux/list_sort.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_LIST_SORT_H +#define _LINUX_LIST_SORT_H + +#include + +struct list_head; + +void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)); +#endif --- linux-ec2-2.6.32.orig/include/linux/sysctl.h +++ linux-ec2-2.6.32/include/linux/sysctl.h @@ -68,6 +68,7 @@ CTL_BUS=8, /* Busses */ CTL_ABI=9, /* Binary emulation */ CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_XEN=123, /* Xen info and control */ CTL_ARLAN=254, /* arlan wireless driver */ CTL_S390DBF=5677, /* s390 debug */ CTL_SUNRPC=7249, /* sunrpc debug */ @@ -490,6 +491,7 @@ NET_IPV4_CONF_PROMOTE_SECONDARIES=20, NET_IPV4_CONF_ARP_ACCEPT=21, NET_IPV4_CONF_ARP_NOTIFY=22, + NET_IPV4_CONF_SRC_VMARK=24, __NET_IPV4_CONF_MAX }; --- linux-ec2-2.6.32.orig/include/linux/interrupt.h +++ linux-ec2-2.6.32/include/linux/interrupt.h @@ -52,16 +52,25 @@ * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. + * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend + * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set + * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device + * resume time. */ #define IRQF_DISABLED 0x00000020 #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 +#define IRQF_NO_SUSPEND 0x00004000 +#define IRQF_FORCE_RESUME 0x00008000 +#define IRQF_EARLY_RESUME 0x00020000 + +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND) /* * Bits used by threaded handlers: @@ -192,13 +201,16 @@ extern void resume_device_irqs(void); #ifdef CONFIG_PM_SLEEP extern int check_wakeup_irqs(void); +extern void irq_pm_syscore_resume(void); #else static inline int check_wakeup_irqs(void) { return 0; } +static inline void irq_pm_syscore_resume(void) { }; #endif #else static inline void suspend_device_irqs(void) { }; static inline void resume_device_irqs(void) { }; static inline int check_wakeup_irqs(void) { return 0; } +static inline void irq_pm_syscore_resume(void) { }; #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) @@ -317,6 +329,12 @@ } #endif /* CONFIG_GENERIC_HARDIRQS */ +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED +int irq_ignore_unhandled(unsigned int irq); +#else +#define irq_ignore_unhandled(irq) 0 +#endif + #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) #define or_softirq_pending(x) (local_softirq_pending() |= (x)) --- linux-ec2-2.6.32.orig/include/linux/klist.h +++ linux-ec2-2.6.32/include/linux/klist.h @@ -22,7 +22,7 @@ struct list_head k_list; void (*get)(struct klist_node *); void (*put)(struct klist_node *); -} __attribute__ ((aligned (4))); +} __attribute__ ((aligned (sizeof(void *)))); #define KLIST_INIT(_name, _get, _put) \ { .k_lock = __SPIN_LOCK_UNLOCKED(_name.k_lock), \ --- linux-ec2-2.6.32.orig/include/linux/tick.h +++ linux-ec2-2.6.32/include/linux/tick.h @@ -98,6 +98,9 @@ extern struct tick_sched *tick_get_tick_sched(int cpu); extern void tick_check_idle(int cpu); extern int tick_oneshot_mode_active(void); +# ifndef arch_needs_cpu +# define arch_needs_cpu(cpu) (0) +# endif # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } --- linux-ec2-2.6.32.orig/include/linux/namei.h +++ linux-ec2-2.6.32/include/linux/namei.h @@ -75,6 +75,9 @@ extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); extern void release_open_intent(struct nameidata *); +extern struct dentry *lookup_hash(struct nameidata *nd); +extern int __lookup_one_len(const char *name, struct qstr *this, + struct dentry *base, int len); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_noperm(const char *, struct dentry *); --- linux-ec2-2.6.32.orig/include/linux/blkdev.h +++ linux-ec2-2.6.32/include/linux/blkdev.h @@ -318,7 +318,7 @@ unsigned short max_phys_segments; unsigned char misaligned; - unsigned char no_cluster; + unsigned char cluster; }; struct request_queue @@ -440,7 +440,6 @@ #endif }; -#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ @@ -457,11 +456,9 @@ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ -#define QUEUE_FLAG_CQ 16 /* hardware does queuing */ -#define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */ +#define QUEUE_FLAG_DISCARD 16 /* supports DISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_CLUSTER) | \ (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP)) @@ -582,7 +579,6 @@ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) -#define blk_queue_queuing(q) test_bit(QUEUE_FLAG_CQ, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) @@ -627,6 +623,11 @@ #define rq_data_dir(rq) ((rq)->cmd_flags & 1) +static inline unsigned int blk_queue_cluster(struct request_queue *q) +{ + return q->limits.cluster; +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -774,6 +775,9 @@ extern void blk_plug_device_unlocked(struct request_queue *); extern int blk_remove_plug(struct request_queue *); extern void blk_recount_segments(struct request_queue *, struct bio *); +extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); +extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, + unsigned int, void __user *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, @@ -932,7 +936,7 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); @@ -942,6 +946,8 @@ extern void blk_set_default_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t offset); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); @@ -1081,7 +1087,7 @@ return q->limits.physical_block_size; } -static inline int bdev_physical_block_size(struct block_device *bdev) +static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } @@ -1114,11 +1120,18 @@ return q->limits.alignment_offset; } +static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + + offset &= granularity - 1; + return (granularity + lim->alignment_offset - offset) & (granularity - 1); +} + static inline int queue_sector_alignment_offset(struct request_queue *q, sector_t sector) { - return ((sector << 9) - q->limits.alignment_offset) - & (q->limits.io_min - 1); + return queue_limit_alignment_offset(&q->limits, sector << 9); } static inline int bdev_alignment_offset(struct block_device *bdev) --- linux-ec2-2.6.32.orig/include/linux/reiserfs_xattr.h +++ linux-ec2-2.6.32/include/linux/reiserfs_xattr.h @@ -70,6 +70,11 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec); #endif +static inline int reiserfs_xattrs_initialized(struct super_block *sb) +{ + return REISERFS_SB(sb)->priv_root != NULL; +} + #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) { --- linux-ec2-2.6.32.orig/include/linux/sched.h +++ linux-ec2-2.6.32/include/linux/sched.h @@ -145,7 +145,6 @@ extern void calc_global_load(void); -extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); @@ -379,6 +378,9 @@ arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long +arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); +extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); @@ -443,6 +445,10 @@ extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + /* mm flags */ /* dumpable bits */ #define MMF_DUMPABLE 0 /* core dump is permitted */ @@ -628,6 +634,9 @@ cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + cputime_t prev_utime, prev_stime; +#endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -726,14 +735,6 @@ uid_t uid; struct user_namespace *user_ns; -#ifdef CONFIG_USER_SCHED - struct task_group *tg; -#ifdef CONFIG_SYSFS - struct kobject kobj; - struct delayed_work work; -#endif -#endif - #ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif @@ -864,7 +865,10 @@ if (sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return SD_PREFER_SIBLING; + if (!sched_mc_power_savings) + return SD_PREFER_SIBLING; + + return 0; } static inline int sd_balance_for_package_power(void) @@ -897,6 +901,7 @@ * single CPU. */ unsigned int cpu_power; + unsigned int group_weight; /* * The CPUs this group covers. @@ -995,6 +1000,7 @@ char *name; #endif + unsigned int span_weight; /* * Span of all CPUs in this domain. * @@ -1066,7 +1072,8 @@ struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, + bool head); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); @@ -1076,7 +1083,8 @@ void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct rq *rq, struct task_struct *p, + int sd_flag, int flags); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -1088,7 +1096,8 @@ enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask); @@ -1099,7 +1108,7 @@ void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); - void (*task_new) (struct rq *rq, struct task_struct *p); + void (*task_fork) (struct task_struct *p); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); @@ -1108,10 +1117,11 @@ void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio, int running); - unsigned int (*get_rr_interval) (struct task_struct *task); + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*moved_group) (struct task_struct *p); + void (*task_move_group) (struct task_struct *p, int on_rq); #endif }; @@ -1172,7 +1182,6 @@ u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; - u64 nr_forced2_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; @@ -1354,7 +1363,7 @@ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) - - initialized normally by flush_old_exec */ + - initialized normally by setup_new_exec */ /* file system info */ int link_count, total_link_count; #ifdef CONFIG_SYSVIPC @@ -1538,7 +1547,6 @@ /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ - unsigned long stack_start; }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ @@ -1723,12 +1731,12 @@ extern cputime_t task_utime(struct task_struct *p); extern cputime_t task_stime(struct task_struct *p); extern cputime_t task_gtime(struct task_struct *p); +extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags */ -#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ - /* Not implemented yet, only for 486*/ +#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ @@ -1865,6 +1873,19 @@ */ extern unsigned long long cpu_clock(int cpu); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * An i/f to runtime opt-in for irq time accounting based off of sched_clock. + * The reason for this explicit opt-in is not to have perf penalty with + * slow sched_clocks. + */ +extern void enable_sched_clock_irqtime(void); +extern void disable_sched_clock_irqtime(void); +#else +static inline void enable_sched_clock_irqtime(void) {} +static inline void disable_sched_clock_irqtime(void) {} +#endif + extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); @@ -1880,6 +1901,7 @@ extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU +extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} @@ -2086,11 +2108,18 @@ return info <= SEND_SIG_FORCED; } -/* True if we are on the alternate signal stack. */ - +/* + * True if we are on the alternate signal stack. + */ static inline int on_sig_stack(unsigned long sp) { - return (sp - current->sas_ss_sp < current->sas_ss_size); +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif } static inline int sas_ss_flags(unsigned long sp) @@ -2392,9 +2421,9 @@ extern int __cond_resched_softirq(void); -#define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ - __cond_resched_softirq(); \ +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + __cond_resched_softirq(); \ }) /* @@ -2437,7 +2466,16 @@ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); -extern void signal_wake_up(struct task_struct *t, int resume_stopped); +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? __TASK_TRACED : 0); +} /* * Wrappers for p->thread_info->cpu access. No-op on UP. @@ -2483,13 +2521,9 @@ extern void normalize_rt_tasks(void); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED extern struct task_group init_task_group; -#ifdef CONFIG_USER_SCHED -extern struct task_group root_task_group; -extern void set_tg_uid(struct user_struct *user); -#endif extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); @@ -2576,6 +2610,28 @@ #define TASK_STATE_TO_CHAR_STR "RSDTtZX" +static inline unsigned long task_rlimit(const struct task_struct *tsk, + unsigned int limit) +{ + return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); +} + +static inline unsigned long task_rlimit_max(const struct task_struct *tsk, + unsigned int limit) +{ + return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); +} + +static inline unsigned long rlimit(unsigned int limit) +{ + return task_rlimit(current, limit); +} + +static inline unsigned long rlimit_max(unsigned int limit) +{ + return task_rlimit_max(current, limit); +} + #endif /* __KERNEL__ */ #endif --- linux-ec2-2.6.32.orig/include/linux/ethtool.h +++ linux-ec2-2.6.32/include/linux/ethtool.h @@ -357,6 +357,8 @@ __u32 flow_type; /* The rx flow hash value or the rule DB size */ __u64 data; + /* The following fields are not valid and must not be used for + * the ETHTOOL_{G,X}RXFH commands. */ struct ethtool_rx_flow_spec fs; __u32 rule_cnt; __u32 rule_locs[0]; @@ -502,6 +504,7 @@ int (*get_rxnfc)(struct net_device *, struct ethtool_rxnfc *, void *); int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); + int (*reset)(struct net_device *, u32 *); }; #endif /* __KERNEL__ */ @@ -515,7 +518,9 @@ #define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ #define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ #define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ -#define ETHTOOL_GLINK 0x0000000a /* Get link status (ethtool_value) */ +/* Get link status for host, i.e. whether the interface *and* the + * physical port (if there is one) are up (ethtool_value). */ +#define ETHTOOL_GLINK 0x0000000a #define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ #define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ #define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ @@ -559,6 +564,7 @@ #define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ #define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ #define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ +#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET @@ -689,4 +695,34 @@ #define RX_CLS_FLOW_DISC 0xffffffffffffffffULL +/* Reset flags */ +/* The reset() operation must clear the flags for the components which + * were actually reset. On successful return, the flags indicate the + * components which were not reset, either because they do not exist + * in the hardware or because they cannot be reset independently. The + * driver must never reset any components that were not requested. + */ +enum ethtool_reset_flags { + /* These flags represent components dedicated to the interface + * the command is addressed to. Shift any flag left by + * ETH_RESET_SHARED_SHIFT to reset a shared component of the + * same type. + */ + ETH_RESET_MGMT = 1 << 0, /* Management processor */ + ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ + ETH_RESET_DMA = 1 << 2, /* DMA engine */ + ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ + ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ + ETH_RESET_MAC = 1 << 5, /* Media access controller */ + ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ + ETH_RESET_RAM = 1 << 7, /* RAM shared between + * multiple components */ + + ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to + * this interface */ + ETH_RESET_ALL = 0xffffffff, /* All components used by this + * interface, even if shared */ +}; +#define ETH_RESET_SHARED_SHIFT 16 + #endif /* _LINUX_ETHTOOL_H */ --- linux-ec2-2.6.32.orig/include/linux/tboot.h +++ linux-ec2-2.6.32/include/linux/tboot.h @@ -150,6 +150,7 @@ #else +#define tboot_enabled() 0 #define tboot_probe() do { } while (0) #define tboot_shutdown(shutdown_type) do { } while (0) #define tboot_sleep(sleep_state, pm1a_control, pm1b_control) \ --- linux-ec2-2.6.32.orig/include/linux/cred.h +++ linux-ec2-2.6.32/include/linux/cred.h @@ -153,6 +153,7 @@ extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); +extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); @@ -283,26 +284,6 @@ ((const struct cred *)(rcu_dereference((task)->real_cred))) /** - * get_task_cred - Get another task's objective credentials - * @task: The task to query - * - * Get the objective credentials of a task, pinning them so that they can't go - * away. Accessing a task's credentials directly is not permitted. - * - * The caller must make sure task doesn't go away, either by holding a ref on - * task or by holding tasklist_lock to prevent it from being unlinked. - */ -#define get_task_cred(task) \ -({ \ - struct cred *__cred; \ - rcu_read_lock(); \ - __cred = (struct cred *) __task_cred((task)); \ - get_cred(__cred); \ - rcu_read_unlock(); \ - __cred; \ -}) - -/** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that --- linux-ec2-2.6.32.orig/include/linux/kernel.h +++ linux-ec2-2.6.32/include/linux/kernel.h @@ -55,6 +55,19 @@ } \ ) +/* + * Multiplies an integer by a fraction, while avoiding unnecessary + * overflow or loss of precision. + */ +#define mult_frac(x, numer, denom)( \ +{ \ + typeof(x) quot = (x) / (denom); \ + typeof(x) rem = (x) % (denom); \ + (quot * (numer)) + ((rem * (numer)) / (denom)); \ +} \ +) + + #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) @@ -407,6 +420,49 @@ #endif /* + * ratelimited messages with local ratelimit_state, + * no local ratelimit_state used in the !PRINTK case + */ +#ifdef CONFIG_PRINTK +#define printk_ratelimited(fmt, ...) ({ \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + printk(fmt, ##__VA_ARGS__); \ +}) +#else +/* No effect, but we still get type checking even in the !PRINTK case: */ +#define printk_ratelimited printk +#endif + +#define pr_emerg_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_notice_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +/* no pr_cont_ratelimited, don't do that... */ +/* If you are writing a driver, please use dev_dbg instead */ +#if defined(DEBUG) +#define pr_debug_ratelimited(fmt, ...) \ + printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#else +#define pr_debug_ratelimited(fmt, ...) \ + ({ if (0) printk_ratelimited(KERN_DEBUG pr_fmt(fmt), \ + ##__VA_ARGS__); 0; }) +#endif + +/* * General tracing related utility functions - trace_printk(), * tracing_on/tracing_off and tracing_start()/tracing_stop * --- linux-ec2-2.6.32.orig/include/linux/if_tunnel.h +++ linux-ec2-2.6.32/include/linux/if_tunnel.h @@ -2,6 +2,7 @@ #define _IF_TUNNEL_H_ #include +#include #ifdef __KERNEL__ #include --- linux-ec2-2.6.32.orig/include/linux/perf_event.h +++ linux-ec2-2.6.32/include/linux/perf_event.h @@ -219,7 +219,7 @@ #define PERF_EVENT_IOC_DISABLE _IO ('$', 1) #define PERF_EVENT_IOC_REFRESH _IO ('$', 2) #define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) enum perf_event_ioc_flags { --- linux-ec2-2.6.32.orig/include/linux/eventpoll.h +++ linux-ec2-2.6.32/include/linux/eventpoll.h @@ -61,6 +61,7 @@ static inline void eventpoll_init_file(struct file *file) { INIT_LIST_HEAD(&file->f_ep_links); + INIT_LIST_HEAD(&file->f_tfile_llink); } --- linux-ec2-2.6.32.orig/include/linux/ipv6.h +++ linux-ec2-2.6.32/include/linux/ipv6.h @@ -248,6 +248,7 @@ #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 +#define IP6SKB_FRAGMENTED 16 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) --- linux-ec2-2.6.32.orig/include/linux/ptrace.h +++ linux-ec2-2.6.32/include/linux/ptrace.h @@ -292,6 +292,9 @@ * calling arch_ptrace_stop() when it would be superfluous. For example, * if the thread has not been back to user mode since the last stop, the * thread state might indicate that nothing needs to be done. + * + * This is guaranteed to be invoked once before a task stops for ptrace and + * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed(code, info) (0) #endif --- linux-ec2-2.6.32.orig/include/linux/mm.h +++ linux-ec2-2.6.32/include/linux/mm.h @@ -77,7 +77,11 @@ #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) #define VM_GROWSUP 0x00000200 +#else +#define VM_GROWSUP 0x00000000 +#endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ @@ -102,7 +106,12 @@ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ +#ifndef CONFIG_XEN #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ +#else +#define VM_SAO 0 +#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */ +#endif #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ @@ -127,6 +136,12 @@ */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#ifdef CONFIG_XEN +struct vm_foreign_map { + struct page **map; +}; +#endif + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. @@ -195,6 +210,17 @@ */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + +#ifdef CONFIG_XEN + /* Area-specific function for clearing the PTE at @ptep. Returns the + * original value of @ptep. */ + pte_t (*zap_pte)(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int is_fullmm); + + /* called before close() to indicate no more pages should be mapped */ + void (*unmap)(struct vm_area_struct *area); +#endif + #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy @@ -837,6 +863,12 @@ int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +/* Is the vma a continuation of the stack vma above it? */ +static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) +{ + return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); +} + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); @@ -1129,7 +1161,15 @@ unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long, int); + +static inline unsigned long get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); +} extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1195,8 +1235,10 @@ /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -#ifdef CONFIG_IA64 +#if VM_GROWSUP extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +#else + #define expand_upwards(vma, address) do { } while (0) #endif extern int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address); @@ -1231,6 +1273,8 @@ unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); + struct page *follow_page(struct vm_area_struct *, unsigned long address, unsigned int foll_flags); --- linux-ec2-2.6.32.orig/include/linux/irq.h +++ linux-ec2-2.6.32/include/linux/irq.h @@ -174,7 +174,6 @@ */ struct irq_desc { unsigned int irq; - struct timer_rand_state *timer_rand_state; unsigned int *kstat_irqs; #ifdef CONFIG_INTR_REMAP struct irq_2_iommu *irq_2_iommu; @@ -400,7 +399,9 @@ /* Dynamic irq helper functions */ extern void dynamic_irq_init(unsigned int irq); +void dynamic_irq_init_keep_chip_data(unsigned int irq); extern void dynamic_irq_cleanup(unsigned int irq); +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq); /* Set/get chip/data for an IRQ: */ extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); --- linux-ec2-2.6.32.orig/include/linux/binfmts.h +++ linux-ec2-2.6.32/include/linux/binfmts.h @@ -29,6 +29,7 @@ char buf[BINPRM_BUF_SIZE]; #ifdef CONFIG_MMU struct vm_area_struct *vma; + unsigned long vma_pages; #else # define MAX_ARG_PAGES 32 struct page *page[MAX_ARG_PAGES]; @@ -59,6 +60,10 @@ unsigned long loader, exec; }; +extern void acct_arg_size(struct linux_binprm *bprm, unsigned long pages); +extern struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write); + #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) @@ -66,8 +71,6 @@ #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) -#define BINPRM_MAX_RECURSION 4 - /* * This structure defines the functions that are used to load the binary formats that * linux accepts. @@ -101,11 +104,9 @@ extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); +extern void setup_new_exec(struct linux_binprm * bprm); extern int suid_dumpable; -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ -#define SUID_DUMP_USER 1 /* Dump as user of process */ -#define SUID_DUMP_ROOT 2 /* Dump as root */ /* Stack area protections */ #define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */ @@ -116,6 +117,7 @@ unsigned long stack_top, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); +extern int bprm_change_interp(char *interp, struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); --- linux-ec2-2.6.32.orig/include/linux/if_pppox.h +++ linux-ec2-2.6.32/include/linux/if_pppox.h @@ -108,11 +108,11 @@ struct pppoe_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 ver : 4; __u8 type : 4; + __u8 ver : 4; #elif defined(__BIG_ENDIAN_BITFIELD) - __u8 type : 4; __u8 ver : 4; + __u8 type : 4; #else #error "Please fix " #endif --- linux-ec2-2.6.32.orig/include/linux/cpufreq.h +++ linux-ec2-2.6.32/include/linux/cpufreq.h @@ -302,7 +302,7 @@ #endif /* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */ -#ifdef CONFIG_CPU_FREQ +#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) unsigned int cpufreq_quick_get(unsigned int cpu); #else static inline unsigned int cpufreq_quick_get(unsigned int cpu) --- linux-ec2-2.6.32.orig/include/linux/kexec.h +++ linux-ec2-2.6.32/include/linux/kexec.h @@ -46,6 +46,13 @@ KEXEC_CORE_NOTE_NAME_BYTES + \ KEXEC_CORE_NOTE_DESC_BYTES ) +#ifndef KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) page_to_pfn(page) +#define kexec_pfn_to_page(pfn) pfn_to_page(pfn) +#define kexec_virt_to_phys(addr) virt_to_phys(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(addr) +#endif + /* * This structure is used to hold the arguments that are used when loading * kernel binaries. @@ -112,6 +119,12 @@ extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); +#ifdef CONFIG_XEN +extern int xen_machine_kexec_load(struct kimage *image); +extern void xen_machine_kexec_unload(struct kimage *image); +extern void xen_machine_kexec_setup_resources(void); +extern void xen_machine_kexec_register_resources(struct resource *res); +#endif extern asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, @@ -192,8 +205,15 @@ #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#if !defined(CONFIG_XEN) || !defined(CONFIG_X86) #define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \ + VMCOREINFO_NOTE_NAME_BYTES) +#else +#define VMCOREINFO_NOTE_SIZE ALIGN(KEXEC_NOTE_HEAD_BYTES*2 \ + + VMCOREINFO_BYTES \ + + VMCOREINFO_NOTE_NAME_BYTES, \ + PAGE_SIZE) +#endif /* Location of a reserved region to hold the crash kernel. */ --- linux-ec2-2.6.32.orig/include/linux/aio.h +++ linux-ec2-2.6.32/include/linux/aio.h @@ -201,6 +201,12 @@ struct delayed_work wq; +#ifdef CONFIG_EPOLL + /* poll integration */ + wait_queue_head_t poll_wait; + struct file *file; +#endif + struct rcu_head rcu_head; }; --- linux-ec2-2.6.32.orig/include/linux/regset.h +++ linux-ec2-2.6.32/include/linux/regset.h @@ -335,8 +335,11 @@ { const struct user_regset *regset = &view->regsets[setno]; + if (!regset->get) + return -EOPNOTSUPP; + if (!access_ok(VERIFY_WRITE, data, size)) - return -EIO; + return -EFAULT; return regset->get(target, regset, offset, size, NULL, data); } @@ -358,8 +361,11 @@ { const struct user_regset *regset = &view->regsets[setno]; + if (!regset->set) + return -EOPNOTSUPP; + if (!access_ok(VERIFY_READ, data, size)) - return -EIO; + return -EFAULT; return regset->set(target, regset, offset, size, NULL, data); } --- linux-ec2-2.6.32.orig/include/linux/freezer.h +++ linux-ec2-2.6.32/include/linux/freezer.h @@ -64,9 +64,12 @@ extern void cancel_freezing(struct task_struct *p); #ifdef CONFIG_CGROUP_FREEZER -extern int cgroup_frozen(struct task_struct *task); +extern int cgroup_freezing_or_frozen(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline int cgroup_frozen(struct task_struct *task) { return 0; } +static inline int cgroup_freezing_or_frozen(struct task_struct *task) +{ + return 0; +} #endif /* !CONFIG_CGROUP_FREEZER */ /* --- linux-ec2-2.6.32.orig/include/linux/fb.h +++ linux-ec2-2.6.32/include/linux/fb.h @@ -784,8 +784,6 @@ #define FBINFO_MISC_USEREVENT 0x10000 /* event request from userspace */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ -#define FBINFO_MISC_FIRMWARE 0x40000 /* a replaceable firmware - inited framebuffer */ /* A driver may set this flag to indicate that it does want a set_par to be * called every time when fbcon_switch is executed. The advantage is that with @@ -799,6 +797,8 @@ */ #define FBINFO_MISC_ALWAYS_SETPAR 0x40000 +/* where the fb is a firmware driver, and can be replaced with a proper one */ +#define FBINFO_MISC_FIRMWARE 0x80000 /* * Host and GPU endianness differ. */ --- linux-ec2-2.6.32.orig/include/linux/init.h +++ linux-ec2-2.6.32/include/linux/init.h @@ -186,6 +186,7 @@ #define core_initcall(fn) __define_initcall("1",fn,1) #define core_initcall_sync(fn) __define_initcall("1s",fn,1s) +#define earlyrootfs_initcall(fn) __define_initcall("earlyrootfs",fn,rootfs) #define postcore_initcall(fn) __define_initcall("2",fn,2) #define postcore_initcall_sync(fn) __define_initcall("2s",fn,2s) #define arch_initcall(fn) __define_initcall("3",fn,3) @@ -213,6 +214,8 @@ static initcall_t __initcall_##fn \ __used __section(.security_initcall.init) = fn +extern struct list_head populate_rootfs_domain; + struct obs_kernel_param { const char *str; int (*setup_func)(char *); --- linux-ec2-2.6.32.orig/include/linux/inetdevice.h +++ linux-ec2-2.6.32/include/linux/inetdevice.h @@ -83,6 +83,7 @@ #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) +#define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_BOOTP_RELAY(in_dev) IN_DEV_ANDCONF((in_dev), BOOTP_RELAY) --- linux-ec2-2.6.32.orig/include/linux/cpuset.h +++ linux-ec2-2.6.32/include/linux/cpuset.h @@ -21,8 +21,7 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask); +extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -69,9 +68,6 @@ extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); -extern void cpuset_lock(void); -extern void cpuset_unlock(void); - extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) @@ -105,10 +101,11 @@ { cpumask_copy(mask, cpu_possible_mask); } -static inline void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask) + +static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) { - cpumask_copy(mask, cpu_possible_mask); + cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + return cpumask_any(cpu_active_mask); } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) @@ -157,9 +154,6 @@ { } -static inline void cpuset_lock(void) {} -static inline void cpuset_unlock(void) {} - static inline int cpuset_mem_spread_node(void) { return 0; --- linux-ec2-2.6.32.orig/include/linux/msi.h +++ linux-ec2-2.6.32/include/linux/msi.h @@ -14,8 +14,10 @@ extern void mask_msi_irq(unsigned int irq); extern void unmask_msi_irq(unsigned int irq); extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); +extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); +extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); struct msi_desc { --- linux-ec2-2.6.32.orig/include/linux/cpumask.h +++ linux-ec2-2.6.32/include/linux/cpumask.h @@ -84,6 +84,7 @@ #define num_online_cpus() cpumask_weight(cpu_online_mask) #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) +#define num_active_cpus() cpumask_weight(cpu_active_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) @@ -92,6 +93,7 @@ #define num_online_cpus() 1 #define num_possible_cpus() 1 #define num_present_cpus() 1 +#define num_active_cpus() 1 #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) --- linux-ec2-2.6.32.orig/include/linux/vmstat.h +++ linux-ec2-2.6.32/include/linux/vmstat.h @@ -166,6 +166,28 @@ return x; } +/* + * More accurate version that also considers the currently pending + * deltas. For that we need to loop over all cpus to find the current + * deltas. There is no synchronization so the result cannot be + * exactly accurate either. + */ +static inline unsigned long zone_page_state_snapshot(struct zone *zone, + enum zone_stat_item item) +{ + long x = atomic_long_read(&zone->vm_stat[item]); + +#ifdef CONFIG_SMP + int cpu; + for_each_online_cpu(cpu) + x += zone_pcp(zone, cpu)->vm_stat_diff[item]; + + if (x < 0) + x = 0; +#endif + return x; +} + extern unsigned long global_reclaimable_pages(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); --- linux-ec2-2.6.32.orig/include/linux/hardirq.h +++ linux-ec2-2.6.32/include/linux/hardirq.h @@ -64,6 +64,8 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + #ifndef PREEMPT_ACTIVE #define PREEMPT_ACTIVE_BITS 1 #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) @@ -82,10 +84,13 @@ /* * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) /* * Are we in NMI context? @@ -132,10 +137,12 @@ struct task_struct; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) static inline void account_system_vtime(struct task_struct *tsk) { } +#else +extern void account_system_vtime(struct task_struct *tsk); #endif #if defined(CONFIG_NO_HZ) --- linux-ec2-2.6.32.orig/include/linux/cryptohash.h +++ linux-ec2-2.6.32/include/linux/cryptohash.h @@ -7,6 +7,11 @@ void sha_init(__u32 *buf); void sha_transform(__u32 *digest, const char *data, __u32 *W); +#define MD5_DIGEST_WORDS 4 +#define MD5_MESSAGE_BYTES 64 + +void md5_transform(__u32 *hash, __u32 const *in); + __u32 half_md4_transform(__u32 buf[4], __u32 const in[8]); #endif --- linux-ec2-2.6.32.orig/include/linux/tty.h +++ linux-ec2-2.6.32/include/linux/tty.h @@ -68,6 +68,17 @@ unsigned long data[0]; }; +/* + * We default to dicing tty buffer allocations to this many characters + * in order to avoid multiple page allocations. We know the size of + * tty_buffer itself but it must also be taken into account that the + * the buffer is 256 byte aligned. See tty_buffer_find for the allocation + * logic this must match + */ + +#define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~0xFF) + + struct tty_bufhead { struct delayed_work work; spinlock_t lock; --- linux-ec2-2.6.32.orig/include/linux/jiffies.h +++ linux-ec2-2.6.32/include/linux/jiffies.h @@ -303,7 +303,7 @@ extern unsigned long timeval_to_jiffies(const struct timeval *value); extern void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value); -extern clock_t jiffies_to_clock_t(long x); +extern clock_t jiffies_to_clock_t(unsigned long x); extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); --- linux-ec2-2.6.32.orig/include/linux/fs.h +++ linux-ec2-2.6.32/include/linux/fs.h @@ -21,7 +21,8 @@ /* Fixed constants first: */ #undef NR_OPEN -#define INR_OPEN 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */ +#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< +#else +static inline int arch_get_random_long(unsigned long *v) +{ + return 0; +} +static inline int arch_get_random_int(unsigned int *v) +{ + return 0; +} +#endif + #endif /* __KERNEL___ */ #endif /* _LINUX_RANDOM_H */ --- linux-ec2-2.6.32.orig/include/linux/skbuff.h +++ linux-ec2-2.6.32/include/linux/skbuff.h @@ -278,6 +278,8 @@ * @local_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @nohdr: Payload reference only, must not modify header + * @proto_data_valid: Protocol data validated since arriving at localhost + * @proto_csum_blank: Protocol csum must be added before leaving localhost * @pkt_type: Packet class * @fclone: skbuff clone status * @ip_summed: Driver fed us an IP checksum @@ -379,9 +381,13 @@ #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif +#ifdef CONFIG_XEN + __u8 proto_data_valid:1, + proto_csum_blank:1; +#endif kmemcheck_bitfield_end(flags2); - /* 0/14 bit hole */ + /* 0/12/14 bit hole */ #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; @@ -641,6 +647,16 @@ (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } +static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(pri & __GFP_WAIT); + + if (skb_cloned(skb)) + return pskb_expand_head(skb, 0, 0, pri); + + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check @@ -1312,6 +1328,16 @@ } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_mac_header_rebuild(struct sk_buff *skb) +{ + if (skb_mac_header_was_set(skb)) { + const unsigned char *old_mac = skb_mac_header(skb); + + skb_set_mac_header(skb, -skb->mac_len); + memmove(skb_mac_header(skb), old_mac, skb->mac_len); + } +} + static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; @@ -2085,5 +2111,12 @@ } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); + +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) +int skb_checksum_setup(struct sk_buff *skb); +#else +static inline int skb_checksum_setup(struct sk_buff *skb) { return 0; } +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ --- linux-ec2-2.6.32.orig/include/linux/poison.h +++ linux-ec2-2.6.32/include/linux/poison.h @@ -2,13 +2,25 @@ #define _LINUX_POISON_H /********** include/linux/list.h **********/ + +/* + * Architectures might want to move the poison pointer offset + * into some well-recognized area such as 0xdead000000000000, + * that is also not mappable by user-space exploits: + */ +#ifdef CONFIG_ILLEGAL_POINTER_VALUE +# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL) +#else +# define POISON_POINTER_DELTA 0 +#endif + /* * These are non-NULL pointers that will result in page faults * under normal circumstances, used to verify that nobody uses * non-initialized list entries. */ -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) +#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) /********** include/linux/timer.h **********/ /* @@ -36,6 +48,15 @@ #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ +/********** mm/hugetlb.c **********/ +/* + * Private mappings of hugetlb pages use this poisoned value for + * page->mapping. The core VM should not be doing anything with this mapping + * but futex requires the existence of some page->mapping value even though it + * is unused if PAGE_MAPPING_ANON is set. + */ +#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON)) + /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc --- linux-ec2-2.6.32.orig/include/linux/compat.h +++ linux-ec2-2.6.32/include/linux/compat.h @@ -309,5 +309,11 @@ asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode); +extern void __user *compat_alloc_user_space(unsigned long len); + +#else + +#define is_compat_task() (0) + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ --- linux-ec2-2.6.32.orig/include/linux/pci_ids.h +++ linux-ec2-2.6.32/include/linux/pci_ids.h @@ -393,6 +393,9 @@ #define PCI_DEVICE_ID_VLSI_82C147 0x0105 #define PCI_DEVICE_ID_VLSI_VAS96011 0x0702 +/* AMD RD890 Chipset */ +#define PCI_DEVICE_ID_RD890_IOMMU 0x5a23 + #define PCI_VENDOR_ID_ADL 0x1005 #define PCI_DEVICE_ID_ADL_2301 0x2301 @@ -602,6 +605,8 @@ #define PCI_DEVICE_ID_MATROX_G550 0x2527 #define PCI_DEVICE_ID_MATROX_VIA 0x4536 +#define PCI_VENDOR_ID_MOBILITY_ELECTRONICS 0x14f2 + #define PCI_VENDOR_ID_CT 0x102c #define PCI_DEVICE_ID_CT_69000 0x00c0 #define PCI_DEVICE_ID_CT_65545 0x00d8 @@ -1262,6 +1267,7 @@ #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE 0x0759 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_SMBUS 0x07D8 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS 0x0AA2 +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA 0x0D85 #define PCI_VENDOR_ID_IMS 0x10e0 #define PCI_DEVICE_ID_IMS_TT128 0x9128 @@ -2030,6 +2036,7 @@ #define PCI_DEVICE_ID_AFAVLAB_P030 0x2182 #define PCI_SUBDEVICE_ID_AFAVLAB_P061 0x2150 +#define PCI_VENDOR_ID_BCM_GVC 0x14a4 #define PCI_VENDOR_ID_BROADCOM 0x14e4 #define PCI_DEVICE_ID_TIGON3_5752 0x1600 #define PCI_DEVICE_ID_TIGON3_5752M 0x1601 @@ -2290,6 +2297,20 @@ #define PCI_DEVICE_ID_MPC8536 0x0051 #define PCI_DEVICE_ID_P2020E 0x0070 #define PCI_DEVICE_ID_P2020 0x0071 +#define PCI_DEVICE_ID_P2010E 0x0078 +#define PCI_DEVICE_ID_P2010 0x0079 +#define PCI_DEVICE_ID_P1020E 0x0100 +#define PCI_DEVICE_ID_P1020 0x0101 +#define PCI_DEVICE_ID_P1011E 0x0108 +#define PCI_DEVICE_ID_P1011 0x0109 +#define PCI_DEVICE_ID_P1022E 0x0110 +#define PCI_DEVICE_ID_P1022 0x0111 +#define PCI_DEVICE_ID_P1013E 0x0118 +#define PCI_DEVICE_ID_P1013 0x0119 +#define PCI_DEVICE_ID_P4080E 0x0400 +#define PCI_DEVICE_ID_P4080 0x0401 +#define PCI_DEVICE_ID_P4040E 0x0408 +#define PCI_DEVICE_ID_P4040 0x0409 #define PCI_DEVICE_ID_MPC8641 0x7010 #define PCI_DEVICE_ID_MPC8641D 0x7011 #define PCI_DEVICE_ID_MPC8610 0x7018 @@ -2303,6 +2324,7 @@ #define PCI_VENDOR_ID_JMICRON 0x197B #define PCI_DEVICE_ID_JMICRON_JMB360 0x2360 #define PCI_DEVICE_ID_JMICRON_JMB361 0x2361 +#define PCI_DEVICE_ID_JMICRON_JMB362 0x2362 #define PCI_DEVICE_ID_JMICRON_JMB363 0x2363 #define PCI_DEVICE_ID_JMICRON_JMB365 0x2365 #define PCI_DEVICE_ID_JMICRON_JMB366 0x2366 @@ -2398,6 +2420,9 @@ #define PCI_DEVICE_ID_INTEL_82840_HB 0x1a21 #define PCI_DEVICE_ID_INTEL_82845_HB 0x1a30 #define PCI_DEVICE_ID_INTEL_IOAT 0x1a38 +#define PCI_DEVICE_ID_INTEL_CPT_SMBUS 0x1c22 +#define PCI_DEVICE_ID_INTEL_CPT_LPC1 0x1c42 +#define PCI_DEVICE_ID_INTEL_CPT_LPC2 0x1c43 #define PCI_DEVICE_ID_INTEL_82801AA_0 0x2410 #define PCI_DEVICE_ID_INTEL_82801AA_1 0x2411 #define PCI_DEVICE_ID_INTEL_82801AA_3 0x2413 --- linux-ec2-2.6.32.orig/include/linux/syscalls.h +++ linux-ec2-2.6.32/include/linux/syscalls.h @@ -153,7 +153,8 @@ #define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) #define SYSCALL_TRACE_ENTER_EVENT(sname) \ - static struct ftrace_event_call event_enter_##sname; \ + static struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_enter_##sname; \ struct trace_event enter_syscall_print_##sname = { \ .trace = print_syscall_enter, \ }; \ @@ -189,7 +190,8 @@ } #define SYSCALL_TRACE_EXIT_EVENT(sname) \ - static struct ftrace_event_call event_exit_##sname; \ + static struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_exit_##sname; \ struct trace_event exit_syscall_print_##sname = { \ .trace = print_syscall_exit, \ }; \ @@ -879,4 +881,8 @@ asmlinkage long sys_perf_event_open( struct perf_event_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); + +asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); #endif --- linux-ec2-2.6.32.orig/include/linux/elfnote.h +++ linux-ec2-2.6.32/include/linux/elfnote.h @@ -52,7 +52,7 @@ 4484:.balign 4 ; \ .popsection ; -#define ELFNOTE(name, type, desc) \ +#define ELFNOTE(name, type, desc...) \ ELFNOTE_START(name, type, "") \ desc ; \ ELFNOTE_END --- linux-ec2-2.6.32.orig/include/linux/ftrace.h +++ linux-ec2-2.6.32/include/linux/ftrace.h @@ -412,6 +412,7 @@ extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); +extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); static inline int task_curr_ret_stack(struct task_struct *t) { @@ -435,6 +436,7 @@ static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { } static inline int task_curr_ret_stack(struct task_struct *tsk) { --- linux-ec2-2.6.32.orig/include/linux/msdos_fs.h +++ linux-ec2-2.6.32/include/linux/msdos_fs.h @@ -47,6 +47,7 @@ #define DELETED_FLAG 0xe5 /* marks file as deleted when in name[0] */ #define IS_FREE(n) (!*(n) || *(n) == DELETED_FLAG) +#define FAT_LFN_LEN 255 /* maximum long name length */ #define MSDOS_NAME 11 /* maximum name length */ #define MSDOS_LONGNAME 256 /* maximum name length */ #define MSDOS_SLOTS 21 /* max # of slots for short and long names */ --- linux-ec2-2.6.32.orig/include/linux/hid.h +++ linux-ec2-2.6.32/include/linux/hid.h @@ -312,6 +312,7 @@ #define HID_QUIRK_MULTI_INPUT 0x00000040 #define HID_QUIRK_SKIP_OUTPUT_REPORTS 0x00010000 #define HID_QUIRK_FULLSPEED_INTERVAL 0x10000000 +#define HID_QUIRK_NO_INIT_REPORTS 0x20000000 /* * This is the global environment of the parser. This information is @@ -362,6 +363,7 @@ struct hid_usage { unsigned hid; /* hid usage code */ unsigned collection_index; /* index into collection array */ + unsigned usage_index; /* index into usage array */ /* hidinput data */ __u16 code; /* input driver code */ __u8 type; /* input driver type */ @@ -409,10 +411,12 @@ struct hid_device *device; /* associated device */ }; +#define HID_MAX_IDS 256 + struct hid_report_enum { unsigned numbered; struct list_head report_list; - struct hid_report *report_id_hash[256]; + struct hid_report *report_id_hash[HID_MAX_IDS]; }; #define HID_REPORT_TYPES 3 @@ -690,6 +694,10 @@ void hid_output_report(struct hid_report *report, __u8 *data); struct hid_device *hid_allocate_device(void); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); +struct hid_report *hid_validate_values(struct hid_device *hid, + unsigned int type, unsigned int id, + unsigned int field_index, + unsigned int report_counts); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); void hid_disconnect(struct hid_device *hid); --- linux-ec2-2.6.32.orig/include/linux/console.h +++ linux-ec2-2.6.32/include/linux/console.h @@ -63,6 +63,7 @@ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ extern const struct consw prom_con; /* SPARC PROM console */ +extern int console_use_vt; int con_is_bound(const struct consw *csw); int register_con_driver(const struct consw *csw, int first, int last); --- linux-ec2-2.6.32.orig/include/linux/ext3_fs.h +++ linux-ec2-2.6.32/include/linux/ext3_fs.h @@ -180,8 +180,8 @@ /* Flags that should be inherited by new inodes from their parent. */ #define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ - EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\ - EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\ + EXT3_SYNC_FL | EXT3_NODUMP_FL |\ + EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) --- linux-ec2-2.6.32.orig/include/linux/mempolicy.h +++ linux-ec2-2.6.32/include/linux/mempolicy.h @@ -180,7 +180,7 @@ struct shared_policy { struct rb_root root; - spinlock_t lock; + struct mutex mutex; }; void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); --- linux-ec2-2.6.32.orig/include/linux/vmalloc.h +++ linux-ec2-2.6.32/include/linux/vmalloc.h @@ -13,6 +13,7 @@ #define VM_MAP 0x00000004 /* vmap()ed pages */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ +#define VM_UNLIST 0x00000020 /* vm_struct is not listed in vmlist */ /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -115,9 +116,11 @@ extern struct vm_struct *vmlist; extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align, gfp_t gfp_mask); +#endif void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); --- linux-ec2-2.6.32.orig/include/linux/splice.h +++ linux-ec2-2.6.32/include/linux/splice.h @@ -82,4 +82,10 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); +extern long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + #endif --- linux-ec2-2.6.32.orig/include/linux/icmpv6.h +++ linux-ec2-2.6.32/include/linux/icmpv6.h @@ -123,6 +123,8 @@ #define ICMPV6_NOT_NEIGHBOUR 2 #define ICMPV6_ADDR_UNREACH 3 #define ICMPV6_PORT_UNREACH 4 +#define ICMPV6_POLICY_FAIL 5 +#define ICMPV6_REJECT_ROUTE 6 /* * Codes for Time Exceeded --- linux-ec2-2.6.32.orig/include/linux/jbd2.h +++ linux-ec2-2.6.32/include/linux/jbd2.h @@ -653,6 +653,7 @@ * waiting for it to finish. */ unsigned int t_synchronous_commit:1; + unsigned int t_flushed_data_blocks:1; /* * For use by the filesystem to store fs-specific data --- linux-ec2-2.6.32.orig/include/linux/ktime.h +++ linux-ec2-2.6.32/include/linux/ktime.h @@ -58,13 +58,6 @@ typedef union ktime ktime_t; /* Kill this */ -#define KTIME_MAX ((s64)~((u64)1 << 63)) -#if (BITS_PER_LONG == 64) -# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#else -# define KTIME_SEC_MAX LONG_MAX -#endif - /* * ktime_t definitions when using the 64-bit scalar representation: */ --- linux-ec2-2.6.32.orig/include/linux/proportions.h +++ linux-ec2-2.6.32/include/linux/proportions.h @@ -81,7 +81,11 @@ * Limit the time part in order to ensure there are some bits left for the * cycle counter and fraction multiply. */ +#if BITS_PER_LONG == 32 #define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) +#else +#define PROP_MAX_SHIFT (BITS_PER_LONG/2) +#endif #define PROP_FRAC_SHIFT (BITS_PER_LONG - PROP_MAX_SHIFT - 1) #define PROP_FRAC_BASE (1UL << PROP_FRAC_SHIFT) --- linux-ec2-2.6.32.orig/include/linux/socket.h +++ linux-ec2-2.6.32/include/linux/socket.h @@ -246,7 +246,7 @@ #define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ - +#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_EOF MSG_FIN #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file --- linux-ec2-2.6.32.orig/include/linux/module.h +++ linux-ec2-2.6.32/include/linux/module.h @@ -724,4 +724,13 @@ static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ +/* + * Establish a symbolic link between 2 modules so that depmod + * and modprobe do the heavy lifting of loading the modules in the + * correct dependency order. + */ +#define MODULE_EXPORT(mod_name) int sym_link_##mod_name; EXPORT_SYMBOL(sym_link_##mod_name); +#define MODULE_IMPORT(mod_name) extern int sym_link_##mod_name; int func_sym_link_##mod_name(void) {sym_link_##mod_name=1;}; EXPORT_SYMBOL(func_sym_link_##mod_name); + + #endif /* _LINUX_MODULE_H */ --- linux-ec2-2.6.32.orig/include/linux/hrtimer.h +++ linux-ec2-2.6.32/include/linux/hrtimer.h @@ -159,21 +159,28 @@ * and timers * @clock_base: array of clock bases for this cpu * @curr_timer: the timer which is executing a callback right now + * @clock_was_set: Indicates that clock was set from irq context. * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode - * @check_clocks: Indictator, when set evaluate time source and clock - * event devices whether high resolution mode can be - * activated. - * @nr_events: Total number of timer interrupt events + * @hang_detected: The last hrtimer interrupt detected a hang + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt */ struct hrtimer_cpu_base { spinlock_t lock; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; + int hang_detected; unsigned long nr_events; + unsigned long nr_retries; + unsigned long nr_hangs; + ktime_t max_hang_time; #endif }; @@ -275,6 +282,8 @@ # define MONOTONIC_RES_NSEC HIGH_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_HIGH_RES +extern void clock_was_set_delayed(void); + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC @@ -303,11 +312,14 @@ { return 0; } + +static inline void clock_was_set_delayed(void) { } + #endif extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); - +extern ktime_t ktime_get_update_offsets(ktime_t *offs_real); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); @@ -446,7 +458,7 @@ static inline void timer_stats_account_hrtimer(struct hrtimer *timer) { - if (likely(!timer->start_site)) + if (likely(!timer_stats_active)) return; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, timer->function, timer->start_comm, 0); @@ -457,8 +469,6 @@ static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { - if (likely(!timer_stats_active)) - return; __timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0)); } --- linux-ec2-2.6.32.orig/include/linux/videodev2.h +++ linux-ec2-2.6.32/include/linux/videodev2.h @@ -858,6 +858,7 @@ #define V4L2_CTRL_FLAG_NEXT_CTRL 0x80000000 /* User-class control IDs defined by V4L2 */ +#define V4L2_CID_MAX_CTRLS 1024 #define V4L2_CID_BASE (V4L2_CTRL_CLASS_USER | 0x900) #define V4L2_CID_USER_BASE V4L2_CID_BASE /* IDs reserved for driver specific controls */ --- linux-ec2-2.6.32.orig/include/linux/net.h +++ linux-ec2-2.6.32/include/linux/net.h @@ -187,6 +187,14 @@ int optname, char __user *optval, int __user *optlen); int (*sendmsg) (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len); + /* Notes for implementing recvmsg: + * =============================== + * msg->msg_namelen should get updated by the recvmsg handlers + * iff msg_name != NULL. It is by default 0 to prevent + * returning uninitialized memory to user space. The recvfrom + * handlers can assume that msg.msg_name is either NULL or has + * a minimum size of sizeof(struct sockaddr_storage). + */ int (*recvmsg) (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len, int flags); --- linux-ec2-2.6.32.orig/include/linux/ext2_fs.h +++ linux-ec2-2.6.32/include/linux/ext2_fs.h @@ -196,8 +196,8 @@ /* Flags that should be inherited by new inodes from their parent. */ #define EXT2_FL_INHERITED (EXT2_SECRM_FL | EXT2_UNRM_FL | EXT2_COMPR_FL |\ - EXT2_SYNC_FL | EXT2_IMMUTABLE_FL | EXT2_APPEND_FL |\ - EXT2_NODUMP_FL | EXT2_NOATIME_FL | EXT2_COMPRBLK_FL|\ + EXT2_SYNC_FL | EXT2_NODUMP_FL |\ + EXT2_NOATIME_FL | EXT2_COMPRBLK_FL |\ EXT2_NOCOMP_FL | EXT2_JOURNAL_DATA_FL |\ EXT2_NOTAIL_FL | EXT2_DIRSYNC_FL) --- linux-ec2-2.6.32.orig/include/linux/pagemap.h +++ linux-ec2-2.6.32/include/linux/pagemap.h @@ -253,6 +253,8 @@ extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); +extern struct page * read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); --- linux-ec2-2.6.32.orig/include/linux/nls.h +++ linux-ec2-2.6.32/include/linux/nls.h @@ -43,7 +43,7 @@ UTF16_BIG_ENDIAN }; -/* nls.c */ +/* nls_base.c */ extern int register_nls(struct nls_table *); extern int unregister_nls(struct nls_table *); extern struct nls_table *load_nls(char *); @@ -52,7 +52,8 @@ extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); -extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs); +extern int utf8s_to_utf16s(const u8 *s, int len, + enum utf16_endian endian, wchar_t *pwcs, int maxlen); extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u8 *s, int maxlen); --- linux-ec2-2.6.32.orig/include/linux/enclosure.h +++ linux-ec2-2.6.32/include/linux/enclosure.h @@ -42,6 +42,8 @@ ENCLOSURE_STATUS_NOT_INSTALLED, ENCLOSURE_STATUS_UNKNOWN, ENCLOSURE_STATUS_UNAVAILABLE, + /* last element for counting purposes */ + ENCLOSURE_STATUS_MAX }; /* SFF-8485 activity light settings */ --- linux-ec2-2.6.32.orig/include/linux/fsnotify.h +++ linux-ec2-2.6.32/include/linux/fsnotify.h @@ -183,52 +183,61 @@ /* * fsnotify_access - file was read */ -static inline void fsnotify_access(struct dentry *dentry) +static inline void fsnotify_access(struct file *file) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; __u32 mask = FS_ACCESS; if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + if (!(file->f_mode & FMODE_NONOTIFY)) { + inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_parent(dentry, mask); + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); + } } /* * fsnotify_modify - file was modified */ -static inline void fsnotify_modify(struct dentry *dentry) +static inline void fsnotify_modify(struct file *file) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; __u32 mask = FS_MODIFY; if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + if (!(file->f_mode & FMODE_NONOTIFY)) { + inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_parent(dentry, mask); + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); + } } /* * fsnotify_open - file was opened */ -static inline void fsnotify_open(struct dentry *dentry) +static inline void fsnotify_open(struct file *file) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; __u32 mask = FS_OPEN; if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + if (!(file->f_mode & FMODE_NONOTIFY)) { + inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_parent(dentry, mask); + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); + } } /* @@ -244,10 +253,12 @@ if (S_ISDIR(inode->i_mode)) mask |= FS_IN_ISDIR; - inotify_inode_queue_event(inode, mask, 0, NULL, NULL); + if (!(file->f_mode & FMODE_NONOTIFY)) { + inotify_inode_queue_event(inode, mask, 0, NULL, NULL); - fsnotify_parent(dentry, mask); - fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); + fsnotify_parent(dentry, mask); + fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0); + } } /* --- linux-ec2-2.6.32.orig/include/linux/syslog.h +++ linux-ec2-2.6.32/include/linux/syslog.h @@ -0,0 +1,29 @@ +/* Syslog internals + * + * Copyright 2010 Canonical, Ltd. + * Author: Kees Cook + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _LINUX_SYSLOG_H +#define _LINUX_SYSLOG_H + +#define SYSLOG_FROM_CALL 0 +#define SYSLOG_FROM_FILE 1 + +int do_syslog(int type, char __user *buf, int count, bool from_file); + +#endif /* _LINUX_SYSLOG_H */ --- linux-ec2-2.6.32.orig/include/linux/ieee80211.h +++ linux-ec2-2.6.32/include/linux/ieee80211.h @@ -872,7 +872,7 @@ /* block-ack parameters */ #define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 #define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C -#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFA0 +#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0 #define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 #define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 @@ -1098,6 +1098,8 @@ WLAN_CATEGORY_SA_QUERY = 8, WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION = 9, WLAN_CATEGORY_WMM = 17, + WLAN_CATEGORY_MESH_PLINK = 30, /* Pending ANA approval */ + WLAN_CATEGORY_MESH_PATH_SEL = 32, /* Pending ANA approval */ WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126, WLAN_CATEGORY_VENDOR_SPECIFIC = 127, }; --- linux-ec2-2.6.32.orig/include/linux/swap.h +++ linux-ec2-2.6.32/include/linux/swap.h @@ -177,8 +177,60 @@ unsigned int max; unsigned int inuse_pages; unsigned int old_block_size; +#ifdef CONFIG_PRESWAP + unsigned long *preswap_map; + unsigned int preswap_pages; +#endif + void (*notify_swap_entry_free_fn) (unsigned long); }; +#ifdef CONFIG_PRESWAP + +#include +extern int preswap_sysctl_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); +extern const unsigned long preswap_zero, preswap_infinity; + +extern void preswap_shrink(unsigned long); +extern int preswap_test(struct swap_info_struct *, unsigned long); +extern void preswap_init(unsigned); +extern int preswap_put(struct page *); +extern int preswap_get(struct page *); +extern void preswap_flush(unsigned, unsigned long); +extern void preswap_flush_area(unsigned); +#else +static inline void preswap_shrink(unsigned long target_pages) +{ +} + +static inline int preswap_test(struct swap_info_struct *sis, unsigned long offset) +{ + return 0; +} + +static inline void preswap_init(unsigned type) +{ +} + +static inline int preswap_put(struct page *page) +{ + return 0; +} + +static inline int preswap_get(struct page *get) +{ + return 0; +} + +static inline void preswap_flush(unsigned type, unsigned long offset) +{ +} + +static inline void preswap_flush_area(unsigned type) +{ +} +#endif /* CONFIG_PRESWAP */ + struct swap_list_t { int head; /* head of priority-ordered swapfile list */ int next; /* swapfile to be used next */ @@ -218,21 +270,11 @@ __lru_cache_add(page, LRU_INACTIVE_ANON); } -static inline void lru_cache_add_active_anon(struct page *page) -{ - __lru_cache_add(page, LRU_ACTIVE_ANON); -} - static inline void lru_cache_add_file(struct page *page) { __lru_cache_add(page, LRU_INACTIVE_FILE); } -static inline void lru_cache_add_active_file(struct page *page) -{ - __lru_cache_add(page, LRU_ACTIVE_FILE); -} - /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); @@ -323,6 +365,7 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; +extern void set_notify_swap_entry_free(unsigned, void (*) (unsigned long)); /* linux/mm/thrash.c */ extern struct mm_struct *swap_token_mm; --- linux-ec2-2.6.32.orig/include/linux/topology.h +++ linux-ec2-2.6.32/include/linux/topology.h @@ -99,7 +99,7 @@ | 1*SD_WAKE_AFFINE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ , \ --- linux-ec2-2.6.32.orig/include/linux/lcm.h +++ linux-ec2-2.6.32/include/linux/lcm.h @@ -0,0 +1,8 @@ +#ifndef _LCM_H +#define _LCM_H + +#include + +unsigned long lcm(unsigned long a, unsigned long b) __attribute_const__; + +#endif /* _LCM_H */ --- linux-ec2-2.6.32.orig/include/linux/ssb/ssb_regs.h +++ linux-ec2-2.6.32/include/linux/ssb/ssb_regs.h @@ -170,7 +170,8 @@ #define SSB_SPROMSIZE_WORDS_R4 220 #define SSB_SPROMSIZE_BYTES_R123 (SSB_SPROMSIZE_WORDS_R123 * sizeof(u16)) #define SSB_SPROMSIZE_BYTES_R4 (SSB_SPROMSIZE_WORDS_R4 * sizeof(u16)) -#define SSB_SPROM_BASE 0x1000 +#define SSB_SPROM_BASE1 0x1000 +#define SSB_SPROM_BASE31 0x0800 #define SSB_SPROM_REVISION 0x107E #define SSB_SPROM_REVISION_REV 0x00FF /* SPROM Revision number */ #define SSB_SPROM_REVISION_CRC 0xFF00 /* SPROM CRC8 value */ --- linux-ec2-2.6.32.orig/include/linux/ssb/ssb.h +++ linux-ec2-2.6.32/include/linux/ssb/ssb.h @@ -301,6 +301,7 @@ /* ID information about the Chip. */ u16 chip_id; u16 chip_rev; + u16 sprom_offset; u16 sprom_size; /* number of words in sprom */ u8 chip_package; @@ -390,6 +391,9 @@ extern void ssb_bus_unregister(struct ssb_bus *bus); +/* Does the device have an SPROM? */ +extern bool ssb_is_sprom_available(struct ssb_bus *bus); + /* Set a fallback SPROM. * See kdoc at the function definition for complete documentation. */ extern int ssb_arch_set_fallback_sprom(const struct ssb_sprom *sprom); --- linux-ec2-2.6.32.orig/include/linux/ssb/ssb_driver_chipcommon.h +++ linux-ec2-2.6.32/include/linux/ssb/ssb_driver_chipcommon.h @@ -53,6 +53,7 @@ #define SSB_CHIPCO_CAP_64BIT 0x08000000 /* 64-bit Backplane */ #define SSB_CHIPCO_CAP_PMU 0x10000000 /* PMU available (rev >= 20) */ #define SSB_CHIPCO_CAP_ECI 0x20000000 /* ECI available (rev >= 20) */ +#define SSB_CHIPCO_CAP_SPROM 0x40000000 /* SPROM present */ #define SSB_CHIPCO_CORECTL 0x0008 #define SSB_CHIPCO_CORECTL_UARTCLK0 0x00000001 /* Drive UART with internal clock */ #define SSB_CHIPCO_CORECTL_SE 0x00000002 /* sync clk out enable (corerev >= 3) */ @@ -385,6 +386,7 @@ /** Chip specific Chip-Status register contents. */ +#define SSB_CHIPCO_CHST_4322_SPROM_EXISTS 0x00000040 /* SPROM present */ #define SSB_CHIPCO_CHST_4325_SPROM_OTP_SEL 0x00000003 #define SSB_CHIPCO_CHST_4325_DEFCIS_SEL 0 /* OTP is powered up, use def. CIS, no SPROM */ #define SSB_CHIPCO_CHST_4325_SPROM_SEL 1 /* OTP is powered up, SPROM is present */ @@ -398,6 +400,18 @@ #define SSB_CHIPCO_CHST_4325_RCAL_VALUE_SHIFT 4 #define SSB_CHIPCO_CHST_4325_PMUTOP_2B 0x00000200 /* 1 for 2b, 0 for to 2a */ +/** Macros to determine SPROM presence based on Chip-Status register. */ +#define SSB_CHIPCO_CHST_4312_SPROM_PRESENT(status) \ + ((status & SSB_CHIPCO_CHST_4325_SPROM_OTP_SEL) != \ + SSB_CHIPCO_CHST_4325_OTP_SEL) +#define SSB_CHIPCO_CHST_4322_SPROM_PRESENT(status) \ + (status & SSB_CHIPCO_CHST_4322_SPROM_EXISTS) +#define SSB_CHIPCO_CHST_4325_SPROM_PRESENT(status) \ + (((status & SSB_CHIPCO_CHST_4325_SPROM_OTP_SEL) != \ + SSB_CHIPCO_CHST_4325_DEFCIS_SEL) && \ + ((status & SSB_CHIPCO_CHST_4325_SPROM_OTP_SEL) != \ + SSB_CHIPCO_CHST_4325_OTP_SEL)) + /** Clockcontrol masks and values **/ @@ -564,6 +578,7 @@ struct ssb_chipcommon { struct ssb_device *dev; u32 capabilities; + u32 status; /* Fast Powerup Delay constant */ u16 fast_pwrup_delay; struct ssb_chipcommon_pmu pmu; --- linux-ec2-2.6.32.orig/include/linux/decompress/mm.h +++ linux-ec2-2.6.32/include/linux/decompress/mm.h @@ -14,11 +14,21 @@ /* Code active when included from pre-boot environment: */ +/* + * Some architectures want to ensure there is no local data in their + * pre-boot environment, so that data can arbitarily relocated (via + * GOT references). This is achieved by defining STATIC_RW_DATA to + * be null. + */ +#ifndef STATIC_RW_DATA +#define STATIC_RW_DATA static +#endif + /* A trivial malloc implementation, adapted from * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 */ -static unsigned long malloc_ptr; -static int malloc_count; +STATIC_RW_DATA unsigned long malloc_ptr; +STATIC_RW_DATA int malloc_count; static void *malloc(int size) { --- linux-ec2-2.6.32.orig/include/linux/netfilter/Kbuild +++ linux-ec2-2.6.32/include/linux/netfilter/Kbuild @@ -38,6 +38,8 @@ header-y += xt_pkttype.h header-y += xt_quota.h header-y += xt_rateest.h +header-y += xt_policy.h +header-y += xt_quota.h header-y += xt_realm.h header-y += xt_recent.h header-y += xt_sctp.h --- linux-ec2-2.6.32.orig/include/linux/netfilter/xt_recent.h +++ linux-ec2-2.6.32/include/linux/netfilter/xt_recent.h @@ -9,6 +9,7 @@ XT_RECENT_UPDATE = 1 << 2, XT_RECENT_REMOVE = 1 << 3, XT_RECENT_TTL = 1 << 4, + XT_RECENT_REAP = 1 << 5, XT_RECENT_SOURCE = 0, XT_RECENT_DEST = 1, @@ -16,6 +17,9 @@ XT_RECENT_NAME_LEN = 200, }; +/* Only allowed with --rcheck and --update */ +#define XT_RECENT_MODIFIERS (XT_RECENT_TTL|XT_RECENT_REAP) + struct xt_recent_mtinfo { __u32 seconds; __u32 hit_count; --- linux-ec2-2.6.32.orig/include/linux/nfsd/nfsd.h +++ linux-ec2-2.6.32/include/linux/nfsd/nfsd.h @@ -38,6 +38,7 @@ #define NFSD_MAY_OWNER_OVERRIDE 64 #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 +#define NFSD_MAY_READ_IF_EXEC 2048 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) --- linux-ec2-2.6.32.orig/include/linux/nfsd/xdr4.h +++ linux-ec2-2.6.32/include/linux/nfsd/xdr4.h @@ -480,18 +480,17 @@ static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { - BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); - cinfo->atomic = 1; + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); - if (cinfo->change_supported) { - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - } else { - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); --- linux-ec2-2.6.32.orig/include/linux/mfd/wm8350/pmic.h +++ linux-ec2-2.6.32/include/linux/mfd/wm8350/pmic.h @@ -666,20 +666,20 @@ #define WM8350_ISINK_FLASH_DUR_64MS (1 << 8) #define WM8350_ISINK_FLASH_DUR_96MS (2 << 8) #define WM8350_ISINK_FLASH_DUR_1024MS (3 << 8) -#define WM8350_ISINK_FLASH_ON_INSTANT (0 << 4) -#define WM8350_ISINK_FLASH_ON_0_25S (1 << 4) -#define WM8350_ISINK_FLASH_ON_0_50S (2 << 4) -#define WM8350_ISINK_FLASH_ON_1_00S (3 << 4) -#define WM8350_ISINK_FLASH_ON_1_95S (1 << 4) -#define WM8350_ISINK_FLASH_ON_3_91S (2 << 4) -#define WM8350_ISINK_FLASH_ON_7_80S (3 << 4) -#define WM8350_ISINK_FLASH_OFF_INSTANT (0 << 0) -#define WM8350_ISINK_FLASH_OFF_0_25S (1 << 0) -#define WM8350_ISINK_FLASH_OFF_0_50S (2 << 0) -#define WM8350_ISINK_FLASH_OFF_1_00S (3 << 0) -#define WM8350_ISINK_FLASH_OFF_1_95S (1 << 0) -#define WM8350_ISINK_FLASH_OFF_3_91S (2 << 0) -#define WM8350_ISINK_FLASH_OFF_7_80S (3 << 0) +#define WM8350_ISINK_FLASH_ON_INSTANT (0 << 0) +#define WM8350_ISINK_FLASH_ON_0_25S (1 << 0) +#define WM8350_ISINK_FLASH_ON_0_50S (2 << 0) +#define WM8350_ISINK_FLASH_ON_1_00S (3 << 0) +#define WM8350_ISINK_FLASH_ON_1_95S (1 << 0) +#define WM8350_ISINK_FLASH_ON_3_91S (2 << 0) +#define WM8350_ISINK_FLASH_ON_7_80S (3 << 0) +#define WM8350_ISINK_FLASH_OFF_INSTANT (0 << 4) +#define WM8350_ISINK_FLASH_OFF_0_25S (1 << 4) +#define WM8350_ISINK_FLASH_OFF_0_50S (2 << 4) +#define WM8350_ISINK_FLASH_OFF_1_00S (3 << 4) +#define WM8350_ISINK_FLASH_OFF_1_95S (1 << 4) +#define WM8350_ISINK_FLASH_OFF_3_91S (2 << 4) +#define WM8350_ISINK_FLASH_OFF_7_80S (3 << 4) /* * Regulator Interrupts. --- linux-ec2-2.6.32.orig/include/linux/mmc/host.h +++ linux-ec2-2.6.32/include/linux/mmc/host.h @@ -120,6 +120,7 @@ unsigned int f_min; unsigned int f_max; u32 ocr_avail; + struct notifier_block pm_notify; #define MMC_VDD_165_195 0x00000080 /* VDD voltage 1.65 - 1.95 */ #define MMC_VDD_20_21 0x00000100 /* VDD voltage 2.0 ~ 2.1 */ @@ -177,6 +178,7 @@ /* Only used with MMC_CAP_DISABLE */ int enabled; /* host is enabled */ + int rescan_disable; /* disable card detection */ int nesting_cnt; /* "enable" nesting count */ int en_dis_recurs; /* detect recursion */ unsigned int disable_delay; /* disable delay in msecs */ @@ -249,6 +251,7 @@ int mmc_host_enable(struct mmc_host *host); int mmc_host_disable(struct mmc_host *host); int mmc_host_lazy_disable(struct mmc_host *host); +int mmc_pm_notify(struct notifier_block *notify_block, unsigned long, void *); static inline void mmc_set_disable_delay(struct mmc_host *host, unsigned int disable_delay) --- linux-ec2-2.6.32.orig/include/linux/mmc/sdio.h +++ linux-ec2-2.6.32/include/linux/mmc/sdio.h @@ -94,6 +94,8 @@ #define SDIO_BUS_WIDTH_1BIT 0x00 #define SDIO_BUS_WIDTH_4BIT 0x02 +#define SDIO_BUS_ECSI 0x20 /* Enable continuous SPI interrupt */ +#define SDIO_BUS_SCSI 0x40 /* Support continuous SPI interrupt */ #define SDIO_BUS_CD_DISABLE 0x80 /* disable pull-up on DAT3 (pin 1) */ --- linux-ec2-2.6.32.orig/include/linux/sunrpc/svcsock.h +++ linux-ec2-2.6.32/include/linux/sunrpc/svcsock.h @@ -34,7 +34,7 @@ /* * Function prototypes. */ -void svc_close_all(struct list_head *); +void svc_close_all(struct svc_serv *); int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); --- linux-ec2-2.6.32.orig/include/linux/sunrpc/sched.h +++ linux-ec2-2.6.32/include/linux/sunrpc/sched.h @@ -84,7 +84,8 @@ long tk_rtt; /* round-trip time (jiffies) */ pid_t tk_owner; /* Process id for batching tasks */ - unsigned char tk_priority : 2;/* Task priority */ + unsigned char tk_priority : 2,/* Task priority */ + tk_rebind_retry : 2; #ifdef RPC_DEBUG unsigned short tk_pid; /* debugging aid */ --- linux-ec2-2.6.32.orig/include/linux/usb/serial.h +++ linux-ec2-2.6.32/include/linux/usb/serial.h @@ -259,6 +259,8 @@ int (*tiocmget)(struct tty_struct *tty, struct file *file); int (*tiocmset)(struct tty_struct *tty, struct file *file, unsigned int set, unsigned int clear); + int (*get_icount)(struct tty_struct *tty, + struct serial_icounter_struct *icount); /* Called by the tty layer for port level work. There may or may not be an attached tty at this point */ void (*dtr_rts)(struct usb_serial_port *port, int on); @@ -326,6 +328,9 @@ struct usb_serial_port *port, unsigned int ch); extern int usb_serial_handle_break(struct usb_serial_port *port); +extern void usb_serial_handle_dcd_change(struct usb_serial_port *usb_port, + struct tty_struct *tty, + unsigned int status); extern int usb_serial_bus_register(struct usb_serial_driver *device); --- linux-ec2-2.6.32.orig/include/linux/usb/quirks.h +++ linux-ec2-2.6.32/include/linux/usb/quirks.h @@ -19,4 +19,8 @@ /* device can't handle its Configuration or Interface strings */ #define USB_QUIRK_CONFIG_INTF_STRINGS 0x00000008 +/* device needs a pause during initialization, after we read the device + descriptor */ +#define USB_QUIRK_DELAY_INIT 0x00000040 + #endif /* __LINUX_USB_QUIRKS_H */ --- linux-ec2-2.6.32.orig/include/trace/ftrace.h +++ linux-ec2-2.6.32/include/trace/ftrace.h @@ -43,7 +43,8 @@ tstruct \ char __data[0]; \ }; \ - static struct ftrace_event_call event_##name + static struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##name #undef __cpparg #define __cpparg(arg...) arg @@ -159,7 +160,7 @@ #undef __get_str #undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) +#define TP_printk(fmt, args...) "\"%s\", %s\n", fmt, __stringify(args) #undef TP_fast_assign #define TP_fast_assign(args...) args --- linux-ec2-2.6.32.orig/include/trace/events/kmem.h +++ linux-ec2-2.6.32/include/trace/events/kmem.h @@ -293,7 +293,7 @@ TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s", __entry->page, - page_to_pfn(__entry->page), + __entry->page ? page_to_pfn(__entry->page) : 0, __entry->order, __entry->migratetype, show_gfp_flags(__entry->gfp_flags)) @@ -319,7 +319,7 @@ TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d", __entry->page, - page_to_pfn(__entry->page), + __entry->page ? page_to_pfn(__entry->page) : 0, __entry->order, __entry->migratetype, __entry->order == 0) --- linux-ec2-2.6.32.orig/include/trace/events/fs.h +++ linux-ec2-2.6.32/include/trace/events/fs.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include +#include + +TRACE_EVENT(do_sys_open, + + TP_PROTO(char *filename, int flags, int mode), + + TP_ARGS(filename, flags, mode), + + TP_STRUCT__entry( + __string( filename, filename ) + __field( int, flags ) + __field( int, mode ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + __entry->flags = flags; + __entry->mode = mode; + ), + + TP_printk("\"%s\" %x %o", + __get_str(filename), __entry->flags, __entry->mode) +); + +TRACE_EVENT(open_exec, + + TP_PROTO(char *filename), + + TP_ARGS(filename), + + TP_STRUCT__entry( + __string( filename, filename ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + ), + + TP_printk("\"%s\"", + __get_str(filename)) +); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include --- linux-ec2-2.6.32.orig/include/trace/events/vfs.h +++ linux-ec2-2.6.32/include/trace/events/vfs.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfs + +#if !defined(_TRACE_VFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFS_H + +/* + * Tracepoint for dirtying an inode: + */ +TRACE_EVENT(dirty_inode, + + TP_PROTO(struct inode *inode, struct task_struct *task), + + TP_ARGS(inode, task), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __array( char, dev, 16 ) + __array( char, file, 32 ) + ), + + TP_fast_assign( + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + strlcpy(__entry->file, name, 32); + strlcpy(__entry->dev, inode->i_sb->s_id, 16); + + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } + ), + + TP_printk("task=%i (%s) file=%s dev=%s", + __entry->pid, __entry->comm, __entry->file, __entry->dev) +); + +#endif /* _TRACE_VFS_H */ + +/* This part must be outside protection */ +#include --- linux-ec2-2.6.32.orig/include/asm-generic/signal.h +++ linux-ec2-2.6.32/include/asm-generic/signal.h @@ -99,6 +99,10 @@ #include +#ifdef SA_RESTORER +#define __ARCH_HAS_SA_RESTORER +#endif + struct sigaction { __sighandler_t sa_handler; unsigned long sa_flags; --- linux-ec2-2.6.32.orig/include/asm-generic/resource.h +++ linux-ec2-2.6.32/include/asm-generic/resource.h @@ -78,7 +78,7 @@ [RLIMIT_CORE] = { 0, RLIM_INFINITY }, \ [RLIMIT_RSS] = { RLIM_INFINITY, RLIM_INFINITY }, \ [RLIMIT_NPROC] = { 0, 0 }, \ - [RLIMIT_NOFILE] = { INR_OPEN, INR_OPEN }, \ + [RLIMIT_NOFILE] = { INR_OPEN_CUR, INR_OPEN_MAX }, \ [RLIMIT_MEMLOCK] = { MLOCK_LIMIT, MLOCK_LIMIT }, \ [RLIMIT_AS] = { RLIM_INFINITY, RLIM_INFINITY }, \ [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ --- linux-ec2-2.6.32.orig/include/asm-generic/poll.h +++ linux-ec2-2.6.32/include/asm-generic/poll.h @@ -28,6 +28,8 @@ #define POLLRDHUP 0x2000 #endif +#define POLLFREE 0x4000 /* currently only for epoll */ + struct pollfd { int fd; short events; --- linux-ec2-2.6.32.orig/include/asm-generic/vmlinux.lds.h +++ linux-ec2-2.6.32/include/asm-generic/vmlinux.lds.h @@ -585,6 +585,7 @@ *(.initcall0s.init) \ *(.initcall1.init) \ *(.initcall1s.init) \ + *(.initcallearlyrootfs.init) \ *(.initcall2.init) \ *(.initcall2s.init) \ *(.initcall3.init) \ --- linux-ec2-2.6.32.orig/include/asm-generic/dma-mapping-common.h +++ linux-ec2-2.6.32/include/asm-generic/dma-mapping-common.h @@ -131,7 +131,7 @@ debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } else - dma_sync_single_for_cpu(dev, addr, size, dir); + dma_sync_single_for_cpu(dev, addr + offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, @@ -148,7 +148,7 @@ debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); } else - dma_sync_single_for_device(dev, addr, size, dir); + dma_sync_single_for_device(dev, addr + offset, size, dir); } static inline void --- linux-ec2-2.6.32.orig/include/asm-generic/bug.h +++ linux-ec2-2.6.32/include/asm-generic/bug.h @@ -50,6 +50,22 @@ #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while(0) #endif +#define __WARN_RATELIMIT(condition, state, format...) \ +({ \ + int rtn = 0; \ + if (unlikely(__ratelimit(state))) \ + rtn = WARN(condition, format); \ + rtn; \ +}) + +#define WARN_RATELIMIT(condition, format...) \ +({ \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + __WARN_RATELIMIT(condition, &_rs, format); \ +}) + /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant issues that need prompt attention if they should ever --- linux-ec2-2.6.32.orig/include/asm-generic/fcntl.h +++ linux-ec2-2.6.32/include/asm-generic/fcntl.h @@ -3,8 +3,17 @@ #include +/* + * FMODE_EXEC is 0x20 + * FMODE_NONOTIFY is 0x1000000 + * These cannot be used by userspace O_* until internal and external open + * flags are split. + * -Eric Paris + */ + /* open/fcntl - O_SYNC is only implemented on blocks devices and on files located on an ext2 file system */ + #define O_ACCMODE 00000003 #define O_RDONLY 00000000 #define O_WRONLY 00000001 --- linux-ec2-2.6.32.orig/include/scsi/scsi.h +++ linux-ec2-2.6.32/include/scsi/scsi.h @@ -145,10 +145,10 @@ /* defined in T10 SCSI Primary Commands-2 (SPC2) */ struct scsi_varlen_cdb_hdr { - u8 opcode; /* opcode always == VARIABLE_LENGTH_CMD */ - u8 control; - u8 misc[5]; - u8 additional_cdb_length; /* total cdb length - 8 */ + __u8 opcode; /* opcode always == VARIABLE_LENGTH_CMD */ + __u8 control; + __u8 misc[5]; + __u8 additional_cdb_length; /* total cdb length - 8 */ __be16 service_action; /* service specific data follows */ }; --- linux-ec2-2.6.32.orig/include/scsi/scsi_bsg_fc.h +++ linux-ec2-2.6.32/include/scsi/scsi_bsg_fc.h @@ -292,7 +292,7 @@ struct fc_bsg_rport_els r_els; struct fc_bsg_rport_ct r_ct; } rqst_data; -}; +} __attribute__((packed)); /* response (request sense data) structure of the sg_io_v4 */ --- linux-ec2-2.6.32.orig/include/scsi/scsi_host.h +++ linux-ec2-2.6.32/include/scsi/scsi_host.h @@ -677,6 +677,12 @@ void *shost_data; /* + * Points to the physical bus device we'd use to do DMA + * Needed just in case we have virtual hosts. + */ + struct device *dma_dev; + + /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. @@ -720,7 +726,9 @@ extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int); -extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *); +extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, + struct device *, + struct device *); extern void scsi_scan_host(struct Scsi_Host *); extern void scsi_rescan_device(struct device *); extern void scsi_remove_host(struct Scsi_Host *); @@ -731,6 +739,12 @@ extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *); +static inline int __must_check scsi_add_host(struct Scsi_Host *host, + struct device *dev) +{ + return scsi_add_host_with_dma(host, dev, dev); +} + static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; --- linux-ec2-2.6.32.orig/include/scsi/osd_protocol.h +++ linux-ec2-2.6.32/include/scsi/osd_protocol.h @@ -17,6 +17,7 @@ #define __OSD_PROTOCOL_H__ #include +#include #include #include --- linux-ec2-2.6.32.orig/include/scsi/libfc.h +++ linux-ec2-2.6.32/include/scsi/libfc.h @@ -145,6 +145,7 @@ RPORT_ST_LOGO, /* port logout sent */ RPORT_ST_ADISC, /* Discover Address sent */ RPORT_ST_DELETE, /* port being deleted */ + RPORT_ST_RESTART, /* remote port being deleted and will restart */ }; /** --- linux-ec2-2.6.32.orig/include/scsi/fc_frame.h +++ linux-ec2-2.6.32/include/scsi/fc_frame.h @@ -37,6 +37,9 @@ #define FC_FRAME_HEADROOM 32 /* headroom for VLAN + FCoE headers */ #define FC_FRAME_TAILROOM 8 /* trailer space for FCoE */ +/* Max number of skb frags allowed, reserving one for fcoe_crc_eof page */ +#define FC_FRAME_SG_LEN (MAX_SKB_FRAGS - 1) + #define fp_skb(fp) (&((fp)->skb)) #define fr_hdr(fp) ((fp)->skb.data) #define fr_len(fp) ((fp)->skb.len) --- linux-ec2-2.6.32.orig/include/scsi/scsi_device.h +++ linux-ec2-2.6.32/include/scsi/scsi_device.h @@ -446,7 +446,7 @@ } static inline int scsi_device_enclosure(struct scsi_device *sdev) { - return sdev->inquiry[6] & (1<<6); + return sdev->inquiry ? (sdev->inquiry[6] & (1<<6)) : 1; } static inline int scsi_device_protection(struct scsi_device *sdev) --- linux-ec2-2.6.32.orig/include/scsi/scsi_netlink.h +++ linux-ec2-2.6.32/include/scsi/scsi_netlink.h @@ -23,7 +23,7 @@ #define SCSI_NETLINK_H #include - +#include /* * This file intended to be included by both kernel and user space @@ -105,8 +105,8 @@ * PCI : ID data is the 16 bit PCI Registered Vendor ID */ #define SCSI_NL_VID_TYPE_SHIFT 56 -#define SCSI_NL_VID_TYPE_MASK ((u64)0xFF << SCSI_NL_VID_TYPE_SHIFT) -#define SCSI_NL_VID_TYPE_PCI ((u64)0x01 << SCSI_NL_VID_TYPE_SHIFT) +#define SCSI_NL_VID_TYPE_MASK ((__u64)0xFF << SCSI_NL_VID_TYPE_SHIFT) +#define SCSI_NL_VID_TYPE_PCI ((__u64)0x01 << SCSI_NL_VID_TYPE_SHIFT) #define SCSI_NL_VID_ID_MASK (~ SCSI_NL_VID_TYPE_MASK) --- linux-ec2-2.6.32.orig/include/acpi/processor.h +++ linux-ec2-2.6.32/include/acpi/processor.h @@ -17,6 +17,12 @@ #define ACPI_PROCESSOR_MAX_THROTTLE 250 /* 25% */ #define ACPI_PROCESSOR_MAX_DUTY_WIDTH 4 +#ifdef CONFIG_XEN +#define NR_ACPI_CPUS (NR_CPUS < 256 ? 256 : NR_CPUS) +#else +#define NR_ACPI_CPUS NR_CPUS +#endif /* CONFIG_XEN */ + #define ACPI_PDC_REVISION_ID 0x1 #define ACPI_PSD_REV0_REVISION 0 /* Support for _PSD as in ACPI 3.0 */ @@ -42,13 +48,24 @@ struct acpi_processor_cx; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +struct acpi_csd_package { + acpi_integer num_entries; + acpi_integer revision; + acpi_integer domain; + acpi_integer coord_type; + acpi_integer num_processors; + acpi_integer index; +} __attribute__ ((packed)); +#endif + struct acpi_power_register { u8 descriptor; u16 length; u8 space_id; u8 bit_width; u8 bit_offset; - u8 reserved; + u8 access_size; u64 address; } __attribute__ ((packed)); @@ -74,6 +91,13 @@ u32 power; u32 usage; u64 time; + u8 bm_sts_skip; +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL + /* Require raw information for external control logic */ + struct acpi_power_register reg; + u32 csd_count; + struct acpi_csd_package *domain_info; +#endif struct acpi_processor_cx_policy promotion; struct acpi_processor_cx_policy demotion; char desc[ACPI_CX_DESC_LEN]; @@ -304,6 +328,9 @@ { return; } +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +int acpi_processor_ppc_has_changed(struct acpi_processor *pr); +#else static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr) { static unsigned int printout = 1; @@ -316,6 +343,7 @@ } return 0; } +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ #endif /* CONFIG_CPU_FREQ */ /* in processor_throttling.c */ @@ -353,4 +381,127 @@ } #endif +/* + * Following are interfaces geared to external processor PM control + * logic like a VMM + */ +/* Events notified to external control logic */ +#define PROCESSOR_PM_INIT 1 +#define PROCESSOR_PM_CHANGE 2 +#define PROCESSOR_HOTPLUG 3 + +/* Objects for the PM events */ +#define PM_TYPE_IDLE 0 +#define PM_TYPE_PERF 1 +#define PM_TYPE_THR 2 +#define PM_TYPE_MAX 3 + +/* Processor hotplug events */ +#define HOTPLUG_TYPE_ADD 0 +#define HOTPLUG_TYPE_REMOVE 1 + +#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL +struct processor_extcntl_ops { + /* Transfer processor PM events to external control logic */ + int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event); + /* Notify physical processor status to external control logic */ + int (*hotplug)(struct acpi_processor *pr, int type); +}; +extern const struct processor_extcntl_ops *processor_extcntl_ops; + +static inline int processor_cntl_external(void) +{ + return (processor_extcntl_ops != NULL); +} + +static inline int processor_pm_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_IDLE] != NULL); +} + +static inline int processor_pmperf_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_PERF] != NULL); +} + +static inline int processor_pmthr_external(void) +{ + return processor_cntl_external() && + (processor_extcntl_ops->pm_ops[PM_TYPE_THR] != NULL); +} + +extern int processor_notify_external(struct acpi_processor *pr, + int event, int type); +extern void processor_extcntl_init(void); +extern int processor_extcntl_prepare(struct acpi_processor *pr); +extern int acpi_processor_get_performance_info(struct acpi_processor *pr); +extern int acpi_processor_get_psd(struct acpi_processor *pr); +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **); + +/* + * Declarations for objects and functions removed in native 2.6.29, and + * thus moved to drivers/acpi/processor_extcntl.c. + */ +extern struct notifier_block acpi_processor_latency_notifier; +int acpi_processor_set_power_policy(struct acpi_processor *); +#else +static inline int processor_cntl_external(void) {return 0;} +static inline int processor_pm_external(void) {return 0;} +static inline int processor_pmperf_external(void) {return 0;} +static inline int processor_pmthr_external(void) {return 0;} +static inline int processor_notify_external(struct acpi_processor *pr, + int event, int type) +{ + return 0; +} +static inline void processor_extcntl_init(void) {} +static inline int processor_extcntl_prepare(struct acpi_processor *pr) +{ + return 0; +} +#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */ + +#ifdef CONFIG_XEN +static inline void xen_convert_pct_reg(struct xen_pct_register *xpct, + struct acpi_pct_register *apct) +{ + xpct->descriptor = apct->descriptor; + xpct->length = apct->length; + xpct->space_id = apct->space_id; + xpct->bit_width = apct->bit_width; + xpct->bit_offset = apct->bit_offset; + xpct->reserved = apct->reserved; + xpct->address = apct->address; +} + +static inline void xen_convert_pss_states(struct xen_processor_px *xpss, + struct acpi_processor_px *apss, int state_count) +{ + int i; + for(i=0; icore_frequency = apss->core_frequency; + xpss->power = apss->power; + xpss->transition_latency = apss->transition_latency; + xpss->bus_master_latency = apss->bus_master_latency; + xpss->control = apss->control; + xpss->status = apss->status; + xpss++; + apss++; + } +} + +static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd, + struct acpi_psd_package *apsd) +{ + xpsd->num_entries = apsd->num_entries; + xpsd->revision = apsd->revision; + xpsd->domain = apsd->domain; + xpsd->coord_type = apsd->coord_type; + xpsd->num_processors = apsd->num_processors; +} + +#endif /* CONFIG_XEN */ + #endif --- linux-ec2-2.6.32.orig/include/acpi/platform/aclinux.h +++ linux-ec2-2.6.32/include/acpi/platform/aclinux.h @@ -152,7 +152,7 @@ #include #define ACPI_PREEMPTION_POINT() \ do { \ - if (!in_atomic_preempt_off()) \ + if (!in_atomic_preempt_off() && !irqs_disabled()) \ cond_resched(); \ } while (0) --- linux-ec2-2.6.32.orig/include/sound/emu10k1.h +++ linux-ec2-2.6.32/include/sound/emu10k1.h @@ -1707,6 +1707,7 @@ unsigned int card_type; /* EMU10K1_CARD_* */ unsigned int ecard_ctrl; /* ecard control bits */ unsigned long dma_mask; /* PCI DMA mask */ + unsigned int delay_pcm_irq; /* in samples */ int max_cache_pages; /* max memory size / PAGE_SIZE */ struct snd_dma_buffer silent_page; /* silent page */ struct snd_dma_buffer ptb_pages; /* page table pages */ --- linux-ec2-2.6.32.orig/include/sound/core.h +++ linux-ec2-2.6.32/include/sound/core.h @@ -120,6 +120,8 @@ int user_ctl_count; /* count of all user controls */ struct list_head controls; /* all controls for this card */ struct list_head ctl_files; /* active control files */ + struct mutex user_ctl_lock; /* protects user controls against + concurrent access */ struct snd_info_entry *proc_root; /* root for soundcard specific files */ struct snd_info_entry *proc_id; /* the card id */ --- linux-ec2-2.6.32.orig/include/sound/soc-dapm.h +++ linux-ec2-2.6.32/include/sound/soc-dapm.h @@ -46,25 +46,25 @@ /* platform domain */ #define SND_SOC_DAPM_INPUT(wname) \ { .id = snd_soc_dapm_input, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0} + .num_kcontrols = 0, .reg = SND_SOC_NOPM } #define SND_SOC_DAPM_OUTPUT(wname) \ { .id = snd_soc_dapm_output, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0} + .num_kcontrols = 0, .reg = SND_SOC_NOPM } #define SND_SOC_DAPM_MIC(wname, wevent) \ { .id = snd_soc_dapm_mic, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0, .event = wevent, \ + .num_kcontrols = 0, .reg = SND_SOC_NOPM, .event = wevent, \ .event_flags = SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD} #define SND_SOC_DAPM_HP(wname, wevent) \ { .id = snd_soc_dapm_hp, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0, .event = wevent, \ + .num_kcontrols = 0, .reg = SND_SOC_NOPM, .event = wevent, \ .event_flags = SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD} #define SND_SOC_DAPM_SPK(wname, wevent) \ { .id = snd_soc_dapm_spk, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0, .event = wevent, \ + .num_kcontrols = 0, .reg = SND_SOC_NOPM, .event = wevent, \ .event_flags = SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD} #define SND_SOC_DAPM_LINE(wname, wevent) \ { .id = snd_soc_dapm_line, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0, .event = wevent, \ + .num_kcontrols = 0, .reg = SND_SOC_NOPM, .event = wevent, \ .event_flags = SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD} /* path domain */ @@ -129,11 +129,11 @@ /* events that are pre and post DAPM */ #define SND_SOC_DAPM_PRE(wname, wevent) \ { .id = snd_soc_dapm_pre, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0, .event = wevent, \ + .num_kcontrols = 0, .reg = SND_SOC_NOPM, .event = wevent, \ .event_flags = SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_PRE_PMD} #define SND_SOC_DAPM_POST(wname, wevent) \ { .id = snd_soc_dapm_post, .name = wname, .kcontrols = NULL, \ - .num_kcontrols = 0, .event = wevent, \ + .num_kcontrols = 0, .reg = SND_SOC_NOPM, .event = wevent, \ .event_flags = SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD} /* stream domain */ --- linux-ec2-2.6.32.orig/ipc/msg.c +++ linux-ec2-2.6.32/ipc/msg.c @@ -125,6 +125,7 @@ void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); + idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); } #endif @@ -296,7 +297,9 @@ } atomic_sub(msq->q_cbytes, &ns->msg_bytes); security_msg_queue_free(msq); + ipc_lock_by_ptr(&msq->q_perm); ipc_rcu_putref(msq); + ipc_unlock(&msq->q_perm); } /* --- linux-ec2-2.6.32.orig/ipc/sem.c +++ linux-ec2-2.6.32/ipc/sem.c @@ -129,6 +129,7 @@ void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); + idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); } #endif @@ -559,6 +560,8 @@ { struct semid_ds out; + memset(&out, 0, sizeof(out)); + ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm); out.sem_otime = in->sem_otime; --- linux-ec2-2.6.32.orig/ipc/compat.c +++ linux-ec2-2.6.32/ipc/compat.c @@ -242,6 +242,8 @@ struct semid64_ds __user *up64; int version = compat_ipc_parse_version(&third); + memset(&s64, 0, sizeof(s64)); + if (!uptr) return -EINVAL; if (get_user(pad, (u32 __user *) uptr)) @@ -422,6 +424,8 @@ int version = compat_ipc_parse_version(&second); void __user *p; + memset(&m64, 0, sizeof(m64)); + switch (second & (~IPC_64)) { case IPC_INFO: case IPC_RMID: @@ -595,6 +599,8 @@ int err, err2; int version = compat_ipc_parse_version(&second); + memset(&s64, 0, sizeof(s64)); + switch (second & (~IPC_64)) { case IPC_RMID: case SHM_LOCK: --- linux-ec2-2.6.32.orig/ipc/shm.c +++ linux-ec2-2.6.32/ipc/shm.c @@ -101,6 +101,7 @@ void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); + idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); } #endif @@ -290,28 +291,28 @@ unsigned long flags) { struct shm_file_data *sfd = shm_file_data(file); - return get_unmapped_area(sfd->file, addr, len, pgoff, flags); -} - -int is_file_shm_hugepages(struct file *file) -{ - int ret = 0; - - if (file->f_op == &shm_file_operations) { - struct shm_file_data *sfd; - sfd = shm_file_data(file); - ret = is_file_hugepages(sfd->file); - } - return ret; + return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, + pgoff, flags); } static const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, +}; + +static const struct file_operations shm_file_operations_huge = { + .mmap = shm_mmap, + .fsync = shm_fsync, + .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, }; +int is_file_shm_hugepages(struct file *file) +{ + return file->f_op == &shm_file_operations_huge; +} + static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ @@ -473,6 +474,7 @@ { struct shmid_ds out; + memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); out.shm_segsz = in->shm_segsz; out.shm_atime = in->shm_atime; @@ -889,7 +891,10 @@ if (!sfd) goto out_put_dentry; - file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); + file = alloc_file(path.mnt, path.dentry, f_mode, + is_file_hugepages(shp->shm_file) ? + &shm_file_operations_huge : + &shm_file_operations); if (!file) goto out_free; ima_counts_get(file); --- linux-ec2-2.6.32.orig/ipc/mqueue.c +++ linux-ec2-2.6.32/ipc/mqueue.c @@ -706,7 +706,7 @@ dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); - goto out_err; + goto out_putfd; } mntget(ipc_ns->mq_mnt); @@ -744,7 +744,6 @@ mntput(ipc_ns->mq_mnt); out_putfd: put_unused_fd(fd); -out_err: fd = error; out_upsem: mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex); --- linux-ec2-2.6.32.orig/ipc/compat_mq.c +++ linux-ec2-2.6.32/ipc/compat_mq.c @@ -53,6 +53,9 @@ void __user *p = NULL; if (u_attr && oflag & O_CREAT) { struct mq_attr attr; + + memset(&attr, 0, sizeof(attr)); + p = compat_alloc_user_space(sizeof(attr)); if (get_compat_mq_attr(&attr, u_attr) || copy_to_user(p, &attr, sizeof(attr))) @@ -127,6 +130,8 @@ struct mq_attr __user *p = compat_alloc_user_space(2 * sizeof(*p)); long ret; + memset(&mqstat, 0, sizeof(mqstat)); + if (u_mqstat) { if (get_compat_mq_attr(&mqstat, u_mqstat) || copy_to_user(p, &mqstat, sizeof(mqstat))) --- linux-ec2-2.6.32.orig/firmware/Makefile +++ linux-ec2-2.6.32/firmware/Makefile @@ -140,7 +140,7 @@ fw-shipped-all := $(fw-shipped-y) $(fw-shipped-m) $(fw-shipped-) # Directories which we _might_ need to create, so we have a rule for them. -firmware-dirs := $(sort $(patsubst %,$(objtree)/$(obj)/%/,$(dir $(fw-external-y) $(fw-shipped-all)))) +firmware-dirs := $(sort $(addprefix $(objtree)/$(obj)/,$(dir $(fw-external-y) $(fw-shipped-all)))) quiet_cmd_mkdir = MKDIR $(patsubst $(objtree)/%,%,$@) cmd_mkdir = mkdir -p $@ --- linux-ec2-2.6.32.orig/usr/gen_init_cpio.c +++ linux-ec2-2.6.32/usr/gen_init_cpio.c @@ -299,7 +299,7 @@ int retval; int rc = -1; int namesize; - int i; + unsigned int i; mode |= S_IFREG; @@ -372,25 +372,28 @@ static char *cpio_replace_env(char *new_location) { - char expanded[PATH_MAX + 1]; - char env_var[PATH_MAX + 1]; - char *start; - char *end; - - for (start = NULL; (start = strstr(new_location, "${")); ) { - end = strchr(start, '}'); - if (start < end) { - *env_var = *expanded = '\0'; - strncat(env_var, start + 2, end - start - 2); - strncat(expanded, new_location, start - new_location); - strncat(expanded, getenv(env_var), PATH_MAX); - strncat(expanded, end + 1, PATH_MAX); - strncpy(new_location, expanded, PATH_MAX); - } else - break; - } + char expanded[PATH_MAX + 1]; + char env_var[PATH_MAX + 1]; + char *start; + char *end; + + for (start = NULL; (start = strstr(new_location, "${")); ) { + end = strchr(start, '}'); + if (start < end) { + *env_var = *expanded = '\0'; + strncat(env_var, start + 2, end - start - 2); + strncat(expanded, new_location, start - new_location); + strncat(expanded, getenv(env_var), + PATH_MAX - strlen(expanded)); + strncat(expanded, end + 1, + PATH_MAX - strlen(expanded)); + strncpy(new_location, expanded, PATH_MAX); + new_location[PATH_MAX] = 0; + } else + break; + } - return new_location; + return new_location; } --- linux-ec2-2.6.32.orig/arch/Kconfig +++ linux-ec2-2.6.32/arch/Kconfig @@ -6,8 +6,6 @@ tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE - depends on TRACING_SUPPORT - select TRACING select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help @@ -20,7 +18,7 @@ config OPROFILE_IBS bool "OProfile AMD IBS support (EXPERIMENTAL)" default n - depends on OPROFILE && SMP && X86 + depends on OPROFILE && SMP && X86 && !XEN help Instruction-Based Sampling (IBS) is a new profiling technique that provides rich, precise program performance --- linux-ec2-2.6.32.orig/arch/um/sys-x86_64/Makefile +++ linux-ec2-2.6.32/arch/um/sys-x86_64/Makefile @@ -8,7 +8,8 @@ setjmp.o signal.o stub.o stub_segv.o syscalls.o syscall_table.o \ sysrq.o ksyms.o tls.o -subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o +subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o \ + lib/rwsem_64.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o ldt-y = ../sys-i386/ldt.o --- linux-ec2-2.6.32.orig/arch/um/drivers/line.c +++ linux-ec2-2.6.32/arch/um/drivers/line.c @@ -727,6 +727,9 @@ static void free_winch(struct winch *winch, int free_irq_ok) { + if (free_irq_ok) + free_irq(WINCH_IRQ, winch); + list_del(&winch->list); if (winch->pid != -1) @@ -735,8 +738,6 @@ os_close_file(winch->fd); if (winch->stack != 0) free_stack(winch->stack, 0); - if (free_irq_ok) - free_irq(WINCH_IRQ, winch); kfree(winch); } --- linux-ec2-2.6.32.orig/arch/um/drivers/ubd_kern.c +++ linux-ec2-2.6.32/arch/um/drivers/ubd_kern.c @@ -160,6 +160,7 @@ struct scatterlist sg[MAX_SG]; struct request *request; int start_sg, end_sg; + sector_t rq_pos; }; #define DEFAULT_COW { \ @@ -184,6 +185,7 @@ .request = NULL, \ .start_sg = 0, \ .end_sg = 0, \ + .rq_pos = 0, \ } /* Protected by ubd_lock */ @@ -508,8 +510,37 @@ static inline int ubd_file_size(struct ubd *ubd_dev, __u64 *size_out) { char *file; + int fd; + int err; - file = ubd_dev->cow.file ? ubd_dev->cow.file : ubd_dev->file; + __u32 version; + __u32 align; + char *backing_file; + time_t mtime; + unsigned long long size; + int sector_size; + int bitmap_offset; + + if (ubd_dev->file && ubd_dev->cow.file) { + file = ubd_dev->cow.file; + + goto out; + } + + fd = os_open_file(ubd_dev->file, global_openflags, 0); + if (fd < 0) + return fd; + + err = read_cow_header(file_reader, &fd, &version, &backing_file, \ + &mtime, &size, §or_size, &align, &bitmap_offset); + os_close_file(fd); + + if(err == -EINVAL) + file = ubd_dev->file; + else + file = backing_file; + +out: return os_file_size(file, size_out); } @@ -1222,7 +1253,6 @@ { struct io_thread_req *io_req; struct request *req; - sector_t sector; int n; while(1){ @@ -1233,12 +1263,12 @@ return; dev->request = req; + dev->rq_pos = blk_rq_pos(req); dev->start_sg = 0; dev->end_sg = blk_rq_map_sg(q, req, dev->sg); } req = dev->request; - sector = blk_rq_pos(req); while(dev->start_sg < dev->end_sg){ struct scatterlist *sg = &dev->sg[dev->start_sg]; @@ -1250,10 +1280,9 @@ return; } prepare_request(req, io_req, - (unsigned long long)sector << 9, + (unsigned long long)dev->rq_pos << 9, sg->offset, sg->length, sg_page(sg)); - sector += sg->length >> 9; n = os_write_file(thread_fd, &io_req, sizeof(struct io_thread_req *)); if(n != sizeof(struct io_thread_req *)){ @@ -1266,6 +1295,7 @@ return; } + dev->rq_pos += sg->length >> 9; dev->start_sg++; } dev->end_sg = 0; --- linux-ec2-2.6.32.orig/arch/um/sys-i386/shared/sysdep/syscalls.h +++ linux-ec2-2.6.32/arch/um/sys-i386/shared/sysdep/syscalls.h @@ -20,7 +20,3 @@ #define EXECUTE_SYSCALL(syscall, regs) \ ((long (*)(struct syscall_args)) \ (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) - -extern long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); --- linux-ec2-2.6.32.orig/arch/um/kernel/dyn.lds.S +++ linux-ec2-2.6.32/arch/um/kernel/dyn.lds.S @@ -50,8 +50,18 @@ .rela.got : { *(.rela.got) } .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } - .rel.plt : { *(.rel.plt) } - .rela.plt : { *(.rela.plt) } + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } .init : { KEEP (*(.init)) } =0x90909090 --- linux-ec2-2.6.32.orig/arch/um/kernel/syscall.c +++ linux-ec2-2.6.32/arch/um/kernel/syscall.c @@ -8,6 +8,7 @@ #include "linux/mm.h" #include "linux/sched.h" #include "linux/utsname.h" +#include "linux/syscalls.h" #include "asm/current.h" #include "asm/mman.h" #include "asm/uaccess.h" @@ -37,31 +38,6 @@ return ret; } -/* common code for old and new mmaps */ -long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - out: - return error; -} - long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) @@ -70,7 +46,7 @@ if (offset & ~PAGE_MASK) goto out; - err = sys_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return err; } --- linux-ec2-2.6.32.orig/arch/um/kernel/uml.lds.S +++ linux-ec2-2.6.32/arch/um/kernel/uml.lds.S @@ -22,7 +22,7 @@ _text = .; _stext = .; __init_begin = .; - INIT_TEXT_SECTION(PAGE_SIZE) + INIT_TEXT_SECTION(0) . = ALIGN(PAGE_SIZE); .text : @@ -43,6 +43,23 @@ __syscall_stub_end = .; } + /* + * These are needed even in a static link, even if they wind up being empty. + * Newer glibc needs these __rel{,a}_iplt_{start,end} symbols. + */ + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } + #include "asm/common.lds.S" init.data : { INIT_DATA } --- linux-ec2-2.6.32.orig/arch/um/kernel/exitcode.c +++ linux-ec2-2.6.32/arch/um/kernel/exitcode.c @@ -42,9 +42,11 @@ unsigned long count, void *data) { char *end, buf[sizeof("nnnnn\0")]; + size_t size; int tmp; - if (copy_from_user(buf, buffer, count)) + size = min(count, sizeof(buf)); + if (copy_from_user(buf, buffer, size)) return -EFAULT; tmp = simple_strtol(buf, &end, 0); --- linux-ec2-2.6.32.orig/arch/um/os-Linux/time.c +++ linux-ec2-2.6.32/arch/um/os-Linux/time.c @@ -60,7 +60,7 @@ long long disable_timer(void) { struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } }); - int remain, max = UM_NSEC_PER_SEC / UM_HZ; + long long remain, max = UM_NSEC_PER_SEC / UM_HZ; if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0) printk(UM_KERN_ERR "disable_timer - setitimer failed, " --- linux-ec2-2.6.32.orig/arch/um/os-Linux/mem.c +++ linux-ec2-2.6.32/arch/um/os-Linux/mem.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "init.h" --- linux-ec2-2.6.32.orig/arch/frv/include/asm/page.h +++ linux-ec2-2.6.32/arch/frv/include/asm/page.h @@ -63,12 +63,10 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_MMU #define VM_DATA_DEFAULT_FLAGS \ (VM_READ | VM_WRITE | \ ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#endif #endif /* __ASSEMBLY__ */ --- linux-ec2-2.6.32.orig/arch/frv/include/asm/cache.h +++ linux-ec2-2.6.32/arch/frv/include/asm/cache.h @@ -17,6 +17,8 @@ #define L1_CACHE_SHIFT (CONFIG_FRV_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) +#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES + #define __cacheline_aligned __attribute__((aligned(L1_CACHE_BYTES))) #define ____cacheline_aligned __attribute__((aligned(L1_CACHE_BYTES))) --- linux-ec2-2.6.32.orig/arch/frv/kernel/sys_frv.c +++ linux-ec2-2.6.32/arch/frv/kernel/sys_frv.c @@ -31,9 +31,6 @@ unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - int error = -EBADF; - struct file * file = NULL; - /* As with sparc32, make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have.... */ @@ -41,69 +38,10 @@ trying to map something we can't */ if (pgoff & ((1 << (PAGE_SHIFT - 12)) - 1)) return -EINVAL; - pgoff >>= PAGE_SHIFT - 12; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return error; + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } -#endif /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. --- linux-ec2-2.6.32.orig/arch/avr32/include/asm/signal.h +++ linux-ec2-2.6.32/arch/avr32/include/asm/signal.h @@ -128,6 +128,7 @@ __sigrestore_t sa_restorer; sigset_t sa_mask; /* mask last for extensibility */ }; +#define __ARCH_HAS_SA_RESTORER struct k_sigaction { struct sigaction sa; --- linux-ec2-2.6.32.orig/arch/avr32/include/asm/syscalls.h +++ linux-ec2-2.6.32/arch/avr32/include/asm/syscalls.h @@ -29,10 +29,6 @@ struct pt_regs *); asmlinkage int sys_rt_sigreturn(struct pt_regs *); -/* kernel/sys_avr32.c */ -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, off_t); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); --- linux-ec2-2.6.32.orig/arch/avr32/kernel/sys_avr32.c +++ linux-ec2-2.6.32/arch/avr32/kernel/sys_avr32.c @@ -5,39 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include #include -#include -#include -#include - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return error; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, offset); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - return error; -} - int kernel_execve(const char *file, char **argv, char **envp) { register long scno asm("r8") = __NR_execve; --- linux-ec2-2.6.32.orig/arch/avr32/kernel/syscall-stubs.S +++ linux-ec2-2.6.32/arch/avr32/kernel/syscall-stubs.S @@ -61,7 +61,7 @@ __sys_mmap2: pushm lr st.w --sp, ARG6 - call sys_mmap2 + call sys_mmap_pgoff sub sp, -4 popm pc --- linux-ec2-2.6.32.orig/arch/ia64/Makefile +++ linux-ec2-2.6.32/arch/ia64/Makefile @@ -56,7 +56,7 @@ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ core-$(CONFIG_KVM) += arch/ia64/kvm/ -core-$(CONFIG_XEN) += arch/ia64/xen/ +core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ --- linux-ec2-2.6.32.orig/arch/ia64/Kconfig +++ linux-ec2-2.6.32/arch/ia64/Kconfig @@ -231,7 +231,7 @@ config IA64_XEN_GUEST bool "Xen guest" select SWIOTLB - depends on XEN + depends on PARAVIRT_XEN help Build a kernel that runs on Xen guest domain. At this moment only 16KB page size in supported. @@ -502,23 +502,6 @@ def_bool y depends on PROC_KCORE -config IA32_SUPPORT - bool "Support for Linux/x86 binaries" - help - IA-64 processors can execute IA-32 (X86) instructions. By - saying Y here, the kernel will include IA-32 system call - emulation support which makes it possible to transparently - run IA-32 Linux binaries on an IA-64 Linux system. - If in doubt, say Y. - -config COMPAT - bool - depends on IA32_SUPPORT - default y - -config COMPAT_FOR_U64_ALIGNMENT - def_bool COMPAT - config IA64_MCA_RECOVERY tristate "MCA recovery from errors other than TLB." --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/system.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/system.h @@ -281,10 +281,6 @@ void default_idle(void); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void account_system_vtime(struct task_struct *); -#endif - #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/acpi.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/acpi.h @@ -94,6 +94,7 @@ #define acpi_noirq 0 /* ACPI always enabled on IA64 */ #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */ #define acpi_strict 1 /* no ACPI spec workarounds on IA64 */ +#define acpi_ht 0 /* no HT-only mode on IA64 */ #endif #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */ static inline void disable_acpi(void) { } --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/unistd.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/unistd.h @@ -311,11 +311,12 @@ #define __NR_preadv 1319 #define __NR_pwritev 1320 #define __NR_rt_tgsigqueueinfo 1321 +#define __NR_accept4 1334 #ifdef __KERNEL__ -#define NR_syscalls 298 /* length of syscall table */ +#define NR_syscalls 311 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/compat.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/compat.h @@ -198,7 +198,7 @@ } static __inline__ void __user * -compat_alloc_user_space (long len) +arch_compat_alloc_user_space (long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *) (((regs->r12 & 0xffffffff) & -16) - len); --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/processor.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/processor.h @@ -361,7 +361,7 @@ regs->loadrs = 0; \ regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!get_dumpable(current->mm))) { \ + if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. \ --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/io.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/io.h @@ -424,6 +424,8 @@ extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); /* * String version of IO memory access ops: --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/xen/hypervisor.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/xen/hypervisor.h @@ -34,10 +34,12 @@ #define _ASM_IA64_XEN_HYPERVISOR_H #include +#ifdef CONFIG_PARAVIRT_XEN #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ #include +#endif /* xen_domain_type is set before executing any C code by early_xen_setup */ enum xen_domain_type { @@ -46,7 +48,7 @@ XEN_HVM_DOMAIN, /* running in a Xen hvm domain*/ }; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern enum xen_domain_type xen_domain_type; #else #define xen_domain_type XEN_NATIVE @@ -66,7 +68,7 @@ #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; --- linux-ec2-2.6.32.orig/arch/ia64/include/asm/xen/interface.h +++ linux-ec2-2.6.32/arch/ia64/include/asm/xen/interface.h @@ -56,29 +56,21 @@ #ifndef _ASM_IA64_XEN_INTERFACE_H #define _ASM_IA64_XEN_INTERFACE_H -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name -#define GUEST_HANDLE_64(name) GUEST_HANDLE(name) + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -__DEFINE_GUEST_HANDLE(u64, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); +__DEFINE_XEN_GUEST_HANDLE(u64, unsigned long); +typedef unsigned long xen_ulong_t; typedef unsigned long xen_pfn_t; -DEFINE_GUEST_HANDLE(xen_pfn_t); #define PRI_xen_pfn "lx" #endif --- linux-ec2-2.6.32.orig/arch/ia64/kvm/kvm-ia64.c +++ linux-ec2-2.6.32/arch/ia64/kvm/kvm-ia64.c @@ -1185,6 +1185,11 @@ #define PALE_RESET_ENTRY 0x80000000ffffffb0UL +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu *v; @@ -1797,7 +1802,8 @@ { struct kvm_memory_slot *memslot; int r, i; - long n, base; + long base; + unsigned long n; unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base + offsetof(struct kvm_vm_data, kvm_mem_dirty_log)); @@ -1810,7 +1816,7 @@ if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); base = memslot->base_gfn / BITS_PER_LONG; for (i = 0; i < n/sizeof(long); ++i) { @@ -1826,7 +1832,7 @@ struct kvm_dirty_log *log) { int r; - int n; + unsigned long n; struct kvm_memory_slot *memslot; int is_dirty = 0; @@ -1844,7 +1850,7 @@ if (is_dirty) { kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); memset(memslot->dirty_bitmap, 0, n); } r = 0; --- linux-ec2-2.6.32.orig/arch/ia64/sn/kernel/msi_sn.c +++ linux-ec2-2.6.32/arch/ia64/sn/kernel/msi_sn.c @@ -174,7 +174,7 @@ * Release XIO resources for the old MSI PCI address */ - read_msi_msg(irq, &msg); + get_cached_msi_msg(irq, &msg); sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo; pdev = sn_pdev->pdi_linux_pcidev; provider = SN_PCIDEV_BUSPROVIDER(pdev); --- linux-ec2-2.6.32.orig/arch/ia64/sn/pci/tioca_provider.c +++ linux-ec2-2.6.32/arch/ia64/sn/pci/tioca_provider.c @@ -517,7 +517,7 @@ * use the GART mapped mode. */ static u64 -tioca_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) +tioca_dma_map(struct pci_dev *pdev, unsigned long paddr, size_t byte_count, int dma_flags) { u64 mapaddr; --- linux-ec2-2.6.32.orig/arch/ia64/xen/xcom_hcall.c +++ linux-ec2-2.6.32/arch/ia64/xen/xcom_hcall.c @@ -343,7 +343,7 @@ int xencomm_hypercall_memory_op(unsigned int cmd, void *arg) { - GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; struct xen_memory_reservation *xmr = NULL; int rc; struct xencomm_handle *desc; --- linux-ec2-2.6.32.orig/arch/ia64/xen/Kconfig +++ linux-ec2-2.6.32/arch/ia64/xen/Kconfig @@ -2,7 +2,7 @@ # This Kconfig describes xen/ia64 options # -config XEN +config PARAVIRT_XEN bool "Xen hypervisor support" default y depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL @@ -17,9 +17,9 @@ both as a guest OS on Xen and natively on hardware. config XEN_XENCOMM - depends on XEN + depends on PARAVIRT_XEN bool config NO_IDLE_HZ - depends on XEN + depends on PARAVIRT_XEN bool --- linux-ec2-2.6.32.orig/arch/ia64/mm/tlb.c +++ linux-ec2-2.6.32/arch/ia64/mm/tlb.c @@ -120,7 +120,7 @@ ia64_invala(); for (;;) { - asm volatile ("ld4.c.nc %0=[%1]" : "=r"(serve) : "r"(&ss->serve) : "memory"); + asm volatile ("ld8.c.nc %0=[%1]" : "=r"(serve) : "r"(&ss->serve) : "memory"); if (time_before(t, serve)) return; cpu_relax(); --- linux-ec2-2.6.32.orig/arch/ia64/mm/ioremap.c +++ linux-ec2-2.6.32/arch/ia64/mm/ioremap.c @@ -22,6 +22,12 @@ } void __iomem * +early_ioremap (unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr); +} + +void __iomem * ioremap (unsigned long phys_addr, unsigned long size) { void __iomem *addr; @@ -102,6 +108,11 @@ EXPORT_SYMBOL(ioremap_nocache); void +early_iounmap (volatile void __iomem *addr, unsigned long size) +{ +} + +void iounmap (volatile void __iomem *addr) { if (REGION_NUMBER(addr) == RGN_GATE) --- linux-ec2-2.6.32.orig/arch/ia64/kernel/irq_ia64.c +++ linux-ec2-2.6.32/arch/ia64/kernel/irq_ia64.c @@ -24,7 +24,6 @@ #include #include #include -#include /* for rand_initialize_irq() */ #include #include #include --- linux-ec2-2.6.32.orig/arch/ia64/kernel/entry.S +++ linux-ec2-2.6.32/arch/ia64/kernel/entry.S @@ -1806,6 +1806,19 @@ data8 sys_preadv data8 sys_pwritev // 1320 data8 sys_rt_tgsigqueueinfo + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall // 1325 + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall // 1330 + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_accept4 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ --- linux-ec2-2.6.32.orig/arch/ia64/kernel/mca.c +++ linux-ec2-2.6.32/arch/ia64/kernel/mca.c @@ -1850,7 +1850,8 @@ data = mca_bootmem(); first_time = 0; } else - data = __get_free_pages(GFP_KERNEL, get_order(sz)); + data = (void *)__get_free_pages(GFP_KERNEL, + get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); --- linux-ec2-2.6.32.orig/arch/ia64/kernel/sys_ia64.c +++ linux-ec2-2.6.32/arch/ia64/kernel/sys_ia64.c @@ -100,51 +100,7 @@ asmlinkage unsigned long ia64_brk (unsigned long brk) { - unsigned long rlim, retval, newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - /* - * Most of this replicates the code in sys_brk() except for an additional safety - * check and the clearing of r8. However, we can't call sys_brk() because we need - * to acquire the mmap_sem before we can do the test... - */ - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against unimplemented/unmapped addresses: */ - if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) - goto out; - - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); + unsigned long retval = sys_brk(brk); force_successful_syscall_return(); return retval; } @@ -185,39 +141,6 @@ return 0; } -static inline unsigned long -do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) -{ - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - - if (!file->f_op || !file->f_op->mmap) { - addr = -ENODEV; - goto out; - } - } - - /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { - addr = -EINVAL; - goto out; - } - - down_write(¤t->mm->mmap_sem); - addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - -out: if (file) - fput(file); - return addr; -} - /* * mmap2() is like mmap() except that the offset is expressed in units * of PAGE_SIZE (instead of bytes). This allows to mmap2() (pieces @@ -226,7 +149,7 @@ asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) { - addr = do_mmap2(addr, len, prot, flags, fd, pgoff); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -238,7 +161,7 @@ if (offset_in_page(off) != 0) return -EINVAL; - addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; --- linux-ec2-2.6.32.orig/arch/ia64/kernel/time.c +++ linux-ec2-2.6.32/arch/ia64/kernel/time.c @@ -473,7 +473,7 @@ { } -void update_vsyscall(struct timespec *wall, struct clocksource *c) +void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult) { unsigned long flags; @@ -481,7 +481,7 @@ /* copy fsyscall clock data */ fsyscall_gtod_data.clk_mask = c->mask; - fsyscall_gtod_data.clk_mult = c->mult; + fsyscall_gtod_data.clk_mult = mult; fsyscall_gtod_data.clk_shift = c->shift; fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio; fsyscall_gtod_data.clk_cycle_last = c->cycle_last; --- linux-ec2-2.6.32.orig/arch/ia64/kernel/msi_ia64.c +++ linux-ec2-2.6.32/arch/ia64/kernel/msi_ia64.c @@ -25,7 +25,7 @@ if (irq_prepare_move(irq, cpu)) return -1; - read_msi_msg(irq, &msg); + get_cached_msi_msg(irq, &msg); addr = msg.address_lo; addr &= MSI_ADDR_DEST_ID_MASK; --- linux-ec2-2.6.32.orig/arch/ia64/kernel/asm-offsets.c +++ linux-ec2-2.6.32/arch/ia64/kernel/asm-offsets.c @@ -290,7 +290,7 @@ DEFINE(IA64_ITC_LASTCYCLE_OFFSET, offsetof (struct itc_jitter_data_t, itc_lastcycle)); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); DEFINE(XEN_NATIVE_ASM, XEN_NATIVE); --- linux-ec2-2.6.32.orig/arch/ia64/kernel/vmlinux.lds.S +++ linux-ec2-2.6.32/arch/ia64/kernel/vmlinux.lds.S @@ -176,7 +176,7 @@ __start_gate_section = .; *(.data.gate) __stop_gate_section = .; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN . = ALIGN(PAGE_SIZE); __xen_start_gate_section = .; *(.data.gate.xen) --- linux-ec2-2.6.32.orig/arch/ia64/kernel/fsys.S +++ linux-ec2-2.6.32/arch/ia64/kernel/fsys.S @@ -420,22 +420,31 @@ ;; RSM_PSR_I(p0, r18, r19) // mask interrupt delivery - mov ar.ccv=0 andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP + mov r8=EINVAL // default to EINVAL #ifdef CONFIG_SMP - mov r17=1 - ;; - cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock - mov r8=EINVAL // default to EINVAL + // __ticket_spin_trylock(r31) + ld4 r17=[r31] ;; + mov.m ar.ccv=r17 + extr.u r9=r17,17,15 + adds r19=1,r17 + extr.u r18=r17,0,15 + ;; + cmp.eq p6,p7=r9,r18 + ;; +(p6) cmpxchg4.acq r9=[r31],r19,ar.ccv +(p6) dep.z r20=r19,1,15 // next serving ticket for unlock +(p7) br.cond.spnt.many .lock_contention + ;; + cmp4.eq p0,p7=r9,r17 + adds r31=2,r31 +(p7) br.cond.spnt.many .lock_contention ld8 r3=[r2] // re-read current->blocked now that we hold the lock - cmp4.ne p6,p0=r18,r0 -(p6) br.cond.spnt.many .lock_contention ;; #else ld8 r3=[r2] // re-read current->blocked now that we hold the lock - mov r8=EINVAL // default to EINVAL #endif add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 add r19=IA64_TASK_SIGNAL_OFFSET,r16 @@ -490,7 +499,9 @@ (p6) br.cond.spnt.few 1b // yes -> retry #ifdef CONFIG_SMP - st4.rel [r31]=r0 // release the lock + // __ticket_spin_unlock(r31) + st2.rel [r31]=r20 + mov r20=0 // i must not leak kernel bits... #endif SSM_PSR_I(p0, p9, r31) ;; @@ -512,7 +523,8 @@ .sig_pending: #ifdef CONFIG_SMP - st4.rel [r31]=r0 // release the lock + // __ticket_spin_unlock(r31) + st2.rel [r31]=r20 // release the lock #endif SSM_PSR_I(p0, p9, r17) ;; --- linux-ec2-2.6.32.orig/arch/ia64/hp/common/sba_iommu.c +++ linux-ec2-2.6.32/arch/ia64/hp/common/sba_iommu.c @@ -677,12 +677,19 @@ spin_unlock_irqrestore(&ioc->saved_lock, flags); pide = sba_search_bitmap(ioc, dev, pages_needed, 0); - if (unlikely(pide >= (ioc->res_size << 3))) - panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n", - ioc->ioc_hpa); + if (unlikely(pide >= (ioc->res_size << 3))) { + printk(KERN_WARNING "%s: I/O MMU @ %p is" + "out of mapping resources, %u %u %lx\n", + __func__, ioc->ioc_hpa, ioc->res_size, + pages_needed, dma_get_seg_boundary(dev)); + return -1; + } #else - panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n", - ioc->ioc_hpa); + printk(KERN_WARNING "%s: I/O MMU @ %p is" + "out of mapping resources, %u %u %lx\n", + __func__, ioc->ioc_hpa, ioc->res_size, + pages_needed, dma_get_seg_boundary(dev)); + return -1; #endif } } @@ -965,6 +972,8 @@ #endif pide = sba_alloc_range(ioc, dev, size); + if (pide < 0) + return 0; iovp = (dma_addr_t) pide << iovp_shift; @@ -1320,6 +1329,7 @@ unsigned long dma_offset, dma_len; /* start/len of DMA stream */ int n_mappings = 0; unsigned int max_seg_size = dma_get_max_seg_size(dev); + int idx; while (nents > 0) { unsigned long vaddr = (unsigned long) sba_sg_address(startsg); @@ -1418,16 +1428,22 @@ vcontig_sg->dma_length = vcontig_len; dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask; ASSERT(dma_len <= DMA_CHUNK_SIZE); - dma_sg->dma_address = (dma_addr_t) (PIDE_FLAG - | (sba_alloc_range(ioc, dev, dma_len) << iovp_shift) - | dma_offset); + idx = sba_alloc_range(ioc, dev, dma_len); + if (idx < 0) { + dma_sg->dma_length = 0; + return -1; + } + dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift) + | dma_offset); n_mappings++; } return n_mappings; } - +static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs); /** * sba_map_sg - map Scatter/Gather list * @dev: instance of PCI owned by the driver that's asking. @@ -1493,6 +1509,10 @@ ** Access to the virtual address is what forces a two pass algorithm. */ coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents); + if (coalesced < 0) { + sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs); + return 0; + } /* ** Program the I/O Pdir --- linux-ec2-2.6.32.orig/arch/ia64/hp/sim/simserial.c +++ linux-ec2-2.6.32/arch/ia64/hp/sim/simserial.c @@ -395,7 +395,7 @@ { if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && - (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { + (cmd != TIOCMIWAIT)) { if (tty->flags & (1 << TTY_IO_ERROR)) return -EIO; } @@ -433,16 +433,6 @@ case TIOCMIWAIT: printk(KERN_INFO "rs_ioctl: TIOCMIWAIT: called\n"); return 0; - /* - * Get counter of input serial line interrupts (DCD,RI,DSR,CTS) - * Return: write counters to the user passed counter struct - * NB: both 1->0 and 0->1 transitions are counted except for - * RI where only 0->1 is counted. - */ - case TIOCGICOUNT: - printk(KERN_INFO "rs_ioctl: TIOCGICOUNT called\n"); - return 0; - case TIOCSERGWILD: case TIOCSERSWILD: /* "setserial -W" is called in Debian boot */ --- linux-ec2-2.6.32.orig/arch/ia64/ia32/sys_ia32.c +++ linux-ec2-2.6.32/arch/ia64/ia32/sys_ia32.c @@ -858,6 +858,9 @@ prot = get_prot32(prot); + if (flags & MAP_HUGETLB) + return -ENOMEM; + #if PAGE_SHIFT > IA32_PAGE_SHIFT mutex_lock(&ia32_mmap_mutex); { --- linux-ec2-2.6.32.orig/arch/s390/Kconfig +++ linux-ec2-2.6.32/arch/s390/Kconfig @@ -188,6 +188,9 @@ depends on COMPAT && SYSVIPC default y +config KEYS_COMPAT + def_bool y if COMPAT && KEYS + config AUDIT_ARCH bool default y --- linux-ec2-2.6.32.orig/arch/s390/include/asm/system.h +++ linux-ec2-2.6.32/arch/s390/include/asm/system.h @@ -97,7 +97,6 @@ extern void account_vtime(struct task_struct *, struct task_struct *); extern void account_tick_vtime(struct task_struct *); -extern void account_system_vtime(struct task_struct *); #ifdef CONFIG_PFAULT extern void pfault_irq_init(void); --- linux-ec2-2.6.32.orig/arch/s390/include/asm/kvm.h +++ linux-ec2-2.6.32/arch/s390/include/asm/kvm.h @@ -1,6 +1,5 @@ #ifndef __LINUX_KVM_S390_H #define __LINUX_KVM_S390_H - /* * asm-s390/kvm.h - KVM s390 specific structures and definitions * @@ -15,6 +14,8 @@ */ #include +#define __KVM_S390 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ --- linux-ec2-2.6.32.orig/arch/s390/include/asm/signal.h +++ linux-ec2-2.6.32/arch/s390/include/asm/signal.h @@ -131,6 +131,7 @@ void (*sa_restorer)(void); sigset_t sa_mask; /* mask last for extensibility */ }; +#define __ARCH_HAS_SA_RESTORER struct k_sigaction { struct sigaction sa; --- linux-ec2-2.6.32.orig/arch/s390/include/asm/vdso.h +++ linux-ec2-2.6.32/arch/s390/include/asm/vdso.h @@ -7,7 +7,7 @@ #define VDSO32_LBASE 0 #define VDSO64_LBASE 0 -#define VDSO_VERSION_STRING LINUX_2.6.26 +#define VDSO_VERSION_STRING LINUX_2.6.29 #ifndef __ASSEMBLY__ --- linux-ec2-2.6.32.orig/arch/s390/include/asm/cputime.h +++ linux-ec2-2.6.32/arch/s390/include/asm/cputime.h @@ -183,6 +183,7 @@ unsigned long long idle_count; unsigned long long idle_enter; unsigned long long idle_time; + int nohz_delay; }; DECLARE_PER_CPU(struct s390_idle_data, s390_idle); @@ -198,4 +199,11 @@ vtime_start_cpu(); } +static inline int s390_nohz_delay(int cpu) +{ + return per_cpu(s390_idle, cpu).nohz_delay != 0; +} + +#define arch_needs_cpu(cpu) s390_nohz_delay(cpu) + #endif /* _S390_CPUTIME_H */ --- linux-ec2-2.6.32.orig/arch/s390/include/asm/compat.h +++ linux-ec2-2.6.32/arch/s390/include/asm/compat.h @@ -171,16 +171,9 @@ return test_thread_flag(TIF_31BIT); } -#else - -static inline int is_compat_task(void) -{ - return 0; -} - #endif -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { unsigned long stack; --- linux-ec2-2.6.32.orig/arch/s390/include/asm/processor.h +++ linux-ec2-2.6.32/arch/s390/include/asm/processor.h @@ -150,11 +150,6 @@ */ extern unsigned long thread_saved_pc(struct task_struct *t); -/* - * Print register of task into buffer. Used in fs/proc/array.c. - */ -extern void task_show_regs(struct seq_file *m, struct task_struct *task); - extern void show_code(struct pt_regs *regs); unsigned long get_wchan(struct task_struct *p); --- linux-ec2-2.6.32.orig/arch/s390/kvm/sigp.c +++ linux-ec2-2.6.32/arch/s390/kvm/sigp.c @@ -188,9 +188,9 @@ /* make sure that the new value is valid memory */ address = address & 0x7fffe000u; - if ((copy_from_guest(vcpu, &tmp, - (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) || - (copy_from_guest(vcpu, &tmp, (u64) (address + + if ((copy_from_user(&tmp, (void __user *) + (address + vcpu->arch.sie_block->gmsor) , 1)) || + (copy_from_user(&tmp, (void __user *)(address + vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) { *reg |= SIGP_STAT_INVALID_PARAMETER; return 1; /* invalid parameter */ --- linux-ec2-2.6.32.orig/arch/s390/kvm/intercept.c +++ linux-ec2-2.6.32/arch/s390/kvm/intercept.c @@ -213,7 +213,7 @@ return rc2; } -static const intercept_handler_t intercept_funcs[0x48 >> 2] = { +static const intercept_handler_t intercept_funcs[] = { [0x00 >> 2] = handle_noop, [0x04 >> 2] = handle_instruction, [0x08 >> 2] = handle_prog, @@ -230,7 +230,7 @@ intercept_handler_t func; u8 code = vcpu->arch.sie_block->icptcode; - if (code & 3 || code > 0x48) + if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs)) return -ENOTSUPP; func = intercept_funcs[code >> 2]; if (func) --- linux-ec2-2.6.32.orig/arch/s390/kvm/kvm-s390.c +++ linux-ec2-2.6.32/arch/s390/kvm/kvm-s390.c @@ -116,10 +116,16 @@ int kvm_dev_ioctl_check_extension(long ext) { + int r; + switch (ext) { + case KVM_CAP_S390_PSW: + r = 1; + break; default: - return 0; + r = 0; } + return r; } /* Section: vm related */ @@ -302,11 +308,17 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); - int rc = -ENOMEM; + struct kvm_vcpu *vcpu; + int rc = -EINVAL; + + if (id >= KVM_MAX_VCPUS) + goto out; + + rc = -ENOMEM; + vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); if (!vcpu) - goto out_nomem; + goto out; vcpu->arch.sie_block = (struct kvm_s390_sie_block *) get_zeroed_page(GFP_KERNEL); @@ -332,14 +344,16 @@ rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) - goto out_free_cpu; + goto out_free_sie_block; VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu, vcpu->arch.sie_block); return vcpu; +out_free_sie_block: + free_page((unsigned long)(vcpu->arch.sie_block)); out_free_cpu: kfree(vcpu); -out_nomem: +out: return ERR_PTR(rc); } @@ -419,8 +433,10 @@ vcpu_load(vcpu); if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) rc = -EBUSY; - else - vcpu->arch.sie_block->gpsw = psw; + else { + vcpu->run->psw_mask = psw.mask; + vcpu->run->psw_addr = psw.addr; + } vcpu_put(vcpu); return rc; } @@ -508,9 +524,6 @@ switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: - vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask; - vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; - break; case KVM_EXIT_UNKNOWN: case KVM_EXIT_INTR: case KVM_EXIT_S390_RESET: @@ -519,6 +532,9 @@ BUG(); } + vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; + vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; + might_fault(); do { @@ -538,8 +554,6 @@ /* intercept cannot be handled in-kernel, prepare kvm-run */ kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode; - kvm_run->s390_sieic.mask = vcpu->arch.sie_block->gpsw.mask; - kvm_run->s390_sieic.addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s390_sieic.ipa = vcpu->arch.sie_block->ipa; kvm_run->s390_sieic.ipb = vcpu->arch.sie_block->ipb; rc = 0; @@ -551,6 +565,9 @@ rc = 0; } + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; + kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); --- linux-ec2-2.6.32.orig/arch/s390/mm/mmap.c +++ linux-ec2-2.6.32/arch/s390/mm/mmap.c @@ -27,8 +27,8 @@ #include #include #include +#include #include -#include /* * Top of mmap area (just below the process stack). --- linux-ec2-2.6.32.orig/arch/s390/kernel/entry.S +++ linux-ec2-2.6.32/arch/s390/kernel/entry.S @@ -571,6 +571,7 @@ mvc __THREAD_per+__PER_access_id(1,%r8),__LC_PER_ACCESS_ID oi __TI_flags+3(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP TRACE_IRQS_ON + lm %r2,%r6,SP_R2(%r15) # load svc arguments stosm __SF_EMPTY(%r15),0x03 # reenable interrupts b BASED(sysc_do_svc) --- linux-ec2-2.6.32.orig/arch/s390/kernel/time.c +++ linux-ec2-2.6.32/arch/s390/kernel/time.c @@ -214,7 +214,8 @@ return &clocksource_tod; } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { if (clock != &clocksource_tod) return; --- linux-ec2-2.6.32.orig/arch/s390/kernel/setup.c +++ linux-ec2-2.6.32/arch/s390/kernel/setup.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include --- linux-ec2-2.6.32.orig/arch/s390/kernel/traps.c +++ linux-ec2-2.6.32/arch/s390/kernel/traps.c @@ -243,43 +243,6 @@ show_last_breaking_event(regs); } -/* This is called from fs/proc/array.c */ -void task_show_regs(struct seq_file *m, struct task_struct *task) -{ - struct pt_regs *regs; - - regs = task_pt_regs(task); - seq_printf(m, "task: %p, ksp: %p\n", - task, (void *)task->thread.ksp); - seq_printf(m, "User PSW : %p %p\n", - (void *) regs->psw.mask, (void *)regs->psw.addr); - - seq_printf(m, "User GPRS: " FOURLONG, - regs->gprs[0], regs->gprs[1], - regs->gprs[2], regs->gprs[3]); - seq_printf(m, " " FOURLONG, - regs->gprs[4], regs->gprs[5], - regs->gprs[6], regs->gprs[7]); - seq_printf(m, " " FOURLONG, - regs->gprs[8], regs->gprs[9], - regs->gprs[10], regs->gprs[11]); - seq_printf(m, " " FOURLONG, - regs->gprs[12], regs->gprs[13], - regs->gprs[14], regs->gprs[15]); - seq_printf(m, "User ACRS: %08x %08x %08x %08x\n", - task->thread.acrs[0], task->thread.acrs[1], - task->thread.acrs[2], task->thread.acrs[3]); - seq_printf(m, " %08x %08x %08x %08x\n", - task->thread.acrs[4], task->thread.acrs[5], - task->thread.acrs[6], task->thread.acrs[7]); - seq_printf(m, " %08x %08x %08x %08x\n", - task->thread.acrs[8], task->thread.acrs[9], - task->thread.acrs[10], task->thread.acrs[11]); - seq_printf(m, " %08x %08x %08x %08x\n", - task->thread.acrs[12], task->thread.acrs[13], - task->thread.acrs[14], task->thread.acrs[15]); -} - static DEFINE_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) --- linux-ec2-2.6.32.orig/arch/s390/kernel/nmi.c +++ linux-ec2-2.6.32/arch/s390/kernel/nmi.c @@ -95,7 +95,6 @@ static int notrace s390_revalidate_registers(struct mci *mci) { int kill_task; - u64 tmpclock; u64 zero; void *fpt_save_area, *fpt_creg_save_area; @@ -214,11 +213,10 @@ : "0", "cc"); #endif /* Revalidate clock comparator register */ - asm volatile( - " stck 0(%1)\n" - " sckc 0(%1)" - : "=m" (tmpclock) : "a" (&(tmpclock)) : "cc", "memory"); - + if (S390_lowcore.clock_comparator == -1) + set_clock_comparator(get_clock()); + else + set_clock_comparator(S390_lowcore.clock_comparator); /* Check if old PSW is valid */ if (!mci->wp) /* --- linux-ec2-2.6.32.orig/arch/s390/kernel/process.c +++ linux-ec2-2.6.32/arch/s390/kernel/process.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include --- linux-ec2-2.6.32.orig/arch/s390/kernel/head64.S +++ linux-ec2-2.6.32/arch/s390/kernel/head64.S @@ -83,6 +83,8 @@ slr %r0,%r0 # set cpuid to zero sigp %r1,%r0,0x12 # switch to esame mode sam64 # switch to 64 bit mode + llgfr %r13,%r13 # clear high-order half of base reg + lmh %r0,%r15,.Lzero64-.LPG1(%r13) # clear high-order half lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore @@ -122,11 +124,12 @@ .quad 0 # cr12: tracing off .quad 0 # cr13: home space segment table .quad 0xc0000000 # cr14: machine check handling off - .quad 0 # cr15: linkage stack operations + .quad .Llinkage_stack # cr15: linkage stack operations .Lpcmsk:.quad 0x0000000180000000 .L4malign:.quad 0xffffffffffc00000 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 .Lnop: .long 0x07000700 +.Lzero64:.fill 16,4,0x0 #ifdef CONFIG_ZFCPDUMP .Lcurrent_cpu: .long 0x0 @@ -136,12 +139,15 @@ .Lparmaddr: .quad PARMAREA .align 64 -.Lduct: .long 0,0,0,0,.Lduald,0,0,0 +.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 .long 0,0,0,0,0,0,0,0 +.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 .align 128 .Lduald:.rept 8 .long 0x80000000,0,0,0 # invalid access-list entries .endr +.Llinkage_stack: + .long 0,0,0x89000000,0,0,0,0x8a000000,0 .org 0x12000 .globl _ehead --- linux-ec2-2.6.32.orig/arch/s390/kernel/compat_linux.c +++ linux-ec2-2.6.32/arch/s390/kernel/compat_linux.c @@ -683,38 +683,6 @@ u32 offset; }; -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - if (!IS_ERR((void *) error) && error + len >= 0x80000000ULL) { - /* Result is out of bounds. */ - do_munmap(current->mm, addr, len); - error = -ENOMEM; - } - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - - asmlinkage unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg) { @@ -728,7 +696,8 @@ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } @@ -741,7 +710,7 @@ if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } --- linux-ec2-2.6.32.orig/arch/s390/kernel/ptrace.c +++ linux-ec2-2.6.32/arch/s390/kernel/ptrace.c @@ -36,8 +36,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -632,7 +632,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) { - long ret; + long ret = 0; /* Do the secure computing check first. */ secure_computing(regs->gprs[2]); @@ -641,7 +641,6 @@ * The sysc_tracesys code in entry.S stored the system * call number to gprs[2]. */ - ret = regs->gprs[2]; if (test_thread_flag(TIF_SYSCALL_TRACE) && (tracehook_report_syscall_entry(regs) || regs->gprs[2] >= NR_syscalls)) { @@ -663,7 +662,7 @@ regs->gprs[2], regs->orig_gpr2, regs->gprs[3], regs->gprs[4], regs->gprs[5]); - return ret; + return ret ?: regs->gprs[2]; } asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) --- linux-ec2-2.6.32.orig/arch/s390/kernel/entry64.S +++ linux-ec2-2.6.32/arch/s390/kernel/entry64.S @@ -549,6 +549,7 @@ mvc __THREAD_per+__PER_access_id(1,%r8),__LC_PER_ACCESS_ID oi __TI_flags+7(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP TRACE_IRQS_ON + lmg %r2,%r6,SP_R2(%r15) # load svc arguments stosm __SF_EMPTY(%r15),0x03 # reenable interrupts j sysc_do_svc --- linux-ec2-2.6.32.orig/arch/s390/kernel/vtime.c +++ linux-ec2-2.6.32/arch/s390/kernel/vtime.c @@ -167,6 +167,8 @@ /* Wait for external, I/O or machine check interrupt. */ psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT; + idle->nohz_delay = 0; + /* Check if the CPU timer needs to be reprogrammed. */ if (vq->do_spt) { __u64 vmax = VTIMER_MAX_SLICE; --- linux-ec2-2.6.32.orig/arch/s390/kernel/sys_s390.c +++ linux-ec2-2.6.32/arch/s390/kernel/sys_s390.c @@ -32,32 +32,6 @@ #include #include "entry.h" -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux for S/390 isn't able to handle more than 5 @@ -81,7 +55,7 @@ if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } @@ -98,7 +72,7 @@ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return error; } --- linux-ec2-2.6.32.orig/arch/s390/kernel/s390_ext.c +++ linux-ec2-2.6.32/arch/s390/kernel/s390_ext.c @@ -126,6 +126,8 @@ /* Serve timer interrupts first. */ clock_comparator_work(); kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++; + if (code != 0x1004) + __get_cpu_var(s390_idle).nohz_delay = 1; index = ext_hash(code); for (p = ext_int_hash[index]; p; p = p->next) { if (likely(p->code == code)) --- linux-ec2-2.6.32.orig/arch/s390/lib/delay.c +++ linux-ec2-2.6.32/arch/s390/lib/delay.c @@ -29,17 +29,21 @@ { unsigned long mask, cr0, cr0_saved; u64 clock_saved; + u64 end; + mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT; + end = get_clock() + (usecs << 12); clock_saved = local_tick_disable(); - set_clock_comparator(get_clock() + (usecs << 12)); __ctl_store(cr0_saved, 0, 0); cr0 = (cr0_saved & 0xffff00e0) | 0x00000800; __ctl_load(cr0 , 0, 0); - mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT; lockdep_off(); - trace_hardirqs_on(); - __load_psw_mask(mask); - local_irq_disable(); + do { + set_clock_comparator(end); + trace_hardirqs_on(); + __load_psw_mask(mask); + local_irq_disable(); + } while (get_clock() < end); lockdep_on(); __ctl_load(cr0_saved, 0, 0); local_tick_enable(clock_saved); --- linux-ec2-2.6.32.orig/arch/m68k/include/asm/signal.h +++ linux-ec2-2.6.32/arch/m68k/include/asm/signal.h @@ -119,6 +119,7 @@ __sigrestore_t sa_restorer; sigset_t sa_mask; /* mask last for extensibility */ }; +#define __ARCH_HAS_SA_RESTORER struct k_sigaction { struct sigaction sa; --- linux-ec2-2.6.32.orig/arch/m68k/include/asm/cache.h +++ linux-ec2-2.6.32/arch/m68k/include/asm/cache.h @@ -8,4 +8,6 @@ #define L1_CACHE_SHIFT 4 #define L1_CACHE_BYTES (1<< L1_CACHE_SHIFT) +#define ARCH_KMALLOC_MINALIGN L1_CACHE_BYTES + #endif --- linux-ec2-2.6.32.orig/arch/m68k/mm/motorola.c +++ linux-ec2-2.6.32/arch/m68k/mm/motorola.c @@ -299,6 +299,8 @@ zones_size[ZONE_DMA] = m68k_memory[i].size >> PAGE_SHIFT; free_area_init_node(i, zones_size, m68k_memory[i].addr >> PAGE_SHIFT, NULL); + if (node_present_pages(i)) + node_set_state(i, N_NORMAL_MEMORY); } } --- linux-ec2-2.6.32.orig/arch/m68k/kernel/sys_m68k.c +++ linux-ec2-2.6.32/arch/m68k/kernel/sys_m68k.c @@ -29,37 +29,16 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* + * This is wrong for sun3 - there PAGE_SIZE is 8Kb, + * so we need to shift the argument down by 1; m68k mmap64(3) + * (in libc) expects the last argument of mmap2 in 4Kb units. + */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* @@ -90,57 +69,11 @@ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); -out: - return error; -} - -#if 0 -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#endif struct sel_arg_struct { unsigned long n; --- linux-ec2-2.6.32.orig/arch/alpha/kernel/osf_sys.c +++ linux-ec2-2.6.32/arch/alpha/kernel/osf_sys.c @@ -178,25 +178,18 @@ unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - struct file *file = NULL; - unsigned long ret = -EBADF; + unsigned long ret = -EINVAL; #if 0 if (flags & (_MAP_HASSEMAPHORE | _MAP_INHERIT | _MAP_UNALIGNED)) printk("%s: unimplemented OSF mmap flags %04lx\n", current->comm, flags); #endif - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + if ((off + PAGE_ALIGN(len)) < off) + goto out; + if (off & ~PAGE_MASK) + goto out; + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return ret; } @@ -438,7 +431,7 @@ return -EFAULT; len = namelen; - if (namelen > 32) + if (len > 32) len = 32; down_read(&uts_sem); @@ -625,7 +618,7 @@ down_read(&uts_sem); res = sysinfo_table[offset]; len = strlen(res)+1; - if (len > count) + if ((unsigned long)len > (unsigned long)count) len = count; if (copy_to_user(buf, res, len)) err = -EFAULT; @@ -680,7 +673,7 @@ return 1; case GSI_GET_HWRPB: - if (nbytes < sizeof(*hwrpb)) + if (nbytes > sizeof(*hwrpb)) return -EINVAL; if (copy_to_user(buffer, hwrpb, nbytes) != 0) return -EFAULT; @@ -1042,6 +1035,7 @@ { struct rusage r; long ret, err; + unsigned int status = 0; mm_segment_t old_fs; if (!ur) @@ -1050,13 +1044,15 @@ old_fs = get_fs(); set_fs (KERNEL_DS); - ret = sys_wait4(pid, ustatus, options, (struct rusage __user *) &r); + ret = sys_wait4(pid, (unsigned int __user *) &status, options, + (struct rusage __user *) &r); set_fs (old_fs); if (!access_ok(VERIFY_WRITE, ur, sizeof(*ur))) return -EFAULT; err = 0; + err |= put_user(status, ustatus); err |= __put_user(r.ru_utime.tv_sec, &ur->ru_utime.tv_sec); err |= __put_user(r.ru_utime.tv_usec, &ur->ru_utime.tv_usec); err |= __put_user(r.ru_stime.tv_sec, &ur->ru_stime.tv_sec); --- linux-ec2-2.6.32.orig/arch/alpha/kernel/sys_nautilus.c +++ linux-ec2-2.6.32/arch/alpha/kernel/sys_nautilus.c @@ -189,6 +189,10 @@ extern void free_reserved_mem(void *, void *); extern void pcibios_claim_one_bus(struct pci_bus *); +static struct resource irongate_io = { + .name = "Irongate PCI IO", + .flags = IORESOURCE_IO, +}; static struct resource irongate_mem = { .name = "Irongate PCI MEM", .flags = IORESOURCE_MEM, @@ -210,6 +214,7 @@ irongate = pci_get_bus_and_slot(0, 0); bus->self = irongate; + bus->resource[0] = &irongate_io; bus->resource[1] = &irongate_mem; pci_bus_size_bridges(bus); --- linux-ec2-2.6.32.orig/arch/alpha/kernel/err_marvel.c +++ linux-ec2-2.6.32/arch/alpha/kernel/err_marvel.c @@ -109,7 +109,7 @@ #define IO7__ERR_CYC__CYCLE__M (0x7) printk("%s Packet In Error: %s\n" - "%s Error in %s, cycle %ld%s%s\n", + "%s Error in %s, cycle %lld%s%s\n", err_print_prefix, packet_desc[EXTRACT(err_cyc, IO7__ERR_CYC__PACKET)], err_print_prefix, @@ -313,7 +313,7 @@ } printk("%s Up Hose Garbage Symptom:\n" - "%s Source Port: %ld - Dest PID: %ld - OpCode: %s\n", + "%s Source Port: %lld - Dest PID: %lld - OpCode: %s\n", err_print_prefix, err_print_prefix, EXTRACT(ugbge_sym, IO7__PO7_UGBGE_SYM__UPH_SRC_PORT), @@ -552,7 +552,7 @@ #define IO7__POX_SPLCMPLT__REM_BYTE_COUNT__M (0xfff) printk("%s Split Completion Error:\n" - "%s Source (Bus:Dev:Func): %ld:%ld:%ld\n", + "%s Source (Bus:Dev:Func): %lld:%lld:%lld\n", err_print_prefix, err_print_prefix, EXTRACT(spl_cmplt, IO7__POX_SPLCMPLT__SOURCE_BUS), --- linux-ec2-2.6.32.orig/arch/microblaze/Makefile +++ linux-ec2-2.6.32/arch/microblaze/Makefile @@ -69,12 +69,16 @@ all: linux.bin -BOOT_TARGETS = linux.bin linux.bin.gz simpleImage.% +# With make 3.82 we cannot mix normal and wildcard targets +BOOT_TARGETS1 = linux.bin linux.bin.gz +BOOT_TARGETS2 = simpleImage.% archclean: $(Q)$(MAKE) $(clean)=$(boot) -$(BOOT_TARGETS): vmlinux +$(BOOT_TARGETS1): vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ +$(BOOT_TARGETS2): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ define archhelp --- linux-ec2-2.6.32.orig/arch/microblaze/kernel/sys_microblaze.c +++ linux-ec2-2.6.32/arch/microblaze/kernel/sys_microblaze.c @@ -62,46 +62,14 @@ return error; } -asmlinkage long -sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - int ret = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) { - printk(KERN_INFO "no fd in mmap\r\n"); - goto out; - } - } - - down_write(¤t->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return ret; -} - asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, off_t pgoff) { - int err = -EINVAL; - - if (pgoff & ~PAGE_MASK) { - printk(KERN_INFO "no pagemask in mmap\r\n"); - goto out; - } + if (pgoff & ~PAGE_MASK) + return -EINVAL; - err = sys_mmap2(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); -out: - return err; + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); } /* --- linux-ec2-2.6.32.orig/arch/microblaze/kernel/syscall_table.S +++ linux-ec2-2.6.32/arch/microblaze/kernel/syscall_table.S @@ -196,7 +196,7 @@ .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 /* mmap2 */ + .long sys_mmap_pgoff /* mmap2 */ .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ --- linux-ec2-2.6.32.orig/arch/x86/Kconfig.cpu +++ linux-ec2-2.6.32/arch/x86/Kconfig.cpu @@ -323,7 +323,7 @@ config X86_XADD def_bool y - depends on X86_32 && !M386 + depends on X86_64 || !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" @@ -340,7 +340,7 @@ config X86_F00F_BUG def_bool y - depends on M586MMX || M586TSC || M586 || M486 || M386 + depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT config X86_WP_WORKS_OK def_bool y @@ -397,10 +397,11 @@ config X86_TSC def_bool y depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 + depends on !XEN config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. @@ -496,7 +497,7 @@ config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for UMC processors @@ -509,13 +510,13 @@ config X86_DS def_bool X86_PTRACE_BTS - depends on X86_DEBUGCTLMSR + depends on X86_DEBUGCTLMSR && !XEN select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" default y - depends on X86_DEBUGCTLMSR + depends on X86_DEBUGCTLMSR && !XEN depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store. --- linux-ec2-2.6.32.orig/arch/x86/Kbuild +++ linux-ec2-2.6.32/arch/x86/Kbuild @@ -2,7 +2,7 @@ obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support -obj-$(CONFIG_XEN) += xen/ +obj-$(CONFIG_PARAVIRT_XEN) += xen/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ --- linux-ec2-2.6.32.orig/arch/x86/Makefile +++ linux-ec2-2.6.32/arch/x86/Makefile @@ -146,8 +146,27 @@ BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage -PHONY += bzImage $(BOOT_TARGETS) +PHONY += bzImage vmlinuz $(BOOT_TARGETS) +ifdef CONFIG_XEN +LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ + -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE) + +ifdef CONFIG_X86_64 +LDFLAGS_vmlinux := -e startup_64 +endif + +# Default kernel to build +all: vmlinuz + +# KBUILD_IMAGE specifies the target image being built +KBUILD_IMAGE := $(boot)/vmlinuz + +vmlinuz: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +else # Default kernel to build all: bzImage @@ -158,6 +177,7 @@ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +endif $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ --- linux-ec2-2.6.32.orig/arch/x86/Kconfig.debug +++ linux-ec2-2.6.32/arch/x86/Kconfig.debug @@ -25,6 +25,7 @@ config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y + depends on !XEN ---help--- Enables the informational output from the decompression stage (e.g. bzImage) of the boot. If you disable this you will still @@ -136,7 +137,7 @@ config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED - depends on X86_32 + depends on X86_32 && !X86_NO_TSS ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this @@ -185,6 +186,7 @@ config HAVE_MMIOTRACE_SUPPORT def_bool y + depends on !XEN # # IO delay types: @@ -264,6 +266,7 @@ bool "Debug boot parameters" depends on DEBUG_KERNEL depends on DEBUG_FS + depends on !XEN ---help--- This option will cause struct boot_params to be exported via debugfs. --- linux-ec2-2.6.32.orig/arch/x86/Kconfig +++ linux-ec2-2.6.32/arch/x86/Kconfig @@ -24,7 +24,7 @@ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486 && !XEN) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -39,16 +39,16 @@ select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS - select HAVE_KVM - select HAVE_ARCH_KGDB + select HAVE_KVM if !XEN + select HAVE_ARCH_KGDB if !XEN select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select USER_STACKTRACE_SUPPORT select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select HAVE_KERNEL_LZMA + select HAVE_KERNEL_BZIP2 if !XEN + select HAVE_KERNEL_LZMA if !XEN select HAVE_ARCH_KMEMCHECK config OUTPUT_FORMAT @@ -69,13 +69,16 @@ config CLOCKSOURCE_WATCHDOG def_bool y + depends on !XEN config GENERIC_CLOCKEVENTS def_bool y + depends on !XEN config GENERIC_CLOCKEVENTS_BROADCAST def_bool y depends on X86_64 || (X86_32 && X86_LOCAL_APIC) + depends on !XEN config LOCKDEP_SUPPORT def_bool y @@ -157,6 +160,7 @@ config ARCH_HIBERNATION_POSSIBLE def_bool y + depends on !XEN config ARCH_SUSPEND_POSSIBLE def_bool y @@ -213,20 +217,34 @@ config X86_HT bool - depends on SMP + depends on SMP && !XEN default y config X86_TRAMPOLINE bool depends on SMP || (64BIT && ACPI_SLEEP) + depends on !XEN default y +config X86_NO_TSS + def_bool y + depends on XEN + +config X86_NO_IDT + def_bool y + depends on XEN + config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR config KTIME_SCALAR def_bool X86_32 + +config ARCH_CPU_PROBE_RELEASE + def_bool y + depends on HOTPLUG_CPU && !XEN + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -298,13 +316,25 @@ For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it +config X86_XEN + bool "Xen-compatible" + depends on X86_32 + select XEN + select X86_PAE + select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST + select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST + select SWIOTLB + help + Choose this option if you plan to run this kernel on top of the + Xen Hypervisor. + config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" - depends on X86_32 && SMP + depends on X86_32 && SMP && !XEN ---help--- This option is needed for the systems that have more than 8 CPUs -if X86_32 +if X86_32 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -327,7 +357,15 @@ generic distribution kernel, say Y here - otherwise say N. endif -if X86_64 +config X86_64_XEN + bool "Enable Xen compatible kernel" + depends on X86_64 + select XEN + select SWIOTLB + help + This option will compile a kernel compatible with Xen hypervisor + +if X86_64 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -370,6 +408,12 @@ # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions +config X86_LPIA + bool "LPIA-compatible" + depends on X86_32 && X86_PC + help + Choose this option if your computer is an LPIA platform. + config X86_ELAN bool "AMD Elan" depends on X86_32 @@ -480,6 +524,7 @@ menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" + depends on !XEN ---help--- Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -565,6 +610,7 @@ config MEMTEST bool "Memtest" + depends on !XEN ---help--- This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -587,6 +633,7 @@ config HPET_TIMER def_bool X86_64 prompt "HPET Timer Support" if X86_32 + depends on !XEN ---help--- Use the IA-PC HPET (High Precision Event Timer) to manage time in preference to the PIT and RTC, if a HPET is @@ -612,6 +659,7 @@ config DMI default y bool "Enable DMI scanning" if EMBEDDED + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Enabled scanning of DMI to identify machine quirks. Say Y here unless you have verified that your setup is not @@ -622,7 +670,7 @@ bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI + depends on X86_64 && PCI && K8_NB && !X86_64_XEN ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. This is usually needed for USB, @@ -637,7 +685,7 @@ config CALGARY_IOMMU bool "IBM Calgary IOMMU support" select SWIOTLB - depends on X86_64 && PCI && EXPERIMENTAL + depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL ---help--- Support for hardware IOMMUs in IBM's xSeries x366 and x460 systems. Needed to run systems with more than 3GB of memory @@ -667,7 +715,7 @@ bool "AMD IOMMU support" select SWIOTLB select PCI_MSI - depends on X86_64 && PCI && ACPI + depends on X86_64 && PCI && ACPI && !XEN ---help--- With this option you can enable support for AMD IOMMU hardware in your system. An IOMMU is a hardware component which provides @@ -716,11 +764,12 @@ config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN range 2 512 if SMP && !MAXSMP default "1" if !SMP default "4096" if MAXSMP default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "16" if X86_64_XEN default "8" if SMP ---help--- This allows you to specify the maximum number of CPUs which this @@ -748,11 +797,22 @@ making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + default n + ---help--- + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -778,10 +838,12 @@ config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_VISWS_APIC def_bool y @@ -790,7 +852,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" default n - depends on X86_IO_APIC + depends on X86_IO_APIC && !XEN ---help--- This option enables a workaround that fixes a source of spurious interrupts. This is recommended when threaded @@ -813,6 +875,7 @@ config X86_MCE bool "Machine Check / overheating reporting" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). @@ -822,22 +885,30 @@ config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for intel specific MCE features such as the thermal monitor. +config X86_MCE_XEON75XX + tristate "Intel Xeon 7500 series corrected memory error driver" + depends on X86_MCE_INTEL && PCI + ---help--- + Add support for a Intel Xeon 7500 series specific memory error driver. + This allows to report the DIMM and physical address on a corrected + memory error machine check event. + config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. config X86_ANCIENT_MCE def_bool n - depends on X86_32 && X86_MCE + depends on X86_32 && X86_MCE && !XEN prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip @@ -857,6 +928,10 @@ If you don't know what a machine check is and you don't do kernel QA it is safe to say n. +config X86_XEN_MCE + def_bool y + depends on XEN && X86_MCE + config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL @@ -866,10 +941,29 @@ default y depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run 16-bit legacy - code on X86 processors. It also may be needed by software like - XFree86 to initialize some video cards via BIOS. Disabling this - option saves about 6k. + This option is required by programs like DOSEMU to run + 16-bit real mode legacy code on x86 processors. It also may + be needed by software like XFree86 to initialize some video + cards via BIOS. Disabling this option saves about 6K. + +config X86_16BIT + bool "Enable support for 16-bit segments" + default y + ---help--- + This option is required by programs like Wine to run 16-bit + protected mode legacy code on x86 processors. Disabling + this option saves about 300 bytes on i386, or around 6K text + plus 16K runtime memory on x86-64, + +config X86_ESPFIX32 + def_bool y + depends on X86_16BIT && X86_32 + depends on !XEN + +config X86_ESPFIX64 + def_bool y + depends on X86_16BIT && X86_64 + depends on !XEN config TOSHIBA tristate "Toshiba Laptop support" @@ -909,7 +1003,7 @@ config X86_REBOOTFIXUPS bool "Enable X86 board specific fixups for reboot" - depends on X86_32 + depends on X86_32 && !XEN ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -926,6 +1020,7 @@ config MICROCODE tristate "/dev/cpu/microcode - microcode support" + depends on !XEN_UNPRIVILEGED_GUEST select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on @@ -944,7 +1039,7 @@ config MICROCODE_INTEL bool "Intel microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN default MICROCODE select FW_LOADER ---help--- @@ -957,7 +1052,7 @@ config MICROCODE_AMD bool "AMD microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN select FW_LOADER ---help--- If you select this option, microcode patch loading support for AMD @@ -984,12 +1079,6 @@ with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -config X86_CPU_DEBUG - tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support" - ---help--- - If you select this option, this will provide various x86 CPUs - information through debugfs. - choice prompt "High Memory Support" default HIGHMEM4G if !X86_NUMAQ @@ -1113,7 +1202,7 @@ config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EMBEDDED default y - depends on X86_64 + depends on X86_64 && !XEN ---help--- Allow the kernel linear mapping to use 1GB pages on CPUs that support it. This can improve the kernel's performance a tiny bit by @@ -1122,7 +1211,7 @@ # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && !XEN depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) ---help--- @@ -1226,11 +1315,12 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on X86_64 + depends on X86_64 && !X86_64_XEN config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD + depends on !XEN select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -1242,6 +1332,11 @@ def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE @@ -1255,6 +1350,7 @@ config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" + depends on !XEN ---help--- Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. Even when enabled in the @@ -1285,6 +1381,7 @@ config X86_RESERVE_LOW_64K bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" + depends on !XEN default y ---help--- Reserve the first 64K of physical RAM on BIOSes that are known @@ -1306,6 +1403,7 @@ config MATH_EMULATION bool prompt "Math emulation" if X86_32 + depends on !XEN ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -1331,6 +1429,7 @@ config MTRR bool "MTRR (Memory Type Range Register) support" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -1366,7 +1465,7 @@ config MTRR_SANITIZER def_bool y prompt "MTRR cleanup support" - depends on MTRR + depends on MTRR && !XEN ---help--- Convert MTRR layout from continuous to discrete, so X drivers can add writeback entries. @@ -1413,9 +1512,18 @@ def_bool y depends on X86_PAT +config ARCH_RANDOM + def_bool y + prompt "x86 architectural random number generator" if EMBEDDED + ---help--- + Enable the x86 architectural RDRAND instruction + (Intel Bull Mountain technology) to generate random numbers. + If supported, this is a high bandwidth, cryptographically + secure hardware random number generator. + config EFI bool "EFI runtime service support" - depends on ACPI + depends on ACPI && !XEN ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1463,6 +1571,7 @@ config KEXEC bool "kexec system call" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1480,6 +1589,7 @@ config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN ---help--- Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels @@ -1500,7 +1610,8 @@ code in physical address mode via KEXEC config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP || XEN) + default 0x100000 if XEN default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. @@ -1542,6 +1653,7 @@ config RELOCATABLE bool "Build a relocatable kernel" + depends on !XEN default y ---help--- This builds a kernel image that retains relocation information @@ -1564,7 +1676,8 @@ config PHYSICAL_ALIGN hex - prompt "Alignment value to which kernel should be aligned" if X86_32 + prompt "Alignment value to which kernel should be aligned" if X86_32 && !XEN + default 0x2000 if XEN default "0x1000000" range 0x2000 0x1000000 ---help--- @@ -1659,6 +1772,7 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y @@ -1669,6 +1783,7 @@ depends on NUMA menu "Power management and ACPI options" + depends on !XEN_UNPRIVILEGED_GUEST config ARCH_HIBERNATION_HEADER def_bool y @@ -1687,7 +1802,7 @@ menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on X86_32 && PM_SLEEP + depends on X86_32 && PM_SLEEP && !XEN ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1821,6 +1936,7 @@ bool "PCI support" default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) + select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Find out whether you have a PCI motherboard. PCI is the name of a bus system, i.e. the way the CPU talks to the other stuff inside @@ -1848,6 +1964,7 @@ config PCI_GOBIOS bool "BIOS" + depends on !XEN config PCI_GOMMCONFIG bool "MMConfig" @@ -1859,6 +1976,13 @@ bool "OLPC" depends on OLPC +config PCI_GOXEN_FE + bool "Xen PCI Frontend" + depends on X86_XEN + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. + config PCI_GOANY bool "Any" @@ -1866,7 +1990,7 @@ config PCI_BIOS def_bool y - depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY) + depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY) # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT @@ -1889,9 +2013,24 @@ bool "Support mmconfig PCI config space access" depends on X86_64 && PCI && ACPI +config XEN_PCIDEV_FRONTEND + def_bool y + prompt "Xen PCI Frontend" if X86_64 + depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64) + select HOTPLUG + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. + +config XEN_PCIDEV_FE_DEBUG + bool "Xen PCI Frontend Debugging" + depends on XEN_PCIDEV_FRONTEND + help + Enables some debug statements within the PCI Frontend. + config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" - depends on PCI_MSI && ACPI && EXPERIMENTAL + depends on PCI_MSI && ACPI && !XEN && EXPERIMENTAL help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. @@ -1933,7 +2072,7 @@ config INTR_REMAP bool "Support for Interrupt Remapping (EXPERIMENTAL)" - depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && !XEN && EXPERIMENTAL ---help--- Supports Interrupt remapping for IO-APIC and MSI devices. To use x2apic mode in the CPU's which support x2APIC enhancements or @@ -1951,6 +2090,7 @@ config ISA bool "ISA support" + depends on !XEN ---help--- Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -1978,6 +2118,7 @@ config MCA bool "MCA support" + depends on !XEN ---help--- MicroChannel Architecture is found in some IBM PS/2 machines and laptops. It is a bus system similar to PCI or ISA. See @@ -2028,7 +2169,7 @@ config K8_NB def_bool y - depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) + depends on CPU_SUP_AMD && PCI source "drivers/pcmcia/Kconfig" @@ -2079,7 +2220,11 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + +if !XEN_UNPRIVILEGED_GUEST source "drivers/firmware/Kconfig" +endif source "fs/Kconfig" --- linux-ec2-2.6.32.orig/arch/x86/Makefile_32.cpu +++ linux-ec2-2.6.32/arch/x86/Makefile_32.cpu @@ -46,6 +46,13 @@ # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Work around the pentium-mmx code generator madness of gcc4.4.x which +# does stack alignment by generating horrible code _before_ the mcount +# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph +# tracer assumptions. For i686, generic, core2 this is set by the +# compiler anyway +cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) + # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. ifneq ($(CONFIG_X86_P6_NOP),y) --- linux-ec2-2.6.32.orig/arch/x86/boot/Makefile +++ linux-ec2-2.6.32/arch/x86/boot/Makefile @@ -23,6 +23,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage +targets += vmlinuz vmlinux-stripped targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed @@ -195,6 +196,20 @@ cp System.map $(INSTALL_PATH)/ if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi +$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE + $(call if_changed,gzip) + @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + +$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded +$(obj)/vmlinux-stripped: vmlinux FORCE + $(call if_changed,objcopy) + +ifndef CONFIG_XEN +bzImage := bzImage +else +bzImage := vmlinuz +endif + install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \ System.map "$(INSTALL_PATH)" --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/system.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/system.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_SYSTEM_H +#define _ASM_X86_SYSTEM_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#else +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + ".globl thread_return\n" \ + "thread_return:\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (per_cpu_var(current_task)) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) +#endif + +#ifdef __KERNEL__ + +extern void xen_load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg, value) \ + asm volatile("\n" \ + "1:\t" \ + "movl %k0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "movl %k1, %%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b,3b) \ + : :"r" (value), "r" (0) : "memory") + + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +static inline void xen_clts(void) +{ + HYPERVISOR_fpu_taskswitch(0); +} + +static inline void xen_stts(void) +{ + HYPERVISOR_fpu_taskswitch(1); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long xen_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void xen_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +#define xen_read_cr2() vcpu_info_read(arch.cr2) +#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val) + +static inline unsigned long xen_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); +#ifdef CONFIG_X86_32 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; +#else + return machine_to_phys(val); +#endif +} + +static inline void xen_write_cr3(unsigned long val) +{ +#ifdef CONFIG_X86_32 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); +#else + val = phys_to_machine(val); +#endif + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long xen_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +#define xen_read_cr4_safe() xen_read_cr4() + +static inline void xen_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} + +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif + +static inline void xen_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +#define read_cr0() (xen_read_cr0()) +#define write_cr0(x) (xen_write_cr0(x)) +#define read_cr2() (xen_read_cr2()) +#define write_cr2(x) (xen_write_cr2(x)) +#define read_cr3() (xen_read_cr3()) +#define write_cr3(x) (xen_write_cr3(x)) +#define read_cr4() (xen_read_cr4()) +#define read_cr4_safe() (xen_read_cr4_safe()) +#define write_cr4(x) (xen_write_cr4(x)) +#define wbinvd() (xen_wbinvd()) +#ifdef CONFIG_X86_64 +#define read_cr8() (xen_read_cr8()) +#define write_cr8(x) (xen_write_cr8(x)) +#define load_gs_index xen_load_gs_index +#endif + +/* Clear the 'TS' bit */ +#define clts() (xen_clts()) +#define stts() (xen_stts()) + +#endif /* __KERNEL__ */ + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void xen_idle(void); + +void stop_this_cpu(void *dummy); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_SYSTEM_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/agp.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/agp.h @@ -0,0 +1,41 @@ +#ifndef _ASM_X86_AGP_H +#define _ASM_X86_AGP_H + +#include +#include +#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) ( \ + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ + ?: set_pages_uc(page, 1)) +#define unmap_page_from_agp(page) ( \ + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_wb(page, 1)) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +#define virt_to_gart virt_to_machine + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) ({ \ + char *_t; dma_addr_t _d; \ + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ + _t; }) +#define free_gatt_pages(table, order) \ + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) + +#endif /* _ASM_X86_AGP_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pci.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pci.h @@ -0,0 +1,162 @@ +#ifndef _ASM_X86_PCI_H +#define _ASM_X86_PCI_H + +#include /* for struct page */ +#include +#include +#include +#include +#include + +#ifdef __KERNEL__ + +struct pci_sysdata { + int domain; /* PCI domain */ + int node; /* NUMA node */ +#ifdef CONFIG_X86_64 + void *iommu; /* IOMMU private data */ +#endif +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + struct pcifront_device *pdev; +#endif +}; + +extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; + +/* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); +extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); + +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + return sd->domain; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} + + +/* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ + +#ifdef CONFIG_PCI +extern unsigned int pcibios_assign_all_busses(void); +#else +#define pcibios_assign_all_busses() 0 +#endif + +#include +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + +void pcibios_config_init(void); +struct pci_bus *pcibios_scan_root(int bus); + +void pcibios_set_master(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq, int active); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine); + + +#ifdef CONFIG_PCI +extern void early_quirks(void); +static inline void pci_dma_burst_advice(struct pci_dev *pdev, + enum pci_dma_burst_strategy *strat, + unsigned long *strategy_parameter) +{ + *strat = PCI_DMA_BURST_INFINITY; + *strategy_parameter = ~0UL; +} +#else +static inline void early_quirks(void) { } +#endif + +extern void pci_iommu_alloc(void); + +/* MSI arch hooks */ +#define arch_setup_msi_irqs arch_setup_msi_irqs +#define arch_teardown_msi_irqs arch_teardown_msi_irqs + +#define PCI_DMA_BUS_IS_PHYS 0 + +#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) \ + || defined(CONFIG_SWIOTLB) + +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ + dma_addr_t ADDR_NAME; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ + __u32 LEN_NAME; +#define pci_unmap_addr(PTR, ADDR_NAME) \ + ((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + (((PTR)->ADDR_NAME) = (VAL)) +#define pci_unmap_len(PTR, LEN_NAME) \ + ((PTR)->LEN_NAME) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + (((PTR)->LEN_NAME) = (VAL)) + +#else + +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + do { break; } while (pci_unmap_len(PTR, LEN_NAME)) + +#endif + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +#include "../../asm/pci_64.h" +#endif + +/* implement the pci_ DMA API in terms of the generic device dma_ one */ +#include + +/* generic pci stuff */ +#include +#define PCIBIOS_MAX_MEM_32 0xffffffff + +#ifdef CONFIG_NUMA +/* Returns the node based on pci bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + const struct pci_sysdata *sd = bus->sysdata; + + return sd->node; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? cpu_online_mask : + cpumask_of_node(node); +} +#endif + +#endif /* _ASM_X86_PCI_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_types.h @@ -0,0 +1,385 @@ +#ifndef _ASM_X86_PGTABLE_DEFS_H +#define _ASM_X86_PGTABLE_DEFS_H + +#include +#include + +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define __HAVE_ARCH_PTE_SPECIAL + +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#else +#define _PAGE_NX (_AT(pteval_t, 0)) +#endif + +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | __kernel_page_user) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + +/* + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). + */ +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WT (_PAGE_PWT) +#define _PAGE_CACHE_WC (_PAGE_PAT) +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +/* + * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection + * bits are combined, this will alow user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + +#ifdef CONFIG_X86_32 +# include +#else +# include "pgtable_64_types.h" +#endif + +#ifndef __ASSEMBLY__ + +#include + +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; + +#include + +typedef struct { pgdval_t pgd; } pgd_t; + +#define __pgd_ma(x) ((pgd_t) { (x) } ) +static inline pgd_t xen_make_pgd(pgdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pgd_t) { val }; +} + +#define __pgd_val(x) ((x).pgd) +static inline pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = __pgd_val(pgd); +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} + +static inline pgdval_t pgd_flags(pgd_t pgd) +{ + return __pgd_val(pgd) & PTE_FLAGS_MASK; +} + +#if PAGETABLE_LEVELS > 3 +typedef struct { pudval_t pud; } pud_t; + +#define __pud_ma(x) ((pud_t) { (x) } ) +static inline pud_t xen_make_pud(pudval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pud_t) { val }; +} + +#define __pud_val(x) ((x).pud) +static inline pudval_t xen_pud_val(pud_t pud) +{ + pudval_t ret = __pud_val(pud); + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} +#else +#include + +#define __pud_val(x) __pgd_val((x).pgd) +static inline pudval_t xen_pud_val(pud_t pud) +{ + return xen_pgd_val(pud.pgd); +} +#endif + +#if PAGETABLE_LEVELS > 2 +typedef struct { pmdval_t pmd; } pmd_t; + +#define __pmd_ma(x) ((pmd_t) { (x) } ) +static inline pmd_t xen_make_pmd(pmdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pmd_t) { val }; +} + +#define __pmd_val(x) ((x).pmd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = __pmd_val(pmd); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} +#else +#include + +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } ) +#define __pmd_val(x) __pgd_val((x).pud.pgd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + return xen_pgd_val(pmd.pud.pgd); +} +#endif + +static inline pudval_t pud_flags(pud_t pud) +{ + return __pud_val(pud) & PTE_FLAGS_MASK; +} + +static inline pmdval_t pmd_flags(pmd_t pmd) +{ + return __pmd_val(pmd) & PTE_FLAGS_MASK; +} + +#define __pte_ma(x) ((pte_t) { .pte = (x) } ) +static inline pte_t xen_make_pte(pteval_t val) +{ + if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pte_t) { .pte = val }; +} + +#define __pte_val(x) ((x).pte) +static inline pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = __pte_val(pte); + if (likely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + return __pte_val(pte) & PTE_FLAGS_MASK; +} + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + + +typedef struct page *pgtable_t; + +extern pteval_t __supported_pte_mask; +extern void set_nx(void); +extern int nx_enabled; + +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); + +/* Install a pte for a particular vaddr in kernel space. */ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + +enum { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_NUM +}; + +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. + */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_32.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_32.h @@ -0,0 +1,99 @@ +#ifndef _ASM_X86_PGTABLE_32_H +#define _ASM_X86_PGTABLE_32_H + +#include + +/* + * The Linux memory management assumes a three-level page table setup. On + * the i386, we use that, but "fold" the mid level into the top-level page + * table, so that we physically have the same two-level page table as the + * i386 mmu expects. + * + * This file contains the functions and defines necessary to modify and use + * the i386 page table tree. + */ +#ifndef __ASSEMBLY__ +#include +#include +#include + +#include +#include +#include +#include +#include + +struct vm_area_struct; + +extern pgd_t *swapper_pg_dir; + +static inline void pgtable_cache_init(void) { } +static inline void check_pgt_cache(void) { } +void paging_init(void); + +extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); + + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK + +#ifdef CONFIG_X86_PAE +# include +#else +# include +#endif + +#if defined(CONFIG_HIGHPTE) +#define __KM_PTE \ + (in_nmi() ? KM_NMI_PTE : \ + in_irq() ? KM_IRQ_PTE : \ + KM_PTE0) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + pte_index((address))) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + pte_index((address))) +#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) +#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#endif + +/* Clear a kernel PTE and flush it from the TLB */ +#define kpte_clear_flush(ptep, vaddr) \ +do { \ + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \ + BUG(); \ +} while (0) + +/* + * The i386 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +#define update_mmu_cache(vma, address, pte) do { } while (0) + +void make_lowmem_page_readonly(void *va, unsigned int feature); +void make_lowmem_page_writable(void *va, unsigned int feature); + +#endif /* !__ASSEMBLY__ */ + +/* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM + */ +#ifdef CONFIG_FLATMEM +#define kern_addr_valid(addr) (1) +#else +#define kern_addr_valid(kaddr) (0) +#endif + +#endif /* _ASM_X86_PGTABLE_32_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypercall.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypercall.h @@ -0,0 +1,430 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu + * Jun Nakajima + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#ifndef __HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#if CONFIG_XEN_COMPAT <= 0x030002 +# include /* memcpy() */ +#endif + +#ifdef CONFIG_XEN +#define HYPERCALL_ASM_OPERAND "%c" +#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#else +#define HYPERCALL_ASM_OPERAND "*%" +#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#endif + +#define HYPERCALL_ARG(arg, n) \ + register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg) + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "1" \ + : "=a" (__res) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, arg) \ +({ \ + type __res; \ + HYPERCALL_ARG(arg, 1); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "2" \ + : "=a" (__res), "+r" (__arg1) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "3" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "4" \ + : "=a" (__res), "+r" (__arg1), \ + "+r" (__arg2), "+r" (__arg3) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "5" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall(type, op, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call *%6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : "g" (HYPERCALL_LOCATION(op)) \ + : "memory" ); \ + __res; \ +}) + +#ifdef CONFIG_X86_32 +# include "hypercall_32.h" +#else +# include "hypercall_64.h" +#endif + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmu_update(req, count, success_count, domid); + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmuext_op(op, count, success_count, domid); + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} + +struct xen_mc; +static inline int __must_check +HYPERVISOR_mca( + struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} + +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + bool fixup = false; + int rc; + + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); +#ifdef GNTTABOP_map_grant_ref + if (cmd == GNTTABOP_map_grant_ref) +#endif + fixup = gnttab_pre_map_adjust(cmd, uop, count); + rc = _hypercall3(int, grant_table_op, cmd, uop, count); + if (rc == 0 && fixup) + rc = gnttab_post_map_adjust(uop, count); + return rc; +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +static inline int __must_check +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + +#endif /* __HYPERCALL_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/smp-processor-id.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/smp-processor-id.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_SMP_PROCESSOR_ID_H +#define _ASM_X86_SMP_PROCESSOR_ID_H + +#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__) + +#include + +DECLARE_PER_CPU(int, cpu_number); + +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. + */ +#define raw_smp_processor_id() percpu_read(cpu_number) +#define safe_smp_processor_id() smp_processor_id() + +#ifdef CONFIG_X86_64_SMP +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() +#else +# define smp_processor_id() raw_smp_processor_id() +#endif + +#endif /* SMP && !__ASSEMBLY__ */ + +#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/xor_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/xor_64.h @@ -0,0 +1,337 @@ +#ifndef _ASM_X86_XOR_64_H +#define _ASM_X86_XOR_64_H + +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +typedef struct { + unsigned long a, b; +} __attribute__((aligned(16))) xmm_store_t; + +/* Doesn't use gcc to save the XMM registers, because there is no easy way to + tell it to do a clts before the register saving. */ +#define XMMS_SAVE \ +do { \ + preempt_disable(); \ + if (!(current_thread_info()->status & TS_USEDFPU)) \ + clts(); \ + asm volatile( \ + "movups %%xmm0,(%1) ;\n\t" \ + "movups %%xmm1,0x10(%1) ;\n\t" \ + "movups %%xmm2,0x20(%1) ;\n\t" \ + "movups %%xmm3,0x30(%1) ;\n\t" \ + : "=&r" (cr0) \ + : "r" (xmm_save) \ + : "memory"); \ +} while (0) + +#define XMMS_RESTORE \ +do { \ + asm volatile( \ + "sfence ;\n\t" \ + "movups (%1),%%xmm0 ;\n\t" \ + "movups 0x10(%1),%%xmm1 ;\n\t" \ + "movups 0x20(%1),%%xmm2 ;\n\t" \ + "movups 0x30(%1),%%xmm3 ;\n\t" \ + : \ + : "r" (cr0), "r" (xmm_save) \ + : "memory"); \ + if (!(current_thread_info()->status & TS_USEDFPU)) \ + stts(); \ + preempt_enable(); \ +} while (0) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned int lines = bytes >> 8; + unsigned long cr0; + xmm_store_t xmm_save[4]; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " decl %[cnt] ; jnz 1b" + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] "r" (256UL) + : "memory"); + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] "r" (256UL) + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p5] "+r" (p5) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_sse = { + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_sse); \ +} while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) + +#endif /* _ASM_X86_XOR_64_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/maddr_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/maddr_64.h @@ -0,0 +1,163 @@ +#ifndef _X86_64_MADDR_H +#define _X86_64_MADDR_H + +#include +#include +#include + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<63) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. */ +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; + +#ifdef CONFIG_XEN + +extern unsigned long *phys_to_machine_mapping; +extern unsigned long max_mapnr; + +#undef machine_to_phys_mapping +extern unsigned long *machine_to_phys_mapping; +extern unsigned int machine_to_phys_order; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return 1; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return mfn; + + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return max_mapnr; + + /* The array access can fail (e.g., device space beyond end of RAM). */ + asm ( + "1: movq %1,%0\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movq %2,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 1b,3b\n" + ".previous" + : "=r" (pfn) + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); + + return pfn; +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. + */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if (likely(pfn < max_mapnr) + && likely(!xen_feature(XENFEAT_auto_translated_physmap)) + && unlikely(phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + maddr_t machine; + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + paddr_t phys; + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#endif /* _X86_64_MADDR_H */ + --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/swiotlb.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/swiotlb.h @@ -0,0 +1,4 @@ +#include_next + +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, + int dir); --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/highmem.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/highmem.h @@ -0,0 +1,98 @@ +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#ifndef _ASM_X86_HIGHMEM_H +#define _ASM_X86_HIGHMEM_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +/* + * Ordering is: + * + * FIXADDR_TOP + * fixed_addresses + * FIXADDR_START + * temp fixed addresses + * FIXADDR_BOOT_START + * Persistent kmap area + * PKMAP_BASE + * VMALLOC_END + * Vmalloc area + * VMALLOC_START + * high_memory + */ +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); + +void *kmap(struct page *page); +void kunmap(struct page *page); +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); +void *kmap_atomic(struct page *page, enum km_type type); +void *kmap_atomic_pte(struct page *page, enum km_type type); +void kunmap_atomic(void *kvaddr, enum km_type type); +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +struct page *kmap_atomic_to_page(void *ptr); + +#define kmap_atomic_pte(page, type) \ + kmap_atomic_prot(page, type, \ + PagePinned(page) ? PAGE_KERNEL_RO : kmap_prot) + +#define flush_cache_kmaps() do { } while (0) + +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +void clear_highpage(struct page *); +static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +{ + clear_highpage(page); +} +#define __HAVE_ARCH_CLEAR_HIGHPAGE +#define clear_user_highpage clear_user_highpage +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE + +void copy_highpage(struct page *to, struct page *from); +static inline void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_highpage(to, from); +} +#define __HAVE_ARCH_COPY_HIGHPAGE +#define __HAVE_ARCH_COPY_USER_HIGHPAGE + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_HIGHMEM_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypercall_32.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypercall_32.h @@ -0,0 +1,62 @@ +#define HYPERCALL_arg1 "ebx" +#define HYPERCALL_arg2 "ecx" +#define HYPERCALL_arg3 "edx" +#define HYPERCALL_arg4 "esi" +#define HYPERCALL_arg5 "edi" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall2(long, set_timer_op, + (unsigned long)timeout, + (unsigned long)(timeout>>32)); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, + (unsigned long)ma, (unsigned long)(ma>>32), + (unsigned long)desc, (unsigned long)(desc>>32)); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; + + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + unsigned long pte_hi = 0; +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte_low, pte_hi, flags, domid); +} --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/xenoprof.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/xenoprof.h @@ -0,0 +1,48 @@ +/****************************************************************************** + * asm-i386/mach-xen/asm/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef __ASM_XENOPROF_H__ +#define __ASM_XENOPROF_H__ +#ifdef CONFIG_XEN + +struct super_block; +struct dentry; +int xenoprof_create_files(struct super_block * sb, struct dentry * root); +#define HAVE_XENOPROF_CREATE_FILES + +struct xenoprof_init; +void xenoprof_arch_init_counter(struct xenoprof_init *init); +void xenoprof_arch_counter(void); +void xenoprof_arch_start(void); +void xenoprof_arch_stop(void); + +struct xenoprof_arch_shared_buffer { + /* nothing */ +}; +struct xenoprof_shared_buffer; +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); +struct xenoprof_get_buffer; +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); +struct xenoprof_passive; +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); + +#endif /* CONFIG_XEN */ +#endif /* __ASM_XENOPROF_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/mach_traps.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/mach_traps.h @@ -0,0 +1,33 @@ +/* + * include/asm-xen/asm-i386/mach-xen/mach_traps.h + * + * Machine specific NMI handling for Xen + */ +#ifndef _MACH_TRAPS_H +#define _MACH_TRAPS_H + +#include +#include + +static inline void clear_mem_error(unsigned char reason) {} +static inline void clear_io_check_error(unsigned char reason) {} + +static inline unsigned char get_nmi_reason(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned char reason = 0; + + /* construct a value which looks like it came from + * port 0x61. + */ + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason)) + reason |= 0x40; + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason)) + reason |= 0x80; + + return reason; +} + +static inline void reassert_nmi(void) {} + +#endif /* !_MACH_TRAPS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/smp.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/smp.h @@ -0,0 +1,221 @@ +#ifndef _ASM_X86_SMP_H +#define _ASM_X86_SMP_H +#ifndef __ASSEMBLY__ +#include +#include +#include + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifdef CONFIG_X86_LOCAL_APIC +# include +# include +# ifdef CONFIG_X86_IO_APIC +# include +# endif +#endif +#include +#include + +extern int smp_num_siblings; +extern unsigned int num_processors; + +DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +DECLARE_PER_CPU(u16, cpu_llc_id); +DECLARE_PER_CPU(int, cpu_number); + +static inline struct cpumask *cpu_sibling_mask(int cpu) +{ + return per_cpu(cpu_sibling_map, cpu); +} + +static inline struct cpumask *cpu_core_mask(int cpu) +{ + return per_cpu(cpu_core_map, cpu); +} + +DECLARE_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); + +#ifdef CONFIG_SMP + +#ifndef CONFIG_XEN + +/* Static state in head.S used to set up a CPU */ +extern struct { + void *sp; + unsigned short ss; +} stack_start; + +struct smp_ops { + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*stop_other_cpus)(int wait); + void (*smp_send_reschedule)(int cpu); + + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + + void (*send_call_func_ipi)(const struct cpumask *mask); + void (*send_call_func_single_ipi)(int cpu); +}; + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + +#ifndef CONFIG_PARAVIRT +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0) +#endif +extern struct smp_ops smp_ops; + +static inline void smp_send_stop(void) +{ + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); +} + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} + +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void __cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} + +static inline void arch_send_call_function_single_ipi(int cpu) +{ + smp_ops.send_call_func_single_ipi(cpu); +} + +static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + smp_ops.send_call_func_ipi(mask); +} + +void cpu_disable_common(void); +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +void native_smp_cpus_done(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); +int native_cpu_disable(void); +void native_cpu_die(unsigned int cpu); +void native_play_dead(void); +void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); + +#else /* CONFIG_XEN */ + +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +void xen_stop_other_cpus(int wait); +void xen_smp_send_reschedule(int cpu); +void xen_send_call_func_ipi(const struct cpumask *mask); +void xen_send_call_func_single_ipi(int cpu); + +static inline void smp_send_stop(void) +{ + xen_stop_other_cpus(0); +} + +#define smp_send_reschedule xen_smp_send_reschedule +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi +#define arch_send_call_function_ipi_mask xen_send_call_func_ipi + +void play_dead(void); + +#endif /* CONFIG_XEN */ + +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); + +void smp_store_cpu_info(int id); +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) + +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ +static inline int num_booting_cpus(void) +{ + return cpumask_weight(cpu_callout_mask); +} +#elif /* !CONFIG_SMP */ !defined(CONFIG_XEN) +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_XEN +int wbinvd_on_all_cpus(void); +#endif + +extern unsigned disabled_cpus __cpuinitdata; + +#include + +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + +#ifndef CONFIG_X86_64 +static inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + +#endif + +extern int hard_smp_processor_id(void); + +#else /* CONFIG_X86_LOCAL_APIC */ + +# ifndef CONFIG_SMP +# define hard_smp_processor_id() 0 +# endif + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_SMP_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/mmu_context.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/mmu_context.h @@ -0,0 +1,164 @@ +#ifndef _ASM_X86_MMU_CONTEXT_H +#define _ASM_X86_MMU_CONTEXT_H + +#include +#include +#include +#include + +void arch_exit_mmap(struct mm_struct *mm); +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); + +void mm_pin(struct mm_struct *mm); +void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void xen_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + if (!PagePinned(virt_to_page(next->pgd))) + mm_pin(next); +} + +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ +#ifdef CONFIG_X86_32 + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. + */ + lazy_save_gs(current->thread.gs); + lazy_load_gs(__KERNEL_STACK_CANARY); +#else + /* + * Save away %es, %ds, %fs and %gs. Must happen before reload + * of cr3/ldt (i.e., not in __switch_to). + */ + __asm__ __volatile__ ( + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" + : "=m" (current->thread.es), + "=m" (current->thread.ds), + "=m" (current->thread.fsindex), + "=m" (current->thread.gsindex) ); + + if (current->thread.ds) + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); + + if (current->thread.es) + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); + + if (current->thread.fsindex) { + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); + current->thread.fs = 0; + } + + if (current->thread.gsindex) { + load_gs_index(0); + current->thread.gs = 0; + } +#endif +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; +#ifdef CONFIG_X86_64 + pgd_t *upgd; +#endif + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !PagePinned(virt_to_page(next->pgd))); + +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* Re-load page tables: load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = virt_to_mfn(next->pgd); + op++; + + /* xen_new_user_pt(next->pgd) */ +#ifdef CONFIG_X86_64 + op->cmd = MMUEXT_NEW_USER_BASEPTR; + upgd = __user_pgd(next->pgd); + op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0; + op++; +#endif + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) { + /* load_LDT_nolock(&next->context) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + } +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + else { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + */ + load_cr3(next->pgd); + xen_new_user_pt(next->pgd); + load_LDT_nolock(&next->context); + } + } +#endif +} + +#define activate_mm(prev, next) \ +do { \ + xen_activate_mm(prev, next); \ + switch_mm((prev), (next), NULL); \ +} while (0); + +#ifdef CONFIG_X86_32 +#define deactivate_mm(tsk, mm) \ +do { \ + lazy_load_gs(0); \ +} while (0) +#else +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + loadsegment(fs, 0); \ +} while (0) +#endif + +#endif /* _ASM_X86_MMU_CONTEXT_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/xor.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/xor.h @@ -0,0 +1,8 @@ +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include +#elif defined(CONFIG_X86_32) +# include "../../asm/xor_32.h" +#else +# include "xor_64.h" +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/irqflags.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/irqflags.h @@ -0,0 +1,223 @@ +#ifndef _X86_IRQFLAGS_H_ +#define _X86_IRQFLAGS_H_ + +#include + +#ifndef __ASSEMBLY__ +/* + * The use of 'barrier' in the following reflects their use as local-lock + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following + * critical operations are executed. All critical operations must complete + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also + * includes these barriers, for example. + */ + +#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask) + +#define xen_restore_fl(f) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ + barrier(); /* unmask then check (avoid races) */\ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define xen_irq_disable() \ +do { \ + vcpu_info_write(evtchn_upcall_mask, 1); \ + barrier(); \ +} while (0) + +#define xen_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +void xen_safe_halt(void); + +void xen_halt(void); + +#define __raw_local_save_flags() xen_save_fl() + +#define raw_local_irq_restore(flags) xen_restore_fl(flags) + +#define raw_local_irq_disable() xen_irq_disable() + +#define raw_local_irq_enable() xen_irq_enable() + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static inline void raw_safe_halt(void) +{ + xen_safe_halt(); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static inline void halt(void) +{ + xen_halt(); +} + +/* + * For spinlocks, etc: + */ +#define __raw_local_irq_save() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_local_irq_disable(); \ + \ + flags; \ +}) +#else + +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#ifdef CONFIG_X86_64 +# define __REG_si %rsi +# define __CPU_num PER_CPU_VAR(cpu_number) +#else +# define __REG_si %esi +# define __CPU_num TI_cpu(%ebp) +#endif + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + +#define GET_VCPU_INFO PER_CPU(vcpu_info, __REG_si) +#define __DISABLE_INTERRUPTS movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __ENABLE_INTERRUPTS movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __TEST_PENDING cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0) +#define DISABLE_INTERRUPTS(clb) __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 8 +#define __SIZEOF_TEST_PENDING 8 + +#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl __CPU_num,%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + add HYPERVISOR_shared_info,__REG_si +#else +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si) +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 4 +#define __SIZEOF_TEST_PENDING 3 + +#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#ifndef CONFIG_X86_64 +#define INTERRUPT_RETURN iret +#define ENABLE_INTERRUPTS_SYSEXIT \ + movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */ ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */ ; \ + jnz 14f /* process more events if necessary... */ ; \ + movl PT_ESI(%esp), %esi ; \ + sysexit ; \ +14: movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */ ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + mov $__KERNEL_PERCPU, %ecx ; \ + push %esp ; \ + mov %ecx, %fs ; \ + SET_KERNEL_GS %ecx ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#endif + + +#endif /* __ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_irqs_disabled_flags(flags); \ +}) + +#else + +#ifdef CONFIG_X86_64 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + ENABLE_INTERRUPTS(CLBR_NONE); \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + __DISABLE_INTERRUPTS; \ + TRACE_IRQS_OFF; + +#else +#define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT_IRQ +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif + +#endif /* __ASSEMBLY__ */ +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/synch_bitops.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/synch_bitops.h @@ -0,0 +1,126 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. + */ + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include +#endif + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#ifdef CONFIG_X86_64 + case 4: + __asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#else + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#endif + } + return old; +} + +#define synch_test_bit test_bit + +#define synch_cmpxchg_subword synch_cmpxchg + +#endif /* __XEN_SYNCH_BITOPS_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_64.h @@ -0,0 +1,197 @@ +#ifndef _ASM_X86_PGTABLE_64_H +#define _ASM_X86_PGTABLE_64_H + +#include +#include + +#ifndef __ASSEMBLY__ + +/* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. + */ +#include +#include +#include +#include + +#ifdef CONFIG_XEN +extern pud_t level3_user_pgt[512]; + +extern void xen_init_pt(void); +extern void xen_switch_pt(void); +#endif + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pmd_t level2_fixmap_pgt[512]; +extern pmd_t level2_ident_pgt[512]; +extern pgd_t init_level4_pgt[]; + +#define swapper_pg_dir init_level4_pgt + +extern void paging_init(void); + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pud_val(e), \ + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +struct mm_struct; + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); + + +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0)) + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + xen_set_pte(ptep, pte); +} + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret) +{ + return __pte_ma(xchg(&xp->pte, 0)); +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, xen_make_pmd(0)) \ + : (void)(*__pmdp = xen_make_pmd(0)); \ +}) + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +#define xen_pud_clear(pud) \ +({ \ + pud_t *__pudp = (pud); \ + PagePinned(virt_to_page(__pudp)) \ + ? set_pud(__pudp, xen_make_pud(0)) \ + : (void)(*__pudp = xen_make_pud(0)); \ +}) + +static inline pgd_t *__user_pgd(pgd_t *pgd) +{ + if (unlikely(((unsigned long)pgd & PAGE_MASK) + == (unsigned long)init_level4_pgt)) + return NULL; + return (pgd_t *)(virt_to_page(pgd)->index + + ((unsigned long)pgd & ~PAGE_MASK)); +} + +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + xen_l4_entry_update(pgdp, pgd); +} + +#define xen_pgd_clear(pgd) \ +({ \ + pgd_t *__pgdp = (pgd); \ + PagePinned(virt_to_page(__pgdp)) \ + ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \ + : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \ +}) + +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) + +extern unsigned long early_arbitrary_virt_to_mfn(void *va); + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +/* + * Level 4 access. + */ +static inline int pgd_large(pgd_t pgd) { return 0; } +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + +/* PUD - Level3 access */ + +/* PMD - Level 2 access */ +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT + +/* PTE - Level 1 access. */ + +/* x86-64 always has all page tables mapped. */ +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) +#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define update_mmu_cache(vma, address, pte) do { } while (0) + +/* Encode and de-code a swap entry */ +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + +extern int kern_addr_valid(unsigned long addr); +extern void cleanup_highmap(void); + +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +#define pgtable_cache_init() do { } while (0) +#define check_pgt_cache() do { } while (0) + +#define PAGE_AGP PAGE_KERNEL_NOCACHE +#define HAVE_PAGE_AGP 1 + +/* fs/proc/kcore.c */ +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) +#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) + +#define __HAVE_ARCH_PTE_SAME +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_64_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable_64_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable_64_types.h @@ -0,0 +1,66 @@ +#ifndef _ASM_X86_PGTABLE_64_DEFS_H +#define _ASM_X86_PGTABLE_64_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +/* + * These are used to make use of C type-checking.. + */ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef union { pteval_t pte; unsigned int pte_low; } pte_t; + +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 4 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) + +#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/dma-mapping.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/dma-mapping.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_DMA_MAPPING_H_ +#define _ASM_X86_DMA_MAPPING_H_ + +#define phys_to_dma _phys_to_dma_ +#define dma_to_phys _dma_to_phys_ + +#include_next + +#undef phys_to_dma +#undef dma_to_phys + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return machine_to_phys(daddr); +} + +void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); + +extern int range_straddles_page_boundary(paddr_t p, size_t size); + +#endif /* _ASM_X86_DMA_MAPPING_H_ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable-3level_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable-3level_types.h @@ -0,0 +1,44 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +typedef u64 pteval_t; +typedef u64 pmdval_t; +typedef u64 pudval_t; +typedef u64 pgdval_t; +typedef u64 pgprotval_t; + +typedef union { + struct { + unsigned long pte_low, pte_high; + }; + pteval_t pte; +} pte_t; +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 + +#define PAGETABLE_LEVELS 3 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + + +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/vga.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/vga.h @@ -0,0 +1,20 @@ +/* + * Access to VGA videoram + * + * (c) 1998 Martin Mares + */ + +#ifndef _ASM_X86_VGA_H +#define _ASM_X86_VGA_H + +/* + * On the PC, we can just recalculate addresses and then + * access the videoram directly without any black magic. + */ + +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x) + +#define vga_readb(x) (*(x)) +#define vga_writeb(x, y) (*(y) = (x)) + +#endif /* _ASM_X86_VGA_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/irq_vectors.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/irq_vectors.h @@ -0,0 +1,93 @@ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#define MCE_VECTOR 0x12 + +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 +#else +# define IA32_SYSCALL_VECTOR 0x80 +#endif + +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define CALL_FUNC_SINGLE_VECTOR 2 +#define REBOOT_VECTOR 3 +#define NR_IPIS 4 + +/* + * The maximum number of vectors supported by i386 processors + * is limited to 256. For processors other than i386, NR_VECTORS + * should be changed accordingly. + */ +#define NR_VECTORS 256 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +#ifndef __ASSEMBLY__ +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} +#endif + +/* + * Size the maximum number of interrupts. + * + * If the irq_desc[] array has a sparse layout, we can size things + * generously - it scales up linearly with the maximum number of CPUs, + * and the maximum number of IO-APICs, whichever is higher. + * + * In other cases we size more conservatively, to not create too large + * static arrays. + */ + +#define NR_IRQS_LEGACY 16 + +/* + * The flat IRQ space is divided into two regions: + * 1. A one-to-one mapping of real physical IRQs. This space is only used + * if we have physical device-access privilege. This region is at the + * start of the IRQ space so that existing device drivers do not need + * to be modified to translate physical IRQ numbers into our IRQ space. + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These + * are bound using the provided bind/unbind functions. + */ +#define PIRQ_BASE 0 + +#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) +#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) + +#ifdef CONFIG_X86_IO_APIC +# if !defined(NR_CPUS) || !defined(MAX_IO_APICS) +/* nothing */ +# elif defined(CONFIG_SPARSE_IRQ) +# define NR_PIRQS \ + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) +# elif NR_CPUS < MAX_IO_APICS +# define NR_PIRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) +# else +# define NR_PIRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +# endif +#else /* !CONFIG_X86_IO_APIC: */ +# define NR_PIRQS NR_IRQS_LEGACY +#endif + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ) +extern int nr_pirqs; +#else +# define nr_pirqs NR_PIRQS +#endif +#endif + +#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs) +#define NR_DYNIRQS (64 + CONFIG_XEN_NR_GUEST_DEVICES) + +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) + +#endif /* _ASM_X86_IRQ_VECTORS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/maddr.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/maddr.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "maddr_32.h" +#else +# include "maddr_64.h" +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/gnttab_dma.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/gnttab_dma.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Herbert Xu + * Copyright (c) 2007 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_I386_GNTTAB_DMA_H +#define _ASM_I386_GNTTAB_DMA_H + +static inline int gnttab_dma_local_pfn(struct page *page) +{ + /* Has it become a local MFN? */ + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page)))); +} + +static inline maddr_t gnttab_dma_map_page(struct page *page) +{ + __gnttab_dma_map_page(page); + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT); +} + +static inline void gnttab_dma_unmap_page(maddr_t maddr) +{ + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr))); +} + +#endif /* _ASM_I386_GNTTAB_DMA_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/spinlock.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/spinlock.h @@ -0,0 +1,458 @@ +#ifndef _ASM_X86_SPINLOCK_H +#define _ASM_X86_SPINLOCK_H + +#include +#include +#include +#include +#include + +/* + * Your basic SMP spinlocks, allowing only a single CPU anywhere + * + * Simple spin lock operations. There are two variants, one clears IRQ's + * on the local processor, one does not. + * + * These are fair FIFO ticket locks, which are currently limited to 256 + * CPUs. + * + * (the type definitions are in asm/spinlock_types.h) + */ + +#ifdef CONFIG_X86_32 +# define LOCK_PTR_REG "a" +# define REG_PTR_MODE "k" +#else +# define LOCK_PTR_REG "D" +# define REG_PTR_MODE "q" +#endif + +#if defined(CONFIG_X86_32) && \ + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)) +/* + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock + * (PPro errata 66, 92) + */ +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX +#else +# define UNLOCK_LOCK_PREFIX +#endif + +#ifdef TICKET_SHIFT + +#include +#include +#include + +DECLARE_PER_CPU(struct vcpu_runstate_info, runstate); + +int xen_spinlock_init(unsigned int cpu); +void xen_spinlock_cleanup(unsigned int cpu); +bool xen_spin_wait(raw_spinlock_t *, unsigned int *token, + unsigned int flags); +unsigned int xen_spin_adjust(const raw_spinlock_t *, unsigned int token); +void xen_spin_kick(raw_spinlock_t *, unsigned int token); + +/* + * Ticket locks are conceptually two parts, one indicating the current head of + * the queue, and the other indicating the current tail. The lock is acquired + * by atomically noting the tail and incrementing it by one (thus adding + * ourself to the queue and noting our position), then waiting until the head + * becomes equal to the the initial value of the tail. + * + * We use an xadd covering *both* parts of the lock, to increment the tail and + * also load the position of the head, which takes care of memory ordering + * issues and should be optimal for the uncontended case. Note the tail must be + * in the high part, because a wide xadd increment of the low part would carry + * up and contaminate the high part. + * + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to + * save some instructions and make the code more elegant. There really isn't + * much between them in performance though, especially as locks are out of line. + */ +#if TICKET_SHIFT == 8 +#define __ticket_spin_lock_preamble \ + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \ + "cmpb %h0, %b0\n\t" \ + "sete %1" \ + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \ + : "0" (0x0100) \ + : "memory", "cc") +#define __ticket_spin_lock_body \ + asm("1:\t" \ + "cmpb %h0, %b0\n\t" \ + "je 2f\n\t" \ + "decl %1\n\t" \ + "jz 2f\n\t" \ + "rep ; nop\n\t" \ + "movb %2, %b0\n\t" \ + /* don't need lfence here, because loads are in-order */ \ + "jmp 1b\n" \ + "2:" \ + : "+Q" (token), "+g" (count) \ + : "m" (lock->slock) \ + : "memory", "cc") +#define __ticket_spin_unlock_body \ + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \ + "movzwl %2, %0\n\t" \ + "cmpb %h0, %b0\n\t" \ + "setne %1" \ + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) \ + : \ + : "memory", "cc") + +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +{ + int tmp, new; + + asm("movzwl %2, %0\n\t" + "cmpb %h0, %b0\n\t" + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgw %w1, %2\n\t" + "1:\t" + "sete %b1\n\t" + "movzbl %b1, %0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + if (tmp) + lock->owner = raw_smp_processor_id(); + + return tmp; +} +#elif TICKET_SHIFT == 16 +#define __ticket_spin_lock_preamble \ + do { \ + unsigned int tmp; \ + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \ + "shldl $16, %0, %3\n\t" \ + "cmpw %w3, %w0\n\t" \ + "sete %1" \ + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \ + "=&g" (tmp) \ + : "0" (0x00010000) \ + : "memory", "cc"); \ + } while (0) +#define __ticket_spin_lock_body \ + do { \ + unsigned int tmp; \ + asm("shldl $16, %0, %2\n" \ + "1:\t" \ + "cmpw %w2, %w0\n\t" \ + "je 2f\n\t" \ + "decl %1\n\t" \ + "jz 2f\n\t" \ + "rep ; nop\n\t" \ + "movw %3, %w0\n\t" \ + /* don't need lfence here, because loads are in-order */ \ + "jmp 1b\n" \ + "2:" \ + : "+r" (token), "+g" (count), "=&g" (tmp) \ + : "m" (lock->slock) \ + : "memory", "cc"); \ + } while (0) +#define __ticket_spin_unlock_body \ + do { \ + unsigned int tmp; \ + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \ + "movl %2, %0\n\t" \ + "shldl $16, %0, %3\n\t" \ + "cmpw %w3, %w0\n\t" \ + "setne %1" \ + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), \ + "=&r" (tmp) \ + : \ + : "memory", "cc"); \ + } while (0) + +static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) +{ + int tmp; + int new; + + asm("movl %2, %0\n\t" + "movl %0, %1\n\t" + "roll $16, %0\n\t" + "cmpl %0, %1\n\t" + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgl %1, %2\n" + "1:\t" + "sete %b1\n\t" + "movzbl %b1, %0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + if (tmp) + lock->owner = raw_smp_processor_id(); + + return tmp; +} +#endif + +#define __ticket_spin_count(lock) \ + (per_cpu(runstate.state, (lock)->owner) == RUNSTATE_running \ + ? 1 << 10 : 2) + +static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); +} + +static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) +{ + int tmp = ACCESS_ONCE(lock->slock); + + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; +} + +static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) +{ + unsigned int token, count; + unsigned int flags = __raw_local_irq_save(); + bool free; + + __ticket_spin_lock_preamble; + if (likely(free)) + raw_local_irq_restore(flags); + else { + token = xen_spin_adjust(lock, token); + raw_local_irq_restore(flags); + do { + count = __ticket_spin_count(lock); + __ticket_spin_lock_body; + } while (unlikely(!count) + && !xen_spin_wait(lock, &token, flags)); + } + lock->owner = raw_smp_processor_id(); +} + +static __always_inline void __ticket_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + unsigned int token, count; + bool free; + + __ticket_spin_lock_preamble; + if (unlikely(!free)) { + token = xen_spin_adjust(lock, token); + do { + count = __ticket_spin_count(lock); + __ticket_spin_lock_body; + } while (unlikely(!count) + && !xen_spin_wait(lock, &token, flags)); + } + lock->owner = raw_smp_processor_id(); +} + +static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) +{ + unsigned int token; + bool kick; + + __ticket_spin_unlock_body; + if (kick) + xen_spin_kick(lock, token); +} + +#ifndef XEN_SPINLOCK_SOURCE +#undef __ticket_spin_lock_preamble +#undef __ticket_spin_lock_body +#undef __ticket_spin_unlock_body +#undef __ticket_spin_count +#endif + +#define __raw_spin(n) __ticket_spin_##n + +#else /* TICKET_SHIFT */ + +static inline int xen_spinlock_init(unsigned int cpu) { return 0; } +static inline void xen_spinlock_cleanup(unsigned int cpu) {} + +static inline int __byte_spin_is_locked(raw_spinlock_t *lock) +{ + return lock->lock != 0; +} + +static inline int __byte_spin_is_contended(raw_spinlock_t *lock) +{ + return lock->spinners != 0; +} + +static inline void __byte_spin_lock(raw_spinlock_t *lock) +{ + s8 val = 1; + + asm("1: xchgb %1, %0\n" + " test %1,%1\n" + " jz 3f\n" + " " LOCK_PREFIX "incb %2\n" + "2: rep;nop\n" + " cmpb $1, %0\n" + " je 2b\n" + " " LOCK_PREFIX "decb %2\n" + " jmp 1b\n" + "3:" + : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory"); +} + +#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock) + +static inline int __byte_spin_trylock(raw_spinlock_t *lock) +{ + u8 old = 1; + + asm("xchgb %1,%0" + : "+m" (lock->lock), "+q" (old) : : "memory"); + + return old == 0; +} + +static inline void __byte_spin_unlock(raw_spinlock_t *lock) +{ + smp_wmb(); + lock->lock = 0; +} + +#define __raw_spin(n) __byte_spin_##n + +#endif /* TICKET_SHIFT */ + +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + return __raw_spin(is_locked)(lock); +} + +static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +{ + return __raw_spin(is_contended)(lock); +} +#define __raw_spin_is_contended __raw_spin_is_contended + +static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) +{ + __raw_spin(lock)(lock); +} + +static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) +{ + return __raw_spin(trylock)(lock); +} + +static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) +{ + __raw_spin(unlock)(lock); +} + +static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + __raw_spin(lock_flags)(lock, flags); +} + +#undef __raw_spin + +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +{ + while (__raw_spin_is_locked(lock)) + cpu_relax(); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + * + * On x86, we implement read-write locks as a 32-bit counter + * with the high bit (sign) being the "contended" bit. + */ + +/** + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question. + */ +static inline int __raw_read_can_lock(raw_rwlock_t *lock) +{ + return (int)(lock)->lock > 0; +} + +/** + * write_can_lock - would write_trylock() succeed? + * @lock: the rwlock in question. + */ +static inline int __raw_write_can_lock(raw_rwlock_t *lock) +{ + return (lock)->lock == RW_LOCK_BIAS; +} + +static inline void __raw_read_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw) : "memory"); +} + +static inline void __raw_write_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); +} + +static inline int __raw_read_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + if (atomic_dec_return(count) >= 0) + return 1; + atomic_inc(count); + return 0; +} + +static inline int __raw_write_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + if (atomic_sub_and_test(RW_LOCK_BIAS, count)) + return 1; + atomic_add(RW_LOCK_BIAS, count); + return 0; +} + +static inline void __raw_read_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); +} + +static inline void __raw_write_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "addl %1, %0" + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); +} + +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + +#define _raw_spin_relax(lock) cpu_relax() +#define _raw_read_relax(lock) cpu_relax() +#define _raw_write_relax(lock) cpu_relax() + +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + +#endif /* _ASM_X86_SPINLOCK_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/desc.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/desc.h @@ -0,0 +1,421 @@ +#ifndef _ASM_X86_DESC_H +#define _ASM_X86_DESC_H + +#include +#include +#include +#include + +static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +{ + desc->limit0 = info->limit & 0x0ffff; + desc->base0 = info->base_addr & 0x0000ffff; + + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; + desc->limit = (info->limit & 0xf0000) >> 16; + desc->avl = info->useable; + desc->d = info->seg_32bit; + desc->g = info->limit_in_pages; + desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. It is useless anyway + * because 64bit system calls require __USER_CS: + */ + desc->l = 0; +} + +#ifndef CONFIG_X86_NO_IDT +extern struct desc_ptr idt_descr; +extern gate_desc idt_table[]; +#endif + +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + +#ifdef CONFIG_X86_64 + +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate->offset_low = PTR_LOW(func); + gate->segment = __KERNEL_CS; + gate->ist = ist; + gate->p = 1; + gate->dpl = dpl; + gate->zero0 = 0; + gate->zero1 = 0; + gate->type = type; + gate->offset_middle = PTR_MIDDLE(func); + gate->offset_high = PTR_HIGH(func); +} + +#else +static inline void pack_gate(gate_desc *gate, unsigned char type, + unsigned long base, unsigned dpl, unsigned flags, + unsigned short seg) +{ + gate->a = (seg << 16) | (base & 0xffff); + gate->b = (base & 0xffff0000) | + (((0x80 | type | (dpl << 5)) & 0xff) << 8); +} + +#endif + +static inline int desc_empty(const void *ptr) +{ + const u32 *desc = ptr; + return !(desc[0] | desc[1]); +} + +#ifndef CONFIG_XEN +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) + +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) + +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt + +#define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) \ + native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) \ + native_write_idt_entry(dt, entry, g) + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) + +static inline void native_write_idt_entry(gate_desc *idt, int entry, + const gate_desc *gate) +{ + memcpy(&idt[entry], gate, sizeof(*gate)); +} + +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc) +{ + memcpy(&ldt[entry], desc, 8); +} + +static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type) +{ + unsigned int size; + switch (type) { + case DESC_TSS: + size = sizeof(tss_desc); + break; + case DESC_LDT: + size = sizeof(ldt_desc); + break; + default: + size = sizeof(struct desc_struct); + break; + } + memcpy(&gdt[entry], desc, size); +} +#endif + +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + unsigned long limit, unsigned char type, + unsigned char flags) +{ + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); + desc->p = 1; +} + + +#ifndef CONFIG_XEN +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) +{ +#ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; + memset(desc, 0, sizeof(*desc)); + desc->limit0 = size & 0xFFFF; + desc->base0 = PTR_LOW(addr); + desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; + desc->base3 = PTR_HIGH(addr); +#else + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); +#endif +} + +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + tss_desc tss; + + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + + sizeof(unsigned long) - 1); + write_gdt_entry(d, entry, &tss, DESC_TSS); +} + +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + +static inline void native_set_ldt(const void *addr, unsigned int entries) +{ + if (likely(entries == 0)) + asm volatile("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + ldt_desc ldt; + + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, + entries * LDT_ENTRY_SIZE - 1); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + &ldt, DESC_LDT); + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + asm volatile("str %0":"=r" (tr)); + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} +#else +#include + +#define load_TLS(t, cpu) xen_load_tls(t, cpu) +#define set_ldt xen_set_ldt + +extern int write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc); +extern int write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type); + +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor( + arbitrary_virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) + BUG(); +} +#endif + +#define _LDT_empty(info) \ + ((info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +static inline void clear_LDT(void) +{ + set_ldt(NULL, 0); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc) +{ + set_ldt(pc->ldt, pc->size); +} + +static inline void load_LDT(mm_context_t *pc) +{ + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); +} + +static inline unsigned long get_desc_base(const struct desc_struct *desc) +{ + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); +} + +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + +#ifndef CONFIG_X86_NO_IDT +static inline void _set_gate(int gate, unsigned type, void *addr, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate_desc s; + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); + /* + * does not need to be atomic because it is only done once at + * setup time + */ + write_idt_entry(idt_table, gate, &s); +} + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +static inline void set_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); +} + +extern int first_system_vector; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); + if (first_system_vector > vector) + first_system_vector = vector; + } else + BUG(); +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + +/* + * This routine sets up an interrupt gate at directory privilege level 3. + */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_system_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); +} + +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); +} + +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); +} + +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); +} +#endif + +#endif /* _ASM_X86_DESC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/system_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/system_64.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_SYSTEM_64_H +#define _ASM_X86_SYSTEM_64_H + +#include +#include + + +static inline unsigned long read_cr8(void) +{ + return 0; +} + +static inline void write_cr8(unsigned long val) +{ + BUG_ON(val); +} + +#include + +#endif /* _ASM_X86_SYSTEM_64_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/spinlock_types.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/spinlock_types.h @@ -0,0 +1,60 @@ +#ifndef _ASM_X86_SPINLOCK_TYPES_H +#define _ASM_X86_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +# error "please don't include this file directly" +#endif + +#include + +typedef union { + unsigned int slock; + struct { +/* + * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll(). + */ +#if CONFIG_XEN_COMPAT >= 0x030002 +/* + * On Xen we support a single level of interrupt re-enabling per lock. Hence + * we can have twice as many outstanding tickets. Thus the cut-off for using + * byte register pairs must be at half the number of CPUs. + */ +#if 2 * CONFIG_NR_CPUS < 256 +# define TICKET_SHIFT 8 + u8 cur, seq; +#else +# define TICKET_SHIFT 16 + u16 cur, seq; +#endif +#if CONFIG_NR_CPUS <= 256 + u8 owner; +#else + u16 owner; +#endif +#else +/* + * This differs from the pre-2.6.24 spinlock by always using xchgb + * rather than decb to take the lock; this allows it to use a + * zero-initialized lock structure. It also maintains a 1-byte + * contention counter, so that we can implement + * __byte_spin_is_contended. + */ + u8 lock; +#if CONFIG_NR_CPUS < 256 + u8 spinners; +#else +# error NR_CPUS >= 256 not implemented +#endif +#endif + }; +} raw_spinlock_t; + +#define __RAW_SPIN_LOCK_UNLOCKED { 0 } + +typedef struct { + unsigned int lock; +} raw_rwlock_t; + +#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } + +#endif /* _ASM_X86_SPINLOCK_TYPES_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/processor.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/processor.h @@ -0,0 +1,1021 @@ +#ifndef _ASM_X86_PROCESSOR_H +#define _ASM_X86_PROCESSOR_H + +#include + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +static inline void *current_text_addr(void) +{ + void *pc; + + asm volatile("mov $1f, %0; 1:":"=r" (pc)); + + return pc; +} + +#ifdef CONFIG_X86_VSMP +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + * Members of this structure are referenced in head.S, so think twice + * before touching them. [mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ + char hlt_works_ok; + char hard_math; + char rfu; + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; +#else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; +#endif + __u8 x86_virt_bits; + __u8 x86_phys_bits; + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) + /* cpus sharing the last level cache: */ + cpumask_var_t llc_shared_map; +#endif + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; + u16 x86_clflush_size; +#ifdef CONFIG_SMP + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Core id: */ + u16 cpu_core_id; + /* Index into per_cpu list: */ + u16 cpu_index; +#endif + unsigned int x86_hyper_vendor; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 + +#define X86_VENDOR_UNKNOWN 0xff + +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 +#define X86_HYPER_VENDOR_XEN 'X' + +/* + * capabilities of CPUs + */ +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; + +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; + +#ifdef CONFIG_SMP +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#define current_cpu_data __get_cpu_var(cpu_info) +#else +#define cpu_data(cpu) boot_cpu_data +#define current_cpu_data boot_cpu_data +#endif + +extern const struct seq_operations cpuinfo_op; + +static inline int hlt_works(int cpu) +{ +#ifdef CONFIG_X86_32 + return cpu_data(cpu).hlt_works_ok; +#else + return 1; +#endif +} + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern void cpu_detect(struct cpuinfo_x86 *c); + +extern struct pt_regs *idle_regs(struct pt_regs *); + +extern void early_cpu_init(void); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +extern void detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); + +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline void load_cr3(pgd_t *pgdir) +{ + write_cr3(__pa(pgdir)); +} + +#ifndef CONFIG_X86_NO_TSS +#ifdef CONFIG_X86_32 +/* This is the TSS defined by the hardware. */ +struct x86_hw_tss { + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + /* ss1 caches MSR_IA32_SYSENTER_CS: */ + unsigned short ss1, __ss1h; + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long bx; + unsigned long sp; + unsigned long bp; + unsigned long si; + unsigned long di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace; + unsigned short io_bitmap_base; + +} __attribute__((packed)); +extern struct tss_struct doublefault_tss; +#else +struct x86_hw_tss { + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + +} __attribute__((packed)) ____cacheline_aligned; +#endif +#endif /* CONFIG_X86_NO_TSS */ + +/* + * IO-bitmap sizes: + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + /* + * The hardware state: + */ + struct x86_hw_tss x86_tss; + + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + + /* + * .. and then another 0x100 bytes for the emergency kernel stack: + */ + unsigned long stack[64]; + +} ____cacheline_aligned; + +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); + +/* + * Save the original ist values for checking stack pointers during debugging + */ +struct orig_ist { + unsigned long ist[7]; +}; +#endif /* CONFIG_X86_NO_TSS */ + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; +}; + +struct i387_fxsave_struct { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +struct ymmh_struct { + /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ + u32 ymmh_space[64]; +}; + +struct xsave_hdr_struct { + u64 xstate_bv; + u64 reserved1[2]; + u64 reserved2[5]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + +union thread_xstate { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; + struct xsave_struct xsave; +}; + +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_TSS +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +union irq_stack_union { + char irq_stack[IRQ_STACK_SIZE]; + /* + * GCC hardcodes the stack canary as %gs:40. Since the + * irq_stack is the object at %gs:0, we reserve the bottom + * 48 bytes of the irq stack for the canary. + */ + struct { + char gs_base[40]; + unsigned long stack_canary; + }; +}; + +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_INIT_PER_CPU(irq_stack_union); + +DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(unsigned int, irq_count); +extern unsigned long kernel_eflags; +extern asmlinkage void ignore_sysret(void); +#else /* X86_64 */ +#ifdef CONFIG_CC_STACKPROTECTOR +/* + * Make sure stack canary segment base is cached-aligned: + * "For Intel Atom processors, avoid non zero segment base address + * that is not aligned to cache line boundary at all cost." + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) + */ +struct stack_canary { + char __pad[20]; /* canary at %gs:20 */ + unsigned long canary; +}; +DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif +#endif /* X86_64 */ + +extern unsigned int xstate_size; +extern void free_thread_xstate(struct task_struct *); +extern struct kmem_cache *task_xstate_cachep; + +struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; +#ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +#else + unsigned long usersp; /* Copy from PDA */ + unsigned short es; + unsigned short ds; + unsigned short fsindex; + unsigned short gsindex; +#endif +#ifdef CONFIG_X86_32 + unsigned long ip; +#endif +#ifdef CONFIG_X86_64 + unsigned long fs; +#endif + unsigned long gs; + /* Hardware debugging registers: */ + unsigned long debugreg0; + unsigned long debugreg1; + unsigned long debugreg2; + unsigned long debugreg3; + unsigned long debugreg6; + unsigned long debugreg7; + /* Fault info: */ + unsigned long cr2; + unsigned long trap_no; + unsigned long error_code; + /* floating point and extended processor state */ + union thread_xstate *xstate; +#ifdef CONFIG_X86_32 + /* Virtual 86 mode info */ + struct vm86_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, saved_sp0; + unsigned int saved_fs, saved_gs; +#endif + /* IO permissions: */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; + /* Max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ + unsigned long debugctlmsr; + /* Debug Store context; see asm/ds.h */ + struct ds_context *ds_ctx; +}; + +static inline unsigned long xen_get_debugreg(int regno) +{ + return HYPERVISOR_get_debugreg(regno); +} + +static inline void xen_set_debugreg(int regno, unsigned long value) +{ + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); +} + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); +} + +#ifndef CONFIG_X86_NO_TSS +static inline void +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->x86_tss.sp0 = thread->sp0; +#ifdef CONFIG_X86_32 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +#endif +} +#else +#define xen_load_sp0(tss, thread) do { \ + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \ + BUG(); \ +} while (0) +#endif + +#define __cpuid xen_cpuid +#define paravirt_enabled() 0 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = xen_get_debugreg(register) +#define set_debugreg(value, register) \ + xen_set_debugreg(register, value) + +#define load_sp0 xen_load_sp0 + +#define set_iopl_mask xen_set_iopl_mask + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. + */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4(unsigned long mask) +{ + unsigned cr4; + + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4(unsigned long mask) +{ + unsigned cr4; + + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +typedef struct { + unsigned long seg; +} mm_segment_t; + + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy state */ +extern void prepare_to_copy(struct task_struct *tsk); + +unsigned long get_wchan(struct task_struct *p); + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static inline void cpu_relax(void) +{ + rep_nop(); +} + +/* Stop speculative execution and prefetching of modified code. */ +static inline void sync_core(void) +{ + int tmp; + +#if defined(CONFIG_M386) || defined(CONFIG_M486) + if (boot_cpu_data.x86 < 5) + /* There is no speculative execution. + * jmp is a barrier to prefetching. */ + asm volatile("jmp 1f\n1:\n" ::: "memory"); + else +#endif + /* cpuid is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. */ + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + +extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void init_c1e_mask(void); + +extern unsigned long boot_option_idle_override; +extern unsigned long idle_halt; +extern unsigned long idle_nomwait; + +#ifndef CONFIG_XEN +/* + * on systems with caches, caches must be flashed as the absolute + * last instruction before going into a suspended halt. Otherwise, + * dirty data can linger in the cache and become stale on resume, + * leading to strange errors. + * + * perform a variety of operations to guarantee that the compiler + * will not reorder instructions. wbinvd itself is serializing + * so the processor will not reorder. + * + * Systems without cache can just go into halt. + */ +static inline void wbinvd_halt(void) +{ + mb(); + /* check for clflush to determine if wbinvd is legal */ + if (cpu_has_clflush) + asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); + else + while (1) + halt(); +} +#endif + +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +/* Defined in head.S */ +extern struct desc_ptr early_gdt_descr; + +extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); +extern void cpu_init(void); + +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + +static inline void update_debugctlmsr_on_cpu(int cpu, + unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + +/* + * from system description table in BIOS. Mostly for MCA use, but + * others may find it useful: + */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; + +/* Boot loader type from the setup header: */ +extern int bootloader_type; +extern int bootloader_version; + +extern char ignore_fpu_irq; + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +#ifdef CONFIG_X86_32 +# define BASE_PREFETCH ASM_NOP4 +# define ARCH_HAS_PREFETCH +#else +# define BASE_PREFETCH "prefetcht0 (%1)" +#endif + +/* + * Prefetch instructions for Pentium III (+) and AMD Athlon (+) + * + * It's not worth to care about 3dnow prefetches for the K6 + * because they are microcoded there and very slow. + */ +static inline void prefetch(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +/* + * 3dnow prefetch to get an exclusive cache line. + * Useful for spinlocks to avoid one state transition in the + * cache coherency protocol: + */ +static inline void prefetchw(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +static inline void spin_lock_prefetch(const void *x) +{ + prefetchw(x); +} + +#ifdef CONFIG_X86_32 +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ +} + +/* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .x86_tss = { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +extern unsigned long thread_saved_pc(struct task_struct *tsk); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessable even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the ss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + +#else +/* + * User space process size. 47bits minus one guard page. + */ +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE_MAX + +#define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define INIT_TSS { \ + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) + +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ +#endif /* CONFIG_X86_64 */ + +extern void start_thread(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp); + +/* + * This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define KSTK_EIP(task) (task_pt_regs(task)->ip) + +/* Get/set a process' ability to use the timestamp counter instruction */ +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + +extern int get_tsc_mode(unsigned long adr); +extern int set_tsc_mode(unsigned int val); + +extern int amd_get_nb_id(int cpu); + +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_400[]; +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + +#endif /* _ASM_X86_PROCESSOR_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/ipi.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/ipi.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_IPI_H +#define _ASM_X86_IPI_H + +#include +#include + +void xen_send_IPI_mask(const struct cpumask *, int vector); +void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector); +void xen_send_IPI_allbutself(int vector); +void xen_send_IPI_all(int vector); +void xen_send_IPI_self(int vector); + +#endif /* _ASM_X86_IPI_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable.h @@ -0,0 +1,752 @@ +#ifndef _ASM_X86_PGTABLE_H +#define _ASM_X86_PGTABLE_H + +#include +#include + +#include + +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + +#ifndef __ASSEMBLY__ + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. + */ +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +#ifdef CONFIG_PARAVIRT +#include +#else /* !CONFIG_PARAVIRT */ +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) + +#define set_pte_atomic(ptep, pte) \ + xen_set_pte_atomic(ptep, pte) + +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) xen_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) xen_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) + +#define pgd_val(x) xen_pgd_val(x) +#define __pgd(x) xen_make_pgd(x) + +#ifndef __PAGETABLE_PUD_FOLDED +#define pud_val(x) xen_pud_val(x) +#define __pud(x) xen_make_pud(x) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pmd_val(x) xen_pmd_val(x) +#define __pmd(x) xen_make_pmd(x) +#endif + +#define pte_val(x) xen_pte_val(x) +#define __pte(x) xen_make_pte(x) + +#define arch_end_context_switch(prev) do {} while(0) + +#endif /* CONFIG_PARAVIRT */ + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline int pte_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_DIRTY; +} + +static inline int pte_young(pte_t pte) +{ + return pte_flags(pte) & _PAGE_ACCESSED; +} + +static inline int pte_write(pte_t pte) +{ + return pte_flags(pte) & _PAGE_RW; +} + +static inline int pte_file(pte_t pte) +{ + return pte_flags(pte) & _PAGE_FILE; +} + +static inline int pte_huge(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PSE; +} + +static inline int pte_global(pte_t pte) +{ + return 0; +} + +static inline int pte_exec(pte_t pte) +{ + return !(pte_flags(pte) & _PAGE_NX); +} + +static inline int pte_special(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SPECIAL; +} + +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \ + (_pte).pte_low & _PAGE_PRESENT ? \ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline unsigned long pud_pfn(pud_t pud) +{ + return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline int pmd_large(pmd_t pte) +{ + return pmd_flags(pte) & _PAGE_PSE; +} + +static inline pte_t pte_set_flags(pte_t pte, pteval_t set) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v | set); +} + +static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v & ~clear); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_NX); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_clrhuge(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_mkglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_clrglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SPECIAL); +} + +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. + */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) +{ + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; + + return protval; +} + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte_ma(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK; + + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; + + return __pte(val); +} + +/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +#define pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; + pgprotval_t addbits = pgprot_val(newprot); + return __pgprot(preservebits | addbits); +} + +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) + +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) + +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) +{ + /* + * PAT type is always WB for ISA. So no need to check. + */ + if (is_ISA_range(paddr, paddr + size - 1)) + return 1; + + /* + * Certain new memtypes are not allowed with certain + * requested memtype: + * - request is uncached, return cannot be write-back + * - request is write-combine, return cannot be write-back + */ + if ((flags == _PAGE_CACHE_UC_MINUS && + new_flags == _PAGE_CACHE_WB) || + (flags == _PAGE_CACHE_WC && + new_flags == _PAGE_CACHE_WB)) { + return 0; + } + + return 1; +} + +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" +#else +# include "pgtable_64.h" +#endif + +#ifndef __ASSEMBLY__ +#include + +static inline int pte_none(pte_t pte) +{ + return !pte.pte; +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte == b.pte; +} + +static inline int pte_present(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + +static inline int pmd_present(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ + return __pmd_val(pmd) != 0; +#else + /* + * Checking for _PAGE_PSE is needed too because + * split_huge_page will temporarily clear the present bit (but + * the _PAGE_PSE flag will remain set at all times while the + * _PAGE_PRESENT bit is clear). + */ + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); +#endif +} + +static inline int pmd_none(pmd_t pmd) +{ + /* Only check low word on 32-bit platforms, since it might be + out of sync with upper half. */ + return (unsigned long)__pmd_val(pmd) == 0; +} + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) + +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +static inline unsigned long pmd_index(unsigned long address) +{ + return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); +} + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + * + * (Currently stuck as a macro because of indirect forward reference + * to linux/mm.h:page_to_nid()) + */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this function returns the index of the entry in the pte page which would + * control the given virtual address + */ +static inline unsigned long pte_index(unsigned long address) +{ + return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); +} + +static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) +{ + return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); +} + +static inline int pmd_bad(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 + return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT) + != (_KERNPG_TABLE & ~_PAGE_PRESENT); +#else + return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; +#endif +} + +static inline unsigned long pages_to_mb(unsigned long npg) +{ + return npg >> (20 - PAGE_SHIFT); +} + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) + +#if PAGETABLE_LEVELS > 2 +static inline int pud_none(pud_t pud) +{ + return __pud_val(pud) == 0; +} + +static inline int pud_present(pud_t pud) +{ + return pud_flags(pud) & _PAGE_PRESENT; +} + +static inline unsigned long pud_page_vaddr(pud_t pud) +{ + return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) + +/* Find an entry in the second-level page table.. */ +static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +{ + return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); +} + +static inline int pud_large(pud_t pud) +{ + return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#else +static inline int pud_large(pud_t pud) +{ + return 0; +} +#endif /* PAGETABLE_LEVELS > 2 */ + +#if PAGETABLE_LEVELS > 3 +static inline int pgd_present(pgd_t pgd) +{ + return pgd_flags(pgd) & _PAGE_PRESENT; +} + +static inline unsigned long pgd_page_vaddr(pgd_t pgd) +{ + return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +/* to find an entry in a page-table-directory. */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +{ + return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); +} + +static inline int pgd_bad(pgd_t pgd) +{ + return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; +} + +static inline int pgd_none(pgd_t pgd) +{ + return !__pgd_val(pgd); +} +#endif /* PAGETABLE_LEVELS > 3 */ + +#endif /* __ASSEMBLY__ */ + +/* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) + + +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + +#ifndef __ASSEMBLY__ + +#define direct_gbpages 0 + +/* local pte updates need not use xchg for locking */ +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + xen_set_pte(ptep, __pte(0)); + return res; +} + +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_set_pte(ptep, pte); +} + +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + __xen_pte_clear(ptep); +} + +#ifndef CONFIG_PARAVIRT +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to assure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#endif + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. + */ +struct vm_area_struct; + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +extern int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +#define ptep_clear_flush(vma, addr, ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!pte_none(__res) && \ + ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte(0), \ + uvm_multi(mm_cpumask((vma)->vm_mm)) | \ + UVMF_INVLPG))) { \ + __xen_pte_clear(__ptep); \ + flush_tlb_page(vma, addr); \ + } \ + __res; \ +}) + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t pte = *ptep; + if (!pte_none(pte) + && (mm != &init_mm + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { + pte = xen_ptep_get_and_clear(ptep, pte); + pte_update(mm, addr, ptep); + } + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ + ((full) ? ({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!PagePinned(virt_to_page((mm)->pgd))) \ + __xen_pte_clear(__ptep); \ + else if (!pte_none(__res)) \ + xen_l1_entry_update(__ptep, __pte(0)); \ + __res; \ + }) : \ + ptep_get_and_clear(mm, addr, ptep)) + +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int); + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (pte_write(pte)) + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anwhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. + */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + +#define arbitrary_virt_to_mfn(va) \ +({ \ + unsigned int __lvl; \ + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ + pte_mfn(*__ptep); \ +}) + +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + +#ifdef CONFIG_HIGHPTE +#include +struct page *kmap_atomic_to_page(void *); +#define ptep_to_machine(ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + page_to_phys(kmap_atomic_to_page(__ptep)) \ + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ +}) +#else +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) + return ptep_get_and_clear(mm, addr, ptep); +#endif + return *ptep; +} + +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + mmu_update_t u; + +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) { + set_pte_at(mm, addr, ptep, pte); + return; + } +#endif + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD; + u.val = __pte_val(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF)) + BUG(); +} + +#include + +#include +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +struct vm_area_struct; + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/setup.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/setup.h @@ -0,0 +1,8 @@ +#ifndef __ASSEMBLY__ + +void xen_start_kernel(void); +void xen_arch_setup(void); + +#endif + +#include_next --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypervisor.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypervisor.h @@ -0,0 +1,390 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern shared_info_t *HYPERVISOR_shared_info; + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT +DECLARE_PER_CPU(struct vcpu_info, vcpu_info); +#define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu)) +#define current_vcpu_info() (&__get_cpu_var(vcpu_info)) +#define vcpu_info_read(fld) percpu_read(vcpu_info.fld) +#define vcpu_info_write(fld, val) percpu_write(vcpu_info.fld, val) +#define vcpu_info_xchg(fld, val) percpu_xchg(vcpu_info.fld, val) +void setup_vcpu_info(unsigned int cpu); +void adjust_boot_vcpu_info(void); +#else +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu)) +#ifdef CONFIG_SMP +#define current_vcpu_info() vcpu_info(smp_processor_id()) +#else +#define current_vcpu_info() vcpu_info(0) +#endif +#define vcpu_info_read(fld) (current_vcpu_info()->fld) +#define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val)) +static inline void setup_vcpu_info(unsigned int cpu) {} +#endif + +#ifdef CONFIG_X86_32 +extern unsigned long hypervisor_virt_start; +#endif + +/* arch/xen/i386/kernel/setup.c */ +extern start_info_t *xen_start_info; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) +#else +#define is_initial_xendomain() 0 +#endif + +#define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN)) +#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data) + +struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu); + +/* arch/xen/kernel/evtchn.c */ +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +/* arch/xen/kernel/process.c */ +void xen_cpu_idle (void); + +/* arch/xen/i386/kernel/hypervisor.c */ +void do_hypervisor_callback(struct pt_regs *regs); + +/* arch/xen/i386/mm/hypervisor.c */ +/* + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already + * be MACHINE addresses. + */ + +void xen_pt_switch(pgd_t *); +void xen_new_user_pt(pgd_t *); /* x86_64 only */ +void xen_load_gs(unsigned int selector); /* x86_64 only */ +void xen_tlb_flush(void); +void xen_invlpg(unsigned long ptr); + +void xen_l1_entry_update(pte_t *ptr, pte_t val); +void xen_l2_entry_update(pmd_t *ptr, pmd_t val); +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ +void xen_pgd_pin(pgd_t *); +void xen_pgd_unpin(pgd_t *); + +void xen_init_pgd_pin(void); + +void xen_set_ldt(const void *ptr, unsigned int ents); + +#ifdef CONFIG_SMP +#include +void xen_tlb_flush_all(void); +void xen_invlpg_all(unsigned long ptr); +void xen_tlb_flush_mask(const cpumask_t *mask); +void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr); +#else +#define xen_tlb_flush_all xen_tlb_flush +#define xen_invlpg_all xen_invlpg +#endif + +/* Returns zero on success else negative errno. */ +int xen_create_contiguous_region( + unsigned long vstart, unsigned int order, unsigned int address_bits); +void xen_destroy_contiguous_region( + unsigned long vstart, unsigned int order); +int early_create_contiguous_region(unsigned long pfn, unsigned int order, + unsigned int address_bits); + +struct page; + +int xen_limit_pages_to_max_mfn( + struct page *pages, unsigned int order, unsigned int address_bits); + +/* Turn jiffies into Xen system time. */ +u64 jiffies_to_st(unsigned long jiffies); + +#ifdef CONFIG_XEN_SCRUB_PAGES +void scrub_pages(void *, unsigned int); +#else +#define scrub_pages(_p,_n) ((void)0) +#endif + +#if defined(CONFIG_XEN) && !defined(MODULE) + +DECLARE_PER_CPU(bool, xen_lazy_mmu); + +void xen_multicall_flush(bool); + +int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, + unsigned long flags); +int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count, + unsigned int *success_count, domid_t); +int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count, + unsigned int *success_count, domid_t); + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + percpu_write(xen_lazy_mmu, true); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + percpu_write(xen_lazy_mmu, false); + xen_multicall_flush(false); +} + +#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu)) + +#if 0 /* All uses are in places potentially called asynchronously, but + * asynchronous code should rather not make use of lazy mode at all. + * Therefore, all uses of this function get commented out, proper + * detection of asynchronous invocations is added whereever needed, + * and this function is disabled to catch any new (improper) uses. + */ +static inline void arch_flush_lazy_mmu_mode(void) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(false); +} +#endif + +#else /* !CONFIG_XEN || MODULE */ + +static inline void xen_multicall_flush(bool ignore) {} +#define arch_use_lazy_mmu_mode() false +#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN && !MODULE */ + +#ifdef CONFIG_XEN + +struct gnttab_map_grant_ref; +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *, + unsigned int count); +#if CONFIG_XEN_COMPAT < 0x030400 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int); +#else +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m, + unsigned int count) +{ + BUG(); + return -ENOSYS; +} +#endif + +#else /* !CONFIG_XEN */ + +#define gnttab_pre_map_adjust(...) false +#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN */ + +#if defined(CONFIG_X86_64) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +#ifdef CONFIG_XEN +#define is_running_on_xen() 1 +extern char hypercall_page[PAGE_SIZE]; +#else +extern char *hypercall_stubs; +#define is_running_on_xen() (!!hypercall_stubs) +#endif + +#include + +static inline int +HYPERVISOR_yield( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int +HYPERVISOR_block( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif + + return rc; +} + +static inline void __noreturn +HYPERVISOR_shutdown( + unsigned int reason) +{ + struct sched_shutdown sched_shutdown = { + .reason = reason + }; + + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown)); +#if CONFIG_XEN_COMPAT <= 0x030002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason)); +#endif + /* Don't recurse needlessly. */ + BUG_ON(reason != SHUTDOWN_crash); + for(;;); +} + +static inline int __must_check +HYPERVISOR_poll( + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports, + .timeout = jiffies_to_st(timeout) + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_poll_no_timeout( + evtchn_port_t *ports, unsigned int nr_ports) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +#ifdef CONFIG_XEN + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; +#endif + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; +} + +static inline void +MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req, + unsigned int count, unsigned int *success_count, + domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; +} + +static inline void +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, + void *uop, unsigned int count) +{ + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)uop; + mcl->args[2] = count; +} + +#else /* !defined(CONFIG_XEN) */ + +/* Multicalls not supported for HVM guests. */ +#define MULTI_update_va_mapping(a,b,c,d) ((void)0) +#define MULTI_grant_table_op(a,b,c,d) ((void)0) + +#endif + +#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI) + +#ifdef LINUX +/* drivers/staging/ use Windows-style types, including VOID */ +#undef VOID +#endif + +#endif /* __HYPERVISOR_H__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/maddr_32.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/maddr_32.h @@ -0,0 +1,182 @@ +#ifndef _I386_MADDR_H +#define _I386_MADDR_H + +#include +#include +#include + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. */ +#ifdef CONFIG_X86_PAE +typedef unsigned long long paddr_t; +typedef unsigned long long maddr_t; +#else +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; +#endif + +#ifdef CONFIG_XEN + +extern unsigned long *phys_to_machine_mapping; +extern unsigned long max_mapnr; + +#undef machine_to_phys_mapping +extern unsigned long *machine_to_phys_mapping; +extern unsigned int machine_to_phys_order; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return 1; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return mfn; + + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return max_mapnr; + + /* The array access can fail (e.g., device space beyond end of RAM). */ + asm ( + "1: movl %1,%0\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %2,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,3b\n" + ".previous" + : "=r" (pfn) + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); + + return pfn; +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. + */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if (likely(pfn < max_mapnr) + && likely(!xen_feature(XENFEAT_auto_translated_physmap)) + && unlikely(phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +#ifdef CONFIG_X86_PAE +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to pfn_to_mfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} +#else +#define pte_phys_to_machine phys_to_machine +#define pte_machine_to_phys machine_to_phys +#endif + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#endif /* _I386_MADDR_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgtable-3level.h @@ -0,0 +1,137 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_H +#define _ASM_X86_PGTABLE_3LEVEL_H + +/* + * Intel Physical Address Extension (PAE) Mode - three-level page + * tables on PPro+ CPUs. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), \ + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) + +/* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is + * not possible, use pte_get_and_clear to obtain the old pte + * value and then use set_pte to update it. -ben + */ + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((unsigned long long *)(ptep), __pte_val(pte)); +} + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +/* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +static inline void __xen_pte_clear(pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, __pmd(0)) \ + : (void)(*__pmdp = __pmd(0)); \ +}) + +static inline void __xen_pud_clear(pud_t *pudp) +{ + pgdval_t pgd; + + set_pud(pudp, __pud(0)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + * + * Make sure the pud entry we're updating is within the + * current pgd to avoid unnecessary TLB flushes. + */ + pgd = read_cr3(); + if (__pa(pudp) >= pgd && __pa(pudp) < + (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + xen_tlb_flush(); +} + +#define xen_pud_clear(pudp) \ +({ \ + pud_t *__pudp = (pudp); \ + PagePinned(virt_to_page(__pudp)) \ + ? __xen_pud_clear(__pudp) \ + : (void)(*__pudp = __pud(0)); \ +}) + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + uint64_t val = __pte_val(res); + if (__cmpxchg64(ptep, val, 0) != val) { + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low = xchg(&ptep->pte_low, 0); + res.pte_high = ptep->pte_high; + ptep->pte_high = 0; + } + return res; +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ + ((_pte).pte_high << (32-PAGE_SHIFT))) + +/* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. + */ +#define pte_to_pgoff(pte) ((pte).pte_high) +#define pgoff_to_pte(off) \ + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) +#define PTE_FILE_MAX_BITS 32 + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) +#define __swp_type(x) (((x).val) & 0x1f) +#define __swp_offset(x) ((x).val >> 5) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/pgalloc.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/pgalloc.h @@ -0,0 +1,159 @@ +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include +#include /* for struct page */ +#include + +#include /* for phys_to_virt and page_to_pseudophys */ + +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} + +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) {} +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_release_pte(unsigned long pfn) {} +static inline void paravirt_release_pmd(unsigned long pfn) {} +static inline void paravirt_release_pud(unsigned long pfn) {} + +#ifdef CONFIG_X86_64 +void early_make_page_readonly(void *va, unsigned int feature); +pmd_t *early_get_pmd(unsigned long va); +#define make_lowmem_page_readonly make_page_readonly +#define make_lowmem_page_writable make_page_writable +#endif + +/* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. */ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __pte_free(pte); +} + +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); + + paravirt_alloc_pte(mm, pfn); + if (PagePinned(virt_to_page(pmd))) { +#ifndef CONFIG_HIGHPTE + BUG_ON(PageHighMem(pte)); +#endif + set_pmd(pmd, ent); + } else + *pmd = ent; +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); +extern void __pmd_free(pgtable_t); + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pmd)); +} + +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} + +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_t ent = __pud(_PAGE_TABLE | __pa(pmd)); + + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + if (PagePinned(virt_to_page(pud))) + set_pud(pud, ent); + else + *pud = ent; +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud)); + + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + if (unlikely(PagePinned(virt_to_page(pgd)))) + xen_l4_entry_update(pgd, ent); + else + *__user_pgd(pgd) = *pgd = ent; +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)pmd_alloc_one(mm, addr); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pud)); +} + +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#endif /* _ASM_X86_PGALLOC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/tlbflush.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/tlbflush.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_TLBFLUSH_H +#define _ASM_X86_TLBFLUSH_H + +#include +#include + +#include +#include + +#define __flush_tlb() xen_tlb_flush() +#define __flush_tlb_global() xen_tlb_flush() +#define __flush_tlb_single(addr) xen_invlpg(addr) +#define __flush_tlb_all() xen_tlb_flush() +#define __flush_tlb_one(addr) xen_invlpg(addr) + +#ifdef CONFIG_X86_32 +# define TLB_FLUSH_ALL 0xffffffff +#else +# define TLB_FLUSH_ALL -1ULL +#endif + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular flushes are available only on i486 and up. + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. + */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +static inline void reset_lazy_tlbstate(void) +{ +} + +#else /* SMP */ + +#include + +#define local_flush_tlb() __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm)) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm)) +#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#ifndef CONFIG_XEN +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +struct tlb_state { + struct mm_struct *active_mm; + int state; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +static inline void reset_lazy_tlbstate(void) +{ + percpu_write(cpu_tlbstate.state, 0); + percpu_write(cpu_tlbstate.active_mm, &init_mm); +} +#endif + +#endif /* SMP */ + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + +extern void zap_low_mappings(bool early); + +#endif /* _ASM_X86_TLBFLUSH_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/hypercall_64.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/hypercall_64.h @@ -0,0 +1,54 @@ +#define HYPERCALL_arg1 "rdi" +#define HYPERCALL_arg2 "rsi" +#define HYPERCALL_arg3 "rdx" +#define HYPERCALL_arg4 "r10" +#define HYPERCALL_arg5 "r8" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall1(long, set_timer_op, timeout); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); +} + +static inline int __must_check +HYPERVISOR_set_segment_base( + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/io.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/io.h @@ -0,0 +1,230 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + +#define ARCH_HAS_IOREMAP_WC + +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "=q", :"memory") +build_mmio_read(readw, "w", unsigned short, "=r", :"memory") +build_mmio_read(readl, "l", unsigned int, "=r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "=q", ) +build_mmio_read(__readw, "w", unsigned short, "=r", ) +build_mmio_read(__readl, "l", unsigned int, "=r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define readl_relaxed(a) __readl(a) +#define __raw_readb __readb +#define __raw_readw __readw +#define __raw_readl __readl + +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#define mmiowb() barrier() + +#ifdef CONFIG_X86_64 + +build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_write(writeq, "q", unsigned long, "r", :"memory") + +#else + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + +#endif + +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq + +#define native_io_delay xen_io_delay + +/** + * virt_to_phys - map virtual addresses to physical + * @address: address to remap + * + * The returned physical address is the physical (CPU) mapping for + * the memory address given. It is only valid to use this function on + * addresses directly mapped or allocated via kmalloc. + * + * This function does not give bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline phys_addr_t virt_to_phys(volatile void *address) +{ + return __pa(address); +} + +/** + * phys_to_virt - map physical address to virtual + * @address: address to remap + * + * The returned virtual address is a current CPU mapping for + * the memory address given. It is only valid to use this function on + * addresses that have a kernel mapping + * + * This function does not handle bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline void *phys_to_virt(phys_addr_t address) +{ + return __va(address); +} + +/* + * Change "struct page" to physical address. + */ +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#undef page_to_phys +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) + +/* + * ISA I/O bus memory addresses are 1:1 with the physical address. + * However, we truncate the address to unsigned int to avoid undesirable + * promitions in legacy drivers. + */ +#define isa_virt_to_bus(_x) ({ \ + unsigned long _va_ = (unsigned long)(_x); \ + _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \ + ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \ + : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); }) +#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) + +/* + * However PCI ones are not necessarily 1:1 and therefore these interfaces + * are forbidden in portable PCI drivers. + * + * Allow them on x86 for legacy drivers, though. + */ +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) +#define bus_to_virt(_x) __va(machine_to_phys(_x)) + +/** + * ioremap - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * If the area you are trying to map is a PCI BAR you should have a + * look at pci_iomap(). + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + +extern void set_iounmap_nonlazy(void); + +#ifdef CONFIG_X86_32 +# include "../../asm/io_32.h" +#else +# include "../../asm/io_64.h" +#endif + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +/* We will be supplying our own /dev/mem implementation */ +#define ARCH_HAS_DEV_MEM + +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ + (unsigned long) (bv)->bv_offset) + +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (bvec_to_phys(vec1) + (vec1)->bv_len == bvec_to_phys(vec2) \ + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \ + == bvec_to_pseudophys(vec2)) + +#undef __ISA_IO_base + +#endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val); +extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. + */ +extern void early_ioremap_init(void); +extern void early_ioremap_reset(void); +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); + +#define IO_SPACE_LIMIT 0xffff + +#endif /* _ASM_X86_IO_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/mach-xen/asm/fixmap.h +++ linux-ec2-2.6.32/arch/x86/include/mach-xen/asm/fixmap.h @@ -0,0 +1,214 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009 + */ + +#ifndef _ASM_X86_FIXMAP_H +#define _ASM_X86_FIXMAP_H + +#ifndef __ASSEMBLY__ +#include +#include +#include +#ifdef CONFIG_X86_32 +#include +#include +#else +#include +#endif + +/* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall + * uses fixmaps that relies on FIXADDR_TOP for proper address calculation. + * Because of this, FIXADDR_TOP x86 integration was left as later work. + */ +#ifdef CONFIG_X86_32 +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +#else +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) +#endif + + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * for x86_32: We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { +#ifdef CONFIG_X86_32 + FIX_HOLE, + FIX_VDSO, +#else + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VSYSCALL_HPET, +#endif + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +#endif +#else + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ + __end_of_permanent_fixed_addresses, + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * We round it up to the next 256 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - + (__end_of_permanent_fixed_addresses & 255), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +#ifdef CONFIG_X86_32 + FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif + __end_of_fixed_addresses +}; + + +extern void reserve_top_address(unsigned long reserve); + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) + +extern int fixmaps_set; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t); + +static inline void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + xen_set_fixmap(idx, phys, flags); +} + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) + +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_FIXMAP_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/system.h +++ linux-ec2-2.6.32/arch/x86/include/asm/system.h @@ -449,7 +449,7 @@ * * (Could use an alternative three way for this if there was one.) */ -static inline void rdtsc_barrier(void) +static __always_inline void rdtsc_barrier(void) { alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/acpi.h +++ linux-ec2-2.6.32/arch/x86/include/asm/acpi.h @@ -30,6 +30,10 @@ #include #include +#ifdef CONFIG_XEN +#include +#endif + #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -89,6 +93,7 @@ extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; +extern int acpi_fix_pin2_polarity; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; @@ -120,6 +125,27 @@ /* early initialization routine */ extern void acpi_reserve_bootmem(void); +#ifdef CONFIG_XEN +static inline int acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt_val, + u32 pm1b_cnt_val) +{ + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u = { + .enter_acpi_sleep = { + .pm1a_cnt_val = pm1a_cnt_val, + .pm1b_cnt_val = pm1b_cnt_val, + .sleep_state = sleep_state, + }, + }, + }; + + return HYPERVISOR_platform_op(&op); +} +#endif /* CONFIG_XEN */ + /* * Check if the CPU can handle C2 and deeper */ @@ -152,7 +178,9 @@ #endif /* !CONFIG_ACPI */ +#ifndef CONFIG_XEN #define ARCH_HAS_POWER_INIT 1 +#endif struct bootnode; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/kvm_host.h +++ linux-ec2-2.6.32/arch/x86/include/asm/kvm_host.h @@ -193,6 +193,7 @@ unsigned invalid:1; unsigned cr4_pge:1; unsigned nxe:1; + unsigned cr0_wp:1; }; }; @@ -256,7 +257,8 @@ void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -355,6 +357,9 @@ struct page *time_page; bool singlestep; /* guest is single stepped by KVM */ + u64 last_guest_tsc; + u64 last_kernel_ns; + bool nmi_pending; bool nmi_injected; @@ -412,6 +417,7 @@ unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; u64 vm_init_tsc; + s64 kvmclock_offset; }; struct kvm_vm_stat { @@ -600,8 +606,7 @@ unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); @@ -644,6 +649,10 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -657,6 +666,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); @@ -667,20 +677,6 @@ return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -688,16 +684,6 @@ return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); @@ -755,6 +741,20 @@ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } +static inline u64 get_canonical(u64 la) +{ + return ((int64_t)la << 16) >> 16; +} + +static inline bool is_noncanonical_address(u64 la) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la) != la; +#else + return false; +#endif +} + #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hw_irq.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hw_irq.h @@ -107,6 +107,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP +#ifndef CONFIG_XEN extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); extern void smp_call_function_single_interrupt(struct pt_regs *); @@ -115,9 +116,18 @@ #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +#else +#include +extern irqreturn_t smp_reschedule_interrupt(int, void *); +extern irqreturn_t smp_call_function_interrupt(int, void *); +extern irqreturn_t smp_call_function_single_interrupt(int, void *); +extern irqreturn_t smp_reboot_interrupt(int, void *); +#endif #endif +#ifndef CONFIG_XEN extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); +#endif typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/percpu.h +++ linux-ec2-2.6.32/arch/x86/include/asm/percpu.h @@ -133,6 +133,38 @@ ret__; \ }) +#define percpu_xchg_op(op, var, val) \ +({ \ + typedef typeof(var) T__; \ + T__ ret__; \ + if (0) \ + ret__ = (val); \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %0,"__percpu_arg(1) \ + : "=q" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + case 2: \ + asm(op "w %0,"__percpu_arg(1) \ + : "=r" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + case 4: \ + asm(op "l %0,"__percpu_arg(1) \ + : "=r" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + case 8: \ + asm(op "q %0,"__percpu_arg(1) \ + : "=r" (ret__), "+m" (var) \ + : "0" ((T__)(val))); \ + break; \ + default: __bad_percpu_size(); \ + } \ + ret__; \ +}) + /* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. @@ -152,6 +184,10 @@ #define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) #define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) #define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define percpu_xchg(var, val) percpu_xchg_op("xchg", per_cpu__##var, val) +#if defined(CONFIG_X86_XADD) || defined(CONFIG_X86_64) +#define percpu_xadd(var, val) percpu_xchg_op("xadd", per_cpu__##var, val) +#endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/tsc.h +++ linux-ec2-2.6.32/arch/x86/include/asm/tsc.h @@ -59,5 +59,7 @@ extern void check_tsc_sync_target(void); extern int notsc_setup(char *); +extern void save_sched_clock_state(void); +extern void restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/elf.h +++ linux-ec2-2.6.32/arch/x86/include/asm/elf.h @@ -197,14 +197,8 @@ set_fs(USER_DS); \ } while (0) -#define COMPAT_SET_PERSONALITY(ex) \ -do { \ - if (test_thread_flag(TIF_IA32)) \ - clear_thread_flag(TIF_ABI_PENDING); \ - else \ - set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ -} while (0) +void set_personality_ia32(void); +#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() #define COMPAT_ELF_PLATFORM ("i686") --- linux-ec2-2.6.32.orig/arch/x86/include/asm/pgtable_32.h +++ linux-ec2-2.6.32/arch/x86/include/asm/pgtable_32.h @@ -27,6 +27,7 @@ struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; +extern pgd_t trampoline_pg_dir[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/paravirt.h +++ linux-ec2-2.6.32/arch/x86/include/asm/paravirt.h @@ -289,6 +289,12 @@ { PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } +#ifdef CONFIG_X86_32 +static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) +{ + PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); +} +#endif /*CONFIG_X86_32*/ static inline void store_gdt(struct desc_ptr *dtr) { PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/required-features.h +++ linux-ec2-2.6.32/arch/x86/include/asm/required-features.h @@ -48,7 +48,7 @@ #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT +#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN) /* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 #define NEED_PGE 0 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/kvm_emulate.h +++ linux-ec2-2.6.32/arch/x86/include/asm/kvm_emulate.h @@ -54,13 +54,23 @@ struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -99,6 +109,8 @@ unsigned int bytes, struct kvm_vcpu *vcpu); + bool (*get_cpuid)(struct kvm_vcpu *vcpu, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; /* Type, address-of, and value of an instruction's operand. */ @@ -129,7 +141,7 @@ u8 seg_override; unsigned int d; unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; + unsigned long eip, eip_orig; /* modrm */ u8 modrm; u8 modrm_mod; @@ -168,10 +180,24 @@ /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ +/* CPUID vendors */ +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 + +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 + +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e +#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 + /* Host execution mode. */ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/signal.h +++ linux-ec2-2.6.32/arch/x86/include/asm/signal.h @@ -125,6 +125,8 @@ extern void do_notify_resume(struct pt_regs *, void *, __u32); # endif /* __KERNEL__ */ +#define __ARCH_HAS_SA_RESTORER + #ifdef __i386__ # ifdef __KERNEL__ struct old_sigaction { --- linux-ec2-2.6.32.orig/arch/x86/include/asm/page_64_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/page_64_types.h @@ -14,12 +14,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) @@ -69,7 +68,15 @@ #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM +/* + * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen + * other than for hotplugged memory. + */ +#ifndef CONFIG_XEN #define pfn_valid(pfn) ((pfn) < max_pfn) +#else +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/mmu.h +++ linux-ec2-2.6.32/arch/x86/include/asm/mmu.h @@ -7,15 +7,25 @@ /* * The x86 doesn't have a mmu context, but * we put the segment information here. + * + * exec_limit is used to track the range PROT_EXEC + * mappings span. */ typedef struct { void *ldt; int size; +#ifdef CONFIG_XEN + unsigned has_foreign_mappings:1; +#endif struct mutex lock; void *vdso; +#ifdef CONFIG_X86_32 + struct desc_struct user_cs; + unsigned long exec_limit; +#endif } mm_context_t; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) void leave_mm(int cpu); #else static inline void leave_mm(int cpu) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/vmx.h +++ linux-ec2-2.6.32/arch/x86/include/asm/vmx.h @@ -56,6 +56,7 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -144,6 +145,8 @@ VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, TPR_THRESHOLD = 0x0000401c, SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -248,6 +251,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/suspend_32.h +++ linux-ec2-2.6.32/arch/x86/include/asm/suspend_32.h @@ -15,6 +15,8 @@ struct saved_context { u16 es, fs, gs, ss; unsigned long cr0, cr2, cr3, cr4; + u64 misc_enable; + bool misc_enable_saved; struct desc_ptr gdt; struct desc_ptr idt; u16 ldt; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hpet.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hpet.h @@ -66,6 +66,7 @@ extern unsigned long hpet_address; extern unsigned long force_hpet_address; extern int hpet_force_user; +extern u8 hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/smp.h +++ linux-ec2-2.6.32/arch/x86/include/asm/smp.h @@ -50,7 +50,7 @@ void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); - void (*smp_send_stop)(void); + void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu); @@ -73,7 +73,12 @@ static inline void smp_send_stop(void) { - smp_ops.smp_send_stop(); + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); } static inline void smp_prepare_boot_cpu(void) @@ -135,6 +140,8 @@ void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); @@ -147,6 +154,13 @@ { return cpumask_weight(cpu_callout_mask); } +#else /* !CONFIG_SMP */ +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/apicdef.h +++ linux-ec2-2.6.32/arch/x86/include/asm/apicdef.h @@ -11,6 +11,8 @@ #define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 #define APIC_DEFAULT_PHYS_BASE 0xfee00000 +#ifndef CONFIG_XEN + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -72,6 +74,7 @@ #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 @@ -136,6 +139,16 @@ #define APIC_BASE_MSR 0x800 #define X2APIC_ENABLE (1UL << 10) +#else /* CONFIG_XEN */ + +enum { + APIC_DEST_ALLBUT = 0x1, + APIC_DEST_SELF, + APIC_DEST_ALLINC +}; + +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 #else @@ -143,6 +156,8 @@ # define MAX_LOCAL_APIC 32768 #endif +#ifndef CONFIG_XEN + /* * All x86-64 systems are xAPIC compatible. * In the following, "apicid" is a physical APIC ID. @@ -413,6 +428,8 @@ #undef u32 +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else --- linux-ec2-2.6.32.orig/arch/x86/include/asm/mmu_context.h +++ linux-ec2-2.6.32/arch/x86/include/asm/mmu_context.h @@ -36,8 +36,6 @@ unsigned cpu = smp_processor_id(); if (likely(prev != next)) { - /* stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); @@ -47,6 +45,9 @@ /* Re-load page tables */ load_cr3(next->pgd); + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + /* * load the LDT, if the LDT is different: */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/time.h +++ linux-ec2-2.6.32/arch/x86/include/asm/time.h @@ -7,4 +7,11 @@ extern void time_init(void); +#ifdef CONFIG_XEN +struct timespec; +extern int xen_independent_wallclock(void); +extern void xen_read_persistent_clock(struct timespec *); +extern int xen_update_persistent_clock(void); +#endif + #endif /* _ASM_X86_TIME_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/k8.h +++ linux-ec2-2.6.32/arch/x86/include/asm/k8.h @@ -1,11 +1,13 @@ #ifndef _ASM_X86_K8_H #define _ASM_X86_K8_H +#include #include extern struct pci_device_id k8_nb_ids[]; extern int early_is_k8_nb(u32 value); +extern struct resource *amd_get_mmconfig_range(struct resource *res); extern struct pci_dev **k8_northbridges; extern int num_k8_northbridges; extern int cache_k8_northbridges(void); @@ -13,11 +15,16 @@ extern int k8_scan_nodes(unsigned long start, unsigned long end); #ifdef CONFIG_K8_NB +extern int num_k8_northbridges; + static inline struct pci_dev *node_to_k8_nb_misc(int node) { return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; } + #else +#define num_k8_northbridges 0 + static inline struct pci_dev *node_to_k8_nb_misc(int node) { return NULL; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/irqflags.h +++ linux-ec2-2.6.32/arch/x86/include/asm/irqflags.h @@ -130,7 +130,7 @@ #define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ -#define INTERRUPT_RETURN iretq +#define INTERRUPT_RETURN jmp native_iret #define USERGS_SYSRET64 \ swapgs; \ sysretq; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/apic.h +++ linux-ec2-2.6.32/arch/x86/include/asm/apic.h @@ -10,12 +10,16 @@ #include #include #include +#ifndef CONFIG_XEN #include +#endif #include #include #include +#ifndef CONFIG_XEN #define ARCH_APICTIMER_STOPS_ON_C3 1 +#endif /* * Debugging macros @@ -47,6 +51,7 @@ #ifdef CONFIG_X86_LOCAL_APIC extern unsigned int apic_verbosity; +#ifndef CONFIG_XEN extern int local_apic_timer_c2_ok; extern int disable_apic; @@ -119,6 +124,8 @@ extern int x2apic_mode; +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -365,6 +372,8 @@ */ extern struct apic *apic; +#ifndef CONFIG_XEN + /* * APIC functionality to boot other CPUs - only used on SMP: */ @@ -458,6 +467,8 @@ extern void generic_bigsmp_probe(void); +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_LOCAL_APIC @@ -477,6 +488,8 @@ DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#ifndef CONFIG_XEN + static inline unsigned int read_apic_id(void) { unsigned int reg; @@ -588,6 +601,8 @@ return physid_mask_of_physid(phys_apicid); } +#endif /* CONFIG_XEN */ + #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_32 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/pgtable_64_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/pgtable_64_types.h @@ -59,5 +59,7 @@ #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/ptrace.h +++ linux-ec2-2.6.32/arch/x86/include/asm/ptrace.h @@ -142,9 +142,6 @@ int error_code, int si_code); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -extern long syscall_trace_enter(struct pt_regs *); -extern void syscall_trace_leave(struct pt_regs *); - static inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; @@ -224,12 +221,32 @@ extern void user_disable_single_step(struct task_struct *); extern void user_enable_block_step(struct task_struct *); -#ifdef CONFIG_X86_DEBUGCTLMSR +#if defined(CONFIG_XEN) +#define arch_has_block_step() (0) +#elif defined(CONFIG_X86_DEBUGCTLMSR) #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif +#define ARCH_HAS_USER_SINGLE_STEP_INFO + +/* + * When hitting ptrace_stop(), we cannot return using SYSRET because + * that does not restore the full CPU state, only a minimal set. The + * ptracer can change arbitrary register values, which is usually okay + * because the usual ptrace stops run off the signal delivery path which + * forces IRET; however, ptrace_event() stops happen in arbitrary places + * in the kernel and don't force IRET path. + * + * So force IRET path after a ptrace stop. + */ +#define arch_ptrace_stop_needed(code, info) \ +({ \ + set_thread_flag(TIF_NOTIFY_RESUME); \ + false; \ +}) + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/dma-mapping.h +++ linux-ec2-2.6.32/arch/x86/include/asm/dma-mapping.h @@ -151,8 +151,6 @@ { struct dma_map_ops *ops = get_dma_ops(dev); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_from_coherent(dev, get_order(size), vaddr)) return; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/irq.h +++ linux-ec2-2.6.32/arch/x86/include/asm/irq.h @@ -15,7 +15,7 @@ return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) # define ARCH_HAS_NMI_WATCHDOG #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/pvclock.h +++ linux-ec2-2.6.32/arch/x86/include/asm/pvclock.h @@ -10,5 +10,44 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +void pvclock_resume(void); + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} #endif /* _ASM_X86_PVCLOCK_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/suspend_64.h +++ linux-ec2-2.6.32/arch/x86/include/asm/suspend_64.h @@ -27,6 +27,8 @@ u16 ds, es, fs, gs, ss; unsigned long gs_base, gs_kernel_base, fs_base; unsigned long cr0, cr2, cr3, cr4, cr8; + u64 misc_enable; + bool misc_enable_saved; unsigned long efer; u16 gdt_pad; u16 gdt_limit; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/kexec.h +++ linux-ec2-2.6.32/arch/x86/include/asm/kexec.h @@ -5,14 +5,30 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN */ +/* + * The hypervisor interface implicitly requires that all entries (except + * for possibly the final one) are arranged in matching PA_/VA_ pairs. +# define VA_PGD 3 + */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_TABLE_PAGE 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN, see comment above +# define VA_TABLE_PAGE 3 */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #endif # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 @@ -163,6 +179,19 @@ }; #endif +/* Under Xen we need to work with machine addresses. These macros give the + * machine address of a certain page to the generic kexec code instead of + * the pseudo physical address which would be given by the default macros. + */ + +#ifdef CONFIG_XEN +#define KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) +#define kexec_virt_to_phys(addr) virt_to_machine(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/page_32_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/page_32_types.h @@ -22,7 +22,6 @@ #endif #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/scatterlist.h +++ linux-ec2-2.6.32/arch/x86/include/asm/scatterlist.h @@ -3,6 +3,10 @@ #define ISA_DMA_THRESHOLD (0x00ffffff) +#ifdef CONFIG_X86_XEN +# define sg_dma_len(sg) ((sg)->dma_length) +#endif + #include #endif /* _ASM_X86_SCATTERLIST_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/irq_vectors.h +++ linux-ec2-2.6.32/arch/x86/include/asm/irq_vectors.h @@ -113,7 +113,7 @@ */ #define LOCAL_PENDING_VECTOR 0xec -#define UV_BAU_MESSAGE 0xec +#define UV_BAU_MESSAGE 0xea /* * Self IPI vector for machine checks --- linux-ec2-2.6.32.orig/arch/x86/include/asm/trampoline.h +++ linux-ec2-2.6.32/arch/x86/include/asm/trampoline.h @@ -13,15 +13,18 @@ extern unsigned long init_rsp; extern unsigned long initial_code; +extern unsigned long initial_page_table; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) #define TRAMPOLINE_BASE 0x6000 extern unsigned long setup_trampoline(void); +extern void __init setup_trampoline_page_table(void); extern void __init reserve_trampoline_memory(void); #else -static inline void reserve_trampoline_memory(void) {}; +static inline void setup_trampoline_page_table(void) {} +static inline void reserve_trampoline_memory(void) {} #endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/cmpxchg_32.h +++ linux-ec2-2.6.32/arch/x86/include/asm/cmpxchg_32.h @@ -17,60 +17,33 @@ #define __xg(x) ((struct __xchg_dummy *)(x)) /* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) + * CMPXCHG8B only writes to the target if we had the previous + * value in registers, otherwise it acts as a read and gives us the + * "new previous" value. That is why there is a loop. Preloading + * EDX:EAX is a performance optimization: in the common case it means + * we need only one locked operation. * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. + * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very + * least an FPU save and/or %cr0.ts manipulation. + * + * cmpxchg8b must be used with the lock prefix here to allow the + * instruction to be executed atomically. We need to have the reader + * side to see the coherent 64bit value. */ -static inline void __set_64bit(unsigned long long *ptr, - unsigned int low, unsigned int high) +static inline void set_64bit(volatile u64 *ptr, u64 value) { + u32 low = value; + u32 high = value >> 32; + u64 prev = *ptr; + asm volatile("\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%0)\n\t" + LOCK_PREFIX "cmpxchg8b %0\n\t" "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) - : "ax", "dx", "memory"); -} - -static inline void __set_64bit_constant(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); -} - -#define ll_low(x) *(((unsigned int *)&(x)) + 0) -#define ll_high(x) *(((unsigned int *)&(x)) + 1) - -static inline void __set_64bit_var(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, ll_low(value), ll_high(value)); + : "=m" (*ptr), "+A" (prev) + : "b" (low), "c" (high) + : "memory"); } -#define set_64bit(ptr, value) \ - (__builtin_constant_p((value)) \ - ? __set_64bit_constant((ptr), (value)) \ - : __set_64bit_var((ptr), (value))) - -#define _set_64bit(ptr, value) \ - (__builtin_constant_p(value) \ - ? __set_64bit(ptr, (unsigned int)(value), \ - (unsigned int)((value) >> 32)) \ - : __set_64bit(ptr, ll_low((value)), ll_high((value)))) - /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway * Note 2: xchg has side effect, so that attribute volatile is necessary, @@ -82,20 +55,20 @@ switch (size) { case 1: asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=q" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; case 2: asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=r" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; case 4: asm volatile("xchgl %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=r" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; } @@ -139,21 +112,21 @@ unsigned long prev; switch (size) { case 1: - asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgb %b2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "q"(new), "0"(old) : "memory"); return prev; case 2: - asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgw %w2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 4: - asm volatile(LOCK_PREFIX "cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgl %2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; } @@ -172,21 +145,21 @@ unsigned long prev; switch (size) { case 1: - asm volatile("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("lock; cmpxchgb %b2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "q"(new), "0"(old) : "memory"); return prev; case 2: - asm volatile("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("lock; cmpxchgw %w2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 4: - asm volatile("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("lock; cmpxchgl %2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; } @@ -200,21 +173,21 @@ unsigned long prev; switch (size) { case 1: - asm volatile("cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgb %b2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "q"(new), "0"(old) : "memory"); return prev; case 2: - asm volatile("cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgw %w2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 4: - asm volatile("cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgl %2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; } @@ -226,11 +199,10 @@ unsigned long long new) { unsigned long long prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %3" - : "=A"(prev) + asm volatile(LOCK_PREFIX "cmpxchg8b %1" + : "=A"(prev), "+m" (*__xg(ptr)) : "b"((unsigned long)new), "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; @@ -241,11 +213,10 @@ unsigned long long new) { unsigned long long prev; - asm volatile("cmpxchg8b %3" - : "=A"(prev) + asm volatile("cmpxchg8b %1" + : "=A"(prev), "+m"(*__xg(ptr)) : "b"((unsigned long)new), "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/segment.h +++ linux-ec2-2.6.32/arch/x86/include/asm/segment.h @@ -186,7 +186,9 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_X86_XEN) +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) +#elif !defined(CONFIG_PARAVIRT) #define get_kernel_rpl() 0 #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/nmi.h +++ linux-ec2-2.6.32/arch/x86/include/asm/nmi.h @@ -5,8 +5,6 @@ #include #include -#ifdef ARCH_HAS_NMI_WATCHDOG - /** * do_nmi_callback * @@ -16,6 +14,11 @@ int do_nmi_callback(struct pt_regs *regs, int cpu); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); + +extern int unknown_nmi_panic; + +#ifdef ARCH_HAS_NMI_WATCHDOG + extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); @@ -42,7 +45,6 @@ struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); -extern int unknown_nmi_panic; void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace @@ -65,7 +67,6 @@ */ return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); } -#endif void lapic_watchdog_stop(void); int lapic_watchdog_init(unsigned nmi_hz); @@ -73,6 +74,9 @@ unsigned lapic_adjust_nmi_hz(unsigned hz); void disable_lapic_nmi_watchdog(void); void enable_lapic_nmi_watchdog(void); + +#endif + void stop_nmi(void); void restart_nmi(void); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hardirq.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hardirq.h @@ -18,7 +18,11 @@ #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; +#ifndef CONFIG_XEN unsigned int irq_tlb_count; +#else + unsigned int irq_lock_count; +#endif #endif #ifdef CONFIG_X86_MCE unsigned int irq_thermal_count; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/paravirt_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/paravirt_types.h @@ -118,6 +118,9 @@ void (*store_gdt)(struct desc_ptr *); void (*store_idt)(struct desc_ptr *); void (*set_ldt)(const void *desc, unsigned entries); +#ifdef CONFIG_X86_32 + void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); +#endif /*CONFIG_X86_32*/ unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); #ifdef CONFIG_X86_64 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/thread_info.h +++ linux-ec2-2.6.32/arch/x86/include/asm/thread_info.h @@ -86,7 +86,6 @@ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -96,6 +95,9 @@ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#ifdef CONFIG_X86_XEN +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */ +#endif #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -110,7 +112,6 @@ #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) @@ -119,6 +120,7 @@ #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_CSTAR (1 << TIF_CSTAR) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -145,9 +147,14 @@ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ +#ifndef CONFIG_XEN #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) +#else +#define _TIF_WORK_CTXSW (_TIF_NOTSC \ + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/) +#endif #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/rwsem.h +++ linux-ec2-2.6.32/arch/x86/include/asm/rwsem.h @@ -41,6 +41,7 @@ #include #include #include +#include struct rwsem_waiter; @@ -55,17 +56,28 @@ /* * the semaphore definition + * + * The bias values and the counter type limits the number of + * potential readers/writers to 32767 for 32 bits and 2147483647 + * for 64 bits. */ -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) +#ifdef CONFIG_X86_64 +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_UNLOCKED_VALUE 0x00000000L +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +typedef signed long rwsem_count_t; + struct rw_semaphore { - signed long count; + rwsem_count_t count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -105,7 +117,7 @@ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " incl (%%eax)\n\t" + LOCK_PREFIX _ASM_INC "(%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -121,14 +133,14 @@ */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - __s32 result, tmp; + rwsem_count_t result, tmp; asm volatile("# beginning __down_read_trylock\n\t" - " movl %0,%1\n\t" + " mov %0,%1\n\t" "1:\n\t" - " movl %1,%2\n\t" - " addl %3,%2\n\t" + " mov %1,%2\n\t" + " add %3,%2\n\t" " jle 2f\n\t" - LOCK_PREFIX " cmpxchgl %2,%0\n\t" + LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" "# ending __down_read_trylock\n\t" @@ -143,13 +155,13 @@ */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - int tmp; + rwsem_count_t tmp; tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtract 0x0000ffff, returns the old value */ - " testl %%edx,%%edx\n\t" + " test %1,%1\n\t" /* was the count 0 before? */ " jz 1f\n" " call call_rwsem_down_write_failed\n" @@ -170,9 +182,9 @@ */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - signed long ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + rwsem_count_t ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -183,9 +195,9 @@ */ static inline void __up_read(struct rw_semaphore *sem) { - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" @@ -201,18 +213,18 @@ */ static inline void __up_write(struct rw_semaphore *sem) { + rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" - " movl %2,%%edx\n\t" - LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ " jz 1f\n" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" - : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc", "edx"); + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS) + : "memory", "cc"); } /* @@ -221,33 +233,38 @@ static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " addl %2,(%%eax)\n\t" - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" + /* + * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) + * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) + */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "a" (sem), "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); } /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(rwsem_count_t delta, + struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "addl %1,%0" + asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) - : "ir" (delta)); + : "er" (delta)); } /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, + struct rw_semaphore *sem) { - int tmp = delta; + rwsem_count_t tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/msr.h +++ linux-ec2-2.6.32/arch/x86/include/asm/msr.h @@ -27,6 +27,18 @@ }; }; +struct msr_info { + u32 msr_no; + struct msr reg; + struct msr *msrs; + int err; +}; + +struct msr_regs_info { + u32 *regs; + int err; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; @@ -244,11 +256,14 @@ #define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) +struct msr *msrs_alloc(void); +void msrs_free(struct msr *msrs); + #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/sys_ia32.h +++ linux-ec2-2.6.32/arch/x86/include/asm/sys_ia32.h @@ -62,9 +62,6 @@ asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - struct oldold_utsname; struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/boot.h +++ linux-ec2-2.6.32/arch/x86/include/asm/boot.h @@ -16,7 +16,7 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/espfix.h +++ linux-ec2-2.6.32/arch/x86/include/asm/espfix.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_ESPFIX_H +#define _ASM_X86_ESPFIX_H + +#ifdef CONFIG_X86_64 + +#include + +DECLARE_PER_CPU(unsigned long, espfix_stack); +DECLARE_PER_CPU(unsigned long, espfix_waddr); + +extern void init_espfix_bsp(void); +extern void init_espfix_ap(void); + +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_ESPFIX_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/uaccess.h +++ linux-ec2-2.6.32/arch/x86/include/asm/uaccess.h @@ -42,7 +42,7 @@ * Returns 0 if the range is valid, nonzero otherwise. * * This is equivalent to the following test: - * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) + * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) * * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ @@ -570,7 +570,6 @@ #ifdef CONFIG_X86_32 # include "uaccess_32.h" #else -# define ARCH_HAS_SEARCH_EXTABLE # include "uaccess_64.h" #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/archrandom.h +++ linux-ec2-2.6.32/arch/x86/include/asm/archrandom.h @@ -0,0 +1,75 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef ASM_X86_ARCHRANDOM_H +#define ASM_X86_ARCHRANDOM_H + +#include +#include +#include +#include + +#define RDRAND_RETRY_LOOPS 10 + +#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#ifdef CONFIG_X86_64 +# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +#else +# define RDRAND_LONG RDRAND_INT +#endif + +#ifdef CONFIG_ARCH_RANDOM + +#define GET_RANDOM(name, type, rdrand, nop) \ +static inline int name(type *v) \ +{ \ + int ok; \ + alternative_io("movl $0, %0\n\t" \ + nop, \ + "\n1: " rdrand "\n\t" \ + "jc 2f\n\t" \ + "decl %0\n\t" \ + "jnz 1b\n\t" \ + "2:", \ + X86_FEATURE_RDRAND, \ + ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ + "0" (RDRAND_RETRY_LOOPS)); \ + return ok; \ +} + +#ifdef CONFIG_X86_64 + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); + +#else + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); + +#endif /* CONFIG_X86_64 */ + +#endif /* CONFIG_ARCH_RANDOM */ + +extern void x86_init_rdrand(struct cpuinfo_x86 *c); + +#endif /* ASM_X86_ARCHRANDOM_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/io_apic.h +++ linux-ec2-2.6.32/arch/x86/include/asm/io_apic.h @@ -160,6 +160,7 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); +void setup_IO_APIC_irq_extra(u32 gsi); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); extern void ioapic_insert_resources(void); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/compat.h +++ linux-ec2-2.6.32/arch/x86/include/asm/compat.h @@ -204,7 +204,7 @@ return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; --- linux-ec2-2.6.32.orig/arch/x86/include/asm/syscalls.h +++ linux-ec2-2.6.32/arch/x86/include/asm/syscalls.h @@ -55,8 +55,6 @@ struct oldold_utsname; struct old_utsname; -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage int old_mmap(struct mmap_arg_struct __user *); asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/smpboot_hooks.h +++ linux-ec2-2.6.32/arch/x86/include/asm/smpboot_hooks.h @@ -34,7 +34,7 @@ */ CMOS_WRITE(0, 0xf); - *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0; + *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0; } static inline void __init smpboot_setup_io_apic(void) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/desc.h +++ linux-ec2-2.6.32/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include #include #include +#include static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) @@ -93,6 +94,9 @@ #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt +#ifdef CONFIG_X86_32 +#define load_user_cs_desc native_load_user_cs_desc +#endif /*CONFIG_X86_32*/ #define write_ldt_entry(dt, entry, desc) \ native_write_ldt_entry(dt, entry, desc) @@ -392,4 +396,25 @@ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } +#ifdef CONFIG_X86_32 +static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) +{ + limit = (limit - 1) / PAGE_SIZE; + desc->a = limit & 0xffff; + desc->b = (limit & 0xf0000) | 0x00c0fb00; +} + +static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; +} + +#define arch_add_exec_range arch_add_exec_range +#define arch_remove_exec_range arch_remove_exec_range +#define arch_flush_exec_range arch_flush_exec_range +extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_flush_exec_range(struct mm_struct *mm); +#endif /* CONFIG_X86_32 */ + #endif /* _ASM_X86_DESC_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/i8253.h +++ linux-ec2-2.6.32/arch/x86/include/asm/i8253.h @@ -8,10 +8,14 @@ extern spinlock_t i8253_lock; +#ifdef CONFIG_GENERIC_CLOCKEVENTS + extern struct clock_event_device *global_clock_event; extern void setup_pit_timer(void); +#endif + #define inb_pit inb_p #define outb_pit outb_p --- linux-ec2-2.6.32.orig/arch/x86/include/asm/i387.h +++ linux-ec2-2.6.32/arch/x86/include/asm/i387.h @@ -242,12 +242,13 @@ /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ - alternative_input( - GENERIC_NOP8 GENERIC_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); + if (unlikely(boot_cpu_has(X86_FEATURE_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %[addr]" /* set F?P to defined value */ + : : [addr] "m" (safe_address)); + } end: task_thread_info(tsk)->status &= ~TS_USEDFPU; } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/e820.h +++ linux-ec2-2.6.32/arch/x86/include/asm/e820.h @@ -129,7 +129,11 @@ #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ +#ifndef CONFIG_XEN #define ISA_START_ADDRESS 0xa0000 +#else +#define ISA_START_ADDRESS 0 +#endif #define ISA_END_ADDRESS 0x100000 #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/processor.h +++ linux-ec2-2.6.32/arch/x86/include/asm/processor.h @@ -180,7 +180,7 @@ unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm("cpuid" + asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), @@ -1052,4 +1052,23 @@ return ratio; } +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_400[]; +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + #endif /* _ASM_X86_PROCESSOR_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/pgtable.h +++ linux-ec2-2.6.32/arch/x86/include/asm/pgtable.h @@ -130,12 +130,16 @@ return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline unsigned long pud_pfn(pud_t pud) +{ + return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) { - return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pmd_flags(pte) & _PAGE_PSE; } static inline pte_t pte_set_flags(pte_t pte, pteval_t set) @@ -327,7 +331,13 @@ static inline int pmd_present(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PRESENT; + /* + * Checking for _PAGE_PSE is needed too because + * split_huge_page will temporarily clear the present bit (but + * the _PAGE_PSE flag will remain set at all times while the + * _PAGE_PRESENT bit is clear). + */ + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } static inline int pmd_none(pmd_t pmd) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/setup.h +++ linux-ec2-2.6.32/arch/x86/include/asm/setup.h @@ -57,6 +57,8 @@ #ifndef _SETUP +#include + /* * This is set up by the setup-routine at boot-time */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/hypervisor.h +++ linux-ec2-2.6.32/arch/x86/include/asm/hypervisor.h @@ -24,3 +24,7 @@ extern void init_hypervisor_platform(void); #endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include_next +#endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/amd_iommu_types.h +++ linux-ec2-2.6.32/arch/x86/include/asm/amd_iommu_types.h @@ -305,6 +305,9 @@ /* capabilities of that IOMMU read from ACPI */ u32 cap; + /* flags read from acpi table */ + u8 acpi_flags; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability @@ -348,6 +351,15 @@ /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; + + /* + * This array is required to work around a potential BIOS bug. + * The BIOS may miss to restore parts of the PCI configuration + * space when the system resumes from S3. The result is that the + * IOMMU does not execute commands anymore which leads to system + * failure. + */ + u32 cache_cfg[4]; }; /* @@ -469,4 +481,10 @@ /* some function prototypes */ extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); +static inline bool is_rd890_iommu(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_ATI) && + (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); +} + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/msr-index.h +++ linux-ec2-2.6.32/arch/x86/include/asm/msr-index.h @@ -81,11 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_AMD64_MC0_MASK 0xc0010044 + #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) @@ -104,6 +108,9 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 @@ -123,6 +130,7 @@ #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff #define FAM10H_MMIO_CONF_BASE_SHIFT 20 +#define MSR_FAM10H_NODE_ID 0xc001100c /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a @@ -195,8 +203,9 @@ #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_IA32_FEATURE_CONTROL 0x0000003a -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) +#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/traps.h +++ linux-ec2-2.6.32/arch/x86/include/asm/traps.h @@ -37,6 +37,9 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +#ifdef CONFIG_X86_XEN +asmlinkage void fixup_4gb_segment(void); +#endif dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -65,6 +68,9 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); +#ifdef CONFIG_XEN +void do_fixup_4gb_segment(struct pt_regs *, long); +#endif #endif static inline int get_si_code(unsigned long condition) --- linux-ec2-2.6.32.orig/arch/x86/include/asm/amd_iommu.h +++ linux-ec2-2.6.32/arch/x86/include/asm/amd_iommu.h @@ -32,6 +32,7 @@ extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_shutdown(void); extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_init_api(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/pgalloc.h +++ linux-ec2-2.6.32/arch/x86/include/asm/pgalloc.h @@ -23,6 +23,11 @@ #endif /* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); --- linux-ec2-2.6.32.orig/arch/x86/include/asm/cmpxchg_64.h +++ linux-ec2-2.6.32/arch/x86/include/asm/cmpxchg_64.h @@ -8,13 +8,11 @@ #define __xg(x) ((volatile long *)(x)) -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; } -#define _set_64bit set_64bit - /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway * Note 2: xchg has side effect, so that attribute volatile is necessary, @@ -26,26 +24,26 @@ switch (size) { case 1: asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=q" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; case 2: asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=r" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; case 4: asm volatile("xchgl %k0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=r" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; case 8: asm volatile("xchgq %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) + : "=r" (x), "+m" (*__xg(ptr)) + : "0" (x) : "memory"); break; } @@ -66,27 +64,27 @@ unsigned long prev; switch (size) { case 1: - asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgb %b2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "q"(new), "0"(old) : "memory"); return prev; case 2: - asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgw %w2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 4: - asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgl %k2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 8: - asm volatile(LOCK_PREFIX "cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile(LOCK_PREFIX "cmpxchgq %2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; } @@ -105,21 +103,27 @@ unsigned long prev; switch (size) { case 1: - asm volatile("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("lock; cmpxchgb %b2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "q"(new), "0"(old) : "memory"); return prev; case 2: - asm volatile("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("lock; cmpxchgw %w2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 4: - asm volatile("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("lock; cmpxchgl %k2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) + : "memory"); + return prev; + case 8: + asm volatile("lock; cmpxchgq %2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; } @@ -133,27 +137,27 @@ unsigned long prev; switch (size) { case 1: - asm volatile("cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgb %b2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "q"(new), "0"(old) : "memory"); return prev; case 2: - asm volatile("cmpxchgw %w1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgw %w2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 4: - asm volatile("cmpxchgl %k1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgl %k2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; case 8: - asm volatile("cmpxchgq %1,%2" - : "=a"(prev) - : "r"(new), "m"(*__xg(ptr)), "0"(old) + asm volatile("cmpxchgq %2,%1" + : "=a"(prev), "+m"(*__xg(ptr)) + : "r"(new), "0"(old) : "memory"); return prev; } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/mce.h +++ linux-ec2-2.6.32/arch/x86/include/asm/mce.h @@ -67,6 +67,8 @@ __u32 socketid; /* CPU socket ID */ __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 aux0; /* model specific */ + __u64 aux1; /* model specific */ }; /* @@ -214,5 +216,11 @@ void mce_log_therm_throt_event(__u64 status); +#ifdef CONFIG_X86_THERMAL_VECTOR +extern void mcheck_intel_therm_init(void); +#else +static inline void mcheck_intel_therm_init(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ --- linux-ec2-2.6.32.orig/arch/x86/include/asm/cpufeature.h +++ linux-ec2-2.6.32/arch/x86/include/asm/cpufeature.h @@ -124,6 +124,8 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */ #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ @@ -150,9 +152,10 @@ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -246,8 +249,13 @@ #define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) +#ifndef CONFIG_XEN #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#else +#define cpu_has_xsave boot_cpu_has(X86_FEATURE_OSXSAVE) +#endif #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) +#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 --- linux-ec2-2.6.32.orig/arch/x86/include/asm/timer.h +++ linux-ec2-2.6.32/arch/x86/include/asm/timer.h @@ -38,6 +38,22 @@ * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" + * + * In: + * + * ns = cycles * cyc2ns_scale / SC + * + * Although we may still have enough bits to store the value of ns, + * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, + * leading to an incorrect result. + * + * To avoid this, we can decompose 'cycles' into quotient and remainder + * of division by SC. Then, + * + * ns = (quot * SC + rem) * cyc2ns_scale / SC + * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC + * + * - sqazi@google.com */ DECLARE_PER_CPU(unsigned long, cyc2ns); @@ -49,7 +65,8 @@ { int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; + ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), + (1UL << CYC2NS_SCALE_FACTOR)); return ns; } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/io.h +++ linux-ec2-2.6.32/arch/x86/include/asm/io.h @@ -172,6 +172,7 @@ extern void iounmap(volatile void __iomem *addr); +extern void set_iounmap_nonlazy(void); #ifdef CONFIG_X86_32 # include "io_32.h" --- linux-ec2-2.6.32.orig/arch/x86/include/asm/topology.h +++ linux-ec2-2.6.32/arch/x86/include/asm/topology.h @@ -30,7 +30,7 @@ # define ENABLE_TOPO_DEFINES # endif #else -# ifdef CONFIG_SMP +# if defined(CONFIG_SMP) && !defined(CONFIG_XEN) # define ENABLE_TOPO_DEFINES # endif #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/fixmap.h +++ linux-ec2-2.6.32/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -126,9 +129,6 @@ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif --- linux-ec2-2.6.32.orig/arch/x86/include/asm/uv/uv_hub.h +++ linux-ec2-2.6.32/arch/x86/include/asm/uv/uv_hub.h @@ -11,7 +11,7 @@ #ifndef _ASM_X86_UV_UV_HUB_H #define _ASM_X86_UV_UV_HUB_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_UV #include #include #include @@ -31,20 +31,20 @@ * contiguous (although various IO spaces may punch holes in * it).. * - * N - Number of bits in the node portion of a socket physical - * address. + * N - Number of bits in the node portion of a socket physical + * address. * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. - * NASIDs contain up to 15 bits. + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. * * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead * of nasids. * - * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant - * of the nasid for socket usage. + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. * * * NumaLink Global Physical Address Format: @@ -71,12 +71,12 @@ * * * APICID format - * NOTE!!!!!! This is the current format of the APICID. However, code - * should assume that this will change in the future. Use functions - * in this file for all APICID bit manipulations and conversion. + * NOTE!!!!!! This is the current format of the APICID. However, code + * should assume that this will change in the future. Use functions + * in this file for all APICID bit manipulations and conversion. * - * 1111110000000000 - * 5432109876543210 + * 1111110000000000 + * 5432109876543210 * pppppppppplc0cch * sssssssssss * @@ -89,9 +89,9 @@ * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * - * Unless otherwise specified, all references to APICID refer to - * the FULL value contained in ACPI tables, not the subset in the - * processor APICID register. + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. */ @@ -151,16 +151,16 @@ }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* * Local & Global MMR space macros. - * Note: macros are intended to be used ONLY by inline functions - * in this file - not by other kernel code. - * n - NASID (full 15-bit global nasid) - * g - GNODE (full 15-bit global nasid, right shifted 1) - * p - PNODE (local part of nsids, right shifted 1) + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) @@ -213,8 +213,8 @@ /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. - * Note: use the standard __pa() & __va() macros for converting - * between socket virtual and socket physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. */ /* socket phys RAM --> UV global physical address */ @@ -265,21 +265,18 @@ * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr32_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr32(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { return readq(uv_global_mmr32_address(pnode, offset)); } @@ -288,25 +285,32 @@ * Access Global MMR space using the MMR space located at the top of physical * memory. */ -static inline unsigned long *uv_global_mmr64_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr64_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr64(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { return readq(uv_global_mmr64_address(pnode, offset)); } +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); +} + /* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. @@ -426,11 +430,17 @@ } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } } --- linux-ec2-2.6.32.orig/arch/x86/include/asm/xen/hypervisor.h +++ linux-ec2-2.6.32/arch/x86/include/asm/xen/hypervisor.h @@ -43,7 +43,7 @@ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ }; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern enum xen_domain_type xen_domain_type; #else #define xen_domain_type XEN_NATIVE --- linux-ec2-2.6.32.orig/arch/x86/include/asm/xen/interface.h +++ linux-ec2-2.6.32/arch/x86/include/asm/xen/interface.h @@ -10,17 +10,20 @@ #define _ASM_X86_XEN_INTERFACE_H #ifdef __XEN__ -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #else -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef type * __guest_handle_ ## name #endif +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ + ___DEFINE_XEN_GUEST_HANDLE(name, type); \ + ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name #ifdef __XEN__ #if defined(__i386__) @@ -47,14 +50,8 @@ #endif #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); +typedef unsigned long xen_pfn_t; +typedef unsigned long xen_ulong_t; #endif #ifndef HYPERVISOR_VIRT_START --- linux-ec2-2.6.32.orig/arch/x86/kvm/lapic.c +++ linux-ec2-2.6.32/arch/x86/kvm/lapic.c @@ -374,6 +374,12 @@ if (unlikely(!apic_enabled(apic))) break; + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); @@ -384,11 +390,6 @@ break; } - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); kvm_vcpu_kick(vcpu); break; @@ -518,7 +519,8 @@ ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (apic_get_reg(apic, APIC_TMICT) == 0) + if (apic_get_reg(apic, APIC_TMICT) == 0 || + apic->lapic_timer.period == 0) return 0; remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); @@ -1156,6 +1158,7 @@ hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); + apic->irr_pending = true; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) --- linux-ec2-2.6.32.orig/arch/x86/kvm/mmu.h +++ linux-ec2-2.6.32/arch/x86/kvm/mmu.h @@ -37,6 +37,12 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) --- linux-ec2-2.6.32.orig/arch/x86/kvm/vmx.c +++ linux-ec2-2.6.32/arch/x86/kvm/vmx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -61,6 +62,27 @@ static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually small than 41 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; +module_param(ple_gap, int, S_IRUGO); + +static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, int, S_IRUGO); + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + struct vmcs { u32 revision_id; u32 abort; @@ -92,7 +114,7 @@ } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -127,6 +149,7 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; @@ -320,6 +343,12 @@ SECONDARY_EXEC_UNRESTRICTED_GUEST; } +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -625,7 +654,7 @@ */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -633,7 +662,7 @@ vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -650,10 +679,7 @@ #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - save_msrs(vmx->host_msrs + - vmx->msr_offset_kernel_gs_base, 1); - + save_msrs(vmx->host_msrs + vmx->msr_offset_kernel_gs_base, 1); #endif load_msrs(vmx->guest_msrs, vmx->save_nmsrs); load_transition_efer(vmx); @@ -661,32 +687,36 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; - if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) + save_msrs(vmx->guest_msrs + vmx->msr_offset_kernel_gs_base, 1); +#endif if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } + if (vmx->host_state.fs_reload_needed) + loadsegment(fs, vmx->host_state.fs_sel); reload_tss(); +#ifdef CONFIG_X86_64 + save_msrs(vmx->guest_msrs, vmx->msr_offset_kernel_gs_base); + save_msrs(vmx->guest_msrs + vmx->msr_offset_kernel_gs_base + 1, + vmx->save_nmsrs - vmx->msr_offset_kernel_gs_base - 1); +#else save_msrs(vmx->guest_msrs, vmx->save_nmsrs); +#endif load_msrs(vmx->host_msrs, vmx->save_nmsrs); reload_host_efer(vmx); + load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -783,18 +813,23 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -1057,6 +1092,7 @@ case MSR_IA32_TSC: rdtscll(host_tsc); guest_write_tsc(data, host_tsc); + vcpu->arch.hv_clock.tsc_timestamp = 0; break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -1133,9 +1169,16 @@ u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - == FEATURE_CONTROL_LOCKED; + if (msr & FEATURE_CONTROL_LOCKED) { + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && tboot_enabled()) + return 1; + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; + } + + return 0; /* locked but not enabled */ } @@ -1143,22 +1186,26 @@ { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old; + u64 old, test_bits; INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - != (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) + + test_bits = FEATURE_CONTROL_LOCKED; + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + if (tboot_enabled()) + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; + + if ((old & test_bits) != test_bits) { /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED); + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); + } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); + + store_gdt(&__get_cpu_var(host_gdt)); } static void vmclear_local_vcpus(void) @@ -1250,7 +1297,8 @@ SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1394,6 +1442,9 @@ if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); + if (!cpu_has_vmx_ple()) + ple_gap = 0; + return alloc_kvm_area(); } @@ -1431,8 +1482,8 @@ vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1501,8 +1552,7 @@ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -2302,13 +2352,22 @@ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) + if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -2320,8 +2379,8 @@ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); @@ -2510,7 +2569,7 @@ if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = 0x60000010; + vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); @@ -2674,6 +2733,12 @@ kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2790,6 +2855,13 @@ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -3049,7 +3121,7 @@ trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + if (kvm_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); return 1; } @@ -3358,6 +3430,18 @@ } /* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); + + return 1; +} + +/* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. @@ -3394,6 +3478,7 @@ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, }; static const int kvm_vmx_max_exit_handlers = --- linux-ec2-2.6.32.orig/arch/x86/kvm/irq.h +++ linux-ec2-2.6.32/arch/x86/kvm/irq.h @@ -85,7 +85,11 @@ static inline int irqchip_in_kernel(struct kvm *kvm) { - return pic_irqchip(kvm) != NULL; + int ret; + + ret = (pic_irqchip(kvm) != NULL); + smp_rmb(); + return ret; } void kvm_pic_reset(struct kvm_kpic_state *s); --- linux-ec2-2.6.32.orig/arch/x86/kvm/svm.c +++ linux-ec2-2.6.32/arch/x86/kvm/svm.c @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -62,6 +63,8 @@ #define nsvm_printk(fmt, args...) do {} while(0) #endif +static bool erratum_383_found __read_mostly; + static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -299,6 +302,31 @@ svm_set_interrupt_shadow(vcpu, 0); } +static void svm_init_erratum_383(void) +{ + u32 low, high; + int err; + u64 val; + + /* Only Fam10h is affected */ + if (boot_cpu_data.x86 != 0x10) + return; + + /* Use _safe variants to not break nested virtualization */ + val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); + if (err) + return; + + val |= (1ULL << 47); + + low = lower_32_bits(val); + high = upper_32_bits(val); + + native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + + erratum_383_found = true; +} + static int has_svm(void) { const char *msg; @@ -318,7 +346,6 @@ static void svm_hardware_enable(void *garbage) { - struct svm_cpu_data *svm_data; uint64_t efer; struct descriptor_table gdt_descr; @@ -350,6 +377,10 @@ wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(svm_data->save_area) << PAGE_SHIFT); + + svm_init_erratum_383(); + + return; } static void svm_cpu_uninit(int cpu) @@ -590,7 +621,6 @@ control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -625,11 +655,12 @@ save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* - * cr0 val on cpu init should be 0x60000010, we enable cpu - * cache by default. the orderly way is to enable cache in bios. + /* This is the guest-visible cr0 value. + * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -693,29 +724,28 @@ if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -723,6 +753,7 @@ svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); + svm->vmcb->control.tsc_offset = 0-native_read_tsc(); fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; @@ -732,6 +763,12 @@ return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: @@ -758,17 +795,18 @@ int i; if (unlikely(cpu != vcpu->cpu)) { - u64 tsc_this, delta; + u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -1251,8 +1289,59 @@ return 1; } -static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static bool is_erratum_383(void) { + int err, i; + u64 value; + + if (!erratum_383_found) + return false; + + value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); + if (err) + return false; + + /* Bit 62 may or may not be set for this mce */ + value &= ~(1ULL << 62); + + if (value != 0xb600000000010015ULL) + return false; + + /* Clear MCi_STATUS registers */ + for (i = 0; i < 6; ++i) + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); + + value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); + if (!err) { + u32 low, high; + + value &= ~(1ULL << 2); + low = lower_32_bits(value); + high = upper_32_bits(value); + + native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + } + + /* Flush tlb to evict multi-match entries */ + __flush_tlb_all(); + + return true; +} + +static void svm_handle_mce(struct vcpu_svm *svm) +{ + if (is_erratum_383()) { + /* + * Erratum 383 triggered. Guest state is corrupt so kill the + * guest. + */ + pr_err("KVM: Guest triggered AMD Erratum 383\n"); + + set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + + return; + } + /* * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. @@ -1261,6 +1350,11 @@ "int $0x12\n"); /* not sure if we ever come back to this point */ + return; +} + +static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ return 1; } @@ -2018,7 +2112,7 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm->vcpu.stat.nmi_window_exits; - svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; return 1; } @@ -2162,6 +2256,7 @@ } svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; + vcpu->arch.hv_clock.tsc_timestamp = 0; break; } @@ -2230,7 +2325,7 @@ trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (kvm_set_msr(&svm->vcpu, ecx, data)) kvm_inject_gp(&svm->vcpu, 0); else skip_emulated_instruction(&svm->vcpu); @@ -2413,7 +2508,7 @@ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -2604,8 +2699,8 @@ sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -2692,10 +2787,15 @@ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + kvm_load_ldt(ldt_selector); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif reload_tss(vcpu); @@ -2711,6 +2811,14 @@ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); } + + /* + * We need to handle MC intercepts here before the vcpu has a chance to + * change the physical cpu + */ + if (unlikely(svm->vmcb->control.exit_code == + SVM_EXIT_EXCP_BASE + MC_VECTOR)) + svm_handle_mce(svm); } #undef R --- linux-ec2-2.6.32.orig/arch/x86/kvm/emulate.c +++ linux-ec2-2.6.32/arch/x86/kvm/emulate.c @@ -75,6 +75,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ /* Source 2 operand type */ #define Src2None (0<<29) #define Src2CL (1<<29) @@ -86,6 +87,7 @@ enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { @@ -203,7 +205,7 @@ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -211,16 +213,20 @@ static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -258,11 +264,12 @@ 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, Group | Group8, DstMem | SrcReg | ModRM | BitOp, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -311,24 +318,39 @@ SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + [Group9*8] = + 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -597,7 +619,7 @@ if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); if (rc) return rc; fc->start = linear; @@ -613,6 +635,9 @@ { int rc = 0; + /* x86 instructions are limited to 15 bytes. */ + if (eip + size - ctxt->decode.eip_orig > 15) + return X86EMUL_UNHANDLEABLE; eip += ctxt->cs_base; while (size--) { rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); @@ -649,11 +674,11 @@ op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); + ctxt->vcpu, NULL); if (rc) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -871,12 +896,13 @@ /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -1186,6 +1212,49 @@ return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1327,7 +1396,7 @@ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); if (rc) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1426,20 +1495,73 @@ ss->present = 1; } +static bool syscall_is_enabled(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + u32 eax, ebx, ecx, edx; + + /* + * syscall should always be enabled in longmode - so only become + * vendor specific (cpuid) if other modes are active... + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return true; + + eax = 0x00000000; + ecx = 0x00000000; + if (ops->get_cpuid(ctxt->vcpu, &eax, &ebx, &ecx, &edx)) { + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; + + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; + } + + /* default: (not Intel, not AMD), apply Intel's stricter rules... */ + return false; +} + static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; u64 msr_data; + u64 efer = 0; /* syscall is not available in real mode */ if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + || ctxt->mode == X86EMUL_MODE_VM86) return -1; + if (!(syscall_is_enabled(ctxt, ops))) + return -1; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); setup_syscalls_segments(ctxt, &cs, &ss); + if (!(efer & EFER_SCE)) + return -1; + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; cs.selector = (u16)(msr_data & 0xfffc); @@ -1487,9 +1609,8 @@ if (c->lock_prefix) return -1; - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1553,15 +1674,9 @@ if (c->lock_prefix) return -1; - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); return -1; } @@ -1608,6 +1723,57 @@ return 0; } +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1629,6 +1795,12 @@ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; @@ -1761,7 +1933,12 @@ break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1777,6 +1954,11 @@ return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 0, (c->d & ByteOp) ? 1 : c->op_bytes, @@ -1863,25 +2045,19 @@ break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -1910,7 +2086,10 @@ c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2048,11 +2227,9 @@ case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2070,7 +2247,13 @@ case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2095,13 +2278,21 @@ c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2204,7 +2395,7 @@ } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) + if (emulate_syscall(ctxt, ops) == -1) goto cannot_emulate; else goto writeback; --- linux-ec2-2.6.32.orig/arch/x86/kvm/Kconfig +++ linux-ec2-2.6.32/arch/x86/kvm/Kconfig @@ -7,6 +7,7 @@ menuconfig VIRTUALIZATION bool "Virtualization" depends on HAVE_KVM || X86 + depends on !XEN default y ---help--- Say Y here to get to see options for using your Linux host to run other --- linux-ec2-2.6.32.orig/arch/x86/kvm/mmu.c +++ linux-ec2-2.6.32/arch/x86/kvm/mmu.c @@ -136,12 +136,6 @@ #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 @@ -227,7 +221,7 @@ } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static int is_write_protection(struct kvm_vcpu *vcpu) +static bool is_write_protection(struct kvm_vcpu *vcpu) { return vcpu->arch.cr0 & X86_CR0_WP; } @@ -477,7 +471,7 @@ addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return page_size; + return PT_PAGE_TABLE_LEVEL; down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); @@ -515,11 +509,9 @@ if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { - + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; - } return level - 1; } @@ -1504,8 +1496,8 @@ for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1556,14 +1548,16 @@ */ if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } + kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else @@ -1610,7 +1604,8 @@ && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; } } } @@ -1641,7 +1636,7 @@ { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -1848,6 +1843,9 @@ spte |= PT_WRITABLE_MASK; + if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) + spte &= ~PT_USER_MASK; + /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection @@ -1903,6 +1901,8 @@ child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); @@ -2096,11 +2096,13 @@ direct = 1; if (mmu_check_root(vcpu, root_gfn)) return 1; + spin_lock(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } @@ -2122,11 +2124,14 @@ root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) return 1; + spin_lock(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); @@ -2164,8 +2169,11 @@ spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2447,6 +2455,7 @@ r = paging32_init_context(vcpu); vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); return r; } @@ -2486,7 +2495,9 @@ goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); if (r) @@ -2749,7 +2760,7 @@ if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -3247,7 +3258,7 @@ if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; --- linux-ec2-2.6.32.orig/arch/x86/kvm/i8254.c +++ linux-ec2-2.6.32/arch/x86/kvm/i8254.c @@ -256,8 +256,10 @@ return; timer = &pit->pit_state.pit_timer.timer; + mutex_lock(&pit->pit_state.lock); if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); } static void destroy_pit_timer(struct kvm_timer *pt) @@ -277,11 +279,15 @@ .is_periodic = kpit_is_periodic, }; -static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) +static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; struct kvm_timer *pt = &ps->pit_timer; s64 interval; + if (!irqchip_in_kernel(kvm)) + return; + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); @@ -333,13 +339,13 @@ /* FIXME: enhance mode 4 precision */ case 4: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(ps, val, 0); + create_pit_timer(kvm, val, 0); } break; case 2: case 3: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(ps, val, 1); + create_pit_timer(kvm, val, 1); } break; default: @@ -465,6 +471,9 @@ return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); --- linux-ec2-2.6.32.orig/arch/x86/kvm/paging_tmpl.h +++ linux-ec2-2.6.32/arch/x86/kvm/paging_tmpl.h @@ -150,7 +150,9 @@ walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) @@ -316,8 +318,32 @@ break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - continue; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + unsigned direct_access; + + if (level != gw->level) + continue; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + direct_access = gw->pt_access & gw->pte_access; + if (!is_dirty_gpte(gw->ptes[gw->level - 1])) + direct_access &= ~ACC_WRITE_MASK; + + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + continue; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -334,6 +360,7 @@ /* advance table_gfn when emulating 1gb pages with 4k */ if (delta == 0) table_gfn += PT_INDEX(addr, level); + access &= gw->pte_access; } else { direct = 0; table_gfn = gw->table_gfn[level - 2]; @@ -455,8 +482,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -471,10 +496,6 @@ if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn << PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -493,32 +514,25 @@ if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - return; - if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, - sizeof(pt_element_t), 0); - } } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } --- linux-ec2-2.6.32.orig/arch/x86/kvm/x86.c +++ linux-ec2-2.6.32/arch/x86/kvm/x86.c @@ -47,6 +47,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -297,21 +298,16 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); kvm_inject_gp(vcpu, 0); return; } @@ -322,15 +318,11 @@ int cs_db, cs_l; if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); kvm_inject_gp(vcpu, 0); return; @@ -338,8 +330,6 @@ } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -356,7 +346,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -366,28 +356,23 @@ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; } @@ -408,21 +393,16 @@ if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -454,7 +434,6 @@ void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); kvm_inject_gp(vcpu, 0); return; } @@ -484,16 +463,19 @@ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * * This list is modified at module load time to reflect the - * capabilities of the host cpu. + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list. */ + +#define KVM_SAVE_MSRS_BEGIN 2 static u32 msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; @@ -502,53 +484,42 @@ MSR_IA32_MISC_ENABLE, }; -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - kvm_inject_gp(vcpu, 0); - return; - } + if (efer & efer_reserved_bits) + return 1; if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - kvm_inject_gp(vcpu, 0); - return; - } + && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) + return 1; if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) + return 1; } if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) + return 1; } - kvm_x86_ops->set_efer(vcpu, efer); - efer &= ~EFER_LMA; efer |= vcpu->arch.shadow_efer & EFER_LMA; + kvm_x86_ops->set_efer(vcpu, efer); + vcpu->arch.shadow_efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + + return 0; } void kvm_enable_efer_bits(u64 mask) @@ -557,7 +528,6 @@ } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -565,8 +535,34 @@ */ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { + switch (msr_index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_address(data)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + data = get_canonical(data); + } return kvm_x86_ops->set_msr(vcpu, msr_index, data); } +EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention @@ -578,14 +574,22 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { - static int version; + int version; + int r; struct pvclock_wall_clock wc; - struct timespec now, sys, boot; + struct timespec boot; if (!wall_clock) return; - version++; + r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); + if (r) + return; + + if (version & 1) + ++version; /* first time write, random junk */ + + ++version; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); @@ -595,9 +599,7 @@ * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - now = current_kernel_time(); - ktime_get_ts(&sys); - boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + getboottime(&boot); wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; @@ -657,6 +659,8 @@ struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; if ((!vcpu->time_page)) return; @@ -670,14 +674,53 @@ /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + kernel_ns = timespec_to_ns(&ts); local_irq_restore(flags); + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. + */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + /* With all the info we got, fill in the values */ - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec); + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; + /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -823,9 +866,13 @@ if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MC0_CTL + 4 * bank_num) { u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL */ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ if ((offset & 0x3) == 0 && - data != 0 && data != ~(u64)0) + data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; vcpu->arch.mce_banks[offset] = data; break; @@ -839,8 +886,7 @@ { switch (msr) { case MSR_EFER: - set_efer(vcpu, data); - break; + return set_efer(vcpu, data); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ if (data != 0) { @@ -904,6 +950,12 @@ /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + /* Check that address+len does not cross page boundary */ + if ((vcpu->arch.time_offset + + sizeof(struct pvclock_vcpu_time_info) - 1) + & PAGE_MASK) + break; + vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); @@ -1224,6 +1276,7 @@ case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: + case KVM_CAP_ADJUST_CLOCK: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1238,8 +1291,8 @@ case KVM_CAP_NR_MEMSLOTS: r = KVM_MEMORY_SLOTS; break; - case KVM_CAP_PV_MMU: - r = !tdp_enabled; + case KVM_CAP_PV_MMU: /* obsolete */ + r = 0; break; case KVM_CAP_IOMMU: r = iommu_found(); @@ -1431,6 +1484,7 @@ { int r; + vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1442,6 +1496,7 @@ out: cpuid->nent = vcpu->arch.cpuid_nent; + vcpu_put(vcpu); return r; } @@ -1501,7 +1556,7 @@ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ @@ -1691,6 +1746,7 @@ int r; unsigned bank_num = mcg_cap & 0xff, bank; + vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -1705,6 +1761,7 @@ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: + vcpu_put(vcpu); return r; } @@ -1907,7 +1964,9 @@ r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; + vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + vcpu_put(vcpu); break; } default: @@ -2114,6 +2173,7 @@ sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -2152,7 +2212,7 @@ struct kvm_dirty_log *log) { int r; - int n; + unsigned long n; struct kvm_memory_slot *memslot; int is_dirty = 0; @@ -2168,7 +2228,7 @@ kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); memset(memslot->dirty_bitmap, 0, n); } r = 0; @@ -2244,25 +2304,42 @@ if (r) goto out; break; - case KVM_CREATE_IRQCHIP: + case KVM_CREATE_IRQCHIP: { + struct kvm_pic *vpic; + + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpic) + goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; - kvm->arch.vpic = kvm_create_pic(kvm); - if (kvm->arch.vpic) { + vpic = kvm_create_pic(kvm); + if (vpic) { r = kvm_ioapic_init(kvm); if (r) { - kfree(kvm->arch.vpic); - kvm->arch.vpic = NULL; - goto out; + kfree(vpic); + goto create_irqchip_unlock; } } else - goto out; + goto create_irqchip_unlock; + smp_wmb(); + kvm->arch.vpic = vpic; + smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { + mutex_lock(&kvm->irq_lock); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); - goto out; + kvm->arch.vpic = NULL; + kvm->arch.vioapic = NULL; + mutex_unlock(&kvm->irq_lock); } + create_irqchip_unlock: + mutex_unlock(&kvm->lock); break; + } case KVM_CREATE_PIT: u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; goto create_pit; @@ -2421,6 +2498,45 @@ r = 0; break; } + case KVM_SET_CLOCK: { + struct timespec now; + struct kvm_clock_data user_ns; + u64 now_ns; + s64 delta; + + r = -EFAULT; + if (copy_from_user(&user_ns, argp, sizeof(user_ns))) + goto out; + + r = -EINVAL; + if (user_ns.flags) + goto out; + + r = 0; + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); + delta = user_ns.clock - now_ns; + kvm->arch.kvmclock_offset = delta; + break; + } + case KVM_GET_CLOCK: { + struct timespec now; + struct kvm_clock_data user_ns; + u64 now_ns; + + ktime_get_ts(&now); + now_ns = timespec_to_ns(&now); + user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + user_ns.flags = 0; + memset(&user_ns.pad, 0, sizeof(user_ns.pad)); + + r = -EFAULT; + if (copy_to_user(argp, &user_ns, sizeof(user_ns))) + goto out; + r = 0; + break; + } + default: ; } @@ -2433,7 +2549,8 @@ u32 dummy[2]; unsigned i, j; - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; if (j < i) @@ -2462,14 +2579,41 @@ return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2492,14 +2636,37 @@ return r; } +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2529,6 +2696,7 @@ struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2538,17 +2706,20 @@ return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2587,11 +2758,12 @@ struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2655,7 +2827,7 @@ char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2700,6 +2872,9 @@ { struct kvm_vcpu *vcpu = ctxt->vcpu; + if (!kvm_x86_ops->get_dr) + return X86EMUL_UNHANDLEABLE; + switch (dr) { case 0 ... 3: *dest = kvm_x86_ops->get_dr(vcpu, dr); @@ -2715,6 +2890,9 @@ unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; int exception; + if (!kvm_x86_ops->set_dr) + return X86EMUL_UNHANDLEABLE; + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); if (exception) { /* FIXME: better handling */ @@ -2734,18 +2912,41 @@ rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); +static bool emulator_get_cpuid(struct kvm_vcpu *vcpu, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + struct kvm_cpuid_entry2 *cpuid = NULL; + + if (eax && ecx) + cpuid = kvm_find_cpuid_entry(vcpu, *eax, *ecx); + + if (cpuid) { + *eax = cpuid->eax; + *ecx = cpuid->ecx; + if (ebx) + *ebx = cpuid->ebx; + if (edx) + *edx = cpuid->edx; + return true; + } + + return false; +} + static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .get_cpuid = emulator_get_cpuid, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -2785,8 +2986,9 @@ vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!(vcpu->arch.cr0 & X86_CR0_PE)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; @@ -2878,12 +3080,17 @@ gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -2904,7 +3111,7 @@ if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -2931,7 +3138,7 @@ kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -2974,6 +3181,8 @@ { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -2985,9 +3194,6 @@ vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); @@ -3006,6 +3212,8 @@ unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3017,9 +3225,6 @@ vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -3051,10 +3256,8 @@ if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -3602,6 +3805,8 @@ kvm_x86_ops->prepare_guest_switch(vcpu); kvm_load_guest_fpu(vcpu); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + local_irq_disable(); clear_bit(KVM_REQ_KICK, &vcpu->requests); @@ -4034,7 +4239,9 @@ kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_read_guest_virt_system(dtable.base + index*8, + seg_desc, sizeof(*seg_desc), + vcpu, NULL); } /* allowed just for 8 bytes segments */ @@ -4048,15 +4255,23 @@ if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); } -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr = get_desc_base(seg_desc); + + return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); +} + +static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4067,18 +4282,6 @@ return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4096,7 +4299,7 @@ .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4106,24 +4309,113 @@ (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + save_guest_segment_descriptor(vcpu, selector, &seg_desc); + } +load: kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4149,6 +4441,14 @@ tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { @@ -4166,25 +4466,41 @@ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) return 1; return 0; } @@ -4225,19 +4541,33 @@ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) return 1; - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) return 1; return 0; } @@ -4259,7 +4589,7 @@ sizeof tss_segment_16)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_16, sizeof tss_segment_16)) goto out; @@ -4267,7 +4597,7 @@ tss_segment_16.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_16.prev_task_link, sizeof tss_segment_16.prev_task_link)) goto out; @@ -4298,7 +4628,7 @@ sizeof tss_segment_32)) goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), &tss_segment_32, sizeof tss_segment_32)) goto out; @@ -4306,7 +4636,7 @@ tss_segment_32.prev_task_link = old_tss_sel; if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), + get_tss_base_addr_write(vcpu, nseg_desc), &tss_segment_32.prev_task_link, sizeof tss_segment_32.prev_task_link)) goto out; @@ -4328,8 +4658,9 @@ int ret = 0; u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + u32 desc_limit; - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); /* FIXME: Handle errors. Failure to read either TSS or their * descriptors should generate a pagefault. @@ -4350,7 +4681,10 @@ } } - if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.p || + ((desc_limit < 0x67 && (nseg_desc.type & 8)) || + desc_limit < 0x2b)) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } @@ -4409,6 +4743,9 @@ int pending_vec, max_bits; struct descriptor_table dt; + if (sregs->cr4 & X86_CR4_OSXSAVE) + return -EINVAL; + vcpu_load(vcpu); dt.limit = sregs->idt.limit; @@ -4538,7 +4875,7 @@ vcpu_load(vcpu); down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); up_read(&vcpu->kvm->slots_lock); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; @@ -4670,12 +5007,7 @@ if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); return r; } @@ -4726,6 +5058,11 @@ kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; @@ -4762,12 +5099,13 @@ GFP_KERNEL); if (!vcpu->arch.mce_banks) { r = -ENOMEM; - goto fail_mmu_destroy; + goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; return 0; - +fail_free_lapic: + kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); fail_free_pio_data: @@ -4778,6 +5116,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); down_read(&vcpu->kvm->slots_lock); kvm_mmu_destroy(vcpu); --- linux-ec2-2.6.32.orig/arch/x86/vdso/vdso32-setup.c +++ linux-ec2-2.6.32/arch/x86/vdso/vdso32-setup.c @@ -331,7 +331,7 @@ if (compat) addr = VDSO_HIGH_BASE; else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; --- linux-ec2-2.6.32.orig/arch/x86/vdso/Makefile +++ linux-ec2-2.6.32/arch/x86/vdso/Makefile @@ -25,7 +25,7 @@ export CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so @@ -65,11 +65,12 @@ vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter +vdso32.so-$(CONFIG_X86_XEN) += syscall vdso32-images = $(vdso32.so-y:%=vdso32-%.so) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1 # This makes sure the $(obj) subdirectory exists even though vdso32/ # is not a kbuild sub-make subdirectory. --- linux-ec2-2.6.32.orig/arch/x86/vdso/vdso32-setup-xen.c +++ linux-ec2-2.6.32/arch/x86/vdso/vdso32-setup-xen.c @@ -0,0 +1,486 @@ +/* + * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * This file contains the needed initializations to support sysenter. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum { + VDSO_DISABLED = 0, + VDSO_ENABLED = 1, + VDSO_COMPAT = 2, +}; + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT VDSO_COMPAT +#else +#define VDSO_DEFAULT VDSO_ENABLED +#endif + +#ifdef CONFIG_X86_64 +#define vdso_enabled sysctl_vsyscall32 +#define arch_setup_additional_pages syscall32_setup_pages +#endif + +/* + * This is the difference between the prelinked addresses in the vDSO images + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO + * in the user address space. + */ +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; + +static int __init vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + + return 1; +} + +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0); + +EXPORT_SYMBOL_GPL(vdso_enabled); +#endif + +static __init void reloc_symtab(Elf32_Ehdr *ehdr, + unsigned offset, unsigned size) +{ + Elf32_Sym *sym = (void *)ehdr + offset; + unsigned nsym = size / sizeof(*sym); + unsigned i; + + for(i = 0; i < nsym; i++, sym++) { + if (sym->st_shndx == SHN_UNDEF || + sym->st_shndx == SHN_ABS) + continue; /* skip */ + + if (sym->st_shndx > SHN_LORESERVE) { + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", + sym->st_shndx); + continue; + } + + switch(ELF_ST_TYPE(sym->st_info)) { + case STT_OBJECT: + case STT_FUNC: + case STT_SECTION: + case STT_FILE: + sym->st_value += VDSO_ADDR_ADJUST; + } + } +} + +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) +{ + Elf32_Dyn *dyn = (void *)ehdr + offset; + + for(; dyn->d_tag != DT_NULL; dyn++) + switch(dyn->d_tag) { + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_VERSYM: + case DT_VERDEF: + case DT_VERNEED: + case DT_ADDRRNGLO ... DT_ADDRRNGHI: + /* definitely pointers needing relocation */ + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; + break; + + case DT_ENCODING ... OLD_DT_LOOS-1: + case DT_LOOS ... DT_HIOS-1: + /* Tags above DT_ENCODING are pointers if + they're even */ + if (dyn->d_tag >= DT_ENCODING && + (dyn->d_tag & 1) == 0) + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; + break; + + case DT_VERDEFNUM: + case DT_VERNEEDNUM: + case DT_FLAGS_1: + case DT_RELACOUNT: + case DT_RELCOUNT: + case DT_VALRNGLO ... DT_VALRNGHI: + /* definitely not pointers */ + break; + + case OLD_DT_LOOS ... DT_LOOS-1: + case DT_HIOS ... DT_VALRNGLO-1: + default: + if (dyn->d_tag > DT_ENCODING) + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", + dyn->d_tag); + break; + } +} + +static __init void relocate_vdso(Elf32_Ehdr *ehdr) +{ + Elf32_Phdr *phdr; + Elf32_Shdr *shdr; + int i; + + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 || + !elf_check_arch_ia32(ehdr) || + ehdr->e_type != ET_DYN); + + ehdr->e_entry += VDSO_ADDR_ADJUST; + + /* rebase phdrs */ + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++) { + phdr[i].p_vaddr += VDSO_ADDR_ADJUST; + + /* relocate dynamic stuff */ + if (phdr[i].p_type == PT_DYNAMIC) + reloc_dyn(ehdr, phdr[i].p_offset); + } + + /* rebase sections */ + shdr = (void *)ehdr + ehdr->e_shoff; + for(i = 0; i < ehdr->e_shnum; i++) { + if (!(shdr[i].sh_flags & SHF_ALLOC)) + continue; + + shdr[i].sh_addr += VDSO_ADDR_ADJUST; + + if (shdr[i].sh_type == SHT_SYMTAB || + shdr[i].sh_type == SHT_DYNSYM) + reloc_symtab(ehdr, shdr[i].sh_offset, + shdr[i].sh_size); + } +} + +static struct page *vdso32_pages[1]; + +#ifdef CONFIG_X86_64 + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) + +void __cpuinit syscall32_cpu_init(void) +{ + static const struct callback_register __cpuinitconst cstar = { + .type = CALLBACKTYPE_syscall32, + .address = (unsigned long)ia32_cstar_target + }; + static const struct callback_register __cpuinitconst sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = (unsigned long)ia32_sysenter_target + }; + + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} + +#define compat_uses_vma 1 + +static inline void map_compat_vdso(int map) +{ +} + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) + +extern asmlinkage void ia32pv_cstar_target(void); +static const struct callback_register __cpuinitconst cstar = { + .type = CALLBACKTYPE_syscall32, + .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target }, +}; + +void __cpuinit enable_sep_cpu(void) +{ + extern asmlinkage void ia32pv_sysenter_target(void); + static struct callback_register __cpuinitdata sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target }, + }; + + if (vdso32_syscall()) { + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0) + BUG(); + return; + } + + if (!vdso32_sysenter()) + return; + + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + sysenter.address.eip = (unsigned long)ia32_sysenter_target; + + switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) { + case 0: + break; +#if CONFIG_XEN_COMPAT < 0x030200 + case -ENOSYS: + sysenter.type = CALLBACKTYPE_sysenter_deprecated; + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0) + break; +#endif + default: + setup_clear_cpu_cap(X86_FEATURE_SEP); + break; + } +} + +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} + +#define compat_uses_vma 0 + +static void map_compat_vdso(int map) +{ + static int vdso_mapped; + + if (map == vdso_mapped) + return; + + vdso_mapped = map; + + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, + map ? PAGE_READONLY_EXEC : PAGE_NONE); + + /* flush stray tlbs */ + flush_tlb_all(); +} + +#endif /* CONFIG_X86_64 */ + +int __init sysenter_setup(void) +{ + void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vsyscall; + size_t vsyscall_len; + + vdso32_pages[0] = virt_to_page(syscall_page); + +#ifdef CONFIG_X86_32 + gate_vma_init(); + + if (boot_cpu_has(X86_FEATURE_SYSCALL)) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD + && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0) + setup_force_cpu_cap(X86_FEATURE_SYSCALL32); + else { + setup_clear_cpu_cap(X86_FEATURE_SYSCALL); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + } + } +#endif + if (vdso32_syscall()) { + vsyscall = &vdso32_syscall_start; + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; + } else if (vdso32_sysenter()){ + vsyscall = &vdso32_sysenter_start; + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + } else { + vsyscall = &vdso32_int80_start; + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; + } + + memcpy(syscall_page, vsyscall, vsyscall_len); + relocate_vdso(syscall_page); + + return 0; +} + +/* Setup a VMA at program startup for the vsyscall page */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + bool compat; + + if (vdso_enabled == VDSO_DISABLED) + return 0; + + down_write(&mm->mmap_sem); + + /* Test compat mode once here, in case someone + changes it via sysctl */ + compat = (vdso_enabled == VDSO_COMPAT); + + map_compat_vdso(compat); + + if (compat) + addr = VDSO_HIGH_BASE; + else { + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + } + + current->mm->context.vdso = (void *)addr; + + if (compat_uses_vma || !compat) { + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully + * interpretable later without matching up the same + * kernel and hardware config to see what PC values + * meant. + */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdso32_pages); + + if (ret) + goto up_fail; + } + + current_thread_info()->sysenter_return = + VDSO32_SYMBOL(addr, SYSENTER_RETURN); + + up_fail: + if (ret) + current->mm->context.vdso = NULL; + + up_write(&mm->mmap_sem); + + return ret; +} + +#ifdef CONFIG_X86_64 + +/* + * This must be done early in case we have an initrd containing 32-bit + * binaries (e.g., hotplug). This could be pushed upstream. + */ +core_initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include + +static ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &sysctl_vsyscall32, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static ctl_table abi_root_table2[] = { + { + .ctl_name = CTL_ABI, + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif + +#else /* CONFIG_X86_32 */ + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) + return &gate_vma; + return NULL; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + const struct vm_area_struct *vma = get_gate_vma(task); + + return vma && addr >= vma->vm_start && addr < vma->vm_end; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} + +#endif /* CONFIG_X86_64 */ --- linux-ec2-2.6.32.orig/arch/x86/vdso/vdso32.S +++ linux-ec2-2.6.32/arch/x86/vdso/vdso32.S @@ -9,7 +9,7 @@ .globl vdso32_syscall_start, vdso32_syscall_end vdso32_syscall_start: -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN) .incbin "arch/x86/vdso/vdso32-syscall.so" #endif vdso32_syscall_end: --- linux-ec2-2.6.32.orig/arch/x86/vdso/vdso32/syscall.S +++ linux-ec2-2.6.32/arch/x86/vdso/vdso32/syscall.S @@ -19,8 +19,10 @@ .Lpush_ebp: movl %ecx, %ebp syscall +#ifndef CONFIG_XEN movl $__USER32_DS, %ecx movl %ecx, %ss +#endif movl %ebp, %ecx popl %ebp .Lpop_ebp: --- linux-ec2-2.6.32.orig/arch/x86/vdso/vdso32/sysenter.S +++ linux-ec2-2.6.32/arch/x86/vdso/vdso32/sysenter.S @@ -43,7 +43,7 @@ .space 7,0x90 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - jmp .Lenter_kernel + int $0x80 /* 16: System call normal return point is here! */ VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp --- linux-ec2-2.6.32.orig/arch/x86/vdso/vdso32/note.S +++ linux-ec2-2.6.32/arch/x86/vdso/vdso32/note.S @@ -13,7 +13,7 @@ .long LINUX_VERSION_CODE ELFNOTE_END -#ifdef CONFIG_XEN +#if defined(CONFIG_X86_XEN) || defined(CONFIG_PARAVIRT_XEN) /* * Add a special note telling glibc's dynamic linker a fake hardware * flavor that it will use to choose the search path for libraries in the @@ -37,8 +37,12 @@ ELFNOTE_START(GNU, 2, "a") .long 1 /* ncaps */ +#ifdef CONFIG_PARAVIRT_XEN VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ .long 0 /* mask */ +#else + .long 1 << VDSO_NOTE_NONEGSEG_BIT /* mask */ +#endif .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ ELFNOTE_END #endif --- linux-ec2-2.6.32.orig/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ linux-ec2-2.6.32/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -0,0 +1,157 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains accelerated part of ghash + * implementation. More information about PCLMULQDQ can be found at: + * + * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * Vinodh Gopal + * Erdinc Ozturk + * Deniz Karakoyunlu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include + +.align 16 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lpoly: + .octa 0xc2000000000000000000000000000001 +.Ltwo_one: + .octa 0x00000001000000000000000000000001 + +#define DATA %xmm0 +#define SHASH %xmm1 +#define T1 %xmm2 +#define T2 %xmm3 +#define T3 %xmm4 +#define BSWAP %xmm5 +#define IN1 %xmm6 + +.text + +/* + * __clmul_gf128mul_ble: internal ABI + * input: + * DATA: operand1 + * SHASH: operand2, hash_key << 1 mod poly + * output: + * DATA: operand1 * operand2 mod poly + * changed: + * T1 + * T2 + * T3 + */ +__clmul_gf128mul_ble: + movaps DATA, T1 + pshufd $0b01001110, DATA, T2 + pshufd $0b01001110, SHASH, T3 + pxor DATA, T2 + pxor SHASH, T3 + + # pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00 + # pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11 + # pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) + .byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00 + pxor DATA, T2 + pxor T1, T2 # T2 = a0 * b1 + a1 * b0 + + movaps T2, T3 + pslldq $8, T3 + psrldq $8, T2 + pxor T3, DATA + pxor T2, T1 # is result of + # carry-less multiplication + + # first phase of the reduction + movaps DATA, T3 + psllq $1, T3 + pxor DATA, T3 + psllq $5, T3 + pxor DATA, T3 + psllq $57, T3 + movaps T3, T2 + pslldq $8, T2 + psrldq $8, T3 + pxor T2, DATA + pxor T3, T1 + + # second phase of the reduction + movaps DATA, T2 + psrlq $5, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor T2, T1 + pxor T1, DATA + ret + +/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +ENTRY(clmul_ghash_mul) + movups (%rdi), DATA + movups (%rsi), SHASH + movaps .Lbswap_mask, BSWAP + pshufb BSWAP, DATA + call __clmul_gf128mul_ble + pshufb BSWAP, DATA + movups DATA, (%rdi) + ret + +/* + * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const be128 *shash); + */ +ENTRY(clmul_ghash_update) + cmp $16, %rdx + jb .Lupdate_just_ret # check length + movaps .Lbswap_mask, BSWAP + movups (%rdi), DATA + movups (%rcx), SHASH + pshufb BSWAP, DATA +.align 4 +.Lupdate_loop: + movups (%rsi), IN1 + pshufb BSWAP, IN1 + pxor IN1, DATA + call __clmul_gf128mul_ble + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lupdate_loop + pshufb BSWAP, DATA + movups DATA, (%rdi) +.Lupdate_just_ret: + ret + +/* + * void clmul_ghash_setkey(be128 *shash, const u8 *key); + * + * Calculate hash_key << 1 mod poly + */ +ENTRY(clmul_ghash_setkey) + movaps .Lbswap_mask, BSWAP + movups (%rsi), %xmm0 + pshufb BSWAP, %xmm0 + movaps %xmm0, %xmm1 + psllq $1, %xmm0 + psrlq $63, %xmm1 + movaps %xmm1, %xmm2 + pslldq $8, %xmm1 + psrldq $8, %xmm2 + por %xmm1, %xmm0 + # reduction + pshufd $0b00100100, %xmm2, %xmm1 + pcmpeqd .Ltwo_one, %xmm1 + pand .Lpoly, %xmm1 + pxor %xmm1, %xmm0 + movups %xmm0, (%rdi) + ret --- linux-ec2-2.6.32.orig/arch/x86/crypto/Makefile +++ linux-ec2-2.6.32/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o +obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o @@ -24,3 +25,5 @@ salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o + +ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o --- linux-ec2-2.6.32.orig/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ linux-ec2-2.6.32/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -0,0 +1,333 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains glue code. + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +void clmul_ghash_mul(char *dst, const be128 *shash); + +void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + const be128 *shash); + +void clmul_ghash_setkey(be128 *shash, const u8 *key); + +struct ghash_async_ctx { + struct cryptd_ahash *cryptd_tfm; +}; + +struct ghash_ctx { + be128 shash; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + clmul_ghash_setkey(&ctx->shash, key); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *dst = dctx->buffer; + + kernel_fpu_begin(); + if (dctx->bytes) { + int n = min(srclen, dctx->bytes); + u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + clmul_ghash_mul(dst, &ctx->shash); + } + + clmul_ghash_update(dst, src, srclen, &ctx->shash); + kernel_fpu_end(); + + if (srclen & 0xf) { + src += srclen - (srclen & 0xf); + srclen &= 0xf; + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + while (srclen--) + *dst++ ^= *src++; + } + + return 0; +} + +static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +{ + u8 *dst = dctx->buffer; + + if (dctx->bytes) { + u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + while (dctx->bytes--) + *tmp++ ^= 0; + + kernel_fpu_begin(); + clmul_ghash_mul(dst, &ctx->shash); + kernel_fpu_end(); + } + + dctx->bytes = 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *buf = dctx->buffer; + + ghash_flush(ctx, dctx); + memcpy(dst, buf, GHASH_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "__ghash", + .cra_driver_name = "__ghash-pclmulqdqni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), + }, +}; + +static int ghash_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_init(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); + } +} + +static int ghash_async_update(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_update(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return shash_ahash_update(req, desc); + } +} + +static int ghash_async_final(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_final(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return crypto_shash_final(desc, req->result); + } +} + +static int ghash_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_digest(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return shash_ahash_digest(req, desc); + } +} + +static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_ahash *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ahash_setkey(child, key, keylen); + crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) + & CRYPTO_TFM_RES_MASK); + + return 0; +} + +static int ghash_async_init_tfm(struct crypto_tfm *tfm) +{ + struct cryptd_ahash *cryptd_tfm; + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ctx->cryptd_tfm = cryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&cryptd_tfm->base)); + + return 0; +} + +static void ghash_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ahash(ctx->cryptd_tfm); +} + +static struct ahash_alg ghash_async_alg = { + .init = ghash_async_init, + .update = ghash_async_update, + .final = ghash_async_final, + .setkey = ghash_async_setkey, + .digest = ghash_async_digest, + .halg = { + .digestsize = GHASH_DIGEST_SIZE, + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-clmulni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list), + .cra_init = ghash_async_init_tfm, + .cra_exit = ghash_async_exit_tfm, + }, + }, +}; + +static int __init ghash_pclmulqdqni_mod_init(void) +{ + int err; + + if (!cpu_has_pclmulqdq) { + printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not" + " detected.\n"); + return -ENODEV; + } + + err = crypto_register_shash(&ghash_alg); + if (err) + goto err_out; + err = crypto_register_ahash(&ghash_async_alg); + if (err) + goto err_shash; + + return 0; + +err_shash: + crypto_unregister_shash(&ghash_alg); +err_out: + return err; +} + +static void __exit ghash_pclmulqdqni_mod_exit(void) +{ + crypto_unregister_ahash(&ghash_async_alg); + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_pclmulqdqni_mod_init); +module_exit(ghash_pclmulqdqni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " + "acclerated by PCLMULQDQ-NI"); +MODULE_ALIAS("ghash"); --- linux-ec2-2.6.32.orig/arch/x86/power/cpu.c +++ linux-ec2-2.6.32/arch/x86/power/cpu.c @@ -104,12 +104,15 @@ ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); #endif + ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, + &ctxt->misc_enable); } /* Needed by apm.c */ void save_processor_state(void) { __save_processor_state(&saved_context); + save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -176,6 +179,8 @@ */ static void __restore_processor_state(struct saved_context *ctxt) { + if (ctxt->misc_enable_saved) + wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* * control registers */ @@ -249,6 +254,7 @@ void restore_processor_state(void) { __restore_processor_state(&saved_context); + restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); --- linux-ec2-2.6.32.orig/arch/x86/power/Makefile +++ linux-ec2-2.6.32/arch/x86/power/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o + +disabled-obj-$(CONFIG_XEN) := cpu.o --- linux-ec2-2.6.32.orig/arch/x86/power/hibernate_asm_32.S +++ linux-ec2-2.6.32/arch/x86/power/hibernate_asm_32.S @@ -27,10 +27,17 @@ ret ENTRY(restore_image) + movl mmu_cr4_features, %ecx movl resume_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + andl $~(X86_CR4_PGE), %ecx + movl %ecx, %cr4; # turn off PGE + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 +1: movl restore_pblist, %edx .p2align 4,,7 @@ -54,16 +61,8 @@ movl $swapper_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 - /* Flush TLB, including "global" things (vmalloc) */ movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero - movl %ecx, %edx - andl $~(X86_CR4_PGE), %edx - movl %edx, %cr4; # turn off PGE -1: - movl %cr3, %eax; # flush TLB - movl %eax, %cr3 - jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on 1: --- linux-ec2-2.6.32.orig/arch/x86/xen/spinlock.c +++ linux-ec2-2.6.32/arch/x86/xen/spinlock.c @@ -312,7 +312,6 @@ if (per_cpu(lock_spinners, cpu) == xl) { ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; } } } --- linux-ec2-2.6.32.orig/arch/x86/xen/xen-ops.h +++ linux-ec2-2.6.32/arch/x86/xen/xen-ops.h @@ -25,6 +25,7 @@ void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); @@ -41,6 +42,7 @@ void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); --- linux-ec2-2.6.32.orig/arch/x86/xen/smp.c +++ linux-ec2-2.6.32/arch/x86/xen/smp.c @@ -30,6 +30,7 @@ #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -179,6 +180,15 @@ { unsigned cpu; + if (skip_ioapic_setup) { + char *m = (max_cpus == 0) ? + "The nosmp parameter is incompatible with Xen; " \ + "use Xen dom0_max_vcpus=1 parameter" : + "The noapic parameter is incompatible with Xen"; + + xen_raw_printk(m); + panic(m); + } xen_init_lock_cpu(0); smp_store_cpu_info(0); @@ -295,6 +305,7 @@ (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); @@ -395,9 +406,9 @@ BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -465,7 +476,7 @@ .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, --- linux-ec2-2.6.32.orig/arch/x86/xen/time.c +++ linux-ec2-2.6.32/arch/x86/xen/time.c @@ -100,7 +100,7 @@ return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; } -static void setup_runstate_info(int cpu) +void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; @@ -154,45 +154,6 @@ account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { @@ -434,7 +395,9 @@ name = ""; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_DISABLED|IRQF_PERCPU| + IRQF_NOBALANCING|IRQF_TIMER| + IRQF_FORCE_RESUME, name, NULL); evt = &per_cpu(xen_clock_events, cpu); @@ -442,8 +405,6 @@ evt->cpumask = cpumask_of(cpu); evt->irq = irq; - - setup_runstate_info(cpu); } void xen_teardown_timer(int cpu) @@ -465,6 +426,8 @@ { int cpu; + pvclock_resume(); + if (xen_clockevent != &xen_vcpuop_clockevent) return; @@ -494,6 +457,7 @@ setup_force_cpu_cap(X86_FEATURE_TSC); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } --- linux-ec2-2.6.32.orig/arch/x86/xen/enlighten.c +++ linux-ec2-2.6.32/arch/x86/xen/enlighten.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -138,24 +139,23 @@ */ void xen_vcpu_restore(void) { - if (have_vcpu_info_placement) { - int cpu; + int cpu; - for_each_online_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) - BUG(); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); - xen_vcpu_setup(cpu); + xen_setup_runstate_info(cpu); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) - BUG(); - } + if (have_vcpu_info_placement) + xen_vcpu_setup(cpu); - BUG_ON(!have_vcpu_info_placement); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); } } @@ -213,6 +213,7 @@ static __init void xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; + unsigned int xsave_mask; cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ @@ -223,24 +224,16 @@ cpuid_leaf1_edx_mask &= ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ - ax = 1; - cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); - /* cpuid claims we support xsave; try enabling it to see what happens */ - if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { - unsigned long cr4; - - set_in_cr4(X86_CR4_OSXSAVE); - - cr4 = read_cr4(); - - if ((cr4 & X86_CR4_OSXSAVE) == 0) - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); - - clear_in_cr4(X86_CR4_OSXSAVE); - } + xsave_mask = + (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); + + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + if ((cx & xsave_mask) != xsave_mask) + cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ } static void xen_set_debugreg(int reg, unsigned long val) @@ -332,6 +325,24 @@ xen_mc_issue(PARAVIRT_LAZY_CPU); } +#ifdef CONFIG_X86_32 +static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + void *gdt; + xmaddr_t mgdt; + u64 descriptor; + struct desc_struct user_cs; + + gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; + mgdt = virt_to_machine(gdt); + + user_cs = mm->context.user_cs; + descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; + + HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); +} +#endif /*CONFIG_X86_32*/ + static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long va = dtr->address; @@ -776,7 +787,16 @@ native_write_cr4(cr4); } - +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -924,7 +944,7 @@ }; static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, + .sched_clock = xen_clocksource_read, }; static const struct pv_cpu_ops xen_cpu_ops __initdata = { @@ -942,13 +962,23 @@ .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, +#ifdef CONFIG_X86_64 + .read_cr8 = xen_read_cr8, + .write_cr8 = xen_write_cr8, +#endif + .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, + .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, + .wrmsr_regs = native_wrmsr_safe_regs, + .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, + .read_tscp = native_read_tscp, + .iret = xen_iret, .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 @@ -958,6 +988,9 @@ .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = xen_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, @@ -997,10 +1030,6 @@ { struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP - smp_send_stop(); -#endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } @@ -1093,6 +1122,12 @@ __supported_pte_mask |= _PAGE_IOMAP; + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + #ifdef CONFIG_X86_64 /* Work out if we support NX */ check_efer(); @@ -1182,6 +1217,8 @@ xen_raw_console_write("about to get started...\n"); + xen_setup_runstate_info(0); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); --- linux-ec2-2.6.32.orig/arch/x86/xen/xen-head.S +++ linux-ec2-2.6.32/arch/x86/xen/xen-head.S @@ -1,7 +1,7 @@ /* Xen-specific pieces of head.S, intended to be included in the right place in head.S */ -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN #include #include @@ -52,4 +52,4 @@ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) -#endif /*CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ --- linux-ec2-2.6.32.orig/arch/x86/xen/xen-asm.S +++ linux-ec2-2.6.32/arch/x86/xen/xen-asm.S @@ -96,7 +96,7 @@ /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jz 1f + jnz 1f 2: call check_events 1: ENDPATCH(xen_restore_fl_direct) --- linux-ec2-2.6.32.orig/arch/x86/xen/xen-asm_32.S +++ linux-ec2-2.6.32/arch/x86/xen/xen-asm_32.S @@ -88,11 +88,11 @@ */ #ifdef CONFIG_SMP GET_THREAD_INFO(%eax) - movl TI_cpu(%eax), %eax - movl __per_cpu_offset(,%eax,4), %eax - mov per_cpu__xen_vcpu(%eax), %eax + movl %ss:TI_cpu(%eax), %eax + movl %ss:__per_cpu_offset(,%eax,4), %eax + mov %ss:per_cpu__xen_vcpu(%eax), %eax #else - movl per_cpu__xen_vcpu, %eax + movl %ss:per_cpu__xen_vcpu, %eax #endif /* check IF state we're restoring */ @@ -105,19 +105,21 @@ * resuming the code, so we don't have to be worried about * being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) + setz %ss:XEN_vcpu_info_mask(%eax) xen_iret_start_crit: /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) + cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax) /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. */ - sete XEN_vcpu_info_mask(%eax) + jne 1f + movb $1, %ss:XEN_vcpu_info_mask(%eax) - popl %eax +1: popl %eax /* * From this point on the registers are restored and the stack --- linux-ec2-2.6.32.orig/arch/x86/xen/xen-asm_64.S +++ linux-ec2-2.6.32/arch/x86/xen/xen-asm_64.S @@ -96,7 +96,7 @@ pushq $__USER32_CS pushq %rcx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysret32) RELOC(xen_sysret32, 1b+1) @@ -151,7 +151,7 @@ ENTRY(xen_sysenter_target) lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $VGCF_in_syscall + pushq $0 jmp hypercall_iret ENDPROC(xen_syscall32_target) ENDPROC(xen_sysenter_target) --- linux-ec2-2.6.32.orig/arch/x86/xen/Kconfig +++ linux-ec2-2.6.32/arch/x86/xen/Kconfig @@ -2,7 +2,7 @@ # This Kconfig describes xen options # -config XEN +config PARAVIRT_XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK @@ -17,7 +17,7 @@ int "Maximum allowed size of a domain in gigabytes" default 8 if X86_32 default 32 if X86_64 - depends on XEN + depends on PARAVIRT_XEN help The pseudo-physical to machine address array is sized according to the maximum possible memory size of a Xen @@ -26,12 +26,12 @@ config XEN_SAVE_RESTORE bool - depends on XEN && PM + depends on PARAVIRT_XEN && PM default y config XEN_DEBUG_FS bool "Enable Xen debug and tuning parameters in debugfs" - depends on XEN && DEBUG_FS + depends on PARAVIRT_XEN && DEBUG_FS default n help Enable statistics output and various tuning options in debugfs. --- linux-ec2-2.6.32.orig/arch/x86/xen/mmu.c +++ linux-ec2-2.6.32/arch/x86/xen/mmu.c @@ -185,7 +185,7 @@ } /* Build the parallel p2m_top_mfn structures */ -static void __init xen_build_mfn_list_list(void) +void xen_build_mfn_list_list(void) { unsigned pfn, idx; @@ -987,10 +987,9 @@ */ void xen_mm_pin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { @@ -999,7 +998,7 @@ } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -1100,10 +1099,9 @@ */ void xen_mm_unpin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { @@ -1113,7 +1111,7 @@ } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -1141,7 +1139,7 @@ active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure @@ -1432,14 +1430,15 @@ { pgprot_t prot = PAGE_KERNEL; + /* + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. + */ + BUG_ON(PageHighMem(page)); + if (PagePinned(page)) prot = PAGE_KERNEL_RO; - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - return kmap_atomic_prot(page, type, prot); } #endif @@ -1657,8 +1656,10 @@ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; +#ifdef CONFIG_X86_32 if (pfn > max_pfn_mapped) max_pfn_mapped = pfn; +#endif if (!pte_none(pte_page[pteidx])) continue; @@ -1703,6 +1704,12 @@ pud_t *l3; pmd_t *l2; + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); --- linux-ec2-2.6.32.orig/arch/x86/xen/suspend.c +++ linux-ec2-2.6.32/arch/x86/xen/suspend.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -27,6 +28,8 @@ void xen_post_suspend(int suspend_cancelled) { + xen_build_mfn_list_list(); + xen_setup_shared_info(); if (suspend_cancelled) { @@ -44,7 +47,19 @@ } +static void xen_vcpu_notify_restore(void *data) +{ + unsigned long reason = (unsigned long)data; + + /* Boot processor notified via generic timekeeping_resume() */ + if ( smp_processor_id() == 0) + return; + + clockevents_notify(reason, NULL); +} + void xen_arch_resume(void) { - /* nothing */ + on_each_cpu(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); } --- linux-ec2-2.6.32.orig/arch/x86/xen/multicalls.c +++ linux-ec2-2.6.32/arch/x86/xen/multicalls.c @@ -189,10 +189,10 @@ unsigned argidx = roundup(b->argidx, sizeof(u64)); BUG_ON(preemptible()); - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == MC_BATCH || - (argidx + args) > MC_ARGS) { + (argidx + args) >= MC_ARGS) { mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); @@ -206,7 +206,7 @@ ret.args = &b->args[argidx]; b->argidx = argidx + args; - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); return ret; } @@ -216,7 +216,7 @@ struct multicall_space ret = { NULL, NULL }; BUG_ON(preemptible()); - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == 0) return ret; @@ -224,14 +224,14 @@ if (b->entries[b->mcidx - 1].op != op) return ret; - if ((b->argidx + size) > MC_ARGS) + if ((b->argidx + size) >= MC_ARGS) return ret; ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; b->argidx += size; - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); return ret; } --- linux-ec2-2.6.32.orig/arch/x86/mm/fault.c +++ linux-ec2-2.6.32/arch/x86/mm/fault.c @@ -223,15 +223,14 @@ address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (!vmalloc_sync_one(page_address(page), address)) break; } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -331,13 +330,12 @@ address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); @@ -346,7 +344,7 @@ else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -378,10 +376,12 @@ if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) + if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); - else + arch_flush_lazy_mmu_mode(); + } else { BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } /* * Below here mismatches are bugs because these lower tables @@ -801,8 +801,10 @@ up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) + if (!(error_code & PF_USER)) { no_context(regs, error_code, address); + return; + } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) @@ -828,6 +830,13 @@ unsigned long address, unsigned int fault) { if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + out_of_memory(regs, error_code, address); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) --- linux-ec2-2.6.32.orig/arch/x86/mm/pat-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/pat-xen.c @@ -0,0 +1,1028 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi + * Suresh B Siddha + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_PAT +int __read_mostly pat_enabled = 1; + +static inline void pat_disable(const char *reason) +{ + pat_enabled = 0; + printk(KERN_INFO "%s\n", reason); +} + +static int __init nopat(char *str) +{ + pat_disable("PAT support disabled."); + return 0; +} +early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} +#endif + + +static int debug_enable; + +static int __init pat_debug_setup(char *str) +{ + debug_enable = 1; + return 0; +} +__setup("debugpat", pat_debug_setup); + +#define dprintk(fmt, arg...) \ + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + + +static u64 __read_mostly boot_pat_state; + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ +}; + +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) + +void pat_init(void) +{ + u64 pat; + bool boot_cpu = !boot_pat_state; + + if (!pat_enabled) + return; + + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } + } + +#ifndef CONFIG_XEN + /* Set PWT to Write-Combining. All other bits stay the same */ + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + + /* Boot CPU check */ + if (!boot_pat_state) + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + + wrmsrl(MSR_IA32_CR_PAT, pat); +#else + /* + * PAT settings are part of the hypervisor interface, and their + * assignment cannot be changed. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + if (!boot_pat_state) + boot_pat_state = pat; +#endif + + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); +} + +#undef PAT + +static char *cattr_name(unsigned long flags) +{ + switch (flags & _PAGE_CACHE_MASK) { + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + case _PAGE_CACHE_WP: return "write-protected"; + case _PAGE_CACHE_WT: return "write-through"; + default: return "broken"; + } +} + +/* + * The global memtype list keeps track of memory type for specific + * physical memory areas. Conflicting memory types in different + * mappings can cause CPU cache corruption. To avoid this we keep track. + * + * The list is sorted based on starting address and can contain multiple + * entries for each address (this allows reference counting for overlapping + * areas). All the aliases have the same cache attributes of course. + * Zero attributes are represented as holes. + * + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. + * + * memtype_lock protects both the linear list and rbtree. + */ + +struct memtype { + u64 start; + u64 end; + unsigned long type; + struct list_head nd; + struct rb_node rb; +}; + +static struct rb_root memtype_rbroot = RB_ROOT; +static LIST_HEAD(memtype_list); +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ + +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) +{ + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + if (req_type == _PAGE_CACHE_WB) { + u8 mtrr_type; + + mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_UC_MINUS; + + return _PAGE_CACHE_WB; + } + + return req_type; +} + +static int +chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) +{ + if (new->type != entry->type) { + if (type) { + new->type = entry->type; + *type = entry->type; + } else + goto conflict; + } + + /* check overlaps with more than one entry in the list */ + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (new->end <= entry->start) + break; + else if (new->type != entry->type) + goto conflict; + } + return 0; + + conflict: + printk(KERN_INFO "%s:%d conflicting memory types " + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, + new->end, cattr_name(new->type), cattr_name(entry->type)); + return -EBUSY; +} + +static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && + page_is_ram(mfn_to_local_pfn(page_nr))) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + +/* + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range + * + * Caller must hold memtype_lock for atomicity. + */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + unsigned long type; + + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } + } + + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, req_type); + } + return 0; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, -1); + } + return 0; +} + +/* + * req_type typically has one of the: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have a special case value '-1', when requester want to inherit + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If new_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If new_type is non-NULL, function will return + * available type in new_type in case of no error. In case of any error + * it will return a negative return value. + */ +int reserve_memtype(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct memtype *new, *entry; + unsigned long actual_type; + struct list_head *where; + int is_range_ram; + int err = 0; + + BUG_ON(start >= end); /* end is exclusive */ + + if (!pat_enabled) { + /* This is identical to page table setting without PAT */ + if (new_type) { + if (req_type == -1) + *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; + else + *new_type = req_type & _PAGE_CACHE_MASK; + } + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (is_ISA_range(start, end - 1)) { + if (new_type) + *new_type = _PAGE_CACHE_WB; + return 0; + } + + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + + if (new_type) + *new_type = actual_type; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + new = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->start = start; + new->end = end; + new->type = actual_type; + + spin_lock(&memtype_lock); + + /* Search for existing mapping that overlaps the current range */ + where = NULL; + list_for_each_entry(entry, &memtype_list, nd) { + if (end <= entry->start) { + where = entry->nd.prev; + break; + } else if (start <= entry->start) { /* end > entry->start */ + err = chk_conflict(new, entry, new_type); + if (!err) { + dprintk("Overlap at 0x%Lx-0x%Lx\n", + entry->start, entry->end); + where = entry->nd.prev; + } + break; + } else if (start < entry->end) { /* start > entry->start */ + err = chk_conflict(new, entry, new_type); + if (!err) { + dprintk("Overlap at 0x%Lx-0x%Lx\n", + entry->start, entry->end); + + /* + * Move to right position in the linked + * list to add this new entry + */ + list_for_each_entry_continue(entry, + &memtype_list, nd) { + if (start <= entry->start) { + where = entry->nd.prev; + break; + } + } + } + break; + } + } + + if (err) { + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " + "track %s, req %s\n", + start, end, cattr_name(new->type), cattr_name(req_type)); + kfree(new); + spin_unlock(&memtype_lock); + + return err; + } + + if (where) + list_add(&new->nd, where); + else + list_add_tail(&new->nd, &memtype_list); + + memtype_rb_insert(&memtype_rbroot, new); + + spin_unlock(&memtype_lock); + + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(new->type), cattr_name(req_type), + new_type ? cattr_name(*new_type) : "-"); + + return err; +} + +int free_memtype(u64 start, u64 end) +{ + struct memtype *entry, *saved_entry; + int err = -EINVAL; + int is_range_ram; + + if (!pat_enabled) + return 0; + + /* Low ISA region is always mapped WB. No need to track */ + if (is_ISA_range(start, end - 1)) + return 0; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; + list_for_each_entry_from(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start > start) { + break; + } + } + + if (!err) + goto unlock_ret; + + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start < start) { + break; + } + } +unlock_ret: + spin_unlock(&memtype_lock); + + if (err) { + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", + current->comm, current->pid, start, end); + } + + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + + return err; +} + + +#ifndef CONFIG_XEN +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} +#endif + +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + resource_size_t size = end - start; + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +#ifdef CONFIG_STRICT_DEVMEM +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ +static inline int range_is_allowed(unsigned long mfn, unsigned long size) +{ + return 1; +} +#else +/* This check is needed to avoid cache aliasing when PAT is enabled */ +static inline int range_is_allowed(unsigned long mfn, unsigned long size) +{ + u64 from = ((u64)mfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + if (!pat_enabled) + return 1; + + while (cursor < to) { + if (!devmem_is_allowed(mfn)) { + printk(KERN_INFO + "Program %s tried to access /dev/mem between %Lx->%Lx.\n", + current->comm, from, to); + return 0; + } + cursor += PAGE_SIZE; + mfn++; + } + return 1; +} +#endif /* CONFIG_STRICT_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn, + unsigned long size, pgprot_t *vma_prot) +{ + unsigned long flags = _PAGE_CACHE_WB; + + if (!range_is_allowed(mfn, size)) + return 0; + + if (file->f_flags & O_SYNC) { + flags = _PAGE_CACHE_UC_MINUS; + } + +#ifndef CONFIG_X86_32 +#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */ + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_enabled && + !(boot_cpu_has(X86_FEATURE_MTRR) || + boot_cpu_has(X86_FEATURE_K6_MTRR) || + boot_cpu_has(X86_FEATURE_CYRIX_ARR) || + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + flags = _PAGE_CACHE_UC; + } +#endif +#endif + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags) +{ + return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags); +} + +#ifndef CONFIG_XEN +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int ret; + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + unsigned long flags = want_flags; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. + */ + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR "%s:%d map pfn expected mapping type %s" + " for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { + free_memtype(paddr, paddr + size); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + resource_size_t paddr; + unsigned long prot; + unsigned long vma_size = vma->vm_end - vma->vm_start; + pgprot_t pgprot; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + return 0; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) +{ + unsigned long flags; + resource_size_t paddr; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot, 0); + } + + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + resource_size_t paddr; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } +} +#endif /* CONFIG_XEN */ + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) + +/* get Nth element of the linked list */ +static struct memtype *memtype_get_idx(loff_t pos) +{ + struct memtype *list_node, *print_entry; + int i = 1; + + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!print_entry) + return NULL; + + spin_lock(&memtype_lock); + list_for_each_entry(list_node, &memtype_list, nd) { + if (pos == i) { + *print_entry = *list_node; + spin_unlock(&memtype_lock); + return print_entry; + } + ++i; + } + spin_unlock(&memtype_lock); + kfree(print_entry); + + return NULL; +} + +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) { + ++*pos; + seq_printf(seq, "PAT memtype list:\n"); + } + + return memtype_get_idx(*pos); +} + +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return memtype_get_idx(*pos); +} + +static void memtype_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int memtype_seq_show(struct seq_file *seq, void *v) +{ + struct memtype *print_entry = (struct memtype *)v; + + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), + print_entry->start, print_entry->end); + kfree(print_entry); + + return 0; +} + +static const struct seq_operations memtype_seq_ops = { + .start = memtype_seq_start, + .next = memtype_seq_next, + .stop = memtype_seq_stop, + .show = memtype_seq_show, +}; + +static int memtype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &memtype_seq_ops); +} + +static const struct file_operations memtype_fops = { + .open = memtype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init pat_memtype_list_init(void) +{ + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, + NULL, &memtype_fops); + return 0; +} + +late_initcall(pat_memtype_list_init); + +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ --- linux-ec2-2.6.32.orig/arch/x86/mm/init.c +++ linux-ec2-2.6.32/arch/x86/mm/init.c @@ -149,6 +149,14 @@ set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); + else +#ifdef CONFIG_X86_32 + printk(KERN_INFO "Using x86 segment limits to approximate " + "NX protection\n"); +#else + printk(KERN_WARNING "Warning: NX (Execute Disable) protection " + "missing in CPU or disabled in BIOS!\n"); +#endif /* Enable PSE if available */ if (cpu_has_pse) --- linux-ec2-2.6.32.orig/arch/x86/mm/hypervisor.c +++ linux-ec2-2.6.32/arch/x86/mm/hypervisor.c @@ -0,0 +1,1214 @@ +/****************************************************************************** + * mm/hypervisor.c + * + * Update page tables via the hypervisor. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +EXPORT_SYMBOL(hypercall_page); + +shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +#ifndef CONFIG_XEN_VCPU_INFO_PLACEMENT +EXPORT_SYMBOL(HYPERVISOR_shared_info); +#else +DEFINE_PER_CPU(struct vcpu_info, vcpu_info) __aligned(sizeof(struct vcpu_info)); +EXPORT_PER_CPU_SYMBOL(vcpu_info); + +void __ref setup_vcpu_info(unsigned int cpu) +{ + struct vcpu_info *v = &per_cpu(vcpu_info, cpu); + struct vcpu_register_vcpu_info info; +#ifdef CONFIG_X86_64 + static bool first = true; + + if (first) { + first = false; + info.mfn = early_arbitrary_virt_to_mfn(v); + } else +#endif + info.mfn = arbitrary_virt_to_mfn(v); + info.offset = offset_in_page(v); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, + cpu, &info)) + BUG(); +} + +void __init adjust_boot_vcpu_info(void) +{ + unsigned long lpfn, rpfn, lmfn, rmfn; + pte_t *lpte, *rpte; + unsigned int level; + mmu_update_t mmu[2]; + + /* + * setup_vcpu_info() cannot be used more than once for a given (v)CPU, + * hence we must swap the underlying MFNs of the two pages holding old + * and new vcpu_info of the boot CPU. + * + * Do *not* use __get_cpu_var() or percpu_{write,...}() here, as the per- + * CPU segment didn't get reloaded yet. Using percpu_read(), as in + * arch_use_lazy_mmu_mode(), though undesirable, is safe except for the + * accesses to variables that were updated in setup_percpu_areas(). + */ + lpte = lookup_address((unsigned long)&per_cpu_var(vcpu_info) + + (__per_cpu_load - __per_cpu_start), + &level); + rpte = lookup_address((unsigned long)&per_cpu(vcpu_info, 0), &level); + BUG_ON(!lpte || !(pte_flags(*lpte) & _PAGE_PRESENT)); + BUG_ON(!rpte || !(pte_flags(*rpte) & _PAGE_PRESENT)); + lmfn = __pte_mfn(*lpte); + rmfn = __pte_mfn(*rpte); + + if (lmfn == rmfn) + return; + + lpfn = mfn_to_local_pfn(lmfn); + rpfn = mfn_to_local_pfn(rmfn); + + printk(KERN_INFO + "Swapping MFNs for PFN %lx and %lx (MFN %lx and %lx)\n", + lpfn, rpfn, lmfn, rmfn); + + xen_l1_entry_update(lpte, pfn_pte_ma(rmfn, pte_pgprot(*lpte))); + xen_l1_entry_update(rpte, pfn_pte_ma(lmfn, pte_pgprot(*rpte))); +#ifdef CONFIG_X86_64 + if (HYPERVISOR_update_va_mapping((unsigned long)__va(lpfn<op, mc->args[0], mc->args[1], mc->args[2], mc->args[3], + rc, mc->args[5]); + BUG(); +} + +static int _xen_multicall_flush(bool ret_last) { + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc = lazy->mc; + unsigned int count = lazy->nr_mc; + + if (!count) + return 0; + + lazy->nr_mc = 0; + lazy->nr_mmu = 0; + lazy->nr_mmuext = 0; + + if (count == 1) { + int rc = _hypercall(int, mc->op, mc->args[0], mc->args[1], + mc->args[2], mc->args[3], mc->args[4]); + + if (unlikely(rc)) { + if (ret_last) + return rc; + multicall_failed(mc, rc); + } + } else { + if (HYPERVISOR_multicall(mc, count)) + BUG(); + while (count-- > ret_last) + if (unlikely(mc++->result)) + multicall_failed(mc - 1, mc[-1].result); + if (ret_last) + return mc->result; + } + + return 0; +} + +void xen_multicall_flush(bool force) { + if (force || use_lazy_mmu_mode()) + _xen_multicall_flush(false); +} +EXPORT_SYMBOL(xen_multicall_flush); + +int xen_multi_update_va_mapping(unsigned long va, pte_t pte, + unsigned long uvmf) +{ + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc; + + if (unlikely(!use_lazy_mmu_mode())) +#ifdef CONFIG_X86_PAE + return _hypercall4(int, update_va_mapping, va, + pte.pte_low, pte.pte_high, uvmf); +#else + return _hypercall3(int, update_va_mapping, va, + pte.pte, uvmf); +#endif + + if (unlikely(lazy->nr_mc == NR_MC)) + _xen_multicall_flush(false); + + mc = lazy->mc + lazy->nr_mc++; + mc->op = __HYPERVISOR_update_va_mapping; + mc->args[0] = va; +#ifndef CONFIG_X86_PAE + mc->args[1] = pte.pte; +#else + mc->args[1] = pte.pte_low; + mc->args[2] = pte.pte_high; +#endif + mc->args[MULTI_UVMFLAGS_INDEX] = uvmf; + mc->args[5] = (long)__builtin_return_address(0); + + return 0; +} + +static inline bool mmu_may_merge(const multicall_entry_t *mc, + unsigned int op, domid_t domid) +{ + return mc->op == op && !mc->args[2] && mc->args[3] == domid; +} + +int xen_multi_mmu_update(mmu_update_t *src, unsigned int count, + unsigned int *success_count, domid_t domid) +{ + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc = lazy->mc + lazy->nr_mc; + mmu_update_t *dst; + bool commit, merge; + + if (unlikely(!use_lazy_mmu_mode())) + return _hypercall4(int, mmu_update, src, count, + success_count, domid); + + commit = (lazy->nr_mmu + count) > NR_MMU || success_count; + merge = lazy->nr_mc && !commit + && mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid); + if (unlikely(lazy->nr_mc == NR_MC) && !merge) { + _xen_multicall_flush(false); + mc = lazy->mc; + commit = count > NR_MMU || success_count; + } + + if (!lazy->nr_mc && unlikely(commit)) + return _hypercall4(int, mmu_update, src, count, + success_count, domid); + + dst = lazy->mmu + lazy->nr_mmu; + lazy->nr_mmu += count; + if (merge) { + mc[-1].args[1] += count; + memcpy(dst, src, count * sizeof(*src)); + } else { + ++lazy->nr_mc; + mc->op = __HYPERVISOR_mmu_update; + if (!commit) { + mc->args[0] = (unsigned long)dst; + memcpy(dst, src, count * sizeof(*src)); + } else + mc->args[0] = (unsigned long)src; + mc->args[1] = count; + mc->args[2] = (unsigned long)success_count; + mc->args[3] = domid; + mc->args[5] = (long)__builtin_return_address(0); + } + + while (!commit && count--) + switch (src++->ptr & (sizeof(pteval_t) - 1)) { + case MMU_NORMAL_PT_UPDATE: + case MMU_PT_UPDATE_PRESERVE_AD: + break; + default: + commit = true; + break; + } + + return commit ? _xen_multicall_flush(true) : 0; +} + +int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count, + unsigned int *success_count, domid_t domid) +{ + struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu); + multicall_entry_t *mc; + struct mmuext_op *dst; + bool commit, merge; + + if (unlikely(!use_lazy_mmu_mode())) + return _hypercall4(int, mmuext_op, src, count, + success_count, domid); + + /* + * While it could be useful in theory, I've never seen the body of + * this conditional to be reached, hence it seems more reasonable + * to disable it for the time being. + */ + if (0 && likely(count) + && likely(!success_count) + && likely(domid == DOMID_SELF) + && likely(lazy->nr_mc) + && lazy->mc[lazy->nr_mc - 1].op == __HYPERVISOR_update_va_mapping) { + unsigned long oldf, newf = UVMF_NONE; + + switch (src->cmd) { + case MMUEXT_TLB_FLUSH_ALL: + newf = UVMF_TLB_FLUSH | UVMF_ALL; + break; + case MMUEXT_INVLPG_ALL: + newf = UVMF_INVLPG | UVMF_ALL; + break; + case MMUEXT_TLB_FLUSH_MULTI: + newf = UVMF_TLB_FLUSH | UVMF_MULTI + | (unsigned long)src->arg2.vcpumask.p; + break; + case MMUEXT_INVLPG_MULTI: + newf = UVMF_INVLPG | UVMF_MULTI + | (unsigned long)src->arg2.vcpumask.p; + break; + case MMUEXT_TLB_FLUSH_LOCAL: + newf = UVMF_TLB_FLUSH | UVMF_LOCAL; + break; + case MMUEXT_INVLPG_LOCAL: + newf = UVMF_INVLPG | UVMF_LOCAL; + break; + } + mc = lazy->mc + lazy->nr_mc - 1; + oldf = mc->args[MULTI_UVMFLAGS_INDEX]; + if (newf == UVMF_NONE || oldf == UVMF_NONE + || newf == (UVMF_TLB_FLUSH | UVMF_ALL)) + ; + else if (oldf == (UVMF_TLB_FLUSH | UVMF_ALL)) + newf = UVMF_TLB_FLUSH | UVMF_ALL; + else if ((newf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG + && (oldf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG + && ((src->arg1.linear_addr ^ mc->args[0]) + >> PAGE_SHIFT)) + newf = UVMF_NONE; + else if (((oldf | newf) & UVMF_ALL) + && !((oldf ^ newf) & UVMF_FLUSHTYPE_MASK)) + newf |= UVMF_ALL; + else if ((oldf ^ newf) & ~UVMF_FLUSHTYPE_MASK) + newf = UVMF_NONE; + else if ((oldf & UVMF_FLUSHTYPE_MASK) == UVMF_TLB_FLUSH) + newf = (newf & ~UVMF_FLUSHTYPE_MASK) | UVMF_TLB_FLUSH; + else if ((newf & UVMF_FLUSHTYPE_MASK) != UVMF_TLB_FLUSH + && ((newf ^ oldf) & UVMF_FLUSHTYPE_MASK)) + newf = UVMF_NONE; + if (newf != UVMF_NONE) { + mc->args[MULTI_UVMFLAGS_INDEX] = newf; + ++src; + if (!--count) + return 0; + } + } + + mc = lazy->mc + lazy->nr_mc; + commit = (lazy->nr_mmuext + count) > NR_MMUEXT || success_count; + merge = lazy->nr_mc && !commit + && mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid); + if (unlikely(lazy->nr_mc == NR_MC) && !merge) { + _xen_multicall_flush(false); + mc = lazy->mc; + commit = count > NR_MMUEXT || success_count; + } + + if (!lazy->nr_mc && unlikely(commit)) + return _hypercall4(int, mmuext_op, src, count, + success_count, domid); + + dst = lazy->mmuext + lazy->nr_mmuext; + lazy->nr_mmuext += count; + if (merge) { + mc[-1].args[1] += count; + memcpy(dst, src, count * sizeof(*src)); + } else { + ++lazy->nr_mc; + mc->op = __HYPERVISOR_mmuext_op; + if (!commit) { + mc->args[0] = (unsigned long)dst; + memcpy(dst, src, count * sizeof(*src)); + } else + mc->args[0] = (unsigned long)src; + mc->args[1] = count; + mc->args[2] = (unsigned long)success_count; + mc->args[3] = domid; + mc->args[5] = (long)__builtin_return_address(0); + } + + while (!commit && count--) + switch (src++->cmd) { + case MMUEXT_PIN_L1_TABLE: + case MMUEXT_PIN_L2_TABLE: + case MMUEXT_PIN_L3_TABLE: + case MMUEXT_PIN_L4_TABLE: + case MMUEXT_UNPIN_TABLE: + case MMUEXT_TLB_FLUSH_LOCAL: + case MMUEXT_INVLPG_LOCAL: + case MMUEXT_TLB_FLUSH_MULTI: + case MMUEXT_INVLPG_MULTI: + case MMUEXT_TLB_FLUSH_ALL: + case MMUEXT_INVLPG_ALL: + break; + default: + commit = true; + break; + } + + return commit ? _xen_multicall_flush(true) : 0; +} + +void xen_l1_entry_update(pte_t *ptr, pte_t val) +{ + mmu_update_t u; + u.ptr = ptep_to_machine(ptr); + u.val = __pte_val(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_l1_entry_update); + +static void do_lN_entry_update(mmu_update_t *mmu, unsigned int mmu_count, + struct page *page) +{ + if (likely(page)) { + multicall_entry_t mcl[2]; + unsigned long pfn = page_to_pfn(page); + + MULTI_update_va_mapping(mcl, + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, PAGE_KERNEL_RO), 0); + SetPagePinned(page); + MULTI_mmu_update(mcl + 1, mmu, mmu_count, NULL, DOMID_SELF); + if (unlikely(HYPERVISOR_multicall_check(mcl, 2, NULL))) + BUG(); + } else if (unlikely(HYPERVISOR_mmu_update(mmu, mmu_count, + NULL, DOMID_SELF) < 0)) + BUG(); +} + +void xen_l2_entry_update(pmd_t *ptr, pmd_t val) +{ + mmu_update_t u; + struct page *page = NULL; + + if (likely(pmd_present(val)) && likely(!pmd_large(val)) + && likely(mem_map) + && likely(PagePinned(virt_to_page(ptr)))) { + page = pmd_page(val); + if (unlikely(PagePinned(page))) + page = NULL; + else if (PageHighMem(page)) { +#ifdef CONFIG_HIGHPTE + BUG(); +#endif + kmap_flush_unused(); + page = NULL; + } + } + u.ptr = virt_to_machine(ptr); + u.val = __pmd_val(val); + do_lN_entry_update(&u, 1, page); +} + +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +void xen_l3_entry_update(pud_t *ptr, pud_t val) +{ + mmu_update_t u; + struct page *page = NULL; + + if (likely(pud_present(val)) +#ifdef CONFIG_X86_64 + && likely(!pud_large(val)) +#endif + && likely(mem_map) + && likely(PagePinned(virt_to_page(ptr)))) { + page = pud_page(val); + if (unlikely(PagePinned(page))) + page = NULL; + } + u.ptr = virt_to_machine(ptr); + u.val = __pud_val(val); + do_lN_entry_update(&u, 1, page); +} +#endif + +#ifdef CONFIG_X86_64 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) +{ + mmu_update_t u[2]; + struct page *page = NULL; + + if (likely(pgd_present(val)) && likely(mem_map) + && likely(PagePinned(virt_to_page(ptr)))) { + page = pgd_page(val); + if (unlikely(PagePinned(page))) + page = NULL; + } + u[0].ptr = virt_to_machine(ptr); + u[0].val = __pgd_val(val); + if (((unsigned long)ptr & ~PAGE_MASK) + <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) { + ptr = __user_pgd(ptr); + BUG_ON(!ptr); + u[1].ptr = virt_to_machine(ptr); + u[1].val = __pgd_val(val); + do_lN_entry_update(u, 2, page); + } else + do_lN_entry_update(u, 1, page); +} +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_64 +void xen_pt_switch(pgd_t *pgd) +{ + struct mmuext_op op; + op.cmd = MMUEXT_NEW_BASEPTR; + op.arg1.mfn = virt_to_mfn(pgd); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_new_user_pt(pgd_t *pgd) +{ + struct mmuext_op op; + + pgd = __user_pgd(pgd); + op.cmd = MMUEXT_NEW_USER_BASEPTR; + op.arg1.mfn = pgd ? virt_to_mfn(pgd) : 0; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +#endif + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL(xen_tlb_flush); + +void xen_invlpg(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_LOCAL; + op.arg1.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL(xen_invlpg); + +#ifdef CONFIG_SMP + +void xen_tlb_flush_all(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_ALL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_tlb_flush_all); + +void xen_tlb_flush_mask(const cpumask_t *mask) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_TLB_FLUSH_MULTI; + set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask)); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_tlb_flush_mask); + +void xen_invlpg_all(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.arg1.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_invlpg_all); + +void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_INVLPG_MULTI; + op.arg1.linear_addr = ptr & PAGE_MASK; + set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask)); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_invlpg_mask); + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_X86_64 +#define NR_PGD_PIN_OPS 2 +#else +#define NR_PGD_PIN_OPS 1 +#endif + +void xen_pgd_pin(pgd_t *pgd) +{ + struct mmuext_op op[NR_PGD_PIN_OPS]; + unsigned int nr = NR_PGD_PIN_OPS; + + op[0].cmd = MMUEXT_PIN_L3_TABLE; + op[0].arg1.mfn = virt_to_mfn(pgd); +#ifdef CONFIG_X86_64 + op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE; + pgd = __user_pgd(pgd); + if (pgd) + op[1].arg1.mfn = virt_to_mfn(pgd); + else + nr = 1; +#endif + if (HYPERVISOR_mmuext_op(op, nr, NULL, DOMID_SELF) < 0) + BUG(); +} + +void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op op[NR_PGD_PIN_OPS]; + unsigned int nr = NR_PGD_PIN_OPS; + + op[0].cmd = MMUEXT_UNPIN_TABLE; + op[0].arg1.mfn = virt_to_mfn(pgd); +#ifdef CONFIG_X86_64 + pgd = __user_pgd(pgd); + if (pgd) { + op[1].cmd = MMUEXT_UNPIN_TABLE; + op[1].arg1.mfn = virt_to_mfn(pgd); + } else + nr = 1; +#endif + if (HYPERVISOR_mmuext_op(op, nr, NULL, DOMID_SELF) < 0) + BUG(); +} + +void xen_set_ldt(const void *ptr, unsigned int ents) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.arg1.linear_addr = (unsigned long)ptr; + op.arg2.nr_ents = ents; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +/* Protected by balloon_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1< MAX_CONTIG_ORDER)) + return -ENOMEM; + +#ifdef CONFIG_64BIT + if (unlikely(vstart > PAGE_OFFSET + MAXMEM)) { + unsigned int level; + + if (vstart < __START_KERNEL_map + || vstart + (PAGE_SIZE << order) > _brk_end) + return -EINVAL; + ptep = lookup_address((unsigned long)__va(__pa(vstart)), + &level); + if (ptep && pte_none(*ptep)) + ptep = NULL; + if (vstart < __START_KERNEL && ptep) + return -EINVAL; + if (order > MAX_CONTIG_ORDER - 1) + return -ENOMEM; + } +#else + if (unlikely(vstart + (PAGE_SIZE << order) > (unsigned long)high_memory)) + return -EINVAL; +#endif + + set_xen_guest_handle(exchange.in.extent_start, in_frames); + set_xen_guest_handle(exchange.out.extent_start, &out_frame); + + scrub_pages((void *)vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + for (i = 0; i < (1U<> PAGE_SHIFT) + i); + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), + __pte_ma(0), 0); +#ifdef CONFIG_64BIT + if (ptep) + MULTI_update_va_mapping(cr_mcl + i + (1U << order), + (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE), + __pte_ma(0), 0); +#endif + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, + INVALID_P2M_ENTRY); + } +#ifdef CONFIG_64BIT + if (ptep) + i += i; +#endif + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + /* 2. Get a new contiguous memory extent. */ + out_frame = __pa(vstart) >> PAGE_SHIFT; + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == (1UL << order)); + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + /* Compatibility when XENMEM_exchange is unsupported. */ + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &exchange.in) != (1UL << order)) + BUG(); + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.out) == 1); + if (!success) { + /* Couldn't get special memory: fall back to normal. */ + for (i = 0; i < (1U<>PAGE_SHIFT) + i; + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.in) != (1UL<>PAGE_SHIFT)+i, frame); + } +#ifdef CONFIG_64BIT + if (ptep) + i += i; +#endif + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order + ? UVMF_TLB_FLUSH|UVMF_ALL + : UVMF_INVLPG|UVMF_ALL; + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + balloon_unlock(flags); + + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long frame, flags; + unsigned int i; + int rc, success; + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = 1, + .extent_order = order, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = 1UL << order, + .extent_order = 0, + .domid = DOMID_SELF + } + }; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + set_xen_guest_handle(exchange.in.extent_start, &in_frame); + set_xen_guest_handle(exchange.out.extent_start, out_frames); + + scrub_pages((void *)vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT); + + /* 2. Zap current PTEs. */ + for (i = 0; i < (1U<>PAGE_SHIFT)+i, + INVALID_P2M_ENTRY); + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i; + } + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + /* 3. Do the exchange for non-contiguous MFNs. */ + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == 1); + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + /* Compatibility when XENMEM_exchange is unsupported. */ + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &exchange.in) != 1) + BUG(); + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.out) != (1UL << order)) + BUG(); + success = 1; + } +#endif + + /* 4. Map new pages in place of old pages. */ + for (i = 0; i < (1U<>PAGE_SHIFT)+i, frame); + } + + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order + ? UVMF_TLB_FLUSH|UVMF_ALL + : UVMF_INVLPG|UVMF_ALL; + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + balloon_unlock(flags); + + if (unlikely(!success)) { + /* Try hard to get the special memory back to Xen. */ + exchange.in.extent_order = 0; + set_xen_guest_handle(exchange.in.extent_start, &in_frame); + + for (i = 0; i < (1U<> PAGE_SHIFT; + set_phys_to_machine(pfn, frame); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + mmu.ptr = ((uint64_t)frame << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu.val = pfn; + cr_mcl[j].op = __HYPERVISOR_mmu_update; + cr_mcl[j].args[0] = (unsigned long)&mmu; + cr_mcl[j].args[1] = 1; + cr_mcl[j].args[2] = 0; + cr_mcl[j].args[3] = DOMID_SELF; + ++j; + } + + cr_mcl[j].op = __HYPERVISOR_memory_op; + cr_mcl[j].args[0] = XENMEM_decrease_reservation; + cr_mcl[j].args[1] = (unsigned long)&exchange.in; + + if (HYPERVISOR_multicall(cr_mcl, j + 1)) + BUG(); + BUG_ON(cr_mcl[j].result != 1); + while (j--) + BUG_ON(cr_mcl[j].result != 0); + + balloon_unlock(flags); + + free_empty_pages(&page, 1); + + in_frame++; + vstart += PAGE_SIZE; + } + } +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +int __init early_create_contiguous_region(unsigned long pfn, + unsigned int order, + unsigned int address_bits) +{ + unsigned long *in_frames = discontig_frames, out_frame = pfn; + unsigned int i; + int rc, success; + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = 1UL << order, + .extent_order = 0, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = 1, + .extent_order = order, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + for (i = 0; i < (1U << order); ++i) { + in_frames[i] = pfn_to_mfn(pfn + i); + set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY); + } + + set_xen_guest_handle(exchange.in.extent_start, in_frames); + set_xen_guest_handle(exchange.out.extent_start, &out_frame); + + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == (1UL << order)); + BUG_ON(!success && (exchange.nr_exchanged || !rc)); + BUG_ON(success && rc); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + /* Compatibility when XENMEM_exchange is unavailable. */ + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &exchange.in) != (1UL << order)) + BUG(); + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.out) == 1); + if (!success) { + for (i = 0; i < (1U << order); ++i) + in_frames[i] = pfn + i; + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.in) != (1UL << order)) + BUG(); + } + } +#endif + + for (i = 0; i < (1U << order); ++i, ++out_frame) { + if (!success) + out_frame = in_frames[i]; + set_phys_to_machine(pfn + i, out_frame); + } + + return success ? 0 : -ENOMEM; +} + +static void undo_limit_pages(struct page *pages, unsigned int order) +{ + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + BUG_ON(order > MAX_CONTIG_ORDER); + xen_limit_pages_to_max_mfn(pages, order, 0); + ClearPageForeign(pages); + __free_pages(pages, order); +} + +int xen_limit_pages_to_max_mfn( + struct page *pages, unsigned int order, unsigned int address_bits) +{ + unsigned long flags, frame; + unsigned long *in_frames = discontig_frames, *out_frames = limited_frames; + struct page *page; + unsigned int i, n, nr_mcl; + int rc, success; + DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER); + + struct xen_memory_exchange exchange = { + .in = { + .extent_order = 0, + .domid = DOMID_SELF + }, + .out = { + .extent_order = 0, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + if (address_bits) { + if (address_bits < PAGE_SHIFT) + return -EINVAL; + bitmap_zero(limit_map, 1U << order); + } else if (order) { + BUILD_BUG_ON(sizeof(pages->index) != sizeof(*limit_map)); + for (i = 0; i < BITS_TO_LONGS(1U << order); ++i) + limit_map[i] = pages[i + 1].index; + } else + __set_bit(0, limit_map); + + set_xen_guest_handle(exchange.in.extent_start, in_frames); + set_xen_guest_handle(exchange.out.extent_start, out_frames); + + /* 0. Scrub the pages. */ + for (i = 0, n = 0; i < 1U<> (address_bits - PAGE_SHIFT))) + continue; + __set_bit(i, limit_map); + } + + if (!PageHighMem(page)) + scrub_pages(page_address(page), 1); +#ifdef CONFIG_XEN_SCRUB_PAGES + else { + scrub_pages(kmap(page), 1); + kunmap(page); + ++n; + } +#endif + } + if (bitmap_empty(limit_map, 1U << order)) + return 0; + + if (n) + kmap_flush_unused(); + + balloon_lock(flags); + + /* 1. Zap current PTEs (if any), remembering MFNs. */ + for (i = 0, n = 0, nr_mcl = 0; i < (1U<index)); + for (i = 0; i < BITS_TO_LONGS(1U << order); ++i) + pages[i + 1].index = limit_map[i]; + } + SetPageForeign(pages, undo_limit_pages); + } + + return 0; +} +EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn); + +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc) +{ + maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry); + return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc); +} + +int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, + int type) +{ + maddr_t mach_gp = arbitrary_virt_to_machine(gdt + entry); + return HYPERVISOR_update_descriptor(mach_gp, *(const u64*)desc); +} --- linux-ec2-2.6.32.orig/arch/x86/mm/setup_nx.c +++ linux-ec2-2.6.32/arch/x86/mm/setup_nx.c @@ -53,6 +53,7 @@ #else void set_nx(void) { + nx_enabled = _PAGE_NX && ((__supported_pte_mask & _PAGE_NX) == _PAGE_NX); } #endif --- linux-ec2-2.6.32.orig/arch/x86/mm/pageattr.c +++ linux-ec2-2.6.32/arch/x86/mm/pageattr.c @@ -56,12 +56,10 @@ void update_page_count(int level, unsigned long pages) { - unsigned long flags; - /* Protect against CPA */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); direct_pages_count[level] += pages; - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } static void split_page_count(int level) @@ -354,7 +352,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; int i, do_split = 1; @@ -363,7 +361,7 @@ if (cpa->force_split) return 1; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: @@ -458,14 +456,14 @@ } out_unlock: - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return do_split; } static int split_large_page(pte_t *kpte, unsigned long address) { - unsigned long flags, pfn, pfninc = 1; + unsigned long pfn, pfninc = 1; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; @@ -479,7 +477,7 @@ if (!base) return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: @@ -551,7 +549,7 @@ */ if (base) __free_page(base); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return 0; } --- linux-ec2-2.6.32.orig/arch/x86/mm/mmap.c +++ linux-ec2-2.6.32/arch/x86/mm/mmap.c @@ -87,9 +87,9 @@ */ if (current->flags & PF_RANDOMIZE) { if (mmap_is_ia32()) - rnd = (long)get_random_int() % (1<<8); + rnd = get_random_int() % (1<<8); else - rnd = (long)(get_random_int() % (1<<28)); + rnd = get_random_int() % (1<<28); } return rnd << PAGE_SHIFT; } @@ -131,6 +131,11 @@ } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; +#ifdef CONFIG_X86_32 + if (!nx_enabled && !(current->personality & READ_IMPLIES_EXEC) + && mmap_is_ia32()) + mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; +#endif mm->unmap_area = arch_unmap_area_topdown; } } --- linux-ec2-2.6.32.orig/arch/x86/mm/fault-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/fault-xen.c @@ -0,0 +1,1175 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar + */ +#include /* STACK_END_MAGIC */ +#include /* test_thread_flag(), ... */ +#include /* oops_begin/end, ... */ +#include /* search_exception_table */ +#include /* max_low_pfn */ +#include /* __kprobes, ... */ +#include /* kmmio_handler, ... */ +#include /* perf_sw_event */ + +#include /* dotraplinkage, ... */ +#include /* pgd_*(), ... */ +#include /* kmemcheck_*(), ... */ + +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + */ +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; + +/* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: + */ +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) +{ + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; + return 0; +} + +static inline int notify_page_fault(struct pt_regs *regs) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (kprobes_built_in() && !user_mode_vm(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +} + +/* + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * 64-bit mode: + * + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. + */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +{ + unsigned char *max_instr; + unsigned char *instr; + int prefetch = 0; + + /* + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ + if (error_code & PF_INSTR) + return 0; + + instr = (void *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (instr < max_instr) { + unsigned char opcode; + + if (probe_kernel_address(instr, opcode)) + break; + + instr++; + + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) + break; + } + return prefetch; +} + +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; + + force_sig_info(si_signo, &info, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) +#if CONFIG_XEN_COMPAT > 0x030002 + set_pmd(pmd, *pmd_k); +#else + /* + * When running on older Xen we must launder *pmd_k through + * pmd_val() to ensure that _PAGE_PRESENT is correctly set. + */ + set_pmd(pmd, __pmd(pmd_val(*pmd_k))); +#endif + else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + struct page *page; + + spin_lock_irq(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock(&pgd_lock); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +static bool low_pfn(unsigned long pfn) +{ + return pfn < max_low_pfn; +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; + +#ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", __pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; +#endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)__pmd_val(*pmd)); + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already: + */ + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + printk(KERN_CONT "*pte = %0*Lx ", sizeof(*pte) * 2, (u64)__pte_val(*pte)); +out: + printk(KERN_CONT "\n"); +} +#define dump_pagetable(addr, krnl) dump_pagetable(addr) + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock(&pgd_lock); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address, bool kernel) +{ + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (!kernel) + pgd = __user_pgd(base) + pgd_index(address); + + if (bad_address(pgd)) + goto bad; + + printk("PGD %lx ", pgd_val(*pgd)); + + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) + goto bad; + + printk(KERN_CONT "PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) + goto bad; + + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) + goto bad; + + printk(KERN_CONT "PTE %lx", pte_val(*pte)); +out: + printk(KERN_CONT "\n"); + return; +bad: + printk("BAD\n"); +} + +#endif /* CONFIG_X86_64 */ + +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if (address != regs->ip) + return 0; + + if ((address >> 32) != 0) + return 0; + + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + printk_once(errata93_warning); + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) + return 1; +#endif + return 0; +} + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + + /* + * Pentium F0 0F C7 C8 bug workaround: + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (!oops_may_print()) + return; + + if (error_code & PF_INSTR) { + unsigned int level; + + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); + } + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); + + printk(KERN_CONT " at %p\n", (void *) address); + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); + + dump_pagetable(address, !(error_code & PF_USER)); +} + +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + tsk->comm, address); + dump_pagetable(address, !(error_code & PF_USER)); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + + if (__die("Bad pagetable", regs, error_code)) + sig = 0; + + oops_end(flags, regs, sig); +} + +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct task_struct *tsk = current; + unsigned long *stackend; + unsigned long flags; + int sig; + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: + * + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice: + */ + flags = oops_begin(); + + show_fault_oops(regs, error_code, address); + + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + + oops_end(flags, regs, sig); +} + +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here: + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed): + */ + up_read(¤t->mm->mmap_sem); + + pagefault_out_of_memory(); +} + +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + unsigned int fault) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + int code = BUS_ADRERR; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + no_context(regs, error_code, address); + return; + } + + /* User-space => ok to do another page fault: */ + if (is_prefetch(regs, error_code, address)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + +#ifdef CONFIG_MEMORY_FAILURE + if (fault & VM_FAULT_HWPOISON) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + code = BUS_MCEERR_AR; + } +#endif + force_sig_info_fault(SIGBUS, code, address, tsk); +} + +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + + out_of_memory(regs, error_code, address); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + do_sigbus(regs, error_code, address, fault); + else + BUG(); + } +} + +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & (PF_USER | PF_RSVD)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + + return ret; +} + +int show_unhandled_signals = 1; + +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + + return 0; +} + +static int fault_in_kernel_space(unsigned long address) +{ + return address >= TASK_SIZE_MAX; +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + unsigned long address; + struct mm_struct *mm; + int write; + int fault; + + /* Set the "privileged fault" bit to something sane. */ + if (user_mode_vm(regs)) + error_code |= PF_USER; + else + error_code &= ~PF_USER; + + tsk = current; + mm = tsk->mm; + + /* Get the faulting address: */ + address = read_cr2(); + + /* + * Detect and handle instructions that would cause a page fault for + * both a tracked kernel page and a userspace page. + */ + if (kmemcheck_active(regs)) + kmemcheck_hide(regs); + prefetchw(&mm->mmap_sem); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ + if (unlikely(fault_in_kernel_space(address))) { + /* Faults in hypervisor area can never be patched up. */ +#if defined(CONFIG_X86_XEN) + if (address >= hypervisor_virt_start) { +#elif defined(CONFIG_X86_64_XEN) + if (address >= HYPERVISOR_VIRT_START + && address < HYPERVISOR_VIRT_END) { +#endif + bad_area_nosemaphore(regs, error_code, address); + return; + } + + if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + + if (kmemcheck_fault(regs, address, error_code)) + return; + } + + /* Can handle a stale RO->RW TLB: */ + if (spurious_fault(error_code, address)) + return; + + /* kprobes don't want to hook the spurious faults: */ + if (notify_page_fault(regs)) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, error_code, address); + + return; + } + + /* kprobes don't want to hook the spurious faults: */ + if (unlikely(notify_page_fault(regs))) + return; + /* + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet: + */ + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(regs, error_code, address); + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + + /* + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: + */ + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + + /* + * When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: + */ + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); + } + + vma = find_vma(mm, address); + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) + goto good_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535, $31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } + } + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + write = error_code & PF_WRITE; + + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault: + */ + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, fault); + return; + } + + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + + check_v8086_mode(regs, address, tsk); + + up_read(&mm->mmap_sem); +} --- linux-ec2-2.6.32.orig/arch/x86/mm/pageattr-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/pageattr-xen.c @@ -0,0 +1,1449 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long *vaddr; + pgprot_t mask_set; + pgprot_t mask_clr; + int numpages; + int flags; + unsigned long pfn; + unsigned force_split : 1; + int curpage; + struct page **pages; +}; + +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 +#define CPA_PAGES_ARRAY 4 + +#ifdef CONFIG_PROC_FS +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void update_page_count(int level, unsigned long pages) +{ + /* Protect against CPA */ + spin_lock(&pgd_lock); + direct_pages_count[level] += pages; + spin_unlock(&pgd_lock); +} + +static void split_page_count(int level) +{ + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; +} + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + seq_printf(m, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + seq_printf(m, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif +#ifdef CONFIG_X86_64 + if (direct_gbpages) + seq_printf(m, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); +#endif +} +#else +static inline void split_page_count(int level) { } +#endif + +#ifdef CONFIG_X86_64 + +static inline unsigned long highmap_start_pfn(void) +{ + return __pa(_text) >> PAGE_SHIFT; +} + +static inline unsigned long highmap_end_pfn(void) +{ + return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; +} + +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @addr: virtual start address + * @size: number of bytes to flush + * + * clflush is an unordered instruction which needs fencing with mfence + * to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + void *vend = vaddr + size - 1; + + mb(); + + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) + clflush(vaddr); + /* + * Flush any possible final partial cacheline: + */ + clflush(vend); + + mb(); +} +EXPORT_SYMBOL_GPL(clflush_cache_range); + +static void __cpa_flush_all(void *arg) +{ + unsigned long cache = (unsigned long)arg; + + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (cache && boot_cpu_data.x86 >= 4) + wbinvd(); +} + +static void cpa_flush_all(unsigned long cache) +{ + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, (void *) cache, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled()); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (__pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +static void cpa_flush_array(unsigned long *start, int numpages, int cache, + int in_flags, struct page **pages) +{ + unsigned int i, level; + unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); + + if (!cache || do_wbinvd) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0; i < numpages; i++) { + unsigned long addr; + pte_t *pte; + + if (in_flags & CPA_PAGES_ARRAY) + addr = (unsigned long)page_address(pages[i]); + else + addr = start[i]; + + pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (__pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *)addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, + unsigned long pfn) +{ + pgprot_t forbidden = __pgprot(0); + +#ifndef CONFIG_XEN + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. + */ + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_NX; +#endif + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on. On + * 64bit we do not enforce !NX on the low mapping + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The .rodata section needs to be read-only. Using the pfn + * catches all aliases. + */ + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_RW; + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + + return pte_offset_kernel(pmd, address); +} +EXPORT_SYMBOL_GPL(lookup_address); + +/* + * Set the new pmd in all the pgds we know about: + */ +static void __set_pmd_pte(pte_t *kpte, unsigned long address, + unsigned int level, pte_t pte) +{ + /* change init_mm */ + switch(level) { + case PG_LEVEL_2M: + xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte))); + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte))); + break; +#endif + default: + BUG(); + } +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte))); + } + } +#endif +} + +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot; + int i, do_split = 1; + unsigned int level; + + if (cpa->force_split) + return 1; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + psize = PUD_PAGE_SIZE; + pmask = PUD_PAGE_MASK; + break; +#endif + default: + do_split = -EINVAL; + goto out_unlock; + } + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + */ + old_pte = *kpte; + old_prot = new_prot = pte_pgprot(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + /* + * old_pte points to the large page base address. So we need + * to add the offset of the virtual address: + */ + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + cpa->pfn = pfn; + + new_prot = static_protections(new_prot, address, pfn); + + /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + if (pfn < max_mapnr) { + addr = address + PAGE_SIZE; + for (i = 1; i < cpa->numpages && ++pfn < max_mapnr; + i++, addr += PAGE_SIZE) { + pgprot_t chk_prot = static_protections(new_prot, addr, pfn); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + } + + /* + * If there are no changes, return. maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + do_split = 0; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot)); + __set_pmd_pte(kpte, address, level, new_pte); + cpa->flags |= CPA_FLUSHTLB; + do_split = 0; + } + +out_unlock: + spin_unlock(&pgd_lock); + + return do_split; +} + +static int split_large_page(pte_t *kpte, unsigned long address) +{ + unsigned long mfn, mfninc = 1; + unsigned int i, level; + pte_t *pbase, *tmp; + pgprot_t ref_prot; + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + pbase = (pte_t *)page_address(base); + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); + +#ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { + mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pgprot_val(ref_prot) |= _PAGE_PSE; + } +#endif + + if (address >= (unsigned long)__va(0) && + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); + +#ifdef CONFIG_X86_64 + if (address >= (unsigned long)__va(1UL<<32) && + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); +#endif + + /* + * Get the target mfn from the original entry: + */ + mfn = __pte_mfn(*kpte); + for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc) + set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot)); + + /* + * Install the new, split up pagetable. + * + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: + */ + if (!xen_feature(XENFEAT_writable_page_tables) && + HYPERVISOR_update_va_mapping((unsigned long)pbase, + mk_pte(base, PAGE_KERNEL_RO), 0)) + BUG(); + __set_pmd_pte(kpte, address, level, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Intel Atom errata AAH41 workaround. + * + * The real fix should be in hw or in a microcode update, but + * we also probabilistically try to reduce the window of having + * a large TLB mixed with 4K TLBs while instruction fetches are + * going on. + */ + __flush_tlb_all(); + + base = NULL; + +out_unlock: + /* + * If we dropped out via the lookup_address check under + * pgd_lock then stick the page back into the pool: + */ + if (base) + __free_page(base); + spin_unlock(&pgd_lock); + + return 0; +} + +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + /* + * Ignore all non primary paths. + */ + if (!primary) + return 0; + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + +static int __change_page_attr(struct cpa_data *cpa, int primary) +{ + unsigned long address; + int do_split, err; + unsigned int level; + pte_t *kpte, old_pte; + + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; +repeat: + kpte = lookup_address(address, &level); + if (!kpte) + return __cpa_process_fault(cpa, address, primary); + + old_pte = *kpte; + if (!__pte_val(old_pte)) + return __cpa_process_fault(cpa, address, primary); + + if (level == PG_LEVEL_4K) { + pte_t new_pte; + pgprot_t new_prot = pte_pgprot(old_pte); + unsigned long mfn = __pte_mfn(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + new_prot = static_protections(new_prot, address, + mfn_to_local_pfn(mfn)); + + /* + * We need to keep the mfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot)); + cpa->pfn = mfn_to_local_pfn(mfn); + /* + * Do we really change anything ? + */ + if (__pte_val(old_pte) != __pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flags |= CPA_FLUSHTLB; + } + cpa->numpages = 1; + return 0; + } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = try_preserve_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; + + /* + * We have to split the large page: + */ + err = split_large_page(kpte, address); + if (!err) { + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); + goto repeat; + } + + return err; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); + +static int cpa_process_alias(struct cpa_data *cpa) +{ + struct cpa_data alias_cpa; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; + + if (cpa->pfn >= max_pfn_mapped) + return 0; + +#ifdef CONFIG_X86_64 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) + return 0; +#endif + /* + * No need to redo, when the primary call touched the direct + * mapping already: + */ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { + + alias_cpa = *cpa; + alias_cpa.vaddr = &laddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + +#ifdef CONFIG_X86_64 + /* + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif + + return 0; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) +{ + int ret, numpages = cpa->numpages; + + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa->numpages = 1; + + if (!debug_pagealloc) + spin_lock(&cpa_lock); + ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + if (ret) + return ret; + + if (checkalias) { + ret = cpa_process_alias(cpa); + if (ret) + return ret; + } + + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + + } + return 0; +} + +static inline int cache_attr(pgprot_t attr) +{ + return pgprot_val(attr) & + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); +} + +static int change_page_attr_set_clr(unsigned long *addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split, int in_flag, + struct page **pages) +{ + struct cpa_data cpa; + int ret, cache, checkalias; + unsigned long baddr = 0; + + /* + * Check, if we are requested to change a not supported + * feature: + */ + mask_set = canon_pgprot(mask_set); + mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) + return 0; + + /* Ensure we are PAGE_SIZE aligned */ + if (in_flag & CPA_ARRAY) { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } + } else if (!(in_flag & CPA_PAGES_ARRAY)) { + /* + * in_flag of CPA_PAGES_ARRAY implies it is aligned. + * No need to cehck in that case + */ + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + /* + * Save address for cache flush. *addr is modified in the call + * to __change_page_attr_set_clr() below. + */ + baddr = *addr; + } + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + + vm_unmap_aliases(); + + cpa.vaddr = addr; + cpa.pages = pages; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flags = 0; + cpa.curpage = 0; + cpa.force_split = force_split; + + if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa.flags |= in_flag; + + /* No alias checking for _NX bit modifications */ + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + + ret = __change_page_attr_set_clr(&cpa, checkalias); + + /* + * Check whether we really changed something: + */ + if (!(cpa.flags & CPA_FLUSHTLB)) + goto out; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = cache_attr(mask_set); + + /* + * On success we use clflush, when the CPU supports it to + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): + */ + if (!ret && cpu_has_clflush) { + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(addr, numpages, cache, + cpa.flags, pages); + } else + cpa_flush_range(baddr, numpages, cache); + } else + cpa_flush_all(cache); + +out: + return ret; +} + +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int cpa_set_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, + CPA_PAGES_ARRAY, pages); +} + +static inline int cpa_clear_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, + CPA_PAGES_ARRAY, pages); +} + +#ifdef CONFIG_XEN +static void _free_memtype(u64 pstart, u64 pend) +{ + u64 pa = pstart &= __PHYSICAL_MASK; + u64 ma = phys_to_machine(pa); + + while ((pa += PAGE_SIZE) < pend) { + if (phys_to_machine(pa) != ma + (pa - pstart)) { + free_memtype(ma, ma + (pa - pstart)); + pstart = pa; + ma = phys_to_machine(pa); + } + } + free_memtype(ma, ma + (pend - pstart)); +} +#define free_memtype _free_memtype + +static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type) +{ + u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur; + u64 ma = phys_to_machine(pa); + int rc = 0; + + while ((pa += PAGE_SIZE) < pend) { + if (phys_to_machine(pa) != ma + (pa - pcur)) { + rc = reserve_memtype(ma, ma + (pa - pcur), + req_type, NULL); + if (rc) + break; + pcur = pa; + ma = phys_to_machine(pa); + } + } + if (likely(!rc)) + rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL); + + if (unlikely(!rc) && pstart < pcur) + _free_memtype(pstart, pcur); + + return rc; +} +#define reserve_memtype(s, e, r, n) \ + _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r)) +#endif + +int _set_memory_uc(unsigned long addr, int numpages) +{ + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + int ret; + + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_err; + + ret = _set_memory_uc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; +} +EXPORT_SYMBOL(set_memory_uc); + +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + int i, j; + int ret; + + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_free; + } + + ret = change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); + if (ret) + goto out_free; + + return 0; + +out_free: + for (j = 0; j < i; j++) + free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL(set_memory_array_uc); + +int _set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + unsigned long addr_copy = addr; + + ret = change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); + if (!ret) { + ret = change_page_attr_set_clr(&addr_copy, numpages, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); + } + return ret; +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + + if (!pat_enabled) + return set_memory_uc(addr, numpages); + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL); + if (ret) + goto out_err; + + ret = _set_memory_wc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wb(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + int ret; + + ret = _set_memory_wb(addr, numpages); + if (ret) + return ret; + + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + return 0; +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + int ret; + + ret = change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); + if (ret) + return ret; + + for (i = 0; i < addrinarray; i++) + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + + return 0; +} +EXPORT_SYMBOL(set_memory_array_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); +} +EXPORT_SYMBOL_GPL(set_memory_ro); + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); +} +EXPORT_SYMBOL_GPL(set_memory_rw); + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0, NULL); +} + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +int set_pages_array_uc(struct page **pages, int addrinarray) +{ + unsigned long start; + unsigned long end; + int i; + int free_idx; + + for (i = 0; i < addrinarray; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + goto err_out; + } + + if (cpa_set_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) { + return 0; /* Success */ + } +err_out: + free_idx = i; + for (i = 0; i < free_idx; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + free_memtype(start, end); + } + return -EINVAL; +} +EXPORT_SYMBOL(set_pages_array_uc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_array_wb(struct page **pages, int addrinarray) +{ + int retval; + unsigned long start; + unsigned long end; + int i; + + retval = cpa_clear_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_MASK)); + if (retval) + return retval; + + for (i = 0; i < addrinarray; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + free_memtype(start, end); + } + + return 0; +} +EXPORT_SYMBOL(set_pages_array_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0), + .flags = 0}; + + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; + + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * If page allocator is not up yet then do not call c_p_a(): + */ + if (!debug_pagealloc_enabled) + return; + + /* + * The return value is ignored as the calls cannot fail. + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu: + */ + __flush_tlb_all(); +} + +#ifdef CONFIG_HIBERNATION + +bool kernel_page_present(struct page *page) +{ + unsigned int level; + pte_t *pte; + + if (PageHighMem(page)) + return false; + + pte = lookup_address((unsigned long)page_address(page), &level); + return (__pte_val(*pte) & _PAGE_PRESENT); +} + +#endif /* CONFIG_HIBERNATION */ + +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +static inline int in_secondary_range(unsigned long va) +{ +#ifdef CONFIG_X86_64 + return va >= VMALLOC_START && va < VMALLOC_END; +#else + return va >= (unsigned long)high_memory; +#endif +} + +static void __make_page_readonly(unsigned long va) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K); + if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0)) + BUG(); + if (in_secondary_range(va)) { + unsigned long pfn = pte_pfn(*pte); + +#ifdef CONFIG_HIGHMEM + if (pfn >= highstart_pfn) + kmap_flush_unused(); /* flush stale writable kmaps */ + else +#endif + __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT)); + } +} + +static void __make_page_writable(unsigned long va) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K); + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG)) + BUG(); + if (in_secondary_range(va)) { + unsigned long pfn = pte_pfn(*pte); + +#ifdef CONFIG_HIGHMEM + if (pfn < highstart_pfn) +#endif + __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT)); + } +} + +void make_page_readonly(void *va, unsigned int feature) +{ + if (!xen_feature(feature)) + __make_page_readonly((unsigned long)va); +} + +void make_page_writable(void *va, unsigned int feature) +{ + if (!xen_feature(feature)) + __make_page_writable((unsigned long)va); +} + +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) +{ + unsigned long addr; + + if (xen_feature(feature)) + return; + + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE) + __make_page_readonly(addr); +} + +void make_pages_writable(void *va, unsigned int nr, unsigned int feature) +{ + unsigned long addr; + + if (xen_feature(feature)) + return; + + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE) + __make_page_writable(addr); +} + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif --- linux-ec2-2.6.32.orig/arch/x86/mm/srat_64.c +++ linux-ec2-2.6.32/arch/x86/mm/srat_64.c @@ -229,9 +229,11 @@ printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - if (changed) + if (changed) { + node_set(node, cpu_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + } } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ --- linux-ec2-2.6.32.orig/arch/x86/mm/pgtable.c +++ linux-ec2-2.6.32/arch/x86/mm/pgtable.c @@ -6,6 +6,14 @@ #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return (pte_t *)__get_free_page(PGALLOC_GFP); @@ -15,16 +23,29 @@ { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = alloc_pages(__userpte_alloc_gfp, 0); if (pte) pgtable_page_ctor(pte); return pte; } +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); @@ -89,14 +110,12 @@ static void pgd_dtor(pgd_t *pgd) { - unsigned long flags; /* can be called from interrupt context */ - if (SHARED_KERNEL_PMD) return; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -228,7 +247,6 @@ { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - unsigned long flags; pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); @@ -248,12 +266,12 @@ * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_ctor(pgd); pgd_prepopulate_pmd(mm, pgd, pmds); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return pgd; --- linux-ec2-2.6.32.orig/arch/x86/mm/highmem_32-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/highmem_32-xen.c @@ -0,0 +1,186 @@ +#include +#include +#include /* for totalram_pages */ + +void *kmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_high(page); +} + +void kunmap(struct page *page) +{ + if (in_interrupt()) + BUG(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + pagefault_disable(); + + if (!PageHighMem(page)) + return page_address(page); + + debug_kmap_atomic(type); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte-idx))); + set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); + + return (void *)vaddr; +} + +void *kmap_atomic(struct page *page, enum km_type type) +{ + return kmap_atomic_prot(page, type, kmap_prot); +} + +void kunmap_atomic(void *kvaddr, enum km_type type) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + /* + * Force other mappings to Oops if they'll try to access this pte + * without first remap it. Keeping stale mappings around is a bad idea + * also, in case the page changes cacheability attributes or becomes + * a protected page in a hypervisor. + */ + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + kpte_clear_flush(kmap_pte-idx, vaddr); + else { +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); +#endif + } + + pagefault_enable(); +} + +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +{ + return kmap_atomic_prot_pfn(pfn, type, kmap_prot); +} +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ + +struct page *kmap_atomic_to_page(void *ptr) +{ + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; + + if (vaddr < FIXADDR_START) + return virt_to_page(ptr); + + idx = virt_to_fix(vaddr); + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + return pte_page(*pte); +} + +void clear_highpage(struct page *page) +{ + void *kaddr; + + if (likely(xen_feature(XENFEAT_highmem_assist)) + && PageHighMem(page)) { + struct mmuext_op meo; + + meo.cmd = MMUEXT_CLEAR_PAGE; + meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page)); + if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0) + return; + } + + kaddr = kmap_atomic(page, KM_USER0); + clear_page(kaddr); + kunmap_atomic(kaddr, KM_USER0); +} + +void copy_highpage(struct page *to, struct page *from) +{ + void *vfrom, *vto; + + if (likely(xen_feature(XENFEAT_highmem_assist)) + && (PageHighMem(from) || PageHighMem(to))) { + unsigned long from_pfn = page_to_pfn(from); + unsigned long to_pfn = page_to_pfn(to); + struct mmuext_op meo; + + meo.cmd = MMUEXT_COPY_PAGE; + meo.arg1.mfn = pfn_to_mfn(to_pfn); + meo.arg2.src_mfn = pfn_to_mfn(from_pfn); + if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn + && mfn_to_pfn(meo.arg1.mfn) == to_pfn + && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0) + return; + } + + vfrom = kmap_atomic(from, KM_USER0); + vto = kmap_atomic(to, KM_USER1); + copy_page(vto, vfrom); + kunmap_atomic(vfrom, KM_USER0); + kunmap_atomic(vto, KM_USER1); +} + +EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(clear_highpage); +EXPORT_SYMBOL(copy_highpage); + +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + + /* XEN: init high-mem pages outside initial allocation. */ + if (zone_start_pfn < xen_start_info->nr_pages) + zone_start_pfn = xen_start_info->nr_pages; + for (; zone_start_pfn < zone_end_pfn; zone_start_pfn++) { + ClearPageReserved(pfn_to_page(zone_start_pfn)); + init_page_count(pfn_to_page(zone_start_pfn)); + } + } + totalram_pages += totalhigh_pages; +} --- linux-ec2-2.6.32.orig/arch/x86/mm/init-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/init-xen.c @@ -0,0 +1,456 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +unsigned long __meminitdata e820_table_start; +unsigned long __meminitdata e820_table_end; +unsigned long __meminitdata e820_table_top; + +int after_bootmem; + +#if !defined(CONFIG_XEN) +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; +#elif defined(CONFIG_X86_32) +#define direct_gbpages 0 +extern unsigned long extend_init_mapping(unsigned long tables_space); +#else +extern void xen_finish_init_mapping(void); +#endif + +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) +{ + unsigned long puds, pmds, ptes, tables; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (use_pse) { + unsigned long extra; + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ +#ifdef CONFIG_X86_32 + e820_table_start = extend_init_mapping(tables); + e820_table_end = e820_table_start; +#else /* CONFIG_X86_64 */ + if (!e820_table_top) { + e820_table_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + + xen_start_info->nr_pt_frames; + e820_table_end = e820_table_start; + } else { + /* + * [table_start, table_top) gets passed to reserve_early(), + * so we must not use table_end here, despite continuing + * to allocate from there. table_end possibly being below + * table_start is otoh not a problem. + */ + e820_table_start = e820_table_top; + } +#endif + if (e820_table_start == -1UL) + panic("Cannot find space for the kernel page tables"); + + e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int __meminit save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } + + /* big page (2M) range */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<> PAGE_SHIFT) \ + << PAGE_SHIFT) + __START_KERNEL_map)) + + if (!start) { + unsigned long addr, va = __START_KERNEL_map; + unsigned long *page = (unsigned long *)init_level4_pgt; + + /* Kill mapping of memory below _text. */ + while (va < (unsigned long)&_text) { + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } + + /* Blow away any spurious initial mappings. */ + va = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT); + + addr = page[pgd_index(va)]; + page = addr_to_page(addr); + addr = page[pud_index(va)]; + page = addr_to_page(addr); + while (pmd_index(va) | pte_index(va)) { + if (pmd_none(*(pmd_t *)&page[pmd_index(va)])) + break; + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } + } + + for (i = 0; i < nr_range; i++) + ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask); +#undef addr_to_page +#endif + +#ifdef CONFIG_X86_32 + early_ioremap_page_table_range_init(); +#endif + +#ifdef CONFIG_X86_64 + BUG_ON(e820_table_end > e820_table_top); + if (!start) + xen_finish_init_mapping(); + else +#endif + if (e820_table_end < e820_table_top) + /* Disable the 'table_end' allocator. */ + e820_table_top = e820_table_end; + + __flush_tlb_all(); + + if (!after_bootmem && e820_table_top > e820_table_start) { +#ifdef CONFIG_X86_64 + if (xen_start_info->mfn_list < __START_KERNEL_map + && e820_table_start <= xen_start_info->first_p2m_pfn + && e820_table_top > xen_start_info->first_p2m_pfn) { + reserve_early(e820_table_start << PAGE_SHIFT, + xen_start_info->first_p2m_pfn + << PAGE_SHIFT, + "PGTABLE"); + e820_table_start = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + } +#endif + reserve_early(e820_table_start << PAGE_SHIFT, + e820_table_top << PAGE_SHIFT, "PGTABLE"); + } + + if (!after_bootmem) + early_memtest(start, end); + + return ret >> PAGE_SHIFT; +} + + +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (mfn_to_local_pfn(pagenr) >= max_pfn) + return 1; + return 0; +} + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr = begin; + + if (addr >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); +#ifdef CONFIG_X86_64 + if (addr >= __START_KERNEL_map) { + /* make_readonly() reports all kernel addresses. */ + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)), + pfn_pte(__pa(addr) >> PAGE_SHIFT, + PAGE_KERNEL), + 0)) + BUG(); + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + BUG(); + } +#endif + free_page(addr); + totalram_pages++; + } +#endif +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif --- linux-ec2-2.6.32.orig/arch/x86/mm/physaddr.c +++ linux-ec2-2.6.32/arch/x86/mm/physaddr.c @@ -8,6 +8,10 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN +#define phys_base 0 +#endif + unsigned long __phys_addr(unsigned long x) { if (x >= __START_KERNEL_map) { --- linux-ec2-2.6.32.orig/arch/x86/mm/iomap_32-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/iomap_32-xen.c @@ -0,0 +1,112 @@ +/* + * Copyright © 2008 Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + */ + +#include +#include +#include +#include +#include + +static int is_io_mapping_possible(resource_size_t base, unsigned long size) +{ +#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; +#endif + return 1; +} + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); + +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + pagefault_disable(); + + debug_kmap_atomic(type); + idx = type + KM_TYPE_NR * smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot)); + /*arch_flush_lazy_mmu_mode();*/ + + return (void *)vaddr; +} + +/* + * Map 'mfn' using fixed map 'type' and protections 'prot' + */ +void * +iomap_atomic_prot_pfn(unsigned long mfn, enum km_type type, pgprot_t prot) +{ + /* + * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. + * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the + * MTRR is UC or WC. UC_MINUS gets the real intention, of the + * user, which is "WC if the MTRR is WC, UC if you can't do that." + */ + if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) + prot = PAGE_KERNEL_UC_MINUS; + + pgprot_val(prot) |= _PAGE_IOMAP; + return kmap_atomic_prot_pfn(mfn, type, prot); +} +EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); + +void +iounmap_atomic(void *kvaddr, enum km_type type) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + /* + * Force other mappings to Oops if they'll try to access this pte + * without first remap it. Keeping stale mappings around is a bad idea + * also, in case the page changes cacheability attributes or becomes + * a protected page in a hypervisor. + */ + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + kpte_clear_flush(kmap_pte-idx, vaddr); + + pagefault_enable(); +} +EXPORT_SYMBOL_GPL(iounmap_atomic); --- linux-ec2-2.6.32.orig/arch/x86/mm/init_64-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/init_64-xen.c @@ -0,0 +1,1303 @@ +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2002,2003 Andi Kleen + * + * Jun Nakajima + * Modified for Xen. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if CONFIG_XEN_COMPAT <= 0x030002 +unsigned int __kernel_page_user; +EXPORT_SYMBOL(__kernel_page_user); +#endif + +extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD]; +extern pte_t level1_fixmap_pgt[PTRS_PER_PTE]; + +/* + * Use this until direct mapping is established, i.e. before __va() is + * available in init_memory_mapping(). + */ + +#define addr_to_page(addr, page) \ + (addr) &= PHYSICAL_PAGE_MASK; \ + (page) = ((unsigned long *) ((unsigned long) \ + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \ + __START_KERNEL_map))) + +pmd_t *__init early_get_pmd(unsigned long va) +{ + unsigned long addr; + unsigned long *page = (unsigned long *)init_level4_pgt; + + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + + addr = page[pud_index(va)]; + addr_to_page(addr, page); + + return (pmd_t *)&page[pmd_index(va)]; +} + +void __meminit early_make_page_readonly(void *va, unsigned int feature) +{ + unsigned long addr, _va = (unsigned long)va; + pte_t pte, *ptep; + unsigned long *page = (unsigned long *) init_level4_pgt; + + BUG_ON(after_bootmem); + + if (xen_feature(feature)) + return; + + addr = (unsigned long) page[pgd_index(_va)]; + addr_to_page(addr, page); + + addr = page[pud_index(_va)]; + addr_to_page(addr, page); + + addr = page[pmd_index(_va)]; + addr_to_page(addr, page); + + ptep = (pte_t *) &page[pte_index(_va)]; + + pte.pte = ptep->pte & ~_PAGE_RW; + if (HYPERVISOR_update_va_mapping(_va, pte, 0)) + BUG(); +} + +unsigned long __init early_arbitrary_virt_to_mfn(void *v) +{ + unsigned long va = (unsigned long)v, addr, *page; + + BUG_ON(va < __START_KERNEL_map); + + page = (void *)(xen_read_cr3() + __START_KERNEL_map); + + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + + addr = page[pud_index(va)]; + addr_to_page(addr, page); + + addr = page[pmd_index(va)]; + addr_to_page(addr, page); + + return (page[pte_index(va)] & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT; +} + +#ifndef CONFIG_XEN +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); +#endif + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +pteval_t __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +static __init unsigned long get_table_end(void) +{ + BUG_ON(!e820_table_end); + if (xen_start_info->mfn_list < __START_KERNEL_map + && e820_table_end == xen_start_info->first_p2m_pfn) { + e820_table_end += xen_start_info->nr_p2m_frames; + e820_table_top += xen_start_info->nr_p2m_frames; + } + return e820_table_end++; +} + +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */ +static __ref void *spp_getpage(void) +{ + void *ptr; + + if (after_bootmem) + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + else if (e820_table_end < e820_table_top) { + ptr = __va(get_table_end() << PAGE_SHIFT); + memset(ptr, 0, PAGE_SIZE); + } else + ptr = alloc_bootmem_pages(PAGE_SIZE); + + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + + return ptr; +} + +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +{ + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + make_page_readonly(pud, XENFEAT_writable_page_tables); + if (!after_bootmem) + xen_l4_entry_update(pgd, __pgd(__pa(pud) | _PAGE_TABLE)); + else + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} + +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ + if (pud_none(*pud)) { + pmd_t *pmd = (pmd_t *) spp_getpage(); + make_page_readonly(pmd, XENFEAT_writable_page_tables); + if (!after_bootmem) + xen_l3_entry_update(pud, __pud(__pa(pmd) | _PAGE_TABLE)); + else + pud_populate(&init_mm, pud, pmd); + if (pmd != pmd_offset(pud, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); + } + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ + if (pmd_none(*pmd)) { + pte_t *pte = (pte_t *) spp_getpage(); + make_page_readonly(pte, XENFEAT_writable_page_tables); + pmd_populate_kernel(&init_mm, pmd, pte); + if (pte != pte_offset_kernel(pmd, 0)) + printk(KERN_ERR "PAGETABLE BUG #02!\n"); + } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); + + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + pud_t *pud_page; + + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval)); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud_page = (pud_t*)pgd_page_vaddr(*pgd); + set_pte_vaddr_pud(pud_page, vaddr, pteval); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + +#ifndef CONFIG_XEN +/* + * Create large page table mappings for a range of physical addresses. + */ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { + pgd = pgd_offset_k((unsigned long)__va(phys)); + if (pgd_none(*pgd)) { + pud = (pud_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + _PAGE_USER)); + } + pud = pud_offset(pgd, (unsigned long)__va(phys)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | + _PAGE_USER)); + } + pmd = pmd_offset(pud, phys); + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(phys | pgprot_val(prot))); + } +} + +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); +} + +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); +} + +/* + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) + * + * phys_addr holds the negative offset to the kernel, which is added + * to the compile time generated pmds. This results in invalid pmds up + * to the point where we hit the physaddr 0 mapping. + * + * We limit the mappings to the region from _text to _end. _end is + * rounded up to the 2MB boundary. This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; + pmd_t *last_pmd = pmd + PTRS_PER_PMD; + + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); + } +} +#endif + +static __ref void *alloc_low_page(unsigned long *phys) +{ + unsigned long pfn; + void *adr; + + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + *phys = __pa(adr); + + return adr; + } + + pfn = get_table_end(); + if (pfn >= e820_table_top) + panic("alloc_low_page: ran out of memory"); + + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); + memset(adr, 0, PAGE_SIZE); + *phys = pfn * PAGE_SIZE; + return adr; +} + +static __ref void unmap_low_page(void *adr) +{ + if (after_bootmem) + return; + + early_iounmap(adr, PAGE_SIZE); +} + +static inline int __meminit make_readonly(unsigned long paddr) +{ + extern char __vsyscall_0; + int readonly = 0; + + /* Make new page tables read-only on the first pass. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && !max_pfn_mapped + && (paddr >= (e820_table_start << PAGE_SHIFT))) { + unsigned long top = e820_table_top; + + /* Account for the range get_table_end() skips. */ + if (xen_start_info->mfn_list < __START_KERNEL_map + && e820_table_end <= xen_start_info->first_p2m_pfn + && top > xen_start_info->first_p2m_pfn) + top += xen_start_info->nr_p2m_frames; + if (paddr < (top << PAGE_SHIFT)) + readonly = 1; + } + /* Make old page tables read-only. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map)) + && (paddr < (e820_table_end << PAGE_SHIFT))) + readonly = 1; + /* Make P->M table (and its page tables) read-only. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && xen_start_info->mfn_list < __START_KERNEL_map + && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT) + && paddr < (xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames) << PAGE_SHIFT) + readonly = 1; + + /* + * No need for writable mapping of kernel image. This also ensures that + * page and descriptor tables embedded inside don't have writable + * mappings. Exclude the vsyscall area here, allowing alternative + * instruction patching to work. + */ + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa(_brk_end)) + && !(paddr >= __pa_symbol(&__vsyscall_0) + && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE)) + readonly = 1; + + return readonly; +} + +static unsigned long __meminit +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) +{ + unsigned pages = 0; + unsigned long last_map_addr = end; + int i; + + pte_t *pte = pte_page + pte_index(addr); + + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { + unsigned long pteval = addr | pgprot_val(prot); + + if (addr >= end || + (!after_bootmem && + (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages)) + break; + + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ + if (__pte_val(*pte)) { + pages++; + continue; + } + + if (make_readonly(addr)) + pteval &= ~_PAGE_RW; + if (0) + printk(" pte=%p addr=%lx pte=%016lx\n", + pte, addr, pteval); + pages++; + if (!after_bootmem) + *pte = __pte(pteval & __supported_pte_mask); + else + set_pte(pte, __pte(pteval & __supported_pte_mask)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; + } + + update_page_count(PG_LEVEL_4K, pages); + + return last_map_addr; +} + +static unsigned long __meminit +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, + pgprot_t prot) +{ + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); + + BUG_ON(!max_pfn_mapped); + return phys_pte_init(pte, address, end, prot); +} + +static unsigned long __meminit +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, + unsigned long page_size_mask, pgprot_t prot) +{ + unsigned long pages = 0; + unsigned long last_map_addr = end; + + int i = pmd_index(address); + + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + unsigned long pte_phys; + pmd_t *pmd = pmd_page + pmd_index(address); + pte_t *pte; + pgprot_t new_prot = prot; + + if (address >= end) + break; + + if (__pmd_val(*pmd)) { + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); + last_map_addr = phys_pte_update(pmd, address, + end, prot); + spin_unlock(&init_mm.page_table_lock); + continue; + } + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { + pages++; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); + } + + if (page_size_mask & (1<> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + spin_unlock(&init_mm.page_table_lock); + last_map_addr = (address & PMD_MASK) + PMD_SIZE; + continue; + } + + pte = alloc_low_page(&pte_phys); + last_map_addr = phys_pte_init(pte, address, end, new_prot); + unmap_low_page(pte); + + if (!after_bootmem) { + if (max_pfn_mapped) + make_page_readonly(__va(pte_phys), + XENFEAT_writable_page_tables); + *pmd = __pmd(pte_phys | _PAGE_TABLE); + } else { + spin_lock(&init_mm.page_table_lock); + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + spin_unlock(&init_mm.page_table_lock); + } + } + update_page_count(PG_LEVEL_2M, pages); + return last_map_addr; +} + +static unsigned long __meminit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, + unsigned long page_size_mask, pgprot_t prot) +{ + pmd_t *pmd = pmd_offset(pud, 0); + unsigned long last_map_addr; + + BUG_ON(!max_pfn_mapped); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); + __flush_tlb_all(); + return last_map_addr; +} + +static unsigned long __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, + unsigned long page_size_mask) +{ + unsigned long pages = 0; + unsigned long last_map_addr = end; + int i = pud_index(addr); + + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; + + if (addr >= end) + break; + + if (__pud_val(*pud)) { + if (!pud_large(*pud)) { + last_map_addr = phys_pmd_update(pud, addr, end, + page_size_mask, prot); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + pages++; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); + } + + if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); + last_map_addr = (addr & PUD_MASK) + PUD_SIZE; + continue; + } + + pmd = alloc_low_page(&pmd_phys); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); + unmap_low_page(pmd); + + if (!after_bootmem) { + if (max_pfn_mapped) + make_page_readonly(__va(pmd_phys), + XENFEAT_writable_page_tables); + if (page_size_mask & (1 << PG_LEVEL_NUM)) + xen_l3_entry_update(pud, __pud(pmd_phys | _PAGE_TABLE)); + else + *pud = __pud(pmd_phys | _PAGE_TABLE); + } else { + spin_lock(&init_mm.page_table_lock); + pud_populate(&init_mm, pud, __va(pmd_phys)); + spin_unlock(&init_mm.page_table_lock); + } + } + __flush_tlb_all(); + + update_page_count(PG_LEVEL_1G, pages); + + return last_map_addr; +} + +static unsigned long __meminit +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long page_size_mask) +{ + pud_t *pud; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + + return phys_pud_init(pud, addr, end, page_size_mask | (1 << PG_LEVEL_NUM)); +} + +void __init xen_init_pt(void) +{ + unsigned long addr, *page; + + /* Find the initial pte page that was built for us. */ + page = (unsigned long *)xen_start_info->pt_base; + addr = page[pgd_index(__START_KERNEL_map)]; + addr_to_page(addr, page); + +#if CONFIG_XEN_COMPAT <= 0x030002 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER + in kernel PTEs. We check that here. */ + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) { + unsigned long *pg; + pte_t pte; + + /* Mess with the initial mapping of page 0. It's not needed. */ + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map); + addr = page[pud_index(__START_KERNEL_map)]; + addr_to_page(addr, pg); + addr = pg[pmd_index(__START_KERNEL_map)]; + addr_to_page(addr, pg); + pte.pte = pg[pte_index(__START_KERNEL_map)]; + BUG_ON(!(pte.pte & _PAGE_PRESENT)); + + /* If _PAGE_USER isn't set, we obviously do not need it. */ + if (pte.pte & _PAGE_USER) { + /* _PAGE_USER is needed, but is it set implicitly? */ + pte.pte &= ~_PAGE_USER; + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map, + pte, 0) != 0) || + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER)) + /* We need to explicitly specify _PAGE_USER. */ + __kernel_page_user = _PAGE_USER; + } + } +#endif + + /* Construct mapping of initial pte page in our own directories. */ + init_level4_pgt[pgd_index(__START_KERNEL_map)] = + __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE); + memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map), + page + pud_index(__START_KERNEL_map), + (PTRS_PER_PUD - pud_index(__START_KERNEL_map)) + * sizeof(*level3_kernel_pgt)); + + /* Copy the initial P->M table mappings if necessary. */ + addr = pgd_index(xen_start_info->mfn_list); + if (addr < pgd_index(__START_KERNEL_map)) + init_level4_pgt[addr] = + ((pgd_t *)xen_start_info->pt_base)[addr]; + + /* Do an early initialization of the fixmap area. */ + addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); + if (pud_present(level3_kernel_pgt[pud_index(addr)])) { + unsigned long adr = page[pud_index(addr)]; + + addr_to_page(adr, page); + memcpy(level2_fixmap_pgt, page, PAGE_SIZE); + } + level3_kernel_pgt[pud_index(addr)] = + __pud(__pa_symbol(level2_fixmap_pgt) | _PAGE_TABLE); + level2_fixmap_pgt[pmd_index(addr)] = + __pmd(__pa_symbol(level1_fixmap_pgt) | _PAGE_TABLE); + + early_make_page_readonly(init_level4_pgt, + XENFEAT_writable_page_tables); + early_make_page_readonly(level3_kernel_pgt, + XENFEAT_writable_page_tables); + early_make_page_readonly(level3_user_pgt, + XENFEAT_writable_page_tables); + early_make_page_readonly(level2_fixmap_pgt, + XENFEAT_writable_page_tables); + early_make_page_readonly(level1_fixmap_pgt, + XENFEAT_writable_page_tables); + + if (!xen_feature(XENFEAT_writable_page_tables)) + xen_pgd_pin(init_level4_pgt); +} + +void __init xen_finish_init_mapping(void) +{ + unsigned long start, end; + struct mmuext_op mmuext; + + /* Re-vector virtual addresses pointing into the initial + mapping to the just-established permanent ones. */ + xen_start_info = __va(__pa(xen_start_info)); + xen_start_info->pt_base = (unsigned long) + __va(__pa(xen_start_info->pt_base)); + if (!xen_feature(XENFEAT_auto_translated_physmap) + && xen_start_info->mfn_list >= __START_KERNEL_map) + phys_to_machine_mapping = + __va(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mod_start) + xen_start_info->mod_start = (unsigned long) + __va(__pa(xen_start_info->mod_start)); + + /* Unpin the no longer used Xen provided page tables. */ + mmuext.cmd = MMUEXT_UNPIN_TABLE; + mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base); + if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF)) + BUG(); + + /* Destroy the Xen-created mappings beyond the kernel image. */ + start = PAGE_ALIGN(_brk_end); + end = __START_KERNEL_map + (e820_table_start << PAGE_SHIFT); + for (; start < end; start += PAGE_SIZE) + if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0)) + BUG(); + + WARN(e820_table_end != e820_table_top, "start=%lx cur=%lx top=%lx\n", + e820_table_start, e820_table_end, e820_table_top); + if (e820_table_end > e820_table_top) + e820_table_top = e820_table_end; +} + +unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) +{ + + unsigned long next, last_map_addr = end; + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + for (; start < end; start = next) { + pgd_t *pgd = pgd_offset_k(start); + unsigned long pud_phys; + pud_t *pud; + + next = (start + PGDIR_SIZE) & PGDIR_MASK; + if (next > end) + next = end; + + if (__pgd_val(*pgd)) { + last_map_addr = phys_pud_update(pgd, __pa(start), + __pa(end), page_size_mask); + continue; + } + + pud = alloc_low_page(&pud_phys); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + page_size_mask); + unmap_low_page(pud); + + if(!after_bootmem) { + if (max_pfn_mapped) + make_page_readonly(__va(pud_phys), + XENFEAT_writable_page_tables); + xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE)); + } else { + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, __va(pud_phys)); + spin_unlock(&init_mm.page_table_lock); + } + } + + return last_map_addr; +} + +#ifndef CONFIG_NUMA +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long bootmap_size, bootmap; + + e820_register_active_regions(0, start_pfn, end_pfn); +#ifdef CONFIG_XEN + if (end_pfn > xen_start_info->nr_pages) + end_pfn = xen_start_info->nr_pages; +#endif + bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, + 0, end_pfn); + free_bootmem_with_active_regions(0, end_pfn); + early_res_to_bootmem(0, end_pfn< max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + last_mapped_pfn = init_memory_mapping(start, start + size); + if (last_mapped_pfn > max_pfn_mapped) + max_pfn_mapped = last_mapped_pfn; + + ret = __add_pages(nid, zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + + return ret; +} +EXPORT_SYMBOL_GPL(arch_add_memory); + +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) +int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static struct kcore_list kcore_vsyscall; + +void __init mem_init(void) +{ + long codesize, reservedpages, datasize, initsize; + unsigned long absent_pages; + unsigned long pfn; + + pci_iommu_alloc(); + + /* clear_bss() already clear the empty_zero_page */ + + reservedpages = 0; + + /* this will put all low memory onto the freelists */ +#ifdef CONFIG_NUMA + totalram_pages = numa_free_all_bootmem(); +#else + totalram_pages = free_all_bootmem(); +#endif + + /* XEN: init pages outside initial allocation. */ + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + ClearPageReserved(pfn_to_page(pfn)); + init_page_count(pfn_to_page(pfn)); + } + + absent_pages = absent_pages_in_range(0, max_pfn); + reservedpages = max_pfn - totalram_pages - absent_pages; + after_bootmem = 1; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + /* Register memory areas for /proc/kcore */ + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); + + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " + "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", + nr_free_pages() << (PAGE_SHIFT-10), + max_pfn << (PAGE_SHIFT-10), + codesize >> 10, + absent_pages << (PAGE_SHIFT-10), + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10); +} + +#ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long rodata_start = + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + kernel_set_to_readonly = 1; + + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. + */ + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif +} + +#endif + +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ +#ifdef CONFIG_NUMA + int nid, next_nid; + int ret; +#endif + unsigned long pfn = phys >> PAGE_SHIFT; + + if (pfn >= max_pfn) { + /* + * This can happen with kdump kernels when accessing + * firmware tables: + */ + if (pfn < max_pfn_mapped) + return -EFAULT; + + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", + phys, len); + return -EFAULT; + } + + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_NUMA + nid = phys_to_nid(phys); + next_nid = phys_to_nid(phys + len - 1); + if (nid == next_nid) + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); + else + ret = reserve_bootmem(phys, len, flags); + + if (ret != 0) + return ret; + +#else + reserve_bootmem(phys, len, flags); +#endif + +#ifndef CONFIG_XEN + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { + dma_reserve += len / PAGE_SIZE; + set_dma_reserve(dma_reserve); + } +#endif + + return 0; +} + +int kern_addr_valid(unsigned long addr) +{ + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (above != 0 && above != -1UL) + return 0; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return 0; + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return 0; + + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return 0; + + return pfn_valid(pte_pfn(*pte)); +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC +}; + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(tsk, TIF_IA32)) + return NULL; +#endif + return &gate_vma; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(task); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives: + */ +int in_gate_area_no_task(unsigned long addr) +{ + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + if (vma == &gate_vma) + return "[vsyscall]"; + return NULL; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Initialise the sparsemem vmemmap using huge-pages at the PMD level. + */ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + +int __meminit +vmemmap_populate(struct page *start_page, unsigned long size, int node) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + for (; addr < end; addr = next) { + void *p = NULL; + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + pud = vmemmap_pud_populate(pgd, addr, node); + if (!pud) + return -ENOMEM; + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = vmemmap_pmd_populate(pud, addr, node); + + if (!pmd) + return -ENOMEM; + + p = vmemmap_pte_populate(pmd, addr, node); + + if (!p) + return -ENOMEM; + + addr_end = addr + PAGE_SIZE; + p_end = p + PAGE_SIZE; + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t entry; + + p = vmemmap_alloc_block(PMD_SIZE, node); + if (!p) + return -ENOMEM; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd_ma(__pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + } else + vmemmap_verify((pte_t *)pmd, node, addr, next); + } + + } + return 0; +} + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} +#endif --- linux-ec2-2.6.32.orig/arch/x86/mm/Makefile +++ linux-ec2-2.6.32/arch/x86/mm/Makefile @@ -25,4 +25,7 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_XEN) += hypervisor.o +disabled-obj-$(CONFIG_XEN) := tlb.o + obj-$(CONFIG_MEMTEST) += memtest.o --- linux-ec2-2.6.32.orig/arch/x86/mm/tlb.c +++ linux-ec2-2.6.32/arch/x86/mm/tlb.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -130,6 +131,10 @@ union smp_flush_state *f; cpu = smp_processor_id(); +#ifdef CONFIG_X86_32 + if (current->active_mm) + load_user_cs_desc(cpu, current->active_mm); +#endif /* * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. --- linux-ec2-2.6.32.orig/arch/x86/mm/pgtable_32-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/pgtable_32-xen.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ +#ifndef CONFIG_XEN + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + if (pte_val(pteval)) + set_pte_at(&init_mm, vaddr, pte, pteval); + else + pte_clear(&init_mm, vaddr, pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +#else + if (HYPERVISOR_update_va_mapping(vaddr, pteval, + UVMF_INVLPG|UVMF_ALL)) + BUG(); +#endif +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START; +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE); +EXPORT_SYMBOL(__FIXADDR_TOP); + +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; + return 0; +} +early_param("vmalloc", parse_vmalloc); + +#ifndef CONFIG_XEN +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; + + if (!arg) + return -EINVAL; + + address = memparse(arg, &arg); + reserve_top_address(address); + return 0; +} +early_param("reservetop", parse_reservetop); +#endif + +void make_lowmem_page_readonly(void *va, unsigned int feature) +{ + pte_t *pte; + unsigned int level; + int rc; + + if (xen_feature(feature)) + return; + + pte = lookup_address((unsigned long)va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); + rc = HYPERVISOR_update_va_mapping( + (unsigned long)va, pte_wrprotect(*pte), 0); + BUG_ON(rc); +} + +void make_lowmem_page_writable(void *va, unsigned int feature) +{ + pte_t *pte; + unsigned int level; + int rc; + + if (xen_feature(feature)) + return; + + pte = lookup_address((unsigned long)va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); + rc = HYPERVISOR_update_va_mapping( + (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG); + BUG_ON(rc); +} --- linux-ec2-2.6.32.orig/arch/x86/mm/init_32-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/init_32-xen.c @@ -0,0 +1,1136 @@ +/* + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long highstart_pfn, highend_pfn; + +static noinline int do_test_wp_bit(void); + +bool __read_mostly __vmalloc_start_set = false; + +static __init void *alloc_low_page(void) +{ + unsigned long pfn = e820_table_end++; + void *adr; + + if (pfn >= e820_table_top) + panic("alloc_low_page: ran out of memory"); + + adr = __va(pfn * PAGE_SIZE); + memset(adr, 0, PAGE_SIZE); + return adr; +} + +/* + * Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { + if (after_bootmem) + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); + else + pmd_table = (pmd_t *)alloc_low_page(); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; + } +#endif + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry: + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 + if (pmd_none(*pmd)) { +#else + if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) { +#endif + pte_t *page_table = NULL; + + if (after_bootmem) { +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); +#endif + if (!page_table) + page_table = + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); + } else + page_table = (pte_t *)alloc_low_page(); + + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); + make_lowmem_page_readonly(page_table, + XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + } + + return pte_offset_kernel(pmd, 0); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. + */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end + && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start + || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { + pte_t *newpte; + int i; + + BUG_ON(after_bootmem); + newpte = alloc_low_page(); + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + + paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); + make_lowmem_page_readonly(newpte, + XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); + make_lowmem_page_writable(pte, + XENFEAT_writable_page_tables); + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. + */ +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + int pgd_idx, pmd_idx; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte = NULL; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { + if (vaddr >= hypervisor_virt_start) + break; + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: + */ +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) +{ + int use_pse = page_size_mask == (1<> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; + + if (!cpu_has_pse) { + use_pse = 0; + mapping_iter = 0; + } + +repeat: + pages_2m = pages_4k = 0; + pfn = start_pfn; + pgd_idx = pgd_index((pfn<= end_pfn) + continue; +#ifdef CONFIG_X86_PAE + pmd_idx = pmd_index((pfn<= hypervisor_virt_start) + continue; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ + if (use_pse) { + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + /* + * first pass will use the same initial + * identity mapping attribute + _PAGE_PSE. + */ + pgprot_t init_prot = + __pgprot(PTE_IDENT_ATTR | + _PAGE_PSE); + + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + pages_2m++; + if (mapping_iter == 1) + set_pmd(pmd, pfn_pmd(pfn, init_prot)); + else + set_pmd(pmd, pfn_pmd(pfn, prot)); + + pfn += PTRS_PER_PTE; + continue; + } + pte = one_page_table_init(pmd); + + pte_ofs = pte_index((pfn<= xen_start_info->nr_pages || pte_present(*pte)) + continue; + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + pages_4k++; + if (mapping_iter == 1) + set_pte(pte, pfn_pte(pfn, init_prot)); + else + set_pte(pte, pfn_pte(pfn, prot)); + } + } + } + if (mapping_iter <= 1) { + /* + * update direct mapping page count only in the first + * iteration. + */ + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); + } + if (mapping_iter == 1) { + /* + * local global flush tlb, which will flush the previous + * mappings present in both small and large page TLB's. + */ + __flush_tlb_all(); + + /* + * Second iteration will set the actual desired PTE attributes. + */ + mapping_iter = 2; + goto repeat; + } + return 0; +} + +pte_t *kmap_pte; +pgprot_t kmap_prot; + +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} + +static void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* + * Cache the first kmap pte: + */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +} + +#ifdef CONFIG_HIGHMEM +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + unsigned long vaddr; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + vaddr = PKMAP_BASE; + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; +} + +static void __init add_one_highpage_init(struct page *page, int pfn) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; +} + +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static int __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + + data = (struct add_highpages_data *)datax; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return 0; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn); + } + + return 0; + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + +#else +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +#endif /* CONFIG_HIGHMEM */ + +pgd_t *swapper_pg_dir; + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ +void __init early_ioremap_page_table_range_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); +} + +static void __init pagetable_init(void) +{ + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; + + permanent_kmaps_init(pgd_base); +} + +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN) +/* + * ACPI suspend needs this for resume, because things like the intel-agp + * driver might have split up a kernel 4MB mapping. + */ +char swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned(PAGE_SIZE))); + +static inline void save_pg_dir(void) +{ + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); +} +#else /* !CONFIG_ACPI_SLEEP */ +static inline void save_pg_dir(void) +{ +} +#endif /* !CONFIG_ACPI_SLEEP */ + +void zap_low_mappings(bool early) +{ + int i; + + /* + * Zap initial low-memory mappings. + * + * Note that "pgd_clear()" doesn't do it for + * us, because pgd_clear() is a no-op on i386. + */ + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); +#else + set_pgd(swapper_pg_dir+i, __pgd(0)); +#endif + } + + if (early) + __flush_tlb(); + else + flush_tlb_all(); +} + +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; + + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); + +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" +/* + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: + */ +void __init lowmem_pfn_init(void) +{ + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +void __init highmem_pfn_init(void) +{ + max_low_pfn = MAXMEM_PFN; + + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_HIGHMEM64G + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } +#endif /* !CONFIG_HIGHMEM64G */ +#endif /* !CONFIG_HIGHMEM */ +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + if (max_pfn <= MAXMEM_PFN) + lowmem_pfn_init(); + else + highmem_pfn_init(); +} + +#ifndef CONFIG_NEED_MULTIPLE_NODES +void __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) +{ +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + e820_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + e820_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; +#endif + __vmalloc_start_set = true; + + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); +} +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +static void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; +#endif + + free_area_init_nodes(max_zone_pfns); + + xen_init_pgd_pin(); +} + +static unsigned long __init setup_node_bootmem(int nodeid, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long bootmap) +{ + unsigned long bootmap_size; + + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), + bootmap >> PAGE_SHIFT, + start_pfn, end_pfn); + printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", + nodeid, start_pfn<nr_pages); + + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = bootmem_bootmap_pages(end_xen_pfn)<nr_pages)< end_xen_pfn) + continue; + if (end_pfn > end_xen_pfn) + end_pfn = end_xen_pfn; +#else + start_pfn = 0; + end_pfn = end_xen_pfn; +#endif + bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, + bootmap); + } + + after_bootmem = 1; +} + +unsigned long __init extend_init_mapping(unsigned long tables_space) +{ + unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + + xen_start_info->nr_pt_frames; + unsigned long start = start_pfn, va = (unsigned long)&_text; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Ensure init mappings cover kernel text/data and initial tables. */ + while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) { + pgd = pgd_offset_k(va); + pud = pud_offset(pgd, va); + pmd = pmd_offset(pud, va); + if (pmd_none(*pmd)) { + unsigned long pa = start_pfn++ << PAGE_SHIFT; + + memset(__va(pa), 0, PAGE_SIZE); + make_lowmem_page_readonly(__va(pa), + XENFEAT_writable_page_tables); + xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE)); + } + pte = pte_offset_kernel(pmd, va); + if (pte_none(*pte)) { + pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE); + + if (HYPERVISOR_update_va_mapping(va, new_pte, 0)) + BUG(); + } + va += PAGE_SIZE; + } + + /* Finally, blow away any spurious initial mappings. */ + while (1) { + pgd = pgd_offset_k(va); + pud = pud_offset(pgd, va); + pmd = pmd_offset(pud, va); + if (pmd_none(*pmd)) + break; + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } + + if (start_pfn > start) + reserve_early(start << PAGE_SHIFT, + start_pfn << PAGE_SHIFT, "INITMAP"); + + return start_pfn; +} + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +void __init paging_init(void) +{ + pagetable_init(); + + __flush_tlb_all(); + + kmap_init(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + sparse_init(); + zone_sizes_init(); +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. + */ +static void __init test_wp_bit(void) +{ + printk(KERN_INFO + "Checking if this processor honours the WP bit even in supervisor mode..."); + + /* Any page-aligned address will do, the test is non-destructive */ + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + boot_cpu_data.wp_works_ok = do_test_wp_bit(); + clear_fixmap(FIX_WP_TEST); + + if (!boot_cpu_data.wp_works_ok) { + printk(KERN_CONT "No.\n"); +#ifdef CONFIG_X86_WP_WORKS_OK + panic( + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); +#endif + } else { + printk(KERN_CONT "Ok.\n"); + } +} + +void __init mem_init(void) +{ + int codesize, reservedpages, datasize, initsize; + int tmp; + unsigned long pfn; + + pci_iommu_alloc(); + +#ifdef CONFIG_FLATMEM + BUG_ON(!mem_map); +#endif + /* this will put all low memory onto the freelists */ + totalram_pages += free_all_bootmem(); + /* XEN: init low-mem pages outside initial allocation. */ + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) { + ClearPageReserved(pfn_to_page(pfn)); + init_page_count(pfn_to_page(pfn)); + } + + reservedpages = 0; + for (tmp = 0; tmp < max_low_pfn; tmp++) + /* + * Only count reserved RAM pages: + */ + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + reservedpages++; + + set_highmem_pages_init(); + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10, + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) + ); + + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START >= VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); + + if (boot_cpu_data.wp_works_ok < 0) + test_wp_bit(); + + save_pg_dir(); + zap_low_mappings(true); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(nid, zone, start_pfn, nr_pages); +} +#endif + +/* + * This function cannot be __init, since exceptions don't work in that + * section. Put this after the callers, so that it cannot be inlined. + */ +static noinline int do_test_wp_bit(void) +{ + char tmp_reg; + int flag; + + __asm__ __volatile__( + " movb %0, %1 \n" + "1: movb %1, %0 \n" + " xorl %2, %2 \n" + "2: \n" + _ASM_EXTABLE(1b,2b) + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), + "=q" (tmp_reg), + "=r" (flag) + :"2" (1) + :"memory"); + + return flag; +} + +#ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + + kernel_set_to_readonly = 1; + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif + + start += size; + size = (unsigned long)__end_rodata - start; + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif +} +#endif + +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} --- linux-ec2-2.6.32.orig/arch/x86/mm/init_64.c +++ linux-ec2-2.6.32/arch/x86/mm/init_64.c @@ -49,6 +49,7 @@ #include #include #include +#include static unsigned long dma_reserve __initdata; @@ -615,6 +616,21 @@ */ #ifdef CONFIG_MEMORY_HOTPLUG /* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ @@ -633,6 +649,9 @@ ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -820,6 +839,9 @@ if (pud_none(*pud)) return 0; + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; --- linux-ec2-2.6.32.orig/arch/x86/mm/dump_pagetables.c +++ linux-ec2-2.6.32/arch/x86/mm/dump_pagetables.c @@ -30,11 +30,13 @@ unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; + unsigned long lines; }; struct addr_marker { unsigned long start_address; const char *name; + unsigned long max_lines; }; /* Address space markers hints */ @@ -45,6 +47,7 @@ { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, + { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -141,7 +144,7 @@ pgprot_t new_prot, int level) { pgprotval_t prot, cur; - static const char units[] = "KMGTPE"; + static const char units[] = "BKMGTPE"; /* * If we have a "break" in the series, we need to flush the state that @@ -156,6 +159,7 @@ st->current_prot = new_prot; st->level = level; st->marker = address_markers; + st->lines = 0; seq_printf(m, "---[ %s ]---\n", st->marker->name); } else if (prot != cur || level != st->level || st->current_address >= st->marker[1].start_address) { @@ -166,17 +170,21 @@ /* * Now print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - - delta = (st->current_address - st->start_address) >> 10; - while (!(delta & 1023) && unit[1]) { - delta >>= 10; - unit++; + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = (st->current_address - st->start_address); + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); } - seq_printf(m, "%9lu%c ", delta, *unit); - printk_prot(m, st->current_prot, st->level); + st->lines++; /* * We print markers for special areas of address space, @@ -184,7 +192,15 @@ * This helps in the interpretation. */ if (st->current_address >= st->marker[1].start_address) { + if (st->marker->max_lines && + st->lines > st->marker->max_lines) { + unsigned long nskip = + st->lines - st->marker->max_lines; + seq_printf(m, "... %lu entr%s skipped ... \n", + nskip, nskip == 1 ? "y" : "ies"); + } st->marker++; + st->lines = 0; seq_printf(m, "---[ %s ]---\n", st->marker->name); } --- linux-ec2-2.6.32.orig/arch/x86/mm/pgtable-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/pgtable-xen.c @@ -0,0 +1,892 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte = (pte_t *)__get_free_page(PGALLOC_GFP); + if (pte) + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); + return pte; +} + +static void _pte_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + __pte_free(page); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + + pte = alloc_pages(__userpte_alloc_gfp, 0); + if (pte) { + pgtable_page_ctor(pte); + SetPageForeign(pte, _pte_free); + init_page_count(pte); + } + return pte; +} + +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + +void __pte_free(pgtable_t pte) +{ + if (!PageHighMem(pte)) { + if (PagePinned(pte)) { + unsigned long pfn = page_to_pfn(pte); + + if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, + PAGE_KERNEL), + 0)) + BUG(); + ClearPagePinned(pte); + } + } else +#ifdef CONFIG_HIGHPTE + ClearPagePinned(pte); +#else + BUG(); +#endif + + ClearPageForeign(pte); + init_page_count(pte); + pgtable_page_dtor(pte); + __free_page(pte); +} + +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + +#if PAGETABLE_LEVELS > 2 +static void _pmd_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + __pmd_free(page); +} + +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pmd; + + pmd = alloc_pages(PGALLOC_GFP, 0); + if (!pmd) + return NULL; + SetPageForeign(pmd, _pmd_free); + init_page_count(pmd); + return page_address(pmd); +} + +void __pmd_free(pgtable_t pmd) +{ + if (PagePinned(pmd)) { + unsigned long pfn = page_to_pfn(pmd); + + if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, PAGE_KERNEL), + 0)) + BUG(); + ClearPagePinned(pmd); + } + + ClearPageForeign(pmd); + init_page_count(pmd); + __free_page(pmd); +} + +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +#if PAGETABLE_LEVELS > 3 +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if USE_SPLIT_PTLOCKS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in consistent state (unpinned+writable or + * pinned+readonly). The pinning and attribute changes, however + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock can ever elsewhere be acquired nesting + * with an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these are actually resolving to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in dead locks, and the order of acquires + * doesn't matter. + */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + +#define PIN_BATCH sizeof(void *) +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); + +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, + unsigned int cpu, unsigned int seq) +{ + unsigned long pfn = page_to_pfn(page); + + if (pgprot_val(flags) & _PAGE_RW) + ClearPagePinned(page); + else + SetPagePinned(page); + if (PageHighMem(page)) + return seq; + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); + if (unlikely(++seq == PIN_BATCH)) { + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + PIN_BATCH, NULL))) + BUG(); + seq = 0; + } + + return seq; +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + int g,u,m; + unsigned int cpu, seq; + multicall_entry_t *mcl; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + cpu = get_cpu(); + + /* + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables + * may not be the 'current' task's pagetables (e.g., current may be + * 32-bit, but the pagetables may be for a 64-bit task). + * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct + * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE. + */ + for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); + } + } + } + +#ifdef CONFIG_X86_PAE + for (; g < PTRS_PER_PGD; g++, pgd++) { + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, 0); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, 0); + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); + } +#endif + + mcl = per_cpu(pb_mcl, cpu); +#ifdef CONFIG_X86_64 + if (unlikely(seq > PIN_BATCH - 2)) { + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) + BUG(); + seq = 0; + } + pgd = __user_pgd(pgd_base); + BUG_ON(!pgd); + MULTI_update_va_mapping(mcl + seq, + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags), + 0); + MULTI_update_va_mapping(mcl + seq + 1, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) + BUG(); +#else + if (likely(seq != 0)) { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + seq + 1, NULL))) + BUG(); + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH)) + BUG(); +#endif + + put_cpu(); +} + +void __init xen_init_pgd_pin(void) +{ + pgd_t *pgd = init_mm.pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int g, u, m; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + SetPagePinned(virt_to_page(pgd)); + for (g = 0; g < PTRS_PER_PGD; g++, pgd++) { +#ifndef CONFIG_X86_PAE + if (g >= pgd_index(HYPERVISOR_VIRT_START) + && g <= pgd_index(HYPERVISOR_VIRT_END - 1)) + continue; +#endif + if (!pgd_present(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + SetPagePinned(virt_to_page(pud)); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (!pud_present(*pud) || pud_large(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + SetPagePinned(virt_to_page(pmd)); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { +#ifdef CONFIG_X86_PAE + if (g == pgd_index(HYPERVISOR_VIRT_START) + && m >= pmd_index(HYPERVISOR_VIRT_START)) + continue; +#endif + if (!pmd_present(*pmd) || pmd_large(*pmd)) + continue; + SetPagePinned(pmd_page(*pmd)); + } + } + } +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_pgt)); +#endif +} + +static void __pgd_pin(pgd_t *pgd) +{ + pgd_walk(pgd, PAGE_KERNEL_RO); + kmap_flush_unused(); + xen_pgd_pin(pgd); + SetPagePinned(virt_to_page(pgd)); +} + +static void __pgd_unpin(pgd_t *pgd) +{ + xen_pgd_unpin(pgd); + pgd_walk(pgd, PAGE_KERNEL); + ClearPagePinned(virt_to_page(pgd)); +} + +static void pgd_test_and_unpin(pgd_t *pgd) +{ + if (PagePinned(virt_to_page(pgd))) + __pgd_unpin(pgd); +} + +void mm_pin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + __pgd_pin(mm->pgd); + pin_unlock(mm); +} + +void mm_unpin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + __pgd_unpin(mm->pgd); + pin_unlock(mm); +} + +void mm_pin_all(void) +{ + struct page *page; + unsigned long flags; + + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + /* + * Allow uninterrupted access to the pgd_list. Also protects + * __pgd_pin() by disabling preemption. + * All other CPUs must be at a safe point (e.g., in stop_machine + * or offlined entirely). + */ + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) + __pgd_pin((pgd_t *)page_address(page)); + } + spin_unlock_irqrestore(&pgd_lock, flags); +} + +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + if (!PagePinned(virt_to_page(mm->pgd))) + mm_pin(mm); +} + +/* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() *much* + * faster this way, as no hypercalls are needed for the page table updates. + */ +static void leave_active_mm(struct task_struct *tsk, struct mm_struct *mm) + __releases(tsk->alloc_lock) +{ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + if (atomic_dec_and_test(&mm->mm_count)) + BUG(); + } + + task_unlock(tsk); +} + +static void _leave_active_mm(void *mm) +{ + struct task_struct *tsk = current; + + if (spin_trylock(&tsk->alloc_lock)) + leave_active_mm(tsk, mm); +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + leave_active_mm(tsk, mm); + + preempt_disable(); + smp_call_function_many(mm_cpumask(mm), _leave_active_mm, mm, 1); + preempt_enable(); + + if (PagePinned(virt_to_page(mm->pgd)) + && atomic_read(&mm->mm_count) == 1 + && !mm->context.has_foreign_mappings) + mm_unpin(mm); +} + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + +static void pgd_ctor(pgd_t *pgd) +{ + pgd_test_and_unpin(pgd); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + +#ifdef CONFIG_X86_64 + /* set level3_user_pgt for vsyscall area */ + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] = + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); +#endif + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); +} + +static void pgd_dtor(pgd_t *pgd) +{ + if (!SHARED_KERNEL_PMD) { + spin_lock(&pgd_lock); + pgd_list_del(pgd); + spin_unlock(&pgd_lock); + } + + pgd_test_and_unpin(pgd); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + +#ifdef CONFIG_X86_PAE +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + pud_t pud = __pud(__pa(pmd) | _PAGE_PRESENT); + + paravirt_alloc_pmd(mm, page_to_pfn(virt_to_page(pmd))); + + if (likely(!PagePinned(virt_to_page(pudp)))) { + *pudp = pud; + return; + } + + set_pud(pudp, pud); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + xen_tlb_flush(); +} +#else /* !CONFIG_X86_PAE */ + +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +#define PREALLOCATED_PMDS 0 + +#endif /* CONFIG_X86_PAE */ + +static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig) +{ + int i; + +#ifdef CONFIG_X86_PAE + if (contig) + xen_destroy_contiguous_region((unsigned long)mm->pgd, 0); +#endif + + for(i = 0; i < PREALLOCATED_PMDS; i++) + if (pmds[i]) + pmd_free(mm, pmds[i]); +} + +static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm) +{ + int i; + bool failed = false; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { + pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT); + if (pmd == NULL) + failed = true; + pmds[i] = pmd; + } + + if (failed) { + free_pmds(pmds, mm, false); + return -ENOMEM; + } + + return 0; +} + +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { + pgd_t pgd = pgdp[i]; + + if (__pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = xen_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } + +#ifdef CONFIG_X86_PAE + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) + xen_destroy_contiguous_region((unsigned long)pgdp, 0); +#endif +} + +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) +{ + pud_t *pud; + unsigned long addr; + int i; + + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ + return; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < PREALLOCATED_PMDS; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmds[i]; + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + /* It is safe to poke machine addresses of pmds under the pgd_lock. */ + pud_populate(mm, pud, pmd); + } +} + +static inline pgd_t *user_pgd_alloc(pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + if (pgd) { + pgd_t *upgd = (void *)__get_free_page(PGALLOC_GFP); + + if (upgd) + virt_to_page(pgd)->index = (long)upgd; + else { + free_page((unsigned long)pgd); + pgd = NULL; + } + } +#endif + return pgd; +} + +static inline void user_pgd_free(pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + free_page(virt_to_page(pgd)->index); +#endif +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd; + pmd_t *pmds[PREALLOCATED_PMDS]; + + pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP)); + + if (pgd == NULL) + goto out; + + mm->pgd = pgd; + + if (preallocate_pmds(pmds, mm) != 0) + goto out_free_pgd; + + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_pmds; + + /* + * Make sure that pre-populating the pmds is atomic with + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. + */ + spin_lock(&pgd_lock); + +#ifdef CONFIG_X86_PAE + /* Protect against save/restore: move below 4GB under pgd_lock. */ + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb) + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) { + spin_unlock(&pgd_lock); + goto out_free_pmds; + } +#endif + + pgd_ctor(pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); + + spin_unlock(&pgd_lock); + + return pgd; + +out_free_pmds: + free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb)); +out_free_pgd: + user_pgd_free(pgd); + free_page((unsigned long)pgd); +out: + return NULL; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + /* + * After this the pgd should not be pinned for the duration of this + * function's execution. We should never sleep and thus never race: + * 1. User pmds will not become write-protected under our feet due + * to a concurrent mm_pin_all(). + * 2. The machine addresses in PGD entries will not become invalid + * due to a concurrent save/restore. + */ + pgd_dtor(pgd); + + pgd_mop_up_pmds(mm, pgd); + paravirt_pgd_free(mm, pgd); + user_pgd_free(pgd); + free_page((unsigned long)pgd); +} + +/* blktap and gntdev need this, as otherwise they would implicitly (and + * needlessly, as they never use it) reference init_mm. */ +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int full) +{ + return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm, + addr, ptep, full); +} +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full); + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + if (likely(vma->vm_mm == current->mm)) { + if (HYPERVISOR_update_va_mapping(address, + entry, + uvm_multi(mm_cpumask(vma->vm_mm))|UVMF_INVLPG)) + BUG(); + } else { + xen_l1_entry_update(ptep, entry); + flush_tlb_page(vma, address); + } + } + + return changed; +} + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t pte = *ptep; + int young = pte_young(pte); + + pte = pte_mkold(pte); + if (PagePinned(virt_to_page(vma->vm_mm->pgd))) + ptep_set_access_flags(vma, address, ptep, pte, young); + else if (young) + ptep->pte_low = pte.pte_low; + + return young; +} + +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); + __FIXADDR_TOP = -reserve - PAGE_SIZE; +#endif +} + +int fixmaps_set; + +void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + pte_t pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + + switch (idx) { +#ifdef CONFIG_X86_64 + extern pte_t level1_fixmap_pgt[PTRS_PER_PTE]; + + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: + pte = pfn_pte(phys >> PAGE_SHIFT, flags); + set_pte_vaddr_pud(level3_user_pgt, address, pte); + break; + case FIX_EARLYCON_MEM_BASE: + case FIX_SHARED_INFO: + case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN: + xen_l1_entry_update(level1_fixmap_pgt + pte_index(address), + pfn_pte_ma(phys >> PAGE_SHIFT, flags)); + fixmaps_set++; + return; +#else + case FIX_WP_TEST: + case FIX_VDSO: + pte = pfn_pte(phys >> PAGE_SHIFT, flags); + break; +#endif + default: + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags); + break; + } + set_pte_vaddr(address, pte); + fixmaps_set++; +} --- linux-ec2-2.6.32.orig/arch/x86/mm/ioremap-xen.c +++ linux-ec2-2.6.32/arch/x86/mm/ioremap-xen.c @@ -0,0 +1,871 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "physaddr.h" + +static int direct_remap_area_pte_fn(pte_t *pte, + struct page *pmd_page, + unsigned long address, + void *data) +{ + mmu_update_t **v = (mmu_update_t **)data; + + BUG_ON(!pte_none(*pte)); + + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + (*v)++; + + return 0; +} + +static int __direct_remap_pfn_range(struct mm_struct *mm, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + int rc; + unsigned long i, start_address; + mmu_update_t *u, *v, *w; + + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (u == NULL) + return -ENOMEM; + + start_address = address; + + flush_cache_all(); + + for (i = 0; i < size; i += PAGE_SIZE) { + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { + /* Flush a full batch after filling in the PTE ptrs. */ + rc = apply_to_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + if (rc) + goto out; + rc = -EFAULT; + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) + goto out; + v = w = u; + start_address = address; + } + + /* + * Fill in the machine address: PTE ptr is done later by + * apply_to_page_range(). + */ + pgprot_val(prot) |= _PAGE_IOMAP; + v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot))); + + mfn++; + address += PAGE_SIZE; + v++; + } + + if (v != u) { + /* Final batch. */ + rc = apply_to_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + if (rc) + goto out; + rc = -EFAULT; + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) + goto out; + } + + rc = 0; + + out: + flush_tlb_all(); + + free_page((unsigned long)u); + + return rc; +} + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return remap_pfn_range(vma, address, mfn, size, prot); + + if (domid == DOMID_SELF) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + vma->vm_mm->context.has_foreign_mappings = 1; + + return __direct_remap_pfn_range( + vma->vm_mm, address, mfn, size, prot, domid); +} +EXPORT_SYMBOL(direct_remap_pfn_range); + +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + return __direct_remap_pfn_range( + &init_mm, address, mfn, size, prot, domid); +} +EXPORT_SYMBOL(direct_kernel_remap_pfn_range); + +static int lookup_pte_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + uint64_t *ptep = (uint64_t *)data; + if (ptep) + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + return 0; +} + +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep) +{ + return apply_to_page_range(mm, address, PAGE_SIZE, + lookup_pte_fn, ptep); +} + +EXPORT_SYMBOL(create_lookup_pte_addr); + +static int noop_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + return 0; +} + +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size) +{ + return apply_to_page_range(mm, address, size, noop_fn, NULL); +} + +EXPORT_SYMBOL(touch_pte_range); + +int page_is_ram(unsigned long pagenr) +{ + resource_size_t addr, end; + int i; + +#ifndef CONFIG_XEN + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + if (pagenr == 0) + return 0; + + /* + * Second special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + */ + if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && + pagenr < (BIOS_END >> PAGE_SHIFT)) + return 0; +#endif + + for (i = 0; i < e820.nr_map; i++) { + /* + * Not usable memory: + */ + if (e820.map[i].type != E820_RAM) + continue; + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; + + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val) +{ + unsigned long nrpages = size >> PAGE_SHIFT; + int err; + + switch (prot_val) { + case _PAGE_CACHE_UC: + default: + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_WC: + err = _set_memory_wc(vaddr, nrpages); + break; + case _PAGE_CACHE_WB: + err = _set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val) +{ + unsigned long sz; + int rc; + + for (sz = rc = 0; sz < size && !rc; ++mfn, sz += PAGE_SIZE) { + unsigned long pfn = mfn_to_local_pfn(mfn); + + if (pfn >= max_low_pfn_mapped && + (pfn < (1UL<<(32 - PAGE_SHIFT)) || pfn >= max_pfn_mapped)) + continue; + rc = ioremap_change_attr((unsigned long)__va(pfn << PAGE_SHIFT), + PAGE_SIZE, prot_val); + } + + return rc; +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val, void *caller) +{ + unsigned long mfn, offset, vaddr; + resource_size_t last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; + struct vm_struct *area; + unsigned long new_prot_val; + pgprot_t prot; + int retval; + domid_t domid = DOMID_IO; + void __iomem *ret_addr; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + (unsigned long long)phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr)) + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr); + + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ONCE(iomem_map_sanity_check(phys_addr, size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) { + unsigned long pfn = mfn_to_local_pfn(mfn); + + if (pfn_valid(pfn)) { + if (!PageReserved(pfn_to_page(pfn))) + return NULL; + domid = DOMID_SELF; + } + } + WARN_ON_ONCE(domid == DOMID_SELF); + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, + prot_val, &new_prot_val); + if (retval) { + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); + return NULL; + } + + if (prot_val != new_prot_val) { + if (!is_new_memtype_allowed(phys_addr, size, + prot_val, new_prot_val)) { + printk(KERN_ERR + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), + prot_val, new_prot_val); + free_memtype(phys_addr, phys_addr + size); + return NULL; + } + prot_val = new_prot_val; + } + + switch (prot_val) { + case _PAGE_CACHE_UC: + default: + prot = PAGE_KERNEL_IO_NOCACHE; + break; + case _PAGE_CACHE_UC_MINUS: + prot = PAGE_KERNEL_IO_UC_MINUS; + break; + case _PAGE_CACHE_WC: + prot = PAGE_KERNEL_IO_WC; + break; + case _PAGE_CACHE_WB: + prot = PAGE_KERNEL_IO; + break; + } + + /* + * Ok, go for it.. + */ + area = get_vm_area_caller(size, VM_IOREMAP, caller); + if (!area) + return NULL; + area->phys_addr = phys_addr; + vaddr = (unsigned long) area->addr; + + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { + free_memtype(phys_addr, phys_addr + size); + free_vm_area(area); + return NULL; + } + + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), + size, prot, domid)) { + free_memtype(phys_addr, phys_addr + size); + free_vm_area(area); + return NULL; + } + + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + + return ret_addr; +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) +{ + /* + * Ideally, this should be: + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * + * Till we fix all X drivers to use ioremap_wc(), we will use + * UC MINUS. + */ + unsigned long val = _PAGE_CACHE_UC_MINUS; + + return __ioremap_caller(phys_addr, size, val, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_nocache); + +/** + * ioremap_wc - map memory into CPU space write combined + * @offset: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) +{ + if (pat_enabled) + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + __builtin_return_address(0)); + else + return ioremap_nocache(phys_addr, size); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_cache); + +#ifndef CONFIG_XEN +static void __iomem *ioremap_default(resource_size_t phys_addr, + unsigned long size) +{ + unsigned long flags; + void __iomem *ret; + int err; + + /* + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from confliting mappings otherwise + */ + err = reserve_memtype(phys_addr, phys_addr + size, + _PAGE_CACHE_WB, &flags); + if (err < 0) + return NULL; + + ret = __ioremap_caller(phys_addr, size, flags, + __builtin_return_address(0)); + + free_memtype(phys_addr, phys_addr + size); + return ret; +} +#endif + +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + mmiotrace_iounmap(addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == (void __force *)addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + + /* Finally remove it */ + o = remove_vm_area((void __force *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +#ifndef CONFIG_XEN +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void __force *)ioremap_default(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} +#endif + +static int __initdata early_ioremap_debug; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static __initdata int after_paging_init; +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; + +#ifdef CONFIG_X86_32 +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(addr)]; + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; +} +#else +#define early_ioremap_pmd early_get_pmd +#undef make_lowmem_page_readonly +#define make_lowmem_page_readonly early_make_page_readonly +#endif + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + return &bm_pte[pte_index(addr)]; +} + +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + int i; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + memset(bm_pte, 0, sizeof(bm_pte)); + make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + + /* + * The boot-ioremap range spans multiple pmds, for which + * we are not prepared: + */ + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init early_ioremap_reset(void) +{ + after_paging_init = 1; +} + +static void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + + if (pgprot_val(flags)) + set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags)); + else + pte_clear(&init_mm, addr, pte); + __flush_tlb_one(addr); +} + +static inline void __init early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + if (after_paging_init) + __set_fixmap(idx, phys, prot); + else + __early_set_fixmap(idx, phys, prot); +} + +static inline void __init early_clear_fixmap(enum fixed_addresses idx) +{ + if (after_paging_init) + clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, __pgprot(0)); +} + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (!count) + return 0; + WARN(1, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n", + count); + printk(KERN_WARNING + "please boot with early_ioremap_debug and report the dmesg.\n"); + + return 1; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx0, idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", + (u64)phys_addr, size); + WARN_ON(1); + return NULL; + } + + if (early_ioremap_debug) { + printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", + (u64)phys_addr, size, slot); + dump_stack(); + } + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) { + WARN_ON(1); + return NULL; + } + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) { + WARN_ON(1); + return NULL; + } + + /* + * Ok, go for it.. + */ + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + idx = idx0; + while (nrpages > 0) { + early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + if (early_ioremap_debug) + printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + /* + * Don't remap the low PCI/ISA area, it's always mapped. + */ + if (is_initial_xendomain() && is_ISA_range(phys_addr, phys_addr + size - 1)) + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr); + + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); +} + +/* Remap memory */ +void __init __iomem * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL); +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + /* + * early_ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN) + && (unsigned long)addr < fix_to_virt(FIX_ISAMAP_END - 1)) + return; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", + addr, size); + WARN_ON(1); + return; + } + + if (prev_size[slot] != size) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot]); + WARN_ON(1); + return; + } + + if (early_ioremap_debug) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, + size, slot); + dump_stack(); + } + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { + WARN_ON(1); + return; + } + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + early_clear_fixmap(idx); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} --- linux-ec2-2.6.32.orig/arch/x86/mm/extable.c +++ linux-ec2-2.6.32/arch/x86/mm/extable.c @@ -35,34 +35,3 @@ return 0; } - -#ifdef CONFIG_X86_64 -/* - * Need to defined our own search_extable on X86_64 to work around - * a B stepping K8 bug. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} -#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/vsyscall_64-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/vsyscall_64-xen.c @@ -0,0 +1,315 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid + * jumping out of line if necessary. We cannot add more with this + * mechanism because older kernels won't return -ENOSYS. + * If we want more than four we need a vDSO. + * + * Note: the concept clashes with user mode linux. If you use UML and + * want per guest time just set the kernel.vsyscall64 sysctl to 0. + */ + +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __vsyscall(nr) \ + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace +#define __syscall_clobber "r11","cx","memory" + +/* + * vsyscall_gtod_data contains data that is : + * - readonly from vsyscalls + * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) + * Try to keep this structure as small as possible to avoid cache line ping pongs + */ +int __vgetcpu_mode __section_vgetcpu_mode; + +struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = +{ + .lock = SEQLOCK_UNLOCKED, + .sysctl_enabled = 1, +}; + +void update_vsyscall_tz(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* sys_tz has changed */ + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} + +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} + +/* RED-PEN may want to readd seq locking, but then the variable should be + * write-once. + */ +static __always_inline void do_get_tz(struct timezone * tz) +{ + *tz = __vsyscall_gtod_data.sys_tz; +} + +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret; + asm volatile("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber ); + return ret; +} + +static __always_inline long time_syscall(long *t) +{ + long secs; + asm volatile("syscall" + : "=a" (secs) + : "0" (__NR_time),"D" (t) : __syscall_clobber); + return secs; +} + +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + cycle_t now, base, mask, cycle_delta; + unsigned seq; + unsigned long mult, shift, nsec; + cycle_t (*vread)(void); + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + vread = __vsyscall_gtod_data.clock.vread; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + gettimeofday(tv,NULL); + return; + } + + now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; + mask = __vsyscall_gtod_data.clock.mask; + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + nsec = __vsyscall_gtod_data.wall_time_nsec; + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec += (cycle_delta * mult) >> shift; + + while (nsec >= NSEC_PER_SEC) { + tv->tv_sec += 1; + nsec -= NSEC_PER_SEC; + } + tv->tv_usec = nsec / NSEC_PER_USEC; +} + +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +{ + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + return 0; +} + +/* This will break when the xtime seconds get inaccurate, but that is + * unlikely */ +time_t __vsyscall(1) vtime(time_t *t) +{ + struct timeval tv; + time_t result; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) + return time_syscall(t); + + vgettimeofday(&tv, NULL); + result = tv.tv_sec; + if (t) + *t = result; + return result; +} + +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +{ + unsigned int p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->blob[0] == (j = __jiffies)) { + p = tcache->blob[1]; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + native_read_tscp(&p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->blob[0] = j; + tcache->blob[1] = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; +} + +static long __vsyscall(3) venosys_1(void) +{ + return -ENOSYS; +} + +#ifdef CONFIG_SYSCTL +static ctl_table kernel_table2[] = { + { .procname = "vsyscall64", + .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, + {} +}; + +static ctl_table kernel_root_table2[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + .child = kernel_table2 }, + {} +}; +#endif + +/* Assume __initcall executes before all user space. Hopefully kmod + doesn't violate that. We'll find out if it does. */ +static void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = 0x0f40000000000ULL; + d |= cpu; + d |= (node & 0xf) << 12; + d |= (node >> 4) << 48; + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static void __cpuinit cpu_vsyscall_init(void *arg) +{ + /* preemption should be already off */ + vsyscall_set_cpu(raw_smp_processor_id()); +} + +static int __cpuinit +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + return NOTIFY_DONE; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_0; + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + + /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); +} + +static int __init vsyscall_init(void) +{ + BUG_ON(((unsigned long) &vgettimeofday != + VSYSCALL_ADDR(__NR_vgettimeofday))); + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); +#ifdef CONFIG_XEN + vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */ + if (boot_cpu_has(X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +#endif +#ifdef CONFIG_SYSCTL + register_sysctl_table(kernel_root_table2); +#endif + on_each_cpu(cpu_vsyscall_init, NULL, 1); + hotcpu_notifier(cpu_vsyscall_notifier, 0); + return 0; +} + +__initcall(vsyscall_init); --- linux-ec2-2.6.32.orig/arch/x86/kernel/tboot.c +++ linux-ec2-2.6.32/arch/x86/kernel/tboot.c @@ -46,6 +46,7 @@ /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; +EXPORT_SYMBOL(tboot); /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 --- linux-ec2-2.6.32.orig/arch/x86/kernel/init_task.c +++ linux-ec2-2.6.32/arch/x86/kernel/init_task.c @@ -31,6 +31,7 @@ struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); +#ifndef CONFIG_X86_NO_TSS /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned @@ -39,4 +40,4 @@ * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-dma.c +++ linux-ec2-2.6.32/arch/x86/kernel/pci-dma.c @@ -214,7 +214,7 @@ if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; --- linux-ec2-2.6.32.orig/arch/x86/kernel/tlb_uv.c +++ linux-ec2-2.6.32/arch/x86/kernel/tlb_uv.c @@ -817,10 +817,8 @@ */ apicid = blade_to_first_apicid(blade); pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, ((apicid << 32) | UV_BAU_MESSAGE)); - } return 0; } --- linux-ec2-2.6.32.orig/arch/x86/kernel/e820.c +++ linux-ec2-2.6.32/arch/x86/kernel/e820.c @@ -79,7 +79,7 @@ * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(u64 start, u64 end, unsigned type) +int e820_all_mapped(u64 start, u64 end, unsigned type) { int i; @@ -106,6 +106,7 @@ } return 0; } +EXPORT_SYMBOL_GPL(e820_all_mapped); /* * Add a memory region to the kernel e820 map. @@ -1236,15 +1237,21 @@ if (!p) return -EINVAL; -#ifdef CONFIG_X86_32 if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; - } +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; #endif + } userdef = 1; mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; --- linux-ec2-2.6.32.orig/arch/x86/kernel/ldt-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/ldt-xen.c @@ -0,0 +1,277 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP +static void flush_ldt(void *current_mm) +{ + if (current->active_mm == current_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt, *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + else + newldt = (void *)__get_free_page(GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + preempt_disable(); +#endif + make_pages_readonly(newldt, + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) + smp_call_function(flush_ldt, current->mm, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable(oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + put_page(virt_to_page(oldldt)); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + make_pages_readonly(new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct *old_mm; + int retval = 0; + + memset(&mm->context, 0, sizeof(mm->context)); + mutex_init(&mm->context.lock); + old_mm = current->mm; + if (old_mm) + mm->context.vdso = old_mm->context.vdso; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + /* CHECKME: Can this ever happen ? */ + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable(mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + put_page(virt_to_page(mm->context.ldt)); + mm->context.size = 0; + } +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct *mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + mutex_lock(&mm->context.lock); + size = mm->context.size * LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + mutex_unlock(&mm->context.lock); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr + size, bytecount - size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct desc_struct ldt; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + mutex_lock(&mm->context.lock); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + memset(&ldt, 0, sizeof(ldt)); + goto install; + } + } + +#ifndef CONFIG_X86_16BIT + if (!ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } +#endif + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + + /* Install the new entry ... */ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/sys_i386_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/sys_i386_32.c @@ -24,31 +24,6 @@ #include -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; --- linux-ec2-2.6.32.orig/arch/x86/kernel/k8.c +++ linux-ec2-2.6.32/arch/x86/kernel/k8.c @@ -87,6 +87,37 @@ return 0; } +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + void k8_flush_garts(void) { int flushed, i; @@ -121,3 +152,17 @@ } EXPORT_SYMBOL_GPL(k8_flush_garts); +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); --- linux-ec2-2.6.32.orig/arch/x86/kernel/relocate_kernel_32.S +++ linux-ec2-2.6.32/arch/x86/kernel/relocate_kernel_32.S @@ -87,16 +87,36 @@ movl PTR(PA_PGD)(%ebp), %eax movl %eax, %cr3 + /* setup idt */ + lidtl idt_48 - relocate_kernel(%edi) + + /* setup gdt */ + leal gdt - relocate_kernel(%edi), %eax + movl %eax, (gdt_48 - relocate_kernel) + 2(%edi) + lgdtl gdt_48 - relocate_kernel(%edi) + + /* setup data segment registers */ + mov $(gdt_ds - gdt), %eax + mov %eax, %ds + mov %eax, %es + mov %eax, %fs + mov %eax, %gs + mov %eax, %ss + /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%edi), %esp - /* jump to identity mapped page */ + /* load new code segment and jump to identity mapped page */ + pushl $0 + pushl $(gdt_cs - gdt) movl %edi, %eax addl $(identity_mapped - relocate_kernel), %eax pushl %eax - ret + iretl identity_mapped: + /* set return address to 0 if not preserving context */ + pushl $0 /* store the start address on the stack */ pushl %edx @@ -271,5 +291,22 @@ popl %ebp ret + .align 16 +gdt: + .quad 0x0000000000000000 /* NULL descriptor */ +gdt_cs: + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ +gdt_ds: + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +gdt_end: + +gdt_48: + .word gdt_end - gdt - 1 /* limit */ + .long 0 /* base - filled in by code above */ + +idt_48: + .word 0 /* limit */ + .long 0 /* base */ + .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel --- linux-ec2-2.6.32.orig/arch/x86/kernel/smp.c +++ linux-ec2-2.6.32/arch/x86/kernel/smp.c @@ -158,10 +158,10 @@ irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -178,9 +178,12 @@ if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -226,7 +229,7 @@ .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, --- linux-ec2-2.6.32.orig/arch/x86/kernel/msr.c +++ linux-ec2-2.6.32/arch/x86/kernel/msr.c @@ -176,6 +176,9 @@ struct cpuinfo_x86 *c = &cpu_data(cpu); int ret = 0; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); @@ -251,7 +254,7 @@ int i, err = 0; i = 0; - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { printk(KERN_ERR "msr: unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; @@ -279,7 +282,7 @@ msr_device_destroy(i); class_destroy(msr_class); out_chrdev: - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); out: return err; } @@ -290,7 +293,7 @@ for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } --- linux-ec2-2.6.32.orig/arch/x86/kernel/entry_32.S +++ linux-ec2-2.6.32/arch/x86/kernel/entry_32.S @@ -393,7 +393,7 @@ CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp + movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* * Interrupts are disabled here, but we can't trace it until @@ -445,8 +445,9 @@ jnz sysenter_audit sysenter_do_call: cmpl $(nr_syscalls), %eax - jae syscall_badsys + jae sysenter_badsys call *sys_call_table(,%eax,4) +sysenter_after_call: movl %eax,PT_EAX(%esp) LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) @@ -527,6 +528,7 @@ jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) +syscall_after_call: movl %eax,PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT @@ -541,6 +543,7 @@ restore_all: TRACE_IRQS_IRET restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -551,6 +554,7 @@ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS +#endif restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 @@ -567,13 +571,9 @@ .long irq_return,iret_exc .previous +#ifdef CONFIG_X86_ESPFIX32 CFI_RESTORE_STATE ldt_ss: - larl PT_OLDSS(%esp), %eax - jnz restore_nocheck - testl $0x00400000, %eax # returning to 32bit stack? - jnz restore_nocheck # allright, normal return - #ifdef CONFIG_PARAVIRT /* * The kernel can't run on a non-flat stack if paravirt mode @@ -617,6 +617,7 @@ lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck +#endif CFI_ENDPROC ENDPROC(system_call) @@ -701,8 +702,13 @@ END(syscall_fault) syscall_badsys: - movl $-ENOSYS,PT_EAX(%esp) - jmp resume_userspace + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call END(syscall_badsys) CFI_ENDPROC @@ -734,6 +740,7 @@ * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. */ +#ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ PER_CPU(gdt_page, %ebx) mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ @@ -746,8 +753,10 @@ CFI_ADJUST_CFA_OFFSET 4 lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 +#endif .endm .macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax @@ -758,6 +767,7 @@ /* switch to normal stack */ FIXUP_ESPFIX_STACK 27: +#endif .endm /* @@ -995,7 +1005,7 @@ CFI_ENDPROC ENDPROC(kernel_thread_helper) -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. */ ENTRY(xen_sysenter_target) @@ -1057,7 +1067,6 @@ lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f - addl $16,%esp jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 5: pushl $0 # EAX == 0 => Category 1 (Bad segment) CFI_ADJUST_CFA_OFFSET 4 @@ -1088,7 +1097,7 @@ .previous ENDPROC(xen_failsafe_callback) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -1270,7 +1279,7 @@ * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. * @@ -1282,7 +1291,7 @@ cmpw $__KERNEL_CS, 4(%esp) jne \ok \label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp + movl SYSENTER_stack_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip pushfl @@ -1321,6 +1330,7 @@ */ ENTRY(nmi) RING0_INT_FRAME +#ifdef CONFIG_X86_ESPFIX32 pushl %eax CFI_ADJUST_CFA_OFFSET 4 movl %ss, %eax @@ -1328,6 +1338,7 @@ popl %eax CFI_ADJUST_CFA_OFFSET -4 je nmi_espfix_stack +#endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl %eax @@ -1370,6 +1381,7 @@ FIX_STACK 24, nmi_stack_correct, 1 jmp nmi_stack_correct +#ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * @@ -1395,6 +1407,7 @@ lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 jmp irq_return +#endif CFI_ENDPROC END(nmi) --- linux-ec2-2.6.32.orig/arch/x86/kernel/sfi.c +++ linux-ec2-2.6.32/arch/x86/kernel/sfi.c @@ -31,7 +31,7 @@ #include #include -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; void __init mp_sfi_register_lapic_address(unsigned long address) @@ -99,9 +99,12 @@ pentry++; } +#ifndef CONFIG_XEN WARN(pic_mode, KERN_WARNING "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); pic_mode = 0; +#endif + return 0; } #endif /* CONFIG_X86_IO_APIC */ @@ -111,7 +114,7 @@ */ int __init sfi_platform_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) mp_sfi_register_lapic_address(sfi_lapic_addr); sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); #endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/crash_dump_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/crash_dump_64.c @@ -34,7 +34,7 @@ if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (!vaddr) return -ENOMEM; @@ -46,6 +46,7 @@ } else memcpy(buf, vaddr + offset, csize); + set_iounmap_nonlazy(); iounmap(vaddr); return csize; } --- linux-ec2-2.6.32.orig/arch/x86/kernel/trampoline.c +++ linux-ec2-2.6.32/arch/x86/kernel/trampoline.c @@ -1,6 +1,7 @@ #include #include +#include #include #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) @@ -39,3 +40,19 @@ memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } + +void __init setup_trampoline_page_table(void) +{ +#ifdef CONFIG_X86_32 + /* Copy kernel address range */ + clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* Initialize low mappings */ + clone_pgd_range(trampoline_pg_dir, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); +#endif +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/tls.c +++ linux-ec2-2.6.32/arch/x86/kernel/tls.c @@ -28,6 +28,21 @@ return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -67,6 +82,9 @@ if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -163,7 +181,7 @@ { const struct desc_struct *tls; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; @@ -197,8 +215,9 @@ { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; @@ -210,6 +229,10 @@ else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); --- linux-ec2-2.6.32.orig/arch/x86/kernel/paravirt.c +++ linux-ec2-2.6.32/arch/x86/kernel/paravirt.c @@ -345,6 +345,9 @@ .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = native_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = native_load_gdt, .load_idt = native_load_idt, .store_gdt = native_store_gdt, --- linux-ec2-2.6.32.orig/arch/x86/kernel/sys_x86_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/sys_x86_64.c @@ -23,26 +23,11 @@ unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } --- linux-ec2-2.6.32.orig/arch/x86/kernel/pci-dma-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/pci-dma-xen.c @@ -0,0 +1,408 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int forbid_dac __read_mostly; + +struct dma_map_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + +static int iommu_sac_force __read_mostly; + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly = 0; +#endif + +int iommu_merge __read_mostly = 0; + +int no_iommu __read_mostly; +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; + +dma_addr_t bad_dma_address __read_mostly = 0; +EXPORT_SYMBOL(bad_dma_address); + +/* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible + to older i386. */ +struct device x86_dma_fallback_dev = { + .init_name = "fallback device", + .coherent_dma_mask = DMA_BIT_MASK(32), + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, +}; +EXPORT_SYMBOL(x86_dma_fallback_dev); + +/* Number of entries preallocated for DMA-API debugging */ +#define PREALLOC_DMA_DEBUG_ENTRIES 32768 + +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; + + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) +static __initdata void *dma32_bootmem_ptr; +static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); + +static int __init parse_dma32_size_opt(char *p) +{ + if (!p) + return -EINVAL; + dma32_bootmem_size = memparse(p, &p); + return 0; +} +early_param("dma32_size", parse_dma32_size_opt); + +void __init dma32_reserve_bootmem(void) +{ + unsigned long size, align; + if (max_pfn <= MAX_DMA32_PFN) + return; + + /* + * check aperture_64.c allocate_aperture() for reason about + * using 512M as goal + */ + align = 64ULL<<20; + size = roundup(dma32_bootmem_size, align); + dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, + 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); + if (dma32_bootmem_ptr) + dma32_bootmem_size = size; + else + dma32_bootmem_size = 0; +} +static void __init dma32_free_bootmem(void) +{ + + if (max_pfn <= MAX_DMA32_PFN) + return; + + if (!dma32_bootmem_ptr) + return; + + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size); + + dma32_bootmem_ptr = NULL; + dma32_bootmem_size = 0; +} +#endif + +static struct dma_map_ops swiotlb_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .mapping_error = swiotlb_dma_mapping_error, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, + .sync_single_range_for_device = swiotlb_sync_single_range_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .dma_supported = swiotlb_dma_supported +}; + +void __init pci_iommu_alloc(void) +{ +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); +#endif + + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ + gart_iommu_hole_init(); + + detect_calgary(); + + detect_intel_iommu(); + + amd_iommu_detect(); + + swiotlb_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + dma_ops = &swiotlb_dma_ops; + } +} + +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; +#ifndef CONFIG_XEN + dma_addr_t addr; +#else + void *memory; +#endif + unsigned int order = get_order(size); + + dma_mask = dma_alloc_coherent_mask(dev, flag); + +#ifndef CONFIG_XEN + flag |= __GFP_ZERO; +again: +#else + flag &= ~(__GFP_DMA | __GFP_DMA32); +#endif + page = alloc_pages_node(dev_to_node(dev), flag, order); + if (!page) + return NULL; + +#ifndef CONFIG_XEN + addr = page_to_phys(page); + if (addr + size > dma_mask) { + __free_pages(page, order); + + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +#else + memory = page_address(page); + if (xen_create_contiguous_region((unsigned long)memory, order, + fls64(dma_mask))) { + __free_pages(page, order); + return NULL; + } + + *dma_addr = virt_to_bus(memory); + return memset(memory, 0, size); +#endif +} + +#ifdef CONFIG_XEN +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + unsigned int order = get_order(size); + unsigned long va = (unsigned long)vaddr; + + xen_destroy_contiguous_region(va, order); + free_pages(va, order); +} +#endif + +/* + * See for the iommu kernel parameter + * documentation. + */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = -1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif + if (!strncmp(p, "pt", 2)) + iommu_pass_through = 1; + + gart_parse_options(p); + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +int range_straddles_page_boundary(paddr_t p, size_t size) +{ + unsigned long pfn = p >> PAGE_SHIFT; + unsigned int offset = p & ~PAGE_MASK; + + return ((offset + size > PAGE_SIZE) && + !check_pages_physically_contiguous(pfn, offset, size)); +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + dev_info(dev, "PCI: Disallowing DAC for device\n"); + return 0; + } +#endif + + if (ops->dma_supported) + return ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { + dev_info(dev, "Force SAC with mask %Lx\n", mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +static int __init pci_iommu_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif + + calgary_iommu_init(); + + intel_iommu_init(); + + amd_iommu_init(); + + gart_iommu_init(); + + no_iommu_init(); + return 0; +} + +void pci_iommu_shutdown(void) +{ + gart_iommu_shutdown(); + + amd_iommu_shutdown(); +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/x86_init-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/x86_init-xen.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2009 Thomas Gleixner + * + * For licencing details see kernel-base/COPYING + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. + */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = default_machine_specific_memory_setup, + }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = NULL, + .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = NULL, + .intr_init = NULL, + .trap_init = x86_init_noop, + }, + + .oem = { + .arch_setup = xen_arch_setup, + .banner = x86_init_noop, + }, + + .paging = { + .pagetable_setup_start = x86_init_pgd_noop, + .pagetable_setup_done = x86_init_pgd_noop, + }, + + .timers = { + .setup_percpu_clockev = NULL, + .tsc_pre_init = x86_init_noop, + .timer_init = x86_init_noop, + }, +}; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = NULL, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, +}; --- linux-ec2-2.6.32.orig/arch/x86/kernel/relocate_kernel_64.S +++ linux-ec2-2.6.32/arch/x86/kernel/relocate_kernel_64.S @@ -91,15 +91,34 @@ /* Switch to the identity mapped page tables */ movq %r9, %cr3 + /* setup idt */ + lidtq idt_80 - relocate_kernel(%r8) + + /* setup gdt */ + leaq gdt - relocate_kernel(%r8), %rax + movq %rax, (gdt_80 - relocate_kernel) + 2(%r8) + lgdtq gdt_80 - relocate_kernel(%r8) + + /* setup data segment registers */ + xorl %eax, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + /* setup a new stack at the end of the physical control page */ lea PAGE_SIZE(%r8), %rsp - /* jump to identity mapped page */ + /* load new code segment and jump to identity mapped page */ addq $(identity_mapped - relocate_kernel), %r8 + pushq $(gdt_cs - gdt) pushq %r8 - ret + lretq identity_mapped: + /* set return address to 0 if not preserving context */ + pushq $0 /* store the start address on the stack */ pushq %rdx @@ -262,5 +281,20 @@ 3: ret + .align 16 +gdt: + .quad 0x0000000000000000 /* NULL descriptor */ +gdt_cs: + .quad 0x00af9a000000ffff +gdt_end: + +gdt_80: + .word gdt_end - gdt - 1 /* limit */ + .quad 0 /* base - filled in by code above */ + +idt_80: + .word 0 /* limit */ + .quad 0 /* base */ + .globl kexec_control_code_size .set kexec_control_code_size, . - relocate_kernel --- linux-ec2-2.6.32.orig/arch/x86/kernel/asm-offsets_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/asm-offsets_32.c @@ -20,10 +20,14 @@ #include #include +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) #include +#endif +#ifdef CONFIG_LGUEST_GUEST #include #include "../../../drivers/lguest/lg.h" +#endif /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -55,6 +59,7 @@ OFFSET(TI_exec_domain, thread_info, exec_domain); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); + OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); @@ -93,9 +98,14 @@ OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#ifndef CONFIG_X86_NO_TSS /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - + DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); +#else + /* sysenter stack points directly to sp0 */ + DEFINE(SYSENTER_stack_sp0, 0); +#endif DEFINE(PAGE_SIZE_asm, PAGE_SIZE); DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); @@ -105,6 +115,11 @@ OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_START_mfn_list, start_info, mfn_list); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); @@ -117,7 +132,7 @@ OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); --- linux-ec2-2.6.32.orig/arch/x86/kernel/process_64-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/process_64-xen.c @@ -0,0 +1,753 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + * + * Jun Nakajima + * Modified for Xen + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +asmlinkage extern void ret_from_fork(void); + +DEFINE_PER_CPU(unsigned long, old_rsp); +static DEFINE_PER_CPU(unsigned char, is_idle); + +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + current_thread_info()->status |= TS_POLLING; + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_stop_sched_tick(1); + while (!need_resched()) { + + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); + enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + xen_idle(); + start_critical_timings(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ + __exit_idle(); + } + + tick_nohz_restart_sched_tick(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + const char *board; + + printk("\n"); + print_modules(); + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board); + printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->ax, regs->bx, regs->cx); + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->dx, regs->si, regs->di); + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + regs->bp, regs->r8, regs->r9); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("mov %%fs,%0" : "=r" (fsindex)); + asm("mov %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); +} + +void show_regs(struct pt_regs *regs) +{ + printk(KERN_INFO "CPU %d:", smp_processor_id()); + __show_regs(regs, 1); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); +} + +void xen_load_gs_index(unsigned gs) +{ + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct desc_struct *desc = t->thread.tls_array; + desc += tls; + fill_ldt(desc, &ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + return get_desc_base(&t->thread.tls_array[tls]); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long unused, + struct task_struct *p, struct pt_regs *regs) +{ + int err; + struct pt_regs *childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) + (THREAD_SIZE + task_stack_page(p))) - 1; + *childregs = *regs; + + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + p->thread.usersp = me->thread.usersp; + + set_tsk_thread_flag(p, TIF_FORK); + + savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); + + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + p->thread.iopl = current->thread.iopl; + + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + loadsegment(fs, 0); + loadsegment(es, 0); + loadsegment(ds, 0); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + percpu_write(old_rsp, new_sp); + regs->cs = __USER_CS; + regs->ss = __USER_DS; + regs->flags = 0x200; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +EXPORT_SYMBOL_GPL(start_thread); + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. + */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + bool preload_fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + + /* we're going to use this soon, after a few expensive things */ + if (preload_fpu) + prefetch(next->xstate); + + /* + * This is basically '__unlazy_fpu', except that we queue a + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + * The AMD workaround requires it to be after DS reload, or + * after DS has been cleared, which we do in __prepare_arch_switch. + */ + if (task_thread_info(prev_p)->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + if (!preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } + } else + prev_p->fpu_counter = 0; + + /* Make sure cpu is ready for new context */ + if (preload_fpu) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + mcl++; + } + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + if (unlikely(next->es)) + loadsegment(es, next->es); + + if (unlikely(next->ds)) + loadsegment(ds, next->ds); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_end_context_switch(next_p); + + /* + * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. + */ + if (unlikely(next->fsindex)) + loadsegment(fs, next->fsindex); + + if (next->fs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs)); + + if (unlikely(next->gsindex)) + load_gs_index(next->gsindex); + + if (next->gs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); + + /* + * Switch the PDA context. + */ + prev->usersp = percpu_read(old_rsp); + percpu_write(old_rsp, next->usersp); + percpu_write(current_task, next_p); + + percpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + + /* + * Now maybe reload the debug registers + */ + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) + __switch_to_xtra(prev_p, next_p); + + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. + */ + if (preload_fpu) + __math_state_restore(); + return prev_p; +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + putname(filename); + return error; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit childs are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.sp); + do { + if (fp < (unsigned long)stack || + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = HYPERVISOR_set_segment_base( + SEGBASE_GS_USER, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + loadsegment(fs, FS_TLS_SEL); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + loadsegment(fs, 0); + ret = HYPERVISOR_set_segment_base(SEGBASE_FS, + addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) + rdmsrl(MSR_FS_BASE, base); + else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + unsigned gsindex; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + savesegment(gs, gsindex); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + --- linux-ec2-2.6.32.orig/arch/x86/kernel/head_64-xen.S +++ linux-ec2-2.6.32/arch/x86/kernel/head_64-xen.S @@ -0,0 +1,151 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + * Jun Nakajima + * Modified for Xen + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + __HEAD + .code64 + .globl startup_64 +startup_64: + movq $(init_thread_union+THREAD_SIZE-8),%rsp + + /* rsi is pointer to startup info structure. + pass it to C */ + movq %rsi,%rdi + + /* Set up %gs. + * + * The base of %gs always points to the bottom of the irqstack + * union. If the stack protector canary is enabled, it is + * located at %gs:40. Note that, on SMP, the boot cpu uses + * init data section till per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + pushq $0 # fake return address + jmp x86_64_start_kernel + +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ + phys_##name = . - .head.text; \ +ENTRY(name) + +NEXT_PAGE(init_level4_pgt) + .fill 512,8,0 + +NEXT_PAGE(level3_kernel_pgt) + .fill 512,8,0 + + /* + * This is used for vsyscall area mapping as we have a different + * level4 page table for user. + */ +NEXT_PAGE(level3_user_pgt) + .fill 512,8,0 + +NEXT_PAGE(level2_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(hypercall_page) + CFI_STARTPROC + .rept 0x1000 / 0x20 + .skip 1 /* push %rcx */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 2 /* push %r11 */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 5 /* mov $#,%eax */ + .skip 2 /* syscall */ + .skip 2 /* pop %r11 */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 + .skip 1 /* pop %rcx */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + .align 0x20,0 /* ret */ + .endr + CFI_ENDPROC + +#undef NEXT_PAGE + + __PAGE_ALIGNED_BSS + .align PAGE_SIZE +ENTRY(empty_zero_page) + .skip PAGE_SIZE + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoh value + i = 64 + .rept 16 + i = i - 4 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) + .endr +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoh __START_KERNEL_map + .ascii ",ELF_PADDR_OFFSET=0x" + utoh __START_KERNEL_map + .ascii ",VIRT_ENTRY=0x" + utoh (__START_KERNEL_map + __PHYSICAL_START) + .ascii ",HYPERCALL_PAGE=0x" + utoh (phys_hypercall_page >> PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|supervisor_mode_kernel" + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad __START_KERNEL_map) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) --- linux-ec2-2.6.32.orig/arch/x86/kernel/setup_percpu.c +++ linux-ec2-2.6.32/arch/x86/kernel/setup_percpu.c @@ -224,7 +224,7 @@ * are zeroed indicating that the static arrays are * gone. */ -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = @@ -248,7 +248,7 @@ } /* indicate the early static arrays will soon be gone */ -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/head_32-xen.S +++ linux-ec2-2.6.32/arch/x86/kernel/head_32-xen.S @@ -0,0 +1,198 @@ + + +.text +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +__HEAD +#define VIRT_ENTRY_OFFSET 0x0 +.org VIRT_ENTRY_OFFSET +ENTRY(startup_32) + movl %esi,xen_start_info + cld + + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + XEN_CPUID + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + XEN_CPUID + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + + movb $1,X86_HARD_MATH + +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * The linker can't handle this by relocation. Manually set + * base address in stack canary segment descriptor. + */ + movl $per_cpu__gdt_page,%eax + movl $per_cpu__stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + # %esi still points to start_info, and no registers + # need to be preserved. + + movl XEN_START_mfn_list(%esi), %ebx + movl $(per_cpu__gdt_page - __PAGE_OFFSET), %eax + shrl $PAGE_SHIFT, %eax + movl (%ebx,%eax,4), %ecx + pushl %ecx # frame number for set_gdt below + + xorl %esi, %esi + xorl %edx, %edx + shldl $PAGE_SHIFT, %ecx, %edx + shll $PAGE_SHIFT, %ecx + orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx + movl $per_cpu__gdt_page, %ebx + movl $__HYPERVISOR_update_va_mapping, %eax + int $0x82 + + movl $(PAGE_SIZE_asm / 8), %ecx + movl %esp, %ebx + movl $__HYPERVISOR_set_gdt, %eax + int $0x82 + + popl %ecx + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs + + cld # gcc2 wants the direction flag cleared at all times + + pushl $0 # fake return address for unwinder + jmp i386_start_kernel + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + CFI_STARTPROC +.skip 0x1000 + CFI_ENDPROC + +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PAGE_SIZE_asm +ENTRY(swapper_pg_fixmap) + .fill 1024,4,0 +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. + */ +.data + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoa value + .if (\value) < 0 || (\value) >= 0x10 + utoa (((\value)>>4)&0x0fffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoa __PAGE_OFFSET + .ascii ",ELF_PADDR_OFFSET=0x" + utoa __PAGE_OFFSET + .ascii ",VIRT_ENTRY=0x" + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) + .ascii ",HYPERCALL_PAGE=0x" + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" + .ascii "|supervisor_mode_kernel" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes[extended-cr3]" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_32) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long _PAGE_PRESENT, _PAGE_PRESENT) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) --- linux-ec2-2.6.32.orig/arch/x86/kernel/process_32.c +++ linux-ec2-2.6.32/arch/x86/kernel/process_32.c @@ -296,15 +296,21 @@ void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { + int cpu; + set_user_gs(regs, 0); regs->fs = 0; - set_fs(USER_DS); regs->ds = __USER_DS; regs->es = __USER_DS; regs->ss = __USER_DS; regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; + + cpu = get_cpu(); + load_user_cs_desc(cpu, current->mm); + put_cpu(); + /* * Free the old FP and other extended state */ @@ -359,6 +365,8 @@ preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; __unlazy_fpu(prev_p); + if (next_p->mm) + load_user_cs_desc(cpu, next_p->mm); /* we're going to use this soon, after a few expensive things */ if (preload_fpu) @@ -497,3 +505,40 @@ return 0; } +static void modify_cs(struct mm_struct *mm, unsigned long limit) +{ + mm->context.exec_limit = limit; + set_user_cs(&mm->context.user_cs, limit); + if (mm == current->mm) { + int cpu; + + cpu = get_cpu(); + load_user_cs_desc(cpu, mm); + put_cpu(); + } +} + +void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) +{ + if (limit > mm->context.exec_limit) + modify_cs(mm, limit); +} + +void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) +{ + struct vm_area_struct *vma; + unsigned long limit = PAGE_SIZE; + + if (old_end == mm->context.exec_limit) { + for (vma = mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + modify_cs(mm, limit); + } +} + +void arch_flush_exec_range(struct mm_struct *mm) +{ + mm->context.exec_limit = 0; + set_user_cs(&mm->context.user_cs, 0); +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/setup.c +++ linux-ec2-2.6.32/arch/x86/kernel/setup.c @@ -109,6 +109,8 @@ #ifdef CONFIG_X86_64 #include #endif +#include +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -666,19 +668,38 @@ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. */ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, + /* + * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so + * match on the product name. + */ + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), + }, + }, #endif {} }; @@ -978,6 +999,8 @@ paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + setup_trampoline_page_table(); + tboot_probe(); #ifdef CONFIG_X86_64 @@ -1031,6 +1054,8 @@ #endif #endif x86_init.oem.banner(); + + mcheck_intel_therm_init(); } #ifdef CONFIG_X86_32 --- linux-ec2-2.6.32.orig/arch/x86/kernel/rtc.c +++ linux-ec2-2.6.32/arch/x86/kernel/rtc.c @@ -171,6 +171,11 @@ unsigned long flags; int retval; +#ifdef CONFIG_XEN + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock()) + return 0; +#endif + spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.set_wallclock(now.tv_sec); spin_unlock_irqrestore(&rtc_lock, flags); @@ -183,6 +188,12 @@ { unsigned long retval, flags; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) { + xen_read_persistent_clock(ts); + return; + } +#endif spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); --- linux-ec2-2.6.32.orig/arch/x86/kernel/mpparse-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/mpparse-xen.c @@ -0,0 +1,1081 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void *_bus_to_virt(unsigned long ma) +{ + return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma); +} + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +#ifndef CONFIG_XEN +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} +#endif + +static void __init MP_processor_info(struct mpc_cpu *m) +{ +#ifndef CONFIG_XEN + int apicid; + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + apicid = x86_init.mpparse.mpc_apic_id(m); + + if (m->cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); +#else /* CONFIG_XEN */ + num_processors++; +#endif +} + +#ifdef CONFIG_X86_IO_APIC +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + set_bit(m->busid, mp_bus_not_pci); + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); + + clear_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_MCA; +#endif + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + if (!(m->flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", + m->apicid, m->apicver, m->apicaddr); + + if (bad_ioapic(m->apicaddr)) + return; + + mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; + mp_ioapics[nr_ioapics].apicid = m->apicid; + mp_ioapics[nr_ioapics].type = m->type; + mp_ioapics[nr_ioapics].apicver = m->apicver; + mp_ioapics[nr_ioapics].flags = m->flags; + nr_ioapics++; +} + +static void print_MP_intsrc_info(struct mpc_intsrc *m) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +static void __init assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + mp_irq->dstapic = m->dstapic; + mp_irq->type = m->type; + mp_irq->irqtype = m->irqtype; + mp_irq->irqflag = m->irqflag; + mp_irq->srcbus = m->srcbus; + mp_irq->srcbusirq = m->srcbusirq; + mp_irq->dstirq = m->dstirq; +} + +static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + m->dstapic = mp_irq->dstapic; + m->type = mp_irq->type; + m->irqtype = mp_irq->irqtype; + m->irqflag = mp_irq->irqflag; + m->srcbus = mp_irq->srcbus; + m->srcbusirq = mp_irq->srcbusirq; + m->dstirq = mp_irq->dstirq; +} + +static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + if (mp_irq->dstapic != m->dstapic) + return 1; + if (mp_irq->type != m->type) + return 2; + if (mp_irq->irqtype != m->irqtype) + return 3; + if (mp_irq->irqflag != m->irqflag) + return 4; + if (mp_irq->srcbus != m->srcbus) + return 5; + if (mp_irq->srcbusirq != m->srcbusirq) + return 6; + if (mp_irq->dstirq != m->dstirq) + return 7; + + return 0; +} + +static void __init MP_intsrc_info(struct mpc_intsrc *m) +{ + int i; + + print_MP_intsrc_info(m); + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} +#endif /* CONFIG_X86_IO_APIC */ + + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { + printk(KERN_ERR "MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->spec); + return 0; + } + if (!mpc->lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + +#ifndef CONFIG_XEN + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); +#endif + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_32 + generic_mps_oem_check(mpc, oem, str); +#endif + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->lapic; +#endif + + if (early) + return 1; + + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); + + /* + * Now process the configuration blocks. + */ + x86_init.mpparse.mpc_record(0); + + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + MP_intsrc_info((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + x86_init.mpparse.mpc_record(1); + } + + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mp_ioapics[0].apicid; + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " + "falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... " + "not using ELCR\n"); + else { + printk(KERN_INFO + "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.irqflag = 13; + else + intsrc.irqflag = 0; + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + +#ifndef CONFIG_XEN + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +#endif + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. (tell your hw vendor)\n"); + early_iounmap(mpc, size); + return -1; + } + early_iounmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init default_get_smp_config(unsigned int early) +{ + struct mpf_intel *mpf = mpf_found; + + if (!mpf) + return; + + if (acpi_lapic && early) + return; + + /* + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table + */ + if (acpi_lapic && acpi_ioapic) + return; + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", + mpf->specification); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + if (mpf->feature2 & (1 << 7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->feature1 != 0) { + if (early) { +#ifndef CONFIG_XEN + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +#endif + return; + } + + printk(KERN_INFO "Default MP configuration #%d\n", + mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); + + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) + return; + } else + BUG(); + + if (!early) + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +#ifndef CONFIG_XEN +static void __init smp_reserve_bootmem(struct mpf_intel *mpf) +{ + unsigned long size = get_mpc_size(mpf->physptr); +#ifdef CONFIG_X86_32 + /* + * We cannot access to MPC table to compute table size yet, + * as only few megabytes from the bottom is mapped now. + * PC-9800's MPC table places on the very last of physical + * memory; so that simply reserving PAGE_SIZE from mpf->physptr + * yields BUG() in reserve_bootmem. + * also need to make sure physptr is below than max_low_pfn + * we don't need reserve the area above max_low_pfn + */ + unsigned long end = max_low_pfn * PAGE_SIZE; + + if (mpf->physptr < end) { + if (mpf->physptr + size > end) + size = end - mpf->physptr; + reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); + } +#else + reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); +#endif +} +#endif + +static int __init smp_scan_config(unsigned long base, unsigned long length, + unsigned reserve) +{ + unsigned int *bp = _bus_to_virt(base); + struct mpf_intel *mpf; + + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->specification == 1) + || (mpf->specification == 4))) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + mpf_found = mpf; + +#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", + mpf, (u64)virt_to_phys(mpf)); + + if (!reserve) + return 1; + reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), + BOOTMEM_DEFAULT); + if (mpf->physptr) + smp_reserve_bootmem(mpf); +#else + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, ((void *)bp - _bus_to_virt(base)) + base); +#endif + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init default_find_smp_config(unsigned int reserve) +{ +#ifndef CONFIG_XEN + unsigned int address; +#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400, reserve) || + smp_scan_config(639 * 0x400, 0x400, reserve) || + smp_scan_config(0xF0000, 0x10000, reserve)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400, reserve); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_MP_intsrc_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + assign_to_mpc_intsrc(&mp_irqs[i], m); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return ret; +} + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) + goto out; + assign_to_mpc_intsrc(&mp_irqs[i], m); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length); + + return 0; +} + +int enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init early_reserve_e820_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) { + u64 startt = 0; +#ifdef CONFIG_X86_TRAMPOLINE + startt = TRAMPOLINE_BASE; +#endif + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); + } +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + + if (!enable_update_mptable) + return 0; + + mpf = mpf_found; + if (!mpf) + return 0; + + /* + * Now see if we need to go further. + */ + if (mpf->feature1 != 0) + return 0; + + if (!mpf->physptr) + return 0; + + mpc = _bus_to_virt(mpf->physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf)); + printk(KERN_INFO "physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the postion */ + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); + if (old == new) { + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + return 0; + } + printk(KERN_INFO "use in-positon replacing\n"); + } else { + maddr_t mpc_new_bus; + + mpc_new_bus = phys_to_machine(mpc_new_phys); + mpf->physptr = mpc_new_bus; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->length); + mpc = mpc_new; + /* check if we can modify that */ + if (mpc_new_bus - mpf->physptr) { + struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = isa_bus_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; + mpf->physptr = mpc_new_bus; + } + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + + return 0; +} + +late_initcall(update_mp_table); --- linux-ec2-2.6.32.orig/arch/x86/kernel/pcspeaker.c +++ linux-ec2-2.6.32/arch/x86/kernel/pcspeaker.c @@ -6,6 +6,11 @@ { struct platform_device *pd; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return 0; +#endif + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); return IS_ERR(pd) ? PTR_ERR(pd) : 0; --- linux-ec2-2.6.32.orig/arch/x86/kernel/fixup.c +++ linux-ec2-2.6.32/arch/x86/kernel/fixup.c @@ -0,0 +1,89 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) + +dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + /* Ignore statically-linked init. */ + if (current->tgid == 1) + return; + + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, + VMASST_TYPE_4gb_segments_notify)); + + if (test_and_set_bit(0, &printed)) + return; + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls glibc libraries. The emulation is **"); + DP("** slow. To ensure full performance you should **"); + DP("** install a 'xen-friendly' (nosegneg) version of **"); + DP("** the library, or disable tls support by executing **"); + DP("** the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for (i = 5; i > 0; i--) { + touch_softlockup_watchdog(); + printk("Pausing... %d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + + printk("Continuing...\n\n"); +} + +static int __init fixup_init(void) +{ + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments_notify)); + return 0; +} +__initcall(fixup_init); --- linux-ec2-2.6.32.orig/arch/x86/kernel/machine_kexec_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/machine_kexec_64.c @@ -19,6 +19,112 @@ #include #include +#ifdef CONFIG_XEN + +/* In the case of Xen, override hypervisor functions to be able to create + * a regular identity mapping page table... + */ + +#include +#include + +#define x__pmd(x) ((pmd_t) { (x) } ) +#define x__pud(x) ((pud_t) { (x) } ) +#define x__pgd(x) ((pgd_t) { (x) } ) + +#define x_pmd_val(x) ((x).pmd) +#define x_pud_val(x) ((x).pud) +#define x_pgd_val(x) ((x).pgd) + +static inline void x_set_pmd(pmd_t *dst, pmd_t val) +{ + x_pmd_val(*dst) = x_pmd_val(val); +} + +static inline void x_set_pud(pud_t *dst, pud_t val) +{ + x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); +} + +static inline void x_pud_clear (pud_t *pud) +{ + x_pud_val(*pud) = 0; +} + +static inline void x_set_pgd(pgd_t *dst, pgd_t val) +{ + x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); +} + +static inline void x_pgd_clear (pgd_t * pgd) +{ + x_pgd_val(*pgd) = 0; +} + +#define X__PAGE_KERNEL_LARGE_EXEC \ + _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE +#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + void *table_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + table_page = page_address(image->control_code_page); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_TABLE_PAGE] = __ma(table_page); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +int __init machine_kexec_setup_resources(struct resource *hypervisor, + struct resource *phys_cpus, + int nr_phys_cpus) +{ + int k; + + /* The per-cpu crash note resources belong to the hypervisor resource */ + for (k = 0; k < nr_phys_cpus; k++) + request_resource(hypervisor, phys_cpus + k); + + return 0; +} + +#else /* CONFIG_XEN */ + +#define x__pmd(x) __pmd(x) +#define x__pud(x) __pud(x) +#define x__pgd(x) __pgd(x) + +#define x_set_pmd(x, y) set_pmd(x, y) +#define x_set_pud(x, y) set_pud(x, y) +#define x_set_pgd(x, y) set_pgd(x, y) + +#define x_pud_clear(x) pud_clear(x) +#define x_pgd_clear(x) pgd_clear(x) + +#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#define X_KERNPG_TABLE _KERNPG_TABLE + +#endif /* CONFIG_XEN */ + static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) { @@ -48,7 +154,7 @@ } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); result = 0; out: return result; @@ -61,7 +167,7 @@ addr &= PAGE_MASK; end_addr = addr + PUD_SIZE; while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); addr += PMD_SIZE; } } @@ -86,12 +192,12 @@ } level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pud_clear(level3p++); + x_pud_clear(level3p++); addr += PUD_SIZE; } out: @@ -121,12 +227,12 @@ result = init_level3_page(image, level3p, addr, last_addr); if (result) goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE)); addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pgd_clear(level4p++); + x_pgd_clear(level4p++); addr += PGDIR_SIZE; } out: @@ -187,8 +293,14 @@ { pgd_t *level4p; int result; + unsigned long x_max_pfn = max_pfn; + +#ifdef CONFIG_XEN + x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); +#endif + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT); if (result) return result; /* @@ -201,47 +313,6 @@ return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl %0,%%ds\n" - "\tmovl %0,%%es\n" - "\tmovl %0,%%ss\n" - "\tmovl %0,%%fs\n" - "\tmovl %0,%%gs\n" - : : "a" (__KERNEL_DS) : "memory" - ); -} - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -263,6 +334,7 @@ free_transition_pgtable(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -308,24 +380,6 @@ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, @@ -339,10 +393,13 @@ __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ VMCOREINFO_SYMBOL(phys_base); +#endif VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA --- linux-ec2-2.6.32.orig/arch/x86/kernel/entry_32-xen.S +++ linux-ec2-2.6.32/arch/x86/kernel/entry_32-xen.S @@ -0,0 +1,1756 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#define nr_syscalls ((syscall_table_size)/4) + +/* Pseudo-eflags. */ +NMI_MASK = 0x80000000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl %gs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0;*/ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0;*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0;*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ecx + popl %edx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edx + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi + popl %ebp + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebp + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE ds;*/ +2: popl %es + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE es;*/ +3: popl %fs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE fs;*/ + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.section __ex_table, "a" + .align 4 + .long 1b, 4b + .long 2b, 5b + .long 3b, 6b +.popsection + POP_GS_EX +.endm + +.macro RING0_INT_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 3*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_EC_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 4*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_PTREGS_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ + CFI_OFFSET eip, PT_EIP-PT_OLDESP + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ + CFI_OFFSET eax, PT_EAX-PT_OLDESP + CFI_OFFSET ebp, PT_EBP-PT_OLDESP + CFI_OFFSET edi, PT_EDI-PT_OLDESP + CFI_OFFSET esi, PT_ESI-PT_OLDESP + CFI_OFFSET edx, PT_EDX-PT_OLDESP + CFI_OFFSET ecx, PT_ECX-PT_OLDESP + CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 + jmp syscall_exit + CFI_ENDPROC +END(ret_from_fork) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_all +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl SYSENTER_stack_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 + pushfl + orl $X86_EFLAGS_IF, (%esp) + CFI_ADJUST_CFA_OFFSET 4 + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp + movl %ebp,PT_EBP(%esp) +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(nr_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(,%eax,4) +sysenter_after_call: + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + GET_VCPU_INFO +#endif + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + + CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # pv sysenter call handler stub +ENTRY(ia32pv_sysenter_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,12(%esp) + movl $__USER_CS,4(%esp) + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + /* fall through */ + CFI_ENDPROC +ENDPROC(ia32pv_sysenter_target) + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) +syscall_after_call: + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: +#else +restore_nocheck: + movl PT_EFLAGS(%esp), %eax + testl $(X86_EFLAGS_VM|NMI_MASK), %eax + CFI_REMEMBER_STATE + jnz hypervisor_iret + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + CFI_REMEMBER_STATE + jnz restore_all_enable_events # != 0 => enable event delivery +#endif + RESTORE_REGS 4 # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long irq_return,iret_exc +.previous + +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_ESPFIX32 + CFI_RESTORE_STATE +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + PER_CPU(gdt_page, %ebx) + shr $16, %edx + mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ + mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + pushl $__ESPFIX_SS + CFI_ADJUST_CFA_OFFSET 4 + push %eax /* new kernel esp */ + CFI_ADJUST_CFA_OFFSET 4 + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck +#endif +#else + CFI_RESTORE_STATE + ALIGN +restore_all_enable_events: + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +scrit: /**** START OF CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + RESTORE_REGS 4 +1: INTERRUPT_RETURN +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +ecrit: /**** END OF CRITICAL REGION ****/ + jmp .Ldo_upcall + + CFI_RESTORE_STATE +hypervisor_iret: + andl $~NMI_MASK, PT_EFLAGS(%esp) + RESTORE_REGS 4 + jmp hypercall_page + (__HYPERVISOR_iret * 32) +#endif + CFI_ENDPROC +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp +#else + movl %esp, %eax +#endif + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call +END(sysenter_badsys) + CFI_ENDPROC + +/* + * System calls that need a pt_regs pointer. + */ +#define PTREGSCALL(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + jmp sys_##name; + +PTREGSCALL(iopl) +PTREGSCALL(fork) +PTREGSCALL(clone) +PTREGSCALL(vfork) +PTREGSCALL(execve) +PTREGSCALL(sigaltstack) +PTREGSCALL(sigreturn) +PTREGSCALL(rt_sigreturn) +PTREGSCALL(vm86) +PTREGSCALL(vm86old) + +#ifndef CONFIG_XEN +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + PER_CPU(gdt_page, %ebx) + mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ + mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + CFI_ADJUST_CFA_OFFSET 4 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + lss (%esp), %esp /* switch to the normal stack segment */ + CFI_ADJUST_CFA_OFFSET -8 +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. + */ +.section .init.rodata,"a" +ENTRY(interrupt) +.text + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + RING0_INT_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl $(~vector+0x80) /* Note: always in signed byte range */ + CFI_ADJUST_CFA_OFFSET 4 + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .long 1b + .text +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + CFI_ENDPROC + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ +ENDPROC(name) + +#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +/* The include is where all of the SMP etc. interrupts come from */ +#include + +#else +#define UNWIND_ESPFIX_STACK + +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +# +# The sysexit critical region is slightly different. sysexit +# atomically removes the entire stack frame. If we interrupt in the +# critical region we know that the entire frame is present and correct +# so we can simply throw away the new one. +ENTRY(hypervisor_callback) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + movl PT_CS(%esp),%ecx + movl PT_EIP(%esp),%eax + andl $SEGMENT_RPL_MASK,%ecx + cmpl $USER_RPL,%ecx + jae .Ldo_upcall + cmpl $scrit,%eax + jb 0f + cmpl $ecrit,%eax + jb critical_region_fixup +0: +#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL + cmpl $sysexit_scrit,%eax + jb .Ldo_upcall + cmpl $sysexit_ecrit,%eax + ja .Ldo_upcall + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. +#endif +.Ldo_upcall: + push %esp + CFI_ADJUST_CFA_OFFSET 4 + call evtchn_do_upcall + add $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_intr + CFI_ENDPROC + +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped + testl %ecx,%ecx + leal (%esp,%ecx,4),%esi # %esi points at end of src region + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region + jle 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp .Ldo_upcall + +.section .rodata,"a" +critical_fixup_table: + .rept __SIZEOF_TEST_PENDING + .byte -1 + .endr + .byte -1,-1 # jnz 14f + .byte 0 # pop %ebx + .byte 1 # pop %ecx + .byte 2 # pop %edx + .byte 3 # pop %esi + .byte 4 # pop %edi + .byte 5 # pop %ebp + .byte 6 # pop %eax + .byte 7 # pop %ds + .byte 8 # pop %es + .byte 9,9 # pop %fs +#ifndef CONFIG_X86_32_LAZY_GS + .byte 10,10 # pop %gs + .byte 11,11,11 # add $4,%esp +#else + .byte 10,10,10 # add $8,%esp +#endif + .byte 12 # iret + .rept __SIZEOF_DISABLE_INTERRUPTS + .byte -1 + .endr +.previous + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + RING0_INT_FRAME + pushl $0 + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: xorl %eax,%eax; \ + movl %eax,4(%esp); \ + jmp 1b; \ +7: xorl %eax,%eax; \ + movl %eax,8(%esp); \ + jmp 2b; \ +8: xorl %eax,%eax; \ + movl %eax,12(%esp); \ + jmp 3b; \ +9: xorl %eax,%eax; \ + movl %eax,16(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous +#endif + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(simd_coprocessor_error) + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_device_not_available + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret +.section __ex_table,"a" + .align 4 + .long native_iret, iret_exc +.previous +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(overflow) + +ENTRY(bounds) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(bounds) + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(invalid_TSS) + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(segment_not_present) + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(stack_segment) + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(alignment_check) + +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(machine_check) +#endif + +#ifndef CONFIG_XEN +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +#endif /* !CONFIG_XEN */ + +ENTRY(fixup_4gb_segment) + RING0_EC_FRAME + pushl $do_fixup_4gb_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(spurious_interrupt_bug) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, PT_EBX(%edx) + xorl %ebx, %ebx + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl $__KERNEL_PERCPU, PT_FS(%edx) + movl $__KERNEL_STACK_CANARY, PT_GS(%edx) + movl %eax, PT_OLDESP(%edx) + movl 16(%esp), %eax + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, PT_CS(%edx) + movl %eax, 12(%esp) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl %ebx, PT_EFLAGS(%edx) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + ud2 # padding for call trace + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + +#include + + # pv syscall call handler stub +ENTRY(ia32pv_cstar_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,%ecx + movl $__USER_CS,4(%esp) + movl 12(%esp),%ebp + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-4,%ebp + CFI_REMEMBER_STATE + ja cstar_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,cstar_fault +.previous + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz cstar_trace_entry + cmpl $nr_syscalls,%eax + jae cstar_badsys +.Lcstar_call: + btl %eax,cstar_special + jc .Lcstar_special + call *cstar_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +.Lcstar_exit: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_exit +.Lcstar_special: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_call +cstar_set_tif: + movl $cstar_clear_tif,(%esp) # replace return address + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + jmp *sys_call_table(,%eax,4) +cstar_clear_tif: + movl %eax,PT_EAX(%esp) # store the return value + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp .Lcstar_exit +cstar_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + cmpl $nr_syscalls,%eax + jae 1f + btl %eax,cstar_special + jc .Lcstar_trace_special +1: movl %esp,%eax + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + call syscall_trace_enter + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + /* What it returned is what we'll actually use. */ + cmpl $nr_syscalls,%eax + jb .Lcstar_call + jmp .Lcstar_exit +.Lcstar_trace_special: + movl PT_ECX(%esp),%ecx + movl %esp,%eax + movl %ecx,PT_EBP(%esp) # put user EBP back in place + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $nr_syscalls,%eax + jb syscall_call + jmp syscall_exit +cstar_badsys: + movl $-ENOSYS,PT_EAX(%esp) +.Lcstar_resume: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp resume_userspace + CFI_RESTORE_STATE +cstar_fault: + movl $-EFAULT,%eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + jmp .Lcstar_resume + CFI_ENDPROC +ENDPROC(ia32pv_cstar_target) + +ENTRY(cstar_ret_from_fork) + CFI_STARTPROC + movl PT_ECX(%esp),%ecx + GET_THREAD_INFO(%ebp) + movl %ecx,PT_EBP(%esp) # put user EBP back in place + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp ret_from_fork + CFI_ENDPROC +END(ret_from_fork) + +.section .rodata,"a" +#include "syscall_table_32.S" + +syscall_table_size=(.-sys_call_table) + +#include +cstar_special: +nr=0 +mask=0 +.rept nr_syscalls+31 + .irp n, __NR_sigreturn, __NR_rt_sigreturn + .if nr == \n + mask = mask | (1 << (\n & 31)) + .endif + .endr + nr = nr + 1 + .if (nr & 31) == 0 + .long mask + mask = 0 + .endif +.endr +#define sys_call_table cstar_call_table +#define sys_fork cstar_set_tif +#define sys_clone cstar_set_tif +#define sys_vfork cstar_set_tif +#include "syscall_table_32.S" +#undef sys_call_table +#undef sys_fork +#undef sys_clone +#undef sys_vfork + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + CFI_DEF_CFA esp, 0 + CFI_UNDEFINED eip + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $__KERNEL_CS + CFI_ADJUST_CFA_OFFSET 4 + pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 +.endm +#endif /* CONFIG_XEN */ + +ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack +#endif + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return +#endif +#else + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, PT_EFLAGS(%esp) + jmp restore_all +#endif + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection --- linux-ec2-2.6.32.orig/arch/x86/kernel/vsyscall_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/vsyscall_64.c @@ -73,7 +73,8 @@ write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { unsigned long flags; @@ -82,7 +83,7 @@ vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; --- linux-ec2-2.6.32.orig/arch/x86/kernel/traps.c +++ linux-ec2-2.6.32/arch/x86/kernel/traps.c @@ -115,6 +115,67 @@ if (!user_mode_vm(regs)) die(str, regs, err); } + +static inline int +__compare_user_cs_desc(const struct desc_struct *desc1, + const struct desc_struct *desc2) +{ + return ((desc1->limit0 != desc2->limit0) || + (desc1->limit != desc2->limit) || + (desc1->base0 != desc2->base0) || + (desc1->base1 != desc2->base1) || + (desc1->base2 != desc2->base2)); +} + +/* + * lazy-check for CS validity on exec-shield binaries: + * + * the original non-exec stack patch was written by + * Solar Designer . Thanks! + */ +static int +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) +{ + struct desc_struct *desc1, *desc2; + struct vm_area_struct *vma; + unsigned long limit; + + if (current->mm == NULL) + return 0; + + limit = -1UL; + if (current->mm->context.exec_limit != -1UL) { + limit = PAGE_SIZE; + spin_lock(¤t->mm->page_table_lock); + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + vma = get_gate_vma(current); + if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + spin_unlock(¤t->mm->page_table_lock); + if (limit >= TASK_SIZE) + limit = -1UL; + current->mm->context.exec_limit = limit; + } + set_user_cs(¤t->mm->context.user_cs, limit); + + desc1 = ¤t->mm->context.user_cs; + desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; + + if (__compare_user_cs_desc(desc1, desc2)) { + /* + * The CS was not in sync - reload it and retry the + * instruction. If the instruction still faults then + * we won't hit this branch next time around. + */ + load_user_cs_desc(cpu, current->mm); + + return 1; + } + + return 0; +} #endif static void __kprobes @@ -220,28 +281,41 @@ DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_ESPFIX64 + extern unsigned char native_irq_return_iret[]; + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we + * end up promoting it to a doublefault. In that case, modify + * the stack to make it look like we just entered the #GP + * handler from user space, similar to bad_iret. + */ + if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { + struct pt_regs *normal_regs = task_pt_regs(current); + + /* Fake a #GP(0) from userspace. */ + memmove(&normal_regs->ip, (void *)regs->sp, 5*8); + normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + regs->ip = (unsigned long)general_protection; + regs->sp = (unsigned long)&normal_regs->orig_ax; + return; + } +#endif + /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); @@ -273,6 +347,20 @@ if (!user_mode(regs)) goto gp_in_kernel; +#ifdef CONFIG_X86_32 +{ + int cpu; + int ok; + + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + + if (ok) + return; +} +#endif + tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; @@ -500,6 +588,35 @@ *regs = *eregs; return regs; } + +struct bad_iret_stack { + void *error_entry_ret; + struct pt_regs regs; +}; + +asmlinkage +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +{ + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault + * correctly, we want move our stack frame to task_pt_regs + * and we want to pretend that the exception came from the + * iret target. + */ + struct bad_iret_stack *new_stack = + container_of(task_pt_regs(current), + struct bad_iret_stack, regs); + + /* Copy the IRET target to the new stack. */ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + + /* Copy the remainder of the stack from the current stack. */ + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + + BUG_ON(!user_mode_vm(&new_stack->regs)); + return new_stack; +} #endif /* @@ -881,11 +998,29 @@ } #ifdef CONFIG_X86_32 +/* + * The fixup code for errors in iret jumps to here (iret_exc). It loses + * the original trap number and erorr code. The bogus trap 32 and error + * code 0 are what the vanilla kernel delivers via: + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) + * + * NOTE: Because of the final "1" in the macro we need to enable interrupts. + * + * In case of a general protection fault in the iret instruction, we + * need to check for a lazy CS update for exec-shield. + */ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; + int ok; + int cpu; local_irq_enable(); + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + if (ok) return; + info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_BADSTK; @@ -927,7 +1062,7 @@ set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(12, &stack_segment); set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); --- linux-ec2-2.6.32.orig/arch/x86/kernel/mmconf-fam10h_64.c +++ linux-ec2-2.6.32/arch/x86/kernel/mmconf-fam10h_64.c @@ -218,6 +218,16 @@ val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | FAM10H_MMIO_CONF_ENABLE; wrmsrl(address, val); + +#ifdef CONFIG_XEN + { + u64 val2; + + rdmsrl(address, val2); + if (val2 != val) + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; + } +#endif } static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) --- linux-ec2-2.6.32.orig/arch/x86/kernel/quirks-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/quirks-xen.c @@ -0,0 +1,553 @@ +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include +#include + +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config, rev; + u16 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. + */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); + + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. + */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); + + if (!(word & (1 << 13))) { + struct xen_platform_op op; + + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " + "disabling irq balancing and affinity\n"); + op.cmd = XENPF_platform_quirk; + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; + WARN_ON(HYPERVISOR_platform_op(&op)); + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, + quirk_intel_irqbalance); +#endif + +#if defined(CONFIG_HPET_TIMER) +#include + +unsigned long force_hpet_address; + +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME, + NVIDIA_FORCE_HPET_RESUME, + ATI_FORCE_HPET_RESUME, +} force_hpet_resume_type; + +static void __iomem *rcba_base; + +static void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + BUG_ON(rcba_base == NULL); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + + return; +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 uninitialized_var(rcba); + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " + "cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap_nocache(rcba, 0x4000); + if (rcba_base == NULL) { + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " + "cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + dev_printk(KERN_DEBUG, &dev->dev, + "Failed to force enable HPET\n"); + } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ + ich_force_enable_hpet); + +static struct pci_dev *cached_dev; + +static void hpet_print_force_info(void) +{ + printk(KERN_INFO "HPET not enabled in BIOS. " + "You might try hpet=force boot option\n"); +} + +static void old_ich_force_hpet_resume(void) +{ + u32 val; + u32 uninitialized_var(gen_cntl); + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 uninitialized_var(gen_cntl); + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +/* + * Undocumented chipset features. Make sure that the user enforced + * this. + */ +static void old_ich_force_enable_hpet_user(struct pci_dev *dev) +{ + if (hpet_force_user) + old_ich_force_enable_hpet(dev); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); + +static void ati_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x14, 0xfed00000); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + u32 d; + u8 b; + + pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + pci_write_config_byte(dev, 0xac, b); + pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + pci_write_config_dword(dev, 0x70, d); + pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + return d; +} + +static void ati_force_enable_hpet(struct pci_dev *dev) +{ + u32 d, val; + u8 b; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ + pci_write_config_dword(dev, 0x14, 0xfed00000); + pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + + force_hpet_address = val; + force_hpet_resume_type = ATI_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + ati_force_enable_hpet); + +/* + * Undocumented chipset feature taken from LinuxBIOS. + */ +static void nvidia_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x44, 0xfed00001); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void nvidia_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_write_config_dword(dev, 0x44, 0xfed00001); + pci_read_config_dword(dev, 0x44, &val); + force_hpet_address = val & 0xfffffffe; + force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; + return; +} + +/* ISA Bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051, + nvidia_force_enable_hpet); + +/* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, + nvidia_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; + default: + break; + } +} + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 node; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); + pci_dev_put(nb_ht); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); +#endif --- linux-ec2-2.6.32.orig/arch/x86/kernel/smpboot.c +++ linux-ec2-2.6.32/arch/x86/kernel/smpboot.c @@ -70,7 +70,6 @@ #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; -static int low_mappings; #endif /* State of each CPU */ @@ -88,6 +87,25 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) + +/* + * We need this for trampoline_base protection from concurrent accesses when + * off- and onlining cores wildly. + */ +static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); + +void cpu_hotplug_driver_lock() +{ + mutex_lock(&x86_cpu_hotplug_driver_mutex); +} + +void cpu_hotplug_driver_unlock() +{ + mutex_unlock(&x86_cpu_hotplug_driver_mutex); +} + +ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } +ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) @@ -273,6 +291,18 @@ * fragile that we want to limit the things done here to the * most necessary things. */ + +#ifdef CONFIG_X86_32 + /* + * Switch away from the trampoline page-table + * + * Do this before cpu_init() because it needs to access per-cpu + * data which may not be mapped in the trampoline page-table. + */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + vmi_bringup(); cpu_init(); preempt_disable(); @@ -291,17 +321,18 @@ enable_8259A_irq(0); } -#ifdef CONFIG_X86_32 - while (low_mappings) - cpu_relax(); - __flush_tlb_all(); -#endif - /* This must be done before setting cpu_online_mask */ set_cpu_sibling_map(raw_smp_processor_id()); wmb(); /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(); +#endif + + /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of * IPI recipients, and the time when the determination is made @@ -722,6 +753,7 @@ #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -866,20 +898,8 @@ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; -#ifdef CONFIG_X86_32 - /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); - flush_tlb_all(); - low_mappings = 1; - err = do_boot_cpu(apicid, cpu); - zap_low_mappings(false); - low_mappings = 0; -#else - err = do_boot_cpu(apicid, cpu); -#endif if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; @@ -1066,9 +1086,7 @@ set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); --- linux-ec2-2.6.32.orig/arch/x86/kernel/smp-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/smp-xen.c @@ -0,0 +1,207 @@ +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998-99, 2000, 2009 Ingo Molnar + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * i386 and x86_64 integration by Glauber Costa + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically that's so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +void xen_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN_ON(1); + return; + } + xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); +} + +void xen_send_call_func_single_ipi(int cpu) +{ + xen_send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR); +} + +void xen_send_call_func_ipi(const struct cpumask *mask) +{ + xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR); +} + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ + +irqreturn_t smp_reboot_interrupt(int irq, void *dev_id) +{ + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); + + return IRQ_HANDLED; +} + +void xen_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + * On most systems we could also use an NMI here, + * but there are a few systems around where NMI + * is problematic so stay with an non NMI for now + * (this implies we cannot stop CPUs spinning with irq off + * currently) + */ + if (num_online_cpus() > 1) { + xen_send_IPI_allbutself(REBOOT_VECTOR); + + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_all_local_evtchn(); + local_irq_restore(flags); +} + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) +{ + inc_irq_stat(irq_resched_count); + return IRQ_HANDLED; +} + +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id) +{ + irq_enter(); + generic_smp_call_function_interrupt(); + inc_irq_stat(irq_call_count); + irq_exit(); + + return IRQ_HANDLED; +} + +irqreturn_t smp_call_function_single_interrupt(int irq, void *dev_id) +{ + irq_enter(); + generic_smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + irq_exit(); + + return IRQ_HANDLED; +} --- linux-ec2-2.6.32.orig/arch/x86/kernel/kvm.c +++ linux-ec2-2.6.32/arch/x86/kernel/kvm.c @@ -198,7 +198,14 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. + */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; --- linux-ec2-2.6.32.orig/arch/x86/kernel/setup-xen.c +++ linux-ec2-2.6.32/arch/x86/kernel/setup-xen.c @@ -0,0 +1,1384 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons , July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle , February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel , March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include