direct-io.hg

changeset 11350:684fdcfb251a

merge with xen-unstable.hg
author awilliam@xenbuild.aw
date Mon Aug 28 16:26:37 2006 -0600 (2006-08-28)
parents 896fcdd49c7f f790546ecfda
children e317ad162eba
files xen/arch/x86/shadow2-common.c xen/arch/x86/shadow2.c xen/include/asm-x86/page-guest32.h xen/include/asm-x86/shadow2-multi.h xen/include/asm-x86/shadow2-private.h xen/include/asm-x86/shadow2-types.h xen/include/asm-x86/shadow2.h
line diff
     1.1 --- a/docs/man/xend-config.sxp.pod.5	Mon Aug 28 16:16:07 2006 -0600
     1.2 +++ b/docs/man/xend-config.sxp.pod.5	Mon Aug 28 16:26:37 2006 -0600
     1.3 @@ -23,7 +23,7 @@ The following lists the daemon configura
     1.4  =item I<logfile>
     1.5  
     1.6  The location of the file to record runtime log messages.  Defaults to
     1.7 -I</var/log/xend.log>.
     1.8 +I</var/log/xen/xend.log>.
     1.9  
    1.10  =item I<loglevel>
    1.11  
     2.1 --- a/docs/misc/xend.tex	Mon Aug 28 16:16:07 2006 -0600
     2.2 +++ b/docs/misc/xend.tex	Mon Aug 28 16:26:37 2006 -0600
     2.3 @@ -214,7 +214,7 @@ a sequence of s-expressions. The configu
     2.4  Configuration scripts ({\it e.g.} for network-script) are looked for in {\tt /etc/xen}
     2.5  unless their name begins with '/'.
     2.6  
     2.7 -Xend sends its log output to {\tt /var/log/xend.log}. This is a rotating logfile,
     2.8 +Xend sends its log output to {\tt /var/log/xen/xend.log}. This is a rotating logfile,
     2.9  and logs are moved onto {\tt xend.log.1} {\it etc.} as they get large. Old logs may
    2.10  be deleted.
    2.11  
    2.12 @@ -411,7 +411,7 @@ allows access to some debugging function
    2.13  \end{itemize}
    2.14  
    2.15  When tracing is on xend logs all functions calls and exceptions to
    2.16 -{\tt /var/log/xend.trace}.
    2.17 +{\tt /var/log/xen/xend.trace}.
    2.18  
    2.19  \begin{thebibliography}{99}
    2.20  
     3.1 --- a/docs/src/user.tex	Mon Aug 28 16:16:07 2006 -0600
     3.2 +++ b/docs/src/user.tex	Mon Aug 28 16:26:37 2006 -0600
     3.3 @@ -973,8 +973,8 @@ using the \texttt{xm} tool.
     3.4  
     3.5  \subsection{Logging}
     3.6  
     3.7 -As \xend\ runs, events will be logged to \path{/var/log/xend.log} and
     3.8 -(less frequently) to \path{/var/log/xend-debug.log}. These, along with
     3.9 +As \xend\ runs, events will be logged to \path{/var/log/xen/xend.log} and
    3.10 +(less frequently) to \path{/var/log/xen/xend-debug.log}. These, along with
    3.11  the standard syslog files, are useful when troubleshooting problems.
    3.12  
    3.13  \subsection{Configuring \Xend\ }
     4.1 --- a/tools/Makefile	Mon Aug 28 16:16:07 2006 -0600
     4.2 +++ b/tools/Makefile	Mon Aug 28 16:26:37 2006 -0600
     4.3 @@ -39,6 +39,7 @@ install: check
     4.4  	done
     4.5  	$(MAKE) ioemuinstall
     4.6  	$(INSTALL_DIR) -p $(DESTDIR)/var/xen/dump
     4.7 +	$(INSTALL_DIR) -p $(DESTDIR)/var/log/xen
     4.8  
     4.9  .PHONY: clean
    4.10  clean: check_clean
     5.1 --- a/tools/console/daemon/io.c	Mon Aug 28 16:16:07 2006 -0600
     5.2 +++ b/tools/console/daemon/io.c	Mon Aug 28 16:26:37 2006 -0600
     5.3 @@ -584,16 +584,14 @@ void handle_io(void)
     5.4  			    FD_ISSET(xc_evtchn_fd(d->xce_handle), &readfds))
     5.5  				handle_ring_read(d);
     5.6  
     5.7 -			if (d->tty_fd != -1) {
     5.8 -				if (FD_ISSET(d->tty_fd, &readfds))
     5.9 -					handle_tty_read(d);
    5.10 +			if (d->tty_fd != -1 && FD_ISSET(d->tty_fd, &readfds))
    5.11 +				handle_tty_read(d);
    5.12  
    5.13 -				if (FD_ISSET(d->tty_fd, &writefds))
    5.14 -					handle_tty_write(d);
    5.15 +			if (d->tty_fd != -1 && FD_ISSET(d->tty_fd, &writefds))
    5.16 +				handle_tty_write(d);
    5.17  
    5.18 -				if (d->is_dead)
    5.19 -					cleanup_domain(d);
    5.20 -			}
    5.21 +			if (d->is_dead)
    5.22 +				cleanup_domain(d);
    5.23  		}
    5.24  	} while (ret > -1);
    5.25  }
     6.1 --- a/tools/examples/vif-route	Mon Aug 28 16:16:07 2006 -0600
     6.2 +++ b/tools/examples/vif-route	Mon Aug 28 16:26:37 2006 -0600
     6.3 @@ -30,10 +30,12 @@ case "$command" in
     6.4          ifconfig ${vif} ${main_ip} netmask 255.255.255.255 up
     6.5          echo 1 >/proc/sys/net/ipv4/conf/${vif}/proxy_arp
     6.6          ipcmd='add'
     6.7 +        cmdprefix=''
     6.8          ;;
     6.9      offline)
    6.10 -        ifdown ${vif}
    6.11 +        do_without_error ifdown ${vif}
    6.12          ipcmd='del'
    6.13 +        cmdprefix='do_without_error'
    6.14          ;;
    6.15  esac
    6.16  
    6.17 @@ -41,7 +43,7 @@ if [ "${ip}" ] ; then
    6.18      # If we've been given a list of IP addresses, then add routes from dom0 to
    6.19      # the guest using those addresses.
    6.20      for addr in ${ip} ; do
    6.21 -      ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip}
    6.22 +      ${cmdprefix} ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip}
    6.23      done 
    6.24  fi
    6.25  
     7.1 --- a/tools/examples/xen-hotplug-common.sh	Mon Aug 28 16:16:07 2006 -0600
     7.2 +++ b/tools/examples/xen-hotplug-common.sh	Mon Aug 28 16:26:37 2006 -0600
     7.3 @@ -21,7 +21,7 @@ dir=$(dirname "$0")
     7.4  . "$dir/xen-script-common.sh"
     7.5  . "$dir/locking.sh"
     7.6  
     7.7 -exec 2>>/var/log/xen-hotplug.log
     7.8 +exec 2>>/var/log/xen/xen-hotplug.log
     7.9  
    7.10  export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
    7.11  export LANG="POSIX"
     8.1 --- a/tools/examples/xen-network-common.sh	Mon Aug 28 16:16:07 2006 -0600
     8.2 +++ b/tools/examples/xen-network-common.sh	Mon Aug 28 16:26:37 2006 -0600
     8.3 @@ -44,34 +44,18 @@ then
     8.4    }
     8.5  elif ! which ifup >/dev/null 2>/dev/null
     8.6  then
     8.7 -  if [ -e /etc/conf.d/net ]
     8.8 -  then
     8.9 -    preiftransfer()
    8.10 -    {
    8.11 -      true
    8.12 -    }
    8.13 -    ifup()
    8.14 -    {
    8.15 -      /etc/init.d/net.$1 start
    8.16 -    }
    8.17 -    ifdown()
    8.18 -    {
    8.19 -      /etc/init.d/net.$1 stop
    8.20 -    }
    8.21 -  else
    8.22 -    preiftransfer()
    8.23 -    {
    8.24 -      true
    8.25 -    }
    8.26 -    ifup()
    8.27 -    {
    8.28 -      false
    8.29 -    }
    8.30 -    ifdown()
    8.31 -    {
    8.32 -      false
    8.33 -    }
    8.34 -  fi
    8.35 +  preiftransfer()
    8.36 +  {
    8.37 +    true
    8.38 +  }
    8.39 +  ifup()
    8.40 +  {
    8.41 +    false
    8.42 +  }
    8.43 +  ifdown()
    8.44 +  {
    8.45 +    false
    8.46 +  }
    8.47  else
    8.48    preiftransfer()
    8.49    {
     9.1 --- a/tools/examples/xend-config.sxp	Mon Aug 28 16:16:07 2006 -0600
     9.2 +++ b/tools/examples/xend-config.sxp	Mon Aug 28 16:26:37 2006 -0600
     9.3 @@ -11,7 +11,7 @@
     9.4  # Commented out entries show the default for that entry, unless otherwise
     9.5  # specified.
     9.6  
     9.7 -#(logfile /var/log/xend.log)
     9.8 +#(logfile /var/log/xen/xend.log)
     9.9  #(loglevel DEBUG)
    9.10  
    9.11  #(xend-http-server no)
    10.1 --- a/tools/firmware/hvmloader/smbios.c	Mon Aug 28 16:16:07 2006 -0600
    10.2 +++ b/tools/firmware/hvmloader/smbios.c	Mon Aug 28 16:26:37 2006 -0600
    10.3 @@ -116,8 +116,10 @@ smbios_table_size(uint32_t vcpus, const 
    10.4  
    10.5  	/* type 0: "Xen", xen_version, and release_date */
    10.6  	size += strlen("Xen") + strlen(xen_version) + 2;
    10.7 -	/* type 1: "Xen", xen_version, "HVM domU" */
    10.8 -	size += strlen("Xen") + strlen("HVM domU") + strlen(xen_version) + 3;
    10.9 +	/* type 1: "Xen", xen_version, "HVM domU", UUID as string for 
   10.10 +                   serial number */
   10.11 +	size += strlen("Xen") + strlen("HVM domU") + strlen(xen_version) +
   10.12 +			36 + 4;
   10.13  	/* type 3: "Xen" */
   10.14  	size += strlen("Xen") + 1;
   10.15  	/* type 4: socket designation ("CPU n"), processor_manufacturer */
   10.16 @@ -371,6 +373,7 @@ static void *
   10.17  smbios_type_1_init(void *start, const char *xen_version, 
   10.18  		   uint8_t uuid[16])
   10.19  {
   10.20 +	char uuid_str[37];
   10.21  	struct smbios_type_1 *p = (struct smbios_type_1 *)start;
   10.22  	p->header.type = 1;
   10.23  	p->header.length = sizeof(struct smbios_type_1);
   10.24 @@ -379,7 +382,7 @@ smbios_type_1_init(void *start, const ch
   10.25  	p->manufacturer_str = 1;
   10.26  	p->product_name_str = 2;
   10.27  	p->version_str = 3;
   10.28 -	p->serial_number_str = 0;
   10.29 +	p->serial_number_str = 4;
   10.30      
   10.31  	memcpy(p->uuid, uuid, 16);
   10.32  
   10.33 @@ -395,6 +398,9 @@ smbios_type_1_init(void *start, const ch
   10.34  	start += strlen("HVM domU") + 1;
   10.35  	strcpy((char *)start, xen_version);
   10.36  	start += strlen(xen_version) + 1;
   10.37 +	uuid_to_string(uuid_str, uuid);	
   10.38 +	strcpy((char *)start, uuid_str);
   10.39 +	start += strlen(uuid_str) + 1;
   10.40  	*((uint8_t *)start) = 0;
   10.41      
   10.42  	return start+1; 
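
The "36 + 4" term added to smbios_table_size() above covers the new serial-number string: the type 1 string set now carries four NUL-terminated strings ("Xen", xen_version, "HVM domU", and the 36-character UUID text). A minimal stand-alone sketch of that arithmetic, using a hypothetical version string, for illustration only:

    #include <stdio.h>
    #include <string.h>

    /* Illustrative only: mirrors the type 1 term in smbios_table_size(), where
     * the fourth string is the UUID rendered as 36 hex/hyphen characters and
     * the "+ 4" counts one NUL terminator per string. */
    static size_t type1_string_set_size(const char *xen_version)
    {
        return strlen("Xen") + strlen(xen_version) + strlen("HVM domU") + 36 + 4;
    }

    int main(void)
    {
        /* e.g. 3 + 5 + 8 + 36 + 4 = 56 for a "3.0.5"-style version string */
        printf("%zu\n", type1_string_set_size("3.0.5"));
        return 0;
    }
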
    11.1 --- a/tools/firmware/hvmloader/util.c	Mon Aug 28 16:16:07 2006 -0600
    11.2 +++ b/tools/firmware/hvmloader/util.c	Mon Aug 28 16:26:37 2006 -0600
    11.3 @@ -174,3 +174,57 @@ cpuid(uint32_t idx, uint32_t *eax, uint3
    11.4  		: "0" (idx) );
    11.5  }
    11.6  
    11.7 +/* Write a two-character hex representation of 'byte' to digits[].
    11.8 +   Pre-condition: sizeof(digits) >= 2 */
    11.9 +void
   11.10 +byte_to_hex(char *digits, uint8_t byte)
   11.11 +{
   11.12 +	uint8_t nybbel = byte >> 4;
   11.13 +
   11.14 +	if (nybbel > 9)
   11.15 +		digits[0] = 'a' + nybbel-10;
   11.16 +	else
   11.17 +		digits[0] = '0' + nybbel;
   11.18 +
   11.19 +	nybbel = byte & 0x0f;
   11.20 +	if (nybbel > 9)
   11.21 +		digits[1] = 'a' + nybbel-10;
   11.22 +	else
   11.23 +		digits[1] = '0' + nybbel;
   11.24 +}
   11.25 +
   11.26 +/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID
   11.27 +   string.
   11.28 +
   11.29 +   Pre-condition: sizeof(dest) >= 37 */
   11.30 +void
   11.31 +uuid_to_string(char *dest, uint8_t *uuid)
   11.32 +{
   11.33 +	int i = 0;
   11.34 +	char *p = dest;
   11.35 +
   11.36 +	for (i = 0; i < 4; ++i) {
   11.37 +		byte_to_hex(p, uuid[i]);
   11.38 +		p += 2;
   11.39 +	}
   11.40 +	*p++ = '-';
   11.41 +	for (i = 4; i < 6; ++i) {
   11.42 +		byte_to_hex(p, uuid[i]);
   11.43 +		p += 2;
   11.44 +	}
   11.45 +	*p++ = '-';
   11.46 +	for (i = 6; i < 8; ++i) {
   11.47 +		byte_to_hex(p, uuid[i]);
   11.48 +		p += 2;
   11.49 +	}
   11.50 +	*p++ = '-';
   11.51 +	for (i = 8; i < 10; ++i) {
   11.52 +		byte_to_hex(p, uuid[i]);
   11.53 +		p += 2;
   11.54 +	}
   11.55 +	*p++ = '-';
   11.56 +	for (i = 10; i < 16; ++i) {
   11.57 +		byte_to_hex(p, uuid[i]);
   11.58 +		p += 2;
   11.59 +	}
   11.60 +}
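
uuid_to_string() above emits the DCE/OSF text form: the 16 bytes grouped as 8-4-4-4-12 lowercase hex digits separated by hyphens, 36 characters plus the terminating NUL (hence the 37-byte pre-condition). The patch open-codes the conversion via byte_to_hex(), presumably because hvmloader does not link a full C library; outside that environment the same layout can be produced with snprintf(). A stand-alone sketch, illustration only:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-alone illustration of the layout uuid_to_string() produces:
     * 8-4-4-4-12 lowercase hex digits, 36 characters plus a trailing NUL. */
    static void format_uuid(char dest[37], const uint8_t uuid[16])
    {
        snprintf(dest, 37,
                 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
                 uuid[0],  uuid[1],  uuid[2],  uuid[3],  uuid[4],  uuid[5],
                 uuid[6],  uuid[7],  uuid[8],  uuid[9],  uuid[10], uuid[11],
                 uuid[12], uuid[13], uuid[14], uuid[15]);
    }

    int main(void)
    {
        const uint8_t uuid[16] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
                                   0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };
        char buf[37];

        format_uuid(buf, uuid);
        printf("%s\n", buf);   /* prints 00112233-4455-6677-8899-aabbccddeeff */
        return 0;
    }
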
    12.1 --- a/tools/firmware/hvmloader/util.h	Mon Aug 28 16:16:07 2006 -0600
    12.2 +++ b/tools/firmware/hvmloader/util.h	Mon Aug 28 16:26:37 2006 -0600
    12.3 @@ -25,6 +25,16 @@ void *memcpy(void *dest, const void *src
    12.4  void *memset(void *s, int c, unsigned n);
    12.5  char *itoa(char *a, unsigned int i);
    12.6  
    12.7 +/* convert a byte to two lowercase hex digits, with no terminating NUL 
    12.8 +   character.  digits[] must have at least two elements. */
    12.9 +void byte_to_hex(char *digits, uint8_t byte);
   12.10 +
   12.11 +/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID
   12.12 +   string.
   12.13 +
   12.14 +   Pre-condition: sizeof(dest) >= 37 */
   12.15 +void uuid_to_string(char *dest, uint8_t *uuid);
   12.16 +
   12.17  /* Debug output */
   12.18  void puts(const char *s);
   12.19  
    13.1 --- a/tools/ioemu/Makefile	Mon Aug 28 16:16:07 2006 -0600
    13.2 +++ b/tools/ioemu/Makefile	Mon Aug 28 16:26:37 2006 -0600
    13.3 @@ -94,7 +94,7 @@ test speed test2: all
    13.4  	$(MAKE) -C tests $@
    13.5  
    13.6  TAGS: 
    13.7 -	etags *.[ch] tests/*.[ch]
    13.8 +	etags *.[ch] target-i386-dm/*.[ch] hw/*.[ch]
    13.9  
   13.10  cscope:
   13.11  	rm -f ./cscope.*
    14.1 --- a/tools/ioemu/patches/qemu-logging	Mon Aug 28 16:16:07 2006 -0600
    14.2 +++ b/tools/ioemu/patches/qemu-logging	Mon Aug 28 16:26:37 2006 -0600
    14.3 @@ -43,7 +43,7 @@ Index: ioemu/vl.c
    14.4       /* default mac address of the first network interface */
    14.5       
    14.6  +    /* init debug */
    14.7 -+    sprintf(qemu_dm_logfilename, "/var/log/qemu-dm.%d.log", getpid());
    14.8 ++    sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm.%d.log", getpid());
    14.9  +    cpu_set_log_filename(qemu_dm_logfilename);
   14.10  +    cpu_set_log(0);
   14.11  +    
    15.1 --- a/tools/ioemu/patches/xen-build	Mon Aug 28 16:16:07 2006 -0600
    15.2 +++ b/tools/ioemu/patches/xen-build	Mon Aug 28 16:26:37 2006 -0600
    15.3 @@ -1,7 +1,7 @@
    15.4  Index: ioemu/Makefile
    15.5  ===================================================================
    15.6 ---- ioemu.orig/Makefile	2006-08-06 02:03:44.915543858 +0100
    15.7 -+++ ioemu/Makefile	2006-08-06 02:11:33.461331417 +0100
    15.8 +--- ioemu.orig/Makefile	2006-08-28 20:19:23.000000000 +0100
    15.9 ++++ ioemu/Makefile	2006-08-28 20:20:08.000000000 +0100
   15.10  @@ -1,11 +1,14 @@
   15.11   # Makefile for QEMU.
   15.12   
   15.13 @@ -60,6 +60,15 @@ Index: ioemu/Makefile
   15.14   ifndef CONFIG_WIN32
   15.15   	mkdir -p "$(DESTDIR)$(datadir)/keymaps"
   15.16   	for x in $(KEYMAPS); do \
   15.17 +@@ -89,7 +94,7 @@
   15.18 + 	$(MAKE) -C tests $@
   15.19 + 
   15.20 + TAGS: 
   15.21 +-	etags *.[ch] tests/*.[ch]
   15.22 ++	etags *.[ch] target-i386-dm/*.[ch] hw/*.[ch]
   15.23 + 
   15.24 + cscope:
   15.25 + 	rm -f ./cscope.*
   15.26  @@ -107,11 +112,11 @@
   15.27   	texi2dvi $<
   15.28   
   15.29 @@ -76,8 +85,8 @@ Index: ioemu/Makefile
   15.30   info: qemu-doc.info qemu-tech.info
   15.31  Index: ioemu/Makefile.target
   15.32  ===================================================================
   15.33 ---- ioemu.orig/Makefile.target	2006-08-06 02:03:44.922543079 +0100
   15.34 -+++ ioemu/Makefile.target	2006-08-06 02:09:22.320951557 +0100
   15.35 +--- ioemu.orig/Makefile.target	2006-08-28 20:19:23.000000000 +0100
   15.36 ++++ ioemu/Makefile.target	2006-08-28 20:19:47.000000000 +0100
   15.37  @@ -1,5 +1,8 @@
   15.38   include config.mak
   15.39   
   15.40 @@ -149,8 +158,8 @@ Index: ioemu/Makefile.target
   15.41   include .depend
   15.42  Index: ioemu/configure
   15.43  ===================================================================
   15.44 ---- ioemu.orig/configure	2006-08-06 02:03:45.783447220 +0100
   15.45 -+++ ioemu/configure	2006-08-06 02:09:41.076860544 +0100
   15.46 +--- ioemu.orig/configure	2006-08-28 20:19:23.000000000 +0100
   15.47 ++++ ioemu/configure	2006-08-28 20:19:47.000000000 +0100
   15.48  @@ -18,8 +18,8 @@
   15.49   
   15.50   # default parameters
    16.1 --- a/tools/ioemu/vl.c	Mon Aug 28 16:16:07 2006 -0600
    16.2 +++ b/tools/ioemu/vl.c	Mon Aug 28 16:26:37 2006 -0600
    16.3 @@ -5924,7 +5924,7 @@ int main(int argc, char **argv)
    16.4      /* default mac address of the first network interface */
    16.5      
    16.6      /* init debug */
    16.7 -    sprintf(qemu_dm_logfilename, "/var/log/qemu-dm.%d.log", getpid());
    16.8 +    sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm.%d.log", getpid());
    16.9      cpu_set_log_filename(qemu_dm_logfilename);
   16.10      cpu_set_log(0);
   16.11      
    17.1 --- a/tools/libxc/xc_hvm_build.c	Mon Aug 28 16:16:07 2006 -0600
    17.2 +++ b/tools/libxc/xc_hvm_build.c	Mon Aug 28 16:26:37 2006 -0600
    17.3 @@ -441,7 +441,7 @@ static int xc_hvm_build_internal(int xc_
    17.4          goto error_out;
    17.5      }
    17.6  
    17.7 -    /* HVM domains must be put into shadow2 mode at the start of day */
    17.8 +    /* HVM domains must be put into shadow mode at the start of day */
    17.9      if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_ENABLE,
   17.10                             NULL, 0, NULL, 
   17.11                             XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT  |
    18.1 --- a/tools/misc/xend	Mon Aug 28 16:16:07 2006 -0600
    18.2 +++ b/tools/misc/xend	Mon Aug 28 16:26:37 2006 -0600
    18.3 @@ -86,7 +86,7 @@ def start_xenstored():
    18.4      XENSTORED_TRACE = os.getenv("XENSTORED_TRACE")
    18.5      cmd = "xenstored --pid-file /var/run/xenstore.pid"
    18.6      if XENSTORED_TRACE:
    18.7 -        cmd += " -T /var/log/xenstored-trace.log"
    18.8 +        cmd += " -T /var/log/xen/xenstored-trace.log"
    18.9      s,o = commands.getstatusoutput(cmd)
   18.10  
   18.11  def start_consoled():
    19.1 --- a/tools/python/xen/util/bugtool.py	Mon Aug 28 16:16:07 2006 -0600
    19.2 +++ b/tools/python/xen/util/bugtool.py	Mon Aug 28 16:26:37 2006 -0600
    19.3 @@ -43,8 +43,8 @@ TITLE_RE = re.compile(r'<title>(.*)</tit
    19.4  
    19.5  FILES_TO_SEND = [ '/var/log/' + x for x in 
    19.6                    [ 'syslog', 'messages', 'debug',
    19.7 -                    'xend.log', 'xend-debug.log', 'xenstored-trace.log',
    19.8 -                    'xen-hotplug.log' ] ]
    19.9 +                    'xen/xend.log', 'xen/xend-debug.log', 'xen/xenstored-trace.log',
   19.10 +                    'xen/xen-hotplug.log' ] ]
   19.11  #FILES_TO_SEND = [  ]
   19.12  
   19.13  
    20.1 --- a/tools/python/xen/xend/XendRoot.py	Mon Aug 28 16:16:07 2006 -0600
    20.2 +++ b/tools/python/xen/xend/XendRoot.py	Mon Aug 28 16:26:37 2006 -0600
    20.3 @@ -52,7 +52,7 @@ class XendRoot:
    20.4      block_script_dir = "/etc/xen/scripts"
    20.5  
    20.6      """Default path to the log file. """
    20.7 -    logfile_default = "/var/log/xend.log"
    20.8 +    logfile_default = "/var/log/xen/xend.log"
    20.9  
   20.10      """Default level of information to be logged."""
   20.11      loglevel_default = 'DEBUG'
    21.1 --- a/tools/python/xen/xend/server/params.py	Mon Aug 28 16:16:07 2006 -0600
    21.2 +++ b/tools/python/xen/xend/server/params.py	Mon Aug 28 16:26:37 2006 -0600
    21.3 @@ -39,8 +39,8 @@ def getenv(var, val, conv=None):
    21.4  
    21.5  # The following parameters could be placed in a configuration file.
    21.6  XEND_PID_FILE      = '/var/run/xend.pid'
    21.7 -XEND_TRACE_FILE    = '/var/log/xend.trace'
    21.8 -XEND_DEBUG_LOG     = '/var/log/xend-debug.log'
    21.9 +XEND_TRACE_FILE    = '/var/log/xen/xend.trace'
   21.10 +XEND_DEBUG_LOG     = '/var/log/xen/xend-debug.log'
   21.11  XEND_USER          = 'root'
   21.12  XEND_DEBUG         = getenv("XEND_DEBUG",     0, conv=int)
   21.13  XEND_DAEMONIZE     = getenv("XEND_DAEMONIZE", not XEND_DEBUG, conv=int)
    22.1 --- a/tools/security/python/xensec_gen/main.py	Mon Aug 28 16:16:07 2006 -0600
    22.2 +++ b/tools/security/python/xensec_gen/main.py	Mon Aug 28 16:26:37 2006 -0600
    22.3 @@ -34,7 +34,7 @@ import CGIHTTPServer
    22.4  
    22.5  gHttpPort = 7777
    22.6  gHttpDir  = '/var/lib/xensec_gen'
    22.7 -gLogFile  = '/var/log/xensec_gen.log'
    22.8 +gLogFile  = '/var/log/xen/xensec_gen.log'
    22.9  gUser     = 'nobody'
   22.10  gGroup    = 'nobody'
   22.11  
    23.1 --- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c	Mon Aug 28 16:16:07 2006 -0600
    23.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c	Mon Aug 28 16:26:37 2006 -0600
    23.3 @@ -4,7 +4,7 @@
    23.4   * A simplified event channel for para-drivers in unmodified linux
    23.5   *
    23.6   * Copyright (c) 2002-2005, K A Fraser
    23.7 - * Copyright (c) 2005, <xiaofeng.ling@intel.com>
    23.8 + * Copyright (c) 2005, Intel Corporation <xiaofeng.ling@intel.com>
    23.9   *
   23.10   * This file may be distributed separately from the Linux kernel, or
   23.11   * incorporated into other software packages, subject to the following license:
    24.1 --- a/xen/arch/x86/Makefile	Mon Aug 28 16:16:07 2006 -0600
    24.2 +++ b/xen/arch/x86/Makefile	Mon Aug 28 16:26:37 2006 -0600
    24.3 @@ -2,6 +2,7 @@ subdir-y += acpi
    24.4  subdir-y += cpu
    24.5  subdir-y += genapic
    24.6  subdir-y += hvm
    24.7 +subdir-y += mm
    24.8  subdir-y += oprofile
    24.9  
   24.10  subdir-$(x86_32) += x86_32
   24.11 @@ -41,23 +42,6 @@ obj-y += traps.o
   24.12  obj-y += usercopy.o
   24.13  obj-y += x86_emulate.o
   24.14  
   24.15 -ifneq ($(pae),n)
   24.16 -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
   24.17 -else
   24.18 -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
   24.19 -endif
   24.20 -
   24.21 -obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
   24.22 -                 shadow2_g2_on_s3.o
   24.23 -
   24.24 -guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
   24.25 -shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
   24.26 -shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
   24.27 -                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
   24.28 -
   24.29 -shadow2_%.o: shadow2.c $(HDRS) Makefile
   24.30 -	$(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
   24.31 -
   24.32  obj-$(crash_debug) += gdbstub.o
   24.33  
   24.34  $(TARGET): $(TARGET)-syms boot/mkelf32
   24.35 @@ -86,9 +70,6 @@ xen.lds: $(TARGET_SUBARCH)/xen.lds.S $(H
   24.36  boot/mkelf32: boot/mkelf32.c
   24.37  	$(HOSTCC) $(HOSTCFLAGS) -o $@ $<
   24.38  
   24.39 -shadow_guest32.o: shadow.c
   24.40 -shadow_guest32pae.o: shadow.c
   24.41 -
   24.42  .PHONY: clean
   24.43  clean::
   24.44  	rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32
    25.1 --- a/xen/arch/x86/domain.c	Mon Aug 28 16:16:07 2006 -0600
    25.2 +++ b/xen/arch/x86/domain.c	Mon Aug 28 16:26:37 2006 -0600
    25.3 @@ -200,12 +200,12 @@ int arch_domain_create(struct domain *d)
    25.4  
    25.5  #endif /* __x86_64__ */
    25.6  
    25.7 -    shadow2_lock_init(d);
    25.8 -    for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
    25.9 -        INIT_LIST_HEAD(&d->arch.shadow2.freelists[i]);
   25.10 -    INIT_LIST_HEAD(&d->arch.shadow2.p2m_freelist);
   25.11 -    INIT_LIST_HEAD(&d->arch.shadow2.p2m_inuse);
   25.12 -    INIT_LIST_HEAD(&d->arch.shadow2.toplevel_shadows);
   25.13 +    shadow_lock_init(d);
   25.14 +    for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
   25.15 +        INIT_LIST_HEAD(&d->arch.shadow.freelists[i]);
   25.16 +    INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist);
   25.17 +    INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse);
   25.18 +    INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows);
   25.19  
   25.20      if ( !is_idle_domain(d) )
   25.21      {
   25.22 @@ -236,7 +236,7 @@ int arch_domain_create(struct domain *d)
   25.23  
   25.24  void arch_domain_destroy(struct domain *d)
   25.25  {
   25.26 -    shadow2_final_teardown(d);
   25.27 +    shadow_final_teardown(d);
   25.28  
   25.29      free_xenheap_pages(
   25.30          d->arch.mm_perdomain_pt,
   25.31 @@ -342,10 +342,10 @@ int arch_set_info_guest(
   25.32          }
   25.33      }    
   25.34  
   25.35 -    /* Shadow2: make sure the domain has enough shadow memory to
   25.36 +    /* Shadow: make sure the domain has enough shadow memory to
   25.37       * boot another vcpu */
   25.38 -    if ( shadow2_mode_enabled(d) 
   25.39 -         && d->arch.shadow2.total_pages < shadow2_min_acceptable_pages(d) )
   25.40 +    if ( shadow_mode_enabled(d) 
   25.41 +         && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) )
   25.42      {
   25.43          destroy_gdt(v);
   25.44          return -ENOMEM;
   25.45 @@ -357,8 +357,8 @@ int arch_set_info_guest(
   25.46      /* Don't redo final setup */
   25.47      set_bit(_VCPUF_initialised, &v->vcpu_flags);
   25.48  
   25.49 -    if ( shadow2_mode_enabled(d) )
   25.50 -        shadow2_update_paging_modes(v);
   25.51 +    if ( shadow_mode_enabled(d) )
   25.52 +        shadow_update_paging_modes(v);
   25.53  
   25.54      update_cr3(v);
   25.55  
   25.56 @@ -936,11 +936,11 @@ void domain_relinquish_resources(struct 
   25.57      for_each_vcpu ( d, v )
   25.58      {
   25.59          /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
   25.60 -         * or sh2_update_paging_modes()) */
   25.61 +         * or sh_update_paging_modes()) */
   25.62          pfn = pagetable_get_pfn(v->arch.guest_table);
   25.63          if ( pfn != 0 )
   25.64          {
   25.65 -            if ( shadow2_mode_refcounts(d) )
   25.66 +            if ( shadow_mode_refcounts(d) )
   25.67                  put_page(mfn_to_page(pfn));
   25.68              else
   25.69                  put_page_and_type(mfn_to_page(pfn));
   25.70 @@ -962,7 +962,7 @@ void domain_relinquish_resources(struct 
   25.71          hvm_relinquish_guest_resources(d);
   25.72  
   25.73      /* Tear down shadow mode stuff. */
   25.74 -    shadow2_teardown(d);
   25.75 +    shadow_teardown(d);
   25.76  
   25.77      /*
   25.78       * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
   25.79 @@ -981,18 +981,18 @@ void domain_relinquish_resources(struct 
   25.80  
   25.81  void arch_dump_domain_info(struct domain *d)
   25.82  {
   25.83 -    if ( shadow2_mode_enabled(d) )
   25.84 +    if ( shadow_mode_enabled(d) )
   25.85      {
   25.86 -        printk("    shadow2 mode: ");
   25.87 -        if ( d->arch.shadow2.mode & SHM2_enable )
   25.88 +        printk("    shadow mode: ");
   25.89 +        if ( d->arch.shadow.mode & SHM2_enable )
   25.90              printk("enabled ");
   25.91 -        if ( shadow2_mode_refcounts(d) )
   25.92 +        if ( shadow_mode_refcounts(d) )
   25.93              printk("refcounts ");
   25.94 -        if ( shadow2_mode_log_dirty(d) )
   25.95 +        if ( shadow_mode_log_dirty(d) )
   25.96              printk("log_dirty ");
   25.97 -        if ( shadow2_mode_translate(d) )
   25.98 +        if ( shadow_mode_translate(d) )
   25.99              printk("translate ");
  25.100 -        if ( shadow2_mode_external(d) )
  25.101 +        if ( shadow_mode_external(d) )
  25.102              printk("external ");
  25.103          printk("\n");
  25.104      }
    26.1 --- a/xen/arch/x86/domain_build.c	Mon Aug 28 16:16:07 2006 -0600
    26.2 +++ b/xen/arch/x86/domain_build.c	Mon Aug 28 16:26:37 2006 -0600
    26.3 @@ -679,8 +679,8 @@ int construct_dom0(struct domain *d,
    26.4          (void)alloc_vcpu(d, i, i);
    26.5  
    26.6      /* Set up CR3 value for write_ptbase */
    26.7 -    if ( shadow2_mode_enabled(v->domain) )
    26.8 -        shadow2_update_paging_modes(v);
    26.9 +    if ( shadow_mode_enabled(v->domain) )
   26.10 +        shadow_update_paging_modes(v);
   26.11      else
   26.12          update_cr3(v);
   26.13  
   26.14 @@ -791,8 +791,8 @@ int construct_dom0(struct domain *d,
   26.15      new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
   26.16  
   26.17      if ( opt_dom0_shadow )
   26.18 -        if ( shadow2_test_enable(d) == 0 ) 
   26.19 -            shadow2_update_paging_modes(v);
   26.20 +        if ( shadow_test_enable(d) == 0 ) 
   26.21 +            shadow_update_paging_modes(v);
   26.22  
   26.23      if ( supervisor_mode_kernel )
   26.24      {
    27.1 --- a/xen/arch/x86/domctl.c	Mon Aug 28 16:16:07 2006 -0600
    27.2 +++ b/xen/arch/x86/domctl.c	Mon Aug 28 16:26:37 2006 -0600
    27.3 @@ -39,7 +39,7 @@ long arch_do_domctl(
    27.4          d = find_domain_by_id(domctl->domain);
    27.5          if ( d != NULL )
    27.6          {
    27.7 -            ret = shadow2_domctl(d, &domctl->u.shadow_op, u_domctl);
    27.8 +            ret = shadow_domctl(d, &domctl->u.shadow_op, u_domctl);
    27.9              put_domain(d);
   27.10              copy_to_guest(u_domctl, domctl, 1);
   27.11          } 
    28.1 --- a/xen/arch/x86/hvm/hvm.c	Mon Aug 28 16:16:07 2006 -0600
    28.2 +++ b/xen/arch/x86/hvm/hvm.c	Mon Aug 28 16:26:37 2006 -0600
    28.3 @@ -384,8 +384,8 @@ int hvm_copy(void *buf, unsigned long va
    28.4          if (count > size)
    28.5              count = size;
    28.6  
    28.7 -        gfn = shadow2_gva_to_gfn(v, vaddr);
    28.8 -        mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
    28.9 +        gfn = shadow_gva_to_gfn(v, vaddr);
   28.10 +        mfn = mfn_x(sh_vcpu_gfn_to_mfn(v, gfn));
   28.11  
   28.12          if (mfn == INVALID_MFN)
   28.13              return 0;
   28.14 @@ -539,7 +539,7 @@ void hvm_do_hypercall(struct cpu_user_re
   28.15          return;
   28.16      }
   28.17  
   28.18 -    if ( current->arch.shadow2.mode->guest_levels == 4 )
   28.19 +    if ( current->arch.shadow.mode->guest_levels == 4 )
   28.20      {
   28.21          pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
   28.22                                                         pregs->rsi,
    29.1 --- a/xen/arch/x86/hvm/platform.c	Mon Aug 28 16:16:07 2006 -0600
    29.2 +++ b/xen/arch/x86/hvm/platform.c	Mon Aug 28 16:26:37 2006 -0600
    29.3 @@ -721,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
    29.4  
    29.5      if (pvalid) {
    29.6          if (hvm_paging_enabled(current))
    29.7 -            p->u.data = shadow2_gva_to_gpa(current, value);
    29.8 +            p->u.data = shadow_gva_to_gpa(current, value);
    29.9          else
   29.10              p->u.pdata = (void *) value; /* guest VA == guest PA */
   29.11      } else
   29.12 @@ -771,7 +771,7 @@ void send_mmio_req(
   29.13  
   29.14      if (pvalid) {
   29.15          if (hvm_paging_enabled(v))
   29.16 -            p->u.data = shadow2_gva_to_gpa(v, value);
   29.17 +            p->u.data = shadow_gva_to_gpa(v, value);
   29.18          else
   29.19              p->u.pdata = (void *) value; /* guest VA == guest PA */
   29.20      } else
    30.1 --- a/xen/arch/x86/hvm/svm/svm.c	Mon Aug 28 16:16:07 2006 -0600
    30.2 +++ b/xen/arch/x86/hvm/svm/svm.c	Mon Aug 28 16:26:37 2006 -0600
    30.3 @@ -29,7 +29,7 @@
    30.4  #include <xen/domain_page.h>
    30.5  #include <asm/current.h>
    30.6  #include <asm/io.h>
    30.7 -#include <asm/shadow2.h>
    30.8 +#include <asm/shadow.h>
    30.9  #include <asm/regs.h>
   30.10  #include <asm/cpufeature.h>
   30.11  #include <asm/processor.h>
   30.12 @@ -403,6 +403,50 @@ static inline int long_mode_do_msr_write
   30.13      return 1;
   30.14  }
   30.15  
   30.16 +
   30.17 +#define loaddebug(_v,_reg) \
   30.18 +    __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg]))
   30.19 +#define savedebug(_v,_reg) \
   30.20 +    __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg]))
   30.21 +
   30.22 +
   30.23 +static inline void svm_save_dr(struct vcpu *v)
   30.24 +{
   30.25 +    if (v->arch.hvm_vcpu.flag_dr_dirty)
   30.26 +    {
   30.27 +        /* clear the DR dirty flag and re-enable intercepts for DR accesses */ 
   30.28 +        v->arch.hvm_vcpu.flag_dr_dirty = 0;
   30.29 +        v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
   30.30 +
   30.31 +        savedebug(&v->arch.guest_context, 0);    
   30.32 +        savedebug(&v->arch.guest_context, 1);    
   30.33 +        savedebug(&v->arch.guest_context, 2);    
   30.34 +        savedebug(&v->arch.guest_context, 3);    
   30.35 +    }
   30.36 +}
   30.37 +
   30.38 +
   30.39 +static inline void __restore_debug_registers(struct vcpu *v)
   30.40 +{
   30.41 +    loaddebug(&v->arch.guest_context, 0);
   30.42 +    loaddebug(&v->arch.guest_context, 1);
   30.43 +    loaddebug(&v->arch.guest_context, 2);
   30.44 +    loaddebug(&v->arch.guest_context, 3);
   30.45 +}
   30.46 +
   30.47 +
   30.48 +static inline void svm_restore_dr(struct vcpu *v)
   30.49 +{
   30.50 +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
   30.51 +
   30.52 +    if (!vmcb)
   30.53 +        return;
   30.54 +
   30.55 +    if (unlikely(vmcb->dr7 & 0xFF))
   30.56 +        __restore_debug_registers(v);
   30.57 +}
   30.58 +
   30.59 +
   30.60  static int svm_realmode(struct vcpu *v)
   30.61  {
   30.62      unsigned long cr0 = v->arch.hvm_svm.cpu_shadow_cr0;
   30.63 @@ -717,6 +761,7 @@ static void svm_freeze_time(struct vcpu 
   30.64  static void svm_ctxt_switch_from(struct vcpu *v)
   30.65  {
   30.66      svm_freeze_time(v);
   30.67 +    svm_save_dr(v);
   30.68  }
   30.69  
   30.70  static void svm_ctxt_switch_to(struct vcpu *v)
   30.71 @@ -732,6 +777,7 @@ static void svm_ctxt_switch_to(struct vc
   30.72      set_segment_register(es, 0);
   30.73      set_segment_register(ss, 0);
   30.74  #endif
   30.75 +    svm_restore_dr(v);
   30.76  }
   30.77  
   30.78  
   30.79 @@ -746,10 +792,10 @@ static void svm_final_setup_guest(struct
   30.80      if ( v != d->vcpu[0] )
   30.81          return;
   30.82  
   30.83 -    if ( !shadow2_mode_external(d) )
   30.84 +    if ( !shadow_mode_external(d) )
   30.85      {
   30.86          DPRINTK("Can't init HVM for dom %u vcpu %u: "
   30.87 -                "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
   30.88 +                "not in shadow external mode\n", d->domain_id, v->vcpu_id);
   30.89          domain_crash(d);
   30.90      }
   30.91  
   30.92 @@ -914,7 +960,7 @@ static int svm_do_page_fault(unsigned lo
   30.93                  va, eip, (unsigned long)regs->error_code);
   30.94  //#endif
   30.95  
   30.96 -    result = shadow2_fault(va, regs); 
   30.97 +    result = shadow_fault(va, regs); 
   30.98  
   30.99      if( result ) {
  30.100          /* Let's make sure that the Guest TLB is flushed */
  30.101 @@ -1183,55 +1229,16 @@ static inline void set_reg(unsigned int 
  30.102  }
  30.103                             
  30.104  
  30.105 -static void svm_dr_access (struct vcpu *v, unsigned int reg, unsigned int type,
  30.106 -                           struct cpu_user_regs *regs)
  30.107 +static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
  30.108  {
  30.109 -    unsigned long *reg_p = 0;
  30.110 -    unsigned int gpreg = 0;
  30.111 -    unsigned long eip;
  30.112 -    int inst_len;
  30.113 -    int index;
  30.114 -    struct vmcb_struct *vmcb;
  30.115 -    u8 buffer[MAX_INST_LEN];
  30.116 -    u8 prefix = 0;
  30.117 -
  30.118 -    vmcb = v->arch.hvm_svm.vmcb;
  30.119 -    
  30.120 -    ASSERT(vmcb);
  30.121 -
  30.122 -    eip = vmcb->rip;
  30.123 -    inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
  30.124 -    index = skip_prefix_bytes(buffer, sizeof(buffer));
  30.125 -    
  30.126 -    ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21);
  30.127 -
  30.128 -    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
  30.129 -        prefix = buffer[index-1];
  30.130 -
  30.131 -    gpreg = decode_src_reg(prefix, buffer[index + 2]);
  30.132 -    ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2]));
  30.133 -
  30.134 -    HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x",
  30.135 -                eip, reg, gpreg);
  30.136 -
  30.137 -    reg_p = get_reg_p(gpreg, regs, vmcb);
  30.138 -        
  30.139 -    switch (type) 
  30.140 -    {
  30.141 -    case TYPE_MOV_TO_DR: 
  30.142 -        inst_len = __get_instruction_length(vmcb, INSTR_MOV2DR, buffer);
  30.143 -        v->arch.guest_context.debugreg[reg] = *reg_p;
  30.144 -        break;
  30.145 -    case TYPE_MOV_FROM_DR:
  30.146 -        inst_len = __get_instruction_length(vmcb, INSTR_MOVDR2, buffer);
  30.147 -        *reg_p = v->arch.guest_context.debugreg[reg];
  30.148 -        break;
  30.149 -    default:
  30.150 -        __hvm_bug(regs);
  30.151 -        break;
  30.152 -    }
  30.153 -    ASSERT(inst_len > 0);
  30.154 -    __update_guest_eip(vmcb, inst_len);
  30.155 +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
  30.156 +
  30.157 +    v->arch.hvm_vcpu.flag_dr_dirty = 1;
  30.158 +
  30.159 +    __restore_debug_registers(v);
  30.160 +
  30.161 +    /* allow the guest full access to the debug registers */
  30.162 +    vmcb->dr_intercepts = 0;
  30.163  }
  30.164  
  30.165  
  30.166 @@ -1562,7 +1569,7 @@ static int svm_set_cr0(unsigned long val
  30.167          v->arch.guest_table = pagetable_from_pfn(mfn);
  30.168          if ( old_base_mfn )
  30.169              put_page(mfn_to_page(old_base_mfn));
  30.170 -        shadow2_update_paging_modes(v);
  30.171 +        shadow_update_paging_modes(v);
  30.172  
  30.173          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
  30.174                      (unsigned long) (mfn << PAGE_SHIFT));
  30.175 @@ -1588,14 +1595,14 @@ static int svm_set_cr0(unsigned long val
  30.176              svm_inject_exception(v, TRAP_gp_fault, 1, 0);
  30.177              return 0;
  30.178          }
  30.179 -        shadow2_update_paging_modes(v);
  30.180 +        shadow_update_paging_modes(v);
  30.181          vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
  30.182          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  30.183      }
  30.184      else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
  30.185      {
  30.186          /* we should take care of this kind of situation */
  30.187 -        shadow2_update_paging_modes(v);
  30.188 +        shadow_update_paging_modes(v);
  30.189          vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
  30.190          set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  30.191      }
  30.192 @@ -1706,7 +1713,7 @@ static int mov_to_cr(int gpreg, int cr, 
  30.193              mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
  30.194              if (mfn != pagetable_get_pfn(v->arch.guest_table))
  30.195                  __hvm_bug(regs);
  30.196 -            shadow2_update_cr3(v);
  30.197 +            shadow_update_cr3(v);
  30.198          }
  30.199          else 
  30.200          {
  30.201 @@ -1771,7 +1778,7 @@ static int mov_to_cr(int gpreg, int cr, 
  30.202                  v->arch.guest_table = pagetable_from_pfn(mfn);
  30.203                  if ( old_base_mfn )
  30.204                      put_page(mfn_to_page(old_base_mfn));
  30.205 -                shadow2_update_paging_modes(v);
  30.206 +                shadow_update_paging_modes(v);
  30.207  
  30.208                  HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
  30.209                              (unsigned long) (mfn << PAGE_SHIFT));
  30.210 @@ -1808,7 +1815,7 @@ static int mov_to_cr(int gpreg, int cr, 
  30.211          if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
  30.212          {
  30.213              set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  30.214 -            shadow2_update_paging_modes(v);
  30.215 +            shadow_update_paging_modes(v);
  30.216          }
  30.217          break;
  30.218      }
  30.219 @@ -2149,7 +2156,7 @@ void svm_handle_invlpg(const short invlp
  30.220  
  30.221      /* Overkill, we may not this */
  30.222      set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  30.223 -    shadow2_invlpg(v, g_vaddr);
  30.224 +    shadow_invlpg(v, g_vaddr);
  30.225  }
  30.226  
  30.227  
  30.228 @@ -2520,7 +2527,7 @@ void walk_shadow_and_guest_pt(unsigned l
  30.229      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
  30.230      unsigned long gpa;
  30.231  
  30.232 -    gpa = shadow2_gva_to_gpa(current, gva);
  30.233 +    gpa = shadow_gva_to_gpa(current, gva);
  30.234      printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
  30.235      if( !svm_paging_enabled(v) || mmio_space(gpa) )
  30.236          return;
  30.237 @@ -2591,7 +2598,7 @@ asmlinkage void svm_vmexit_handler(struc
  30.238          if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) 
  30.239          {
  30.240              if (svm_paging_enabled(v) && 
  30.241 -                !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
  30.242 +                !mmio_space(shadow_gva_to_gpa(current, vmcb->exitinfo2)))
  30.243              {
  30.244                  printk("I%08ld,ExC=%s(%d),IP=%x:%llx,"
  30.245                         "I1=%llx,I2=%llx,INT=%llx, "
  30.246 @@ -2601,7 +2608,7 @@ asmlinkage void svm_vmexit_handler(struc
  30.247                         (unsigned long long) vmcb->exitinfo1,
  30.248                         (unsigned long long) vmcb->exitinfo2,
  30.249                         (unsigned long long) vmcb->exitintinfo.bytes,
  30.250 -                       (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
  30.251 +                       (unsigned long long) shadow_gva_to_gpa(current, vmcb->exitinfo2));
  30.252              }
  30.253              else 
  30.254              {
  30.255 @@ -2862,53 +2869,9 @@ asmlinkage void svm_vmexit_handler(struc
  30.256      case VMEXIT_CR8_WRITE:
  30.257          svm_cr_access(v, 8, TYPE_MOV_TO_CR, &regs);
  30.258          break;
  30.259 -
  30.260 -    case VMEXIT_DR0_READ:
  30.261 -        svm_dr_access(v, 0, TYPE_MOV_FROM_DR, &regs);
  30.262 -        break;
  30.263 -
  30.264 -    case VMEXIT_DR1_READ:
  30.265 -        svm_dr_access(v, 1, TYPE_MOV_FROM_DR, &regs);
  30.266 -        break;
  30.267 -
  30.268 -    case VMEXIT_DR2_READ:
  30.269 -        svm_dr_access(v, 2, TYPE_MOV_FROM_DR, &regs);
  30.270 -        break;
  30.271 -
  30.272 -    case VMEXIT_DR3_READ:
  30.273 -        svm_dr_access(v, 3, TYPE_MOV_FROM_DR, &regs);
  30.274 -        break;
  30.275 -
  30.276 -    case VMEXIT_DR6_READ:
  30.277 -        svm_dr_access(v, 6, TYPE_MOV_FROM_DR, &regs);
  30.278 -        break;
  30.279 -
  30.280 -    case VMEXIT_DR7_READ:
  30.281 -        svm_dr_access(v, 7, TYPE_MOV_FROM_DR, &regs);
  30.282 -        break;
  30.283 -
  30.284 -    case VMEXIT_DR0_WRITE:
  30.285 -        svm_dr_access(v, 0, TYPE_MOV_TO_DR, &regs);
  30.286 -        break;
  30.287 -
  30.288 -    case VMEXIT_DR1_WRITE:
  30.289 -        svm_dr_access(v, 1, TYPE_MOV_TO_DR, &regs);
  30.290 -        break;
  30.291 -
  30.292 -    case VMEXIT_DR2_WRITE:
  30.293 -        svm_dr_access(v, 2, TYPE_MOV_TO_DR, &regs);
  30.294 -        break;
  30.295 -
  30.296 -    case VMEXIT_DR3_WRITE:
  30.297 -        svm_dr_access(v, 3, TYPE_MOV_TO_DR, &regs);
  30.298 -        break;
  30.299 -
  30.300 -    case VMEXIT_DR6_WRITE:
  30.301 -        svm_dr_access(v, 6, TYPE_MOV_TO_DR, &regs);
  30.302 -        break;
  30.303 -
  30.304 -    case VMEXIT_DR7_WRITE:
  30.305 -        svm_dr_access(v, 7, TYPE_MOV_TO_DR, &regs);
  30.306 +	
  30.307 +    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
  30.308 +        svm_dr_access(v, &regs);
  30.309          break;
  30.310  
  30.311      case VMEXIT_IOIO:
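
The debug-register rework above replaces the per-DR VMEXIT decoding with a lazy scheme: DR accesses stay intercepted until the guest first touches one, at which point svm_dr_access() restores the guest's values, clears the intercepts, and marks the state dirty; svm_save_dr() then captures the registers and re-arms DR_INTERCEPT_ALL_WRITES only when the vCPU is switched out. A stand-alone model of that protocol, with hypothetical types and plain memory standing in for %db0-%db3, illustration only:

    #include <stdio.h>

    struct vcpu_model {
        int  dr_dirty;          /* models flag_dr_dirty                       */
        int  dr_intercepts;     /* nonzero: DR writes trap to the hypervisor  */
        long guest_dr[4];       /* models v->arch.guest_context.debugreg[0..3]*/
        long hw_dr[4];          /* stands in for the physical %db0-%db3       */
    };

    static void dr_intercept(struct vcpu_model *v)      /* VMEXIT_DRn_WRITE path */
    {
        v->dr_dirty = 1;
        for (int i = 0; i < 4; i++)                      /* __restore_debug_registers() */
            v->hw_dr[i] = v->guest_dr[i];
        v->dr_intercepts = 0;                            /* guest now has full access */
    }

    static void ctxt_switch_from(struct vcpu_model *v)   /* svm_save_dr() */
    {
        if (!v->dr_dirty)
            return;                                      /* guest never touched its DRs */
        v->dr_dirty = 0;
        v->dr_intercepts = 1;                            /* re-arm DR_INTERCEPT_ALL_WRITES */
        for (int i = 0; i < 4; i++)                      /* savedebug() x 4 */
            v->guest_dr[i] = v->hw_dr[i];
    }

    int main(void)
    {
        struct vcpu_model v = { .dr_intercepts = 1, .guest_dr = { 0x1, 0, 0, 0 } };

        dr_intercept(&v);      /* first DR write: restore and stop intercepting   */
        v.hw_dr[0] = 0x2;      /* later guest DR0 update, no VMEXIT taken         */
        ctxt_switch_from(&v);  /* value captured only when the vCPU is descheduled */
        printf("saved dr0 = %#lx, intercepts = %d\n", v.guest_dr[0], v.dr_intercepts);
        return 0;
    }
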
    31.1 --- a/xen/arch/x86/hvm/svm/vmcb.c	Mon Aug 28 16:16:07 2006 -0600
    31.2 +++ b/xen/arch/x86/hvm/svm/vmcb.c	Mon Aug 28 16:26:37 2006 -0600
    31.3 @@ -121,7 +121,7 @@ static int construct_vmcb_controls(struc
    31.4          GENERAL2_INTERCEPT_SKINIT | GENERAL2_INTERCEPT_RDTSCP;
    31.5  
    31.6      /* read or write all debug registers 0 - 15 */
    31.7 -    vmcb->dr_intercepts = 0;
    31.8 +    vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES;
    31.9  
   31.10      /* RD/WR all control registers 0 - 15, but not read CR2 */
   31.11      vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE);
    32.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c	Mon Aug 28 16:16:07 2006 -0600
    32.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c	Mon Aug 28 16:26:37 2006 -0600
    32.3 @@ -35,7 +35,7 @@
    32.4  #include <xen/event.h>
    32.5  #include <xen/kernel.h>
    32.6  #include <xen/keyhandler.h>
    32.7 -#include <asm/shadow2.h>
    32.8 +#include <asm/shadow.h>
    32.9  
   32.10  static int vmcs_size;
   32.11  static int vmcs_order;
   32.12 @@ -272,7 +272,7 @@ static void vmx_do_launch(struct vcpu *v
   32.13      error |= __vmwrite(GUEST_TR_BASE, 0);
   32.14      error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
   32.15  
   32.16 -    shadow2_update_paging_modes(v);
   32.17 +    shadow_update_paging_modes(v);
   32.18      printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
   32.19             __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
   32.20      __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
    33.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Mon Aug 28 16:16:07 2006 -0600
    33.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Mon Aug 28 16:26:37 2006 -0600
    33.3 @@ -40,7 +40,7 @@
    33.4  #include <asm/hvm/vmx/vmx.h>
    33.5  #include <asm/hvm/vmx/vmcs.h>
    33.6  #include <asm/hvm/vmx/cpu.h>
    33.7 -#include <asm/shadow2.h>
    33.8 +#include <asm/shadow.h>
    33.9  #include <public/sched.h>
   33.10  #include <public/hvm/ioreq.h>
   33.11  #include <asm/hvm/vpic.h>
   33.12 @@ -66,10 +66,10 @@ static int vmx_initialize_guest_resource
   33.13      if ( v->vcpu_id != 0 )
   33.14          return 1;
   33.15  
   33.16 -    if ( !shadow2_mode_external(d) )
   33.17 +    if ( !shadow_mode_external(d) )
   33.18      {
   33.19          DPRINTK("Can't init HVM for dom %u vcpu %u: "
   33.20 -                "not in shadow2 external mode\n", 
   33.21 +                "not in shadow external mode\n", 
   33.22                  d->domain_id, v->vcpu_id);
   33.23          domain_crash(d);
   33.24      }
   33.25 @@ -865,7 +865,7 @@ static int vmx_do_page_fault(unsigned lo
   33.26      }
   33.27  #endif
   33.28  
   33.29 -    result = shadow2_fault(va, regs);
   33.30 +    result = shadow_fault(va, regs);
   33.31  
   33.32      TRACE_VMEXIT (2,result);
   33.33  #if 0
   33.34 @@ -1039,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
   33.35       * We do the safest things first, then try to update the shadow
   33.36       * copying from guest
   33.37       */
   33.38 -    shadow2_invlpg(v, va);
   33.39 +    shadow_invlpg(v, va);
   33.40  }
   33.41  
   33.42  
   33.43 @@ -1301,7 +1301,7 @@ vmx_world_restore(struct vcpu *v, struct
   33.44  
   33.45   skip_cr3:
   33.46  
   33.47 -    shadow2_update_paging_modes(v);
   33.48 +    shadow_update_paging_modes(v);
   33.49      if (!vmx_paging_enabled(v))
   33.50          HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
   33.51      else
   33.52 @@ -1504,7 +1504,7 @@ static int vmx_set_cr0(unsigned long val
   33.53          v->arch.guest_table = pagetable_from_pfn(mfn);
   33.54          if (old_base_mfn)
   33.55              put_page(mfn_to_page(old_base_mfn));
   33.56 -        shadow2_update_paging_modes(v);
   33.57 +        shadow_update_paging_modes(v);
   33.58  
   33.59          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
   33.60                      (unsigned long) (mfn << PAGE_SHIFT));
   33.61 @@ -1577,7 +1577,7 @@ static int vmx_set_cr0(unsigned long val
   33.62      else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
   33.63      {
   33.64          __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
   33.65 -        shadow2_update_paging_modes(v);
   33.66 +        shadow_update_paging_modes(v);
   33.67      }
   33.68  
   33.69      return 1;
   33.70 @@ -1662,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
   33.71              mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
   33.72              if (mfn != pagetable_get_pfn(v->arch.guest_table))
   33.73                  __hvm_bug(regs);
   33.74 -            shadow2_update_cr3(v);
   33.75 +            shadow_update_cr3(v);
   33.76          } else {
   33.77              /*
   33.78               * If different, make a shadow. Check if the PDBR is valid
   33.79 @@ -1755,7 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
   33.80           * all TLB entries except global entries.
   33.81           */
   33.82          if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
   33.83 -            shadow2_update_paging_modes(v);
   33.84 +            shadow_update_paging_modes(v);
   33.85          break;
   33.86      }
   33.87      default:
    34.1 --- a/xen/arch/x86/mm.c	Mon Aug 28 16:16:07 2006 -0600
    34.2 +++ b/xen/arch/x86/mm.c	Mon Aug 28 16:26:37 2006 -0600
    34.3 @@ -454,12 +454,12 @@ int map_ldt_shadow_page(unsigned int off
    34.4  
    34.5      res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
    34.6  
    34.7 -    if ( !res && unlikely(shadow2_mode_refcounts(d)) )
    34.8 +    if ( !res && unlikely(shadow_mode_refcounts(d)) )
    34.9      {
   34.10 -        shadow2_lock(d);
   34.11 -        shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
   34.12 +        shadow_lock(d);
   34.13 +        shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
   34.14          res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
   34.15 -        shadow2_unlock(d);
   34.16 +        shadow_unlock(d);
   34.17      }
   34.18  
   34.19      if ( unlikely(!res) )
   34.20 @@ -527,7 +527,7 @@ get_linear_pagetable(
   34.21      struct page_info *page;
   34.22      unsigned long pfn;
   34.23  
   34.24 -    ASSERT( !shadow2_mode_refcounts(d) );
   34.25 +    ASSERT( !shadow_mode_refcounts(d) );
   34.26  
   34.27      if ( (root_get_flags(re) & _PAGE_RW) )
   34.28      {
   34.29 @@ -602,12 +602,12 @@ get_page_from_l1e(
   34.30          d = dom_io;
   34.31      }
   34.32  
   34.33 -    /* Foreign mappings into guests in shadow2 external mode don't
   34.34 +    /* Foreign mappings into guests in shadow external mode don't
   34.35       * contribute to writeable mapping refcounts.  (This allows the
   34.36       * qemu-dm helper process in dom0 to map the domain's memory without
   34.37       * messing up the count of "real" writable mappings.) */
   34.38      okay = (((l1e_get_flags(l1e) & _PAGE_RW) && 
   34.39 -             !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
   34.40 +             !(unlikely(shadow_mode_external(d) && (d != current->domain))))
   34.41              ? get_page_and_type(page, d, PGT_writable_page)
   34.42              : get_page(page, d));
   34.43      if ( !okay )
   34.44 @@ -771,9 +771,9 @@ void put_page_from_l1e(l1_pgentry_t l1e,
   34.45      }
   34.46  
   34.47      /* Remember we didn't take a type-count of foreign writable mappings
   34.48 -     * to shadow2 external domains */
   34.49 +     * to shadow external domains */
   34.50      if ( (l1e_get_flags(l1e) & _PAGE_RW) && 
   34.51 -         !(unlikely((e != d) && shadow2_mode_external(e))) )
   34.52 +         !(unlikely((e != d) && shadow_mode_external(e))) )
   34.53      {
   34.54          put_page_and_type(page);
   34.55      }
   34.56 @@ -830,7 +830,7 @@ static int alloc_l1_table(struct page_in
   34.57      l1_pgentry_t  *pl1e;
   34.58      int            i;
   34.59  
   34.60 -    ASSERT(!shadow2_mode_refcounts(d));
   34.61 +    ASSERT(!shadow_mode_refcounts(d));
   34.62  
   34.63      pl1e = map_domain_page(pfn);
   34.64  
   34.65 @@ -883,7 +883,7 @@ static int create_pae_xen_mappings(l3_pg
   34.66       *     a. alloc_l3_table() calls this function and this check will fail
   34.67       *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
   34.68       *
   34.69 -     * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
   34.70 +     * XXX -- this needs revisiting for shadow_mode_refcount()==true...
   34.71       */
   34.72      page = l3e_get_page(l3e3);
   34.73      BUG_ON(page->u.inuse.type_info & PGT_pinned);
   34.74 @@ -1007,7 +1007,7 @@ static int alloc_l2_table(struct page_in
   34.75      l2_pgentry_t  *pl2e;
   34.76      int            i;
   34.77  
   34.78 -    ASSERT(!shadow2_mode_refcounts(d));
   34.79 +    ASSERT(!shadow_mode_refcounts(d));
   34.80      
   34.81      pl2e = map_domain_page(pfn);
   34.82  
   34.83 @@ -1059,7 +1059,7 @@ static int alloc_l3_table(struct page_in
   34.84      l3_pgentry_t  *pl3e;
   34.85      int            i;
   34.86  
   34.87 -    ASSERT(!shadow2_mode_refcounts(d));
   34.88 +    ASSERT(!shadow_mode_refcounts(d));
   34.89  
   34.90  #ifdef CONFIG_X86_PAE
   34.91      /*
   34.92 @@ -1120,7 +1120,7 @@ static int alloc_l4_table(struct page_in
   34.93      unsigned long vaddr;
   34.94      int            i;
   34.95  
   34.96 -    ASSERT(!shadow2_mode_refcounts(d));
   34.97 +    ASSERT(!shadow_mode_refcounts(d));
   34.98  
   34.99      for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
  34.100      {
  34.101 @@ -1234,8 +1234,8 @@ static inline int update_l1e(l1_pgentry_
  34.102                               struct vcpu *v)
  34.103  {
  34.104      int rv = 1;
  34.105 -    if ( unlikely(shadow2_mode_enabled(v->domain)) )
  34.106 -        shadow2_lock(v->domain);
  34.107 +    if ( unlikely(shadow_mode_enabled(v->domain)) )
  34.108 +        shadow_lock(v->domain);
  34.109  #ifndef PTE_UPDATE_WITH_CMPXCHG
  34.110      rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
  34.111  #else
  34.112 @@ -1266,10 +1266,10 @@ static inline int update_l1e(l1_pgentry_
  34.113          }
  34.114      }
  34.115  #endif
  34.116 -    if ( unlikely(shadow2_mode_enabled(v->domain)) )
  34.117 +    if ( unlikely(shadow_mode_enabled(v->domain)) )
  34.118      {
  34.119 -        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
  34.120 -        shadow2_unlock(v->domain);    
  34.121 +        shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
  34.122 +        shadow_unlock(v->domain);    
  34.123      }
  34.124      return rv;
  34.125  }
  34.126 @@ -1339,13 +1339,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
  34.127  #endif
  34.128  #define UPDATE_ENTRY(_t,_p,_o,_n,_m)  ({                            \
  34.129      int rv;                                                         \
  34.130 -    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
  34.131 -        shadow2_lock(current->domain);                              \
  34.132 +    if ( unlikely(shadow_mode_enabled(current->domain)) )          \
  34.133 +        shadow_lock(current->domain);                              \
  34.134      rv = _UPDATE_ENTRY(_t, _p, _o, _n);                             \
  34.135 -    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
  34.136 +    if ( unlikely(shadow_mode_enabled(current->domain)) )          \
  34.137      {                                                               \
  34.138 -        shadow2_validate_guest_entry(current, _mfn(_m), (_p));      \
  34.139 -        shadow2_unlock(current->domain);                            \
  34.140 +        shadow_validate_guest_entry(current, _mfn(_m), (_p));      \
  34.141 +        shadow_unlock(current->domain);                            \
  34.142      }                                                               \
  34.143      rv;                                                             \
  34.144  })
  34.145 @@ -1581,21 +1581,21 @@ void free_page_type(struct page_info *pa
  34.146           */
  34.147          this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
  34.148  
  34.149 -        if ( unlikely(shadow2_mode_enabled(owner)
  34.150 -                 && !shadow2_lock_is_acquired(owner)) )
  34.151 +        if ( unlikely(shadow_mode_enabled(owner)
  34.152 +                 && !shadow_lock_is_acquired(owner)) )
  34.153          {
  34.154              /* Raw page tables are rewritten during save/restore. */
  34.155 -            if ( !shadow2_mode_translate(owner) )
  34.156 +            if ( !shadow_mode_translate(owner) )
  34.157                  mark_dirty(owner, page_to_mfn(page));
  34.158  
  34.159 -            if ( shadow2_mode_refcounts(owner) )
  34.160 +            if ( shadow_mode_refcounts(owner) )
  34.161                  return;
  34.162  
  34.163              gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
  34.164              ASSERT(VALID_M2P(gmfn));
  34.165 -            shadow2_lock(owner);
  34.166 -            shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
  34.167 -            shadow2_unlock(owner);
  34.168 +            shadow_lock(owner);
  34.169 +            shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
  34.170 +            shadow_unlock(owner);
  34.171          }
  34.172      }
  34.173  
  34.174 @@ -1760,7 +1760,7 @@ int get_page_type(struct page_info *page
  34.175  #endif
  34.176                      /* Fixme: add code to propagate va_unknown to subtables. */
  34.177                      if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
  34.178 -                         !shadow2_mode_refcounts(page_get_owner(page)) )
  34.179 +                         !shadow_mode_refcounts(page_get_owner(page)) )
  34.180                          return 0;
  34.181                      /* This table is possibly mapped at multiple locations. */
  34.182                      nx &= ~PGT_va_mask;
  34.183 @@ -1810,7 +1810,7 @@ int new_guest_cr3(unsigned long mfn)
  34.184      if ( hvm_guest(v) && !hvm_paging_enabled(v) )
  34.185          domain_crash_synchronous();
  34.186  
  34.187 -    if ( shadow2_mode_refcounts(d) )
  34.188 +    if ( shadow_mode_refcounts(d) )
  34.189      {
  34.190          okay = get_page_from_pagenr(mfn, d);
  34.191          if ( unlikely(!okay) )
  34.192 @@ -1858,7 +1858,7 @@ int new_guest_cr3(unsigned long mfn)
  34.193  
  34.194      if ( likely(old_base_mfn != 0) )
  34.195      {
  34.196 -        if ( shadow2_mode_refcounts(d) )
  34.197 +        if ( shadow_mode_refcounts(d) )
  34.198              put_page(mfn_to_page(old_base_mfn));
  34.199          else
  34.200              put_page_and_type(mfn_to_page(old_base_mfn));
  34.201 @@ -2043,7 +2043,7 @@ int do_mmuext_op(
  34.202              type = PGT_root_page_table;
  34.203  
  34.204          pin_page:
  34.205 -            if ( shadow2_mode_refcounts(FOREIGNDOM) )
  34.206 +            if ( shadow_mode_refcounts(FOREIGNDOM) )
  34.207                  break;
  34.208  
  34.209              okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
  34.210 @@ -2065,7 +2065,7 @@ int do_mmuext_op(
  34.211              break;
  34.212  
  34.213          case MMUEXT_UNPIN_TABLE:
  34.214 -            if ( shadow2_mode_refcounts(d) )
  34.215 +            if ( shadow_mode_refcounts(d) )
  34.216                  break;
  34.217  
  34.218              if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
  34.219 @@ -2078,11 +2078,11 @@ int do_mmuext_op(
  34.220              {
  34.221                  put_page_and_type(page);
  34.222                  put_page(page);
  34.223 -                if ( shadow2_mode_enabled(d) )
  34.224 +                if ( shadow_mode_enabled(d) )
  34.225                  {
  34.226 -                    shadow2_lock(d);
  34.227 -                    shadow2_remove_all_shadows(v, _mfn(mfn));
  34.228 -                    shadow2_unlock(d);
  34.229 +                    shadow_lock(d);
  34.230 +                    shadow_remove_all_shadows(v, _mfn(mfn));
  34.231 +                    shadow_unlock(d);
  34.232                  }
  34.233              }
  34.234              else
  34.235 @@ -2125,8 +2125,8 @@ int do_mmuext_op(
  34.236              break;
  34.237      
  34.238          case MMUEXT_INVLPG_LOCAL:
  34.239 -            if ( !shadow2_mode_enabled(d) 
  34.240 -                 || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
  34.241 +            if ( !shadow_mode_enabled(d) 
  34.242 +                 || shadow_invlpg(v, op.arg1.linear_addr) != 0 )
  34.243                  local_flush_tlb_one(op.arg1.linear_addr);
  34.244              break;
  34.245  
  34.246 @@ -2173,7 +2173,7 @@ int do_mmuext_op(
  34.247              unsigned long ptr  = op.arg1.linear_addr;
  34.248              unsigned long ents = op.arg2.nr_ents;
  34.249  
  34.250 -            if ( shadow2_mode_external(d) )
  34.251 +            if ( shadow_mode_external(d) )
  34.252              {
  34.253                  MEM_LOG("ignoring SET_LDT hypercall from external "
  34.254                          "domain %u", d->domain_id);
  34.255 @@ -2319,7 +2319,7 @@ int do_mmu_update(
  34.256              case PGT_l3_page_table:
  34.257              case PGT_l4_page_table:
  34.258              {
  34.259 -                if ( shadow2_mode_refcounts(d) )
  34.260 +                if ( shadow_mode_refcounts(d) )
  34.261                  {
  34.262                      DPRINTK("mmu update on shadow-refcounted domain!");
  34.263                      break;
  34.264 @@ -2372,16 +2372,16 @@ int do_mmu_update(
  34.265                  if ( unlikely(!get_page_type(page, PGT_writable_page)) )
  34.266                      break;
  34.267  
  34.268 -                if ( unlikely(shadow2_mode_enabled(d)) )
  34.269 -                    shadow2_lock(d);
  34.270 +                if ( unlikely(shadow_mode_enabled(d)) )
  34.271 +                    shadow_lock(d);
  34.272  
  34.273                  *(intpte_t *)va = req.val;
  34.274                  okay = 1;
  34.275  
  34.276 -                if ( unlikely(shadow2_mode_enabled(d)) )
  34.277 +                if ( unlikely(shadow_mode_enabled(d)) )
  34.278                  {
  34.279 -                    shadow2_validate_guest_entry(v, _mfn(mfn), va);
  34.280 -                    shadow2_unlock(d);
  34.281 +                    shadow_validate_guest_entry(v, _mfn(mfn), va);
  34.282 +                    shadow_unlock(d);
  34.283                  }
  34.284  
  34.285                  put_page_type(page);
  34.286 @@ -2405,8 +2405,8 @@ int do_mmu_update(
  34.287                  break;
  34.288              }
  34.289  
  34.290 -            if ( shadow2_mode_translate(FOREIGNDOM) )
  34.291 -                shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
  34.292 +            if ( shadow_mode_translate(FOREIGNDOM) )
  34.293 +                shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
  34.294              else 
  34.295                  set_gpfn_from_mfn(mfn, gpfn);
  34.296              okay = 1;
  34.297 @@ -2492,7 +2492,7 @@ static int create_grant_pte_mapping(
  34.298          goto failed;
  34.299      } 
  34.300  
  34.301 -    if ( !shadow2_mode_refcounts(d) )
  34.302 +    if ( !shadow_mode_refcounts(d) )
  34.303          put_page_from_l1e(ol1e, d);
  34.304  
  34.305      put_page_type(page);
  34.306 @@ -2590,7 +2590,7 @@ static int create_grant_va_mapping(
  34.307                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
  34.308          return GNTST_general_error;
  34.309  
  34.310 -    if ( !shadow2_mode_refcounts(d) )
  34.311 +    if ( !shadow_mode_refcounts(d) )
  34.312          put_page_from_l1e(ol1e, d);
  34.313  
  34.314      return GNTST_okay;
  34.315 @@ -2714,10 +2714,10 @@ int do_update_va_mapping(unsigned long v
  34.316  
  34.317      perfc_incrc(calls_to_update_va);
  34.318  
  34.319 -    if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
  34.320 +    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
  34.321          return -EINVAL;
  34.322  
  34.323 -    if ( unlikely(shadow2_mode_refcounts(d)) )
  34.324 +    if ( unlikely(shadow_mode_refcounts(d)) )
  34.325      {
  34.326          DPRINTK("Grant op on a shadow-refcounted domain\n");
  34.327          return -EINVAL; 
  34.328 @@ -2725,11 +2725,11 @@ int do_update_va_mapping(unsigned long v
  34.329  
  34.330      LOCK_BIGLOCK(d);
  34.331  
  34.332 -    if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
  34.333 +    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
  34.334      {
  34.335          if ( unlikely(this_cpu(percpu_mm_info).foreign &&
  34.336 -                      (shadow2_mode_translate(d) ||
  34.337 -                       shadow2_mode_translate(
  34.338 +                      (shadow_mode_translate(d) ||
  34.339 +                       shadow_mode_translate(
  34.340                             this_cpu(percpu_mm_info).foreign))) )
  34.341          {
  34.342              /*
  34.343 @@ -2770,8 +2770,8 @@ int do_update_va_mapping(unsigned long v
  34.344          switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
  34.345          {
  34.346          case UVMF_LOCAL:
  34.347 -            if ( !shadow2_mode_enabled(d) 
  34.348 -                 || (shadow2_invlpg(current, va) != 0) ) 
  34.349 +            if ( !shadow_mode_enabled(d) 
  34.350 +                 || (shadow_invlpg(current, va) != 0) ) 
  34.351                  local_flush_tlb_one(va);
  34.352              break;
  34.353          case UVMF_ALL:
  34.354 @@ -3006,7 +3006,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
  34.355              break;
  34.356          }
  34.357  
  34.358 -        if ( !shadow2_mode_translate(d) || (mfn == 0) )
  34.359 +        if ( !shadow_mode_translate(d) || (mfn == 0) )
  34.360          {
  34.361              put_domain(d);
  34.362              return -EINVAL;
  34.363 @@ -3196,21 +3196,21 @@ static int ptwr_emulated_update(
  34.364      pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
  34.365      if ( do_cmpxchg )
  34.366      {
  34.367 -        if ( shadow2_mode_enabled(d) )
  34.368 -            shadow2_lock(d);
  34.369 +        if ( shadow_mode_enabled(d) )
  34.370 +            shadow_lock(d);
  34.371          ol1e = l1e_from_intpte(old);
  34.372          if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
  34.373          {
  34.374 -            if ( shadow2_mode_enabled(d) )
  34.375 -                shadow2_unlock(d);
  34.376 +            if ( shadow_mode_enabled(d) )
  34.377 +                shadow_unlock(d);
  34.378              unmap_domain_page(pl1e);
  34.379              put_page_from_l1e(nl1e, d);
  34.380              return X86EMUL_CMPXCHG_FAILED;
  34.381          }
  34.382 -        if ( unlikely(shadow2_mode_enabled(v->domain)) )
  34.383 +        if ( unlikely(shadow_mode_enabled(v->domain)) )
  34.384          {
  34.385 -            shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
  34.386 -            shadow2_unlock(v->domain);    
  34.387 +            shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
  34.388 +            shadow_unlock(v->domain);    
  34.389          }
  34.390      }
  34.391      else
    35.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.2 +++ b/xen/arch/x86/mm/Makefile	Mon Aug 28 16:26:37 2006 -0600
    35.3 @@ -0,0 +1,1 @@
    35.4 +subdir-y += shadow
    36.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    36.2 +++ b/xen/arch/x86/mm/shadow/Makefile	Mon Aug 28 16:26:37 2006 -0600
    36.3 @@ -0,0 +1,15 @@
    36.4 +ifneq ($(pae),n)
    36.5 +obj-$(x86_32) += common.o g2_on_s3.o g3_on_s3.o
    36.6 +else
    36.7 +obj-$(x86_32) += common.o g2_on_s2.o
    36.8 +endif
    36.9 +
   36.10 +obj-$(x86_64) += common.o g4_on_s4.o g3_on_s3.o g2_on_s3.o
   36.11 +
   36.12 +guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(1)))))
   36.13 +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(1)))))
   36.14 +shadow_defns  = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
   36.15 +                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
   36.16 +
   36.17 +g%.o: multi.c $(HDRS) Makefile
   36.18 +	$(CC) $(CFLAGS) $(call shadow_defns,$(@F)) -c $< -o $@
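
The pattern rule above compiles multi.c once per guest/shadow combination, deriving GUEST_PAGING_LEVELS and SHADOW_PAGING_LEVELS from the object file's name (g2_on_s3.o builds with guest levels 2 and shadow levels 3). As an illustration only, the small C program below performs the same name-to-levels mapping outside of make; the parsing code is an example, not part of the build system.

    #include <stdio.h>

    /* Parse names like "g2_on_s3" into guest and shadow paging levels,
     * mirroring what the guest_levels/shadow_levels make functions extract. */
    static int parse_levels(const char *name, int *guest, int *shadow)
    {
        return sscanf(name, "g%d_on_s%d", guest, shadow) == 2;
    }

    int main(void)
    {
        const char *objs[] = { "g2_on_s2", "g2_on_s3", "g3_on_s3", "g4_on_s4" };
        for (unsigned i = 0; i < sizeof objs / sizeof *objs; i++) {
            int g, s;
            if (parse_levels(objs[i], &g, &s))
                printf("%s -> -DGUEST_PAGING_LEVELS=%d -DSHADOW_PAGING_LEVELS=%d\n",
                       objs[i], g, s);
        }
        return 0;
    }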
    37.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    37.2 +++ b/xen/arch/x86/mm/shadow/common.c	Mon Aug 28 16:26:37 2006 -0600
    37.3 @@ -0,0 +1,3407 @@
    37.4 +/******************************************************************************
    37.5 + * arch/x86/mm/shadow/common.c
    37.6 + *
    37.7 + * Shadow code that does not need to be multiply compiled.
    37.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    37.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   37.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   37.11 + * 
   37.12 + * This program is free software; you can redistribute it and/or modify
   37.13 + * it under the terms of the GNU General Public License as published by
   37.14 + * the Free Software Foundation; either version 2 of the License, or
   37.15 + * (at your option) any later version.
   37.16 + *
   37.17 + * This program is distributed in the hope that it will be useful,
   37.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   37.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   37.20 + * GNU General Public License for more details.
   37.21 + *
   37.22 + * You should have received a copy of the GNU General Public License
   37.23 + * along with this program; if not, write to the Free Software
   37.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   37.25 + */
   37.26 +
   37.27 +#define SHADOW 1
   37.28 +
   37.29 +#include <xen/config.h>
   37.30 +#include <xen/types.h>
   37.31 +#include <xen/mm.h>
   37.32 +#include <xen/trace.h>
   37.33 +#include <xen/sched.h>
   37.34 +#include <xen/perfc.h>
   37.35 +#include <xen/irq.h>
   37.36 +#include <xen/domain_page.h>
   37.37 +#include <xen/guest_access.h>
   37.38 +#include <xen/keyhandler.h>
   37.39 +#include <asm/event.h>
   37.40 +#include <asm/page.h>
   37.41 +#include <asm/current.h>
   37.42 +#include <asm/flushtlb.h>
   37.43 +#include <asm/shadow.h>
   37.44 +#include "private.h"
   37.45 +
   37.46 +#if SHADOW_AUDIT
   37.47 +int shadow_audit_enable = 0;
   37.48 +
   37.49 +static void shadow_audit_key(unsigned char key)
   37.50 +{
   37.51 +    shadow_audit_enable = !shadow_audit_enable;
   37.52 +    printk("%s shadow_audit_enable=%d\n",
   37.53 +           __func__, shadow_audit_enable);
   37.54 +}
   37.55 +
   37.56 +static int __init shadow_audit_key_init(void)
   37.57 +{
   37.58 +    register_keyhandler(
   37.59 +        'O', shadow_audit_key,  "toggle shadow audits");
   37.60 +    return 0;
   37.61 +}
   37.62 +__initcall(shadow_audit_key_init);
   37.63 +#endif /* SHADOW_AUDIT */
   37.64 +
   37.65 +static void sh_free_log_dirty_bitmap(struct domain *d);
   37.66 +
   37.67 +int _shadow_mode_refcounts(struct domain *d)
   37.68 +{
   37.69 +    return shadow_mode_refcounts(d);
   37.70 +}
   37.71 +
   37.72 +
   37.73 +/**************************************************************************/
   37.74 +/* x86 emulator support for the shadow code
   37.75 + */
   37.76 +
   37.77 +static int
   37.78 +sh_x86_emulate_read_std(unsigned long addr,
   37.79 +                         unsigned long *val,
   37.80 +                         unsigned int bytes,
   37.81 +                         struct x86_emulate_ctxt *ctxt)
   37.82 +{
   37.83 +    struct vcpu *v = current;
   37.84 +    if ( hvm_guest(v) )
   37.85 +    {
   37.86 +        *val = 0;
   37.87 +        // XXX -- this is WRONG.
   37.88 +        //        It entirely ignores the permissions in the page tables.
   37.89 +        //        In this case, that is only a user vs supervisor access check.
   37.90 +        //
   37.91 +        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
   37.92 +        {
   37.93 +#if 0
   37.94 +            SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
   37.95 +                           v->domain->domain_id, v->vcpu_id, 
   37.96 +                           addr, *val, bytes);
   37.97 +#endif
   37.98 +            return X86EMUL_CONTINUE;
   37.99 +        }
  37.100 +
  37.101 +        /* If we got here, there was nothing mapped here, or a bad GFN 
  37.102 +         * was mapped here.  This should never happen: we're here because
  37.103 +         * of a write fault at the end of the instruction we're emulating. */ 
  37.104 +        SHADOW_PRINTK("read failed to va %#lx\n", addr);
  37.105 +        return X86EMUL_PROPAGATE_FAULT;
  37.106 +    }
  37.107 +    else 
  37.108 +    {
  37.109 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  37.110 +        return X86EMUL_UNHANDLEABLE;
  37.111 +    }
  37.112 +}
  37.113 +
  37.114 +static int
  37.115 +sh_x86_emulate_write_std(unsigned long addr,
  37.116 +                          unsigned long val,
  37.117 +                          unsigned int bytes,
  37.118 +                          struct x86_emulate_ctxt *ctxt)
  37.119 +{
  37.120 +    struct vcpu *v = current;
  37.121 +#if 0
  37.122 +    SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
  37.123 +                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
  37.124 +#endif
  37.125 +    if ( hvm_guest(v) )
  37.126 +    {
  37.127 +        // XXX -- this is WRONG.
  37.128 +        //        It entirely ignores the permissions in the page tables.
  37.129 +        //        In this case, that includes user vs supervisor, and
  37.130 +        //        write access.
  37.131 +        //
  37.132 +        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
  37.133 +            return X86EMUL_CONTINUE;
  37.134 +
  37.135 +        /* If we got here, there was nothing mapped here, or a bad GFN 
  37.136 +         * was mapped here.  This should never happen: we're here because
  37.137 +         * of a write fault at the end of the instruction we're emulating,
  37.138 +         * which should be handled by sh_x86_emulate_write_emulated. */ 
  37.139 +        SHADOW_PRINTK("write failed to va %#lx\n", addr);
  37.140 +        return X86EMUL_PROPAGATE_FAULT;
  37.141 +    }
  37.142 +    else 
  37.143 +    {
  37.144 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  37.145 +        return X86EMUL_UNHANDLEABLE;
  37.146 +    }
  37.147 +}
  37.148 +
  37.149 +static int
  37.150 +sh_x86_emulate_write_emulated(unsigned long addr,
  37.151 +                               unsigned long val,
  37.152 +                               unsigned int bytes,
  37.153 +                               struct x86_emulate_ctxt *ctxt)
  37.154 +{
  37.155 +    struct vcpu *v = current;
  37.156 +#if 0
  37.157 +    SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
  37.158 +                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
  37.159 +#endif
  37.160 +    if ( hvm_guest(v) )
  37.161 +    {
  37.162 +        return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt);
  37.163 +    }
  37.164 +    else 
  37.165 +    {
  37.166 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  37.167 +        return X86EMUL_UNHANDLEABLE;
  37.168 +    }
  37.169 +}
  37.170 +
  37.171 +static int 
  37.172 +sh_x86_emulate_cmpxchg_emulated(unsigned long addr,
  37.173 +                                 unsigned long old,
  37.174 +                                 unsigned long new,
  37.175 +                                 unsigned int bytes,
  37.176 +                                 struct x86_emulate_ctxt *ctxt)
  37.177 +{
  37.178 +    struct vcpu *v = current;
  37.179 +#if 0
  37.180 +    SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
  37.181 +                   v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
  37.182 +#endif
  37.183 +    if ( hvm_guest(v) )
  37.184 +    {
  37.185 +        return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, 
  37.186 +                                                    bytes, ctxt);
  37.187 +    }
  37.188 +    else 
  37.189 +    {
  37.190 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  37.191 +        return X86EMUL_UNHANDLEABLE;
  37.192 +    }
  37.193 +}
  37.194 +
  37.195 +static int 
  37.196 +sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
  37.197 +                                   unsigned long old_lo,
  37.198 +                                   unsigned long old_hi,
  37.199 +                                   unsigned long new_lo,
  37.200 +                                   unsigned long new_hi,
  37.201 +                                   struct x86_emulate_ctxt *ctxt)
  37.202 +{
  37.203 +    struct vcpu *v = current;
  37.204 +#if 0
  37.205 +    SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
  37.206 +                   v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
  37.207 +                   new_hi, new_lo);
  37.208 +#endif
  37.209 +    if ( hvm_guest(v) )
  37.210 +    {
  37.211 +        return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
  37.212 +                                                      new_lo, new_hi, ctxt);
  37.213 +    }
  37.214 +    else 
  37.215 +    {
  37.216 +        SHADOW_PRINTK("this operation is not emulated yet\n");
  37.217 +        return X86EMUL_UNHANDLEABLE;
  37.218 +    }
  37.219 +}
  37.220 +
  37.221 +
  37.222 +struct x86_emulate_ops shadow_emulator_ops = {
  37.223 +    .read_std           = sh_x86_emulate_read_std,
  37.224 +    .write_std          = sh_x86_emulate_write_std,
  37.225 +    .read_emulated      = sh_x86_emulate_read_std,
  37.226 +    .write_emulated     = sh_x86_emulate_write_emulated,
  37.227 +    .cmpxchg_emulated   = sh_x86_emulate_cmpxchg_emulated,
  37.228 +    .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated,
  37.229 +};
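
shadow_emulator_ops collects the callbacks above into an x86_emulate_ops table so the generic instruction emulator can call back into the shadow code without knowing its internals. A stand-alone sketch of the same function-pointer dispatch idea follows; the struct and callback names are invented for illustration, not the real x86_emulate_ops interface.

    #include <stdio.h>

    struct demo_ops {
        int (*read)(unsigned long addr, unsigned long *val);
        int (*write)(unsigned long addr, unsigned long val);
    };

    static unsigned long backing[16];

    static int demo_read(unsigned long addr, unsigned long *val)
    {
        *val = backing[addr % 16];
        return 0;
    }

    static int demo_write(unsigned long addr, unsigned long val)
    {
        backing[addr % 16] = val;
        return 0;
    }

    /* The "emulator" only ever sees this table of callbacks. */
    static const struct demo_ops ops = {
        .read  = demo_read,
        .write = demo_write,
    };

    int main(void)
    {
        unsigned long v;
        ops.write(3, 0xabc);
        ops.read(3, &v);
        printf("read back %#lx\n", v);
        return 0;
    }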
  37.230 +
  37.231 +
  37.232 +/**************************************************************************/
  37.233 +/* Code for "promoting" a guest page to the point where the shadow code is
  37.234 + * willing to let it be treated as a guest page table.  This generally
  37.235 + * involves making sure there are no writable mappings available to the guest
  37.236 + * for this page.
  37.237 + */
  37.238 +void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type)
  37.239 +{
  37.240 +    struct page_info *page = mfn_to_page(gmfn);
  37.241 +    unsigned long type_info;
  37.242 +
  37.243 +    ASSERT(valid_mfn(gmfn));
  37.244 +
  37.245 +    /* We should never try to promote a gmfn that has writeable mappings */
  37.246 +    ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0);
  37.247 +
  37.248 +    // Is the page already shadowed?
  37.249 +    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
  37.250 +    {
  37.251 +        // No prior shadow exists...
  37.252 +
  37.253 +        // Grab a type-ref.  We don't really care if we are racing with another
  37.254 +        // vcpu or not, or even what kind of type we get; we just want the type
  37.255 +        // count to be > 0.
  37.256 +        //
  37.257 +        do {
  37.258 +            type_info =
  37.259 +                page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
  37.260 +        } while ( !get_page_type(page, type_info) );
  37.261 +
  37.262 +        // Now that the type ref is non-zero, we can safely use the
  37.263 +        // shadow_flags.
  37.264 +        //
  37.265 +        page->shadow_flags = 0;
  37.266 +    }
  37.267 +
  37.268 +    ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
  37.269 +    set_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
  37.270 +}
  37.271 +
  37.272 +void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
  37.273 +{
  37.274 +    struct page_info *page = mfn_to_page(gmfn);
  37.275 +
  37.276 +    ASSERT(test_bit(_PGC_page_table, &page->count_info));
  37.277 +    ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags));
  37.278 +
  37.279 +    clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags);
  37.280 +
  37.281 +    if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
  37.282 +    {
  37.283 +        // release the extra type ref
  37.284 +        put_page_type(page);
  37.285 +
  37.286 +        // clear the is-a-page-table bit.
  37.287 +        clear_bit(_PGC_page_table, &page->count_info);
  37.288 +    }
  37.289 +}
  37.290 +
  37.291 +/**************************************************************************/
  37.292 +/* Validate a pagetable change from the guest and update the shadows.
  37.293 + * Returns a bitmask of SHADOW_SET_* flags. */
  37.294 +
  37.295 +static int
  37.296 +__shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
  37.297 +                               void *entry, u32 size)
  37.298 +{
  37.299 +    int result = 0;
  37.300 +    struct page_info *page = mfn_to_page(gmfn);
  37.301 +
  37.302 +    sh_mark_dirty(v->domain, gmfn);
  37.303 +    
  37.304 +    // Determine which types of shadows are affected, and update each.
  37.305 +    //
  37.306 +    // Always validate L1s before L2s to prevent another cpu with a linear
  37.307 +    // mapping of this gmfn from seeing a walk that results from 
  37.308 +    // using the new L2 value and the old L1 value.  (It is OK for such a
  37.309 +    // guest to see a walk that uses the old L2 value with the new L1 value,
  37.310 +    // as hardware could behave this way if one level of the pagewalk occurs
  37.311 +    // before the store, and the next level of the pagewalk occurs after the
  37.312 +    // store.)
  37.313 +    //
  37.314 +    // Ditto for L2s before L3s, etc.
  37.315 +    //
  37.316 +
  37.317 +    if ( !(page->count_info & PGC_page_table) )
  37.318 +        return 0;  /* Not shadowed at all */
  37.319 +
  37.320 +#if CONFIG_PAGING_LEVELS == 2
  37.321 +    if ( page->shadow_flags & SHF_L1_32 ) 
  37.322 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2)
  37.323 +            (v, gmfn, entry, size);
  37.324 +#else 
  37.325 +    if ( page->shadow_flags & SHF_L1_32 ) 
  37.326 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2)
  37.327 +            (v, gmfn, entry, size);
  37.328 +#endif
  37.329 +
  37.330 +#if CONFIG_PAGING_LEVELS == 2
  37.331 +    if ( page->shadow_flags & SHF_L2_32 ) 
  37.332 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2)
  37.333 +            (v, gmfn, entry, size);
  37.334 +#else 
  37.335 +    if ( page->shadow_flags & SHF_L2_32 ) 
  37.336 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2)
  37.337 +            (v, gmfn, entry, size);
  37.338 +#endif
  37.339 +
  37.340 +#if CONFIG_PAGING_LEVELS >= 3 
  37.341 +    if ( page->shadow_flags & SHF_L1_PAE ) 
  37.342 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3)
  37.343 +            (v, gmfn, entry, size);
  37.344 +    if ( page->shadow_flags & SHF_L2_PAE ) 
  37.345 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3)
  37.346 +            (v, gmfn, entry, size);
  37.347 +    if ( page->shadow_flags & SHF_L2H_PAE ) 
  37.348 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3)
  37.349 +            (v, gmfn, entry, size);
  37.350 +    if ( page->shadow_flags & SHF_L3_PAE ) 
  37.351 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3)
  37.352 +            (v, gmfn, entry, size);
  37.353 +#else /* 32-bit non-PAE hypervisor does not support PAE guests */
  37.354 +    ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0);
  37.355 +#endif
  37.356 +
  37.357 +#if CONFIG_PAGING_LEVELS >= 4 
  37.358 +    if ( page->shadow_flags & SHF_L1_64 ) 
  37.359 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4)
  37.360 +            (v, gmfn, entry, size);
  37.361 +    if ( page->shadow_flags & SHF_L2_64 ) 
  37.362 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4)
  37.363 +            (v, gmfn, entry, size);
  37.364 +    if ( page->shadow_flags & SHF_L3_64 ) 
  37.365 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4)
  37.366 +            (v, gmfn, entry, size);
  37.367 +    if ( page->shadow_flags & SHF_L4_64 ) 
  37.368 +        result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4)
  37.369 +            (v, gmfn, entry, size);
  37.370 +#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
  37.371 +    ASSERT((page->shadow_flags 
  37.372 +            & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0);
  37.373 +#endif
  37.374 +
  37.375 +    return result;
  37.376 +}
  37.377 +
  37.378 +
  37.379 +int
  37.380 +shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
  37.381 +/* This is the entry point from hypercalls. It returns a bitmask of all the 
  37.382 + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
  37.383 +{
  37.384 +    int rc;
  37.385 +
  37.386 +    ASSERT(shadow_lock_is_acquired(v->domain));
  37.387 +    rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
  37.388 +    shadow_audit_tables(v);
  37.389 +    return rc;
  37.390 +}
  37.391 +
  37.392 +void
  37.393 +shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
  37.394 +                                void *entry, u32 size)
  37.395 +/* This is the entry point for emulated writes to pagetables in HVM guests */
  37.396 +{
  37.397 +    struct domain *d = v->domain;
  37.398 +    int rc;
  37.399 +
  37.400 +    ASSERT(shadow_lock_is_acquired(v->domain));
  37.401 +    rc = __shadow_validate_guest_entry(v, gmfn, entry, size);
  37.402 +    if ( rc & SHADOW_SET_FLUSH )
  37.403 +    {
  37.404 +        // Flush everyone except the local processor, which will flush when it
  37.405 +        // re-enters the HVM guest.
  37.406 +        //
  37.407 +        cpumask_t mask = d->domain_dirty_cpumask;
  37.408 +        cpu_clear(v->processor, mask);
  37.409 +        flush_tlb_mask(mask);
  37.410 +    }
  37.411 +    if ( rc & SHADOW_SET_ERROR ) 
  37.412 +    {
  37.413 +        /* This page is probably not a pagetable any more: tear it out of the 
  37.414 +         * shadows, along with any tables that reference it */
  37.415 +        shadow_remove_all_shadows_and_parents(v, gmfn);
  37.416 +    }
  37.417 +    /* We ignore the other bits: since we are about to change CR3 on
  37.418 +     * VMENTER we don't need to do any extra TLB flushes. */ 
  37.419 +}
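
shadow_validate_guest_pt_write flushes every CPU that may hold stale translations except the local one, which will flush anyway on its way back into the guest. The same mask arithmetic is shown below with a plain integer bitmap instead of cpumask_t; the CPU numbers are made up for the example.

    #include <stdio.h>

    int main(void)
    {
        unsigned long dirty_cpus = 0x2d;   /* CPUs 0,2,3,5 may hold stale TLB entries */
        int local_cpu = 2;

        /* Clear the local CPU: it flushes on re-entry, everyone else gets an IPI. */
        unsigned long mask = dirty_cpus & ~(1ul << local_cpu);
        printf("flush mask %#lx (local CPU %d deferred to VMENTER)\n",
               mask, local_cpu);
        return 0;
    }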
  37.420 +
  37.421 +
  37.422 +/**************************************************************************/
  37.423 +/* Memory management for shadow pages. */ 
  37.424 +
  37.425 +/* Meaning of the count_info field in shadow pages
  37.426 + * ----------------------------------------------
  37.427 + * 
  37.428 + * A count of all references to this page from other shadow pages and
  37.429 + * guest CR3s (a.k.a. v->arch.shadow.table).  
  37.430 + *
  37.431 + * The top bits hold the shadow type and the pinned bit.  Top-level
  37.432 + * shadows are pinned so that they don't disappear when not in a CR3
  37.433 + * somewhere.
  37.434 + *
  37.435 + * We don't need to use get|put_page for this as the updates are all
  37.436 + * protected by the shadow lock.  We can't use get|put_page for this
  37.437 + * as the size of the count on shadow pages is different from that on
  37.438 + * normal guest pages.
  37.439 + */
  37.440 +
  37.441 +/* Meaning of the type_info field in shadow pages
  37.442 + * ----------------------------------------------
  37.443 + * 
  37.444 + * type_info use depends on the shadow type (from count_info)
  37.445 + * 
  37.446 + * PGC_SH_none : This page is in the shadow free pool.  type_info holds
  37.447 + *                the chunk order for our freelist allocator.
  37.448 + *
  37.449 + * PGC_SH_l*_shadow : This page is in use as a shadow. type_info 
  37.450 + *                     holds the mfn of the guest page being shadowed,
  37.451 + *
  37.452 + * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage.
  37.453 + *                        type_info holds the gfn being shattered.
  37.454 + *
  37.455 + * PGC_SH_monitor_table : This page is part of a monitor table.
  37.456 + *                         type_info is not used.
  37.457 + */
  37.458 +
  37.459 +/* Meaning of the _domain field in shadow pages
  37.460 + * --------------------------------------------
  37.461 + *
  37.462 + * In shadow pages, this field will always have its least significant bit
  37.463 + * set.  This ensures that all attempts to get_page() will fail (as all
  37.464 + * valid pickled domain pointers have a zero for their least significant bit).
  37.465 + * Instead, the remaining upper bits are used to record the shadow generation
  37.466 + * counter when the shadow was created.
  37.467 + */
  37.468 +
  37.469 +/* Meaning of the shadow_flags field
  37.470 + * ----------------------------------
  37.471 + * 
  37.472 + * In guest pages that are shadowed, one bit for each kind of shadow they have.
  37.473 + * 
  37.474 + * In shadow pages, will be used for holding a representation of the populated
  37.475 + * entries in this shadow (either a min/max, or a bitmap, or ...)
  37.476 + *
  37.477 + * In monitor-table pages, holds the level of the particular page (to save
  37.478 + * spilling the shadow types into an extra bit by having three types of monitor
  37.479 + * page).
  37.480 + */
  37.481 +
  37.482 +/* Meaning of the list_head struct in shadow pages
  37.483 + * -----------------------------------------------
  37.484 + *
  37.485 + * In free shadow pages, this is used to hold the free-lists of chunks.
  37.486 + *
  37.487 + * In top-level shadow tables, this holds a linked-list of all top-level
  37.488 + * shadows (used for recovering memory and destroying shadows). 
  37.489 + *
  37.490 + * In lower-level shadows, this holds the physical address of a higher-level
  37.491 + * shadow entry that holds a reference to this shadow (or zero).
  37.492 + */
  37.493 +
  37.494 +/* Allocating shadow pages
  37.495 + * -----------------------
  37.496 + *
  37.497 + * Most shadow pages are allocated singly, but there are two cases where we 
  37.498 + * need to allocate multiple pages together.
  37.499 + * 
  37.500 + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
  37.501 + *    A 32-bit guest l1 table covers 4MB of virtual address space,
  37.502 + *    and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
  37.503 + *    of virtual address space each).  Similarly, a 32-bit guest l2 table 
  37.504 + *    (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va 
  37.505 + *    each).  These multi-page shadows are contiguous and aligned; 
  37.506 + *    functions for handling offsets into them are defined in shadow.c 
  37.507 + *    (shadow_l1_index() etc.)
  37.508 + *    
  37.509 + * 2: Shadowing PAE top-level pages.  Each guest page that contains
  37.510 + *    any PAE top-level pages requires two shadow pages to shadow it.
  37.511 + *    They contain alternating l3 tables and pae_l3_bookkeeping structs.
  37.512 + *
  37.513 + * This table shows the allocation behaviour of the different modes:
  37.514 + *
  37.515 + * Xen paging      32b  pae  pae  64b  64b  64b
  37.516 + * Guest paging    32b  32b  pae  32b  pae  64b
  37.517 + * PV or HVM        *   HVM   *   HVM  HVM   * 
  37.518 + * Shadow paging   32b  pae  pae  pae  pae  64b
  37.519 + *
  37.520 + * sl1 size         4k   8k   4k   8k   4k   4k
  37.521 + * sl2 size         4k  16k   4k  16k   4k   4k
  37.522 + * sl3 size         -    -    8k   -    8k   4k
  37.523 + * sl4 size         -    -    -    -    -    4k
  37.524 + *
  37.525 + * We allocate memory from xen in four-page units and break them down
  37.526 + * with a simple buddy allocator.  Can't use the xen allocator to handle
  37.527 + * this as it only works for contiguous zones, and a domain's shadow
  37.528 + * pool is made of fragments.
  37.529 + *
  37.530 + * In HVM guests, the p2m table is built out of shadow pages, and we provide 
  37.531 + * a function for the p2m management to steal pages, in max-order chunks, from 
  37.532 + * the free pool.  We don't provide for giving them back, yet.
  37.533 + */
  37.534 +
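
The comments above describe how a shadow page's count_info packs a reference count in the low bits with the shadow type and a pinned flag in the top bits. The toy program below illustrates that kind of packing; the shifts and masks are invented for the example and do not match the real PGC_SH_* layout.

    #include <stdio.h>

    #define DEMO_TYPE_SHIFT   28
    #define DEMO_TYPE_MASK    (0x7ul << DEMO_TYPE_SHIFT)
    #define DEMO_PINNED_BIT   (1ul << 31)
    #define DEMO_COUNT_MASK   ((1ul << DEMO_TYPE_SHIFT) - 1)

    int main(void)
    {
        unsigned long count_info = 0;

        count_info |= 3ul << DEMO_TYPE_SHIFT;   /* shadow type 3 */
        count_info |= DEMO_PINNED_BIT;          /* pin the top-level shadow */
        count_info += 5;                        /* five references */

        printf("type=%lu pinned=%d refs=%lu\n",
               (count_info & DEMO_TYPE_MASK) >> DEMO_TYPE_SHIFT,
               !!(count_info & DEMO_PINNED_BIT),
               count_info & DEMO_COUNT_MASK);
        return 0;
    }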
  37.535 +/* Figure out the least acceptable quantity of shadow memory.
  37.536 + * The minimum memory requirement for always being able to free up a
  37.537 + * chunk of memory is very small -- only three max-order chunks per
  37.538 + * vcpu to hold the top level shadows and pages with Xen mappings in them.  
  37.539 + *
  37.540 + * But for a guest to be guaranteed to successfully execute a single
  37.541 + * instruction, we must be able to map a large number (about thirty) VAs
  37.542 + * at the same time, which means that to guarantee progress, we must
  37.543 + * allow for more than ninety allocated pages per vcpu.  We round that
  37.544 + * up to 128 pages, or half a megabyte per vcpu. */
  37.545 +unsigned int shadow_min_acceptable_pages(struct domain *d) 
  37.546 +{
  37.547 +    u32 vcpu_count = 0;
  37.548 +    struct vcpu *v;
  37.549 +
  37.550 +    for_each_vcpu(d, v)
  37.551 +        vcpu_count++;
  37.552 +
  37.553 +    return (vcpu_count * 128);
  37.554 +}
  37.555 +
  37.556 +/* Using the type_info field to store freelist order */
  37.557 +#define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
  37.558 +#define SH_SET_PFN_ORDER(_p, _o)                       \
  37.559 + do { (_p)->u.inuse.type_info = (_o); } while (0)
  37.560 + 
  37.561 +
  37.562 +/* Figure out the order of allocation needed for a given shadow type */
  37.563 +static inline u32
  37.564 +shadow_order(u32 shadow_type) 
  37.565 +{
  37.566 +#if CONFIG_PAGING_LEVELS > 2
  37.567 +    static const u32 type_to_order[16] = {
  37.568 +        0, /* PGC_SH_none           */
  37.569 +        1, /* PGC_SH_l1_32_shadow   */
  37.570 +        1, /* PGC_SH_fl1_32_shadow  */
  37.571 +        2, /* PGC_SH_l2_32_shadow   */
  37.572 +        0, /* PGC_SH_l1_pae_shadow  */
  37.573 +        0, /* PGC_SH_fl1_pae_shadow */
  37.574 +        0, /* PGC_SH_l2_pae_shadow  */
  37.575 +        0, /* PGC_SH_l2h_pae_shadow */
  37.576 +        1, /* PGC_SH_l3_pae_shadow  */
  37.577 +        0, /* PGC_SH_l1_64_shadow   */
  37.578 +        0, /* PGC_SH_fl1_64_shadow  */
  37.579 +        0, /* PGC_SH_l2_64_shadow   */
  37.580 +        0, /* PGC_SH_l3_64_shadow   */
  37.581 +        0, /* PGC_SH_l4_64_shadow   */
  37.582 +        2, /* PGC_SH_p2m_table      */
  37.583 +        0  /* PGC_SH_monitor_table  */
  37.584 +        };
  37.585 +    u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift;
  37.586 +    return type_to_order[type];
  37.587 +#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
  37.588 +    return 0;
  37.589 +#endif
  37.590 +}
  37.591 +
  37.592 +
  37.593 +/* Do we have a free chunk of at least this order? */
  37.594 +static inline int chunk_is_available(struct domain *d, int order)
  37.595 +{
  37.596 +    int i;
  37.597 +    
  37.598 +    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
  37.599 +        if ( !list_empty(&d->arch.shadow.freelists[i]) )
  37.600 +            return 1;
  37.601 +    return 0;
  37.602 +}
  37.603 +
  37.604 +/* Dispatcher function: call the per-mode function that will unhook the
  37.605 + * non-Xen mappings in this top-level shadow mfn */
  37.606 +void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
  37.607 +{
  37.608 +    struct page_info *pg = mfn_to_page(smfn);
  37.609 +    switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift )
  37.610 +    {
  37.611 +    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
  37.612 +#if CONFIG_PAGING_LEVELS == 2
  37.613 +        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn);
  37.614 +#else
  37.615 +        SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn);
  37.616 +#endif
  37.617 +        break;
  37.618 +#if CONFIG_PAGING_LEVELS >= 3
  37.619 +    case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
  37.620 +        SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn);
  37.621 +        break;
  37.622 +#endif
  37.623 +#if CONFIG_PAGING_LEVELS >= 4
  37.624 +    case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
  37.625 +        SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn);
  37.626 +        break;
  37.627 +#endif
  37.628 +    default:
  37.629 +        SHADOW_PRINTK("top-level shadow has bad type %08lx\n", 
  37.630 +                       (unsigned long)((pg->count_info & PGC_SH_type_mask)
  37.631 +                                       >> PGC_SH_type_shift));
  37.632 +        BUG();
  37.633 +    }
  37.634 +}
  37.635 +
  37.636 +
  37.637 +/* Make sure there is at least one chunk of the required order available
  37.638 + * in the shadow page pool. This must be called before any calls to
  37.639 + * shadow_alloc().  Since this will free existing shadows to make room,
  37.640 + * it must be called early enough to avoid freeing shadows that the
  37.641 + * caller is currently working on. */
  37.642 +void shadow_prealloc(struct domain *d, unsigned int order)
  37.643 +{
  37.644 +    /* Need a vcpu for calling unpins; for now, since we don't have
  37.645 +     * per-vcpu shadows, any will do */
  37.646 +    struct vcpu *v = d->vcpu[0];
  37.647 +    struct list_head *l, *t;
  37.648 +    struct page_info *pg;
  37.649 +    mfn_t smfn;
  37.650 +
  37.651 +    if ( chunk_is_available(d, order) ) return; 
  37.652 +    
  37.653 +    /* Stage one: walk the list of top-level pages, unpinning them */
  37.654 +    perfc_incrc(shadow_prealloc_1);
  37.655 +    list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
  37.656 +    {
  37.657 +        pg = list_entry(l, struct page_info, list);
  37.658 +        smfn = page_to_mfn(pg);
  37.659 +
  37.660 +#if CONFIG_PAGING_LEVELS >= 3
  37.661 +        if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow )
  37.662 +        {
  37.663 +            /* For PAE, we need to unpin each subshadow on this shadow */
  37.664 +            SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn);
  37.665 +        } 
  37.666 +        else 
  37.667 +#endif /* 32-bit code always takes this branch */
  37.668 +        {
  37.669 +            /* Unpin this top-level shadow */
  37.670 +            sh_unpin(v, smfn);
  37.671 +        }
  37.672 +
  37.673 +        /* See if that freed up a chunk of appropriate size */
  37.674 +        if ( chunk_is_available(d, order) ) return;
  37.675 +    }
  37.676 +
  37.677 +    /* Stage two: all shadow pages are in use in hierarchies that are
  37.678 +     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
  37.679 +     * mappings. */
  37.680 +    perfc_incrc(shadow_prealloc_2);
  37.681 +    v = current;
  37.682 +    if ( v->domain != d )
  37.683 +        v = d->vcpu[0];
  37.684 +    /* Walk the list from the tail: recently used toplevels have been pulled
  37.685 +     * to the head */
  37.686 +    list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows)
  37.687 +    {
  37.688 +        pg = list_entry(l, struct page_info, list);
  37.689 +        smfn = page_to_mfn(pg);
  37.690 +        shadow_unhook_mappings(v, smfn);
  37.691 +
  37.692 +        /* Need to flush TLB if we've altered our own tables */
  37.693 +        if ( !shadow_mode_external(d) 
  37.694 +             && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
  37.695 +            local_flush_tlb();
  37.696 +        
  37.697 +        /* See if that freed up a chunk of appropriate size */
  37.698 +        if ( chunk_is_available(d, order) ) return;
  37.699 +    }
  37.700 +    
  37.701 +    /* Nothing more we can do: all remaining shadows are of pages that
  37.702 +     * hold Xen mappings for some vcpu.  This can never happen. */
  37.703 +    SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n"
  37.704 +                   "  shadow pages total = %u, free = %u, p2m=%u\n",
  37.705 +                   1 << order, 
  37.706 +                   d->arch.shadow.total_pages, 
  37.707 +                   d->arch.shadow.free_pages, 
  37.708 +                   d->arch.shadow.p2m_pages);
  37.709 +    BUG();
  37.710 +}
  37.711 +
  37.712 +
  37.713 +/* Allocate another shadow's worth of (contiguous, aligned) pages,
  37.714 + * and fill in the type and backpointer fields of their page_infos. 
  37.715 + * Never fails to allocate. */
  37.716 +mfn_t shadow_alloc(struct domain *d,  
  37.717 +                    u32 shadow_type,
  37.718 +                    unsigned long backpointer)
  37.719 +{
  37.720 +    struct page_info *pg = NULL;
  37.721 +    unsigned int order = shadow_order(shadow_type);
  37.722 +    cpumask_t mask;
  37.723 +    void *p;
  37.724 +    int i;
  37.725 +
  37.726 +    ASSERT(shadow_lock_is_acquired(d));
  37.727 +    ASSERT(order <= SHADOW_MAX_ORDER);
  37.728 +    ASSERT(shadow_type != PGC_SH_none);
  37.729 +    perfc_incrc(shadow_alloc);
  37.730 +
  37.731 +    /* Find smallest order which can satisfy the request. */
  37.732 +    for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
  37.733 +        if ( !list_empty(&d->arch.shadow.freelists[i]) )
  37.734 +        {
  37.735 +            pg = list_entry(d->arch.shadow.freelists[i].next, 
  37.736 +                            struct page_info, list);
  37.737 +            list_del(&pg->list);
  37.738 +            
  37.739 +            /* We may have to halve the chunk a number of times. */
  37.740 +            while ( i != order )
  37.741 +            {
  37.742 +                i--;
  37.743 +                SH_SET_PFN_ORDER(pg, i);
  37.744 +                list_add_tail(&pg->list, &d->arch.shadow.freelists[i]);
  37.745 +                pg += 1 << i;
  37.746 +            }
  37.747 +            d->arch.shadow.free_pages -= 1 << order;
  37.748 +
  37.749 +            /* Init page info fields and clear the pages */
  37.750 +            for ( i = 0; i < 1<<order ; i++ ) 
  37.751 +            {
  37.752 +                pg[i].u.inuse.type_info = backpointer;
  37.753 +                pg[i].count_info = shadow_type;
  37.754 +                pg[i].shadow_flags = 0;
  37.755 +                INIT_LIST_HEAD(&pg[i].list);
  37.756 +                /* Before we overwrite the old contents of this page, 
  37.757 +                 * we need to be sure that no TLB holds a pointer to it. */
  37.758 +                mask = d->domain_dirty_cpumask;
  37.759 +                tlbflush_filter(mask, pg[i].tlbflush_timestamp);
  37.760 +                if ( unlikely(!cpus_empty(mask)) )
  37.761 +                {
  37.762 +                    perfc_incrc(shadow_alloc_tlbflush);
  37.763 +                    flush_tlb_mask(mask);
  37.764 +                }
  37.765 +                /* Now safe to clear the page for reuse */
  37.766 +                p = sh_map_domain_page(page_to_mfn(pg+i));
  37.767 +                ASSERT(p != NULL);
  37.768 +                clear_page(p);
  37.769 +                sh_unmap_domain_page(p);
  37.770 +                perfc_incr(shadow_alloc_count);
  37.771 +            }
  37.772 +            return page_to_mfn(pg);
  37.773 +        }
  37.774 +    
  37.775 +    /* If we get here, we failed to allocate. This should never happen.
  37.776 +     * It means that we didn't call shadow_prealloc() correctly before
  37.777 +     * we allocated.  We can't recover by calling prealloc here, because
  37.778 +     * we might free up higher-level pages that the caller is working on. */
  37.779 +    SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
  37.780 +    BUG();
  37.781 +}
  37.782 +
  37.783 +
  37.784 +/* Return some shadow pages to the pool. */
  37.785 +void shadow_free(struct domain *d, mfn_t smfn)
  37.786 +{
  37.787 +    struct page_info *pg = mfn_to_page(smfn); 
  37.788 +    u32 shadow_type;
  37.789 +    unsigned long order;
  37.790 +    unsigned long mask;
  37.791 +    int i;
  37.792 +
  37.793 +    ASSERT(shadow_lock_is_acquired(d));
  37.794 +    perfc_incrc(shadow_free);
  37.795 +
  37.796 +    shadow_type = pg->count_info & PGC_SH_type_mask;
  37.797 +    ASSERT(shadow_type != PGC_SH_none);
  37.798 +    ASSERT(shadow_type != PGC_SH_p2m_table);
  37.799 +    order = shadow_order(shadow_type);
  37.800 +
  37.801 +    d->arch.shadow.free_pages += 1 << order;
  37.802 +
  37.803 +    for ( i = 0; i < 1<<order; i++ ) 
  37.804 +    {
  37.805 +        /* Strip out the type: this is now a free shadow page */
  37.806 +        pg[i].count_info = 0;
  37.807 +        /* Remember the TLB timestamp so we will know whether to flush 
  37.808 +         * TLBs when we reuse the page.  Because the destructors leave the
  37.809 +         * contents of the pages in place, we can delay TLB flushes until
  37.810 +         * just before the allocator hands the page out again. */
  37.811 +        pg[i].tlbflush_timestamp = tlbflush_current_time();
  37.812 +        perfc_decr(shadow_alloc_count);
  37.813 +    }
  37.814 +
  37.815 +    /* Merge chunks as far as possible. */
  37.816 +    while ( order < SHADOW_MAX_ORDER )
  37.817 +    {
  37.818 +        mask = 1 << order;
  37.819 +        if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
  37.820 +            /* Merge with predecessor block? */
  37.821 +            if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGT_none) 
  37.822 +                 || (SH_PFN_ORDER(pg-mask) != order) )
  37.823 +                break;
  37.824 +            list_del(&(pg-mask)->list);
  37.825 +            pg -= mask;
  37.826 +        } else {
  37.827 +            /* Merge with successor block? */
  37.828 +            if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none)
  37.829 +                 || (SH_PFN_ORDER(pg+mask) != order) )
  37.830 +                break;
  37.831 +            list_del(&(pg+mask)->list);
  37.832 +        }
  37.833 +        order++;
  37.834 +    }
  37.835 +
  37.836 +    SH_SET_PFN_ORDER(pg, order);
  37.837 +    list_add_tail(&pg->list, &d->arch.shadow.freelists[order]);
  37.838 +}
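
shadow_alloc() and shadow_free() implement a small binary buddy allocator over the shadow pool: an allocation repeatedly halves the smallest free chunk that is large enough, and a free coalesces with its buddy, found by flipping bit 'order' of the chunk's page number. The sketch below shows only that index arithmetic, with no Xen structures; the page numbers are illustrative.

    #include <stdio.h>

    #define DEMO_MAX_ORDER 2   /* chunks of 1, 2 or 4 pages, like SHADOW_MAX_ORDER */

    static void split(unsigned long base, int have, int want)
    {
        /* Splitting: put the upper half back on the free list each time. */
        while (have > want) {
            have--;
            printf("  free half: pages %lu..%lu (order %d)\n",
                   base + (1ul << have), base + (2ul << have) - 1, have);
        }
        printf("  allocated: pages %lu..%lu (order %d)\n",
               base, base + (1ul << want) - 1, want);
    }

    static unsigned long buddy_of(unsigned long base, int order)
    {
        /* The buddy of a chunk is found by flipping bit 'order' of its base. */
        return base ^ (1ul << order);
    }

    int main(void)
    {
        split(0, DEMO_MAX_ORDER, 0);   /* carve a 1-page chunk out of a 4-page one */
        printf("buddy of page 4 at order 0 is %lu\n", buddy_of(4, 0));
        printf("buddy of chunk 4 at order 1 is %lu\n", buddy_of(4, 1));
        return 0;
    }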
  37.839 +
  37.840 +/* Divert some memory from the pool to be used by the p2m mapping.
  37.841 + * This action is irreversible: the p2m mapping only ever grows.
  37.842 + * That's OK because the p2m table only exists for external domains,
  37.843 + * and those domains can't ever turn off shadow mode.
  37.844 + * Also, we only ever allocate a max-order chunk, so as to preserve
  37.845 + * the invariant that shadow_prealloc() always works.
  37.846 + * Returns 0 iff it can't get a chunk (the caller should then
  37.847 + * free up some pages in domheap and call set_sh_allocation);
  37.848 + * returns non-zero on success.
  37.849 + */
  37.850 +static int
  37.851 +shadow_alloc_p2m_pages(struct domain *d)
  37.852 +{
  37.853 +    struct page_info *pg;
  37.854 +    u32 i;
  37.855 +    ASSERT(shadow_lock_is_acquired(d));
  37.856 +    
  37.857 +    if ( d->arch.shadow.total_pages 
  37.858 +         < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) )
  37.859 +        return 0; /* Not enough shadow memory: need to increase it first */
  37.860 +    
  37.861 +    pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0));
  37.862 +    d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER);
  37.863 +    d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER);
  37.864 +    for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++)
  37.865 +    {
  37.866 +        /* Unlike shadow pages, mark p2m pages as owned by the domain */
  37.867 +        page_set_owner(&pg[i], d);
  37.868 +        list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist);
  37.869 +    }
  37.870 +    return 1;
  37.871 +}
  37.872 +
  37.873 +// Returns 0 if no memory is available...
  37.874 +mfn_t
  37.875 +shadow_alloc_p2m_page(struct domain *d)
  37.876 +{
  37.877 +    struct list_head *entry;
  37.878 +    mfn_t mfn;
  37.879 +    void *p;
  37.880 +
  37.881 +    if ( list_empty(&d->arch.shadow.p2m_freelist) &&
  37.882 +         !shadow_alloc_p2m_pages(d) )
  37.883 +        return _mfn(0);
  37.884 +    entry = d->arch.shadow.p2m_freelist.next;
  37.885 +    list_del(entry);
  37.886 +    list_add_tail(entry, &d->arch.shadow.p2m_inuse);
  37.887 +    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
  37.888 +    sh_get_ref(mfn, 0);
  37.889 +    p = sh_map_domain_page(mfn);
  37.890 +    clear_page(p);
  37.891 +    sh_unmap_domain_page(p);
  37.892 +
  37.893 +    return mfn;
  37.894 +}
  37.895 +
  37.896 +#if CONFIG_PAGING_LEVELS == 3
  37.897 +static void p2m_install_entry_in_monitors(struct domain *d, 
  37.898 +                                          l3_pgentry_t *l3e) 
  37.899 +/* Special case, only used for external-mode domains on PAE hosts:
  37.900 + * update the mapping of the p2m table.  Once again, this is trivial in
  37.901 + * other paging modes (one top-level entry points to the top-level p2m,
  37.902 + * no maintenance needed), but PAE makes life difficult by needing to
  37.903 + * copy the eight l3es of the p2m table into eight l2h slots in the
  37.904 + * monitor table.  This function makes fresh copies when a p2m l3e
  37.905 + * changes. */
  37.906 +{
  37.907 +    l2_pgentry_t *ml2e;
  37.908 +    struct vcpu *v;
  37.909 +    unsigned int index;
  37.910 +
  37.911 +    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
  37.912 +    ASSERT(index < MACHPHYS_MBYTES>>1);
  37.913 +
  37.914 +    for_each_vcpu(d, v) 
  37.915 +    {
  37.916 +        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
  37.917 +            continue;
  37.918 +        ASSERT(shadow_mode_external(v->domain));
  37.919 +
  37.920 +        SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
  37.921 +                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
  37.922 +
  37.923 +        if ( v == current ) /* OK to use linear map of monitor_table */
  37.924 +            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
  37.925 +        else 
  37.926 +        {
  37.927 +            l3_pgentry_t *ml3e;
  37.928 +            ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
  37.929 +            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
  37.930 +            ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
  37.931 +            ml2e += l2_table_offset(RO_MPT_VIRT_START);
  37.932 +            sh_unmap_domain_page(ml3e);
  37.933 +        }
  37.934 +        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
  37.935 +        if ( v != current )
  37.936 +            sh_unmap_domain_page(ml2e);
  37.937 +    }
  37.938 +}
  37.939 +#endif
  37.940 +
  37.941 +// Find the next level's P2M entry, checking for out-of-range gfn's...
  37.942 +// Returns NULL on error.
  37.943 +//
  37.944 +static l1_pgentry_t *
  37.945 +p2m_find_entry(void *table, unsigned long *gfn_remainder,
  37.946 +                   unsigned long gfn, u32 shift, u32 max)
  37.947 +{
  37.948 +    u32 index;
  37.949 +
  37.950 +    index = *gfn_remainder >> shift;
  37.951 +    if ( index >= max )
  37.952 +    {
  37.953 +        SHADOW_DEBUG(P2M, "gfn=0x%lx out of range "
  37.954 +                      "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
  37.955 +                       gfn, *gfn_remainder, shift, index, max);
  37.956 +        return NULL;
  37.957 +    }
  37.958 +    *gfn_remainder &= (1 << shift) - 1;
  37.959 +    return (l1_pgentry_t *)table + index;
  37.960 +}
  37.961 +
  37.962 +// Walk one level of the P2M table, allocating a new table if required.
  37.963 +// Returns 0 on error.
  37.964 +//
  37.965 +static int
  37.966 +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, 
  37.967 +               unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
  37.968 +               u32 max, unsigned long type)
  37.969 +{
  37.970 +    l1_pgentry_t *p2m_entry;
  37.971 +    void *next;
  37.972 +
  37.973 +    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
  37.974 +                                      shift, max)) )
  37.975 +        return 0;
  37.976 +
  37.977 +    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
  37.978 +    {
  37.979 +        mfn_t mfn = shadow_alloc_p2m_page(d);
  37.980 +        if ( mfn_x(mfn) == 0 )
  37.981 +            return 0;
  37.982 +        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
  37.983 +        mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
  37.984 +        mfn_to_page(mfn)->count_info = 1;
  37.985 +#if CONFIG_PAGING_LEVELS == 3
  37.986 +        if (type == PGT_l2_page_table)
  37.987 +        {
  37.988 +            /* We have written to the p2m l3: need to sync the per-vcpu
  37.989 +             * copies of it in the monitor tables */
  37.990 +            p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
  37.991 +        }
  37.992 +#endif
  37.993 +        /* The P2M can be shadowed: keep the shadows synced */
  37.994 +        if ( d->vcpu[0] )
  37.995 +            (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn,
  37.996 +                                                 p2m_entry, sizeof *p2m_entry);
  37.997 +    }
  37.998 +    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
  37.999 +    next = sh_map_domain_page(*table_mfn);
 37.1000 +    sh_unmap_domain_page(*table);
 37.1001 +    *table = next;
 37.1002 +
 37.1003 +    return 1;
 37.1004 +}
 37.1005 +
 37.1006 +// Returns 0 on error (out of memory)
 37.1007 +int
 37.1008 +shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
 37.1009 +{
 37.1010 +    // XXX -- this might be able to be faster iff current->domain == d
 37.1011 +    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
 37.1012 +    void *table = sh_map_domain_page(table_mfn);
 37.1013 +    unsigned long gfn_remainder = gfn;
 37.1014 +    l1_pgentry_t *p2m_entry;
 37.1015 +
 37.1016 +#if CONFIG_PAGING_LEVELS >= 4
 37.1017 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 37.1018 +                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
 37.1019 +                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
 37.1020 +        return 0;
 37.1021 +#endif
 37.1022 +#if CONFIG_PAGING_LEVELS >= 3
 37.1023 +    // When using PAE Xen, we only allow 33 bits of pseudo-physical
 37.1024 +    // address in translated guests (i.e. 8 GBytes).  This restriction
 37.1025 +    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
 37.1026 +    // in Xen's address space for translated PV guests.
 37.1027 +    //
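         +    // (Roughly: a 16MB hole / 8 bytes per PAE l1e = 2^21 p2m entries,
         +    //  each mapping a 4kB frame, so 2^21 * 2^12 bytes = 8 GBytes.)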
 37.1028 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 37.1029 +                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
 37.1030 +                         (CONFIG_PAGING_LEVELS == 3
 37.1031 +                          ? 8
 37.1032 +                          : L3_PAGETABLE_ENTRIES),
 37.1033 +                         PGT_l2_page_table) )
 37.1034 +        return 0;
 37.1035 +#endif
 37.1036 +    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
 37.1037 +                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
 37.1038 +                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
 37.1039 +        return 0;
 37.1040 +
 37.1041 +    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
 37.1042 +                               0, L1_PAGETABLE_ENTRIES);
 37.1043 +    ASSERT(p2m_entry);
 37.1044 +    if ( valid_mfn(mfn) )
 37.1045 +        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
 37.1046 +    else
 37.1047 +        *p2m_entry = l1e_empty();
 37.1048 +
 37.1049 +    /* The P2M can be shadowed: keep the shadows synced */
 37.1050 +    if ( d->vcpu[0] )
 37.1051 +        (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn,
         +                                             p2m_entry, sizeof *p2m_entry);
 37.1052 +
 37.1053 +    sh_unmap_domain_page(table);
 37.1054 +
 37.1055 +    return 1;
 37.1056 +}
 37.1057 +
 37.1058 +// Allocate a new p2m table for a domain.
 37.1059 +//
 37.1060 +// The structure of the p2m table is that of a pagetable for Xen (i.e. it is
 37.1061 +// controlled by CONFIG_PAGING_LEVELS).
 37.1062 +//
 37.1063 +// Returns 0 if p2m table could not be initialized
 37.1064 +//
 37.1065 +static int
 37.1066 +shadow_alloc_p2m_table(struct domain *d)
 37.1067 +{
 37.1068 +    mfn_t p2m_top;
 37.1069 +    struct list_head *entry;
 37.1070 +    unsigned int page_count = 0;
 37.1071 +    
 37.1072 +    SHADOW_PRINTK("allocating p2m table\n");
 37.1073 +    ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
 37.1074 +
 37.1075 +    p2m_top = shadow_alloc_p2m_page(d);
 37.1076 +    if ( mfn_x(p2m_top) == 0 )
 37.1077 +        return 0;
 37.1078 +
 37.1079 +    mfn_to_page(p2m_top)->count_info = 1;
 37.1080 +    mfn_to_page(p2m_top)->u.inuse.type_info = 
 37.1081 +#if CONFIG_PAGING_LEVELS == 4
 37.1082 +        PGT_l4_page_table
 37.1083 +#elif CONFIG_PAGING_LEVELS == 3
 37.1084 +        PGT_l3_page_table
 37.1085 +#elif CONFIG_PAGING_LEVELS == 2
 37.1086 +        PGT_l2_page_table
 37.1087 +#endif
 37.1088 +        | 1 | PGT_validated;
 37.1089 +
 37.1090 +    d->arch.phys_table = pagetable_from_mfn(p2m_top);
 37.1091 +
 37.1092 +    SHADOW_PRINTK("populating p2m table\n");
 37.1093 + 
 37.1094 +    for ( entry = d->page_list.next;
 37.1095 +          entry != &d->page_list;
 37.1096 +          entry = entry->next )
 37.1097 +    {
 37.1098 +        struct page_info *page = list_entry(entry, struct page_info, list);
 37.1099 +        mfn_t mfn = page_to_mfn(page);
 37.1100 +        unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
 37.1101 +        page_count++;
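         +        // The 0x55... value below is presumably the fill pattern the M2P
         +        // table is initialised with before real gpfns are written; such
         +        // frames have no gpfn yet, so skip them just like INVALID_M2P_ENTRY.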
 37.1102 +        if (
 37.1103 +#ifdef __x86_64__
 37.1104 +            (gfn != 0x5555555555555555L)
 37.1105 +#else
 37.1106 +            (gfn != 0x55555555L)
 37.1107 +#endif
 37.1108 +             && gfn != INVALID_M2P_ENTRY
 37.1109 +             && !shadow_set_p2m_entry(d, gfn, mfn) )
 37.1110 +        {
 37.1111 +            SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n",
 37.1112 +                           gfn, mfn_x(mfn));
 37.1113 +            return 0;
 37.1114 +        }
 37.1115 +    }
 37.1116 +
 37.1117 +    SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count);
 37.1118 +    return 1;
 37.1119 +}
 37.1120 +
 37.1121 +mfn_t
 37.1122 +sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
 37.1123 +/* Read another domain's p2m entries */
 37.1124 +{
 37.1125 +    mfn_t mfn;
 37.1126 +    unsigned long addr = gpfn << PAGE_SHIFT;
 37.1127 +    l2_pgentry_t *l2e;
 37.1128 +    l1_pgentry_t *l1e;
 37.1129 +    
 37.1130 +    ASSERT(shadow_mode_translate(d));
 37.1131 +    mfn = pagetable_get_mfn(d->arch.phys_table);
 37.1132 +
 37.1133 +
 37.1134 +#if CONFIG_PAGING_LEVELS > 2
 37.1135 +    if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
 37.1136 +        /* This pfn is higher than the p2m map can hold */
 37.1137 +        return _mfn(INVALID_MFN);
 37.1138 +#endif
 37.1139 +
 37.1140 +
 37.1141 +#if CONFIG_PAGING_LEVELS >= 4
 37.1142 +    { 
 37.1143 +        l4_pgentry_t *l4e = sh_map_domain_page(mfn);
 37.1144 +        l4e += l4_table_offset(addr);
 37.1145 +        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
 37.1146 +        {
 37.1147 +            sh_unmap_domain_page(l4e);
 37.1148 +            return _mfn(INVALID_MFN);
 37.1149 +        }
 37.1150 +        mfn = _mfn(l4e_get_pfn(*l4e));
 37.1151 +        sh_unmap_domain_page(l4e);
 37.1152 +    }
 37.1153 +#endif
 37.1154 +#if CONFIG_PAGING_LEVELS >= 3
 37.1155 +    {
 37.1156 +        l3_pgentry_t *l3e = sh_map_domain_page(mfn);
 37.1157 +        l3e += l3_table_offset(addr);
 37.1158 +        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
 37.1159 +        {
 37.1160 +            sh_unmap_domain_page(l3e);
 37.1161 +            return _mfn(INVALID_MFN);
 37.1162 +        }
 37.1163 +        mfn = _mfn(l3e_get_pfn(*l3e));
 37.1164 +        sh_unmap_domain_page(l3e);
 37.1165 +    }
 37.1166 +#endif
 37.1167 +
 37.1168 +    l2e = sh_map_domain_page(mfn);
 37.1169 +    l2e += l2_table_offset(addr);
 37.1170 +    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
 37.1171 +    {
 37.1172 +        sh_unmap_domain_page(l2e);
 37.1173 +        return _mfn(INVALID_MFN);
 37.1174 +    }
 37.1175 +    mfn = _mfn(l2e_get_pfn(*l2e));
 37.1176 +    sh_unmap_domain_page(l2e);
 37.1177 +
 37.1178 +    l1e = sh_map_domain_page(mfn);
 37.1179 +    l1e += l1_table_offset(addr);
 37.1180 +    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
 37.1181 +    {
 37.1182 +        sh_unmap_domain_page(l1e);
 37.1183 +        return _mfn(INVALID_MFN);
 37.1184 +    }
 37.1185 +    mfn = _mfn(l1e_get_pfn(*l1e));
 37.1186 +    sh_unmap_domain_page(l1e);
 37.1187 +
 37.1188 +    return mfn;
 37.1189 +}
 37.1190 +
 37.1191 +unsigned long
 37.1192 +shadow_gfn_to_mfn_foreign(unsigned long gpfn)
 37.1193 +{
 37.1194 +    return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn));
 37.1195 +}
 37.1196 +
 37.1197 +
 37.1198 +static void shadow_p2m_teardown(struct domain *d)
 37.1199 +/* Return all the p2m pages to Xen.
 37.1200 + * We know we don't have any extra mappings to these pages */
 37.1201 +{
 37.1202 +    struct list_head *entry, *n;
 37.1203 +    struct page_info *pg;
 37.1204 +
 37.1205 +    d->arch.phys_table = pagetable_null();
 37.1206 +
 37.1207 +    list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse)
 37.1208 +    {
 37.1209 +        pg = list_entry(entry, struct page_info, list);
 37.1210 +        list_del(entry);
 37.1211 +        /* Should have just the one ref we gave it in alloc_p2m_page() */
 37.1212 +        if ( (pg->count_info & PGC_SH_count_mask) != 1 )
 37.1213 +        {
 37.1214 +            SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
 37.1215 +                           pg->count_info, pg->u.inuse.type_info);
 37.1216 +        }
 37.1217 +        ASSERT(page_get_owner(pg) == d);
 37.1218 +        /* Free should not decrement domain's total allocation, since 
 37.1219 +         * these pages were allocated without an owner. */
 37.1220 +        page_set_owner(pg, NULL); 
 37.1221 +        free_domheap_pages(pg, 0);
 37.1222 +        d->arch.shadow.p2m_pages--;
 37.1223 +        perfc_decr(shadow_alloc_count);
 37.1224 +    }
 37.1225 +    list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist)
 37.1226 +    {
 37.1227 +        list_del(entry);
 37.1228 +        pg = list_entry(entry, struct page_info, list);
 37.1229 +        ASSERT(page_get_owner(pg) == d);
 37.1230 +        /* Free should not decrement domain's total allocation. */
 37.1231 +        page_set_owner(pg, NULL); 
 37.1232 +        free_domheap_pages(pg, 0);
 37.1233 +        d->arch.shadow.p2m_pages--;
 37.1234 +        perfc_decr(shadow_alloc_count);
 37.1235 +    }
 37.1236 +    ASSERT(d->arch.shadow.p2m_pages == 0);
 37.1237 +}
 37.1238 +
 37.1239 +/* Set the pool of shadow pages to the required number of pages.
 37.1240 + * Input will be rounded up to at least shadow_min_acceptable_pages(),
 37.1241 + * plus space for the p2m table.
 37.1242 + * Returns 0 for success, non-zero for failure. */
 37.1243 +static unsigned int set_sh_allocation(struct domain *d, 
 37.1244 +                                       unsigned int pages,
 37.1245 +                                       int *preempted)
 37.1246 +{
 37.1247 +    struct page_info *pg;
 37.1248 +    unsigned int lower_bound;
 37.1249 +    int j;
 37.1250 +
 37.1251 +    ASSERT(shadow_lock_is_acquired(d));
 37.1252 +    
 37.1253 +    /* Don't allocate less than the minimum acceptable, plus one page per
 37.1254 +     * megabyte of RAM (for the p2m table) */
 37.1255 +    lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
 37.1256 +    if ( pages > 0 && pages < lower_bound )
 37.1257 +        pages = lower_bound;
 37.1258 +    /* Round up to largest block size */
 37.1259 +    pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
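         +    /* (tot_pages / 256 is one page per MB of guest RAM, since
         +     * 256 * 4kB = 1MB.  As an illustration only: if SHADOW_MAX_ORDER
         +     * were 2, i.e. 4-page blocks, a request of 1025 pages would round
         +     * up to 1028.) */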
 37.1260 +
 37.1261 +    SHADOW_PRINTK("current %i target %i\n", 
 37.1262 +                   d->arch.shadow.total_pages, pages);
 37.1263 +
 37.1264 +    while ( d->arch.shadow.total_pages != pages ) 
 37.1265 +    {
 37.1266 +        if ( d->arch.shadow.total_pages < pages ) 
 37.1267 +        {
 37.1268 +            /* Need to allocate more memory from domheap */
 37.1269 +            pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); 
 37.1270 +            if ( pg == NULL ) 
 37.1271 +            { 
 37.1272 +                SHADOW_PRINTK("failed to allocate shadow pages.\n");
 37.1273 +                return -ENOMEM;
 37.1274 +            }
 37.1275 +            d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER;
 37.1276 +            d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER;
 37.1277 +            for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ ) 
 37.1278 +            {
 37.1279 +                pg[j].u.inuse.type_info = 0;  /* Free page */
 37.1280 +                pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
 37.1281 +            }
 37.1282 +            SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER);
 37.1283 +            list_add_tail(&pg->list, 
 37.1284 +                          &d->arch.shadow.freelists[SHADOW_MAX_ORDER]);
 37.1285 +        } 
 37.1286 +        else if ( d->arch.shadow.total_pages > pages ) 
 37.1287 +        {
 37.1288 +            /* Need to return memory to domheap */
 37.1289 +            shadow_prealloc(d, SHADOW_MAX_ORDER);
 37.1290 +            ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER]));
 37.1291 +            pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next, 
 37.1292 +                            struct page_info, list);
 37.1293 +            list_del(&pg->list);
 37.1294 +            d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER;
 37.1295 +            d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER;
 37.1296 +            free_domheap_pages(pg, SHADOW_MAX_ORDER);
 37.1297 +        }
 37.1298 +
 37.1299 +        /* Check to see if we need to yield and try again */
 37.1300 +        if ( preempted && hypercall_preempt_check() )
 37.1301 +        {
 37.1302 +            *preempted = 1;
 37.1303 +            return 0;
 37.1304 +        }
 37.1305 +    }
 37.1306 +
 37.1307 +    return 0;
 37.1308 +}
 37.1309 +
 37.1310 +unsigned int shadow_set_allocation(struct domain *d, 
 37.1311 +                                    unsigned int megabytes,
 37.1312 +                                    int *preempted)
 37.1313 +/* Hypercall interface to set the shadow memory allocation */
 37.1314 +{
 37.1315 +    unsigned int rv;
 37.1316 +    shadow_lock(d);
 37.1317 +    rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); 
 37.1318 +    SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n",
 37.1319 +                   d->domain_id,
 37.1320 +                   d->arch.shadow.total_pages,
 37.1321 +                   shadow_get_allocation(d));
 37.1322 +    shadow_unlock(d);
 37.1323 +    return rv;
 37.1324 +}
 37.1325 +
 37.1326 +/**************************************************************************/
 37.1327 +/* Hash table for storing the guest->shadow mappings */
 37.1328 +
 37.1329 +/* Hash function that takes a gfn or mfn, plus another byte of type info */
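         +/* (Each byte is folded in as k = p[i] + k*65599, since
         + * (k<<6) + (k<<16) - k == k * 65599 -- an sdbm-style string hash.) */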
 37.1330 +typedef u32 key_t;
 37.1331 +static inline key_t sh_hash(unsigned long n, u8 t) 
 37.1332 +{
 37.1333 +    unsigned char *p = (unsigned char *)&n;
 37.1334 +    key_t k = t;
 37.1335 +    int i;
 37.1336 +    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
 37.1337 +    return k;
 37.1338 +}
 37.1339 +
 37.1340 +#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
 37.1341 +
 37.1342 +/* Before we get to the mechanism, define a pair of audit functions
 37.1343 + * that sanity-check the contents of the hash table. */
 37.1344 +static void sh_hash_audit_bucket(struct domain *d, int bucket)
 37.1345 +/* Audit one bucket of the hash table */
 37.1346 +{
 37.1347 +    struct shadow_hash_entry *e, *x;
 37.1348 +    struct page_info *pg;
 37.1349 +
 37.1350 +    if ( !(SHADOW_AUDIT_ENABLE) )
 37.1351 +        return;
 37.1352 +
 37.1353 +    e = &d->arch.shadow.hash_table[bucket];
 37.1354 +    if ( e->t == 0 ) return; /* Bucket is empty */ 
 37.1355 +    while ( e )
 37.1356 +    {
 37.1357 +        /* Empty link? */
 37.1358 +        BUG_ON( e->t == 0 ); 
 37.1359 +        /* Bogus type? */
 37.1360 +        BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) );
 37.1361 +        /* Wrong bucket? */
 37.1362 +        BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket ); 
 37.1363 +        /* Duplicate entry? */
 37.1364 +        for ( x = e->next; x; x = x->next )
 37.1365 +            BUG_ON( x->n == e->n && x->t == e->t );
 37.1366 +        /* Bogus MFN? */
 37.1367 +        BUG_ON( !valid_mfn(e->smfn) );
 37.1368 +        pg = mfn_to_page(e->smfn);
 37.1369 +        /* Not a shadow? */
 37.1370 +        BUG_ON( page_get_owner(pg) != 0 );
 37.1371 +        /* Wrong kind of shadow? */
 37.1372 +        BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift 
 37.1373 +                != e->t ); 
 37.1374 +        /* Bad backlink? */
 37.1375 +        BUG_ON( pg->u.inuse.type_info != e->n );
 37.1376 +        if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
 37.1377 +             && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
 37.1378 +             && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) )
 37.1379 +        {
 37.1380 +            /* Bad shadow flags on guest page? */
 37.1381 +            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) );
 37.1382 +        }
 37.1383 +        /* That entry was OK; on we go */
 37.1384 +        e = e->next;
 37.1385 +    }
 37.1386 +}
 37.1387 +
 37.1388 +#else
 37.1389 +#define sh_hash_audit_bucket(_d, _b)
 37.1390 +#endif /* Hashtable bucket audit */
 37.1391 +
 37.1392 +
 37.1393 +#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
 37.1394 +
 37.1395 +static void sh_hash_audit(struct domain *d)
 37.1396 +/* Full audit: audit every bucket in the table */
 37.1397 +{
 37.1398 +    int i;
 37.1399 +
 37.1400 +    if ( !(SHADOW_AUDIT_ENABLE) )
 37.1401 +        return;
 37.1402 +
 37.1403 +    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
 37.1404 +    {
 37.1405 +        sh_hash_audit_bucket(d, i);
 37.1406 +    }
 37.1407 +}
 37.1408 +
 37.1409 +#else
 37.1410 +#define sh_hash_audit(_d)
 37.1411 +#endif /* Hashtable bucket audit */
 37.1412 +
 37.1413 +/* Memory management interface for bucket allocation.
 37.1414 + * These ought to come out of shadow memory, but at least on 32-bit
 37.1415 + * machines we are forced to allocate them from xenheap so that we can
 37.1416 + * address them. */
 37.1417 +static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d)
 37.1418 +{
 37.1419 +    struct shadow_hash_entry *extra, *x;
 37.1420 +    int i;
 37.1421 +
 37.1422 +    /* We need to allocate a new node. Ensure the free list is not empty. 
 37.1423 +     * Allocate new entries in units the same size as the original table. */
 37.1424 +    if ( unlikely(d->arch.shadow.hash_freelist == NULL) )
 37.1425 +    {
 37.1426 +        size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x));
 37.1427 +        extra = xmalloc_bytes(sz);
 37.1428 +
 37.1429 +        if ( extra == NULL )
 37.1430 +        {
 37.1431 +            /* No memory left! */
 37.1432 +            SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n");
 37.1433 +            domain_crash_synchronous();
 37.1434 +        }
 37.1435 +        memset(extra, 0, sz);
 37.1436 +
 37.1437 +        /* Record the allocation block so it can be correctly freed later. */
 37.1438 +        *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) = 
 37.1439 +            d->arch.shadow.hash_allocations;
 37.1440 +        d->arch.shadow.hash_allocations = &extra[0];
 37.1441 +
 37.1442 +        /* Thread a free chain through the newly-allocated nodes. */
 37.1443 +        for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ )
 37.1444 +            extra[i].next = &extra[i+1];
 37.1445 +        extra[i].next = NULL;
 37.1446 +
 37.1447 +        /* Add the new nodes to the free list. */
 37.1448 +        d->arch.shadow.hash_freelist = &extra[0];
 37.1449 +    }
 37.1450 +
 37.1451 +    /* Allocate a new node from the free list. */
 37.1452 +    x = d->arch.shadow.hash_freelist;
 37.1453 +    d->arch.shadow.hash_freelist = x->next;
 37.1454 +    return x;
 37.1455 +}
 37.1456 +
 37.1457 +static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e)
 37.1458 +{
 37.1459 +    /* Mark the bucket as empty and return it to the free list */
 37.1460 +    e->t = 0; 
 37.1461 +    e->next = d->arch.shadow.hash_freelist;
 37.1462 +    d->arch.shadow.hash_freelist = e;
 37.1463 +}
 37.1464 +
 37.1465 +
 37.1466 +/* Allocate and initialise the table itself.  
 37.1467 + * Returns 0 for success, 1 for error. */
 37.1468 +static int shadow_hash_alloc(struct domain *d)
 37.1469 +{
 37.1470 +    struct shadow_hash_entry *table;
 37.1471 +
 37.1472 +    ASSERT(shadow_lock_is_acquired(d));
 37.1473 +    ASSERT(!d->arch.shadow.hash_table);
 37.1474 +
 37.1475 +    table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS);
 37.1476 +    if ( !table ) return 1;
 37.1477 +    memset(table, 0, 
 37.1478 +           SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry));
 37.1479 +    d->arch.shadow.hash_table = table;
 37.1480 +    return 0;
 37.1481 +}
 37.1482 +
 37.1483 +/* Tear down the hash table and return all memory to Xen.
 37.1484 + * This function does not care whether the table is populated. */
 37.1485 +static void shadow_hash_teardown(struct domain *d)
 37.1486 +{
 37.1487 +    struct shadow_hash_entry *a, *n;
 37.1488 +
 37.1489 +    ASSERT(shadow_lock_is_acquired(d));
 37.1490 +    ASSERT(d->arch.shadow.hash_table);
 37.1491 +
 37.1492 +    /* Return the table itself */
 37.1493 +    xfree(d->arch.shadow.hash_table);
 37.1494 +    d->arch.shadow.hash_table = NULL;
 37.1495 +
 37.1496 +    /* Return any extra allocations */
 37.1497 +    a = d->arch.shadow.hash_allocations;
 37.1498 +    while ( a ) 
 37.1499 +    {
 37.1500 +        /* We stored a linked-list pointer at the end of each allocation */
 37.1501 +        n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS]));
 37.1502 +        xfree(a);
 37.1503 +        a = n;
 37.1504 +    }
 37.1505 +    d->arch.shadow.hash_allocations = NULL;
 37.1506 +    d->arch.shadow.hash_freelist = NULL;
 37.1507 +}
 37.1508 +
 37.1509 +
 37.1510 +mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
 37.1511 +/* Find an entry in the hash table.  Returns the MFN of the shadow,
 37.1512 + * or INVALID_MFN if it doesn't exist */
 37.1513 +{
 37.1514 +    struct domain *d = v->domain;
 37.1515 +    struct shadow_hash_entry *p, *x, *head;
 37.1516 +    key_t key;
 37.1517 +
 37.1518 +    ASSERT(shadow_lock_is_acquired(d));
 37.1519 +    ASSERT(d->arch.shadow.hash_table);
 37.1520 +    ASSERT(t);
 37.1521 +
 37.1522 +    sh_hash_audit(d);
 37.1523 +
 37.1524 +    perfc_incrc(shadow_hash_lookups);
 37.1525 +    key = sh_hash(n, t);
 37.1526 +
 37.1527 +    x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 37.1528 +    p = NULL;
 37.1529 +
 37.1530 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 37.1531 +
 37.1532 +    do
 37.1533 +    {
 37.1534 +        ASSERT(x->t || ((x == head) && (x->next == NULL)));
 37.1535 +
 37.1536 +        if ( x->n == n && x->t == t )
 37.1537 +        {
 37.1538 +            /* Pull-to-front if 'x' isn't already the head item */
 37.1539 +            if ( unlikely(x != head) )
 37.1540 +            {
 37.1541 +                if ( unlikely(d->arch.shadow.hash_walking != 0) )
 37.1542 +                    /* Can't reorder: someone is walking the hash chains */
 37.1543 +                    return x->smfn;
 37.1544 +                else 
 37.1545 +                {
 37.1546 +                    /* Delete 'x' from list and reinsert after head. */
 37.1547 +                    p->next = x->next;
 37.1548 +                    x->next = head->next;
 37.1549 +                    head->next = x;
 37.1550 +                    
 37.1551 +                    /* Swap 'x' contents with head contents. */
 37.1552 +                    SWAP(head->n, x->n);
 37.1553 +                    SWAP(head->t, x->t);
 37.1554 +                    SWAP(head->smfn, x->smfn);
 37.1555 +                }
 37.1556 +            }
 37.1557 +            else
 37.1558 +            {
 37.1559 +                perfc_incrc(shadow_hash_lookup_head);
 37.1560 +            }
 37.1561 +            return head->smfn;
 37.1562 +        }
 37.1563 +
 37.1564 +        p = x;
 37.1565 +        x = x->next;
 37.1566 +    }
 37.1567 +    while ( x != NULL );
 37.1568 +
 37.1569 +    perfc_incrc(shadow_hash_lookup_miss);
 37.1570 +    return _mfn(INVALID_MFN);
 37.1571 +}
 37.1572 +
 37.1573 +void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
 37.1574 +/* Put a mapping (n,t)->smfn into the hash table */
 37.1575 +{
 37.1576 +    struct domain *d = v->domain;
 37.1577 +    struct shadow_hash_entry *x, *head;
 37.1578 +    key_t key;
 37.1579 +    
 37.1580 +    ASSERT(shadow_lock_is_acquired(d));
 37.1581 +    ASSERT(d->arch.shadow.hash_table);
 37.1582 +    ASSERT(t);
 37.1583 +
 37.1584 +    sh_hash_audit(d);
 37.1585 +
 37.1586 +    perfc_incrc(shadow_hash_inserts);
 37.1587 +    key = sh_hash(n, t);
 37.1588 +
 37.1589 +    head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 37.1590 +
 37.1591 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 37.1592 +
 37.1593 +    /* If the bucket is empty then insert the new page as the head item. */
 37.1594 +    if ( head->t == 0 )
 37.1595 +    {
 37.1596 +        head->n = n;
 37.1597 +        head->t = t;
 37.1598 +        head->smfn = smfn;
 37.1599 +        ASSERT(head->next == NULL);
 37.1600 +    }
 37.1601 +    else 
 37.1602 +    {
 37.1603 +        /* Insert a new entry directly after the head item. */
 37.1604 +        x = sh_alloc_hash_entry(d);
 37.1605 +        x->n = n; 
 37.1606 +        x->t = t;
 37.1607 +        x->smfn = smfn;
 37.1608 +        x->next = head->next;
 37.1609 +        head->next = x;
 37.1610 +    }
 37.1611 +    
 37.1612 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 37.1613 +}
 37.1614 +
 37.1615 +void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
 37.1616 +/* Excise the mapping (n,t)->smfn from the hash table */
 37.1617 +{
 37.1618 +    struct domain *d = v->domain;
 37.1619 +    struct shadow_hash_entry *p, *x, *head;
 37.1620 +    key_t key;
 37.1621 +
 37.1622 +    ASSERT(shadow_lock_is_acquired(d));
 37.1623 +    ASSERT(d->arch.shadow.hash_table);
 37.1624 +    ASSERT(t);
 37.1625 +
 37.1626 +    sh_hash_audit(d);
 37.1627 +
 37.1628 +    perfc_incrc(shadow_hash_deletes);
 37.1629 +    key = sh_hash(n, t);
 37.1630 +
 37.1631 +    head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS];
 37.1632 +
 37.1633 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 37.1634 +
 37.1635 +    /* Match on head item? */
 37.1636 +    if ( head->n == n && head->t == t )
 37.1637 +    {
 37.1638 +        if ( (x = head->next) != NULL )
 37.1639 +        {
 37.1640 +            /* Overwrite head with contents of following node. */
 37.1641 +            head->n = x->n;
 37.1642 +            head->t = x->t;
 37.1643 +            head->smfn = x->smfn;
 37.1644 +
 37.1645 +            /* Delete following node. */
 37.1646 +            head->next = x->next;
 37.1647 +            sh_free_hash_entry(d, x);
 37.1648 +        }
 37.1649 +        else
 37.1650 +        {
 37.1651 +            /* This bucket is now empty. Initialise the head node. */
 37.1652 +            head->t = 0;
 37.1653 +        }
 37.1654 +    }
 37.1655 +    else 
 37.1656 +    {
 37.1657 +        /* Not at the head; need to walk the chain */
 37.1658 +        p = head;
 37.1659 +        x = head->next; 
 37.1660 +        
 37.1661 +        while(1)
 37.1662 +        {
 37.1663 +            ASSERT(x); /* We can't have hit the end, since our target is
 37.1664 +                        * still in the chain somewhere... */
 37.1665 +            if ( x->n == n && x->t == t )
 37.1666 +            {
 37.1667 +                /* Delete matching node. */
 37.1668 +                p->next = x->next;
 37.1669 +                sh_free_hash_entry(d, x);
 37.1670 +                break;
 37.1671 +            }
 37.1672 +            p = x;
 37.1673 +            x = x->next;
 37.1674 +        }
 37.1675 +    }
 37.1676 +
 37.1677 +    sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS);
 37.1678 +}
 37.1679 +
 37.1680 +typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
 37.1681 +
 37.1682 +static void hash_foreach(struct vcpu *v, 
 37.1683 +                         unsigned int callback_mask, 
 37.1684 +                         hash_callback_t callbacks[], 
 37.1685 +                         mfn_t callback_mfn)
 37.1686 +/* Walk the hash table looking at the types of the entries and 
 37.1687 + * calling the appropriate callback function for each entry. 
 37.1688 + * The mask determines which shadow types we call back for, and the array
 37.1689 + * of callbacks tells us which function to call.
 37.1690 + * Any callback may return non-zero to let us skip the rest of the scan. 
 37.1691 + *
 37.1692 + * WARNING: Callbacks MUST NOT add or remove hash entries unless they 
 37.1693 + * then return non-zero to terminate the scan. */
 37.1694 +{
 37.1695 +    int i, done = 0;
 37.1696 +    struct domain *d = v->domain;
 37.1697 +    struct shadow_hash_entry *x;
 37.1698 +
 37.1699 +    /* Say we're here, to stop hash-lookups reordering the chains */
 37.1700 +    ASSERT(shadow_lock_is_acquired(d));
 37.1701 +    ASSERT(d->arch.shadow.hash_walking == 0);
 37.1702 +    d->arch.shadow.hash_walking = 1;
 37.1703 +
 37.1704 +    callback_mask &= ~1; /* Never attempt to call back on empty buckets */
 37.1705 +    for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) 
 37.1706 +    {
 37.1707 +        /* WARNING: This is not safe against changes to the hash table.
 37.1708 +         * The callback *must* return non-zero if it has inserted or
 37.1709 +         * deleted anything from the hash (lookups are OK, though). */
 37.1710 +        for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next )
 37.1711 +        {
 37.1712 +            if ( callback_mask & (1 << x->t) ) 
 37.1713 +            {
 37.1714 +                ASSERT(x->t <= 15);
 37.1715 +                ASSERT(callbacks[x->t] != NULL);
 37.1716 +                if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
 37.1717 +                    break;
 37.1718 +            }
 37.1719 +        }
 37.1720 +        if ( done ) break; 
 37.1721 +    }
 37.1722 +    d->arch.shadow.hash_walking = 0; 
 37.1723 +}
 37.1724 +
 37.1725 +
 37.1726 +/**************************************************************************/
 37.1727 +/* Destroy a shadow page: simple dispatcher to call the per-type destructor
 37.1728 + * which will decrement refcounts appropriately and return memory to the 
 37.1729 + * free pool. */
 37.1730 +
 37.1731 +void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
 37.1732 +{
 37.1733 +    struct page_info *pg = mfn_to_page(smfn);
 37.1734 +    u32 t = pg->count_info & PGC_SH_type_mask;
 37.1735 +
 37.1736 +
 37.1737 +    SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
 37.1738 +
 37.1739 +    /* Double-check, if we can, that the shadowed page belongs to this
 37.1740 +     * domain, (by following the back-pointer). */
 37.1741 +    ASSERT(t == PGC_SH_fl1_32_shadow  ||  
 37.1742 +           t == PGC_SH_fl1_pae_shadow ||  
 37.1743 +           t == PGC_SH_fl1_64_shadow  || 
 37.1744 +           t == PGC_SH_monitor_table  || 
 37.1745 +           (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) 
 37.1746 +            == v->domain)); 
 37.1747 +
 37.1748 +    /* The down-shifts here are so that the switch statement is on nice
 37.1749 +     * small numbers that the compiler will enjoy */
 37.1750 +    switch ( t >> PGC_SH_type_shift )
 37.1751 +    {
 37.1752 +#if CONFIG_PAGING_LEVELS == 2
 37.1753 +    case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
 37.1754 +    case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
 37.1755 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn); 
 37.1756 +        break;
 37.1757 +    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
 37.1758 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn);
 37.1759 +        break;
 37.1760 +#else /* PAE or 64bit */
 37.1761 +    case PGC_SH_l1_32_shadow >> PGC_SH_type_shift:
 37.1762 +    case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift:
 37.1763 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn);
 37.1764 +        break;
 37.1765 +    case PGC_SH_l2_32_shadow >> PGC_SH_type_shift:
 37.1766 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn);
 37.1767 +        break;
 37.1768 +#endif
 37.1769 +
 37.1770 +#if CONFIG_PAGING_LEVELS >= 3
 37.1771 +    case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift:
 37.1772 +    case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift:
 37.1773 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn);
 37.1774 +        break;
 37.1775 +    case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift:
 37.1776 +    case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift:
 37.1777 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn);
 37.1778 +        break;
 37.1779 +    case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift:
 37.1780 +        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn);
 37.1781 +        break;
 37.1782 +#endif
 37.1783 +
 37.1784 +#if CONFIG_PAGING_LEVELS >= 4
 37.1785 +    case PGC_SH_l1_64_shadow >> PGC_SH_type_shift:
 37.1786 +    case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift:
 37.1787 +        SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn);
 37.1788 +        break;
 37.1789 +    case PGC_SH_l2_64_shadow >> PGC_SH_type_shift:
 37.1790 +        SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn);
 37.1791 +        break;
 37.1792 +    case PGC_SH_l3_64_shadow >> PGC_SH_type_shift:
 37.1793 +        SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn);
 37.1794 +        break;
 37.1795 +    case PGC_SH_l4_64_shadow >> PGC_SH_type_shift:
 37.1796 +        SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn);
 37.1797 +        break;
 37.1798 +#endif
 37.1799 +    default:
 37.1800 +        SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n", 
 37.1801 +                       (unsigned long)t);
 37.1802 +        BUG();
 37.1803 +    }    
 37.1804 +}
 37.1805 +
 37.1806 +/**************************************************************************/
 37.1807 +/* Remove all writeable mappings of a guest frame from the shadow tables 
 37.1808 + * Returns non-zero if we need to flush TLBs. 
 37.1809 + * level and fault_addr describe how we found this to be a pagetable;
 37.1810 + * level==0 means we have some other reason for revoking write access. */
 37.1811 +
 37.1812 +int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn, 
 37.1813 +                                unsigned int level,
 37.1814 +                                unsigned long fault_addr)
 37.1815 +{
 37.1816 +    /* Dispatch table for getting per-type functions */
 37.1817 +    static hash_callback_t callbacks[16] = {
 37.1818 +        NULL, /* none    */
 37.1819 +#if CONFIG_PAGING_LEVELS == 2
 37.1820 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32   */
 37.1821 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32  */
 37.1822 +#else 
 37.1823 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32   */
 37.1824 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32  */
 37.1825 +#endif
 37.1826 +        NULL, /* l2_32   */
 37.1827 +#if CONFIG_PAGING_LEVELS >= 3
 37.1828 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae  */
 37.1829 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */
 37.1830 +#else 
 37.1831 +        NULL, /* l1_pae  */
 37.1832 +        NULL, /* fl1_pae */
 37.1833 +#endif
 37.1834 +        NULL, /* l2_pae  */
 37.1835 +        NULL, /* l2h_pae */
 37.1836 +        NULL, /* l3_pae  */
 37.1837 +#if CONFIG_PAGING_LEVELS >= 4
 37.1838 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64   */
 37.1839 +        SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64  */
 37.1840 +#else
 37.1841 +        NULL, /* l1_64   */
 37.1842 +        NULL, /* fl1_64  */
 37.1843 +#endif
 37.1844 +        NULL, /* l2_64   */
 37.1845 +        NULL, /* l3_64   */
 37.1846 +        NULL, /* l4_64   */
 37.1847 +        NULL, /* p2m     */
 37.1848 +        NULL  /* unused  */
 37.1849 +    };
 37.1850 +
 37.1851 +    static unsigned int callback_mask = 
 37.1852 +          1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
 37.1853 +        | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
 37.1854 +        | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
 37.1855 +        | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
 37.1856 +        | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
 37.1857 +        | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
 37.1858 +        ;
 37.1859 +    struct page_info *pg = mfn_to_page(gmfn);
 37.1860 +
 37.1861 +    ASSERT(shadow_lock_is_acquired(v->domain));
 37.1862 +
 37.1863 +    /* Only remove writable mappings if we are doing shadow refcounts.
 37.1864 +     * In guest refcounting, we trust Xen to already be restricting
 37.1865 +     * all the writes to the guest page tables, so we do not need to
 37.1866 +     * do more. */
 37.1867 +    if ( !shadow_mode_refcounts(v->domain) )
 37.1868 +        return 0;
 37.1869 +
 37.1870 +    /* Early exit if it's already a pagetable, or otherwise not writeable */
 37.1871 +    if ( sh_mfn_is_a_page_table(gmfn) 
 37.1872 +         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
 37.1873 +        return 0;
 37.1874 +
 37.1875 +    perfc_incrc(shadow_writeable);
 37.1876 +
 37.1877 +    /* If this isn't a "normal" writeable page, the domain is trying to 
 37.1878 +     * put pagetables in special memory of some kind.  We can't allow that. */
 37.1879 +    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
 37.1880 +    {
 37.1881 +        SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" 
 37.1882 +                      PRtype_info "\n",
 37.1883 +                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
 37.1884 +        domain_crash(v->domain);
 37.1885 +    }
 37.1886 +
 37.1887 +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 37.1888 +    if ( v == current && level != 0 )
 37.1889 +    {
 37.1890 +        unsigned long gfn;
 37.1891 +        /* Heuristic: there is likely to be only one writeable mapping,
 37.1892 +         * and that mapping is likely to be in the current pagetable,
 37.1893 +         * either in the guest's linear map (linux, windows) or in a
 37.1894 +         * magic slot used to map high memory regions (linux HIGHPTE) */
 37.1895 +
 37.1896 +#define GUESS(_a, _h) do {                                              \
 37.1897 +            if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) )          \
 37.1898 +                perfc_incrc(shadow_writeable_h_ ## _h);                \
 37.1899 +            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
 37.1900 +                return 1;                                               \
 37.1901 +        } while (0)
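         +
         +        /* (The shifts below give the byte offset of the relevant pagetable
         +         * entry within the guest's linear pagetable mappings: with 4-byte
         +         * l1es, (va >> 12) * 4 == va >> 10; with 8-byte entries the
         +         * l1e/l2e/l3e offsets are va >> 9, va >> 18 and va >> 27.) */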
 37.1902 +
 37.1903 +        
 37.1904 +        /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
 37.1905 +        if ( v == current 
 37.1906 +             && (gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
 37.1907 +            GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
 37.1908 +
 37.1909 +        if ( v->arch.shadow.mode->guest_levels == 2 )
 37.1910 +        {
 37.1911 +            if ( level == 1 )
 37.1912 +                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
 37.1913 +                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
 37.1914 +        }
 37.1915 +#if CONFIG_PAGING_LEVELS >= 3
 37.1916 +        else if ( v->arch.shadow.mode->guest_levels == 3 )
 37.1917 +        {
 37.1918 +            /* 32bit PAE w2k3: linear map at 0xC0000000 */
 37.1919 +            switch ( level ) 
 37.1920 +            {
 37.1921 +            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
 37.1922 +            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
 37.1923 +            }
 37.1924 +        }
 37.1925 +#if CONFIG_PAGING_LEVELS >= 4
 37.1926 +        else if ( v->arch.shadow.mode->guest_levels == 4 )
 37.1927 +        {
 37.1928 +            /* 64bit w2k3: linear map at 0x0000070000000000 */
 37.1929 +            switch ( level ) 
 37.1930 +            {
 37.1931 +            case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
 37.1932 +            case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
 37.1933 +            case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
 37.1934 +            }
 37.1935 +        }
 37.1936 +#endif /* CONFIG_PAGING_LEVELS >= 4 */
 37.1937 +#endif /* CONFIG_PAGING_LEVELS >= 3 */
 37.1938 +
 37.1939 +#undef GUESS
 37.1940 +
 37.1941 +    }
 37.1942 +#endif
 37.1943 +    
 37.1944 +    /* Brute-force search of all the shadows, by walking the hash */
 37.1945 +    perfc_incrc(shadow_writeable_bf);
 37.1946 +    hash_foreach(v, callback_mask, callbacks, gmfn);
 37.1947 +
 37.1948 +    /* If that didn't catch the mapping, something is very wrong */
 37.1949 +    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
 37.1950 +    {
 37.1951 +        SHADOW_ERROR("can't find all writeable mappings of mfn %lx: "
 37.1952 +                      "%lu left\n", mfn_x(gmfn),
 37.1953 +                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
 37.1954 +        domain_crash(v->domain);
 37.1955 +    }
 37.1956 +    
 37.1957 +    /* We killed at least one writeable mapping, so must flush TLBs. */
 37.1958 +    return 1;
 37.1959 +}
 37.1960 +
 37.1961 +
 37.1962 +
 37.1963 +/**************************************************************************/
 37.1964 +/* Remove all mappings of a guest frame from the shadow tables.
 37.1965 + * Returns non-zero if we need to flush TLBs. */
 37.1966 +
 37.1967 +int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
 37.1968 +{
 37.1969 +    struct page_info *page = mfn_to_page(gmfn);
 37.1970 +    int expected_count;
 37.1971 +
 37.1972 +    /* Dispatch table for getting per-type functions */
 37.1973 +    static hash_callback_t callbacks[16] = {
 37.1974 +        NULL, /* none    */
 37.1975 +#if CONFIG_PAGING_LEVELS == 2
 37.1976 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32   */
 37.1977 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32  */
 37.1978 +#else 
 37.1979 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32   */
 37.1980 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32  */
 37.1981 +#endif
 37.1982 +        NULL, /* l2_32   */
 37.1983 +#if CONFIG_PAGING_LEVELS >= 3
 37.1984 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae  */
 37.1985 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */
 37.1986 +#else 
 37.1987 +        NULL, /* l1_pae  */
 37.1988 +        NULL, /* fl1_pae */
 37.1989 +#endif
 37.1990 +        NULL, /* l2_pae  */
 37.1991 +        NULL, /* l2h_pae */
 37.1992 +        NULL, /* l3_pae  */
 37.1993 +#if CONFIG_PAGING_LEVELS >= 4
 37.1994 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64   */
 37.1995 +        SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64  */
 37.1996 +#else
 37.1997 +        NULL, /* l1_64   */
 37.1998 +        NULL, /* fl1_64  */
 37.1999 +#endif
 37.2000 +        NULL, /* l2_64   */
 37.2001 +        NULL, /* l3_64   */
 37.2002 +        NULL, /* l4_64   */
 37.2003 +        NULL, /* p2m     */
 37.2004 +        NULL  /* unused  */
 37.2005 +    };
 37.2006 +
 37.2007 +    static unsigned int callback_mask = 
 37.2008 +          1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift)
 37.2009 +        | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift)
 37.2010 +        | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift)
 37.2011 +        | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift)
 37.2012 +        | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift)
 37.2013 +        | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift)
 37.2014 +        ;
 37.2015 +
 37.2016 +    perfc_incrc(shadow_mappings);
 37.2017 +    if ( (page->count_info & PGC_count_mask) == 0 )
 37.2018 +        return 0;
 37.2019 +
 37.2020 +    ASSERT(shadow_lock_is_acquired(v->domain));
 37.2021 +
 37.2022 +    /* XXX TODO: 
 37.2023 +     * Heuristics for finding the (probably) single mapping of this gmfn */
 37.2024 +    
 37.2025 +    /* Brute-force search of all the shadows, by walking the hash */
 37.2026 +    perfc_incrc(shadow_mappings_bf);
 37.2027 +    hash_foreach(v, callback_mask, callbacks, gmfn);
 37.2028 +
 37.2029 +    /* If that didn't catch the mapping, something is very wrong */
 37.2030 +    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
 37.2031 +    if ( (page->count_info & PGC_count_mask) != expected_count )
 37.2032 +    {
 37.2033 +        /* Don't complain if we're in HVM and there's one extra mapping: 
 37.2034 +         * The qemu helper process has an untyped mapping of this dom's RAM */
 37.2035 +        if ( !(shadow_mode_external(v->domain)
 37.2036 +               && (page->count_info & PGC_count_mask) <= 2
 37.2037 +               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
 37.2038 +        {
 37.2039 +            SHADOW_ERROR("can't find all mappings of mfn %lx: "
 37.2040 +                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
 37.2041 +                          page->count_info, page->u.inuse.type_info);
 37.2042 +        }
 37.2043 +    }
 37.2044 +
 37.2045 +    /* We killed at least one mapping, so must flush TLBs. */
 37.2046 +    return 1;
 37.2047 +}
 37.2048 +
 37.2049 +
 37.2050 +/**************************************************************************/
 37.2051 +/* Remove all shadows of a guest frame from the shadow tables */
 37.2052 +
 37.2053 +static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
 37.2054 +/* Follow this shadow's up-pointer, if it has one, and remove the reference
 37.2055 + * found there.  Returns 1 if that was the only reference to this shadow */
 37.2056 +{
 37.2057 +    struct page_info *pg = mfn_to_page(smfn);
 37.2058 +    mfn_t pmfn;
 37.2059 +    void *vaddr;
 37.2060 +    int rc;
 37.2061 +
 37.2062 +    ASSERT((pg->count_info & PGC_SH_type_mask) > 0);
 37.2063 +    ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow);
 37.2064 +    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow);
 37.2065 +    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow);
 37.2066 +    ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow);
 37.2067 +    
 37.2068 +    if (pg->up == 0) return 0;
 37.2069 +    pmfn = _mfn(pg->up >> PAGE_SHIFT);
 37.2070 +    ASSERT(valid_mfn(pmfn));
 37.2071 +    vaddr = sh_map_domain_page(pmfn);
 37.2072 +    ASSERT(vaddr);
 37.2073 +    vaddr += pg->up & (PAGE_SIZE-1);
 37.2074 +    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
 37.2075 +    
 37.2076 +    /* Is this the only reference to this shadow? */
 37.2077 +    rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0;
 37.2078 +
 37.2079 +    /* Blank the offending entry */
 37.2080 +    switch ((pg->count_info & PGC_SH_type_mask)) 
 37.2081 +    {
 37.2082 +    case PGC_SH_l1_32_shadow:
 37.2083 +    case PGC_SH_l2_32_shadow:
 37.2084 +#if CONFIG_PAGING_LEVELS == 2
 37.2085 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn);
 37.2086 +#else
 37.2087 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn);
 37.2088 +#endif
 37.2089 +        break;
 37.2090 +#if CONFIG_PAGING_LEVELS >=3
 37.2091 +    case PGC_SH_l1_pae_shadow:
 37.2092 +    case PGC_SH_l2_pae_shadow:
 37.2093 +    case PGC_SH_l2h_pae_shadow:
 37.2094 +    case PGC_SH_l3_pae_shadow:
 37.2095 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn);
 37.2096 +        break;
 37.2097 +#if CONFIG_PAGING_LEVELS >= 4
 37.2098 +    case PGC_SH_l1_64_shadow:
 37.2099 +    case PGC_SH_l2_64_shadow:
 37.2100 +    case PGC_SH_l3_64_shadow:
 37.2101 +    case PGC_SH_l4_64_shadow:
 37.2102 +        SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn);
 37.2103 +        break;
 37.2104 +#endif
 37.2105 +#endif
 37.2106 +    default: BUG(); /* Some weird unknown shadow type */
 37.2107 +    }
 37.2108 +    
 37.2109 +    sh_unmap_domain_page(vaddr);
 37.2110 +    if ( rc )
 37.2111 +        perfc_incrc(shadow_up_pointer);
 37.2112 +    else
 37.2113 +        perfc_incrc(shadow_unshadow_bf);
 37.2114 +
 37.2115 +    return rc;
 37.2116 +}
 37.2117 +
 37.2118 +void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
 37.2119 +/* Remove the shadows of this guest page.  
 37.2120 + * If all != 0, find all shadows, if necessary by walking the tables.
 37.2121 + * Otherwise, just try the (much faster) heuristics, which will remove 
 37.2122 + * at most one reference to each shadow of the page. */
 37.2123 +{
 37.2124 +    struct page_info *pg;
 37.2125 +    mfn_t smfn;
 37.2126 +    u32 sh_flags;
 37.2127 +    unsigned char t;
 37.2128 +
 37.2129 +    /* Dispatch table for getting per-type functions: each level must
 37.2130 +     * be called with the function to remove a lower-level shadow. */
 37.2131 +    static hash_callback_t callbacks[16] = {
 37.2132 +        NULL, /* none    */
 37.2133 +        NULL, /* l1_32   */
 37.2134 +        NULL, /* fl1_32  */
 37.2135 +#if CONFIG_PAGING_LEVELS == 2
 37.2136 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32   */
 37.2137 +#else 
 37.2138 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32   */
 37.2139 +#endif
 37.2140 +        NULL, /* l1_pae  */
 37.2141 +        NULL, /* fl1_pae */
 37.2142 +#if CONFIG_PAGING_LEVELS >= 3
 37.2143 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae  */
 37.2144 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */
 37.2145 +        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae  */
 37.2146 +#else 
 37.2147 +        NULL, /* l2_pae  */
 37.2148 +        NULL, /* l2h_pae */
 37.2149 +        NULL, /* l3_pae  */
 37.2150 +#endif
 37.2151 +        NULL, /* l1_64   */
 37.2152 +        NULL, /* fl1_64  */
 37.2153 +#if CONFIG_PAGING_LEVELS >= 4
 37.2154 +        SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64   */
 37.2155 +        SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64   */
 37.2156 +        SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64   */
 37.2157 +#else
 37.2158 +        NULL, /* l2_64   */
 37.2159 +        NULL, /* l3_64   */
 37.2160 +        NULL, /* l4_64   */
 37.2161 +#endif
 37.2162 +        NULL, /* p2m     */
 37.2163 +        NULL  /* unused  */
 37.2164 +    };
 37.2165 +
 37.2166 +    /* Another lookup table, for choosing which mask to use */
 37.2167 +    static unsigned int masks[16] = {
 37.2168 +        0, /* none    */
 37.2169 +        1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32   */
 37.2170 +        0, /* fl1_32  */
 37.2171 +        0, /* l2_32   */
 37.2172 +        ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift))
 37.2173 +         | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae  */
 37.2174 +        0, /* fl1_pae */
 37.2175 +        1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae  */
 37.2176 +        1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae  */
 37.2177 +        0, /* l3_pae  */
 37.2178 +        1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64   */
 37.2179 +        0, /* fl1_64  */
 37.2180 +        1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64   */
 37.2181 +        1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64   */
 37.2182 +        0, /* l4_64   */
 37.2183 +        0, /* p2m     */
 37.2184 +        0  /* unused  */
 37.2185 +    };
 37.2186 +
 37.2187 +    ASSERT(shadow_lock_is_acquired(v->domain));
 37.2188 +
 37.2189 +    pg = mfn_to_page(gmfn);
 37.2190 +
 37.2191 +    /* Bale out now if the page is not shadowed */
 37.2192 +    if ( (pg->count_info & PGC_page_table) == 0 )
 37.2193 +        return;
 37.2194 +
 37.2195 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
 37.2196 +                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
 37.2197 +
 37.2198 +    /* Search for this shadow in all appropriate shadows */
 37.2199 +    perfc_incrc(shadow_unshadow);
 37.2200 +    sh_flags = pg->shadow_flags;
 37.2201 +
 37.2202 +    /* Lower-level shadows need to be excised from upper-level shadows.
 37.2203 +     * This call to hash_foreach() looks dangerous but is in fact OK: each
 37.2204 +     * call will remove at most one shadow, and terminate immediately when
 37.2205 +     * it does remove it, so we never walk the hash after doing a deletion.  */
 37.2206 +#define DO_UNSHADOW(_type) do {                                 \
 37.2207 +    t = (_type) >> PGC_SH_type_shift;                          \
 37.2208 +    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);              \
 37.2209 +    if ( !sh_remove_shadow_via_pointer(v, smfn) && all )       \
 37.2210 +        hash_foreach(v, masks[t], callbacks, smfn);             \
 37.2211 +} while (0)
 37.2212 +
 37.2213 +    /* Top-level shadows need to be unpinned */
 37.2214 +#define DO_UNPIN(_type) do {                                             \
 37.2215 +    t = (_type) >> PGC_SH_type_shift;                                   \
 37.2216 +    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);                       \
 37.2217 +    if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned )                \
 37.2218 +        sh_unpin(v, smfn);                                              \
 37.2219 +    if ( (_type) == PGC_SH_l3_pae_shadow )                              \
 37.2220 +        SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \
 37.2221 +} while (0)
 37.2222 +
 37.2223 +    if ( sh_flags & SHF_L1_32 )   DO_UNSHADOW(PGC_SH_l1_32_shadow);
 37.2224 +    if ( sh_flags & SHF_L2_32 )   DO_UNPIN(PGC_SH_l2_32_shadow);
 37.2225 +#if CONFIG_PAGING_LEVELS >= 3
 37.2226 +    if ( sh_flags & SHF_L1_PAE )  DO_UNSHADOW(PGC_SH_l1_pae_shadow);
 37.2227 +    if ( sh_flags & SHF_L2_PAE )  DO_UNSHADOW(PGC_SH_l2_pae_shadow);
 37.2228 +    if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow);
 37.2229 +    if ( sh_flags & SHF_L3_PAE )  DO_UNPIN(PGC_SH_l3_pae_shadow);
 37.2230 +#if CONFIG_PAGING_LEVELS >= 4
 37.2231 +    if ( sh_flags & SHF_L1_64 )   DO_UNSHADOW(PGC_SH_l1_64_shadow);
 37.2232 +    if ( sh_flags & SHF_L2_64 )   DO_UNSHADOW(PGC_SH_l2_64_shadow);
 37.2233 +    if ( sh_flags & SHF_L3_64 )   DO_UNSHADOW(PGC_SH_l3_64_shadow);
 37.2234 +    if ( sh_flags & SHF_L4_64 )   DO_UNPIN(PGC_SH_l4_64_shadow);
 37.2235 +#endif
 37.2236 +#endif
 37.2237 +
 37.2238 +#undef DO_UNSHADOW
 37.2239 +#undef DO_UNPIN
 37.2240 +
 37.2241 +
 37.2242 +#if CONFIG_PAGING_LEVELS > 2
 37.2243 +    /* We may have caused some PAE l3 entries to change: need to 
 37.2244 +     * fix up the copies of them in various places */
 37.2245 +    if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) )
 37.2246 +        sh_pae_recopy(v->domain);
 37.2247 +#endif
 37.2248 +
 37.2249 +    /* If that didn't catch the shadows, something is wrong */
 37.2250 +    if ( all && (pg->count_info & PGC_page_table) )
 37.2251 +    {
 37.2252 +        SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n",
 37.2253 +                      mfn_x(gmfn), pg->shadow_flags);
 37.2254 +        domain_crash(v->domain);
 37.2255 +    }
 37.2256 +}
 37.2257 +
 37.2258 +void
 37.2259 +shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
 37.2260 +/* Even harsher: this is an HVM page that we think is no longer a pagetable.
 37.2261 + * Unshadow it, and recursively unshadow pages that reference it. */
 37.2262 +{
 37.2263 +    shadow_remove_all_shadows(v, gmfn);
 37.2264 +    /* XXX TODO:
 37.2265 +     * Rework this hashtable walker to return a linked-list of all 
 37.2266 +     * the shadows it modified, then do breadth-first recursion 
 37.2267 +     * to find the way up to higher-level tables and unshadow them too. 
 37.2268 +     *
 37.2269 +     * The current code (just tearing down each page's shadows as we
 37.2270 +     * detect that it is not a pagetable) is correct, but very slow. 
 37.2271 +     * It means extra emulated writes and slows down removal of mappings. */
 37.2272 +}
 37.2273 +
 37.2274 +/**************************************************************************/
 37.2275 +
 37.2276 +void sh_update_paging_modes(struct vcpu *v)
 37.2277 +{
 37.2278 +    struct domain *d = v->domain;
 37.2279 +    struct shadow_paging_mode *old_mode = v->arch.shadow.mode;
 37.2280 +    mfn_t old_guest_table;
 37.2281 +
 37.2282 +    ASSERT(shadow_lock_is_acquired(d));
 37.2283 +
 37.2284 +    // Valid transitions handled by this function:
 37.2285 +    // - For PV guests:
 37.2286 +    //     - after a shadow mode has been changed
 37.2287 +    // - For HVM guests:
 37.2288 +    //     - after a shadow mode has been changed
 37.2289 +    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
 37.2290 +    //
 37.2291 +
 37.2292 +    // Avoid determining the current shadow mode for uninitialized CPUs, as
 37.2293 +    // we can not yet determine whether it is an HVM or PV domain.
 37.2294 +    //
 37.2295 +    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
 37.2296 +    {
 37.2297 +        printk("%s: postponing determination of shadow mode\n", __func__);
 37.2298 +        return;
 37.2299 +    }
 37.2300 +
 37.2301 +    // First, tear down any old shadow tables held by this vcpu.
 37.2302 +    //
 37.2303 +    shadow_detach_old_tables(v);
 37.2304 +
 37.2305 +    if ( !hvm_guest(v) )
 37.2306 +    {
 37.2307 +        ///
 37.2308 +        /// PV guest
 37.2309 +        ///
 37.2310 +#if CONFIG_PAGING_LEVELS == 4
 37.2311 +        if ( pv_32bit_guest(v) )
 37.2312 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3);
 37.2313 +        else
 37.2314 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4);
 37.2315 +#elif CONFIG_PAGING_LEVELS == 3
 37.2316 +        v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
 37.2317 +#elif CONFIG_PAGING_LEVELS == 2
 37.2318 +        v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
 37.2319 +#else
 37.2320 +#error unexpected paging mode
 37.2321 +#endif
 37.2322 +    }
 37.2323 +    else
 37.2324 +    {
 37.2325 +        ///
 37.2326 +        /// HVM guest
 37.2327 +        ///
 37.2328 +        ASSERT(shadow_mode_translate(d));
 37.2329 +        ASSERT(shadow_mode_external(d));
 37.2330 +
 37.2331 +        v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v);
 37.2332 +        if ( !v->arch.shadow.hvm_paging_enabled )
 37.2333 +        {
 37.2334 +            
 37.2335 +            /* Set v->arch.guest_table to use the p2m map, and choose
 37.2336 +             * the appropriate shadow mode */
 37.2337 +            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
 37.2338 +#if CONFIG_PAGING_LEVELS == 2
 37.2339 +            v->arch.guest_table =
 37.2340 +                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
 37.2341 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2);
 37.2342 +#elif CONFIG_PAGING_LEVELS == 3 
 37.2343 +            v->arch.guest_table =
 37.2344 +                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
 37.2345 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
 37.2346 +#else /* CONFIG_PAGING_LEVELS == 4 */
 37.2347 +            { 
 37.2348 +                l4_pgentry_t *l4e; 
 37.2349 +                /* Use the start of the first l3 table as a PAE l3 */
 37.2350 +                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
 37.2351 +                l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 37.2352 +                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
 37.2353 +                v->arch.guest_table =
 37.2354 +                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
 37.2355 +                sh_unmap_domain_page(l4e);
 37.2356 +            }
 37.2357 +            v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3);
 37.2358 +#endif
 37.2359 +            /* Fix up refcounts on guest_table */
 37.2360 +            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
 37.2361 +            if ( mfn_x(old_guest_table) != 0 )
 37.2362 +                put_page(mfn_to_page(old_guest_table));
 37.2363 +        }
 37.2364 +        else
 37.2365 +        {
 37.2366 +#ifdef __x86_64__
 37.2367 +            if ( hvm_long_mode_enabled(v) )
 37.2368 +            {
 37.2369 +                // long mode guest...
 37.2370 +                v->arch.shadow.mode =
 37.2371 +                    &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4);
 37.2372 +            }
 37.2373 +            else
 37.2374 +#endif
 37.2375 +                if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
 37.2376 +                {
 37.2377 +#if CONFIG_PAGING_LEVELS >= 3
 37.2378 +                    // 32-bit PAE mode guest...
 37.2379 +                    v->arch.shadow.mode =
 37.2380 +                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3);
 37.2381 +#else
 37.2382 +                    SHADOW_ERROR("PAE not supported in 32-bit Xen\n");
 37.2383 +                    domain_crash(d);
 37.2384 +                    return;
 37.2385 +#endif
 37.2386 +                }
 37.2387 +                else
 37.2388 +                {
 37.2389 +                    // 32-bit 2 level guest...
 37.2390 +#if CONFIG_PAGING_LEVELS >= 3
 37.2391 +                    v->arch.shadow.mode =
 37.2392 +                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2);
 37.2393 +#else
 37.2394 +                    v->arch.shadow.mode =
 37.2395 +                        &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2);
 37.2396 +#endif
 37.2397 +                }
 37.2398 +        }
 37.2399 +
 37.2400 +        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
 37.2401 +        {
 37.2402 +            mfn_t mmfn = shadow_make_monitor_table(v);
 37.2403 +            v->arch.monitor_table = pagetable_from_mfn(mmfn);
 37.2404 +            v->arch.monitor_vtable = sh_map_domain_page(mmfn);
 37.2405 +        } 
 37.2406 +
 37.2407 +        if ( v->arch.shadow.mode != old_mode )
 37.2408 +        {
 37.2409 +            SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
 37.2410 +                           "(was g=%u s=%u)\n",
 37.2411 +                           d->domain_id, v->vcpu_id, 
 37.2412 +                           v->arch.shadow.mode->guest_levels,
 37.2413 +                           v->arch.shadow.mode->shadow_levels,
 37.2414 +                           old_mode ? old_mode->guest_levels : 0,
 37.2415 +                           old_mode ? old_mode->shadow_levels : 0);
 37.2416 +            if ( old_mode &&
 37.2417 +                 (v->arch.shadow.mode->shadow_levels !=
 37.2418 +                  old_mode->shadow_levels) )
 37.2419 +            {
 37.2420 +                /* Need to make a new monitor table for the new mode */
 37.2421 +                mfn_t new_mfn, old_mfn;
 37.2422 +
 37.2423 +                if ( v != current ) 
 37.2424 +                {
 37.2425 +                    SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
 37.2426 +                                  "this HVM vcpu's (d=%u v=%u) paging mode!\n",
 37.2427 +                                  current->domain->domain_id, current->vcpu_id,
 37.2428 +                                  v->domain->domain_id, v->vcpu_id);
 37.2429 +                    domain_crash(v->domain);
 37.2430 +                    return;
 37.2431 +                }
 37.2432 +
 37.2433 +                sh_unmap_domain_page(v->arch.monitor_vtable);
 37.2434 +                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
 37.2435 +                v->arch.monitor_table = pagetable_null();
 37.2436 +                new_mfn = v->arch.shadow.mode->make_monitor_table(v);            
 37.2437 +                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
 37.2438 +                v->arch.monitor_vtable = sh_map_domain_page(new_mfn);
 37.2439 +                SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n",
 37.2440 +                               mfn_x(new_mfn));
 37.2441 +
 37.2442 +                /* Don't be running on the old monitor table when we 
 37.2443 +                 * pull it down!  Switch CR3, and warn the HVM code that
 37.2444 +                 * its host cr3 has changed. */
 37.2445 +                make_cr3(v, mfn_x(new_mfn));
 37.2446 +                write_ptbase(v);
 37.2447 +                hvm_update_host_cr3(v);
 37.2448 +                old_mode->destroy_monitor_table(v, old_mfn);
 37.2449 +            }
 37.2450 +        }
 37.2451 +
 37.2452 +        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
 37.2453 +        //        These are HARD: think about the case where two CPUs have
 37.2454 +        //        different values for CR4.PSE and CR4.PGE at the same time.
 37.2455 +        //        This *does* happen, at least for CR4.PGE...
 37.2456 +    }
 37.2457 +
 37.2458 +    v->arch.shadow.mode->update_cr3(v);
 37.2459 +}
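/* Worked examples of the dispatch above on a 64-bit build
 * (CONFIG_PAGING_LEVELS == 4): a 32-bit PV guest gets
 * SHADOW_INTERNAL_NAME(sh_paging_mode,4,3); an HVM guest with CR4.PAE set but
 * long mode disabled gets SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); a 2-level
 * (non-PAE) HVM guest gets SHADOW_INTERNAL_NAME(sh_paging_mode,3,2); and an
 * HVM guest with paging disabled runs on the p2m map with
 * SHADOW_INTERNAL_NAME(sh_paging_mode,3,3) until it enables paging. */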
 37.2460 +
 37.2461 +/**************************************************************************/
 37.2462 +/* Turning on and off shadow features */
 37.2463 +
 37.2464 +static void sh_new_mode(struct domain *d, u32 new_mode)
 37.2465 +/* Inform all the vcpus that the shadow mode has been changed */
 37.2466 +{
 37.2467 +    struct vcpu *v;
 37.2468 +
 37.2469 +    ASSERT(shadow_lock_is_acquired(d));
 37.2470 +    ASSERT(d != current->domain);
 37.2471 +    d->arch.shadow.mode = new_mode;
 37.2472 +    if ( new_mode & SHM2_translate ) 
 37.2473 +        shadow_audit_p2m(d);
 37.2474 +    for_each_vcpu(d, v)
 37.2475 +        sh_update_paging_modes(v);
 37.2476 +}
 37.2477 +
 37.2478 +static int shadow_enable(struct domain *d, u32 mode)
 37.2479 +/* Turn on "permanent" shadow features: external, translate, refcount.
 37.2480 + * Can only be called once on a domain, and these features cannot be
 37.2481 + * disabled. 
 37.2482 + * Returns 0 for success, -errno for failure. */
 37.2483 +{    
 37.2484 +    unsigned int old_pages;
 37.2485 +    int rv = 0;
 37.2486 +
 37.2487 +    mode |= SHM2_enable;
 37.2488 +
 37.2489 +    domain_pause(d);
 37.2490 +    shadow_lock(d);
 37.2491 +
 37.2492 +    /* Sanity check the arguments */
 37.2493 +    if ( (d == current->domain) ||
 37.2494 +         shadow_mode_enabled(d) ||
 37.2495 +         ((mode & SHM2_external) && !(mode & SHM2_translate)) )
 37.2496 +    {
 37.2497 +        rv = -EINVAL;
 37.2498 +        goto out;
 37.2499 +    }
 37.2500 +
 37.2501 +    // XXX -- eventually would like to require that all memory be allocated
 37.2502 +    // *after* shadow_enabled() is called...  So here, we would test to make
 37.2503 +    // sure that d->page_list is empty.
 37.2504 +#if 0
 37.2505 +    spin_lock(&d->page_alloc_lock);
 37.2506 +    if ( !list_empty(&d->page_list) )
 37.2507 +    {
 37.2508 +        spin_unlock(&d->page_alloc_lock);
 37.2509 +        rv = -EINVAL;
 37.2510 +        goto out;
 37.2511 +    }
 37.2512 +    spin_unlock(&d->page_alloc_lock);
 37.2513 +#endif
 37.2514 +
 37.2515 +    /* Init the shadow memory allocation if the user hasn't done so */
 37.2516 +    old_pages = d->arch.shadow.total_pages;
 37.2517 +    if ( old_pages == 0 )
 37.2518 +        if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
 37.2519 +        {
 37.2520 +            set_sh_allocation(d, 0, NULL);
 37.2521 +            rv = -ENOMEM;
 37.2522 +            goto out;
 37.2523 +        }
 37.2524 +
 37.2525 +    /* Init the hash table */
 37.2526 +    if ( shadow_hash_alloc(d) != 0 )
 37.2527 +    {
 37.2528 +        set_sh_allocation(d, old_pages, NULL);            
 37.2529 +        rv = -ENOMEM;
 37.2530 +        goto out;
 37.2531 +    }
 37.2532 +
 37.2533 +    /* Init the P2M table */
 37.2534 +    if ( mode & SHM2_translate )
 37.2535 +        if ( !shadow_alloc_p2m_table(d) )
 37.2536 +        {
 37.2537 +            shadow_hash_teardown(d);
 37.2538 +            set_sh_allocation(d, old_pages, NULL);
 37.2539 +            shadow_p2m_teardown(d);
 37.2540 +            rv = -ENOMEM;
 37.2541 +            goto out;
 37.2542 +        }
 37.2543 +
 37.2544 +    /* Update the bits */
 37.2545 +    sh_new_mode(d, mode);
 37.2546 +    shadow_audit_p2m(d);
 37.2547 + out:
 37.2548 +    shadow_unlock(d);
 37.2549 +    domain_unpause(d);
 37.2550 +    return rv;
 37.2551 +}
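/* For example, a fully external (HVM-style) domain would be set up with
 * something like:
 *
 *     shadow_enable(d, SHM2_refcounts | SHM2_translate | SHM2_external);
 *
 * SHM2_enable is OR'd in above, so callers need not pass it explicitly, and
 * the sanity check rejects SHM2_external without SHM2_translate. */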
 37.2552 +
 37.2553 +void shadow_teardown(struct domain *d)
 37.2554 +/* Destroy the shadow pagetables of this domain and free its shadow memory.
 37.2555 + * Should only be called for dying domains. */
 37.2556 +{
 37.2557 +    struct vcpu *v;
 37.2558 +    mfn_t mfn;
 37.2559 +
 37.2560 +    ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
 37.2561 +    ASSERT(d != current->domain);
 37.2562 +
 37.2563 +    if ( !shadow_lock_is_acquired(d) )
 37.2564 +        shadow_lock(d); /* Keep various asserts happy */
 37.2565 +
 37.2566 +    if ( shadow_mode_enabled(d) )
 37.2567 +    {
 37.2568 +        /* Release the shadow and monitor tables held by each vcpu */
 37.2569 +        for_each_vcpu(d, v)
 37.2570 +        {
 37.2571 +            shadow_detach_old_tables(v);
 37.2572 +            if ( shadow_mode_external(d) )
 37.2573 +            {
 37.2574 +                mfn = pagetable_get_mfn(v->arch.monitor_table);
 37.2575 +                if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
 37.2576 +                    shadow_destroy_monitor_table(v, mfn);
 37.2577 +                v->arch.monitor_table = pagetable_null();
 37.2578 +            }
 37.2579 +        }
 37.2580 +    }
 37.2581 +
 37.2582 +    if ( d->arch.shadow.total_pages != 0 )
 37.2583 +    {
 37.2584 +        SHADOW_PRINTK("teardown of domain %u starts."
 37.2585 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 37.2586 +                       d->domain_id,
 37.2587 +                       d->arch.shadow.total_pages, 
 37.2588 +                       d->arch.shadow.free_pages, 
 37.2589 +                       d->arch.shadow.p2m_pages);
 37.2590 +        /* Destroy all the shadows and release memory to domheap */
 37.2591 +        set_sh_allocation(d, 0, NULL);
 37.2592 +        /* Release the hash table back to xenheap */
 37.2593 +        if ( d->arch.shadow.hash_table )
 37.2594 +            shadow_hash_teardown(d);
 37.2595 +        /* Release the log-dirty bitmap of dirtied pages */
 37.2596 +        sh_free_log_dirty_bitmap(d);
 37.2597 +        /* Should not have any more memory held */
 37.2598 +        SHADOW_PRINTK("teardown done."
 37.2599 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 37.2600 +                       d->arch.shadow.total_pages, 
 37.2601 +                       d->arch.shadow.free_pages, 
 37.2602 +                       d->arch.shadow.p2m_pages);
 37.2603 +        ASSERT(d->arch.shadow.total_pages == 0);
 37.2604 +    }
 37.2605 +
 37.2606 +    /* We leave the "permanent" shadow modes enabled, but clear the
 37.2607 +     * log-dirty mode bit.  We don't want any more mark_dirty()
 37.2608 +     * calls now that we've torn down the bitmap */
 37.2609 +    d->arch.shadow.mode &= ~SHM2_log_dirty;
 37.2610 +
 37.2611 +    shadow_unlock(d);
 37.2612 +}
 37.2613 +
 37.2614 +void shadow_final_teardown(struct domain *d)
 37.2615 +/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
 37.2616 +{
 37.2617 +
 37.2618 +    SHADOW_PRINTK("dom %u final teardown starts."
 37.2619 +                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
 37.2620 +                   d->domain_id,
 37.2621 +                   d->arch.shadow.total_pages, 
 37.2622 +                   d->arch.shadow.free_pages, 
 37.2623 +                   d->arch.shadow.p2m_pages);
 37.2624 +
 37.2625 +    /* Double-check that the domain didn't have any shadow memory.  
 37.2626 +     * It is possible for a domain that never got domain_kill()ed
 37.2627 +     * to get here with its shadow allocation intact. */
 37.2628 +    if ( d->arch.shadow.total_pages != 0 )
 37.2629 +        shadow_teardown(d);
 37.2630 +
 37.2631 +    /* It is now safe to pull down the p2m map. */
 37.2632 +    if ( d->arch.shadow.p2m_pages != 0 )
 37.2633 +        shadow_p2m_teardown(d);
 37.2634 +
 37.2635 +    SHADOW_PRINTK("dom %u final teardown done."
 37.2636 +                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
 37.2637 +                   d->domain_id,
 37.2638 +                   d->arch.shadow.total_pages, 
 37.2639 +                   d->arch.shadow.free_pages, 
 37.2640 +                   d->arch.shadow.p2m_pages);
 37.2641 +}
 37.2642 +
 37.2643 +static int shadow_one_bit_enable(struct domain *d, u32 mode)
 37.2644 +/* Turn on a single shadow mode feature */
 37.2645 +{
 37.2646 +    ASSERT(shadow_lock_is_acquired(d));
 37.2647 +
 37.2648 +    /* Sanity check the call */
 37.2649 +    if ( d == current->domain || (d->arch.shadow.mode & mode) )
 37.2650 +    {
 37.2651 +        return -EINVAL;
 37.2652 +    }
 37.2653 +
 37.2654 +    if ( d->arch.shadow.mode == 0 )
 37.2655 +    {
 37.2656 +        /* Init the shadow memory allocation and the hash table */
 37.2657 +        if ( set_sh_allocation(d, 1, NULL) != 0 
 37.2658 +             || shadow_hash_alloc(d) != 0 )
 37.2659 +        {
 37.2660 +            set_sh_allocation(d, 0, NULL);
 37.2661 +            return -ENOMEM;
 37.2662 +        }
 37.2663 +    }
 37.2664 +
 37.2665 +    /* Update the bits */
 37.2666 +    sh_new_mode(d, d->arch.shadow.mode | mode);
 37.2667 +
 37.2668 +    return 0;
 37.2669 +}
 37.2670 +
 37.2671 +static int shadow_one_bit_disable(struct domain *d, u32 mode) 
 37.2672 +/* Turn off a single shadow mode feature */
 37.2673 +{
 37.2674 +    struct vcpu *v;
 37.2675 +    ASSERT(shadow_lock_is_acquired(d));
 37.2676 +
 37.2677 +    /* Sanity check the call */
 37.2678 +    if ( d == current->domain || !(d->arch.shadow.mode & mode) )
 37.2679 +    {
 37.2680 +        return -EINVAL;
 37.2681 +    }
 37.2682 +
 37.2683 +    /* Update the bits */
 37.2684 +    sh_new_mode(d, d->arch.shadow.mode & ~mode);
 37.2685 +    if ( d->arch.shadow.mode == 0 )
 37.2686 +    {
 37.2687 +        /* Get this domain off shadows */
 37.2688 +        SHADOW_PRINTK("un-shadowing of domain %u starts."
 37.2689 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 37.2690 +                       d->domain_id,
 37.2691 +                       d->arch.shadow.total_pages, 
 37.2692 +                       d->arch.shadow.free_pages, 
 37.2693 +                       d->arch.shadow.p2m_pages);
 37.2694 +        for_each_vcpu(d, v)
 37.2695 +        {
 37.2696 +            shadow_detach_old_tables(v);
 37.2697 +#if CONFIG_PAGING_LEVELS == 4
 37.2698 +            if ( !(v->arch.flags & TF_kernel_mode) )
 37.2699 +                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
 37.2700 +            else
 37.2701 +#endif
 37.2702 +                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
 37.2703 +
 37.2704 +        }
 37.2705 +
 37.2706 +        /* Pull down the memory allocation */
 37.2707 +        if ( set_sh_allocation(d, 0, NULL) != 0 )
 37.2708 +        {
 37.2709 +            // XXX - How can this occur?
 37.2710 +            //       Seems like a bug to return an error now that we've
 37.2711 +            //       disabled the relevant shadow mode.
 37.2712 +            //
 37.2713 +            return -ENOMEM;
 37.2714 +        }
 37.2715 +        shadow_hash_teardown(d);
 37.2716 +        SHADOW_PRINTK("un-shadowing of domain %u done."
 37.2717 +                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
 37.2718 +                       d->domain_id,
 37.2719 +                       d->arch.shadow.total_pages, 
 37.2720 +                       d->arch.shadow.free_pages, 
 37.2721 +                       d->arch.shadow.p2m_pages);
 37.2722 +    }
 37.2723 +
 37.2724 +    return 0;
 37.2725 +}
 37.2726 +
 37.2727 +/* Enable/disable ops for the "test" and "log-dirty" modes */
 37.2728 +int shadow_test_enable(struct domain *d)
 37.2729 +{
 37.2730 +    int ret;
 37.2731 +
 37.2732 +    domain_pause(d);
 37.2733 +    shadow_lock(d);
 37.2734 +
 37.2735 +    if ( shadow_mode_enabled(d) )
 37.2736 +    {
 37.2737 +        SHADOW_ERROR("Don't support enabling test mode "
 37.2738 +                      "on already shadowed doms\n");
 37.2739 +        ret = -EINVAL;
 37.2740 +        goto out;
 37.2741 +    }
 37.2742 +
 37.2743 +    ret = shadow_one_bit_enable(d, SHM2_enable);
 37.2744 + out:
 37.2745 +    shadow_unlock(d);
 37.2746 +    domain_unpause(d);
 37.2747 +
 37.2748 +    return ret;
 37.2749 +}
 37.2750 +
 37.2751 +int shadow_test_disable(struct domain *d)
 37.2752 +{
 37.2753 +    int ret;
 37.2754 +
 37.2755 +    domain_pause(d);
 37.2756 +    shadow_lock(d);
 37.2757 +    ret = shadow_one_bit_disable(d, SHM2_enable);
 37.2758 +    shadow_unlock(d);
 37.2759 +    domain_unpause(d);
 37.2760 +
 37.2761 +    return ret;
 37.2762 +}
 37.2763 +
 37.2764 +static int
 37.2765 +sh_alloc_log_dirty_bitmap(struct domain *d)
 37.2766 +{
 37.2767 +    ASSERT(d->arch.shadow.dirty_bitmap == NULL);
 37.2768 +    d->arch.shadow.dirty_bitmap_size =
 37.2769 +        (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
 37.2770 +        ~(BITS_PER_LONG - 1);
 37.2771 +    d->arch.shadow.dirty_bitmap =
 37.2772 +        xmalloc_array(unsigned long,
 37.2773 +                      d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG);
 37.2774 +    if ( d->arch.shadow.dirty_bitmap == NULL )
 37.2775 +    {
 37.2776 +        d->arch.shadow.dirty_bitmap_size = 0;
 37.2777 +        return -ENOMEM;
 37.2778 +    }
 37.2779 +    memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8);
 37.2780 +
 37.2781 +    return 0;
 37.2782 +}
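/* Sizing example, assuming BITS_PER_LONG == 64: for max_pfn == 1000 the size
 * rounds up to 1024 bits, so xmalloc_array() allocates 1024/64 == 16 unsigned
 * longs and the memset() clears 1024/8 == 128 bytes. */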
 37.2783 +
 37.2784 +static void
 37.2785 +sh_free_log_dirty_bitmap(struct domain *d)
 37.2786 +{
 37.2787 +    d->arch.shadow.dirty_bitmap_size = 0;
 37.2788 +    if ( d->arch.shadow.dirty_bitmap )
 37.2789 +    {
 37.2790 +        xfree(d->arch.shadow.dirty_bitmap);
 37.2791 +        d->arch.shadow.dirty_bitmap = NULL;
 37.2792 +    }
 37.2793 +}
 37.2794 +
 37.2795 +static int shadow_log_dirty_enable(struct domain *d)
 37.2796 +{
 37.2797 +    int ret;
 37.2798 +
 37.2799 +    domain_pause(d);
 37.2800 +    shadow_lock(d);
 37.2801 +
 37.2802 +    if ( shadow_mode_log_dirty(d) )
 37.2803 +    {
 37.2804 +        ret = -EINVAL;
 37.2805 +        goto out;
 37.2806 +    }
 37.2807 +
 37.2808 +    if ( shadow_mode_enabled(d) )
 37.2809 +    {
 37.2810 +        SHADOW_ERROR("Don't (yet) support enabling log-dirty "
 37.2811 +                      "on already shadowed doms\n");
 37.2812 +        ret = -EINVAL;
 37.2813 +        goto out;
 37.2814 +    }
 37.2815 +
 37.2816 +    ret = sh_alloc_log_dirty_bitmap(d);
 37.2817 +    if ( ret != 0 )
 37.2818 +    {
 37.2819 +        sh_free_log_dirty_bitmap(d);
 37.2820 +        goto out;
 37.2821 +    }
 37.2822 +
 37.2823 +    ret = shadow_one_bit_enable(d, SHM2_log_dirty);
 37.2824 +    if ( ret != 0 )
 37.2825 +        sh_free_log_dirty_bitmap(d);
 37.2826 +
 37.2827 + out:
 37.2828 +    shadow_unlock(d);
 37.2829 +    domain_unpause(d);
 37.2830 +    return ret;
 37.2831 +}
 37.2832 +
 37.2833 +static int shadow_log_dirty_disable(struct domain *d)
 37.2834 +{
 37.2835 +    int ret;
 37.2836 +
 37.2837 +    domain_pause(d);
 37.2838 +    shadow_lock(d);
 37.2839 +    ret = shadow_one_bit_disable(d, SHM2_log_dirty);
 37.2840 +    if ( !shadow_mode_log_dirty(d) )
 37.2841 +        sh_free_log_dirty_bitmap(d);
 37.2842 +    shadow_unlock(d);
 37.2843 +    domain_unpause(d);
 37.2844 +
 37.2845 +    return ret;
 37.2846 +}
 37.2847 +
 37.2848 +/**************************************************************************/
 37.2849 +/* P2M map manipulations */
 37.2850 +
 37.2851 +static void
 37.2852 +sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
 37.2853 +{
 37.2854 +    struct vcpu *v;
 37.2855 +
 37.2856 +    if ( !shadow_mode_translate(d) )
 37.2857 +        return;
 37.2858 +
 37.2859 +    v = current;
 37.2860 +    if ( v->domain != d )
 37.2861 +        v = d->vcpu[0];
 37.2862 +
 37.2863 +
 37.2864 +    SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 37.2865 +
 37.2866 +    ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn);
 37.2867 +    //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn);
 37.2868 +
 37.2869 +    shadow_remove_all_shadows_and_parents(v, _mfn(mfn));
 37.2870 +    if ( shadow_remove_all_mappings(v, _mfn(mfn)) )
 37.2871 +        flush_tlb_mask(d->domain_dirty_cpumask);
 37.2872 +    shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
 37.2873 +    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
 37.2874 +}
 37.2875 +
 37.2876 +void
 37.2877 +shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
 37.2878 +                                  unsigned long mfn)
 37.2879 +{
 37.2880 +    shadow_lock(d);
 37.2881 +    shadow_audit_p2m(d);
 37.2882 +    sh_p2m_remove_page(d, gfn, mfn);
 37.2883 +    shadow_audit_p2m(d);
 37.2884 +    shadow_unlock(d);    
 37.2885 +}
 37.2886 +
 37.2887 +void
 37.2888 +shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn,
 37.2889 +                               unsigned long mfn)
 37.2890 +{
 37.2891 +    struct vcpu *v;
 37.2892 +    unsigned long ogfn;
 37.2893 +    mfn_t omfn;
 37.2894 +
 37.2895 +    if ( !shadow_mode_translate(d) )
 37.2896 +        return;
 37.2897 +
 37.2898 +    v = current;
 37.2899 +    if ( v->domain != d )
 37.2900 +        v = d->vcpu[0];
 37.2901 +
 37.2902 +    shadow_lock(d);
 37.2903 +    shadow_audit_p2m(d);
 37.2904 +
 37.2905 +    SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
 37.2906 +
 37.2907 +    omfn = sh_gfn_to_mfn(d, gfn);
 37.2908 +    if ( valid_mfn(omfn) )
 37.2909 +    {
 37.2910 +        /* Get rid of the old mapping, especially any shadows */
 37.2911 +        shadow_remove_all_shadows_and_parents(v, omfn);
 37.2912 +        if ( shadow_remove_all_mappings(v, omfn) )
 37.2913 +            flush_tlb_mask(d->domain_dirty_cpumask);
 37.2914 +        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
 37.2915 +    }        
 37.2916 +
 37.2917 +    ogfn = sh_mfn_to_gfn(d, _mfn(mfn));
 37.2918 +    if (
 37.2919 +#ifdef __x86_64__
 37.2920 +        (ogfn != 0x5555555555555555L)
 37.2921 +#else
 37.2922 +        (ogfn != 0x55555555L)
 37.2923 +#endif
 37.2924 +        && (ogfn != INVALID_M2P_ENTRY)
 37.2925 +        && (ogfn != gfn) )
 37.2926 +    {
 37.2927 +        /* This machine frame is already mapped at another physical address */
 37.2928 +        SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
 37.2929 +                       mfn, ogfn, gfn);
 37.2930 +        if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) ) 
 37.2931 +        {
 37.2932 +            SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", 
 37.2933 +                           ogfn , mfn_x(omfn));
 37.2934 +            if ( mfn_x(omfn) == mfn ) 
 37.2935 +                sh_p2m_remove_page(d, ogfn, mfn);
 37.2936 +        }
 37.2937 +    }
 37.2938 +
 37.2939 +    shadow_set_p2m_entry(d, gfn, _mfn(mfn));
 37.2940 +    set_gpfn_from_mfn(mfn, gfn);
 37.2941 +    shadow_audit_p2m(d);
 37.2942 +    shadow_unlock(d);
 37.2943 +}
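/* Example of the aliasing path above: if mfn 0x1234 is currently mapped at
 * gfn 0x10 (so the m2p says 0x1234 -> 0x10) and the tools now add it at
 * gfn 0x20, the stale entry at gfn 0x10 is removed via sh_p2m_remove_page()
 * before the new gfn 0x20 -> mfn 0x1234 mapping is installed. */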
 37.2944 +
 37.2945 +/**************************************************************************/
 37.2946 +/* Log-dirty mode support */
 37.2947 +
 37.2948 +/* Convert a shadow to log-dirty mode. */
 37.2949 +void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
 37.2950 +{
 37.2951 +    BUG();
 37.2952 +}
 37.2953 +
 37.2954 +
 37.2955 +/* Read a domain's log-dirty bitmap and stats.  
 37.2956 + * If the operation is a CLEAN, clear the bitmap and stats as well. */
 37.2957 +static int shadow_log_dirty_op(
 37.2958 +    struct domain *d, struct xen_domctl_shadow_op *sc)
 37.2959 +{
 37.2960 +    int i, rv = 0, clean = 0;
 37.2961 +
 37.2962 +    domain_pause(d);
 37.2963 +    shadow_lock(d);
 37.2964 +
 37.2965 +    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
 37.2966 +
 37.2967 +    SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", 
 37.2968 +                  (clean) ? "clean" : "peek",
 37.2969 +                  d->domain_id,
 37.2970 +                  d->arch.shadow.fault_count, 
 37.2971 +                  d->arch.shadow.dirty_count);
 37.2972 +
 37.2973 +    sc->stats.fault_count = d->arch.shadow.fault_count;
 37.2974 +    sc->stats.dirty_count = d->arch.shadow.dirty_count;    
 37.2975 +        
 37.2976 +    if ( clean ) 
 37.2977 +    {
 37.2978 +        struct list_head *l, *t;
 37.2979 +        struct page_info *pg;
 37.2980 +
 37.2981 +        /* Need to revoke write access to the domain's pages again. 
 37.2982 +         * In future, we'll have a less heavy-handed approach to this, 
 37.2983 +         * but for now, we just unshadow everything except Xen. */
 37.2984 +        list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows)
 37.2985 +        {
 37.2986 +            pg = list_entry(l, struct page_info, list);
 37.2987 +            shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
 37.2988 +        }
 37.2989 +
 37.2990 +        d->arch.shadow.fault_count = 0;
 37.2991 +        d->arch.shadow.dirty_count = 0;
 37.2992 +    }
 37.2993 +
 37.2994 +    if ( guest_handle_is_null(sc->dirty_bitmap) ||
 37.2995 +         (d->arch.shadow.dirty_bitmap == NULL) )
 37.2996 +    {
 37.2997 +        rv = -EINVAL;
 37.2998 +        goto out;
 37.2999 +    }
 37.3000 + 
 37.3001 +    if ( sc->pages > d->arch.shadow.dirty_bitmap_size )
 37.3002 +        sc->pages = d->arch.shadow.dirty_bitmap_size; 
 37.3003 +
 37.3004 +#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
 37.3005 +    for ( i = 0; i < sc->pages; i += CHUNK )
 37.3006 +    {
 37.3007 +        int bytes = ((((sc->pages - i) > CHUNK) 
 37.3008 +                      ? CHUNK 
 37.3009 +                      : (sc->pages - i)) + 7) / 8;
 37.3010 +     
 37.3011 +        if ( copy_to_guest_offset(
 37.3012 +                 sc->dirty_bitmap, 
 37.3013 +                 i/(8*sizeof(unsigned long)),
 37.3014 +                 d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
 37.3015 +                 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
 37.3016 +        {
 37.3017 +            rv = -EINVAL;
 37.3018 +            goto out;
 37.3019 +        }
 37.3020 +
 37.3021 +        if ( clean )
 37.3022 +            memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))),
 37.3023 +                   0, bytes);
 37.3024 +    }
 37.3025 +#undef CHUNK
 37.3026 +
 37.3027 + out:
 37.3028 +    shadow_unlock(d);
 37.3029 +    domain_unpause(d);
 37.3030 +    return rv;
 37.3031 +}
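/* CHUNK arithmetic, for reference: each iteration covers 8*1024 == 8192
 * page-bits, i.e. 1024 bytes of bitmap.  With sc->pages == 10000 the first
 * pass copies 1024 bytes and the second copies (10000 - 8192 + 7)/8 == 226
 * bytes; the offset passed to copy_to_guest_offset() is in units of
 * unsigned long, hence the division by 8*sizeof(unsigned long). */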
 37.3032 +
 37.3033 +
 37.3034 +/* Mark a page as dirty */
 37.3035 +void sh_do_mark_dirty(struct domain *d, mfn_t gmfn)
 37.3036 +{
 37.3037 +    unsigned long pfn;
 37.3038 +
 37.3039 +    ASSERT(shadow_lock_is_acquired(d));
 37.3040 +    ASSERT(shadow_mode_log_dirty(d));
 37.3041 +
 37.3042 +    if ( !valid_mfn(gmfn) )
 37.3043 +        return;
 37.3044 +
 37.3045 +    ASSERT(d->arch.shadow.dirty_bitmap != NULL);
 37.3046 +
 37.3047 +    /* We /really/ mean PFN here, even for non-translated guests. */
 37.3048 +    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
 37.3049 +
 37.3050 +    /*
 37.3051 +     * Values with the MSB set denote MFNs that aren't really part of the 
 37.3052 +     * domain's pseudo-physical memory map (e.g., the shared info frame).
 37.3053 +     * Nothing to do here...
 37.3054 +     */
 37.3055 +    if ( unlikely(!VALID_M2P(pfn)) )
 37.3056 +        return;
 37.3057 +
 37.3058 +    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
 37.3059 +    if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) ) 
 37.3060 +    { 
 37.3061 +        if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) )
 37.3062 +        {
 37.3063 +            SHADOW_DEBUG(LOGDIRTY, 
 37.3064 +                          "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n",
 37.3065 +                          mfn_x(gmfn), pfn, d->domain_id);
 37.3066 +            d->arch.shadow.dirty_count++;
 37.3067 +        }
 37.3068 +    }
 37.3069 +    else
 37.3070 +    {
 37.3071 +        SHADOW_PRINTK("mark_dirty OOR! "
 37.3072 +                       "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
 37.3073 +                       "owner=%d c=%08x t=%" PRtype_info "\n",
 37.3074 +                       mfn_x(gmfn), 
 37.3075 +                       pfn, 
 37.3076 +                       d->arch.shadow.dirty_bitmap_size,
 37.3077 +                       d->domain_id,
 37.3078 +                       (page_get_owner(mfn_to_page(gmfn))
 37.3079 +                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
 37.3080 +                        : -1),
 37.3081 +                       mfn_to_page(gmfn)->count_info, 
 37.3082 +                       mfn_to_page(gmfn)->u.inuse.type_info);
 37.3083 +    }
 37.3084 +}
 37.3085 +
 37.3086 +
 37.3087 +/**************************************************************************/
 37.3088 +/* Shadow-control XEN_DOMCTL dispatcher */
 37.3089 +
 37.3090 +int shadow_domctl(struct domain *d, 
 37.3091 +                   xen_domctl_shadow_op_t *sc,
 37.3092 +                   XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
 37.3093 +{
 37.3094 +    int rc, preempted = 0;
 37.3095 +
 37.3096 +    if ( unlikely(d == current->domain) )
 37.3097 +    {
 37.3098 +        DPRINTK("Don't try to do a shadow op on yourself!\n");
 37.3099 +        return -EINVAL;
 37.3100 +    }
 37.3101 +
 37.3102 +    switch ( sc->op )
 37.3103 +    {
 37.3104 +    case XEN_DOMCTL_SHADOW_OP_OFF:
 37.3105 +        if ( shadow_mode_log_dirty(d) )
 37.3106 +            if ( (rc = shadow_log_dirty_disable(d)) != 0 ) 
 37.3107 +                return rc;
 37.3108 +        if ( d->arch.shadow.mode & SHM2_enable )
 37.3109 +            if ( (rc = shadow_test_disable(d)) != 0 ) 
 37.3110 +                return rc;
 37.3111 +        return 0;
 37.3112 +
 37.3113 +    case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
 37.3114 +        return shadow_test_enable(d);
 37.3115 +
 37.3116 +    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
 37.3117 +        return shadow_log_dirty_enable(d);
 37.3118 +
 37.3119 +    case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
 37.3120 +        return shadow_enable(d, SHM2_refcounts|SHM2_translate);
 37.3121 +
 37.3122 +    case XEN_DOMCTL_SHADOW_OP_CLEAN:
 37.3123 +    case XEN_DOMCTL_SHADOW_OP_PEEK:
 37.3124 +        return shadow_log_dirty_op(d, sc);
 37.3125 +
 37.3126 +    case XEN_DOMCTL_SHADOW_OP_ENABLE:
 37.3127 +        if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY )
 37.3128 +            return shadow_log_dirty_enable(d);
 37.3129 +        return shadow_enable(d, sc->mode << SHM2_shift);
 37.3130 +
 37.3131 +    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
 37.3132 +        sc->mb = shadow_get_allocation(d);
 37.3133 +        return 0;
 37.3134 +
 37.3135 +    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
 37.3136 +        rc = shadow_set_allocation(d, sc->mb, &preempted);
 37.3137 +        if ( preempted )
 37.3138 +            /* Not finished.  Set up to re-run the call. */
 37.3139 +            rc = hypercall_create_continuation(
 37.3140 +                __HYPERVISOR_domctl, "h", u_domctl);
 37.3141 +        else 
 37.3142 +            /* Finished.  Return the new allocation */
 37.3143 +            sc->mb = shadow_get_allocation(d);
 37.3144 +        return rc;
 37.3145 +
 37.3146 +    default:
 37.3147 +        SHADOW_ERROR("Bad shadow op %u\n", sc->op);
 37.3148 +        return -EINVAL;
 37.3149 +    }
 37.3150 +}
 37.3151 +
 37.3152 +
 37.3153 +/**************************************************************************/
 37.3154 +/* Auditing shadow tables */
 37.3155 +
 37.3156 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
 37.3157 +
 37.3158 +void shadow_audit_tables(struct vcpu *v) 
 37.3159 +{
 37.3160 +    /* Dispatch table for getting per-type functions */
 37.3161 +    static hash_callback_t callbacks[16] = {
 37.3162 +        NULL, /* none    */
 37.3163 +#if CONFIG_PAGING_LEVELS == 2
 37.3164 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2),  /* l1_32   */
 37.3165 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32  */
 37.3166 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2),  /* l2_32   */
 37.3167 +#else 
 37.3168 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2),  /* l1_32   */
 37.3169 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32  */
 37.3170 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2),  /* l2_32   */
 37.3171 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3),  /* l1_pae  */
 37.3172 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */
 37.3173 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2_pae  */
 37.3174 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3),  /* l2h_pae */
 37.3175 +        SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3),  /* l3_pae  */
 37.3176 +#if CONFIG_PAGING_LEVELS >= 4
 37.3177 +        SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4),  /* l1_64   */
 37.3178 +        SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64  */
 37.3179 +        SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4),  /* l2_64   */
 37.3180 +        SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4),  /* l3_64   */
 37.3181 +        SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4),  /* l4_64   */
 37.3182 +#endif /* CONFIG_PAGING_LEVELS >= 4 */
 37.3183 +#endif /* CONFIG_PAGING_LEVELS > 2 */
 37.3184 +        NULL  /* All the rest */
 37.3185 +    };
 37.3186 +    unsigned int mask; 
 37.3187 +
 37.3188 +    if ( !(SHADOW_AUDIT_ENABLE) )
 37.3189 +        return;
 37.3190 +    
 37.3191 +    if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
 37.3192 +        mask = ~1; /* Audit every table in the system */
 37.3193 +    else 
 37.3194 +    {
 37.3195 +        /* Audit only the current mode's tables */
 37.3196 +        switch ( v->arch.shadow.mode->guest_levels )
 37.3197 +        {
 37.3198 +        case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break;
 37.3199 +        case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE
 37.3200 +                        |SHF_L2H_PAE|SHF_L3_PAE); break;
 37.3201 +        case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64  
 37.3202 +                        |SHF_L3_64|SHF_L4_64); break;
 37.3203 +        default: BUG();
 37.3204 +        }
 37.3205 +    }
 37.3206 +
 37.3207 +    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
 37.3208 +}
 37.3209 +
 37.3210 +#endif /* Shadow audit */
 37.3211 +
 37.3212 +
 37.3213 +/**************************************************************************/
 37.3214 +/* Auditing p2m tables */
 37.3215 +
 37.3216 +#if SHADOW_AUDIT & SHADOW_AUDIT_P2M
 37.3217 +
 37.3218 +void shadow_audit_p2m(struct domain *d)
 37.3219 +{
 37.3220 +    struct list_head *entry;
 37.3221 +    struct page_info *page;
 37.3222 +    struct domain *od;
 37.3223 +    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
 37.3224 +    mfn_t p2mfn;
 37.3225 +    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
 37.3226 +    int test_linear;
 37.3227 +    
 37.3228 +    if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) )
 37.3229 +        return;
 37.3230 +
 37.3231 +    //SHADOW_PRINTK("p2m audit starts\n");
 37.3232 +
 37.3233 +    test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
 37.3234 +    if ( test_linear )
 37.3235 +        local_flush_tlb(); 
 37.3236 +
 37.3237 +    /* Audit part one: walk the domain's page allocation list, checking 
 37.3238 +     * the m2p entries. */
 37.3239 +    for ( entry = d->page_list.next;
 37.3240 +          entry != &d->page_list;
 37.3241 +          entry = entry->next )
 37.3242 +    {
 37.3243 +        page = list_entry(entry, struct page_info, list);
 37.3244 +        mfn = mfn_x(page_to_mfn(page));
 37.3245 +
 37.3246 +        // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn); 
 37.3247 +
 37.3248 +        od = page_get_owner(page);
 37.3249 +
 37.3250 +        if ( od != d ) 
 37.3251 +        {
 37.3252 +            SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
 37.3253 +                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
 37.3254 +            continue;
 37.3255 +        }
 37.3256 +
 37.3257 +        gfn = get_gpfn_from_mfn(mfn);
 37.3258 +        if ( gfn == INVALID_M2P_ENTRY ) 
 37.3259 +        {
 37.3260 +            orphans_i++;
 37.3261 +            //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
 37.3262 +            //               mfn); 
 37.3263 +            continue;
 37.3264 +        }
 37.3265 +
 37.3266 +        if ( gfn == 0x55555555 ) 
 37.3267 +        {
 37.3268 +            orphans_d++;
 37.3269 +            //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", 
 37.3270 +            //               mfn); 
 37.3271 +            continue;
 37.3272 +        }
 37.3273 +
 37.3274 +        p2mfn = sh_gfn_to_mfn_foreign(d, gfn);
 37.3275 +        if ( mfn_x(p2mfn) != mfn )
 37.3276 +        {
 37.3277 +            mpbad++;
 37.3278 +            SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
 37.3279 +                           " (-> gfn %#lx)\n",
 37.3280 +                           mfn, gfn, mfn_x(p2mfn),
 37.3281 +                           (mfn_valid(p2mfn)
 37.3282 +                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
 37.3283 +                            : -1u));
 37.3284 +            /* This m2p entry is stale: the domain has another frame in
 37.3285 +             * this physical slot.  No great disaster, but for neatness,
 37.3286 +             * blow away the m2p entry. */ 
 37.3287 +            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
 37.3288 +        }
 37.3289 +
 37.3290 +        if ( test_linear )
 37.3291 +        {
 37.3292 +            lp2mfn = get_mfn_from_gpfn(gfn);
 37.3293 +            if ( lp2mfn != mfn_x(p2mfn) )
 37.3294 +            {
 37.3295 +                SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
 37.3296 +                               "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
 37.3297 +            }
 37.3298 +        }
 37.3299 +
 37.3300 +        // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", 
 37.3301 +        //                mfn, gfn, p2mfn, lp2mfn); 
 37.3302 +    }   
 37.3303 +
 37.3304 +    /* Audit part two: walk the domain's p2m table, checking the entries. */
 37.3305 +    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
 37.3306 +    {
 37.3307 +        l2_pgentry_t *l2e;
 37.3308 +        l1_pgentry_t *l1e;
 37.3309 +        int i1, i2;
 37.3310 +        
 37.3311 +#if CONFIG_PAGING_LEVELS == 4
 37.3312 +        l4_pgentry_t *l4e;
 37.3313 +        l3_pgentry_t *l3e;
 37.3314 +        int i3, i4;
 37.3315 +        l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 37.3316 +#elif CONFIG_PAGING_LEVELS == 3
 37.3317 +        l3_pgentry_t *l3e;
 37.3318 +        int i3;
 37.3319 +        l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 37.3320 +#else /* CONFIG_PAGING_LEVELS == 2 */
 37.3321 +        l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 37.3322 +#endif
 37.3323 +
 37.3324 +        gfn = 0;
 37.3325 +#if CONFIG_PAGING_LEVELS >= 3
 37.3326 +#if CONFIG_PAGING_LEVELS >= 4
 37.3327 +        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
 37.3328 +        {
 37.3329 +            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
 37.3330 +            {
 37.3331 +                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
 37.3332 +                continue;
 37.3333 +            }
 37.3334 +            l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
 37.3335 +#endif /* now at levels 3 or 4... */
 37.3336 +            for ( i3 = 0; 
 37.3337 +                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); 
 37.3338 +                  i3++ )
 37.3339 +            {
 37.3340 +                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
 37.3341 +                {
 37.3342 +                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
 37.3343 +                    continue;
 37.3344 +                }
 37.3345 +                l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
 37.3346 +#endif /* all levels... */
 37.3347 +                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
 37.3348 +                {
 37.3349 +                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
 37.3350 +                    {
 37.3351 +                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
 37.3352 +                        continue;
 37.3353 +                    }
 37.3354 +                    l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
 37.3355 +                    
 37.3356 +                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
 37.3357 +                    {
 37.3358 +                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
 37.3359 +                            continue;
 37.3360 +                        mfn = l1e_get_pfn(l1e[i1]);
 37.3361 +                        ASSERT(valid_mfn(_mfn(mfn)));
 37.3362 +                        m2pfn = get_gpfn_from_mfn(mfn);
 37.3363 +                        if ( m2pfn != gfn )
 37.3364 +                        {
 37.3365 +                            pmbad++;
 37.3366 +                            SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
 37.3367 +                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
 37.3368 +                            BUG();
 37.3369 +                        }
 37.3370 +                    }
 37.3371 +                    sh_unmap_domain_page(l1e);
 37.3372 +                }
 37.3373 +#if CONFIG_PAGING_LEVELS >= 3
 37.3374 +                sh_unmap_domain_page(l2e);
 37.3375 +            }
 37.3376 +#if CONFIG_PAGING_LEVELS >= 4
 37.3377 +            sh_unmap_domain_page(l3e);
 37.3378 +        }
 37.3379 +#endif
 37.3380 +#endif
 37.3381 +
 37.3382 +#if CONFIG_PAGING_LEVELS == 4
 37.3383 +        sh_unmap_domain_page(l4e);
 37.3384 +#elif CONFIG_PAGING_LEVELS == 3
 37.3385 +        sh_unmap_domain_page(l3e);
 37.3386 +#else /* CONFIG_PAGING_LEVELS == 2 */
 37.3387 +        sh_unmap_domain_page(l2e);
 37.3388 +#endif
 37.3389 +
 37.3390 +    }
 37.3391 +
 37.3392 +    //SHADOW_PRINTK("p2m audit complete\n");
 37.3393 +    //if ( orphans_i | orphans_d | mpbad | pmbad ) 
 37.3394 +    //    SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
 37.3395 +    //                   orphans_i + orphans_d, orphans_i, orphans_d,
 37.3396 +    if ( mpbad | pmbad ) 
 37.3397 +        SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
 37.3398 +                       pmbad, mpbad);
 37.3399 +}
 37.3400 +
 37.3401 +#endif /* p2m audit */
 37.3402 +
 37.3403 +/*
 37.3404 + * Local variables:
 37.3405 + * mode: C
 37.3406 + * c-set-style: "BSD"
 37.3407 + * c-basic-offset: 4
 37.3408 + * indent-tabs-mode: nil
 37.3409 + * End: 
 37.3410 + */
    38.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    38.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Mon Aug 28 16:26:37 2006 -0600
    38.3 @@ -0,0 +1,4492 @@
    38.4 +/******************************************************************************
    38.5 + * arch/x86/mm/shadow/multi.c
    38.6 + *
    38.7 + * Simple, mostly-synchronous shadow page tables. 
    38.8 + * Parts of this code are Copyright (c) 2006 by XenSource Inc.
    38.9 + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
   38.10 + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
   38.11 + *
   38.12 + * This program is free software; you can redistribute it and/or modify
   38.13 + * it under the terms of the GNU General Public License as published by
   38.14 + * the Free Software Foundation; either version 2 of the License, or
   38.15 + * (at your option) any later version.
   38.16 + *
   38.17 + * This program is distributed in the hope that it will be useful,
   38.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   38.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   38.20 + * GNU General Public License for more details.
   38.21 + *
   38.22 + * You should have received a copy of the GNU General Public License
   38.23 + * along with this program; if not, write to the Free Software
   38.24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   38.25 + */
   38.26 +
   38.27 +// DESIGN QUESTIONS:
   38.28 +// Why use subshadows for PAE guests?
   38.29 +// - reduces pressure in the hash table
   38.30 +// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
   38.31 +// - would need to find space in the page_info to store 7 more bits of
   38.32 +//   backpointer
   38.33 +// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
   38.34 +//   figure out when to demote the guest page from l3 status
   38.35 +//
   38.36 +// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
   38.37 +// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
   38.38 +//   space for both PV and HVM guests.
   38.39 +//
   38.40 +
   38.41 +#define SHADOW 1
   38.42 +
   38.43 +#include <xen/config.h>
   38.44 +#include <xen/types.h>
   38.45 +#include <xen/mm.h>
   38.46 +#include <xen/trace.h>
   38.47 +#include <xen/sched.h>
   38.48 +#include <xen/perfc.h>
   38.49 +#include <xen/domain_page.h>
   38.50 +#include <asm/page.h>
   38.51 +#include <asm/current.h>
   38.52 +#include <asm/shadow.h>
   38.53 +#include <asm/flushtlb.h>
   38.54 +#include <asm/hvm/hvm.h>
   38.55 +#include "private.h"
   38.56 +#include "types.h"
   38.57 +
   38.58 +/* The first cut: an absolutely synchronous, trap-and-emulate version,
   38.59 + * supporting only HVM guests (and so only "external" shadow mode). 
   38.60 + *
   38.61 + * THINGS TO DO LATER:
   38.62 + * 
   38.63 + * FIX GVA_TO_GPA
   38.64 + * The current interface returns an unsigned long, which is not big enough
   38.65 + * to hold a physical address in PAE.  Should return a gfn instead.
   38.66 + * 
   38.67 + * TEARDOWN HEURISTICS
   38.68 + * Also: have a heuristic for when to destroy a previous paging-mode's 
   38.69 + * shadows.  When a guest is done with its start-of-day 32-bit tables
   38.70 + * and reuses the memory we want to drop those shadows.  Start with 
   38.71 + * shadows of a page in two modes as a hint, but beware of clever tricks
   38.72 + * like reusing a pagetable for both PAE and 64-bit during boot...
   38.73 + *
   38.74 + * PAE LINEAR MAPS
   38.75 + * Rework shadow_get_l*e() to have the option of using map_domain_page()
   38.76 + * instead of linear maps.  Add appropriate unmap_l*e calls in the users. 
   38.77 + * Then we can test the speed difference made by linear maps.  If the 
   38.78 + * map_domain_page() version is OK on PAE, we could maybe allow a lightweight 
   38.79 + * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
   38.80 + * to share l2h pages again. 
   38.81 + *
   38.82 + * PAE L3 COPYING
   38.83 + * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
   38.84 + * entry in it, and every time we change CR3.  We copy it for the linear 
   38.85 + * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
   38.86 + * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
   38.87 + * by using the shadow directly in some places. 
   38.88 + * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending.
   38.89 + *
   38.90 + * GUEST_WALK_TABLES TLB FLUSH COALESCE
   38.91 + * guest_walk_tables can do up to three remote TLB flushes as it walks to
   38.92 + * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
   38.93 + * and if we do flush, re-do the walk.  If anything has changed, then 
   38.94 + * pause all the other vcpus and do the walk *again*.
   38.95 + *
   38.96 + * WP DISABLED
   38.97 + * Consider how to implement having the WP bit of CR0 set to 0.  
   38.98 + * Since we need to be able to cause write faults to pagetables, this might
   38.99 + * end up looking like not having the (guest) pagetables present at all in 
  38.100 + * HVM guests...
  38.101 + *
  38.102 + * PSE disabled / PSE36
  38.103 + * We don't support any modes other than PSE enabled, PSE36 disabled.
  38.104 + * Neither of those would be hard to change, but we'd need to be able to 
  38.105 + * deal with shadows made in one mode and used in another.
  38.106 + */
  38.107 +
  38.108 +#define FETCH_TYPE_PREFETCH 1
  38.109 +#define FETCH_TYPE_DEMAND   2
  38.110 +#define FETCH_TYPE_WRITE    4
  38.111 +typedef enum {
  38.112 +    ft_prefetch     = FETCH_TYPE_PREFETCH,
  38.113 +    ft_demand_read  = FETCH_TYPE_DEMAND,
  38.114 +    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
  38.115 +} fetch_type_t;
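/* Note that ft_demand_write carries the FETCH_TYPE_DEMAND bit as well as
 * FETCH_TYPE_WRITE, so a test such as (ft & FETCH_TYPE_DEMAND) matches both
 * demand reads and demand writes, while (ft & FETCH_TYPE_WRITE) picks out
 * writes only. */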
  38.116 +
  38.117 +#ifdef DEBUG_TRACE_DUMP
  38.118 +static char *fetch_type_names[] = {
  38.119 +    [ft_prefetch]     = "prefetch",
  38.120 +    [ft_demand_read]  = "demand read",
  38.121 +    [ft_demand_write] = "demand write",
  38.122 +};
  38.123 +#endif
  38.124 +
  38.125 +/* XXX forward declarations */
  38.126 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
  38.127 +static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
  38.128 +#endif
  38.129 +static inline void sh_update_linear_entries(struct vcpu *v);
  38.130 +
  38.131 +/**************************************************************************/
  38.132 +/* Hash table mapping from guest pagetables to shadows
  38.133 + *
  38.134 + * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
  38.135 + * FL1's:       maps the *gfn* of the start of a superpage to the mfn of a
  38.136 + *              shadow L1 which maps its "splinters".
  38.137 + * PAE CR3s:    maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
  38.138 + *              PAE L3 info page for that CR3 value.
  38.139 + */
  38.140 +
  38.141 +static inline mfn_t 
  38.142 +get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
  38.143 +/* Look for FL1 shadows in the hash table */
  38.144 +{
  38.145 +    mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn),
  38.146 +                                     PGC_SH_fl1_shadow >> PGC_SH_type_shift);
  38.147 +
  38.148 +    if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
  38.149 +    {
  38.150 +        struct page_info *page = mfn_to_page(smfn);
  38.151 +        if ( !(page->count_info & PGC_SH_log_dirty) )
  38.152 +            shadow_convert_to_log_dirty(v, smfn);
  38.153 +    }
  38.154 +
  38.155 +    return smfn;
  38.156 +}
  38.157 +
  38.158 +static inline mfn_t 
  38.159 +get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
  38.160 +/* Look for shadows in the hash table */
  38.161 +{
  38.162 +    mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn),
  38.163 +                                     shadow_type >> PGC_SH_type_shift);
  38.164 +    perfc_incrc(shadow_get_shadow_status);
  38.165 +
  38.166 +    if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
  38.167 +    {
  38.168 +        struct page_info *page = mfn_to_page(smfn);
  38.169 +        if ( !(page->count_info & PGC_SH_log_dirty) )
  38.170 +            shadow_convert_to_log_dirty(v, smfn);
  38.171 +    }
  38.172 +
  38.173 +    return smfn;
  38.174 +}
  38.175 +
  38.176 +static inline void 
  38.177 +set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
  38.178 +/* Put an FL1 shadow into the hash table */
  38.179 +{
  38.180 +    SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
  38.181 +                   gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
  38.182 +
  38.183 +    if ( unlikely(shadow_mode_log_dirty(v->domain)) )
  38.184 +        // mark this shadow as a log dirty shadow...
  38.185 +        set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  38.186 +    else
  38.187 +        clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  38.188 +
  38.189 +    shadow_hash_insert(v, gfn_x(gfn),
  38.190 +                        PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
  38.191 +}
  38.192 +
  38.193 +static inline void 
  38.194 +set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
  38.195 +/* Put a shadow into the hash table */
  38.196 +{
  38.197 +    struct domain *d = v->domain;
  38.198 +    int res;
  38.199 +
  38.200 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
  38.201 +                   d->domain_id, v->vcpu_id, mfn_x(gmfn),
  38.202 +                   shadow_type, mfn_x(smfn));
  38.203 +
  38.204 +    if ( unlikely(shadow_mode_log_dirty(d)) )
  38.205 +        // mark this shadow as a log dirty shadow...
  38.206 +        set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  38.207 +    else
  38.208 +        clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info);
  38.209 +
  38.210 +    res = get_page(mfn_to_page(gmfn), d);
  38.211 +    ASSERT(res == 1);
  38.212 +
  38.213 +    shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift,
  38.214 +                        smfn);
  38.215 +}
  38.216 +
  38.217 +static inline void 
  38.218 +delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
  38.219 +/* Remove a shadow from the hash table */
  38.220 +{
  38.221 +    SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n",
  38.222 +                   gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn));
  38.223 +
  38.224 +    shadow_hash_delete(v, gfn_x(gfn),
  38.225 +                        PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn);
  38.226 +}
  38.227 +
  38.228 +static inline void 
  38.229 +delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
  38.230 +/* Remove a shadow from the hash table */
  38.231 +{
  38.232 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
  38.233 +                   v->domain->domain_id, v->vcpu_id,
  38.234 +                   mfn_x(gmfn), shadow_type, mfn_x(smfn));
  38.235 +    shadow_hash_delete(v, mfn_x(gmfn),
  38.236 +                        shadow_type >> PGC_SH_type_shift, smfn);
  38.237 +    put_page(mfn_to_page(gmfn));
  38.238 +}
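/* Note that the put_page() above balances the get_page() taken in
 * set_shadow_status(), so a guest page holds one extra general reference for
 * each shadow currently present in the hash table for it. */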
  38.239 +
  38.240 +/**************************************************************************/
  38.241 +/* CPU feature support querying */
  38.242 +
  38.243 +static inline int
  38.244 +guest_supports_superpages(struct vcpu *v)
  38.245 +{
  38.246 +    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
  38.247 +     * CR4.PSE is set or the guest is in PAE or long mode */
  38.248 +    return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2 
  38.249 +                             || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE)));
  38.250 +}
  38.251 +
  38.252 +static inline int
  38.253 +guest_supports_nx(struct vcpu *v)
  38.254 +{
  38.255 +    if ( !hvm_guest(v) )
  38.256 +        return cpu_has_nx;
  38.257 +
  38.258 +    // XXX - fix this!
  38.259 +    return 1;
  38.260 +}
  38.261 +
  38.262 +
  38.263 +/**************************************************************************/
  38.264 +/* Functions for walking the guest page tables */
  38.265 +
  38.266 +
  38.267 +/* Walk the guest pagetables, filling the walk_t with what we see. 
  38.268 + * Takes an uninitialised walk_t.  The caller must call unmap_walk() 
  38.269 + * on the walk_t before discarding it or calling guest_walk_tables again. 
  38.270 + * If "guest_op" is non-zero, we are serving a genuine guest memory access, 
  38.271 + * and must (a) be under the shadow lock, and (b) remove write access
   38.272 + * from any guest PT pages we see, as we will be using their contents to 
  38.273 + * perform shadow updates.
  38.274 + * Returns 0 for success or non-zero if the guest pagetables are malformed.
  38.275 + * N.B. Finding a not-present entry does not cause a non-zero return code. */
  38.276 +static inline int 
  38.277 +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
  38.278 +{
  38.279 +    ASSERT(!guest_op || shadow_lock_is_acquired(v->domain));
  38.280 +
  38.281 +    perfc_incrc(shadow_guest_walk);
  38.282 +    memset(gw, 0, sizeof(*gw));
  38.283 +    gw->va = va;
  38.284 +
  38.285 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  38.286 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  38.287 +    /* Get l4e from the top level table */
  38.288 +    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
  38.289 +    gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
  38.290 +    /* Walk down to the l3e */
  38.291 +    if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
  38.292 +    gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
  38.293 +    if ( !valid_mfn(gw->l3mfn) ) return 1;
  38.294 +    /* This mfn is a pagetable: make sure the guest can't write to it. */
  38.295 +    if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
  38.296 +        flush_tlb_mask(v->domain->domain_dirty_cpumask); 
  38.297 +    gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn))
  38.298 +        + guest_l3_table_offset(va);
  38.299 +#else /* PAE only... */
  38.300 +    /* Get l3e from the top level table */
  38.301 +    gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
  38.302 +    gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
  38.303 +#endif /* PAE or 64... */
  38.304 +    /* Walk down to the l2e */
  38.305 +    if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
  38.306 +    gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
  38.307 +    if ( !valid_mfn(gw->l2mfn) ) return 1;
  38.308 +    /* This mfn is a pagetable: make sure the guest can't write to it. */
  38.309 +    if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
  38.310 +        flush_tlb_mask(v->domain->domain_dirty_cpumask); 
  38.311 +    gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn))
  38.312 +        + guest_l2_table_offset(va);
  38.313 +#else /* 32-bit only... */
  38.314 +    /* Get l2e from the top level table */
  38.315 +    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
  38.316 +    gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
  38.317 +#endif /* All levels... */
  38.318 +    
  38.319 +    if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
  38.320 +    if ( guest_supports_superpages(v) &&
  38.321 +         (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) 
  38.322 +    {
  38.323 +        /* Special case: this guest VA is in a PSE superpage, so there's
  38.324 +         * no guest l1e.  We make one up so that the propagation code
  38.325 +         * can generate a shadow l1 table.  Start with the gfn of the 
  38.326 +         * first 4k-page of the superpage. */
  38.327 +        gfn_t start = guest_l2e_get_gfn(*gw->l2e);
  38.328 +        /* Grant full access in the l1e, since all the guest entry's 
  38.329 +         * access controls are enforced in the shadow l2e.  This lets 
  38.330 +         * us reflect l2 changes later without touching the l1s. */
  38.331 +        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
  38.332 +                     _PAGE_ACCESSED|_PAGE_DIRTY);
  38.333 +        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
  38.334 +         * of the level 1 */
  38.335 +        if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) 
  38.336 +            flags |= _PAGE_PAT; 
  38.337 +        /* Increment the pfn by the right number of 4k pages.  
  38.338 +         * The ~0x1 is to mask out the PAT bit mentioned above. */
  38.339 +        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
  38.340 +        gw->eff_l1e = guest_l1e_from_gfn(start, flags);
  38.341 +        gw->l1e = NULL;
  38.342 +        gw->l1mfn = _mfn(INVALID_MFN);
  38.343 +    } 
  38.344 +    else 
  38.345 +    {
  38.346 +        /* Not a superpage: carry on and find the l1e. */
  38.347 +        gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
  38.348 +        if ( !valid_mfn(gw->l1mfn) ) return 1;
  38.349 +        /* This mfn is a pagetable: make sure the guest can't write to it. */
  38.350 +        if ( guest_op 
  38.351 +             && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
  38.352 +            flush_tlb_mask(v->domain->domain_dirty_cpumask); 
  38.353 +        gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn))
  38.354 +            + guest_l1_table_offset(va);
  38.355 +        gw->eff_l1e = *gw->l1e;
  38.356 +    }
  38.357 +
  38.358 +    return 0;
  38.359 +}
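/* Illustrative usage sketch (not part of the original patch): a caller
 * typically does something like
 *
 *     walk_t gw;
 *     if ( guest_walk_tables(v, va, &gw, 1) == 0 )
 *     {
 *         gfn_t gfn = guest_walk_to_gfn(&gw);
 *         ... act on gfn / gw.eff_l1e ...
 *     }
 *     unmap_walk(v, &gw);
 *
 * with the shadow lock held whenever guest_op is non-zero.  unmap_walk()
 * must be called in all cases, since even a partial walk may still hold
 * domain-page mappings in the walk_t. */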
  38.360 +
  38.361 +/* Given a walk_t, translate the gw->va into the guest's notion of the
  38.362 + * corresponding frame number. */
  38.363 +static inline gfn_t
  38.364 +guest_walk_to_gfn(walk_t *gw)
  38.365 +{
  38.366 +    if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
  38.367 +        return _gfn(INVALID_GFN);
  38.368 +    return guest_l1e_get_gfn(gw->eff_l1e);
  38.369 +}
  38.370 +
  38.371 +/* Given a walk_t, translate the gw->va into the guest's notion of the
  38.372 + * corresponding physical address. */
  38.373 +static inline paddr_t
  38.374 +guest_walk_to_gpa(walk_t *gw)
  38.375 +{
  38.376 +    if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) )
  38.377 +        return 0;
  38.378 +    return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
  38.379 +}
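/* Worked example (illustrative): if the effective l1e maps gfn 0x1234 and
 * gw->va is 0xb8001abc, guest_walk_to_gpa() returns
 * (0x1234 << PAGE_SHIFT) + 0xabc = 0x1234abc, i.e. the frame's base
 * address plus the page offset of the faulting virtual address. */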
  38.380 +
  38.381 +
  38.382 +/* Unmap (and reinitialise) a guest walk.  
  38.383 + * Call this to dispose of any walk filled in by guest_walk_tables() */
  38.384 +static void unmap_walk(struct vcpu *v, walk_t *gw)
  38.385 +{
  38.386 +#if GUEST_PAGING_LEVELS >= 3
  38.387 +#if GUEST_PAGING_LEVELS >= 4
  38.388 +    if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e);
  38.389 +#endif
  38.390 +    if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e);
  38.391 +#endif
  38.392 +    if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e);
  38.393 +#ifdef DEBUG
  38.394 +    memset(gw, 0, sizeof(*gw));
  38.395 +#endif
  38.396 +}
  38.397 +
  38.398 +
  38.399 +/* Pretty-print the contents of a guest-walk */
  38.400 +static inline void print_gw(walk_t *gw)
  38.401 +{
  38.402 +    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
  38.403 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  38.404 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  38.405 +    SHADOW_PRINTK("   l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn));
  38.406 +    SHADOW_PRINTK("   l4e=%p\n", gw->l4e);
  38.407 +    if ( gw->l4e )
  38.408 +        SHADOW_PRINTK("   *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4);
  38.409 +#endif /* PAE or 64... */
  38.410 +    SHADOW_PRINTK("   l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn));
  38.411 +    SHADOW_PRINTK("   l3e=%p\n", gw->l3e);
  38.412 +    if ( gw->l3e )
  38.413 +        SHADOW_PRINTK("   *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3);
  38.414 +#endif /* All levels... */
  38.415 +    SHADOW_PRINTK("   l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn));
  38.416 +    SHADOW_PRINTK("   l2e=%p\n", gw->l2e);
  38.417 +    if ( gw->l2e )
  38.418 +        SHADOW_PRINTK("   *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2);
  38.419 +    SHADOW_PRINTK("   l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn));
  38.420 +    SHADOW_PRINTK("   l1e=%p\n", gw->l1e);
  38.421 +    if ( gw->l1e )
  38.422 +        SHADOW_PRINTK("   *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1);
  38.423 +    SHADOW_PRINTK("   eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1);
  38.424 +}
  38.425 +
  38.426 +
  38.427 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
  38.428 +/* Lightweight audit: pass all the shadows associated with this guest walk
  38.429 + * through the audit mechanisms */
  38.430 +static void sh_audit_gw(struct vcpu *v, walk_t *gw) 
  38.431 +{
  38.432 +    mfn_t smfn;
  38.433 +
  38.434 +    if ( !(SHADOW_AUDIT_ENABLE) )
  38.435 +        return;
  38.436 +
  38.437 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  38.438 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  38.439 +    if ( valid_mfn(gw->l4mfn)
  38.440 +         && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, 
  38.441 +                                                PGC_SH_l4_shadow))) )
  38.442 +        (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
  38.443 +#endif /* PAE or 64... */
  38.444 +    if ( valid_mfn(gw->l3mfn)
  38.445 +         && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, 
  38.446 +                                                PGC_SH_l3_shadow))) )
  38.447 +        (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
  38.448 +#endif /* All levels... */
  38.449 +    if ( valid_mfn(gw->l2mfn) )
  38.450 +    {
  38.451 +        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, 
  38.452 +                                                 PGC_SH_l2_shadow))) )
  38.453 +            (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
  38.454 +#if GUEST_PAGING_LEVELS == 3
  38.455 +        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, 
  38.456 +                                                 PGC_SH_l2h_shadow))) )
  38.457 +            (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
  38.458 +#endif
  38.459 +    }
  38.460 +    if ( valid_mfn(gw->l1mfn)
  38.461 +         && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, 
  38.462 +                                                PGC_SH_l1_shadow))) )
  38.463 +        (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
  38.464 +    else if ( gw->l2e
  38.465 +              && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
  38.466 +              && valid_mfn( 
  38.467 +              (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
  38.468 +        (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
  38.469 +}
  38.470 +
  38.471 +#else
  38.472 +#define sh_audit_gw(_v, _gw) do {} while(0)
  38.473 +#endif /* audit code */
  38.474 +
  38.475 +
  38.476 +
  38.477 +/**************************************************************************/
  38.478 +/* Function to write to the guest tables, for propagating accessed and 
  38.479 + * dirty bits from the shadow to the guest.
  38.480 + * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
  38.481 + * and an operation type.  The guest entry is always passed as an l1e: 
  38.482 + * since we only ever write flags, that's OK.
  38.483 + * Returns the new flag bits of the guest entry. */
  38.484 +
  38.485 +static u32 guest_set_ad_bits(struct vcpu *v,
  38.486 +                             mfn_t gmfn, 
  38.487 +                             guest_l1e_t *ep,
  38.488 +                             unsigned int level, 
  38.489 +                             fetch_type_t ft)
  38.490 +{
  38.491 +    u32 flags, shflags, bit;
  38.492 +    struct page_info *pg;
  38.493 +    int res = 0;
  38.494 +
  38.495 +    ASSERT(valid_mfn(gmfn)
  38.496 +           && (sh_mfn_is_a_page_table(gmfn)
  38.497 +               || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) 
  38.498 +                   == 0)));
  38.499 +    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
  38.500 +    ASSERT(level <= GUEST_PAGING_LEVELS);
  38.501 +    ASSERT(ft == ft_demand_read || ft == ft_demand_write);
  38.502 +    ASSERT(shadow_lock_is_acquired(v->domain));
  38.503 +
  38.504 +    flags = guest_l1e_get_flags(*ep);
  38.505 +
  38.506 +    /* PAE l3s do not have A and D bits */
  38.507 +    if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
  38.508 +        return flags;
  38.509 +
  38.510 +    /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */
  38.511 +    if ( ft == ft_demand_write  
  38.512 +         && (level == 1 || 
  38.513 +             (level == 2 && GUEST_PAGING_LEVELS < 4 
  38.514 +              && (flags & _PAGE_PSE) && guest_supports_superpages(v))) )
  38.515 +    {
  38.516 +        if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) 
  38.517 +             == (_PAGE_DIRTY | _PAGE_ACCESSED) )
  38.518 +            return flags;  /* Guest already has A and D bits set */
  38.519 +        flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
  38.520 +        perfc_incrc(shadow_ad_update);
  38.521 +    }
  38.522 +    else 
  38.523 +    {
  38.524 +        if ( flags & _PAGE_ACCESSED )
  38.525 +            return flags;  /* Guest already has A bit set */
  38.526 +        flags |= _PAGE_ACCESSED;
  38.527 +        perfc_incrc(shadow_a_update);
  38.528 +    }
  38.529 +
  38.530 +    /* Set the bit(s) */
  38.531 +    sh_mark_dirty(v->domain, gmfn);
  38.532 +    SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", "
  38.533 +                  "old flags = %#x, new flags = %#x\n", 
  38.534 +                  guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
  38.535 +    *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
  38.536 +    
  38.537 +    /* May need to propagate this change forward to other kinds of shadow */
  38.538 +    pg = mfn_to_page(gmfn);
  38.539 +    if ( !sh_mfn_is_a_page_table(gmfn) ) 
  38.540 +    {
  38.541 +        /* This guest pagetable is not yet shadowed at all. */
  38.542 +        // MAF: I think this assert is busted...  If this gmfn has not yet
  38.543 +        // been promoted, then it seems perfectly reasonable for there to be
  38.544 +        // outstanding type refs to it...
  38.545 +        /* TJD: No. If the gmfn has not been promoted, we must at least 
  38.546 +         * have recognised that it is a pagetable, and pulled write access.
  38.547 +         * The type count should only be non-zero if it is actually a page 
  38.548 +         * table.  The test above was incorrect, though, so I've fixed it. */
  38.549 +        ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
  38.550 +        return flags;  
  38.551 +    }
  38.552 +
  38.553 +    shflags = pg->shadow_flags & SHF_page_type_mask;
  38.554 +    while ( shflags )
  38.555 +    {
  38.556 +        bit = find_first_set_bit(shflags);
  38.557 +        ASSERT(shflags & (1u << bit));
  38.558 +        shflags &= ~(1u << bit);
  38.559 +        if ( !(pg->shadow_flags & (1u << bit)) )
  38.560 +            continue;
  38.561 +        switch ( bit )
  38.562 +        {
  38.563 +        case PGC_SH_type_to_index(PGC_SH_l1_shadow):
  38.564 +            if (level != 1) 
  38.565 +                res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
  38.566 +            break;
  38.567 +        case PGC_SH_type_to_index(PGC_SH_l2_shadow):
  38.568 +            if (level != 2) 
  38.569 +                res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
  38.570 +            break;
  38.571 +#if GUEST_PAGING_LEVELS == 3 /* PAE only */
  38.572 +        case PGC_SH_type_to_index(PGC_SH_l2h_shadow):
  38.573 +            if (level != 2) 
  38.574 +                res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
  38.575 +            break;
  38.576 +#endif
  38.577 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
  38.578 +        case PGC_SH_type_to_index(PGC_SH_l3_shadow):
  38.579 +            if (level != 3) 
  38.580 +                res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
  38.581 +            break;
  38.582 +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
  38.583 +        case PGC_SH_type_to_index(PGC_SH_l4_shadow):
  38.584 +            if (level != 4) 
  38.585 +                res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
  38.586 +            break;
  38.587 +#endif 
  38.588 +#endif
  38.589 +        default:
  38.590 +            SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple "
  38.591 +                          "modes: A&D bits may be out of sync (flags=%#x).\n", 
  38.592 +                          mfn_x(gmfn), pg->shadow_flags); 
  38.593 +            /* XXX Shadows in other modes will not be updated, so will
  38.594 +             * have their A and D bits out of sync. */
  38.595 +        }
  38.596 +    }
  38.597 +    
  38.598 +    /* We should never need to flush the TLB or recopy PAE entries */
  38.599 +    ASSERT( res == 0 || res == SHADOW_SET_CHANGED );
  38.600 +    return flags;
  38.601 +}
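/* Worked example (illustrative): for a demand write through an l1e whose
 * flags are PRESENT|RW|USER with A and D both clear, the code above sets
 * A and D together and returns PRESENT|RW|USER|A|D; for a demand read of
 * the same entry only the A bit is added.  In both cases the frame is
 * marked dirty for log-dirty mode and, if this guest table is already
 * shadowed, its shadows at other levels are revalidated. */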
  38.602 +
  38.603 +/**************************************************************************/
  38.604 +/* Functions to compute the correct index into a shadow page, given an
  38.605 + * index into the guest page (as returned by guest_get_index()).
  38.606 + * This is trivial when the shadow and guest use the same sized PTEs, but
  38.607 + * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
  38.608 + * PAE- or 64-bit shadows).
  38.609 + *
  38.610 + * These functions also increment the shadow mfn, when necessary.  When PTE
  38.611 + * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
  38.612 + * page.  In this case, we allocate 2 contiguous pages for the shadow L1, and
  38.613 + * use simple pointer arithmetic on a pointer to the guest L1e to figure out
  38.614 + * which shadow page we really want.  Similarly, when PTE sizes are
  38.615 + * mismatched, we shadow a guest L2 page with 4 shadow L2 pages.  (The easiest
  38.616 + * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
  38.617 + * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
  38.618 + * space.)
  38.619 + *
  38.620 + * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
  38.621 + * of shadow (to store both the shadow, and the info that would normally be
  38.622 + * stored in page_info fields).  This arrangement allows the shadow and the
  38.623 + * "page_info" fields to always be stored in the same page (in fact, in
  38.624 + * the same cache line), avoiding an extra call to map_domain_page().
  38.625 + */
  38.626 +
  38.627 +static inline u32
  38.628 +guest_index(void *ptr)
  38.629 +{
  38.630 +    return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
  38.631 +}
  38.632 +
  38.633 +static inline u32
  38.634 +shadow_l1_index(mfn_t *smfn, u32 guest_index)
  38.635 +{
  38.636 +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
  38.637 +    *smfn = _mfn(mfn_x(*smfn) +
  38.638 +                 (guest_index / SHADOW_L1_PAGETABLE_ENTRIES));
  38.639 +    return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
  38.640 +#else
  38.641 +    return guest_index;
  38.642 +#endif
  38.643 +}
  38.644 +
  38.645 +static inline u32
  38.646 +shadow_l2_index(mfn_t *smfn, u32 guest_index)
  38.647 +{
  38.648 +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
  38.649 +    // Because we use 2 shadow l2 entries for each guest entry, the number of
  38.650 +    // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
  38.651 +    //
  38.652 +    *smfn = _mfn(mfn_x(*smfn) +
  38.653 +                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
  38.654 +
   38.655 +    // We multiply by two to get the index of the first of the two entries
  38.656 +    // used to shadow the specified guest entry.
  38.657 +    return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
  38.658 +#else
  38.659 +    return guest_index;
  38.660 +#endif
  38.661 +}
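/* Worked example (illustrative, assuming SHADOW_L2_PAGETABLE_ENTRIES is
 * 512 when a 32-bit guest runs on PAE or 64-bit shadows): guest l2 index
 * 700 selects the third of the four shadow pages (700 / 256 == 2), and
 * within that page the pair of shadow entries starts at index
 * (700 % 256) * 2 == 376. */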
  38.662 +
  38.663 +#if GUEST_PAGING_LEVELS >= 3
  38.664 +
  38.665 +static inline u32
  38.666 +shadow_l3_index(mfn_t *smfn, u32 guest_index)
  38.667 +{
  38.668 +#if GUEST_PAGING_LEVELS == 3
  38.669 +    u32 group_id;
  38.670 +
  38.671 +    // Because we use twice the space in L3 shadows as was consumed in guest
  38.672 +    // L3s, the number of guest entries per shadow page is
  38.673 +    // SHADOW_L2_PAGETABLE_ENTRIES/2.  (Note this is *not*
  38.674 +    // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
  38.675 +    //
  38.676 +    *smfn = _mfn(mfn_x(*smfn) +
  38.677 +                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
  38.678 +
  38.679 +    // We store PAE L3 shadows in groups of 4, alternating shadows and
  38.680 +    // pae_l3_bookkeeping structs.  So the effective shadow index is
   38.681 +    // the group_id * 8 + the offset within the group.
  38.682 +    //
  38.683 +    guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
  38.684 +    group_id = guest_index / 4;
  38.685 +    return (group_id * 8) + (guest_index % 4);
  38.686 +#else
  38.687 +    return guest_index;
  38.688 +#endif
  38.689 +}
  38.690 +
  38.691 +#endif // GUEST_PAGING_LEVELS >= 3
  38.692 +
  38.693 +#if GUEST_PAGING_LEVELS >= 4
  38.694 +
  38.695 +static inline u32
  38.696 +shadow_l4_index(mfn_t *smfn, u32 guest_index)
  38.697 +{
  38.698 +    return guest_index;
  38.699 +}
  38.700 +
  38.701 +#endif // GUEST_PAGING_LEVELS >= 4
  38.702 +
  38.703 +
  38.704 +/**************************************************************************/
  38.705 +/* Functions which compute shadow entries from their corresponding guest
  38.706 + * entries.
  38.707 + *
  38.708 + * These are the "heart" of the shadow code.
  38.709 + *
  38.710 + * There are two sets of these: those that are called on demand faults (read
  38.711 + * faults and write faults), and those that are essentially called to
  38.712 + * "prefetch" (or propagate) entries from the guest into the shadow.  The read
  38.713 + * fault and write fault are handled as two separate cases for L1 entries (due
  38.714 + * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
  38.715 + * into the respective demand_fault functions.
  38.716 + */
  38.717 +
  38.718 +#define CHECK(_cond)                                    \
  38.719 +do {                                                    \
  38.720 +    if (unlikely(!(_cond)))                             \
  38.721 +    {                                                   \
  38.722 +        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
  38.723 +               __func__, __FILE__, __LINE__, #_cond);   \
  38.724 +        return -1;                                      \
  38.725 +    }                                                   \
  38.726 +} while (0);
  38.727 +
  38.728 +// The function below tries to capture all of the flag manipulation for the
  38.729 +// demand and propagate functions into one place.
  38.730 +//
  38.731 +static always_inline u32
  38.732 +sh_propagate_flags(struct vcpu *v, mfn_t target_mfn, 
  38.733 +                    u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, 
  38.734 +                    int mmio, int level, fetch_type_t ft)
  38.735 +{
  38.736 +    struct domain *d = v->domain;
  38.737 +    u32 pass_thru_flags;
  38.738 +    u32 sflags;
  38.739 +
  38.740 +    // XXX -- might want to think about PAT support for HVM guests...
  38.741 +
  38.742 +#ifndef NDEBUG
  38.743 +    // MMIO can only occur from L1e's
  38.744 +    //
  38.745 +    if ( mmio )
  38.746 +        CHECK(level == 1);
  38.747 +
  38.748 +    // We should always have a pointer to the guest entry if it's a non-PSE
  38.749 +    // non-MMIO demand access.
  38.750 +    if ( ft & FETCH_TYPE_DEMAND )
  38.751 +        CHECK(guest_entry_ptr || level == 1);
  38.752 +#endif
  38.753 +
  38.754 +    // A not-present guest entry has a special signature in the shadow table,
  38.755 +    // so that we do not have to consult the guest tables multiple times...
  38.756 +    //
  38.757 +    if ( unlikely(!(gflags & _PAGE_PRESENT)) )
  38.758 +        return _PAGE_SHADOW_GUEST_NOT_PRESENT;
  38.759 +
  38.760 +    // Must have a valid target_mfn, unless this is mmio, or unless this is a
  38.761 +    // prefetch.  In the case of a prefetch, an invalid mfn means that we can
  38.762 +    // not usefully shadow anything, and so we return early.
  38.763 +    //
  38.764 +    if ( !valid_mfn(target_mfn) )
  38.765 +    {
  38.766 +        CHECK((ft == ft_prefetch) || mmio);
  38.767 +        if ( !mmio )
  38.768 +            return 0;
  38.769 +    }
  38.770 +
  38.771 +    // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
  38.772 +    //
  38.773 +    if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
  38.774 +        pass_thru_flags = _PAGE_PRESENT;
  38.775 +    else
  38.776 +    {
  38.777 +        pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
  38.778 +                           _PAGE_RW | _PAGE_PRESENT);
  38.779 +        if ( guest_supports_nx(v) )
  38.780 +            pass_thru_flags |= _PAGE_NX_BIT;
  38.781 +    }
  38.782 +
  38.783 +    // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
  38.784 +    // L3e's; they are all implied.  So we emulate them here.
  38.785 +    //
  38.786 +    if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
  38.787 +        gflags = pass_thru_flags;
  38.788 +
  38.789 +    // Propagate bits from the guest to the shadow.
  38.790 +    // Some of these may be overwritten, below.
  38.791 +    // Since we know the guest's PRESENT bit is set, we also set the shadow's
  38.792 +    // SHADOW_PRESENT bit.
  38.793 +    //
  38.794 +    sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
  38.795 +
  38.796 +    // Copy the guest's RW bit into the SHADOW_RW bit.
  38.797 +    //
  38.798 +    if ( gflags & _PAGE_RW )
  38.799 +        sflags |= _PAGE_SHADOW_RW;
  38.800 +
  38.801 +    // Set the A&D bits for higher level shadows.
  38.802 +    // Higher level entries do not, strictly speaking, have dirty bits, but
  38.803 +    // since we use shadow linear tables, each of these entries may, at some
  38.804 +    // point in time, also serve as a shadow L1 entry.
   38.805 +    // By setting both the A&D bits in each of these, we eliminate the burden
  38.806 +    // on the hardware to update these bits on initial accesses.
  38.807 +    //
  38.808 +    if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
  38.809 +        sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
  38.810 +
  38.811 +
  38.812 +    // Set the A and D bits in the guest entry, if we need to.
  38.813 +    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
  38.814 +        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
  38.815 +    
  38.816 +    // If the A or D bit has not yet been set in the guest, then we must
  38.817 +    // prevent the corresponding kind of access.
  38.818 +    //
  38.819 +    if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
  38.820 +                  !(gflags & _PAGE_ACCESSED)) )
  38.821 +        sflags &= ~_PAGE_PRESENT;
  38.822 +
  38.823 +    /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */
  38.824 +    if ( unlikely( ((level == 1) 
  38.825 +                    || ((level == 2) && (GUEST_PAGING_LEVELS < 4) 
  38.826 +                        && guest_supports_superpages(v) &&
  38.827 +                        (gflags & _PAGE_PSE)))
  38.828 +                   && !(gflags & _PAGE_DIRTY)) )
  38.829 +        sflags &= ~_PAGE_RW;
  38.830 +
  38.831 +    // MMIO caching
  38.832 +    //
  38.833 +    // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
   38.834 +    // to cache the fact that this entry is in MMIO space.
  38.835 +    //
  38.836 +    if ( (level == 1) && mmio )
  38.837 +    {
  38.838 +        sflags &= ~(_PAGE_PRESENT);
  38.839 +        sflags |= _PAGE_SHADOW_MMIO;
  38.840 +    }
  38.841 +    else 
  38.842 +    {
  38.843 +        // shadow_mode_log_dirty support
  38.844 +        //
  38.845 +        // Only allow the guest write access to a page a) on a demand fault,
  38.846 +        // or b) if the page is already marked as dirty.
  38.847 +        //
  38.848 +        if ( unlikely((level == 1) &&
  38.849 +                      !(ft & FETCH_TYPE_WRITE) &&
  38.850 +                      shadow_mode_log_dirty(d) &&
  38.851 +                      !sh_mfn_is_dirty(d, target_mfn)) )
  38.852 +        {
  38.853 +            sflags &= ~_PAGE_RW;
  38.854 +        }
  38.855 +        
  38.856 +        // protect guest page tables
  38.857 +        //
  38.858 +        if ( unlikely((level == 1) &&
  38.859 +                      sh_mfn_is_a_page_table(target_mfn)) )
  38.860 +        {
  38.861 +            if ( shadow_mode_trap_reads(d) )
  38.862 +            {
  38.863 +                // if we are trapping both reads & writes, then mark this page
  38.864 +                // as not present...
  38.865 +                //
  38.866 +                sflags &= ~_PAGE_PRESENT;
  38.867 +            }
  38.868 +            else
  38.869 +            {
  38.870 +                // otherwise, just prevent any writes...
  38.871 +                //
  38.872 +                sflags &= ~_PAGE_RW;
  38.873 +            }
  38.874 +        }
  38.875 +    }
  38.876 +
  38.877 +    return sflags;
  38.878 +}
  38.879 +
  38.880 +#undef CHECK
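/* Worked example (illustrative): for a demand read of an ordinary RAM
 * page through an l1e with gflags PRESENT|RW|USER|ACCESSED but DIRTY
 * clear, sh_propagate_flags() above passes PRESENT|USER|ACCESSED through
 * and records the guest RW bit in _PAGE_SHADOW_RW, but strips _PAGE_RW
 * from the shadow entry because the guest D bit is not yet set; the
 * first write therefore faults, giving guest_set_ad_bits() a chance to
 * set D before write access is granted. */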
  38.881 +
  38.882 +#if GUEST_PAGING_LEVELS >= 4
  38.883 +static void
  38.884 +l4e_propagate_from_guest(struct vcpu *v, 
  38.885 +                         guest_l4e_t *gl4e,
  38.886 +                         mfn_t gl4mfn,
  38.887 +                         mfn_t sl3mfn,
  38.888 +                         shadow_l4e_t *sl4p,
  38.889 +                         fetch_type_t ft)
  38.890 +{
  38.891 +    u32 gflags = guest_l4e_get_flags(*gl4e);
  38.892 +    u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e,
  38.893 +                                     gl4mfn, 0, 4, ft);
  38.894 +
  38.895 +    *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags);
  38.896 +
  38.897 +    SHADOW_DEBUG(PROPAGATE,
  38.898 +                  "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n",
  38.899 +                  fetch_type_names[ft], gl4e->l4, sl4p->l4);
  38.900 +    ASSERT(sflags != -1);
  38.901 +}
  38.902 +#endif // GUEST_PAGING_LEVELS >= 4
  38.903 +
  38.904 +#if GUEST_PAGING_LEVELS >= 3
  38.905 +static void
  38.906 +l3e_propagate_from_guest(struct vcpu *v,
  38.907 +                         guest_l3e_t *gl3e,
  38.908 +                         mfn_t gl3mfn, 
  38.909 +                         mfn_t sl2mfn, 
  38.910 +                         shadow_l3e_t *sl3p,
  38.911 +                         fetch_type_t ft)
  38.912 +{
  38.913 +    u32 gflags = guest_l3e_get_flags(*gl3e);
  38.914 +    u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e,
  38.915 +                                     gl3mfn, 0, 3, ft);
  38.916 +
  38.917 +    *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags);
  38.918 +
  38.919 +    SHADOW_DEBUG(PROPAGATE,
  38.920 +                  "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n",
  38.921 +                  fetch_type_names[ft], gl3e->l3, sl3p->l3);
  38.922 +    ASSERT(sflags != -1);
  38.923 +}
  38.924 +#endif // GUEST_PAGING_LEVELS >= 3
  38.925 +
  38.926 +static void
  38.927 +l2e_propagate_from_guest(struct vcpu *v, 
  38.928 +                         guest_l2e_t *gl2e,
  38.929 +                         mfn_t gl2mfn,
  38.930 +                         mfn_t sl1mfn, 
  38.931 +                         shadow_l2e_t *sl2p,
  38.932 +                         fetch_type_t ft)
  38.933 +{
  38.934 +    u32 gflags = guest_l2e_get_flags(*gl2e);
  38.935 +    u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, 
  38.936 +                                     gl2mfn, 0, 2, ft);
  38.937 +
  38.938 +    *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags);
  38.939 +
  38.940 +    SHADOW_DEBUG(PROPAGATE,
  38.941 +                  "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n",
  38.942 +                  fetch_type_names[ft], gl2e->l2, sl2p->l2);
  38.943 +    ASSERT(sflags != -1);
  38.944 +}
  38.945 +
  38.946 +static inline int
  38.947 +l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
  38.948 +               int mmio)
  38.949 +/* returns 1 if emulation is required, and 0 otherwise */
  38.950 +{
  38.951 +    struct domain *d = v->domain;
  38.952 +    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
  38.953 +    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
  38.954 +                                     mmio, 1, ft_demand_read);
  38.955 +
  38.956 +    if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) )
  38.957 +    {
  38.958 +        // emulation required!
  38.959 +        *sl1p = shadow_l1e_empty();
  38.960 +        return 1;
  38.961 +    }
  38.962 +
  38.963 +    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
  38.964 +
  38.965 +    SHADOW_DEBUG(PROPAGATE,
  38.966 +                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
  38.967 +                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
  38.968 +
  38.969 +    ASSERT(sflags != -1);
  38.970 +    return 0;
  38.971 +}
  38.972 +
  38.973 +static inline int
  38.974 +l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
  38.975 +                int mmio)
  38.976 +/* returns 1 if emulation is required, and 0 otherwise */
  38.977 +{
  38.978 +    struct domain *d = v->domain;
  38.979 +    u32 gflags = guest_l1e_get_flags(gw->eff_l1e);
  38.980 +    u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn,
  38.981 +                                     mmio, 1, ft_demand_write);
  38.982 +
  38.983 +    sh_mark_dirty(d, gmfn);
  38.984 +
  38.985 +    if ( !mmio && sh_mfn_is_a_page_table(gmfn) )
  38.986 +    {
  38.987 +        // emulation required!
  38.988 +        *sl1p = shadow_l1e_empty();
  38.989 +        return 1;
  38.990 +    }
  38.991 +
  38.992 +    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
  38.993 +
  38.994 +    SHADOW_DEBUG(PROPAGATE,
  38.995 +                  "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
  38.996 +                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
  38.997 +
  38.998 +    ASSERT(sflags != -1);
  38.999 +    return 0;
 38.1000 +}
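/* Illustrative note (not part of the original patch): the write-fault
 * path above differs from the read-fault path in two ways: it always
 * marks the target frame dirty for log-dirty mode, and it requires
 * emulation for any write to a frame that is itself shadowed as a
 * pagetable, whereas reads are only trapped when
 * shadow_mode_trap_reads(d) is enabled. */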
 38.1001 +
 38.1002 +static inline void
 38.1003 +l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
 38.1004 +                         int mmio)
 38.1005 +{
 38.1006 +    gfn_t gfn = guest_l1e_get_gfn(gl1e);
 38.1007 +    mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
 38.1008 +    u32 gflags = guest_l1e_get_flags(gl1e);
 38.1009 +    u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), 
 38.1010 +                                     mmio, 1, ft_prefetch);
 38.1011 +
 38.1012 +    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
 38.1013 +
 38.1014 +    SHADOW_DEBUG(PROPAGATE,
 38.1015 +                  "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n",
 38.1016 +                  gl1e.l1, sl1p->l1);
 38.1017 +
 38.1018 +    ASSERT(sflags != -1);
 38.1019 +}
 38.1020 +
 38.1021 +
 38.1022 +/**************************************************************************/
 38.1023 +/* These functions update shadow entries (and do bookkeeping on the shadow
 38.1024 + * tables they are in).  It is intended that they are the only
 38.1025 + * functions which ever write (non-zero) data onto a shadow page.
 38.1026 + *
 38.1027 + * They return a set of flags: 
 38.1028 + * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow.
 38.1029 + * SHADOW_SET_FLUSH   -- the caller must cause a TLB flush.
 38.1030 + * SHADOW_SET_ERROR   -- the input is not a valid entry (for example, if
 38.1031 + *                        shadow_get_page_from_l1e() fails).
 38.1032 + * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
 38.1033 + *                             copies of their PAE L3 entries re-copied.
 38.1034 + */
 38.1035 +
 38.1036 +static inline void safe_write_entry(void *dst, void *src) 
 38.1037 +/* Copy one PTE safely when processors might be running on the
 38.1038 + * destination pagetable.   This does *not* give safety against
 38.1039 + * concurrent writes (that's what the shadow lock is for), just 
 38.1040 + * stops the hardware picking up partially written entries. */
 38.1041 +{
 38.1042 +    volatile unsigned long *d = dst;
 38.1043 +    unsigned long *s = src;
 38.1044 +    ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
 38.1045 +#if CONFIG_PAGING_LEVELS == 3
 38.1046 +    /* In PAE mode, pagetable entries are larger
 38.1047 +     * than machine words, so won't get written atomically.  We need to make
 38.1048 +     * sure any other cpu running on these shadows doesn't see a
 38.1049 +     * half-written entry.  Do this by marking the entry not-present first,
 38.1050 +     * then writing the high word before the low word. */
 38.1051 +    BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
 38.1052 +    d[0] = 0;
 38.1053 +    d[1] = s[1];
 38.1054 +    d[0] = s[0];
 38.1055 +#else
 38.1056 +    /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
 38.1057 +     * which will be an atomic write, since the entry is aligned. */
 38.1058 +    BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
 38.1059 +    *d = *s;
 38.1060 +#endif
 38.1061 +}
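/* Illustrative note (not part of the original patch): concretely, the PAE
 * branch above performs  d[0] = 0;  d[1] = new_high;  d[0] = new_low;  so
 * the present bit (which lives in the low word) is only written back once
 * the high word already holds its new value, matching the "not-present
 * first, high word before low word" rule described in the comment. */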
 38.1062 +
 38.1063 +
 38.1064 +static inline void 
 38.1065 +shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
 38.1066 +/* This function does the actual writes to shadow pages.
 38.1067 + * It must not be called directly, since it doesn't do the bookkeeping
 38.1068 + * that shadow_set_l*e() functions do. */
 38.1069 +{
 38.1070 +    shadow_l1e_t *dst = d;
 38.1071 +    shadow_l1e_t *src = s;
 38.1072 +    void *map = NULL;
 38.1073 +    int i;
 38.1074 +
 38.1075 +    /* Because we mirror access rights at all levels in the shadow, an
 38.1076 +     * l2 (or higher) entry with the RW bit cleared will leave us with
 38.1077 +     * no write access through the linear map.  
 38.1078 +     * We detect that by writing to the shadow with copy_to_user() and 
 38.1079 +     * using map_domain_page() to get a writeable mapping if we need to. */
 38.1080 +    if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) 
 38.1081 +    {
 38.1082 +        perfc_incrc(shadow_linear_map_failed);
 38.1083 +        map = sh_map_domain_page(mfn);
 38.1084 +        ASSERT(map != NULL);
 38.1085 +        dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
 38.1086 +    }
 38.1087 +
 38.1088 +
 38.1089 +    for ( i = 0; i < entries; i++ )
 38.1090 +        safe_write_entry(dst++, src++);
 38.1091 +
 38.1092 +    if ( map != NULL ) sh_unmap_domain_page(map);
 38.1093 +
 38.1094 +    /* XXX TODO:
 38.1095 +     * Update min/max field in page_info struct of this mfn */
 38.1096 +}
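/* Illustrative note (not part of the original patch): the
 * __copy_to_user(d, d, ...) call above rewrites the first word of the
 * destination with its own current value; it serves purely as a probe for
 * write access through the linear mapping, and when it faults we fall
 * back to an sh_map_domain_page() mapping of the shadow frame. */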
 38.1097 +
 38.1098 +static inline int
 38.1099 +perms_strictly_increased(u32 old_flags, u32 new_flags) 
 38.1100 +/* Given the flags of two entries, are the new flags a strict
 38.1101 + * increase in rights over the old ones? */
 38.1102 +{
 38.1103 +    u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
 38.1104 +    u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX);
 38.1105 +    /* Flip the NX bit, since it's the only one that decreases rights;
 38.1106 +     * we calculate as if it were an "X" bit. */
 38.1107 +    of ^= _PAGE_NX_BIT;
 38.1108 +    nf ^= _PAGE_NX_BIT;
 38.1109 +    /* If the changed bits are all set in the new flags, then rights strictly 
 38.1110 +     * increased between old and new. */
 38.1111 +    return ((of | (of ^ nf)) == nf);
 38.1112 +}
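/* Worked example (illustrative): old = PRESENT|RW, new = PRESENT|RW|USER.
 * After the NX flip, of = PRESENT|RW|NX and nf = PRESENT|RW|USER|NX, so
 * of ^ nf == USER and (of | USER) == nf: the function returns true and
 * the caller can skip the TLB flush (provided the entry still points at
 * the same mfn).  If instead the new entry dropped the USER bit, the test
 * would fail and a flush would be required. */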
 38.1113 +
 38.1114 +static int inline
 38.1115 +shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 38.1116 +{
 38.1117 +    int res;
 38.1118 +    mfn_t mfn;
 38.1119 +    struct domain *owner;
 38.1120 +    shadow_l1e_t sanitized_sl1e =
 38.1121 +        shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
 38.1122 +
 38.1123 +    //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
 38.1124 +    //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
 38.1125 +
 38.1126 +    if ( !shadow_mode_refcounts(d) )
 38.1127 +        return 1;
 38.1128 +
 38.1129 +    res = get_page_from_l1e(sanitized_sl1e, d);
 38.1130 +
 38.1131 +    // If a privileged domain is attempting to install a map of a page it does
 38.1132 +    // not own, we let it succeed anyway.
 38.1133 +    //
 38.1134 +    if ( unlikely(!res) &&
 38.1135 +         IS_PRIV(d) &&
 38.1136 +         !shadow_mode_translate(d) &&
 38.1137 +         valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
 38.1138 +         (owner = page_get_owner(mfn_to_page(mfn))) &&
 38.1139 +         (d != owner) )
 38.1140 +    {
 38.1141 +        res = get_page_from_l1e(sanitized_sl1e, owner);
 38.1142 +        SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx "
 38.1143 +                       "which is owned by domain %d: %s\n",
 38.1144 +                       d->domain_id, mfn_x(mfn), owner->domain_id,
 38.1145 +                       res ? "success" : "failed");
 38.1146 +    }
 38.1147 +
 38.1148 +    if ( unlikely(!res) )
 38.1149 +    {
 38.1150 +        perfc_incrc(shadow_get_page_fail);
  38.1151 +        SHADOW_PRINTK("failed: l1e=%" SH_PRI_pte "\n", sl1e.l1);
 38.1152 +    }
 38.1153 +
 38.1154 +    return res;
 38.1155 +}
 38.1156 +
 38.1157 +static void inline
 38.1158 +shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 38.1159 +{ 
 38.1160 +    if ( !shadow_mode_refcounts(d) )
 38.1161 +        return;
 38.1162 +
 38.1163 +    put_page_from_l1e(sl1e, d);
 38.1164 +}
 38.1165 +
 38.1166 +#if GUEST_PAGING_LEVELS >= 4
 38.1167 +static int shadow_set_l4e(struct vcpu *v, 
 38.1168 +                          shadow_l4e_t *sl4e, 
 38.1169 +                          shadow_l4e_t new_sl4e, 
 38.1170 +                          mfn_t sl4mfn)
 38.1171 +{
 38.1172 +    int flags = 0;
 38.1173 +    shadow_l4e_t old_sl4e;
 38.1174 +    paddr_t paddr;
 38.1175 +    ASSERT(sl4e != NULL);
 38.1176 +    old_sl4e = *sl4e;
 38.1177 +
 38.1178 +    if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
 38.1179 +    
 38.1180 +    paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
 38.1181 +             | (((unsigned long)sl4e) & ~PAGE_MASK));
 38.1182 +
 38.1183 +    if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) 
 38.1184 +    {
 38.1185 +        /* About to install a new reference */        
 38.1186 +        sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
 38.1187 +    } 
 38.1188 +
 38.1189 +    /* Write the new entry */
 38.1190 +    shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
 38.1191 +    flags |= SHADOW_SET_CHANGED;
 38.1192 +
 38.1193 +    if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) 
 38.1194 +    {
 38.1195 +        /* We lost a reference to an old mfn. */
 38.1196 +        mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
 38.1197 +        if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
 38.1198 +             || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), 
 38.1199 +                                          shadow_l4e_get_flags(new_sl4e)) )
 38.1200 +        {
 38.1201 +            flags |= SHADOW_SET_FLUSH;
 38.1202 +        }
 38.1203 +        sh_put_ref(v, osl3mfn, paddr);
 38.1204 +    }
 38.1205 +    return flags;
 38.1206 +}
 38.1207 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 38.1208 +
 38.1209 +#if GUEST_PAGING_LEVELS >= 3
 38.1210 +static int shadow_set_l3e(struct vcpu *v, 
 38.1211 +                          shadow_l3e_t *sl3e, 
 38.1212 +                          shadow_l3e_t new_sl3e, 
 38.1213 +                          mfn_t sl3mfn)
 38.1214 +{
 38.1215 +    int flags = 0;
 38.1216 +    shadow_l3e_t old_sl3e;
 38.1217 +    paddr_t paddr;
 38.1218 +    ASSERT(sl3e != NULL);
 38.1219 +    old_sl3e = *sl3e;
 38.1220 +
 38.1221 +    if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
 38.1222 +
 38.1223 +    paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) 
 38.1224 +             | (((unsigned long)sl3e) & ~PAGE_MASK));
 38.1225 +    
 38.1226 +    if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) 
 38.1227 +    {
 38.1228 +        /* About to install a new reference */        
 38.1229 +        sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
 38.1230 +    } 
 38.1231 +
 38.1232 +    /* Write the new entry */
 38.1233 +    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
 38.1234 +    flags |= SHADOW_SET_CHANGED;
 38.1235 +
 38.1236 +#if GUEST_PAGING_LEVELS == 3 
 38.1237 +    /* We wrote a guest l3e in a PAE pagetable.  This table is copied in
 38.1238 +     * the linear pagetable entries of its l2s, and may also be copied
 38.1239 +     * to a low memory location to make it fit in CR3.  Report that we
 38.1240 +     * need to resync those copies (we can't wait for the guest to flush
 38.1241 +     * the TLB because it might be an increase in rights). */
 38.1242 +    {
 38.1243 +        struct vcpu *vcpu;
 38.1244 +
 38.1245 +        struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
 38.1246 +        for_each_vcpu(v->domain, vcpu)
 38.1247 +        {
 38.1248 +            if (info->vcpus & (1 << vcpu->vcpu_id))
 38.1249 +            {
 38.1250 +                // Remember that this flip/update needs to occur.
 38.1251 +                vcpu->arch.shadow.pae_flip_pending = 1;
 38.1252 +                flags |= SHADOW_SET_L3PAE_RECOPY;
 38.1253 +            }
 38.1254 +        }
 38.1255 +    }
 38.1256 +#endif
 38.1257 +
 38.1258 +    if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) 
 38.1259 +    {
 38.1260 +        /* We lost a reference to an old mfn. */
 38.1261 +        mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
 38.1262 +        if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
 38.1263 +             !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e), 
 38.1264 +                                       shadow_l3e_get_flags(new_sl3e)) ) 
 38.1265 +        {
 38.1266 +            flags |= SHADOW_SET_FLUSH;
 38.1267 +        }
 38.1268 +        sh_put_ref(v, osl2mfn, paddr);
 38.1269 +    }
 38.1270 +    return flags;
 38.1271 +}
 38.1272 +#endif /* GUEST_PAGING_LEVELS >= 3 */ 
 38.1273 +
 38.1274 +static int shadow_set_l2e(struct vcpu *v, 
 38.1275 +                          shadow_l2e_t *sl2e, 
 38.1276 +                          shadow_l2e_t new_sl2e, 
 38.1277 +                          mfn_t sl2mfn)
 38.1278 +{
 38.1279 +    int flags = 0;
 38.1280 +    shadow_l2e_t old_sl2e;
 38.1281 +    paddr_t paddr;
 38.1282 +
 38.1283 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 38.1284 +    /* In 2-on-3 we work with pairs of l2es pointing at two-page
 38.1285 +     * shadows.  Reference counting and up-pointers track from the first
 38.1286 +     * page of the shadow to the first l2e, so make sure that we're 
 38.1287 +     * working with those:     
 38.1288 +     * Align the pointer down so it's pointing at the first of the pair */
 38.1289 +    sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
 38.1290 +    /* Align the mfn of the shadow entry too */
 38.1291 +    new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
 38.1292 +#endif
 38.1293 +
 38.1294 +    ASSERT(sl2e != NULL);
 38.1295 +    old_sl2e = *sl2e;
 38.1296 +    
 38.1297 +    if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
 38.1298 +    
 38.1299 +    paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
 38.1300 +             | (((unsigned long)sl2e) & ~PAGE_MASK));
 38.1301 +
 38.1302 +    if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
 38.1303 +    {
 38.1304 +        /* About to install a new reference */
 38.1305 +        sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
 38.1306 +    } 
 38.1307 +
 38.1308 +    /* Write the new entry */
 38.1309 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 38.1310 +    {
 38.1311 +        shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
  38.1312 +        /* The l1 shadow is two pages long and needs to be pointed to by
  38.1313 +         * two adjacent l2es.  The pair have the same flags, but point
 38.1314 +         * at odd and even MFNs */
 38.1315 +        ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
 38.1316 +        pair[1].l2 |= (1<<PAGE_SHIFT);
 38.1317 +        shadow_write_entries(sl2e, &pair, 2, sl2mfn);
 38.1318 +    }
 38.1319 +#else /* normal case */
 38.1320 +    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
 38.1321 +#endif
 38.1322 +    flags |= SHADOW_SET_CHANGED;
 38.1323 +
 38.1324 +    if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) 
 38.1325 +    {
 38.1326 +        /* We lost a reference to an old mfn. */
 38.1327 +        mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
 38.1328 +        if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
 38.1329 +             !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e), 
 38.1330 +                                       shadow_l2e_get_flags(new_sl2e)) ) 
 38.1331 +        {
 38.1332 +            flags |= SHADOW_SET_FLUSH;
 38.1333 +        }
 38.1334 +        sh_put_ref(v, osl1mfn, paddr);
 38.1335 +    }
 38.1336 +    return flags;
 38.1337 +}
 38.1338 +
 38.1339 +static int shadow_set_l1e(struct vcpu *v, 
 38.1340 +                          shadow_l1e_t *sl1e, 
 38.1341 +                          shadow_l1e_t new_sl1e,
 38.1342 +                          mfn_t sl1mfn)
 38.1343 +{
 38.1344 +    int flags = 0;
 38.1345 +    struct domain *d = v->domain;
 38.1346 +    shadow_l1e_t old_sl1e;
 38.1347 +    ASSERT(sl1e != NULL);
 38.1348 +    
 38.1349 +    old_sl1e = *sl1e;
 38.1350 +
 38.1351 +    if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
 38.1352 +    
 38.1353 +    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) 
 38.1354 +    {
 38.1355 +        /* About to install a new reference */        
 38.1356 +        if ( shadow_mode_refcounts(d) ) {
 38.1357 +            if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 ) 
 38.1358 +            {
 38.1359 +                /* Doesn't look like a pagetable. */
 38.1360 +                flags |= SHADOW_SET_ERROR;
 38.1361 +                new_sl1e = shadow_l1e_empty();
 38.1362 +            }
 38.1363 +        }
 38.1364 +    } 
 38.1365 +
 38.1366 +    /* Write the new entry */
 38.1367 +    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
 38.1368 +    flags |= SHADOW_SET_CHANGED;
 38.1369 +
 38.1370 +    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) 
 38.1371 +    {
 38.1372 +        /* We lost a reference to an old mfn. */
 38.1373 +        /* N.B. Unlike higher-level sets, never need an extra flush 
 38.1374 +         * when writing an l1e.  Because it points to the same guest frame 
 38.1375 +         * as the guest l1e did, it's the guest's responsibility to
 38.1376 +         * trigger a flush later. */
 38.1377 +        if ( shadow_mode_refcounts(d) ) 
 38.1378 +        {
 38.1379 +            shadow_put_page_from_l1e(old_sl1e, d);
 38.1380 +        } 
 38.1381 +    }
 38.1382 +    return flags;
 38.1383 +}
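/* Illustrative sketch (not part of the original patch): callers of the
 * shadow_set_l*e() functions are expected to accumulate the returned
 * SHADOW_SET_* bits and act on them once the update is finished, e.g.
 *
 *     int flags = 0;
 *     flags |= shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
 *     flags |= shadow_set_l1e(v, sl1e, new_sl1e, sl1mfn);
 *     if ( flags & SHADOW_SET_FLUSH )
 *         flush_tlb_mask(v->domain->domain_dirty_cpumask);
 *
 * SHADOW_SET_ERROR from shadow_set_l1e() means the new entry was rejected
 * by shadow_get_page_from_l1e() and was written back as an empty entry
 * instead. */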
 38.1384 +
 38.1385 +
 38.1386 +/**************************************************************************/
 38.1387 +/* These functions take a vcpu and a virtual address, and return a pointer
 38.1388 + * to the appropriate level N entry from the shadow tables.  
 38.1389 + * If the necessary tables are not present in the shadow, they return NULL. */
 38.1390 +
 38.1391 +/* N.B. The use of GUEST_PAGING_LEVELS here is correct.  If the shadow has
 38.1392 + * more levels than the guest, the upper levels are always fixed and do not 
 38.1393 + * reflect any information from the guest, so we do not use these functions 
 38.1394 + * to access them. */
 38.1395 +
 38.1396 +#if GUEST_PAGING_LEVELS >= 4
 38.1397 +static shadow_l4e_t *
 38.1398 +shadow_get_l4e(struct vcpu *v, unsigned long va)
 38.1399 +{
 38.1400 +    /* Reading the top level table is always valid. */
 38.1401 +    return sh_linear_l4_table(v) + shadow_l4_linear_offset(va);
 38.1402 +}
 38.1403 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 38.1404 +
 38.1405 +
 38.1406 +#if GUEST_PAGING_LEVELS >= 3
 38.1407 +static shadow_l3e_t *
 38.1408 +shadow_get_l3e(struct vcpu *v, unsigned long va)
 38.1409 +{
 38.1410 +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
 38.1411 +    /* Get the l4 */
 38.1412 +    shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
 38.1413 +    ASSERT(sl4e != NULL);
 38.1414 +    if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
 38.1415 +        return NULL;
 38.1416 +    ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
 38.1417 +    /* l4 was present; OK to get the l3 */
 38.1418 +    return sh_linear_l3_table(v) + shadow_l3_linear_offset(va);
 38.1419 +#else /* PAE... */
 38.1420 +    /* Top level is always mapped */
 38.1421 +    ASSERT(v->arch.shadow_vtable);
 38.1422 +    return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
 38.1423 +#endif 
 38.1424 +}
 38.1425 +#endif /* GUEST_PAGING_LEVELS >= 3 */
 38.1426 +
 38.1427 +
 38.1428 +static shadow_l2e_t *
 38.1429 +shadow_get_l2e(struct vcpu *v, unsigned long va)
 38.1430 +{
 38.1431 +#if GUEST_PAGING_LEVELS >= 3  /* 64bit/PAE... */
 38.1432 +    /* Get the l3 */
 38.1433 +    shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
 38.1434 +    if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
 38.1435 +        return NULL;
 38.1436 +    ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
 38.1437 +    /* l3 was present; OK to get the l2 */
 38.1438 +#endif
 38.1439 +    return sh_linear_l2_table(v) + shadow_l2_linear_offset(va);
 38.1440 +}
 38.1441 +
 38.1442 +
 38.1443 +#if 0 // avoid the compiler warning for now...
 38.1444 +
 38.1445 +static shadow_l1e_t *
 38.1446 +shadow_get_l1e(struct vcpu *v, unsigned long va)
 38.1447 +{
 38.1448 +    /* Get the l2 */
 38.1449 +    shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
 38.1450 +    if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
 38.1451 +        return NULL;
 38.1452 +    ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
 38.1453 +    /* l2 was present; OK to get the l1 */
 38.1454 +    return sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
 38.1455 +}
 38.1456 +
 38.1457 +#endif
 38.1458 +
 38.1459 +
 38.1460 +/**************************************************************************/
 38.1461 +/* Macros to walk pagetables.  These take the shadow of a pagetable and 
 38.1462 + * walk every "interesting" entry.  That is, they don't touch Xen mappings, 
 38.1463 + * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every 
 38.1464 + * second entry (since pairs of entries are managed together). For multi-page
 38.1465 + * shadows they walk all pages.
 38.1466 + * 
 38.1467 + * Arguments are an MFN, the variable to point to each entry, a variable 
 38.1468 + * to indicate that we are done (we will shortcut to the end of the scan 
 38.1469 + * when _done != 0), a variable to indicate that we should avoid Xen mappings,
 38.1470 + * and the code. 
 38.1471 + *
 38.1472 + * WARNING: These macros have side-effects.  They change the values of both 
 38.1473 + * the pointer and the MFN. */ 
 38.1474 +
 38.1475 +static inline void increment_ptr_to_guest_entry(void *ptr)
 38.1476 +{
 38.1477 +    if ( ptr )
 38.1478 +    {
 38.1479 +        guest_l1e_t **entry = ptr;
 38.1480 +        (*entry)++;
 38.1481 +    }
 38.1482 +}
 38.1483 +
 38.1484 +/* All kinds of l1: touch all entries */
 38.1485 +#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)       \
 38.1486 +do {                                                                    \
 38.1487 +    int _i;                                                             \
 38.1488 +    shadow_l1e_t *_sp = map_shadow_page((_sl1mfn));                     \
 38.1489 +    ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask)       \
 38.1490 +           == PGC_SH_l1_shadow                                         \
 38.1491 +           || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask)    \
 38.1492 +           == PGC_SH_fl1_shadow);                                      \
 38.1493 +    for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ )              \
 38.1494 +    {                                                                   \
 38.1495 +        (_sl1e) = _sp + _i;                                             \
 38.1496 +        if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT )           \
 38.1497 +            {_code}                                                     \
 38.1498 +        if ( _done ) break;                                             \
 38.1499 +        increment_ptr_to_guest_entry(_gl1p);                            \
 38.1500 +    }                                                                   \
 38.1501 +    unmap_shadow_page(_sp);                                             \
 38.1502 +} while (0)
 38.1503 +
 38.1504 +/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
 38.1505 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 38.1506 +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done,  _code)       \
 38.1507 +do {                                                                    \
 38.1508 +    int __done = 0;                                                     \
 38.1509 +    _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                         \
 38.1510 +                         ({ (__done = _done); }), _code);               \
 38.1511 +    _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1);                                 \
 38.1512 +    if ( !__done )                                                      \
 38.1513 +        _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                     \
 38.1514 +                             ({ (__done = _done); }), _code);           \
 38.1515 +} while (0)
 38.1516 +#else /* Everything else; l1 shadows are only one page */
 38.1517 +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)        \
 38.1518 +       _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
 38.1519 +#endif
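
The walkers above hand each present entry to the caller's _code block; the macro itself maps and unmaps the shadow page, tests _PAGE_PRESENT, honours _done, and advances the optional guest-entry pointer. A minimal usage sketch for the l1 walker (the helper name count_present_l1es is hypothetical and not part of this patch; the real callers are the destructor and unhook functions later in this file):

    /* Illustrative only: count the present entries in an l1 shadow.
     * Pass 0 for _gl1p (no guest pointer to advance) and 0 for _done
     * (never shortcut), as sh_destroy_l1_shadow() below does. */
    static int count_present_l1es(mfn_t sl1mfn)
    {
        shadow_l1e_t *sl1e;
        int count = 0;
        /* NB the macro may advance sl1mfn onto the second page of a
         * multi-page shadow; that only changes this by-value copy. */
        SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
            count++;    /* _code runs only for entries with _PAGE_PRESENT */
        });
        return count;
    }
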
 38.1520 +    
 38.1521 +
 38.1522 +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
 38.1523 +
 38.1524 +/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
 38.1525 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)    \
 38.1526 +do {                                                                      \
 38.1527 +    int _i, _j, __done = 0;                                               \
 38.1528 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)         \
 38.1529 +           == PGC_SH_l2_32_shadow);                                      \
 38.1530 +    for ( _j = 0; _j < 4 && !__done; _j++ )                               \
 38.1531 +    {                                                                     \
 38.1532 +        shadow_l2e_t *_sp = map_shadow_page(_sl2mfn);                     \
 38.1533 +        for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 )         \
 38.1534 +            if ( (!(_xen))                                                \
 38.1535 +                 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i)             \
 38.1536 +                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
 38.1537 +            {                                                             \
 38.1538 +                (_sl2e) = _sp + _i;                                       \
 38.1539 +                if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )     \
 38.1540 +                    {_code}                                               \
 38.1541 +                if ( (__done = (_done)) ) break;                          \
 38.1542 +                increment_ptr_to_guest_entry(_gl2p);                      \
 38.1543 +            }                                                             \
 38.1544 +        unmap_shadow_page(_sp);                                           \
 38.1545 +        _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1);                               \
 38.1546 +    }                                                                     \
 38.1547 +} while (0)
 38.1548 +
 38.1549 +#elif GUEST_PAGING_LEVELS == 2
 38.1550 +
 38.1551 +/* 32-bit on 32-bit: avoid Xen entries */
 38.1552 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)     \
 38.1553 +do {                                                                       \
 38.1554 +    int _i;                                                                \
 38.1555 +    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                        \
 38.1556 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)          \
 38.1557 +           == PGC_SH_l2_32_shadow);                                       \
 38.1558 +    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
 38.1559 +        if ( (!(_xen))                                                     \
 38.1560 +             ||                                                            \
 38.1561 +             (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
 38.1562 +        {                                                                  \
 38.1563 +            (_sl2e) = _sp + _i;                                            \
 38.1564 +            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )          \
 38.1565 +                {_code}                                                    \
 38.1566 +            if ( _done ) break;                                            \
 38.1567 +            increment_ptr_to_guest_entry(_gl2p);                           \
 38.1568 +        }                                                                  \
 38.1569 +    unmap_shadow_page(_sp);                                                \
 38.1570 +} while (0)
 38.1571 +
 38.1572 +#elif GUEST_PAGING_LEVELS == 3
 38.1573 +
 38.1574 +/* PAE: if it's an l2h, don't touch Xen mappings */
 38.1575 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)     \
 38.1576 +do {                                                                       \
 38.1577 +    int _i;                                                                \
 38.1578 +    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                        \
 38.1579 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)          \
 38.1580 +           == PGC_SH_l2_pae_shadow                                        \
 38.1581 +           || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)       \
 38.1582 +           == PGC_SH_l2h_pae_shadow);                                     \
 38.1583 +    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
 38.1584 +        if ( (!(_xen))                                                     \
 38.1585 +             || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)    \
 38.1586 +                 != PGC_SH_l2h_pae_shadow)                                \
 38.1587 +             || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES))                  \
 38.1588 +                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
 38.1589 +        {                                                                  \
 38.1590 +            (_sl2e) = _sp + _i;                                            \
 38.1591 +            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )          \
 38.1592 +                {_code}                                                    \
 38.1593 +            if ( _done ) break;                                            \
 38.1594 +            increment_ptr_to_guest_entry(_gl2p);                           \
 38.1595 +        }                                                                  \
 38.1596 +    unmap_shadow_page(_sp);                                                \
 38.1597 +} while (0)
 38.1598 +
 38.1599 +#else 
 38.1600 +
 38.1601 +/* 64-bit l2: touch all entries */
 38.1602 +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)  \
 38.1603 +do {                                                                    \
 38.1604 +    int _i;                                                             \
 38.1605 +    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                     \
 38.1606 +    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask)       \
 38.1607 +           == PGC_SH_l2_64_shadow);                                    \
 38.1608 +    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )              \
 38.1609 +    {                                                                   \
 38.1610 +        (_sl2e) = _sp + _i;                                             \
 38.1611 +        if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )           \
 38.1612 +            {_code}                                                     \
 38.1613 +        if ( _done ) break;                                             \
 38.1614 +        increment_ptr_to_guest_entry(_gl2p);                            \
 38.1615 +    }                                                                   \
 38.1616 +    unmap_shadow_page(_sp);                                             \
 38.1617 +} while (0)
 38.1618 +
 38.1619 +#endif /* different kinds of l2 */
 38.1620 +
 38.1621 +#if GUEST_PAGING_LEVELS == 3
 38.1622 +
 38.1623 +/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
 38.1624 +#define SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code)             \
 38.1625 +do {                                                                    \
 38.1626 +    int _i;                                                             \
 38.1627 +    for ( _i = 0; _i < 4; _i++ )                                        \
 38.1628 +    {                                                                   \
 38.1629 +        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
 38.1630 +            {_code}                                                     \
 38.1631 +        if ( _done ) break;                                             \
 38.1632 +        _sl3e++;                                                        \
 38.1633 +        increment_ptr_to_guest_entry(_gl3p);                            \
 38.1634 +    }                                                                   \
 38.1635 +} while (0)
 38.1636 +
 38.1637 +/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
 38.1638 +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)        \
 38.1639 +do {                                                                    \
 38.1640 +    int _i, _j, _k, __done = 0;                                         \
 38.1641 +    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask)       \
 38.1642 +           == PGC_SH_l3_pae_shadow);                                   \
 38.1643 +    /* The subshadows are split, 64 on each page of the shadow */       \
 38.1644 +    for ( _j = 0; _j < 2 && !__done; _j++ )                             \
 38.1645 +    {                                                                   \
 38.1646 +        void *_sp = sh_map_domain_page(_sl3mfn);                       \
 38.1647 +        for ( _i = 0; _i < 64; _i++ )                                   \
 38.1648 +        {                                                               \
 38.1649 +            /* Every second 32-byte region is a bookkeeping entry */    \
 38.1650 +            _sl3e = (shadow_l3e_t *)(_sp + (64 * _i));                  \
 38.1651 +            if ( (sl3p_to_info(_sl3e))->refcount > 0 )                  \
 38.1652 +                SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p,                   \
 38.1653 +                                        ({ __done = (_done); __done; }), \
 38.1654 +                                        _code);                         \
 38.1655 +            else                                                        \
 38.1656 +                for ( _k = 0 ; _k < 4 ; _k++ )                          \
 38.1657 +                    increment_ptr_to_guest_entry(_gl3p);                \
 38.1658 +            if ( __done ) break;                                        \
 38.1659 +        }                                                               \
 38.1660 +        sh_unmap_domain_page(_sp);                                     \
 38.1661 +        _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1);                             \
 38.1662 +    }                                                                   \
 38.1663 +} while (0)
 38.1664 +
 38.1665 +#elif GUEST_PAGING_LEVELS == 4
 38.1666 +
 38.1667 +/* 64-bit l3: touch all entries */
 38.1668 +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)        \
 38.1669 +do {                                                                    \
 38.1670 +    int _i;                                                             \
 38.1671 +    shadow_l3e_t *_sp = map_shadow_page((_sl3mfn));                     \
 38.1672 +    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask)       \
 38.1673 +           == PGC_SH_l3_64_shadow);                                    \
 38.1674 +    for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ )              \
 38.1675 +    {                                                                   \
 38.1676 +        (_sl3e) = _sp + _i;                                             \
 38.1677 +        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
 38.1678 +            {_code}                                                     \
 38.1679 +        if ( _done ) break;                                             \
 38.1680 +        increment_ptr_to_guest_entry(_gl3p);                            \
 38.1681 +    }                                                                   \
 38.1682 +    unmap_shadow_page(_sp);                                             \
 38.1683 +} while (0)
 38.1684 +
 38.1685 +/* 64-bit l4: avoid Xen mappings */
 38.1686 +#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code)  \
 38.1687 +do {                                                                    \
 38.1688 +    int _i;                                                             \
 38.1689 +    shadow_l4e_t *_sp = map_shadow_page((_sl4mfn));                     \
 38.1690 +    ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask)       \
 38.1691 +           == PGC_SH_l4_64_shadow);                                    \
 38.1692 +    for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ )              \
 38.1693 +    {                                                                   \
 38.1694 +        if ( (!(_xen)) || is_guest_l4_slot(_i) )                        \
 38.1695 +        {                                                               \
 38.1696 +            (_sl4e) = _sp + _i;                                         \
 38.1697 +            if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT )       \
 38.1698 +                {_code}                                                 \
 38.1699 +            if ( _done ) break;                                         \
 38.1700 +        }                                                               \
 38.1701 +        increment_ptr_to_guest_entry(_gl4p);                            \
 38.1702 +    }                                                                   \
 38.1703 +    unmap_shadow_page(_sp);                                             \
 38.1704 +} while (0)
 38.1705 +
 38.1706 +#endif
 38.1707 +
 38.1708 +
 38.1709 +
 38.1710 +/**************************************************************************/
 38.1711 +/* Functions to install Xen mappings and linear mappings in shadow pages */
 38.1712 +
 38.1713 +static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
 38.1714 +
 38.1715 +// XXX -- this function should probably be moved to shadow-common.c, but that
 38.1716 +//        probably wants to wait until the shadow types have been moved from
 38.1717 +//        shadow-types.h to shadow-private.h
 38.1718 +//
 38.1719 +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
 38.1720 +void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
 38.1721 +{
 38.1722 +    struct domain *d = v->domain;
 38.1723 +    shadow_l4e_t *sl4e;
 38.1724 +
 38.1725 +    sl4e = sh_map_domain_page(sl4mfn);
 38.1726 +    ASSERT(sl4e != NULL);
 38.1727 +    ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
 38.1728 +    
 38.1729 +    /* Copy the common Xen mappings from the idle domain */
 38.1730 +    memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
 38.1731 +           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
 38.1732 +           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
 38.1733 +
 38.1734 +    /* Install the per-domain mappings for this domain */
 38.1735 +    sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
 38.1736 +        shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
 38.1737 +                            __PAGE_HYPERVISOR);
 38.1738 +
 38.1739 +    /* Linear mapping */
 38.1740 +    sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
 38.1741 +        shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
 38.1742 +    sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
 38.1743 +        shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
 38.1744 +
 38.1745 +    if ( shadow_mode_translate(v->domain) )
 38.1746 +    {
 38.1747 +        /* install domain-specific P2M table */
 38.1748 +        sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
 38.1749 +            shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
 38.1750 +                                __PAGE_HYPERVISOR);
 38.1751 +    }
 38.1752 +
 38.1753 +    sh_unmap_domain_page(sl4e);    
 38.1754 +}
 38.1755 +#endif
 38.1756 +
 38.1757 +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
 38.1758 +// For 3-on-3 PV guests, we need to make sure the xen mappings are in
 38.1759 +// place, which means that we need to populate the l2h entry in the l3
 38.1760 +// table.
 38.1761 +
 38.1762 +void sh_install_xen_entries_in_l2h(struct vcpu *v, 
 38.1763 +                                    mfn_t sl2hmfn)
 38.1764 +{
 38.1765 +    struct domain *d = v->domain;
 38.1766 +    shadow_l2e_t *sl2e;
 38.1767 +    int i;
 38.1768 +
 38.1769 +    sl2e = sh_map_domain_page(sl2hmfn);
 38.1770 +    ASSERT(sl2e != NULL);
 38.1771 +    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
 38.1772 +    
 38.1773 +    /* Copy the common Xen mappings from the idle domain */
 38.1774 +    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
 38.1775 +           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
 38.1776 +           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
 38.1777 +
 38.1778 +    /* Install the per-domain mappings for this domain */
 38.1779 +    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
 38.1780 +        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
 38.1781 +            shadow_l2e_from_mfn(
 38.1782 +                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
 38.1783 +                __PAGE_HYPERVISOR);
 38.1784 +    
 38.1785 +    /* We don't set up a linear mapping here because we can't until this
 38.1786 +     * l2h is installed in an l3e.  sh_update_linear_entries() handles
 38.1787 +     * the linear mappings when the l3 is loaded. */
 38.1788 +
 38.1789 +    if ( shadow_mode_translate(d) )
 38.1790 +    {
 38.1791 +        /* Install the domain-specific p2m table */
 38.1792 +        l3_pgentry_t *p2m;
 38.1793 +        ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
 38.1794 +        p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
 38.1795 +        for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
 38.1796 +        {
 38.1797 +            sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
 38.1798 +                shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
 38.1799 +                                    __PAGE_HYPERVISOR);
 38.1800 +        }
 38.1801 +        sh_unmap_domain_page(p2m);
 38.1802 +    }
 38.1803 +    
 38.1804 +    sh_unmap_domain_page(sl2e);
 38.1805 +}
 38.1806 +
 38.1807 +void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
 38.1808 +{
 38.1809 +    shadow_l3e_t *sl3e;
 38.1810 +    guest_l3e_t *gl3e = v->arch.guest_vtable;
 38.1811 +    shadow_l3e_t new_sl3e;
 38.1812 +    gfn_t l2gfn;
 38.1813 +    mfn_t l2gmfn, l2smfn;
 38.1814 +    int r;
 38.1815 +
 38.1816 +    ASSERT(!shadow_mode_external(v->domain));
 38.1817 +    ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
 38.1818 +    l2gfn = guest_l3e_get_gfn(gl3e[3]);
 38.1819 +    l2gmfn = sh_gfn_to_mfn(v->domain, gfn_x(l2gfn));
 38.1820 +    l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow);
 38.1821 +    if ( !valid_mfn(l2smfn) )
 38.1822 +    {
 38.1823 +        l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow);
 38.1824 +    }
 38.1825 +    l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
 38.1826 +                             ft_prefetch);
 38.1827 +    sl3e = sh_map_domain_page(sl3mfn);
 38.1828 +    r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
 38.1829 +    sh_unmap_domain_page(sl3e);
 38.1830 +}
 38.1831 +#endif
 38.1832 +
 38.1833 +
 38.1834 +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
 38.1835 +void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
 38.1836 +{
 38.1837 +    struct domain *d = v->domain;
 38.1838 +    shadow_l2e_t *sl2e;
 38.1839 +    int i;
 38.1840 +
 38.1841 +    sl2e = sh_map_domain_page(sl2mfn);
 38.1842 +    ASSERT(sl2e != NULL);
 38.1843 +    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
 38.1844 +    
 38.1845 +    /* Copy the common Xen mappings from the idle domain */
 38.1846 +    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
 38.1847 +           &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
 38.1848 +           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
 38.1849 +
 38.1850 +    /* Install the per-domain mappings for this domain */
 38.1851 +    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
 38.1852 +        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
 38.1853 +            shadow_l2e_from_mfn(
 38.1854 +                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
 38.1855 +                __PAGE_HYPERVISOR);
 38.1856 +
 38.1857 +    /* Linear mapping */
 38.1858 +    sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
 38.1859 +        shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
 38.1860 +    sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
 38.1861 +        shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
 38.1862 +
 38.1863 +    if ( shadow_mode_translate(d) )
 38.1864 +    {
 38.1865 +        /* install domain-specific P2M table */
 38.1866 +        sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
 38.1867 +            shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
 38.1868 +                                __PAGE_HYPERVISOR);
 38.1869 +    }
 38.1870 +
 38.1871 +    sh_unmap_domain_page(sl2e);
 38.1872 +}
 38.1873 +#endif
 38.1874 +
 38.1875 +
 38.1876 +
 38.1877 +
 38.1878 +
 38.1879 +/**************************************************************************/
 38.1880 +/* Create a shadow of a given guest page.
 38.1881 + */
 38.1882 +static mfn_t
 38.1883 +sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
 38.1884 +{
 38.1885 +    mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
 38.1886 +    SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
 38.1887 +                  mfn_x(gmfn), shadow_type, mfn_x(smfn));
 38.1888 +
 38.1889 +    if ( shadow_type != PGC_SH_guest_root_type )
 38.1890 +        /* Lower-level shadow, not yet linked from a higher level */
 38.1891 +        mfn_to_page(smfn)->up = 0;
 38.1892 +
 38.1893 +    // Create the Xen mappings...
 38.1894 +    if ( !shadow_mode_external(v->domain) )
 38.1895 +    {
 38.1896 +        switch (shadow_type) 
 38.1897 +        {
 38.1898 +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
 38.1899 +        case PGC_SH_l4_shadow:
 38.1900 +            sh_install_xen_entries_in_l4(v, gmfn, smfn); break;
 38.1901 +#endif
 38.1902 +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
 38.1903 +        case PGC_SH_l3_shadow:
 38.1904 +            sh_install_xen_entries_in_l3(v, gmfn, smfn); break;
 38.1905 +        case PGC_SH_l2h_shadow:
 38.1906 +            sh_install_xen_entries_in_l2h(v, smfn); break;
 38.1907 +#endif
 38.1908 +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
 38.1909 +        case PGC_SH_l2_shadow:
 38.1910 +            sh_install_xen_entries_in_l2(v, gmfn, smfn); break;
 38.1911 +#endif
 38.1912 +        default: /* Do nothing */ break;
 38.1913 +        }
 38.1914 +    }
 38.1915 +    
 38.1916 +    shadow_promote(v, gmfn, shadow_type);
 38.1917 +    set_shadow_status(v, gmfn, shadow_type, smfn);
 38.1918 +
 38.1919 +    return smfn;
 38.1920 +}
 38.1921 +
 38.1922 +/* Make a splintered superpage shadow */
 38.1923 +static mfn_t
 38.1924 +make_fl1_shadow(struct vcpu *v, gfn_t gfn)
 38.1925 +{
 38.1926 +    mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow,
 38.1927 +                               (unsigned long) gfn_x(gfn));
 38.1928 +
 38.1929 +    SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n",
 38.1930 +                  gfn_x(gfn), mfn_x(smfn));
 38.1931 +
 38.1932 +    set_fl1_shadow_status(v, gfn, smfn);
 38.1933 +    return smfn;
 38.1934 +}
 38.1935 +
 38.1936 +
 38.1937 +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
 38.1938 +mfn_t
 38.1939 +sh_make_monitor_table(struct vcpu *v)
 38.1940 +{
 38.1941 +
 38.1942 +    ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
 38.1943 +    
 38.1944 +#if CONFIG_PAGING_LEVELS == 4    
 38.1945 +    {
 38.1946 +        struct domain *d = v->domain;
 38.1947 +        mfn_t m4mfn;
 38.1948 +        m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 38.1949 +        sh_install_xen_entries_in_l4(v, m4mfn, m4mfn);
 38.1950 +        /* Remember the level of this table */
 38.1951 +        mfn_to_page(m4mfn)->shadow_flags = 4;
 38.1952 +#if SHADOW_PAGING_LEVELS < 4
 38.1953 +        // Install a monitor l3 table in slot 0 of the l4 table.
 38.1954 +        // This is used for shadow linear maps.
 38.1955 +        {
 38.1956 +            mfn_t m3mfn; 
 38.1957 +            l4_pgentry_t *l4e;
 38.1958 +            m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 38.1959 +            mfn_to_page(m3mfn)->shadow_flags = 3;
 38.1960 +            l4e = sh_map_domain_page(m4mfn);
 38.1961 +            l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
 38.1962 +            sh_unmap_domain_page(l4e);
 38.1963 +        }
 38.1964 +#endif /* SHADOW_PAGING_LEVELS < 4 */
 38.1965 +        return m4mfn;
 38.1966 +    }
 38.1967 +
 38.1968 +#elif CONFIG_PAGING_LEVELS == 3
 38.1969 +
 38.1970 +    {
 38.1971 +        struct domain *d = v->domain;
 38.1972 +        mfn_t m3mfn, m2mfn; 
 38.1973 +        l3_pgentry_t *l3e;
 38.1974 +        l2_pgentry_t *l2e;
 38.1975 +        int i;
 38.1976 +
 38.1977 +        m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 38.1978 +        /* Remember the level of this table */
 38.1979 +        mfn_to_page(m3mfn)->shadow_flags = 3;
 38.1980 +
 38.1981 +        // Install a monitor l2 table in slot 3 of the l3 table.
 38.1982 +        // This is used for all Xen entries, including linear maps
 38.1983 +        m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 38.1984 +        mfn_to_page(m2mfn)->shadow_flags = 2;
 38.1985 +        l3e = sh_map_domain_page(m3mfn);
 38.1986 +        l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
 38.1987 +        sh_install_xen_entries_in_l2h(v, m2mfn);
 38.1988 +        /* Install the monitor's own linear map */
 38.1989 +        l2e = sh_map_domain_page(m2mfn);
 38.1990 +        for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
 38.1991 +            l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
 38.1992 +                (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) 
 38.1993 +                ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) 
 38.1994 +                : l2e_empty();
 38.1995 +        sh_unmap_domain_page(l2e);
 38.1996 +        sh_unmap_domain_page(l3e);
 38.1997 +
 38.1998 +        SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
 38.1999 +        return m3mfn;
 38.2000 +    }
 38.2001 +
 38.2002 +#elif CONFIG_PAGING_LEVELS == 2
 38.2003 +
 38.2004 +    {
 38.2005 +        struct domain *d = v->domain;
 38.2006 +        mfn_t m2mfn;
 38.2007 +        m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0);
 38.2008 +        sh_install_xen_entries_in_l2(v, m2mfn, m2mfn);
 38.2009 +        /* Remember the level of this table */
 38.2010 +        mfn_to_page(m2mfn)->shadow_flags = 2;
 38.2011 +        return m2mfn;
 38.2012 +    }
 38.2013 +
 38.2014 +#else
 38.2015 +#error this should not happen
 38.2016 +#endif /* CONFIG_PAGING_LEVELS */
 38.2017 +}
 38.2018 +#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
 38.2019 +
 38.2020 +/**************************************************************************/
 38.2021 +/* These functions also take a virtual address and return the level-N
 38.2022 + * shadow table mfn and entry, but they create the shadow pagetables if
 38.2023 + * they are needed.  The "demand" argument is non-zero when handling
 38.2024 + * a demand fault (so we know what to do about accessed bits &c).
 38.2025 + * If the necessary tables are not present in the guest, they return NULL. */
 38.2026 +#if GUEST_PAGING_LEVELS >= 4
 38.2027 +static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, 
 38.2028 +                                                walk_t *gw, 
 38.2029 +                                                mfn_t *sl4mfn)
 38.2030 +{
 38.2031 +    /* There is always a shadow of the top level table.  Get it. */
 38.2032 +    *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
 38.2033 +    /* Reading the top level table is always valid. */
 38.2034 +    return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
 38.2035 +}
 38.2036 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 38.2037 +
 38.2038 +
 38.2039 +#if GUEST_PAGING_LEVELS >= 3
 38.2040 +static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, 
 38.2041 +                                                walk_t *gw, 
 38.2042 +                                                mfn_t *sl3mfn,
 38.2043 +                                                fetch_type_t ft)
 38.2044 +{
 38.2045 +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
 38.2046 +    mfn_t sl4mfn;
 38.2047 +    shadow_l4e_t *sl4e;
 38.2048 +    if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
 38.2049 +    /* Get the l4e */
 38.2050 +    sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
 38.2051 +    ASSERT(sl4e != NULL);
 38.2052 +    if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
 38.2053 +    {
 38.2054 +        *sl3mfn = shadow_l4e_get_mfn(*sl4e);
 38.2055 +        ASSERT(valid_mfn(*sl3mfn));
 38.2056 +    } 
 38.2057 +    else 
 38.2058 +    {
 38.2059 +        int r;
 38.2060 +        shadow_l4e_t new_sl4e;
 38.2061 +        /* No l3 shadow installed: find and install it. */
 38.2062 +        *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow);
 38.2063 +        if ( !valid_mfn(*sl3mfn) ) 
 38.2064 +        {
 38.2065 +            /* No l3 shadow of this page exists at all: make one. */
 38.2066 +            *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow);
 38.2067 +        }
 38.2068 +        /* Install the new sl3 table in the sl4e */
 38.2069 +        l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, 
 38.2070 +                                 *sl3mfn, &new_sl4e, ft);
 38.2071 +        r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
 38.2072 +        ASSERT((r & SHADOW_SET_FLUSH) == 0);
 38.2073 +    }
 38.2074 +    /* Now follow it down a level.  Guaranteed to succeed. */
 38.2075 +    return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
 38.2076 +#else /* PAE... */
 38.2077 +    /* There is always a shadow of the top level table.  Get it. */
 38.2078 +    *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
 38.2079 +    /* This next line is important: the shadow l3 table is in an 8k
 38.2080 +     * shadow and we need to return the right mfn of the pair. This call
 38.2081 +     * will set it for us as a side-effect. */
 38.2082 +    (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
 38.2083 +    ASSERT(v->arch.shadow_vtable);
 38.2084 +    return ((shadow_l3e_t *)v->arch.shadow_vtable) 
 38.2085 +        + shadow_l3_table_offset(gw->va);
 38.2086 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 38.2087 +}
 38.2088 +#endif /* GUEST_PAGING_LEVELS >= 3 */
 38.2089 +
 38.2090 +
 38.2091 +static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, 
 38.2092 +                                                walk_t *gw, 
 38.2093 +                                                mfn_t *sl2mfn,
 38.2094 +                                                fetch_type_t ft)
 38.2095 +{
 38.2096 +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
 38.2097 +    mfn_t sl3mfn = _mfn(INVALID_MFN);
 38.2098 +    shadow_l3e_t *sl3e;
 38.2099 +    if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
 38.2100 +    /* Get the l3e */
 38.2101 +    sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
 38.2102 +    ASSERT(sl3e != NULL);  /* Since we know guest PT is valid this far */
 38.2103 +    if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
 38.2104 +    {
 38.2105 +        *sl2mfn = shadow_l3e_get_mfn(*sl3e);
 38.2106 +        ASSERT(valid_mfn(*sl2mfn));
 38.2107 +    } 
 38.2108 +    else 
 38.2109 +    {
 38.2110 +        int r;
 38.2111 +        shadow_l3e_t new_sl3e;
 38.2112 +        /* No l2 shadow installed: find and install it. */
 38.2113 +        *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow);
 38.2114 +        if ( !valid_mfn(*sl2mfn) ) 
 38.2115 +        {
 38.2116 +            /* No l2 shadow of this page exists at all: make one. */
 38.2117 +            *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow);
 38.2118 +        }
 38.2119 +        /* Install the new sl2 table in the sl3e */
 38.2120 +        l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, 
 38.2121 +                                 *sl2mfn, &new_sl3e, ft);
 38.2122 +        r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
 38.2123 +        ASSERT((r & SHADOW_SET_FLUSH) == 0);
 38.2124 +#if GUEST_PAGING_LEVELS == 3 
 38.2125 +        /* Need to sync up the linear maps, as we are about to use them */
 38.2126 +        ASSERT( r & SHADOW_SET_L3PAE_RECOPY );
 38.2127 +        sh_pae_recopy(v->domain);
 38.2128 +#endif
 38.2129 +    }
 38.2130 +    /* Now follow it down a level.  Guaranteed to succeed. */
 38.2131 +    return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
 38.2132 +#else /* 32bit... */
 38.2133 +    /* There is always a shadow of the top level table.  Get it. */
 38.2134 +    *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
 38.2135 +    /* This next line is important: the guest l2 has a 16k
 38.2136 +     * shadow, and we need to return the right mfn of the four. This
 38.2137 +     * call will set it for us as a side-effect. */
 38.2138 +    (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
 38.2139 +    /* Reading the top level table is always valid. */
 38.2140 +    return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
 38.2141 +#endif 
 38.2142 +}
 38.2143 +
 38.2144 +
 38.2145 +static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, 
 38.2146 +                                                walk_t *gw, 
 38.2147 +                                                mfn_t *sl1mfn,
 38.2148 +                                                fetch_type_t ft)
 38.2149 +{
 38.2150 +    mfn_t sl2mfn;
 38.2151 +    shadow_l2e_t *sl2e;
 38.2152 +
 38.2153 +    /* Get the l2e */
 38.2154 +    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
 38.2155 +    if ( sl2e == NULL ) return NULL;
 38.2156 +    if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) 
 38.2157 +    {
 38.2158 +        *sl1mfn = shadow_l2e_get_mfn(*sl2e);
 38.2159 +        ASSERT(valid_mfn(*sl1mfn));
 38.2160 +    } 
 38.2161 +    else 
 38.2162 +    {
 38.2163 +        shadow_l2e_t new_sl2e;
 38.2164 +        int r, flags = guest_l2e_get_flags(*gw->l2e);
 38.2165 +        /* No l1 shadow installed: find and install it. */
 38.2166 +        if ( !(flags & _PAGE_PRESENT) )
 38.2167 +            return NULL; /* No guest page. */
 38.2168 +        if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) 
 38.2169 +        {
 38.2170 +            /* Splintering a superpage */
 38.2171 +            gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
 38.2172 +            *sl1mfn = get_fl1_shadow_status(v, l2gfn);
 38.2173 +            if ( !valid_mfn(*sl1mfn) ) 
 38.2174 +            {
 38.2175 +                /* No fl1 shadow of this superpage exists at all: make one. */
 38.2176 +                *sl1mfn = make_fl1_shadow(v, l2gfn);
 38.2177 +            }
 38.2178 +        } 
 38.2179 +        else 
 38.2180 +        {
 38.2181 +            /* Shadowing an actual guest l1 table */
 38.2182 +            if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
 38.2183 +            *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow);
 38.2184 +            if ( !valid_mfn(*sl1mfn) ) 
 38.2185 +            {
 38.2186 +                /* No l1 shadow of this page exists at all: make one. */
 38.2187 +                *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow);
 38.2188 +            }
 38.2189 +        }
 38.2190 +        /* Install the new sl1 table in the sl2e */
 38.2191 +        l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, 
 38.2192 +                                 *sl1mfn, &new_sl2e, ft);
 38.2193 +        r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
 38.2194 +        ASSERT((r & SHADOW_SET_FLUSH) == 0);        
 38.2195 +        /* This next line is important: in 32-on-PAE and 32-on-64 modes,
 38.2196 +         * the guest l1 table has an 8k shadow, and we need to return
 38.2197 +         * the right mfn of the pair. This call will set it for us as a
 38.2198 +         * side-effect.  (In all other cases, it's a no-op and will be
 38.2199 +         * compiled out.) */
 38.2200 +        (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
 38.2201 +    }
 38.2202 +    /* Now follow it down a level.  Guaranteed to succeed. */
 38.2203 +    return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
 38.2204 +}
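
These get-and-create helpers are intended to be driven from the fault path: a guest-pagetable walk fills in a walk_t, the helpers build any missing shadow levels, and the caller then propagates and writes the final l1e. A hedged sketch of that pattern follows (the real consumer is the page-fault handler elsewhere in this file; handle_demand_fault and its error handling are illustrative, and the sketch assumes gw->l1e points at an ordinary guest l1e rather than a superpage):

    /* Illustrative only: install one l1e in response to a demand fault.
     * Assumes 'gw' was already filled in by the guest pagetable walk. */
    static int handle_demand_fault(struct vcpu *v, walk_t *gw, fetch_type_t ft)
    {
        mfn_t sl1mfn, gmfn;
        shadow_l1e_t new_sl1e, *ptr_sl1e;

        /* Build (or look up) shadows down to l1; NULL means the guest's
         * own tables are not present, so the fault belongs to the guest. */
        ptr_sl1e = shadow_get_and_create_l1e(v, gw, &sl1mfn, ft);
        if ( ptr_sl1e == NULL )
            return 0;

        /* Translate the guest l1e and write it into the shadow. */
        gmfn = vcpu_gfn_to_mfn(v, guest_l1e_get_gfn(*gw->l1e));
        l1e_propagate_from_guest(v, *gw->l1e, &new_sl1e,
                                 /* mmio? */ !valid_mfn(gmfn));
        return shadow_set_l1e(v, ptr_sl1e, new_sl1e, sl1mfn);
    }
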
 38.2205 +
 38.2206 +
 38.2207 +
 38.2208 +/**************************************************************************/
 38.2209 +/* Destructors for shadow tables: 
 38.2210 + * Unregister the shadow, decrement refcounts of any entries present in it,
 38.2211 + * and release the memory.
 38.2212 + *
 38.2213 + * N.B. These destructors do not clear the contents of the shadows.
 38.2214 + *      This allows us to delay TLB shootdowns until the page is being reused.
 38.2215 + *      See shadow_alloc() and shadow_free() for how this is handled.
 38.2216 + */
 38.2217 +
 38.2218 +#if GUEST_PAGING_LEVELS >= 4
 38.2219 +void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
 38.2220 +{
 38.2221 +    shadow_l4e_t *sl4e;
 38.2222 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 38.2223 +    mfn_t gmfn, sl4mfn;
 38.2224 +    int xen_mappings;
 38.2225 +
 38.2226 +    SHADOW_DEBUG(DESTROY_SHADOW,
 38.2227 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 38.2228 +    ASSERT(t == PGC_SH_l4_shadow);
 38.2229 +
 38.2230 +    /* Record that the guest page isn't shadowed any more (in this type) */
 38.2231 +    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 38.2232 +    delete_shadow_status(v, gmfn, t, smfn);
 38.2233 +    shadow_demote(v, gmfn, t);
 38.2234 +    /* Take this shadow off the list of root shadows */
 38.2235 +    list_del_init(&mfn_to_page(smfn)->list);
 38.2236 +
 38.2237 +    /* Decrement refcounts of all the old entries */
 38.2238 +    xen_mappings = (!shadow_mode_external(v->domain));
 38.2239 +    sl4mfn = smfn; 
 38.2240 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
 38.2241 +        if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
 38.2242 +        {
 38.2243 +            sh_put_ref(v, shadow_l4e_get_mfn(*sl4e),
 38.2244 +                        (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
 38.2245 +                        | ((unsigned long)sl4e & ~PAGE_MASK));
 38.2246 +        }
 38.2247 +    });
 38.2248 +    
 38.2249 +    /* Put the memory back in the pool */
 38.2250 +    shadow_free(v->domain, smfn);
 38.2251 +}
 38.2252 +#endif    
 38.2253 +
 38.2254 +#if GUEST_PAGING_LEVELS >= 3
 38.2255 +void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
 38.2256 +{
 38.2257 +    shadow_l3e_t *sl3e;
 38.2258 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 38.2259 +    mfn_t gmfn, sl3mfn;
 38.2260 +
 38.2261 +    SHADOW_DEBUG(DESTROY_SHADOW,
 38.2262 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 38.2263 +    ASSERT(t == PGC_SH_l3_shadow);
 38.2264 +
 38.2265 +    /* Record that the guest page isn't shadowed any more (in this type) */
 38.2266 +    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 38.2267 +    delete_shadow_status(v, gmfn, t, smfn);
 38.2268 +    shadow_demote(v, gmfn, t);
 38.2269 +#if GUEST_PAGING_LEVELS == 3
 38.2270 +    /* Take this shadow off the list of root shadows */
 38.2271 +    list_del_init(&mfn_to_page(smfn)->list);
 38.2272 +#endif
 38.2273 +
 38.2274 +    /* Decrement refcounts of all the old entries */
 38.2275 +    sl3mfn = smfn; 
 38.2276 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
 38.2277 +        if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
 38.2278 +            sh_put_ref(v, shadow_l3e_get_mfn(*sl3e),
 38.2279 +                        (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) 
 38.2280 +                        | ((unsigned long)sl3e & ~PAGE_MASK));
 38.2281 +    });
 38.2282 +
 38.2283 +    /* Put the memory back in the pool */
 38.2284 +    shadow_free(v->domain, smfn);
 38.2285 +}
 38.2286 +#endif    
 38.2287 +
 38.2288 +
 38.2289 +#if GUEST_PAGING_LEVELS == 3
 38.2290 +static void sh_destroy_l3_subshadow(struct vcpu *v, 
 38.2291 +                                     shadow_l3e_t *sl3e)
 38.2292 +/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
 38.2293 +{
 38.2294 +    int i;
 38.2295 +    ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); 
 38.2296 +    for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) 
 38.2297 +        if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) 
 38.2298 +            sh_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
 38.2299 +                        maddr_from_mapped_domain_page(sl3e));
 38.2300 +}
 38.2301 +#endif
 38.2302 +
 38.2303 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 38.2304 +void sh_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
 38.2305 +/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
 38.2306 +{
 38.2307 +    int i, j;
 38.2308 +    struct pae_l3_bookkeeping *bk;
 38.2309 +    
 38.2310 +    ASSERT((mfn_to_page(smfn)->count_info & PGC_SH_type_mask) 
 38.2311 +           == PGC_SH_l3_pae_shadow);
 38.2312 +    /* The subshadows are split, 64 on each page of the shadow */
 38.2313 +    for ( i = 0; i < 2; i++ ) 
 38.2314 +    {
 38.2315 +        void *p = sh_map_domain_page(_mfn(mfn_x(smfn) + i));
 38.2316 +        for ( j = 0; j < 64; j++ )
 38.2317 +        {
 38.2318 +            /* Every second 32-byte region is a bookkeeping entry */
 38.2319 +            bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
 38.2320 +            if ( bk->pinned )
 38.2321 +                sh_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
 38.2322 +            /* Check whether we've just freed the whole shadow */
 38.2323 +            if ( (mfn_to_page(smfn)->count_info & PGC_SH_count_mask) == 0 ) 
 38.2324 +            {
 38.2325 +                sh_unmap_domain_page(p);
 38.2326 +                return;
 38.2327 +            }
 38.2328 +        }
 38.2329 +        sh_unmap_domain_page(p);
 38.2330 +    }
 38.2331 +}
 38.2332 +#endif
 38.2333 +
 38.2334 +void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
 38.2335 +{
 38.2336 +    shadow_l2e_t *sl2e;
 38.2337 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 38.2338 +    mfn_t gmfn, sl2mfn;
 38.2339 +    int xen_mappings;
 38.2340 +
 38.2341 +    SHADOW_DEBUG(DESTROY_SHADOW,
 38.2342 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 38.2343 +    ASSERT(t == PGC_SH_l2_shadow 
 38.2344 +           || t == PGC_SH_l2h_pae_shadow);
 38.2345 +
 38.2346 +    /* Record that the guest page isn't shadowed any more (in this type) */
 38.2347 +    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 38.2348 +    delete_shadow_status(v, gmfn, t, smfn);
 38.2349 +    shadow_demote(v, gmfn, t);
 38.2350 +#if GUEST_PAGING_LEVELS == 2
 38.2351 +    /* Take this shadow off the list of root shadows */
 38.2352 +    list_del_init(&mfn_to_page(smfn)->list);
 38.2353 +#endif
 38.2354 +
 38.2355 +    /* Decrement refcounts of all the old entries */
 38.2356 +    sl2mfn = smfn;
 38.2357 +    xen_mappings = (!shadow_mode_external(v->domain) &&
 38.2358 +                    ((GUEST_PAGING_LEVELS == 2) ||
 38.2359 +                     ((GUEST_PAGING_LEVELS == 3) &&
 38.2360 +                      (t == PGC_SH_l2h_pae_shadow))));
 38.2361 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
 38.2362 +        if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) 
 38.2363 +            sh_put_ref(v, shadow_l2e_get_mfn(*sl2e),
 38.2364 +                        (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) 
 38.2365 +                        | ((unsigned long)sl2e & ~PAGE_MASK));
 38.2366 +    });
 38.2367 +
 38.2368 +    /* Put the memory back in the pool */
 38.2369 +    shadow_free(v->domain, smfn);
 38.2370 +}
 38.2371 +
 38.2372 +void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
 38.2373 +{
 38.2374 +    struct domain *d = v->domain;
 38.2375 +    shadow_l1e_t *sl1e;
 38.2376 +    u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask;
 38.2377 +
 38.2378 +    SHADOW_DEBUG(DESTROY_SHADOW,
 38.2379 +                  "%s(%05lx)\n", __func__, mfn_x(smfn));
 38.2380 +    ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow);
 38.2381 +
 38.2382 +    /* Record that the guest page isn't shadowed any more (in this type) */
 38.2383 +    if ( t == PGC_SH_fl1_shadow )
 38.2384 +    {
 38.2385 +        gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
 38.2386 +        delete_fl1_shadow_status(v, gfn, smfn);
 38.2387 +    }
 38.2388 +    else 
 38.2389 +    {
 38.2390 +        mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
 38.2391 +        delete_shadow_status(v, gmfn, t, smfn);
 38.2392 +        shadow_demote(v, gmfn, t);
 38.2393 +    }
 38.2394 +    
 38.2395 +    if ( shadow_mode_refcounts(d) )
 38.2396 +    {
 38.2397 +        /* Decrement refcounts of all the old entries */
 38.2398 +        mfn_t sl1mfn = smfn; 
 38.2399 +        SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
 38.2400 +            if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) 
 38.2401 +                shadow_put_page_from_l1e(*sl1e, d);
 38.2402 +        });
 38.2403 +    }
 38.2404 +    
 38.2405 +    /* Put the memory back in the pool */
 38.2406 +    shadow_free(v->domain, smfn);
 38.2407 +}
 38.2408 +
 38.2409 +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
 38.2410 +void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
 38.2411 +{
 38.2412 +    struct domain *d = v->domain;
 38.2413 +    ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask)
 38.2414 +           == PGC_SH_monitor_table);
 38.2415 +
 38.2416 +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
 38.2417 +    /* Need to destroy the l3 monitor page in slot 0 too */
 38.2418 +    {
 38.2419 +        l4_pgentry_t *l4e = sh_map_domain_page(mmfn);
 38.2420 +        ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
 38.2421 +        shadow_free(d, _mfn(l4e_get_pfn(l4e[0])));
 38.2422 +        sh_unmap_domain_page(l4e);
 38.2423 +    }
 38.2424 +#elif CONFIG_PAGING_LEVELS == 3
 38.2425 +    /* Need to destroy the l2 monitor page in slot 3 too */
 38.2426 +    {
 38.2427 +        l3_pgentry_t *l3e = sh_map_domain_page(mmfn);
 38.2428 +        ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
 38.2429 +        shadow_free(d, _mfn(l3e_get_pfn(l3e[3])));
 38.2430 +        sh_unmap_domain_page(l3e);
 38.2431 +    }
 38.2432 +#endif
 38.2433 +
 38.2434 +    /* Put the memory back in the pool */
 38.2435 +    shadow_free(d, mmfn);
 38.2436 +}
 38.2437 +#endif
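
One detail common to the destructors above: the second argument to sh_put_ref() is a packed back-pointer to the referencing entry, built from the shadow page's machine frame and the entry's byte offset within that page. Written out as a helper it is just the following (entry_maddr() is a hypothetical name, not part of this patch; the destructors open-code the same expression at each call site):

    /* Illustrative only: the back-pointer passed to sh_put_ref() above. */
    static inline paddr_t entry_maddr(mfn_t smfn, void *entry)
    {
        return (((paddr_t)mfn_x(smfn)) << PAGE_SHIFT)
               | ((unsigned long)entry & ~PAGE_MASK);
    }
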
 38.2438 +
 38.2439 +/**************************************************************************/
 38.2440 +/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
 38.2441 + * These are called from common code when we are running out of shadow
 38.2442 + * memory, and unpinning all the top-level shadows hasn't worked. 
 38.2443 + *
 38.2444 + * This implementation is pretty crude and slow, but we hope that it won't 
 38.2445 + * be called very often. */
 38.2446 +
 38.2447 +#if GUEST_PAGING_LEVELS == 2
 38.2448 +
 38.2449 +void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
 38.2450 +{    
 38.2451 +    shadow_l2e_t *sl2e;
 38.2452 +    int xen_mappings = !shadow_mode_external(v->domain);
 38.2453 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
 38.2454 +        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
 38.2455 +    });
 38.2456 +}
 38.2457 +
 38.2458 +#elif GUEST_PAGING_LEVELS == 3
 38.2459 +
 38.2460 +void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
 38.2461 +/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
 38.2462 +{
 38.2463 +    shadow_l3e_t *sl3e;
 38.2464 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
 38.2465 +        if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
 38.2466 +            mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
 38.2467 +            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) 
 38.2468 +                 == PGC_SH_l2h_pae_shadow ) 
 38.2469 +            {
 38.2470 +                /* High l2: need to pick particular l2es to unhook */
 38.2471 +                shadow_l2e_t *sl2e;
 38.2472 +                SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
 38.2473 +                    (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
 38.2474 +                });
 38.2475 +            }
 38.2476 +            else
 38.2477 +            {
 38.2478 +                /* Normal l2: can safely unhook the whole l3e */
 38.2479 +                (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
 38.2480 +            }
 38.2481 +        }
 38.2482 +    });
 38.2483 +    /* We've changed PAE L3 entries: must sync up various copies of them */
 38.2484 +    sh_pae_recopy(v->domain);
 38.2485 +}
 38.2486 +
 38.2487 +#elif GUEST_PAGING_LEVELS == 4
 38.2488 +
 38.2489 +void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
 38.2490 +{
 38.2491 +    shadow_l4e_t *sl4e;
 38.2492 +    int xen_mappings = !shadow_mode_external(v->domain);
 38.2493 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
 38.2494 +        (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
 38.2495 +    });
 38.2496 +}
 38.2497 +
 38.2498 +#endif
 38.2499 +
 38.2500 +/**************************************************************************/
 38.2501 +/* Internal translation functions.
 38.2502 + * These functions require a pointer to the shadow entry that will be updated.
 38.2503 + */
 38.2504 +
 38.2505 +/* These functions take a new guest entry, translate it to shadow and write 
 38.2506 + * the shadow entry.
 38.2507 + *
 38.2508 + * They return the same bitmaps as the shadow_set_lXe() functions.
 38.2509 + */
 38.2510 +
 38.2511 +#if GUEST_PAGING_LEVELS >= 4
 38.2512 +static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
 38.2513 +{
 38.2514 +    shadow_l4e_t new_sl4e;
 38.2515 +    guest_l4e_t *new_gl4e = new_ge;
 38.2516 +    shadow_l4e_t *sl4p = se;
 38.2517 +    mfn_t sl3mfn = _mfn(INVALID_MFN);
 38.2518 +    int result = 0;
 38.2519 +
 38.2520 +    perfc_incrc(shadow_validate_gl4e_calls);
 38.2521 +
 38.2522 +    if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
 38.2523 +    {
 38.2524 +        gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
 38.2525 +        mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
 38.2526 +        if ( valid_mfn(gl3mfn) )
 38.2527 +            sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow);
 38.2528 +        else
 38.2529 +            result |= SHADOW_SET_ERROR;
 38.2530 +    }
 38.2531 +    l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
 38.2532 +                             sl3mfn, &new_sl4e, ft_prefetch);
 38.2533 +    result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
 38.2534 +    return result;
 38.2535 +}
 38.2536 +#endif // GUEST_PAGING_LEVELS >= 4
 38.2537 +
 38.2538 +#if GUEST_PAGING_LEVELS >= 3
 38.2539 +static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
 38.2540 +{
 38.2541 +    shadow_l3e_t new_sl3e;
 38.2542 +    guest_l3e_t *new_gl3e = new_ge;
 38.2543 +    shadow_l3e_t *sl3p = se;
 38.2544 +    mfn_t sl2mfn = _mfn(INVALID_MFN);
 38.2545 +    int result = 0;
 38.2546 +
 38.2547 +    perfc_incrc(shadow_validate_gl3e_calls);
 38.2548 +
 38.2549 +    if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
 38.2550 +    {
 38.2551 +        gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
 38.2552 +        mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
 38.2553 +        if ( valid_mfn(gl2mfn) )
 38.2554 +            sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow);
 38.2555 +        else
 38.2556 +            result |= SHADOW_SET_ERROR;
 38.2557 +    }
 38.2558 +    l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), 
 38.2559 +                             sl2mfn, &new_sl3e, ft_prefetch);
 38.2560 +    result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
 38.2561 +
 38.2562 +#if GUEST_PAGING_LEVELS == 3
 38.2563 +    /* We have changed a PAE l3 entry: need to sync up the possible copies 
 38.2564 +     * of it */
 38.2565 +    if ( result & SHADOW_SET_L3PAE_RECOPY )
 38.2566 +        sh_pae_recopy(v->domain);
 38.2567 +#endif
 38.2568 +
 38.2569 +    return result;
 38.2570 +}
 38.2571 +#endif // GUEST_PAGING_LEVELS >= 3
 38.2572 +
 38.2573 +static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
 38.2574 +{
 38.2575 +    shadow_l2e_t new_sl2e;
 38.2576 +    guest_l2e_t *new_gl2e = new_ge;
 38.2577 +    shadow_l2e_t *sl2p = se;
 38.2578 +    mfn_t sl1mfn = _mfn(INVALID_MFN);
 38.2579 +    int result = 0;
 38.2580 +
 38.2581 +    perfc_incrc(shadow_validate_gl2e_calls);
 38.2582 +
 38.2583 +    if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
 38.2584 +    {
 38.2585 +        gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
 38.2586 +        if ( guest_supports_superpages(v) &&
 38.2587 +             (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
 38.2588 +        {
 38.2589 +            // superpage -- need to look up the shadow L1 which holds the
 38.2590 +            // splintered entries (the fl1 shadow)...
 38.2591 +            sl1mfn = get_fl1_shadow_status(v, gl1gfn);
 38.2592 +#if 0
 38.2593 +            // XXX - it's possible that we want to do some kind of prefetch
 38.2594 +            // for superpage fl1's here, but this is *not* on the demand path,
 38.2595 +            // so we'll hold off trying that for now...
 38.2596 +            //
 38.2597 +            if ( !valid_mfn(sl1mfn) )
 38.2598 +                sl1mfn = make_fl1_shadow(v, gl1gfn);
 38.2599 +#endif
 38.2600 +        }
 38.2601 +        else
 38.2602 +        {
 38.2603 +            mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
 38.2604 +            if ( valid_mfn(gl1mfn) )
 38.2605 +                sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow);
 38.2606 +            else
 38.2607 +                result |= SHADOW_SET_ERROR;
 38.2608 +        }
 38.2609 +    }
 38.2610 +    l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
 38.2611 +                             sl1mfn, &new_sl2e, ft_prefetch);
 38.2612 +    result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
 38.2613 +
 38.2614 +    return result;
 38.2615 +}
 38.2616 +
 38.2617 +static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
 38.2618 +{
 38.2619 +    shadow_l1e_t new_sl1e;
 38.2620 +    guest_l1e_t *new_gl1e = new_ge;
 38.2621 +    shadow_l1e_t *sl1p = se;
 38.2622 +    gfn_t gfn;
 38.2623 +    mfn_t mfn;
 38.2624 +    int result = 0;
 38.2625 +
 38.2626 +    perfc_incrc(shadow_validate_gl1e_calls);
 38.2627 +
 38.2628 +    gfn = guest_l1e_get_gfn(*new_gl1e);
 38.2629 +    mfn = vcpu_gfn_to_mfn(v, gfn);
 38.2630 +
 38.2631 +    l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, 
 38.2632 +                             /* mmio? */ !valid_mfn(mfn));
 38.2633 +    
 38.2634 +    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
 38.2635 +    return result;
 38.2636 +}
 38.2637 +
 38.2638 +
 38.2639 +/**************************************************************************/
 38.2640 +/* Functions which translate and install the shadows of arbitrary guest 
 38.2641 + * entries that we have just seen the guest write. */
 38.2642 +
 38.2643 +
 38.2644 +static inline int 
 38.2645 +sh_map_and_validate(struct vcpu *v, mfn_t gmfn,
 38.2646 +                     void *new_gp, u32 size, u32 sh_type, 
 38.2647 +                     u32 (*shadow_index)(mfn_t *smfn, u32 idx),
 38.2648 +                     int (*validate_ge)(struct vcpu *v, void *ge, 
 38.2649 +                                        mfn_t smfn, void *se))
 38.2650 +/* Generic function for mapping and validating. */
 38.2651 +{
 38.2652 +    mfn_t smfn, smfn2, map_mfn;
 38.2653 +    shadow_l1e_t *sl1p;
 38.2654 +    u32 shadow_idx, guest_idx;
 38.2655 +    int result = 0;
 38.2656 +
 38.2657 +    /* Align address and size to guest entry boundaries */
 38.2658 +    size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
 38.2659 +    new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
 38.2660 +    size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
 38.2661 +    ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
 38.2662 +
 38.2663 +    /* Map the shadow page */
 38.2664 +    smfn = get_shadow_status(v, gmfn, sh_type);
 38.2665 +    ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
 38.2666 +    guest_idx = guest_index(new_gp);
 38.2667 +    map_mfn = smfn;
 38.2668 +    shadow_idx = shadow_index(&map_mfn, guest_idx);
 38.2669 +    sl1p = map_shadow_page(map_mfn);
 38.2670 +
 38.2671 +    /* Validate one entry at a time */
 38.2672 +    while ( size )
 38.2673 +    {
 38.2674 +        smfn2 = smfn;
 38.2675 +        guest_idx = guest_index(new_gp);
 38.2676 +        shadow_idx = shadow_index(&smfn2, guest_idx);
 38.2677 +        if ( mfn_x(smfn2) != mfn_x(map_mfn) )
 38.2678 +        {
 38.2679 +            /* We have moved to another page of the shadow */
 38.2680 +            map_mfn = smfn2;
 38.2681 +            unmap_shadow_page(sl1p);
 38.2682 +            sl1p = map_shadow_page(map_mfn);
 38.2683 +        }
 38.2684 +        result |= validate_ge(v,
 38.2685 +                              new_gp,
 38.2686 +                              map_mfn,
 38.2687 +                              &sl1p[shadow_idx]);
 38.2688 +        size -= sizeof(guest_l1e_t);
 38.2689 +        new_gp += sizeof(guest_l1e_t);
 38.2690 +    }
 38.2691 +    unmap_shadow_page(sl1p);
 38.2692 +    return result;
 38.2693 +}
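A standalone sketch (not Xen code) of the entry-boundary alignment done at the top of sh_map_and_validate() above.  ENTRY_SIZE stands in for sizeof(guest_l1e_t), and the write address/size values are hypothetical.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_SIZE 8ul                     /* stand-in for sizeof(guest_l1e_t) */

    int main(void)
    {
        uintptr_t gp = 0x1003;                 /* hypothetical write address */
        uintptr_t size = 6;                    /* hypothetical write size */

        /* Widen the range so it starts and ends on entry boundaries */
        size += gp & (ENTRY_SIZE - 1);
        gp   &= ~(ENTRY_SIZE - 1);
        size  = (size + ENTRY_SIZE - 1) & ~(ENTRY_SIZE - 1);

        /* The 6-byte write at 0x1003 straddles two 8-byte entries */
        assert(gp == 0x1000 && size == 16);
        printf("aligned: gp=%#lx size=%lu\n",
               (unsigned long)gp, (unsigned long)size);
        return 0;
    }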
 38.2694 +
 38.2695 +
 38.2696 +int
 38.2697 +sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
 38.2698 +                          void *new_gl4p, u32 size)
 38.2699 +{
 38.2700 +#if GUEST_PAGING_LEVELS >= 4
 38.2701 +    return sh_map_and_validate(v, gl4mfn, new_gl4p, size, 
 38.2702 +                                PGC_SH_l4_shadow, 
 38.2703 +                                shadow_l4_index, 
 38.2704 +                                validate_gl4e);
 38.2705 +#else // ! GUEST_PAGING_LEVELS >= 4
 38.2706 +    SHADOW_PRINTK("called in wrong paging mode!\n");
 38.2707 +    BUG();
 38.2708 +    return 0;
 38.2709 +#endif 
 38.2710 +}
 38.2711 +    
 38.2712 +int
 38.2713 +sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
 38.2714 +                          void *new_gl3p, u32 size)
 38.2715 +{
 38.2716 +#if GUEST_PAGING_LEVELS >= 3
 38.2717 +    return sh_map_and_validate(v, gl3mfn, new_gl3p, size, 
 38.2718 +                                PGC_SH_l3_shadow, 
 38.2719 +                                shadow_l3_index, 
 38.2720 +                                validate_gl3e);
 38.2721 +#else // ! GUEST_PAGING_LEVELS >= 3
 38.2722 +    SHADOW_PRINTK("called in wrong paging mode!\n");
 38.2723 +    BUG();
 38.2724 +    return 0;
 38.2725 +#endif
 38.2726 +}
 38.2727 +
 38.2728 +int
 38.2729 +sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
 38.2730 +                          void *new_gl2p, u32 size)
 38.2731 +{
 38.2732 +    return sh_map_and_validate(v, gl2mfn, new_gl2p, size, 
 38.2733 +                                PGC_SH_l2_shadow, 
 38.2734 +                                shadow_l2_index, 
 38.2735 +                                validate_gl2e);
 38.2736 +}
 38.2737 +
 38.2738 +int
 38.2739 +sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
 38.2740 +                           void *new_gl2p, u32 size)
 38.2741 +{
 38.2742 +#if GUEST_PAGING_LEVELS == 3
 38.2743 +    return sh_map_and_validate(v, gl2mfn, new_gl2p, size, 
 38.2744 +                                PGC_SH_l2h_shadow, 
 38.2745 +                                shadow_l2_index, 
 38.2746 +                                validate_gl2e);
 38.2747 +#else /* Non-PAE guests don't have different kinds of l2 table */
 38.2748 +    SHADOW_PRINTK("called in wrong paging mode!\n");
 38.2749 +    BUG();
 38.2750 +    return 0;
 38.2751 +#endif
 38.2752 +}
 38.2753 +
 38.2754 +int
 38.2755 +sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
 38.2756 +                          void *new_gl1p, u32 size)
 38.2757 +{
 38.2758 +    return sh_map_and_validate(v, gl1mfn, new_gl1p, size, 
 38.2759 +                                PGC_SH_l1_shadow, 
 38.2760 +                                shadow_l1_index, 
 38.2761 +                                validate_gl1e);
 38.2762 +}
 38.2763 +
 38.2764 +
 38.2765 +/**************************************************************************/
 38.2766 +/* Optimization: If we see two emulated writes of zeros to the same
 38.2767 + * page-table without another kind of page fault in between, we guess
 38.2768 + * that this is a batch of changes (for process destruction) and
 38.2769 + * unshadow the page so we don't take a pagefault on every entry.  This
 38.2770 + * should also make finding writeable mappings of pagetables much
 38.2771 + * easier. */
 38.2772 +
 38.2773 +/* Look to see if this is the second emulated write in a row to this
 38.2774 + * page, and unshadow/unhook if it is */
 38.2775 +static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
 38.2776 +{
 38.2777 +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
 38.2778 +    if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) &&
 38.2779 +         sh_mfn_is_a_page_table(gmfn) )
 38.2780 +    {
 38.2781 +        u32 flags = mfn_to_page(gmfn)->shadow_flags;
 38.2782 +        mfn_t smfn;
 38.2783 +        if ( !(flags & (SHF_L2_32|SHF_L3_PAE|SHF_L4_64)) )
 38.2784 +        {
 38.2785 +            perfc_incrc(shadow_early_unshadow);
 38.2786 +            sh_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
 38.2787 +            return;
 38.2788 +        }
 38.2789 +        /* SHF_unhooked_mappings is set to make sure we only unhook
 38.2790 +         * once in a single batch of updates. It is reset when this
 38.2791 +         * top-level page is loaded into CR3 again */
 38.2792 +        if ( !(flags & SHF_unhooked_mappings) ) 
 38.2793 +        {
 38.2794 +            perfc_incrc(shadow_early_unshadow_top);
 38.2795 +            mfn_to_page(gmfn)->shadow_flags |= SHF_unhooked_mappings;
 38.2796 +            if ( flags & SHF_L2_32 )
 38.2797 +            {
 38.2798 +                smfn = get_shadow_status(v, gmfn, PGC_SH_l2_32_shadow);
 38.2799 +                shadow_unhook_mappings(v, smfn);
 38.2800 +            }
 38.2801 +            if ( flags & SHF_L3_PAE ) 
 38.2802 +            {
 38.2803 +                smfn = get_shadow_status(v, gmfn, PGC_SH_l3_pae_shadow);
 38.2804 +                shadow_unhook_mappings(v, smfn);
 38.2805 +            }
 38.2806 +            if ( flags & SHF_L4_64 ) 
 38.2807 +            {
 38.2808 +                smfn = get_shadow_status(v, gmfn, PGC_SH_l4_64_shadow);
 38.2809 +                shadow_unhook_mappings(v, smfn);
 38.2810 +            }
 38.2811 +        }
 38.2812 +    }
 38.2813 +    v->arch.shadow.last_emulated_mfn = mfn_x(gmfn);
 38.2814 +#endif
 38.2815 +}
 38.2816 +
 38.2817 +/* Stop counting towards early unshadows, as we've seen a real page fault */
 38.2818 +static inline void reset_early_unshadow(struct vcpu *v)
 38.2819 +{
 38.2820 +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
 38.2821 +    v->arch.shadow.last_emulated_mfn = INVALID_MFN;
 38.2822 +#endif
 38.2823 +}
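A standalone sketch (not Xen code) of the heuristic described above: remember the last mfn written under emulation, treat a second consecutive emulated write to the same page table as the cue to unshadow, and let a real page fault reset the count.  All names and values here are illustrative only.

    #include <stdio.h>

    #define INVALID_MFN_SKETCH 0xffffffffUL    /* illustrative sentinel */

    static unsigned long last_emulated_mfn = INVALID_MFN_SKETCH;

    static void emulated_pt_write(unsigned long mfn)
    {
        if ( last_emulated_mfn == mfn )
            printf("mfn %#lx: second emulated write in a row -> unshadow\n", mfn);
        last_emulated_mfn = mfn;               /* mirrors check_for_early_unshadow() */
    }

    static void real_page_fault(void)
    {
        last_emulated_mfn = INVALID_MFN_SKETCH; /* mirrors reset_early_unshadow() */
    }

    int main(void)
    {
        emulated_pt_write(0x1234);             /* first write: just remembered */
        emulated_pt_write(0x1234);             /* second write: would unshadow */
        real_page_fault();                     /* counting starts over */
        emulated_pt_write(0x1234);             /* remembered again, no unshadow */
        return 0;
    }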
 38.2824 +
 38.2825 +
 38.2826 +
 38.2827 +/**************************************************************************/
 38.2828 +/* Entry points into the shadow code */
 38.2829 +
 38.2830 +/* Called from pagefault handler in Xen, and from the HVM trap handlers
 38.2831 + * for pagefaults.  Returns 1 if this fault was an artefact of the
 38.2832 + * shadow code (and the guest should retry) or 0 if it is not (and the
 38.2833 + * fault should be handled elsewhere or passed to the guest). */
 38.2834 +
 38.2835 +static int sh_page_fault(struct vcpu *v, 
 38.2836 +                          unsigned long va, 
 38.2837 +                          struct cpu_user_regs *regs)
 38.2838 +{
 38.2839 +    struct domain *d = v->domain;
 38.2840 +    walk_t gw;
 38.2841 +    u32 accumulated_gflags;
 38.2842 +    gfn_t gfn;
 38.2843 +    mfn_t gmfn, sl1mfn=_mfn(0);
 38.2844 +    shadow_l1e_t sl1e, *ptr_sl1e;
 38.2845 +    paddr_t gpa;
 38.2846 +    struct cpu_user_regs emul_regs;
 38.2847 +    struct x86_emulate_ctxt emul_ctxt;
 38.2848 +    int r, mmio;
 38.2849 +    fetch_type_t ft = 0;
 38.2850 +
 38.2851 +    //
 38.2852 +    // XXX: Need to think about eventually mapping superpages directly in the
 38.2853 +    //      shadow (when possible), as opposed to splintering them into a
 38.2854 +    //      bunch of 4K maps.
 38.2855 +    //
 38.2856 +
 38.2857 +    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
 38.2858 +                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
 38.2859 +    
 38.2860 +    shadow_lock(d);
 38.2861 +
 38.2862 +    shadow_audit_tables(v);
 38.2863 +                   
 38.2864 +    if ( guest_walk_tables(v, va, &gw, 1) != 0 )
 38.2865 +    {
 38.2866 +        SHADOW_PRINTK("malformed guest pagetable!\n");
 38.2867 +        print_gw(&gw);
 38.2868 +    }
 38.2869 +
 38.2870 +    sh_audit_gw(v, &gw);
 38.2871 +
 38.2872 +    // We do not look at the gw->l1e, as that will not exist for superpages.
 38.2873 +    // Instead, we use the gw->eff_l1e...
 38.2874 +    //
 38.2875 +    // We need not check all the levels of the guest page table entries for
 38.2876 +    // present vs not-present, as the eff_l1e will always be not present if
 38.2877 +    // one of the higher level entries is not present.
 38.2878 +    //
 38.2879 +    if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
 38.2880 +    {
 38.2881 +        if ( hvm_guest(v) && !shadow_vcpu_mode_translate(v) )
 38.2882 +        {
 38.2883 +            /* Not present in p2m map, means this is mmio */
 38.2884 +            gpa = va;
 38.2885 +            goto mmio;
 38.2886 +        }
 38.2887 +
 38.2888 +        perfc_incrc(shadow_fault_bail_not_present);
 38.2889 +        goto not_a_shadow_fault;
 38.2890 +    }
 38.2891 +
 38.2892 +    // All levels of the guest page table are now known to be present.
 38.2893 +    accumulated_gflags = accumulate_guest_flags(&gw);
 38.2894 +
 38.2895 +    // Check for attempts to access supervisor-only pages from user mode,
 38.2896 +    // i.e. ring 3.  Such errors are not caused or dealt with by the shadow
 38.2897 +    // code.
 38.2898 +    //
 38.2899 +    if ( (regs->error_code & PFEC_user_mode) &&
 38.2900 +         !(accumulated_gflags & _PAGE_USER) )
 38.2901 +    {
 38.2902 +        /* illegal user-mode access to supervisor-only page */
 38.2903 +        perfc_incrc(shadow_fault_bail_user_supervisor);
 38.2904 +        goto not_a_shadow_fault;
 38.2905 +    }
 38.2906 +
 38.2907 +    // Was it a write fault?
 38.2908 +    //
 38.2909 +    if ( regs->error_code & PFEC_write_access )
 38.2910 +    {
 38.2911 +        if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
 38.2912 +        {
 38.2913 +            perfc_incrc(shadow_fault_bail_ro_mapping);
 38.2914 +            goto not_a_shadow_fault;
 38.2915 +        }
 38.2916 +    }
 38.2917 +    else // must have been either an insn fetch or read fault
 38.2918 +    {
 38.2919 +        // Check for NX bit violations: attempts to execute code that is
 38.2920 +        // marked "do not execute".  Such errors are not caused or dealt with
 38.2921 +        // by the shadow code.
 38.2922 +        //
 38.2923 +        if ( regs->error_code & PFEC_insn_fetch )
 38.2924 +        {
 38.2925 +            if ( accumulated_gflags & _PAGE_NX_BIT )
 38.2926 +            {
 38.2927 +                /* NX prevented this code fetch */
 38.2928 +                perfc_incrc(shadow_fault_bail_nx);
 38.2929 +                goto not_a_shadow_fault;
 38.2930 +            }
 38.2931 +        }
 38.2932 +    }
 38.2933 +
 38.2934 +    /* Is this an MMIO access? */
 38.2935 +    gfn = guest_l1e_get_gfn(gw.eff_l1e);
 38.2936 +    mmio = ( hvm_guest(v) 
 38.2937 +             && shadow_vcpu_mode_translate(v) 
 38.2938 +             && mmio_space(gfn_to_paddr(gfn)) );
 38.2939 +
 38.2940 +    /* For MMIO, the shadow holds the *gfn*; for normal accesses, it holds 
 38.2941 +     * the equivalent mfn. */
 38.2942 +    if ( mmio ) 
 38.2943 +        gmfn = _mfn(gfn_x(gfn));
 38.2944 +    else
 38.2945 +    {
 38.2946 +        gmfn = vcpu_gfn_to_mfn(v, gfn);
 38.2947 +        if ( !valid_mfn(gmfn) )
 38.2948 +        {
 38.2949 +            perfc_incrc(shadow_fault_bail_bad_gfn);
 38.2950 +            SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", 
 38.2951 +                           gfn_x(gfn), mfn_x(gmfn));
 38.2952 +            goto not_a_shadow_fault;
 38.2953 +        }
 38.2954 +    }
 38.2955 +
 38.2956 +    /* Make sure there is enough free shadow memory to build a chain of
 38.2957 +     * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
 38.2958 +     * to allocate all we need.  (We never allocate a top-level shadow
 38.2959 +     * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
 38.2960 +    shadow_prealloc(d, SHADOW_MAX_ORDER);
 38.2961 +
 38.2962 +    /* Acquire the shadow.  This must happen before we figure out the rights 
 38.2963 +     * for the shadow entry, since we might promote a page here. */
 38.2964 +    // XXX -- this code will need to change somewhat if/when the shadow code
 38.2965 +    // can directly map superpages...
 38.2966 +    ft = ((regs->error_code & PFEC_write_access) ?
 38.2967 +          ft_demand_write : ft_demand_read);
 38.2968 +    ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
 38.2969 +    ASSERT(ptr_sl1e);
 38.2970 +
 38.2971 +    /* Calculate the shadow entry */
 38.2972 +    if ( ft == ft_demand_write )
 38.2973 +    {
 38.2974 +        if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
 38.2975 +        {
 38.2976 +            perfc_incrc(shadow_fault_emulate_write);
 38.2977 +            goto emulate;
 38.2978 +        }
 38.2979 +    }
 38.2980 +    else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
 38.2981 +    {
 38.2982 +        perfc_incrc(shadow_fault_emulate_read);
 38.2983 +        goto emulate;
 38.2984 +    }
 38.2985 +
 38.2986 +    /* Quick sanity check: we never make an MMIO entry that's got the 
 38.2987 +     * _PAGE_PRESENT flag set in it. */
 38.2988 +    ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
 38.2989 +
 38.2990 +    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
 38.2991 +
 38.2992 +    if ( mmio ) 
 38.2993 +    {
 38.2994 +        gpa = guest_walk_to_gpa(&gw);
 38.2995 +        goto mmio;
 38.2996 +    }
 38.2997 +
 38.2998 +#if 0
 38.2999 +    if ( !(r & SHADOW_SET_CHANGED) )
 38.3000 +        debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte
 38.3001 +                          ") did not change anything\n",
 38.3002 +                          __func__, gw.va, l1e_get_intpte(sl1e));
 38.3003 +#endif
 38.3004 +
 38.3005 +    perfc_incrc(shadow_fault_fixed);
 38.3006 +    d->arch.shadow.fault_count++;
 38.3007 +    reset_early_unshadow(v);
 38.3008 +
 38.3009 + done:
 38.3010 +    sh_audit_gw(v, &gw);
 38.3011 +    unmap_walk(v, &gw);
 38.3012 +    SHADOW_PRINTK("fixed\n");
 38.3013 +    shadow_audit_tables(v);
 38.3014 +    shadow_unlock(d);
 38.3015 +    return EXCRET_fault_fixed;
 38.3016 +
 38.3017 + emulate:
 38.3018 +
 38.3019 +    /* Take the register set we were called with */
 38.3020 +    emul_regs = *regs;
 38.3021 +    if ( hvm_guest(v) )
 38.3022 +    {
 38.3023 +        /* Add the guest's segment selectors, rip, rsp, rflags */ 
 38.3024 +        hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
 38.3025 +    }
 38.3026 +    emul_ctxt.regs = &emul_regs;
 38.3027 +    emul_ctxt.cr2 = va;
 38.3028 +    emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
 38.3029 +
 38.3030 +    SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
 38.3031 +
 38.3032 +    v->arch.shadow.propagate_fault = 0;
 38.3033 +    if ( x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) )
 38.3034 +    {
 38.3035 +        SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n", 
 38.3036 +                       mfn_x(gmfn));
 38.3037 +        perfc_incrc(shadow_fault_emulate_failed);
 38.3038 +        /* If this is actually a page table, then we have a bug, and need 
 38.3039 +         * to support more operations in the emulator.  More likely, 
 38.3040 +         * though, this is a hint that this page should not be shadowed. */
 38.3041 +        shadow_remove_all_shadows(v, gmfn);
 38.3042 +        /* This means that actual missing operations will cause the 
 38.3043 +         * guest to loop on the same page fault. */
 38.3044 +        goto done;
 38.3045 +    }
 38.3046 +    if ( v->arch.shadow.propagate_fault )
 38.3047 +    {
 38.3048 +        /* Emulation triggered another page fault */
 38.3049 +        goto not_a_shadow_fault;
 38.3050 +    }
 38.3051 +
 38.3052 +    /* Emulator has changed the user registers: write back */
 38.3053 +    if ( hvm_guest(v) )
 38.3054 +    {
 38.3055 +        /* Write back the guest's segment selectors, rip, rsp, rflags */ 
 38.3056 +        hvm_load_cpu_guest_regs(v, &emul_regs);
 38.3057 +        /* And don't overwrite those in the caller's regs. */
 38.3058 +        emul_regs.eip = regs->eip;
 38.3059 +        emul_regs.cs = regs->cs;
 38.3060 +        emul_regs.eflags = regs->eflags;
 38.3061 +        emul_regs.esp = regs->esp;
 38.3062 +        emul_regs.ss = regs->ss;
 38.3063 +        emul_regs.es = regs->es;
 38.3064 +        emul_regs.ds = regs->ds;
 38.3065 +        emul_regs.fs = regs->fs;
 38.3066 +        emul_regs.gs = regs->gs;
 38.3067 +    }
 38.3068 +    *regs = emul_regs;
 38.3069 +
 38.3070 +    goto done;
 38.3071 +
 38.3072 + mmio:
 38.3073 +    perfc_incrc(shadow_fault_mmio);
 38.3074 +    if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
 38.3075 +    {
 38.3076 +        /* Need to deal with these disabled-APIC accesses, as
 38.3077 +         * handle_mmio() apparently does not currently do that. */
 38.3078 +        /* TJD: What about it, then?   For now, I'm turning this BUG() 
 38.3079 +         * into a domain_crash() since we don't want to kill Xen. */
 38.3080 +        SHADOW_ERROR("disabled-APIC access: not supported.\n");
 38.3081 +        domain_crash(d); 
 38.3082 +    }
 38.3083 +    sh_audit_gw(v, &gw);
 38.3084 +    unmap_walk(v, &gw);
 38.3085 +    SHADOW_PRINTK("mmio\n");
 38.3086 +    shadow_audit_tables(v);
 38.3087 +    reset_early_unshadow(v);
 38.3088 +    shadow_unlock(d);
 38.3089 +    sh_log_mmio(v, gpa);
 38.3090 +    handle_mmio(va, gpa);
 38.3091 +    return EXCRET_fault_fixed;
 38.3092 +
 38.3093 + not_a_shadow_fault:
 38.3094 +    sh_audit_gw(v, &gw);
 38.3095 +    unmap_walk(v, &gw);
 38.3096 +    SHADOW_PRINTK("not a shadow fault\n");
 38.3097 +    shadow_audit_tables(v);
 38.3098 +    reset_early_unshadow(v);
 38.3099 +    shadow_unlock(d);
 38.3100 +    return 0;
 38.3101 +}
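A standalone sketch (not Xen code) of the early error-code checks in sh_page_fault() above, using the standard x86 page-fault error-code bits (bit 1 = write, bit 2 = user mode, bit 4 = instruction fetch).  The _PAGE_RW/_PAGE_USER values match the x86 PTE flags; the _PAGE_NX_BIT value here is purely illustrative.

    #include <stdio.h>

    #define PFEC_write_access (1u << 1)
    #define PFEC_user_mode    (1u << 2)
    #define PFEC_insn_fetch   (1u << 4)

    #define _PAGE_RW          (1u << 1)
    #define _PAGE_USER        (1u << 2)
    #define _PAGE_NX_BIT      (1u << 15)       /* illustrative bit, not Xen's value */

    /* Returns 1 if the fault must be handled by the guest, not the shadow code */
    static int not_a_shadow_fault(unsigned int error_code, unsigned int gflags)
    {
        if ( (error_code & PFEC_user_mode) && !(gflags & _PAGE_USER) )
            return 1;                          /* user access, supervisor-only page */
        if ( (error_code & PFEC_write_access) && !(gflags & _PAGE_RW) )
            return 1;                          /* write to a read-only mapping */
        if ( (error_code & PFEC_insn_fetch) && (gflags & _PAGE_NX_BIT) )
            return 1;                          /* code fetch from an NX mapping */
        return 0;                              /* shadow code may be able to fix it */
    }

    int main(void)
    {
        printf("%d\n", not_a_shadow_fault(PFEC_write_access, _PAGE_USER)); /* 1 */
        printf("%d\n", not_a_shadow_fault(PFEC_write_access, _PAGE_RW));   /* 0 */
        return 0;
    }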
 38.3102 +
 38.3103 +
 38.3104 +static int
 38.3105 +sh_invlpg(struct vcpu *v, unsigned long va)
 38.3106 +/* Called when the guest requests an invlpg.  Returns 1 if the invlpg
 38.3107 + * instruction should be issued on the hardware, or 0 if it's safe not
 38.3108 + * to do so. */
 38.3109 +{
 38.3110 +    shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va);
 38.3111 +
 38.3112 +    // XXX -- might be a good thing to prefetch the va into the shadow
 38.3113 +
 38.3114 +    // no need to flush anything if there's no SL2...
 38.3115 +    //
 38.3116 +    if ( !ptr_sl2e )
 38.3117 +        return 0;
 38.3118 +
 38.3119 +    // If there's nothing shadowed for this particular sl2e, then
 38.3120 +    // there is no need to do an invlpg, either...
 38.3121 +    //
 38.3122 +    if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) )
 38.3123 +        return 0;
 38.3124 +
 38.3125 +    // Check to see if the SL2 is a splintered superpage...
 38.3126 +    // If so, then we'll need to flush the entire TLB (because that's
 38.3127 +    // easier than invalidating all of the individual 4K pages).
 38.3128 +    //
 38.3129 +    if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info &
 38.3130 +          PGC_SH_type_mask) == PGC_SH_fl1_shadow )
 38.3131 +    {
 38.3132 +        local_flush_tlb();
 38.3133 +        return 0;
 38.3134 +    }
 38.3135 +
 38.3136 +    return 1;
 38.3137 +}
 38.3138 +
 38.3139 +static unsigned long
 38.3140 +sh_gva_to_gfn(struct vcpu *v, unsigned long va)
 38.3141 +/* Called to translate a guest virtual address to what the *guest*
 38.3142 + * pagetables would map it to. */
 38.3143 +{
 38.3144 +    walk_t gw;
 38.3145 +    gfn_t gfn;
 38.3146 +
 38.3147 +    guest_walk_tables(v, va, &gw, 0);
 38.3148 +    gfn = guest_walk_to_gfn(&gw);
 38.3149 +    unmap_walk(v, &gw);
 38.3150 +
 38.3151 +    return gfn_x(gfn);
 38.3152 +}
 38.3153 +
 38.3154 +
 38.3155 +static unsigned long
 38.3156 +sh_gva_to_gpa(struct vcpu *v, unsigned long va)
 38.3157 +/* Called to translate a guest virtual address to the guest physical
 38.3158 + * address that the *guest* pagetables would map it to. */
 38.3159 +{
 38.3160 +    unsigned long gfn = sh_gva_to_gfn(v, va);
 38.3161 +    if ( gfn == INVALID_GFN )
 38.3162 +        return 0;
 38.3163 +    else
 38.3164 +        return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
 38.3165 +}
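A standalone sketch (not Xen code) of the address composition in sh_gva_to_gpa() above: take the frame number from the guest walk and keep the page offset from the virtual address.  The va and gfn values are hypothetical.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ul << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long va  = 0xc0123abcUL;      /* hypothetical guest virtual address */
        unsigned long gfn = 0x7f2UL;           /* hypothetical result of the walk */

        unsigned long gpa = (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
        assert(gpa == 0x7f2abcUL);
        printf("gpa=%#lx\n", gpa);
        return 0;
    }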
 38.3166 +
 38.3167 +
 38.3168 +// XXX -- should this be in this file?
 38.3169 +//        Or should it be moved to shadow-common.c?
 38.3170 +//
 38.3171 +/* returns a lowmem machine address of the copied HVM L3 root table
 38.3172 + * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
 38.3173 + * otherwise blank out any entries with reserved bits in them.  */
 38.3174 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 38.3175 +static unsigned long
 38.3176 +hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
 38.3177 +{
 38.3178 +    int i, f;
 38.3179 +    int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
 38.3180 +    l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 38.3181 +    memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
 38.3182 +    for ( i = 0; i < 4; i++ )
 38.3183 +    {
 38.3184 +        f = l3e_get_flags(l3tab[i]);
 38.3185 +        if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) )
 38.3186 +            new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res);
 38.3187 +        else
 38.3188 +            new_l3e = l3e_empty();
 38.3189 +        safe_write_entry(&copy[i], &new_l3e);
 38.3190 +    }
 38.3191 +    return __pa(copy);
 38.3192 +}
 38.3193 +#endif
 38.3194 +
 38.3195 +
 38.3196 +static inline void
 38.3197 +sh_update_linear_entries(struct vcpu *v)
 38.3198 +/* Sync up all the linear mappings for this vcpu's pagetables */
 38.3199 +{
 38.3200 +    struct domain *d = v->domain;
 38.3201 +
 38.3202 +    /* Linear pagetables in PV guests
 38.3203 +     * ------------------------------
 38.3204 +     *
 38.3205 +     * Guest linear pagetables, which map the guest pages, are at
 38.3206 +     * LINEAR_PT_VIRT_START.  Shadow linear pagetables, which map the
 38.3207 +     * shadows, are at SH_LINEAR_PT_VIRT_START.  Most of the time these
 38.3208 +     * are set up at shadow creation time, but (of course!) the PAE case
 38.3209 +     * is subtler.  Normal linear mappings are made by having an entry
 38.3210 +     * in the top-level table that points to itself (shadow linear) or
 38.3211 +     * to the guest top-level table (guest linear).  For PAE, to set up
 38.3212 +     * a linear map requires us to copy the four top-level entries into 
 38.3213 +     * level-2 entries.  That means that every time we change a PAE l3e,
 38.3214 +     * we need to reflect the change into the copy.
 38.3215 +     *
 38.3216 +     * Linear pagetables in HVM guests
 38.3217 +     * -------------------------------
 38.3218 +     *
 38.3219 +     * For HVM guests, the linear pagetables are installed in the monitor
 38.3220 +     * tables (since we can't put them in the shadow).  Shadow linear
 38.3221 +     * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
 38.3222 +     * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for 
 38.3223 +     * a linear pagetable of the monitor tables themselves.  We have 
 38.3224 +     * the same issue of having to re-copy PAE l3 entries whenever we use
 38.3225 +     * PAE shadows. 
 38.3226 +     *
 38.3227 +     * Because HVM guests run on the same monitor tables regardless of the 
 38.3228 +     * shadow tables in use, the linear mapping of the shadow tables has to 
 38.3229 +     * be updated every time v->arch.shadow_table changes. 
 38.3230 +     */
 38.3231 +
 38.3232 +    /* Don't try to update the monitor table if it doesn't exist */
 38.3233 +    if ( shadow_mode_external(d) 
 38.3234 +         && pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
 38.3235 +        return;
 38.3236 +
 38.3237 +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
 38.3238 +    
 38.3239 +    /* For PV, one l4e points at the guest l4, one points at the shadow
 38.3240 +     * l4.  No maintenance required. 
 38.3241 +     * For HVM, just need to update the l4e that points to the shadow l4. */
 38.3242 +
 38.3243 +    if ( shadow_mode_external(d) )
 38.3244 +    {
 38.3245 +        /* Use the linear map if we can; otherwise make a new mapping */
 38.3246 +        if ( v == current ) 
 38.3247 +        {
 38.3248 +            __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = 
 38.3249 +                l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 38.3250 +                             __PAGE_HYPERVISOR);
 38.3251 +        } 
 38.3252 +        else
 38.3253 +        { 
 38.3254 +            l4_pgentry_t *ml4e;
 38.3255 +            ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
 38.3256 +            ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = 
 38.3257 +                l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 38.3258 +                             __PAGE_HYPERVISOR);
 38.3259 +            sh_unmap_domain_page(ml4e);
 38.3260 +        }
 38.3261 +    }
 38.3262 +
 38.3263 +#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
 38.3264 +
 38.3265 +    /* This case only exists in HVM.  To give ourselves a linear map of the 
 38.3266 +     * shadows, we need to extend a PAE shadow to 4 levels.  We do this by 
 38.3267 +     * having a monitor l3 in slot 0 of the monitor l4 table, and 
 38.3268 +     * copying the PAE l3 entries into it.  Then, by having the monitor l4e
 38.3269 +     * for shadow pagetables also point to the monitor l4, we can use it
 38.3270 +     * to access the shadows. */
 38.3271 +
 38.3272 +    if ( shadow_mode_external(d) )
 38.3273 +    {
 38.3274 +        /* Install copies of the shadow l3es into the monitor l3 table.
 38.3275 +         * The monitor l3 table is hooked into slot 0 of the monitor
 38.3276 +         * l4 table, so we use l3 linear indices 0 to 3 */
 38.3277 +        shadow_l3e_t *sl3e;
 38.3278 +        l3_pgentry_t *ml3e;
 38.3279 +        mfn_t l3mfn;
 38.3280 +        int i;
 38.3281 +
 38.3282 +        /* Use linear mappings if we can; otherwise make new mappings */
 38.3283 +        if ( v == current ) 
 38.3284 +        {
 38.3285 +            ml3e = __linear_l3_table;
 38.3286 +            l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
 38.3287 +#if GUEST_PAGING_LEVELS == 2
 38.3288 +            /* Shadow l3 tables are made up by update_cr3 */
 38.3289 +            sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 38.3290 +#else
 38.3291 +            sl3e = v->arch.shadow_vtable;
 38.3292 +#endif
 38.3293 +        }
 38.3294 +        else 
 38.3295 +        {   
 38.3296 +            l4_pgentry_t *ml4e;
 38.3297 +            ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
 38.3298 +            ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
 38.3299 +            l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
 38.3300 +            ml3e = sh_map_domain_page(l3mfn);
 38.3301 +            sh_unmap_domain_page(ml4e);
 38.3302 +#if GUEST_PAGING_LEVELS == 2
 38.3303 +            /* Shadow l3 tables are made up by update_cr3 */
 38.3304 +            sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 38.3305 +#else
 38.3306 +            sl3e = sh_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
 38.3307 +#endif
 38.3308 +        }
 38.3309 +
 38.3310 +        for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
 38.3311 +        {
 38.3312 +            ml3e[i] = 
 38.3313 +                (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) 
 38.3314 +                ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), 
 38.3315 +                               __PAGE_HYPERVISOR) 
 38.3316 +                : l3e_empty();
 38.3317 +        }
 38.3318 +
 38.3319 +        if ( v != current ) 
 38.3320 +        {
 38.3321 +            sh_unmap_domain_page(ml3e);
 38.3322 +#if GUEST_PAGING_LEVELS != 2
 38.3323 +            sh_unmap_domain_page(sl3e);
 38.3324 +#endif
 38.3325 +        }
 38.3326 +    }
 38.3327 +
 38.3328 +#elif CONFIG_PAGING_LEVELS == 3
 38.3329 +
 38.3330 +    /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
 38.3331 +     * entries in the shadow, and the shadow's l3 entries into the 
 38.3332 +     * shadow-linear-map l2 entries in the shadow.  This is safe to do 
 38.3333 +     * because Xen does not let guests share high-slot l2 tables between l3s,
 38.3334 +     * so we know we're not treading on anyone's toes. 
 38.3335 +     *
 38.3336 +     * HVM: need to copy the shadow's l3 entries into the
 38.3337 +     * shadow-linear-map l2 entries in the monitor table.  This is safe
 38.3338 +     * because we have one monitor table for each vcpu.  The monitor's
 38.3339 +     * own l3es don't need to be copied because they never change.  
 38.3340 +     * XXX That might change if we start stuffing things into the rest
 38.3341 +     * of the monitor's virtual address space. 
 38.3342 +     */ 
 38.3343 +    {
 38.3344 +        l2_pgentry_t *l2e, new_l2e;
 38.3345 +        shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
 38.3346 +        int i;
 38.3347 +
 38.3348 +#if GUEST_PAGING_LEVELS == 2
 38.3349 +        /* Shadow l3 tables were built by update_cr3 */
 38.3350 +        if ( shadow_mode_external(d) )
 38.3351 +            shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
 38.3352 +        else
 38.3353 +            BUG(); /* PV 2-on-3 is not supported yet */
 38.3354 +        
 38.3355 +#else /* GUEST_PAGING_LEVELS == 3 */
 38.3356 +        
 38.3357 +        /* Use local vcpu's mappings if we can; otherwise make new mappings */
 38.3358 +        if ( v == current ) 
 38.3359 +        {
 38.3360 +            shadow_l3e = v->arch.shadow_vtable;
 38.3361 +            if ( !shadow_mode_external(d) )
 38.3362 +                guest_l3e = v->arch.guest_vtable;
 38.3363 +        }
 38.3364 +        else 
 38.3365 +        {
 38.3366 +            mfn_t smfn;
 38.3367 +            int idx;
 38.3368 +            
 38.3369 +            /* Map the shadow l3 */
 38.3370 +            smfn = pagetable_get_mfn(v->arch.shadow_table);
 38.3371 +            idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
 38.3372 +            shadow_l3e = sh_map_domain_page(smfn);
 38.3373 +            shadow_l3e += idx;
 38.3374 +            if ( !shadow_mode_external(d) )
 38.3375 +            {
 38.3376 +                /* Also the guest l3 */
 38.3377 +                mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); 
 38.3378 +                guest_l3e = sh_map_domain_page(gmfn);
 38.3379 +                guest_l3e += guest_index(v->arch.guest_vtable);
 38.3380 +            }
 38.3381 +        }
 38.3382 +#endif /* GUEST_PAGING_LEVELS */
 38.3383 +        
 38.3384 +        /* Choose where to write the entries, using linear maps if possible */
 38.3385 +        if ( v == current && shadow_mode_external(d) ) 
 38.3386 +        {
 38.3387 +            /* From the monitor tables, it's safe to use linear maps to update
 38.3388 +             * monitor l2s */
 38.3389 +            l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
 38.3390 +        }
 38.3391 +        else if ( shadow_mode_external(d) ) 
 38.3392 +        {
 38.3393 +            /* Map the monitor table's high l2 */
 38.3394 +            l3_pgentry_t *l3e;
 38.3395 +            l3e = sh_map_domain_page(
 38.3396 +                pagetable_get_mfn(v->arch.monitor_table));
 38.3397 +            ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
 38.3398 +            l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
 38.3399 +            sh_unmap_domain_page(l3e);
 38.3400 +        } 
 38.3401 +        else 
 38.3402 +        {
 38.3403 +            /* Map the shadow table's high l2 */
 38.3404 +            ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
 38.3405 +            l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
 38.3406 +        }
 38.3407 +        
 38.3408 +        
 38.3409 +        if ( !shadow_mode_external(d) )
 38.3410 +        {
 38.3411 +            /* Write linear mapping of guest. */
 38.3412 +            for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
 38.3413 +            { 
 38.3414 +                new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) 
 38.3415 +                    ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
 38.3416 +                                   __PAGE_HYPERVISOR) 
 38.3417 +                    : l2e_empty();
 38.3418 +                safe_write_entry(
 38.3419 +                    &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
 38.3420 +                    &new_l2e);
 38.3421 +            }
 38.3422 +        }
 38.3423 +        
 38.3424 +        /* Write linear mapping of shadow. */
 38.3425 +        for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
 38.3426 +        {
 38.3427 +            new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) 
 38.3428 +                ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
 38.3429 +                               __PAGE_HYPERVISOR) 
 38.3430 +                : l2e_empty();
 38.3431 +            safe_write_entry(
 38.3432 +                &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
 38.3433 +                &new_l2e);
 38.3434 +        }
 38.3435 +        
 38.3436 +        if ( v != current || !shadow_mode_external(d) )
 38.3437 +            sh_unmap_domain_page(l2e);
 38.3438 +        
 38.3439 +#if GUEST_PAGING_LEVELS == 3
 38.3440 +        if ( v != current) 
 38.3441 +        {
 38.3442 +            sh_unmap_domain_page(shadow_l3e);
 38.3443 +            if ( !shadow_mode_external(d) )
 38.3444 +                sh_unmap_domain_page(guest_l3e);
 38.3445 +        }
 38.3446 +#endif
 38.3447 +    }
 38.3448 +
 38.3449 +#elif CONFIG_PAGING_LEVELS == 2
 38.3450 +
 38.3451 +    /* For PV, one l2e points at the guest l2, one points at the shadow
 38.3452 +     * l2. No maintenance required. 
 38.3453 +     * For HVM, just need to update the l2e that points to the shadow l2. */
 38.3454 +
 38.3455 +    if ( shadow_mode_external(d) )
 38.3456 +    {
 38.3457 +        /* Use the linear map if we can; otherwise make a new mapping */
 38.3458 +        if ( v == current ) 
 38.3459 +        {
 38.3460 +            __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = 
 38.3461 +                l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 38.3462 +                             __PAGE_HYPERVISOR);
 38.3463 +        } 
 38.3464 +        else
 38.3465 +        { 
 38.3466 +            l2_pgentry_t *ml2e;
 38.3467 +            ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
 38.3468 +            ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = 
 38.3469 +                l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
 38.3470 +                             __PAGE_HYPERVISOR);
 38.3471 +            sh_unmap_domain_page(ml2e);
 38.3472 +        }
 38.3473 +    }
 38.3474 +
 38.3475 +#else
 38.3476 +#error this should not happen
 38.3477 +#endif
 38.3478 +}
 38.3479 +
 38.3480 +
 38.3481 +// XXX -- should this be in this file?
 38.3482 +//        Or should it be moved to shadow-common.c?
 38.3483 +//
 38.3484 +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 38.3485 +void sh_pae_recopy(struct domain *d)
 38.3486 +/* Called whenever we write to the l3 entries of a PAE pagetable which 
 38.3487 + * is currently in use.  Each vcpu that is using the table needs to 
 38.3488 + * resync its copies of the l3s in linear maps and any low-memory
 38.3489 + * copies it might have made for fitting into 32bit CR3.
 38.3490 + * Since linear maps are also resynced when we change CR3, we don't
 38.3491 + * need to worry about changes to PAE l3es that are not currently in use.*/
 38.3492 +{
 38.3493 +    struct vcpu *v;
 38.3494 +    cpumask_t flush_mask = CPU_MASK_NONE;
 38.3495 +    ASSERT(shadow_lock_is_acquired(d));
 38.3496 +    
 38.3497 +    for_each_vcpu(d, v)
 38.3498 +    {
 38.3499 +        if ( !v->arch.shadow.pae_flip_pending ) 
 38.3500 +            continue;
 38.3501 +
 38.3502 +        cpu_set(v->processor, flush_mask);
 38.3503 +        
 38.3504 +        SHADOW_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
 38.3505 +
 38.3506 +        /* This vcpu has a copy in its linear maps */
 38.3507 +        sh_update_linear_entries(v);
 38.3508 +        if ( hvm_guest(v) )
 38.3509 +        {
 38.3510 +            /* This vcpu has a copy in its HVM PAE l3 */
 38.3511 +            v->arch.hvm_vcpu.hw_cr3 = 
 38.3512 +                hvm_pae_copy_root(v, v->arch.shadow_vtable,
 38.3513 +                                  !shadow_vcpu_mode_translate(v));
 38.3514 +        }
 38.3515 +#if CONFIG_PAGING_LEVELS == 3
 38.3516 +        else 
 38.3517 +        {
 38.3518 +            /* This vcpu might have copied the l3 to below 4GB */
 38.3519 +            if ( v->arch.cr3 >> PAGE_SHIFT 
 38.3520 +                 != pagetable_get_pfn(v->arch.shadow_table) )
 38.3521 +            {
 38.3522 +                /* Recopy to where that copy is. */
 38.3523 +                int i;
 38.3524 +                l3_pgentry_t *dst, *src;
 38.3525 +                dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
 38.3526 +                src = v->arch.shadow_vtable;
 38.3527 +                for ( i = 0 ; i < 4 ; i++ ) 
 38.3528 +                    safe_write_entry(dst + i, src + i);
 38.3529 +            }
 38.3530 +        }
 38.3531 +#endif
 38.3532 +        v->arch.shadow.pae_flip_pending = 0;        
 38.3533 +    }
 38.3534 +
 38.3535 +    flush_tlb_mask(flush_mask);
 38.3536 +}
 38.3537 +#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
 38.3538 +
 38.3539 +
 38.3540 +/* removes:
 38.3541 + *     vcpu->arch.guest_vtable
 38.3542 + *     vcpu->arch.shadow_table
 38.3543 + *     vcpu->arch.shadow_vtable
 38.3544 + * Does all appropriate management/bookkeeping/refcounting/etc...
 38.3545 + */
 38.3546 +static void
 38.3547 +sh_detach_old_tables(struct vcpu *v)
 38.3548 +{
 38.3549 +    mfn_t smfn;
 38.3550 +
 38.3551 +    ////
 38.3552 +    //// vcpu->arch.guest_vtable
 38.3553 +    ////
 38.3554 +    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
 38.3555 +         v->arch.guest_vtable )
 38.3556 +    {
 38.3557 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 38.3558 +        sh_unmap_domain_page_global(v->arch.guest_vtable);
 38.3559 +        v->arch.guest_vtable = NULL;
 38.3560 +    }
 38.3561 +
 38.3562 +    ////
 38.3563 +    //// vcpu->arch.shadow_table
 38.3564 +    ////
 38.3565 +    smfn = pagetable_get_mfn(v->arch.shadow_table);
 38.3566 +    if ( mfn_x(smfn) )
 38.3567 +    {
 38.3568 +        ASSERT(v->arch.shadow_vtable);
 38.3569 +
 38.3570 +#if GUEST_PAGING_LEVELS == 3
 38.3571 +        // PAE guests do not (necessarily) use an entire page for their
 38.3572 +        // 4-entry L3s, so we have to deal with them specially.
 38.3573 +        //
 38.3574 +        sh_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
 38.3575 +#else
 38.3576 +        sh_put_ref(v, smfn, 0);
 38.3577 +#endif
 38.3578 +
 38.3579 +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
 38.3580 +        {
 38.3581 +            struct pae_l3_bookkeeping *info =
 38.3582 +                sl3p_to_info(v->arch.shadow_vtable);
 38.3583 +            ASSERT(test_bit(v->vcpu_id, &info->vcpus));
 38.3584 +            clear_bit(v->vcpu_id, &info->vcpus);
 38.3585 +        }
 38.3586 +#endif
 38.3587 +        v->arch.shadow_table = pagetable_null();
 38.3588 +    }
 38.3589 +
 38.3590 +    ////
 38.3591 +    //// vcpu->arch.shadow_vtable
 38.3592 +    ////
 38.3593 +    if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
 38.3594 +         v->arch.shadow_vtable )
 38.3595 +    {
 38.3596 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 38.3597 +        //
 38.3598 +        sh_unmap_domain_page_global(v->arch.shadow_vtable);
 38.3599 +        v->arch.shadow_vtable = NULL;
 38.3600 +    }
 38.3601 +}
 38.3602 +
 38.3603 +static void
 38.3604 +sh_update_cr3(struct vcpu *v)
 38.3605 +/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
 38.3606 + * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
 38.3607 + * if appropriate).
 38.3608 + * HVM guests should also set hvm_get_guest_ctrl_reg(v, 3)...
 38.3609 + */
 38.3610 +{
 38.3611 +    struct domain *d = v->domain;
 38.3612 +    mfn_t gmfn, smfn;
 38.3613 +#if GUEST_PAGING_LEVELS == 3
 38.3614 +    u32 guest_idx=0;
 38.3615 +#endif
 38.3616 +
 38.3617 +    ASSERT(shadow_lock_is_acquired(v->domain));
 38.3618 +    ASSERT(v->arch.shadow.mode);
 38.3619 +
 38.3620 +    ////
 38.3621 +    //// vcpu->arch.guest_table is already set
 38.3622 +    ////
 38.3623 +    
 38.3624 +#ifndef NDEBUG 
 38.3625 +    /* Double-check that the HVM code has sent us a sane guest_table */
 38.3626 +    if ( hvm_guest(v) )
 38.3627 +    {
 38.3628 +        gfn_t gfn;
 38.3629 +
 38.3630 +        ASSERT(shadow_mode_external(d));
 38.3631 +
 38.3632 +        // Is paging enabled on this vcpu?
 38.3633 +        if ( shadow_vcpu_mode_translate(v) )
 38.3634 +        {
 38.3635 +            gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
 38.3636 +            gmfn = vcpu_gfn_to_mfn(v, gfn);
 38.3637 +            ASSERT(valid_mfn(gmfn));
 38.3638 +            ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
 38.3639 +        } 
 38.3640 +        else 
 38.3641 +        {
 38.3642 +            /* Paging disabled: guest_table points at (part of) p2m */
 38.3643 +#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
 38.3644 +            /* For everything else, they should be the same */
 38.3645 +            ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
 38.3646 +#endif
 38.3647 +        }
 38.3648 +    }
 38.3649 +#endif
 38.3650 +
 38.3651 +    SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n",
 38.3652 +                   d->domain_id, v->vcpu_id, 
 38.3653 +                   (unsigned long)pagetable_get_pfn(v->arch.guest_table));
 38.3654 +
 38.3655 +#if GUEST_PAGING_LEVELS == 4
 38.3656 +    if ( !(v->arch.flags & TF_kernel_mode) )
 38.3657 +        gmfn = pagetable_get_mfn(v->arch.guest_table_user);
 38.3658 +    else
 38.3659 +#endif
 38.3660 +        gmfn = pagetable_get_mfn(v->arch.guest_table);
 38.3661 +
 38.3662 +    sh_detach_old_tables(v);
 38.3663 +
 38.3664 +    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
 38.3665 +    {
 38.3666 +        ASSERT(v->arch.cr3 == 0);
 38.3667 +        return;
 38.3668 +    }
 38.3669 +
 38.3670 +    ////
 38.3671 +    //// vcpu->arch.guest_vtable
 38.3672 +    ////
 38.3673 +    if ( shadow_mode_external(d) )
 38.3674 +    {
 38.3675 +#if GUEST_PAGING_LEVELS == 3
 38.3676 +        if ( shadow_vcpu_mode_translate(v) ) 
 38.3677 +            /* Paging enabled: find where in the page the l3 table is */
 38.3678 +            guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
 38.3679 +        else
 38.3680 +            /* Paging disabled: l3 is at the start of a page (in the p2m) */ 
 38.3681 +            guest_idx = 0; 
 38.3682 +
 38.3683 +        // Ignore the low 2 bits of guest_idx -- they are really just
 38.3684 +        // cache control.
 38.3685 +        guest_idx &= ~3;
 38.3686 +        // XXX - why does this need a global map?
 38.3687 +        v->arch.guest_vtable =
 38.3688 +            (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx;
 38.3689 +#else
 38.3690 +        // XXX - why does this need a global map?
 38.3691 +        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
 38.3692 +#endif
 38.3693 +    }
 38.3694 +    else
 38.3695 +    {
 38.3696 +#ifdef __x86_64__
 38.3697 +        v->arch.guest_vtable = __linear_l4_table;
 38.3698 +#elif GUEST_PAGING_LEVELS == 3
 38.3699 +        // XXX - why does this need a global map?
 38.3700 +        v->arch.guest_vtable = sh_map_domain_page_global(gmfn);
 38.3701 +#else
 38.3702 +        v->arch.guest_vtable = __linear_l2_table;
 38.3703 +#endif
 38.3704 +    }
 38.3705 +
 38.3706 +#if 0
 38.3707 +    printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
 38.3708 +           __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
 38.3709 +#endif
 38.3710 +
 38.3711 +    ////
 38.3712 +    //// vcpu->arch.shadow_table
 38.3713 +    ////
 38.3714 +    smfn = get_shadow_status(v, gmfn, PGC_SH_guest_root_type);
 38.3715 +    if ( valid_mfn(smfn) )
 38.3716 +    {
 38.3717 +        /* Pull this root shadow to the front of the list of roots. */
 38.3718 +        list_del(&mfn_to_page(smfn)->list);
 38.3719 +        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
 38.3720 +    }
 38.3721 +    else
 38.3722 +    {
 38.3723 +        /* This guest MFN is a pagetable.  Must revoke write access. */
 38.3724 +        if ( shadow_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) 
 38.3725 +             != 0 )
 38.3726 +            flush_tlb_mask(d->domain_dirty_cpumask); 
 38.3727 +        /* Make sure there's enough free shadow memory. */
 38.3728 +        shadow_prealloc(d, SHADOW_MAX_ORDER); 
 38.3729 +        /* Shadow the page. */
 38.3730 +        smfn = sh_make_shadow(v, gmfn, PGC_SH_guest_root_type);
 38.3731 +        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows);
 38.3732 +    }
 38.3733 +    ASSERT(valid_mfn(smfn));
 38.3734 +    v->arch.shadow_table = pagetable_from_mfn(smfn);
 38.3735 +
 38.3736 +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW
 38.3737 +    /* Once again OK to unhook entries from this table if we see fork/exit */
 38.3738 +    ASSERT(sh_mfn_is_a_page_table(gmfn));
 38.3739 +    mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings;
 38.3740 +#endif
 38.3741 +
 38.3742 +
 38.3743 +    ////
 38.3744 +    //// vcpu->arch.shadow_vtable
 38.3745 +    ////
 38.3746 +    if ( shadow_mode_external(d) )
 38.3747 +    {
 38.3748 +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
 38.3749 +        mfn_t adjusted_smfn = smfn;
 38.3750 +        u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
 38.3751 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 38.3752 +        v->arch.shadow_vtable =
 38.3753 +            (shadow_l3e_t *)sh_map_domain_page_global(adjusted_smfn) +
 38.3754 +            shadow_idx;
 38.3755 +#else
 38.3756 +        // Q: why does this need to use (un)map_domain_page_*global* ?
 38.3757 +        v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
 38.3758 +#endif
 38.3759 +    }
 38.3760 +    else
 38.3761 +    {
 38.3762 +#if SHADOW_PAGING_LEVELS == 4
 38.3763 +        v->arch.shadow_vtable = __sh_linear_l4_table;
 38.3764 +#elif GUEST_PAGING_LEVELS == 3
 38.3765 +        // XXX - why does this need a global map?
 38.3766 +        v->arch.shadow_vtable = sh_map_domain_page_global(smfn);
 38.3767 +#else
 38.3768 +        v->arch.shadow_vtable = __sh_linear_l2_table;
 38.3769 +#endif
 38.3770 +    }
 38.3771 +
 38.3772 +    ////
 38.3773 +    //// Take a ref to the new shadow table, and pin it.
 38.3774 +    ////
 38.3775 +    //
 38.3776 +    // This ref is logically "held" by v->arch.shadow_table entry itself.
 38.3777 +    // Release the old ref.
 38.3778 +    //
 38.3779 +#if GUEST_PAGING_LEVELS == 3
 38.3780 +    // PAE guests do not (necessarily) use an entire page for their
 38.3781 +    // 4-entry L3s, so we have to deal with them specially.
 38.3782 +    //
 38.3783 +    // XXX - might want to revisit this if/when we do multiple compilation for
 38.3784 +    //       HVM-vs-PV guests, as PAE PV guests could get away without doing
 38.3785 +    //       subshadows.
 38.3786 +    //
 38.3787 +    sh_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
 38.3788 +    sh_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
 38.3789 +#else
 38.3790 +    sh_get_ref(smfn, 0);
 38.3791 +    sh_pin(smfn);
 38.3792 +#endif
 38.3793 +
 38.3794 +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
 38.3795 +    // PAE 3-on-3 shadows have to keep track of which vcpus are using
 38.3796 +    // which l3 subshadow, in order to handle the SHADOW_SET_L3PAE_RECOPY
 38.3797 +    // case from validate_gl3e().  Search for SHADOW_SET_L3PAE_RECOPY
 38.3798 +    // in the code for more info.
 38.3799 +    //
 38.3800 +    {
 38.3801 +        struct pae_l3_bookkeeping *info =
 38.3802 +            sl3p_to_info(v->arch.shadow_vtable);
 38.3803 +        ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
 38.3804 +        set_bit(v->vcpu_id, &info->vcpus);
 38.3805 +    }
 38.3806 +#endif
 38.3807 +
 38.3808 +    debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
 38.3809 +                      __func__, gmfn, smfn);
 38.3810 +
 38.3811 +    ///
 38.3812 +    /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
 38.3813 +    ///
 38.3814 +    if ( shadow_mode_external(d) )
 38.3815 +    {
 38.3816 +        ASSERT(hvm_guest(v));
 38.3817 +        make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
 38.3818 +
 38.3819 +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
 38.3820 +#if SHADOW_PAGING_LEVELS != 3
 38.3821 +#error unexpected combination of GUEST and SHADOW paging levels
 38.3822 +#endif
 38.3823 +        /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
 38.3824 +        {
 38.3825 +            mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
 38.3826 +            int i;
 38.3827 +
 38.3828 +            ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
 38.3829 +                   virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
 38.3830 +            for (i = 0; i < 4; i++)
 38.3831 +            {
 38.3832 +                v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
 38.3833 +                    shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
 38.3834 +            }
 38.3835 +        }
 38.3836 +#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
 38.3837 +        /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
 38.3838 +         * If paging is disabled, clear l3e reserved bits; otherwise 
 38.3839 +         * remove entries that have reserved bits set. */
 38.3840 +        v->arch.hvm_vcpu.hw_cr3 =
 38.3841 +            hvm_pae_copy_root(v, v->arch.shadow_vtable, 
 38.3842 +                              !shadow_vcpu_mode_translate(v));
 38.3843 +#else
 38.3844 +        /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
 38.3845 +        v->arch.hvm_vcpu.hw_cr3 =
 38.3846 +            pagetable_get_paddr(v->arch.shadow_table);
 38.3847 +#endif
 38.3848 +    }
 38.3849 +    else // not shadow_mode_external...
 38.3850 +    {
 38.3851 +        /* We don't support PV except guest == shadow == config levels */
 38.3852 +        BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
 38.3853 +        make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
 38.3854 +    }
 38.3855 +
 38.3856 +    /* Fix up the linear pagetable mappings */
 38.3857 +    sh_update_linear_entries(v);
 38.3858 +}
 38.3859 +
 38.3860 +
 38.3861 +/**************************************************************************/
 38.3862 +/* Functions to revoke guest rights */
 38.3863 +
 38.3864 +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 38.3865 +static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
 38.3866 +/* Look up this vaddr in the current shadow and see if it's a writeable
 38.3867 + * mapping of this gmfn.  If so, remove it.  Returns 1 if it worked. */
 38.3868 +{
 38.3869 +    shadow_l1e_t sl1e, *sl1p;
 38.3870 +    shadow_l2e_t *sl2p;
 38.3871 +#if GUEST_PAGING_LEVELS >= 3
 38.3872 +    shadow_l3e_t *sl3p;
 38.3873 +#if GUEST_PAGING_LEVELS >= 4
 38.3874 +    shadow_l4e_t *sl4p;
 38.3875 +#endif
 38.3876 +#endif
 38.3877 +    mfn_t sl1mfn;
 38.3878 +
 38.3879 +
 38.3880 +    /* Carefully look in the shadow linear map for the l1e we expect */
 38.3881 +    if ( v->arch.shadow_vtable == NULL ) return 0;
 38.3882 +#if GUEST_PAGING_LEVELS >= 4
 38.3883 +    sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
 38.3884 +    if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
 38.3885 +        return 0;
 38.3886 +    sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
 38.3887 +    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
 38.3888 +        return 0;
 38.3889 +#elif GUEST_PAGING_LEVELS == 3
 38.3890 +    sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) 
 38.3891 +        + shadow_l3_linear_offset(vaddr);
 38.3892 +    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
 38.3893 +        return 0;
 38.3894 +#endif
 38.3895 +    sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
 38.3896 +    if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
 38.3897 +        return 0;
 38.3898 +    sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
 38.3899 +    sl1e = *sl1p;
 38.3900 +    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
 38.3901 +          != (_PAGE_PRESENT|_PAGE_RW))
 38.3902 +         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
 38.3903 +        return 0;
 38.3904 +
 38.3905 +    /* Found it!  Need to remove its write permissions. */
 38.3906 +    sl1mfn = shadow_l2e_get_mfn(*sl2p);
 38.3907 +    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
 38.3908 +    shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
 38.3909 +    return 1;
 38.3910 +}
 38.3911 +#endif
 38.3912 +
 38.3913 +int sh_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
 38.3914 +/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
 38.3915 +{
 38.3916 +    shadow_l1e_t *sl1e;
 38.3917 +    int done = 0;
 38.3918 +    int flags;
 38.3919 +    
 38.3920 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, 
 38.3921 +    {
 38.3922 +        flags = shadow_l1e_get_flags(*sl1e);
 38.3923 +        if ( (flags & _PAGE_PRESENT) 
 38.3924 +             && (flags & _PAGE_RW) 
 38.3925 +             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
 38.3926 +        {
 38.3927 +            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 38.3928 +            if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
 38.3929 +                  & PGT_count_mask) == 0 )
 38.3930 +                /* This breaks us cleanly out of the FOREACH macro */
 38.3931 +                done = 1;
 38.3932 +        }
 38.3933 +    });
 38.3934 +    return done;
 38.3935 +}
 38.3936 +
 38.3937 +
 38.3938 +int sh_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
 38.3939 +/* Excises all mappings of the guest frame from this shadow l1 table */
 38.3940 +{
 38.3941 +    shadow_l1e_t *sl1e;
 38.3942 +    int done = 0;
 38.3943 +    int flags;
 38.3944 +    
 38.3945 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, 
 38.3946 +    {
 38.3947 +        flags = shadow_l1e_get_flags(*sl1e);
 38.3948 +        if ( (flags & _PAGE_PRESENT) 
 38.3949 +             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) )
 38.3950 +        {
 38.3951 +            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
 38.3952 +            if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
 38.3953 +                /* This breaks us cleanly out of the FOREACH macro */
 38.3954 +                done = 1;
 38.3955 +        }
 38.3956 +    });
 38.3957 +    return done;
 38.3958 +}
 38.3959 +
 38.3960 +/**************************************************************************/
 38.3961 +/* Functions to excise all pointers to shadows from higher-level shadows. */
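         +/* Each helper below scans one higher-level shadow and blanks any entry
         + * that points at the given lower-level shadow; they are typically
         + * called while that lower-level shadow is being destroyed, so that no
         + * stale references to it remain. */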
 38.3962 +
 38.3963 +void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
 38.3964 +/* Blank out a single shadow entry */
 38.3965 +{
 38.3966 +    switch (mfn_to_page(smfn)->count_info & PGC_SH_type_mask) 
 38.3967 +    {
 38.3968 +    case PGC_SH_l1_shadow:
 38.3969 +        shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
 38.3970 +    case PGC_SH_l2_shadow:
 38.3971 +#if GUEST_PAGING_LEVELS == 3
 38.3972 +    case PGC_SH_l2h_shadow:
 38.3973 +#endif
 38.3974 +        shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
 38.3975 +#if GUEST_PAGING_LEVELS >= 3
 38.3976 +    case PGC_SH_l3_shadow:
 38.3977 +        shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
 38.3978 +#if GUEST_PAGING_LEVELS >= 4
 38.3979 +    case PGC_SH_l4_shadow:
 38.3980 +        shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
 38.3981 +#endif
 38.3982 +#endif
 38.3983 +    default: BUG(); /* Called with the wrong kind of shadow. */
 38.3984 +    }
 38.3985 +}
 38.3986 +
 38.3987 +int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
 38.3988 +/* Remove all mappings of this l1 shadow from this l2 shadow */
 38.3989 +{
 38.3990 +    shadow_l2e_t *sl2e;
 38.3991 +    int done = 0;
 38.3992 +    int flags;
 38.3993 +#if GUEST_PAGING_LEVELS != 4
 38.3994 +    int xen_mappings = !shadow_mode_external(v->domain);
 38.3995 +#endif
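         +    /* The 64-bit variant of SHADOW_FOREACH_L2E does not use its
         +     * xen_mappings argument, so the variable only needs to exist for
         +     * the other guest levels. */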
 38.3996 +    
 38.3997 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings, 
 38.3998 +    {
 38.3999 +        flags = shadow_l2e_get_flags(*sl2e);
 38.4000 +        if ( (flags & _PAGE_PRESENT) 
 38.4001 +             && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
 38.4002 +        {
 38.4003 +            shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
 38.4004 +            if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH_type_mask) == 0 )
 38.4005 +                /* This breaks us cleanly out of the FOREACH macro */
 38.4006 +                done = 1;
 38.4007 +        }
 38.4008 +    });
 38.4009 +    return done;
 38.4010 +}
 38.4011 +
 38.4012 +#if GUEST_PAGING_LEVELS >= 3
 38.4013 +int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
 38.4014 +/* Remove all mappings of this l2 shadow from this l3 shadow */
 38.4015 +{
 38.4016 +    shadow_l3e_t *sl3e;
 38.4017 +    int done = 0;
 38.4018 +    int flags;
 38.4019 +    
 38.4020 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done, 
 38.4021 +    {
 38.4022 +        flags = shadow_l3e_get_flags(*sl3e);
 38.4023 +        if ( (flags & _PAGE_PRESENT) 
 38.4024 +             && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
 38.4025 +        {
 38.4026 +            shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
 38.4027 +            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) == 0 )
 38.4028 +                /* This breaks us cleanly out of the FOREACH macro */
 38.4029 +                done = 1;
 38.4030 +        }
 38.4031 +    });
 38.4032 +    return done;
 38.4033 +}
 38.4034 +
 38.4035 +#if GUEST_PAGING_LEVELS >= 4
 38.4036 +int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
 38.4037 +/* Remove all mappings of this l3 shadow from this l4 shadow */
 38.4038 +{
 38.4039 +    shadow_l4e_t *sl4e;
 38.4040 +    int done = 0;
 38.4041 +    int flags, xen_mappings = !shadow_mode_external(v->domain);
 38.4042 +    
 38.4043 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
 38.4044 +    {
 38.4045 +        flags = shadow_l4e_get_flags(*sl4e);
 38.4046 +        if ( (flags & _PAGE_PRESENT) 
 38.4047 +             && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
 38.4048 +        {
 38.4049 +            shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
 38.4050 +            if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH_type_mask) == 0 )
 38.4051 +                /* This breaks us cleanly out of the FOREACH macro */
 38.4052 +                done = 1;
 38.4053 +        }
 38.4054 +    });
 38.4055 +    return done;
 38.4056 +}
 38.4057 +#endif /* 64bit guest */ 
 38.4058 +#endif /* PAE guest */
 38.4059 +
 38.4060 +/**************************************************************************/
 38.4061 +/* Handling HVM guest writes to pagetables  */
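         +/* Writes by an HVM guest to one of its pagetables trap to Xen and are
         + * emulated.  The routines below perform the actual write (or cmpxchg)
         + * on the guest page and then call shadow_validate_guest_pt_write() to
         + * bring the affected shadow entries back into sync.  A write of zeroes
         + * is also fed to check_for_early_unshadow(), as a hint that the guest
         + * may have stopped using the page as a pagetable. */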
 38.4062 +
 38.4063 +/* Check that the guest is allowed to perform this write (the mapping is
 38.4064 + * present, writeable, and user-accessible if the access is from ring 3).
 38.4065 + * Returns a mapped pointer to write to, and the mfn it's on, or NULL. */
 38.4066 +static inline void * emulate_map_dest(struct vcpu *v,
 38.4067 +                                      unsigned long vaddr,
 38.4068 +                                      struct x86_emulate_ctxt *ctxt,
 38.4069 +                                      mfn_t *mfnp)
 38.4070 +{
 38.4071 +    walk_t gw;
 38.4072 +    u32 flags;
 38.4073 +    gfn_t gfn;
 38.4074 +    mfn_t mfn;
 38.4075 +
 38.4076 +    guest_walk_tables(v, vaddr, &gw, 1);
 38.4077 +    flags = accumulate_guest_flags(&gw);
 38.4078 +    gfn = guest_l1e_get_gfn(gw.eff_l1e);
 38.4079 +    mfn = vcpu_gfn_to_mfn(v, gfn);
 38.4080 +    sh_audit_gw(v, &gw);
 38.4081 +    unmap_walk(v, &gw);
 38.4082 +
 38.4083 +    if ( !(flags & _PAGE_PRESENT) 
 38.4084 +         || !(flags & _PAGE_RW) 
 38.4085 +         || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
 38.4086 +    {
 38.4087 +        /* This write would have faulted even on bare metal */
 38.4088 +        v->arch.shadow.propagate_fault = 1;
 38.4089 +        return NULL;
 38.4090 +    }
 38.4091 +    
 38.4092 +    if ( !valid_mfn(mfn) )
 38.4093 +    {
 38.4094 +        /* Attempted a write to a bad gfn.  This should never happen:
 38.4095 +         * after all, we're here because this write is to a page table. */
 38.4096 +        BUG();
 38.4097 +    }
 38.4098 +
 38.4099 +    ASSERT(sh_mfn_is_a_page_table(mfn));
 38.4100 +    *mfnp = mfn;
 38.4101 +    return sh_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
 38.4102 +}
 38.4103 +
 38.4104 +int
 38.4105 +sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
 38.4106 +                      u32 bytes, struct x86_emulate_ctxt *ctxt)
 38.4107 +{
 38.4108 +    ASSERT(shadow_lock_is_acquired(v->domain));
 38.4109 +    while ( bytes > 0 )
 38.4110 +    {
 38.4111 +        mfn_t mfn;
 38.4112 +        int bytes_on_page;
 38.4113 +        void *addr;
 38.4114 +
 38.4115 +        bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
 38.4116 +        if ( bytes_on_page > bytes )
 38.4117 +            bytes_on_page = bytes;
 38.4118 +
 38.4119 +        if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
 38.4120 +            return X86EMUL_PROPAGATE_FAULT;
 38.4121 +        memcpy(addr, src, bytes_on_page);
 38.4122 +        shadow_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
 38.4123 +        bytes -= bytes_on_page;
         +        /* Advance in case this write crosses a page boundary and the
         +         * loop has to map and write the next page as well. */
         +        vaddr += bytes_on_page;
         +        src = (u8 *)src + bytes_on_page;
 38.4124 +        /* If we are writing zeros to this page, might want to unshadow */
 38.4125 +        if ( *(u8 *)addr == 0 )
 38.4126 +            check_for_early_unshadow(v, mfn);
 38.4127 +        sh_unmap_domain_page(addr);
 38.4128 +    }
 38.4129 +    shadow_audit_tables(v);
 38.4130 +    return X86EMUL_CONTINUE;
 38.4131 +}
 38.4132 +
 38.4133 +int
 38.4134 +sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, 
 38.4135 +                        unsigned long old, unsigned long new,
 38.4136 +                        unsigned int bytes, struct x86_emulate_ctxt *ctxt)
 38.4137 +{
 38.4138 +    mfn_t mfn;
 38.4139 +    void *addr;
 38.4140 +    unsigned long prev;
 38.4141 +    int rv = X86EMUL_CONTINUE;
 38.4142 +
 38.4143 +    ASSERT(shadow_lock_is_acquired(v->domain));
 38.4144 +    ASSERT(bytes <= sizeof (unsigned long));
 38.4145 +
 38.4146 +    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
 38.4147 +        return X86EMUL_PROPAGATE_FAULT;
 38.4148 +
 38.4149 +    switch (bytes) 
 38.4150 +    {
 38.4151 +    case 1: prev = cmpxchg(((u8 *)addr), old, new);  break;
 38.4152 +    case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
 38.4153 +    case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
 38.4154 +    case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
 38.4155 +    default:
 38.4156 +        SHADOW_PRINTK("cmpxchg of size %u is not supported\n", bytes);
 38.4157 +        prev = ~old;
 38.4158 +    }
 38.4159 +
 38.4160 +    if ( prev == old )
 38.4161 +        shadow_validate_guest_pt_write(v, mfn, addr, bytes);
 38.4162 +    else
 38.4163 +        rv = X86EMUL_CMPXCHG_FAILED;
 38.4164 +
 38.4165 +    SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
 38.4166 +                  " wanted %#lx now %#lx bytes %u\n",
 38.4167 +                  vaddr, prev, old, new, *(unsigned long *)addr, bytes);
 38.4168 +
 38.4169 +    /* If we are writing zeros to this page, might want to unshadow */
 38.4170 +    if ( *(u8 *)addr == 0 )
 38.4171 +        check_for_early_unshadow(v, mfn);
 38.4172 +
 38.4173 +    sh_unmap_domain_page(addr);
 38.4174 +    shadow_audit_tables(v);
 38.4176 +    return rv;
 38.4177 +}
 38.4178 +
 38.4179 +int
 38.4180 +sh_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr, 
 38.4181 +                          unsigned long old_lo, unsigned long old_hi,
 38.4182 +                          unsigned long new_lo, unsigned long new_hi,
 38.4183 +                          struct x86_emulate_ctxt *ctxt)
 38.4184 +{
 38.4185 +    mfn_t mfn;
 38.4186 +    void *addr;
 38.4187 +    u64 old, new, prev;
 38.4188 +    int rv = X86EMUL_CONTINUE;
 38.4189 +
 38.4190 +    ASSERT(shadow_lock_is_acquired(v->domain));
 38.4191 +
 38.4192 +    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
 38.4193 +        return X86EMUL_PROPAGATE_FAULT;
 38.4194 +
 38.4195 +    old = (((u64) old_hi) << 32) | (u64) old_lo;
 38.4196 +    new = (((u64) new_hi) << 32) | (u64) new_lo;
 38.4197 +    prev = cmpxchg(((u64 *)addr), old, new);
 38.4198 +
 38.4199 +    if ( prev == old )
 38.4200 +        shadow_validate_guest_pt_write(v, mfn, addr, 8);
 38.4201 +    else
 38.4202 +        rv = X86EMUL_CMPXCHG_FAILED;
 38.4203 +
 38.4204 +    /* If we are writing zeros to this page, might want to unshadow */
 38.4205 +    if ( *(u8 *)addr == 0 )
 38.4206 +        check_for_early_unshadow(v, mfn);
 38.4207 +
 38.4208 +    sh_unmap_domain_page(addr);
 38.4209 +    shadow_audit_tables(v);
 38.4211 +    return rv;
 38.4212 +}
 38.4213 +
 38.4214 +
 38.4215 +/**************************************************************************/
 38.4216 +/* Audit tools */
 38.4217 +
 38.4218 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
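         +/* AUDIT_FAIL relies on token-pasting: each audit function declares
         + * local variables named gl<N>e, sl<N>e, gl<N>mfn, sl<N>mfn and done,
         + * and the macro reaches them by gluing its _level argument into those
         + * names. */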
 38.4219 +
 38.4220 +#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
 38.4221 +    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
 38.4222 +           "gl" #_level "mfn = %" SH_PRI_mfn                              \
 38.4223 +           " sl" #_level "mfn = %" SH_PRI_mfn                             \
 38.4224 +           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
 38.4225 +           " gl" #_level "e = %" SH_PRI_gpte                              \
 38.4226 +           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
 38.4227 +           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
 38.4228 +           _level, guest_index(gl ## _level ## e),                         \
 38.4229 +           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
 38.4230 +           gl ## _level ## e, sl ## _level ## e,                           \
 38.4231 +           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
 38.4232 +           ##_a);                                                          \
 38.4233 +    BUG();                                                                 \
 38.4234 +    done = 1;                                                              \
 38.4235 +} while (0)
 38.4236 +
 38.4237 +
 38.4238 +static char * sh_audit_flags(struct vcpu *v, int level,
 38.4239 +                              int gflags, int sflags) 
 38.4240 +/* Common code for auditing flag bits */
 38.4241 +{
 38.4242 +    if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
 38.4243 +        return "shadow is present but guest is not present";
 38.4244 +    if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) ) 
 38.4245 +        return "global bit set in PV shadow";
 38.4246 +    if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
 38.4247 +         && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) ) 
 38.4248 +        return "dirty bit not propagated";
 38.4249 +    if ( level == 2 && (sflags & _PAGE_PSE) )
 38.4250 +        return "PS bit set in shadow";
 38.4251 +#if SHADOW_PAGING_LEVELS == 3
 38.4252 +    if ( level == 3 ) return NULL; /* All the other bits are blank in PAE l3 */
 38.4253 +#endif
 38.4254 +    if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) 
 38.4255 +        return "user/supervisor bit does not match";
 38.4256 +    if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) 
 38.4257 +        return "NX bit does not match";
 38.4258 +    if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) 
 38.4259 +        return "shadow grants write access but guest does not";
 38.4260 +    if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) ) 
 38.4261 +        return "accessed bit not propagated";
 38.4262 +    return NULL;
 38.4263 +}
 38.4264 +
 38.4265 +static inline mfn_t
 38.4266 +audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
 38.4267 +/* Convert this gfn to an mfn in the manner appropriate for the
 38.4268 + * guest pagetable it's used in (gmfn) */ 
 38.4269 +{
 38.4270 +    if ( !shadow_mode_translate(v->domain) )
 38.4271 +        return _mfn(gfn_x(gfn));
 38.4272 +    
 38.4273 +    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
 38.4274 +         != PGT_writable_page ) 
 38.4275 +        return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
 38.4276 +    else 
 38.4277 +        return sh_gfn_to_mfn(v->domain, gfn_x(gfn));
 38.4278 +} 
 38.4279 +
 38.4280 +
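         +/* The audit functions recover the guest table behind each shadow by
         + * following its "backpointer": the shadow page's type_info field holds
         + * the mfn of the guest page that it shadows. */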
 38.4281 +int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
 38.4282 +{
 38.4283 +    guest_l1e_t *gl1e, *gp;
 38.4284 +    shadow_l1e_t *sl1e;
 38.4285 +    mfn_t mfn, gmfn, gl1mfn;
 38.4286 +    gfn_t gfn;
 38.4287 +    char *s;
 38.4288 +    int done = 0;
 38.4289 +
 38.4290 +    /* Follow the backpointer */
 38.4291 +    gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
 38.4292 +    gl1e = gp = sh_map_domain_page(gl1mfn);
 38.4293 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 38.4294 +
 38.4295 +        s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
 38.4296 +                            shadow_l1e_get_flags(*sl1e));
 38.4297 +        if ( s ) AUDIT_FAIL(1, "%s", s);
 38.4298 +
 38.4299 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 38.4300 +        {
 38.4301 +            gfn = guest_l1e_get_gfn(*gl1e);
 38.4302 +            mfn = shadow_l1e_get_mfn(*sl1e);
 38.4303 +            gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
 38.4304 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 38.4305 +                AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
 38.4306 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 38.4307 +                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
 38.4308 +        }
 38.4309 +    });
 38.4310 +    sh_unmap_domain_page(gp);
 38.4311 +    return done;
 38.4312 +}
 38.4313 +
 38.4314 +int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
 38.4315 +{
 38.4316 +    guest_l1e_t *gl1e, e;
 38.4317 +    shadow_l1e_t *sl1e;
 38.4318 +    mfn_t gl1mfn = _mfn(INVALID_MFN);
 38.4319 +    int f;
 38.4320 +    int done = 0;
 38.4321 +
 38.4322 +    /* fl1 has no useful backpointer: all we can check are flags */
 38.4323 +    e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
 38.4324 +    SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
 38.4325 +        f = shadow_l1e_get_flags(*sl1e);
 38.4326 +        f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
 38.4327 +        if ( !(f == 0 
 38.4328 +               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
 38.4329 +                        _PAGE_ACCESSED|_PAGE_DIRTY) 
 38.4330 +               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
 38.4331 +            AUDIT_FAIL(1, "fl1e has bad flags");
 38.4332 +    });
 38.4333 +    return 0;
 38.4334 +}
 38.4335 +
 38.4336 +int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
 38.4337 +{
 38.4338 +    guest_l2e_t *gl2e, *gp;
 38.4339 +    shadow_l2e_t *sl2e;
 38.4340 +    mfn_t mfn, gmfn, gl2mfn;
 38.4341 +    gfn_t gfn;
 38.4342 +    char *s;
 38.4343 +    int done = 0;
 38.4344 +#if GUEST_PAGING_LEVELS != 4
 38.4345 +    int xen_mappings = !shadow_mode_external(v->domain);
 38.4346 +#endif
 38.4347 +
 38.4348 +    /* Follow the backpointer */
 38.4349 +    gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
 38.4350 +    gl2e = gp = sh_map_domain_page(gl2mfn);
 38.4351 +    SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
 38.4352 +
 38.4353 +        s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
 38.4354 +                            shadow_l2e_get_flags(*sl2e));
 38.4355 +        if ( s ) AUDIT_FAIL(2, "%s", s);
 38.4356 +
 38.4357 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 38.4358 +        {
 38.4359 +            gfn = guest_l2e_get_gfn(*gl2e);
 38.4360 +            mfn = shadow_l2e_get_mfn(*sl2e);
 38.4361 +            gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)  
 38.4362 +                ? get_fl1_shadow_status(v, gfn)
 38.4363 +                : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), 
 38.4364 +                                    PGC_SH_l1_shadow);
 38.4365 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 38.4366 +                AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
 38.4367 +                           " (--> %" SH_PRI_mfn ")"
 38.4368 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 38.4369 +                           gfn_x(gfn), 
 38.4370 +                           (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
 38.4371 +                           : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
 38.4372 +                           mfn_x(gmfn), mfn_x(mfn));
 38.4373 +        }
 38.4374 +    });
 38.4375 +    sh_unmap_domain_page(gp);
 38.4376 +    return 0;
 38.4377 +}
 38.4378 +
 38.4379 +#if GUEST_PAGING_LEVELS >= 3
 38.4380 +int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
 38.4381 +{
 38.4382 +    guest_l3e_t *gl3e, *gp;
 38.4383 +    shadow_l3e_t *sl3e;
 38.4384 +    mfn_t mfn, gmfn, gl3mfn;
 38.4385 +    gfn_t gfn;
 38.4386 +    char *s;
 38.4387 +    int done = 0;
 38.4388 +
 38.4389 +    /* Follow the backpointer */
 38.4390 +    gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
 38.4391 +    gl3e = gp = sh_map_domain_page(gl3mfn);
 38.4392 +    SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
 38.4393 +
 38.4394 +        s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
 38.4395 +                            shadow_l3e_get_flags(*sl3e));
 38.4396 +        if ( s ) AUDIT_FAIL(3, "%s", s);
 38.4397 +
 38.4398 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 38.4399 +        {
 38.4400 +            gfn = guest_l3e_get_gfn(*gl3e);
 38.4401 +            mfn = shadow_l3e_get_mfn(*sl3e);
 38.4402 +            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), 
 38.4403 +                                     (GUEST_PAGING_LEVELS == 3 
 38.4404 +                                      && !shadow_mode_external(v->domain)
 38.4405 +                                      && (guest_index(gl3e) % 4) == 3)
 38.4406 +                                     ? PGC_SH_l2h_pae_shadow
 38.4407 +                                     : PGC_SH_l2_shadow);
 38.4408 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 38.4409 +                AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn
 38.4410 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 38.4411 +                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
 38.4412 +        }
 38.4413 +    });
 38.4414 +    sh_unmap_domain_page(gp);
 38.4415 +    return 0;
 38.4416 +}
 38.4417 +#endif /* GUEST_PAGING_LEVELS >= 3 */
 38.4418 +
 38.4419 +#if GUEST_PAGING_LEVELS >= 4
 38.4420 +int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
 38.4421 +{
 38.4422 +    guest_l4e_t *gl4e, *gp;
 38.4423 +    shadow_l4e_t *sl4e;
 38.4424 +    mfn_t mfn, gmfn, gl4mfn;
 38.4425 +    gfn_t gfn;
 38.4426 +    char *s;
 38.4427 +    int done = 0;
 38.4428 +    int xen_mappings = !shadow_mode_external(v->domain);
 38.4429 +
 38.4430 +    /* Follow the backpointer */
 38.4431 +    gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
 38.4432 +    gl4e = gp = sh_map_domain_page(gl4mfn);
 38.4433 +    SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
 38.4434 +    {
 38.4435 +        s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
 38.4436 +                            shadow_l4e_get_flags(*sl4e));
 38.4437 +        if ( s ) AUDIT_FAIL(4, "%s", s);
 38.4438 +
 38.4439 +        if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS )
 38.4440 +        {
 38.4441 +            gfn = guest_l4e_get_gfn(*gl4e);
 38.4442 +            mfn = shadow_l4e_get_mfn(*sl4e);
 38.4443 +            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), 
 38.4444 +                                     PGC_SH_l3_shadow);
 38.4445 +            if ( mfn_x(gmfn) != mfn_x(mfn) )
 38.4446 +                AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
 38.4447 +                           " --> %" SH_PRI_mfn " != mfn %" SH_PRI_mfn "\n",
 38.4448 +                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
 38.4449 +        }
 38.4450 +    });
 38.4451 +    sh_unmap_domain_page(gp);
 38.4452 +    return 0;
 38.4453 +}
 38.4454 +#endif /* GUEST_PAGING_LEVELS >= 4 */
 38.4455 +
 38.4456 +
 38.4457 +#undef AUDIT_FAIL
 38.4458 +
 38.4459 +#endif /* Audit code */
 38.4460 +
 38.4461 +/**************************************************************************/
 38.4462 +/* Entry points into this mode of the shadow code.
 38.4463 + * This will all be mangled by the preprocessor to uniquify everything. */
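         +/* (Each GUEST_PAGING_LEVELS/SHADOW_PAGING_LEVELS pair compiles this
         + * file separately, presumably with the renaming macros in
         + * shadow2-types.h giving sh_paging_mode and the sh_* entry points a
         + * distinct per-mode name.) */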
 38.4464 +struct shadow_paging_mode sh_paging_mode = {
 38.4465 +    .page_fault             = sh_page_fault, 
 38.4466 +    .invlpg                 = sh_invlpg,
 38.4467 +    .gva_to_gpa             = sh_gva_to_gpa,
 38.4468 +    .gva_to_gfn             = sh_gva_to_gfn,
 38.4469 +    .update_cr3             = sh_update_cr3,
 38.4470 +    .map_and_validate_gl1e  = sh_map_and_validate_gl1e,
 38.4471 +    .map_and_validate_gl2e  = sh_map_and_validate_gl2e,
 38.4472 +    .map_and_validate_gl2he = sh_map_and_validate_gl2he,
 38.4473 +    .map_and_validate_gl3e  = sh_map_and_validate_gl3e,
 38.4474 +    .map_and_validate_gl4e  = sh_map_and_validate_gl4e,
 38.4475 +    .detach_old_tables      = sh_detach_old_tables,
 38.4476 +    .x86_emulate_write      = sh_x86_emulate_write,
 38.4477 +    .x86_emulate_cmpxchg    = sh_x86_emulate_cmpxchg,
 38.4478 +    .x86_emulate_cmpxchg8b  = sh_x86_emulate_cmpxchg8b,
 38.4479 +    .make_monitor_table     = sh_make_monitor_table,
 38.4480 +    .destroy_monitor_table  = sh_destroy_monitor_table,