ia64/xen-unstable

changeset 2446:309c1fb27f87

bitkeeper revision 1.1159.1.127 (413cb655B-pXolV76jppJJyH2EFWkw)

Merge labyrinth.cl.cam.ac.uk:/auto/groups/xeno/users/cl349/BK/xeno.bk-nbsd
into labyrinth.cl.cam.ac.uk:/local/scratch/cl349/xeno.bk-nbsd
author cl349@labyrinth.cl.cam.ac.uk
date Mon Sep 06 19:11:17 2004 +0000 (2004-09-06)
parents d9439135569e 60e68411ab8a
children 924a0fb0c200
files .rootkeys BitKeeper/etc/ignore BitKeeper/etc/logging_ok Makefile netbsd-2.0-xen-sparse/Makefile netbsd-2.0-xen-sparse/mkbuildtree netbsd-2.0-xen-sparse/nbconfig-xen netbsd-2.0-xen-sparse/nbmake-xen netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/include/ctrl_if.h netbsd-2.0-xen-sparse/sys/arch/xen/include/evtchn.h netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/ctrl_if.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/evtchn.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c netbsd-2.0-xen-sparse/sys/nfs/files.nfs
line diff
     1.1 --- a/.rootkeys	Mon Sep 06 18:48:13 2004 +0000
     1.2 +++ b/.rootkeys	Mon Sep 06 19:11:17 2004 +0000
     1.3 @@ -259,6 +259,41 @@ 4124f66f4NaKNa0xPiGGykn9QaZk3w linux-2.6
     1.4  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.8.1-xen-sparse/mkbuildtree
     1.5  412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.8.1-xen-sparse/mm/memory.c
     1.6  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.8.1-xen-sparse/mm/page_alloc.c
     1.7 +413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
     1.8 +413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
     1.9 +413cb1e5kY_Zil7-b0kI6hvCIxBEYg netbsd-2.0-xen-sparse/nbconfig-xen
    1.10 +413cb1e5-58q5doPifcE1Q8ZAgm-JQ netbsd-2.0-xen-sparse/nbmake-xen
    1.11 +413cb3b3Cmp02Gj87f3wwu2W9y0gBg netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
    1.12 +413cb3b3aUP9GmUWqHWQ2SRp1qXnqQ netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
    1.13 +413cb3b3pZuLKElEpQwX1C-3hLW4qA netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
    1.14 +413cb3b34ui1cCGaSqIeLiBgMp-PDw netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
    1.15 +413cb3b3i11i2GVGn0YGlRbM3ifbPQ netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
    1.16 +413cb3b3FgMboWw-Pm3XdbBFSlZl_g netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
    1.17 +413cb3b4ABCSfkHRmbsWfnZNG28nBA netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
    1.18 +413cb3b4bvVJ7UlliMSH60J4uIb9kA netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
    1.19 +413cb3b4aKd9SUY-OzUiTF0Gb9ve9w netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
    1.20 +413cb3b4jUtWl-sP493PvB27o-Iltw netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
    1.21 +413cb3b4ElwwoJEmmzflV0HgK5Qxcg netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
    1.22 +413cb564XpMxewOF9BCK37BNcDewHQ netbsd-2.0-xen-sparse/sys/arch/xen/include/ctrl_if.h
    1.23 +413cb564rB0n4HPqzYQxBvfR9r-KeQ netbsd-2.0-xen-sparse/sys/arch/xen/include/evtchn.h
    1.24 +413cb3b4k9OVRCxuSdhKt-2baTp_Yg netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
    1.25 +413cb3b4bRsqiHQLTKEZk4-zOksf8A netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
    1.26 +413cb3b4OqY83qI8GztIZGADpvrpSw netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
    1.27 +413cb3b42GG0LffraTnpZKlSUq57wg netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
    1.28 +413cb3b4F0ArkWVBRyspkw7ivfXihg netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
    1.29 +413cb3b4ullQud70n4JClwoEEUBh8Q netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
    1.30 +413cb3b4y1Ffq8BOhbdSpn-fGmKuEg netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
    1.31 +413cb3b4uXOFcT56QuLt1fcDrB-4Zg netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
    1.32 +413cb3b4hIffjrKn3zhVqJmH6ueB3Q netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
    1.33 +413cb564SakPue2EEm4MTtRb4z5JVw netbsd-2.0-xen-sparse/sys/arch/xen/xen/ctrl_if.c
    1.34 +413cb564uNQuIozl7hperSVK9EeDCA netbsd-2.0-xen-sparse/sys/arch/xen/xen/evtchn.c
    1.35 +413cb3b4eNdRIasCoQIuX4Nu39Dlqw netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
    1.36 +413cb3b40DLJLbX_ZUIULB0JFjBuaw netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
    1.37 +413cb3b46JnvK1UurZAubeQoFg1W-w netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
    1.38 +413cb3b5rIKB3TbyhK3pbNyVkYysqA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
    1.39 +413cb3b5eKxnzoodEqaWn2wrPnHWnA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
    1.40 +413cb3b5F56TvQWAmO5TsuzhtzLFPQ netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
    1.41 +413cb3b53nyOv1OIeDSsCXhBFDXvJA netbsd-2.0-xen-sparse/sys/nfs/files.nfs
    1.42  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.43  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
    1.44  4124b307nRyK3dhn1hAsvrY76NuV3g tools/check/Makefile
     2.1 --- a/BitKeeper/etc/ignore	Mon Sep 06 18:48:13 2004 +0000
     2.2 +++ b/BitKeeper/etc/ignore	Mon Sep 06 19:11:17 2004 +0000
     2.3 @@ -32,6 +32,9 @@ linux-2.4.26.tar.gz
     2.4  linux-2.6.7-xenU
     2.5  linux-2.6.7.tar.bz2
     2.6  linux-xen-sparse
     2.7 +netbsd-*-xen0
     2.8 +netbsd-*-xenU
     2.9 +netbsd-*-tools
    2.10  patches/*
    2.11  tools/*/build/lib*/*.py
    2.12  tools/balloon/balloon
     3.1 --- a/BitKeeper/etc/logging_ok	Mon Sep 06 18:48:13 2004 +0000
     3.2 +++ b/BitKeeper/etc/logging_ok	Mon Sep 06 19:11:17 2004 +0000
     3.3 @@ -10,6 +10,7 @@ br260@br260.wolfson.cam.ac.uk
     3.4  br260@labyrinth.cl.cam.ac.uk
     3.5  br260@laudney.cl.cam.ac.uk
     3.6  cl349@freefall.cl.cam.ac.uk
     3.7 +cl349@labyrinth.cl.cam.ac.uk
     3.8  djm@kirby.fc.hp.com
     3.9  gm281@boulderdash.cl.cam.ac.uk
    3.10  iap10@freefall.cl.cam.ac.uk
     4.1 --- a/Makefile	Mon Sep 06 18:48:13 2004 +0000
     4.2 +++ b/Makefile	Mon Sep 06 19:11:17 2004 +0000
     4.3 @@ -103,6 +103,54 @@ linux-xen%:
     4.4  	    modules_install
     4.5  	$(MAKE) -C $(BDIR) ARCH=xen INSTALL_PATH=$(INSTALL_DIR) install
     4.6  
     4.7 +
     4.8 +NETBSD_RELEASE   ?= 2.0
     4.9 +NETBSD_VER       ?= $(shell ( /bin/ls -ld netbsd-$(NETBSD_RELEASE)*-xen-sparse ) 2>/dev/null | \
    4.10 +		      sed -e 's!^.*netbsd-\(.\+\)-xen-sparse!\1!' )
    4.11 +NETBSD_CVSSNAP   ?= 20040906
    4.12 +NETBSD_SRC_PATH  ?= .:..
    4.13 +NETBSD_SRC       ?= $(firstword $(foreach dir,$(subst :, ,$(NETBSD_SRC_PATH)),\
    4.14 +                    $(wildcard $(dir)/netbsd-$(NETBSD_VER)-xen-kernel-$(NETBSD_CVSSNAP).tar.*z*)))
    4.15 +NETBSD_TOOLS_SRC ?= $(firstword $(foreach dir,$(subst :, ,$(NETBSD_SRC_PATH)),\
    4.16 +                    $(wildcard $(dir)/netbsd-$(NETBSD_VER)-tools.tar.*z*)))
    4.17 +
    4.18 +NETBSD_TREES := netbsd-$(NETBSD_VER)-xenU
    4.19 +
    4.20 +pristine-netbsd-src: 
    4.21 +ifeq ($(NETBSD_SRC),)
    4.22 +	@echo "Cannot find netbsd-$(NETBSD_VER)-xen-kernel-$(NETBSD_CVSSNAP).tar.gz in path $(NETBSD_SRC_PATH)"
    4.23 +	@wget http://www.cl.cam.ac.uk/Research/SRG/netos/xen/downloads/netbsd-$(NETBSD_VER)-xen-kernel-$(NETBSD_CVSSNAP).tar.bz2 -O./netbsd-$(NETBSD_VER)-xen-kernel-$(NETBSD_CVSSNAP).tar.bz2
    4.24 +NETBSD_SRC := ./netbsd-$(NETBSD_VER)-xen-kernel-$(NETBSD_CVSSNAP).tar.bz2 
    4.25 +endif
    4.26 +
    4.27 +pristine-netbsd-tools-src: 
    4.28 +ifeq ($(NETBSD_TOOLS_SRC),)
    4.29 +	@echo "Cannot find netbsd-$(NETBSD_VER)-tools.tar.gz in path $(NETBSD_SRC_PATH)"
    4.30 +	@wget http://www.cl.cam.ac.uk/Research/SRG/netos/xen/downloads/netbsd-$(NETBSD_VER)-tools.tar.bz2 -O./netbsd-$(NETBSD_VER)-tools.tar.bz2
    4.31 +NETBSD_TOOLS_SRC := ./netbsd-$(NETBSD_VER)-tools.tar.bz2 
    4.32 +endif
    4.33 +
    4.34 +netbsd-tools: pristine-netbsd-tools-src
    4.35 +	@[ -d netbsd-$(NETBSD_RELEASE)-tools ] || { \
    4.36 +		echo extract $(NETBSD_TOOLS_SRC); \
    4.37 +		tar -jxf $(NETBSD_TOOLS_SRC); }
    4.38 +
    4.39 +mk-netbsd-trees: netbsd-tools pristine-netbsd-src 
    4.40 +	$(RM) -rf $(NETBSD_TREES)
    4.41 +	echo $(NETBSD_SRC) | grep -q bz2 && \
    4.42 +	    tar -jxf $(NETBSD_SRC) || tar -zxf $(NETBSD_SRC)
    4.43 +	mv netbsd-$(NETBSD_VER)-xen-kernel-$(NETBSD_CVSSNAP) \
    4.44 +	    netbsd-$(NETBSD_VER)-xenU
    4.45 +	( cd netbsd-$(NETBSD_VER)-xen-sparse ; \
    4.46 +          ./mkbuildtree ../netbsd-$(NETBSD_VER)-xenU )
    4.47 +
    4.48 +# build the specified netbsd tree
    4.49 +BDIR = $(subst netbsd-,netbsd-$(NETBSD_VER)-,$@)
    4.50 +netbsd-xen%:	
    4.51 +	$(MAKE) -C $(BDIR) config
    4.52 +	$(MAKE) -C $(BDIR) netbsd
    4.53 +	$(MAKE) -C $(BDIR) INSTALL_PATH=$(INSTALL_DIR) INSTALL_NAME=boot/netbsd-$(NETBSD_VER)-$(subst netbsd-,,$@) install
    4.54 +
    4.55  # build xen, the tools, and a domain 0 plus unprivileged linux-xen images,
    4.56  # and place them in the install directory. 'make install' should then
    4.57  # copy them to the normal system directories
    4.58 @@ -130,6 +178,10 @@ linux24:
    4.59  	$(MAKE) LINUX_RELEASE=2.4 config-xen0
    4.60  	$(MAKE) LINUX_RELEASE=2.4 linux-xen0
    4.61  
    4.62 +netbsd:
    4.63 +	$(MAKE) mk-netbsd-trees
    4.64 +	$(MAKE) netbsd-xenU
    4.65 +
    4.66  clean: delete-symlinks
    4.67  	$(MAKE) -C xen clean
    4.68  	$(MAKE) -C tools clean
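As a rough sh sketch of what the NETBSD_VER/NETBSD_SRC machinery above computes (the release, snapshot date and search-path defaults below are illustrative, not taken from any particular checkout):

    #!/bin/sh
    # Mirror the Makefile's $(shell ls ... | sed ...) version discovery and
    # its $(firstword $(foreach ...)) tarball search.
    NETBSD_RELEASE=${NETBSD_RELEASE:-2.0}
    NETBSD_CVSSNAP=${NETBSD_CVSSNAP:-20040906}
    NETBSD_SRC_PATH=${NETBSD_SRC_PATH:-.:..}

    # The version comes from the name of the sparse tree itself.
    NETBSD_VER=$(/bin/ls -ld netbsd-${NETBSD_RELEASE}*-xen-sparse 2>/dev/null |
        sed -e 's!^.*netbsd-\(.*\)-xen-sparse!\1!')

    # The first kernel snapshot tarball found along the ':'-separated path wins.
    NETBSD_SRC=
    for dir in $(echo "$NETBSD_SRC_PATH" | tr : ' '); do
        for f in "$dir"/netbsd-${NETBSD_VER}-xen-kernel-${NETBSD_CVSSNAP}.tar.*z*; do
            [ -f "$f" ] && { NETBSD_SRC=$f; break 2; }
        done
    done
    echo "NETBSD_VER=$NETBSD_VER NETBSD_SRC=${NETBSD_SRC:-<not found>}"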
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/netbsd-2.0-xen-sparse/Makefile	Mon Sep 06 19:11:17 2004 +0000
     5.3 @@ -0,0 +1,20 @@
     5.4 +#
     5.5 +#
     5.6 +#
     5.7 +
     5.8 +.PHONY: config netbsd
     5.9 +
    5.10 +TOPDIR		?= $(shell pwd)
    5.11 +NETBSD_RELEASE	?= $(patsubst netbsd-%-xen%,%,$(notdir $(TOPDIR)))
    5.12 +NETBSD_VER	?= $(patsubst netbsd-%-xen%,%,$(notdir $(TOPDIR)))
    5.13 +
    5.14 +config:
    5.15 +	@mkdir -p compile/XEN
    5.16 +	cd compile/XEN && TOPDIR=$(TOPDIR) NETBSD_VER=$(NETBSD_VER) ../../nbconfig-xen XEN
    5.17 +
    5.18 +netbsd:
    5.19 +	cd compile/XEN && TOPDIR=$(TOPDIR) NETBSD_VER=$(NETBSD_VER) ../../nbmake-xen dependall
    5.20 +
    5.21 +install:
    5.22 +	@mkdir -p $(dir $(INSTALL_PATH)/$(INSTALL_NAME))
    5.23 +	install -c compile/XEN/netbsd $(INSTALL_PATH)/$(INSTALL_NAME)
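The NETBSD_RELEASE/NETBSD_VER defaults above are evidently meant to peel the version out of the build tree's own directory name (e.g. netbsd-2.0-xenU). A hypothetical sh equivalent of that $(patsubst ...) intent:

    # Derive the version from the tree's directory name; the example name
    # netbsd-2.0-xenU is an assumption, matching what mk-netbsd-trees creates.
    TOPDIR=${TOPDIR:-$(pwd)}
    NETBSD_VER=$(basename "$TOPDIR" | sed -e 's/^netbsd-//' -e 's/-xen.*$//')
    echo "NETBSD_VER=$NETBSD_VER"   # -> 2.0 for .../netbsd-2.0-xenU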
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/netbsd-2.0-xen-sparse/mkbuildtree	Mon Sep 06 19:11:17 2004 +0000
     6.3 @@ -0,0 +1,114 @@
     6.4 +#!/bin/sh
     6.5 +
     6.6 +# mkbuildtree <build tree>
     6.7 +#
     6.8 +# Creates symbolic links in <build tree> for the sparse tree
     6.9 +# in the current directory.
    6.10 +
    6.11 +# Script to determine the relative path between two directories.
    6.12 +# Copyright (c) D. J. Hawkey Jr. 2002
    6.13 +# Fixed for Xen project by K. Fraser in 2003.  
    6.14 +abs_to_rel ()
    6.15 +{
    6.16 +	local CWD SRCPATH
    6.17 +                
    6.18 +	if [ "$1" != "/" -a "${1##*[^/]}" = "/" ]; then
    6.19 +		SRCPATH=${1%?}
    6.20 +	else
    6.21 +		SRCPATH=$1
    6.22 +	fi
    6.23 +	if [ "$2" != "/" -a "${2##*[^/]}" = "/" ]; then
    6.24 +		DESTPATH=${2%?}
    6.25 +	else
    6.26 +		DESTPATH=$2
    6.27 +	fi
    6.28 +
    6.29 +	CWD=$PWD
    6.30 +	[ "${1%%[^/]*}" != "/" ] && cd $1 && SRCPATH=$PWD
    6.31 +	[ "${2%%[^/]*}" != "/" ] && cd $2 && DESTPATH=$PWD
    6.32 +	[ "$CWD" != "$PWD" ] && cd $CWD
    6.33 +
    6.34 +	BASEPATH=$SRCPATH
    6.35 +
    6.36 +	[ "$SRCPATH" = "$DESTPATH" ] && DESTPATH="." && return
    6.37 +	[ "$SRCPATH" = "/" ] && DESTPATH=${DESTPATH#?} && return
    6.38 +
    6.39 +	while [ "$BASEPATH/" != "${DESTPATH%${DESTPATH#$BASEPATH/}}" ]; do
    6.40 +          BASEPATH=${BASEPATH%/*}
    6.41 +	done
    6.42 +
    6.43 +	SRCPATH=${SRCPATH#$BASEPATH}
    6.44 +        DESTPATH=${DESTPATH#$BASEPATH}
    6.45 +        DESTPATH=${DESTPATH#?}
    6.46 +	while [ -n "$SRCPATH" ]; do
    6.47 +		SRCPATH=${SRCPATH%/*}
    6.48 +		DESTPATH="../$DESTPATH"
    6.49 +	done
    6.50 +
    6.51 +	[ -z "$BASEPATH" ] && BASEPATH="/"
    6.52 +	[ "${DESTPATH##*[^/]}" = "/" ] && DESTPATH=${DESTPATH%?}
    6.53 +}
    6.54 +
    6.55 +# relative_lndir <target_dir>
    6.56 +# Creates a tree of symlinks in the current working directory that mirror
    6.57 +# real files in <target_dir>. <target_dir> should be relative to the current
    6.58 +# working directory. Symlinks in <target_dir> are ignored. Source-control files
    6.59 +# are ignored.
    6.60 +relative_lndir ()
    6.61 +{
    6.62 +  local SYMLINK_DIR REAL_DIR pref i j
    6.63 +  SYMLINK_DIR=$PWD
    6.64 +  REAL_DIR=$1
    6.65 +  (
    6.66 +  cd $REAL_DIR
    6.67 +  for i in `find . -type d | grep -v SCCS`; do
    6.68 +    [ -d $SYMLINK_DIR/$i ] || mkdir -p $SYMLINK_DIR/$i
    6.69 +    (
    6.70 +    cd $i
    6.71 +    pref=`echo $i | sed -e 's#/[^/]*#../#g' -e 's#^\.##'`
    6.72 +    for j in `find . -type f -o -type l -maxdepth 1`; do
    6.73 +      ln -sf ${pref}${REAL_DIR}/$i/$j ${SYMLINK_DIR}/$i/$j
    6.74 +    done
    6.75 +    )
    6.76 +  done
    6.77 +  )
    6.78 +}
    6.79 +
    6.80 +[ "$1" == "" ] && { echo "Syntax: $0 <linux tree to xenify>"; exit 1; }
    6.81 +
    6.82 +# Get absolute path to the destination directory
    6.83 +pushd . >/dev/null
    6.84 +cd ${1}
    6.85 +AD=$PWD
    6.86 +popd >/dev/null
    6.87 +  
    6.88 +# Get absolute path to the source directory
    6.89 +AS=`pwd`
    6.90 +
    6.91 +# Get path to source, relative to destination
    6.92 +abs_to_rel ${AD} ${AS}
    6.93 +RS=$DESTPATH
    6.94 +
    6.95 +# Remove old copies of files and directories at the destination
    6.96 +for i in `find . -type f -o -type l` ; do rm -f ${AD}/${i#./} ; done
    6.97 +
    6.98 +# We now work from the destination directory
    6.99 +cd ${AD}
   6.100 +
   6.101 +# Remove old symlinks
   6.102 +for i in `find . -type l`; do rm -f $i; done
   6.103 +
   6.104 +# Create symlinks of files and directories which exist in the sparse source
   6.105 +relative_lndir ${RS}
   6.106 +rm -f mkbuildtree
   6.107 +
   6.108 +
   6.109 +# Create links to the shared definitions of the hypervisor interface
   6.110 +rm -rf ${AD}/sys/arch/xen/include/hypervisor-ifs
   6.111 +mkdir  ${AD}/sys/arch/xen/include/hypervisor-ifs
   6.112 +cd     ${AD}/sys/arch/xen/include/hypervisor-ifs
   6.113 +relative_lndir ../../../../../${RS}/../xen/include/hypervisor-ifs
   6.114 +
   6.115 +# Remove files which don't exist anymore
   6.116 +rm -rf ${AD}/sys/arch/xen/xen/events.c
   6.117 +rm -rf ${AD}/sys/arch/xen/include/events.h
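To make the two helpers above concrete, a small hypothetical session (the /tmp paths are throwaway, and abs_to_rel/relative_lndir are assumed to be sourced from this script):

    # abs_to_rel leaves the walk from its first argument to its second in
    # $DESTPATH: one '..' per source component remaining past the common prefix.
    abs_to_rel /a/b/c /a/d          # DESTPATH=../../d
    # relative_lndir then mirrors a sparse tree as relative symlinks:
    mkdir -p /tmp/demo/sparse/sys /tmp/demo/build
    touch /tmp/demo/sparse/sys/file.c
    cd /tmp/demo/build
    relative_lndir ../sparse        # sys/file.c -> ../../sparse/./sys/./file.c
    ls -l sys/file.c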
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/netbsd-2.0-xen-sparse/nbconfig-xen	Mon Sep 06 19:11:17 2004 +0000
     7.3 @@ -0,0 +1,20 @@
     7.4 +#! /bin/sh
     7.5 +#
     7.6 +
     7.7 +: ${HOS:=$(uname -s | tr /A-Z/ /a-z/)}
     7.8 +: ${HARCH:=$(uname -i)}
     7.9 +: ${NETBSD_RELEASE:=$(basename $(cd $(dirname $0) && pwd) | sed 's/netbsd-\([0-9]\+\.[0-9]\+\).*/\1/')}
    7.10 +: ${NETBSD_VERSION:=$(basename $(cd $(dirname $0) && pwd) | sed 's/netbsd-\([0-9]\+\.[0-9]\+.*\)-xen.*/\1/')}
    7.11 +
    7.12 +TOOLDIR="$TOPDIR/../netbsd-${NETBSD_RELEASE}-tools/$HOS-$HARCH"; export TOOLDIR
    7.13 +
    7.14 +CONF="$1"
    7.15 +case "$1" in
    7.16 +  /*)
    7.17 +    CONF="$1"
    7.18 +    ;;
    7.19 +  *)
    7.20 +    CONF="$TOPDIR"/sys/arch/xen/conf/"$1"
    7.21 +    ;;
    7.22 +esac
    7.23 +exec "${TOOLDIR}/bin/nbconfig" -b $(pwd) -s "$TOPDIR"/sys "$CONF"
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/netbsd-2.0-xen-sparse/nbmake-xen	Mon Sep 06 19:11:17 2004 +0000
     8.3 @@ -0,0 +1,26 @@
     8.4 +#! /bin/sh
     8.5 +# Set proper variables to allow easy "make" building of a NetBSD subtree.
     8.6 +# Generated from:  $NetBSD: build.sh,v 1.126 2004/02/04 11:23:40 lukem Exp $
     8.7 +#
     8.8 +
     8.9 +: ${HOS:=$(uname -s | tr /A-Z/ /a-z/)}
    8.10 +: ${HARCH:=$(uname -i)}
    8.11 +: ${NETBSD_RELEASE:=$(basename $(cd $(dirname $0) && pwd) | sed 's/netbsd-\([0-9]\+\.[0-9]\+\).*/\1/')}
    8.12 +: ${NETBSD_VERSION:=$(basename $(cd $(dirname $0) && pwd) | sed 's/netbsd-\([0-9]\+\.[0-9]\+.*\)-xen.*/\1/')}
    8.13 +
    8.14 +NETBSDSRCDIR="$TOPDIR"; export NETBSDSRCDIR
    8.15 +DESTDIR="$TOPDIR/root"; export DESTDIR
    8.16 +unset MAKEOBJDIRPREFIX
    8.17 +MAKEOBJDIR='${.CURDIR:C,^'"$TOPDIR,$TOPDIR/obj,}"; export MAKEOBJDIR
    8.18 +RELEASEDIR="$TOPDIR/release"; export RELEASEDIR
    8.19 +MKUNPRIVED='yes'; export MKUNPRIVED
    8.20 +MAKEVERBOSE='1'; export MAKEVERBOSE
    8.21 +LC_ALL='C'; export LC_ALL
    8.22 +TOOLDIR="$TOPDIR/../netbsd-${NETBSD_RELEASE}-tools/$HOS-$HARCH"; export TOOLDIR
    8.23 +MACHINE='i386'; export MACHINE
    8.24 +MACHINE_ARCH='i386'; export MACHINE_ARCH
    8.25 +MAKEFLAGS="-de -m $TOPDIR/share/mk MKOBJDIRS=yes"; export MAKEFLAGS
    8.26 +BUILDID="${NETBSD_RELEASE}"; export BUILDID
    8.27 +USETOOLS=yes; export USETOOLS
    8.28 +
    8.29 +exec "${TOOLDIR}/bin/nbmake" ${1+"$@"}
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN	Mon Sep 06 19:11:17 2004 +0000
     9.3 @@ -0,0 +1,176 @@
     9.4 +# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $
     9.5 +
     9.6 +include 	"arch/xen/conf/std.xen"
     9.7 +
     9.8 +options 	INCLUDE_CONFIG_FILE	# embed config file in kernel binary
     9.9 +
    9.10 +#options		UVMHIST
    9.11 +#options		UVMHIST_PRINT
    9.12 +#options		SYSCALL_DEBUG
    9.13 +
    9.14 +maxusers	32		# estimated number of users
    9.15 +
    9.16 +#
    9.17 +options		XEN
    9.18 +#options		DOM0OPS
    9.19 +options		HZ=50
    9.20 +
    9.21 +#options 	I586_CPU
    9.22 +options 	I686_CPU
    9.23 +
    9.24 +#options 	VM86		# virtual 8086 emulation
    9.25 +#options 	USER_LDT	# user-settable LDT; used by WINE
    9.26 +
    9.27 +#options 	MTRR		# memory-type range register syscall support
    9.28 +
    9.29 +#options 	CONSDEVNAME="\"xencons\""
    9.30 +#options 	CONS_OVERRIDE
    9.31 +
    9.32 +options		INSECURE	# disable kernel security levels - X needs this
    9.33 +
    9.34 +options 	RTC_OFFSET=0	# hardware clock is this many mins. west of GMT
    9.35 +#options 	NTP		# NTP phase/frequency locked loop
    9.36 +
    9.37 +options 	KTRACE		# system call tracing via ktrace(1)
    9.38 +#options 	SYSTRACE	# system call vetting via systrace(1)
    9.39 +
    9.40 +options 	SYSVMSG		# System V-like message queues
    9.41 +options 	SYSVSEM		# System V-like semaphores
    9.42 +#options 	SEMMNI=10	# number of semaphore identifiers
    9.43 +#options 	SEMMNS=60	# number of semaphores in system
    9.44 +#options 	SEMUME=10	# max number of undo entries per process
    9.45 +#options 	SEMMNU=30	# number of undo structures in system
    9.46 +options 	SYSVSHM		# System V-like memory sharing
    9.47 +#options 	SHMMAXPGS=2048	# 2048 pages is the default
    9.48 +options 	P1003_1B_SEMAPHORE	# p1003.1b semaphore support
    9.49 +
    9.50 +options 	LKM		# loadable kernel modules
    9.51 +
    9.52 +options 	USERCONF	# userconf(4) support
    9.53 +options 	SYSCTL_INCLUDE_DESCR	# Include sysctl descriptions in kernel
    9.54 +
    9.55 +# Diagnostic/debugging support options
    9.56 +options 	DIAGNOSTIC	# expensive kernel consistency checks
    9.57 +options 	DEBUG		# expensive debugging checks/support 
    9.58 +options 	KMEMSTATS	# kernel memory statistics (vmstat -m)
    9.59 +options 	DDB		# in-kernel debugger
    9.60 +options		DDB_ONPANIC=1	# see also sysctl(8): `ddb.onpanic'
    9.61 +options 	DDB_HISTORY_SIZE=512	# enable history editing in DDB
    9.62 +#options 	KGDB		# remote debugger
    9.63 +#options 	KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600
    9.64 +makeoptions	DEBUG="-g"	# compile full symbol table
    9.65 +
    9.66 +#options 	COMPAT_14	# NetBSD 1.4
    9.67 +#options 	COMPAT_15	# NetBSD 1.5
    9.68 +options 	COMPAT_16	# NetBSD 1.6
    9.69 +
    9.70 +##options 	COMPAT_LINUX	# binary compatibility with Linux
    9.71 +#options 	COMPAT_FREEBSD	# binary compatibility with FreeBSD
    9.72 +#options 	COMPAT_MACH	# binary compatibility with Mach binaries
    9.73 +#options	COMPAT_DARWIN	# binary compatibility with Darwin binaries
    9.74 +#options 	EXEC_MACHO	# exec MACH-O binaries
    9.75 +#options 	COMPAT_PECOFF	# kernel support to run Win32 apps
    9.76 +
    9.77 +file-system 	FFS		# UFS
    9.78 +file-system 	EXT2FS		# second extended file system (linux)
    9.79 +#file-system 	LFS		# log-structured file system
    9.80 +#file-system 	MFS		# memory file system
    9.81 +file-system 	NFS		# Network File System client
    9.82 +#file-system 	NTFS		# Windows/NT file system (experimental)
    9.83 +#file-system 	CD9660		# ISO 9660 + Rock Ridge file system
    9.84 +#file-system 	MSDOSFS		# MS-DOS file system
    9.85 +file-system 	FDESC		# /dev/fd
    9.86 +file-system 	KERNFS		# /kern
    9.87 +file-system 	NULLFS		# loopback file system
    9.88 +#file-system 	OVERLAY		# overlay file system
    9.89 +#file-system 	PORTAL		# portal filesystem (still experimental)
    9.90 +file-system 	PROCFS		# /proc
    9.91 +#file-system 	UMAPFS		# NULLFS + uid and gid remapping
    9.92 +#file-system 	UNION		# union file system
    9.93 +#file-system	SMBFS		# experimental - CIFS; also needs nsmb (below)
    9.94 +
    9.95 +#options 	QUOTA		# UFS quotas
    9.96 +#options 	SOFTDEP		# FFS soft updates support.
    9.97 +#options 	NFSSERVER	# Network File System server
    9.98 +
    9.99 +options 	GATEWAY		# packet forwarding
   9.100 +options 	INET		# IP + ICMP + TCP + UDP
   9.101 +options 	INET6		# IPV6
   9.102 +options 	IPSEC		# IP security
   9.103 +options 	IPSEC_ESP	# IP security (encryption part; define w/IPSEC)
   9.104 +options 	MROUTING	# IP multicast routing
   9.105 +options 	PFIL_HOOKS	# pfil(9) packet filter hooks
   9.106 +options 	IPFILTER_LOG	# ipmon(8) log support
   9.107 +
   9.108 +options 	NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC
   9.109 +#options 	NFS_BOOTSTATIC_MYIP="\"169.254.1.2\""
   9.110 +#options 	NFS_BOOTSTATIC_GWIP="\"169.254.1.1\""
   9.111 +#options 	NFS_BOOTSTATIC_MASK="\"255.255.255.0\""
   9.112 +#options 	NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\""
   9.113 +#options 	NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\""
   9.114 +
   9.115 +options 	WSEMUL_VT100		# VT100 / VT220 emulation
   9.116 +options 	WS_KERNEL_FG=WSCOL_GREEN
   9.117 +options 	WSDISPLAY_COMPAT_PCVT		# emulate some ioctls
   9.118 +options 	WSDISPLAY_COMPAT_SYSCONS	# emulate some ioctls
   9.119 +options 	WSDISPLAY_COMPAT_USL		# VT handling
   9.120 +options 	WSDISPLAY_COMPAT_RAWKBD		# can get raw scancodes
   9.121 +options 	WSDISPLAY_DEFAULTSCREENS=4
   9.122 +options 	PCDISPLAY_SOFTCURSOR
   9.123 +
   9.124 +config		netbsd	root on ? type ?
   9.125 +#config		netbsd	root on wd0a type ffs
   9.126 +#config		netbsd	root on xennet0 type nfs
   9.127 +
   9.128 +mainbus0 at root
   9.129 +
   9.130 +cpu* at mainbus?
   9.131 +
   9.132 +hypervisor*	at mainbus?		# Xen hypervisor
   9.133 +
   9.134 +npx0		at hypervisor?		# x86 math coprocessor
   9.135 +
   9.136 +xencons*	at hypervisor?		# Xen virtual console
   9.137 +xennet* 	at hypervisor?		# Xen virtual network interface
   9.138 +
   9.139 +#xbd*		at hypervisor?		# Xen virtual block device
   9.140 +#wd*		at hypervisor?		# Xen vbd (wd identity)
   9.141 +#sd*		at hypervisor?		# Xen vbd (sd identity)
   9.142 +#cd*		at hypervisor?		# Xen vbd (cd identity)
   9.143 +
   9.144 +#xenkbc* 	at hypervisor?		# Xen Keyboard/Mouse Interface
   9.145 +#pckbd*		at xenkbc?		# Keyboard
   9.146 +#vga*		at hypervisor?		# Xen VGA display
   9.147 +#pms*		at xenkbc?		# PS/2 Mouse for wsmouse
   9.148 +
   9.149 +#wskbd*		at pckbd? console ?
   9.150 +#wsdisplay*	at vga? console ?
   9.151 +#wsmouse*	at pms? mux 0
   9.152 +
   9.153 +
   9.154 +include	"arch/xen/conf/GENERIC.local"
   9.155 +
   9.156 +
   9.157 +pseudo-device	ccd		4	# concatenated/striped disk devices
   9.158 +#pseudo-device	cgd		4	# cryptographic disk devices
   9.159 +#pseudo-device	md		1	# memory disk device (ramdisk)
   9.160 +#pseudo-device	vnd		4	# disk-like interface to files
   9.161 +
   9.162 +pseudo-device	bpfilter	8	# Berkeley packet filter
   9.163 +pseudo-device	ipfilter		# IP filter (firewall) and NAT
   9.164 +pseudo-device	loop			# network loopback
   9.165 +#pseudo-device	tun		2	# network tunneling over tty
   9.166 +#pseudo-device	gre		2	# generic L3 over IP tunnel
   9.167 +#pseudo-device	gif		4	# IPv[46] over IPv[46] tunnel (RFC1933)
   9.168 +#pseudo-device	faith		1	# IPv[46] tcp relay translation i/f
   9.169 +#pseudo-device	stf		1	# 6to4 IPv6 over IPv4 encapsulation
   9.170 +#pseudo-device	vlan			# IEEE 802.1q encapsulation
   9.171 +#pseudo-device	bridge			# simple inter-network bridging
   9.172 +
   9.173 +pseudo-device	pty			# pseudo-terminals
   9.174 +pseudo-device	rnd			# /dev/random and in-kernel generator
   9.175 +pseudo-device	clockctl		# user control of clock subsystem
   9.176 +
   9.177 +pseudo-device	wsmux			# mouse & keyboard multiplexor
   9.178 +pseudo-device	wsfont
   9.179 +pseudo-device	ksyms			# /dev/ksyms
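With this config in place, the kernel is normally driven from the top-level Makefile targets added earlier in this changeset; a hypothetical end-to-end run from the xen-unstable top level:

    make netbsd        # mk-netbsd-trees (unpack + mkbuildtree), then netbsd-xenU
    make netbsd-xenU   # or directly: config, build and install the XEN kernel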
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen	Mon Sep 06 19:11:17 2004 +0000
    10.3 @@ -0,0 +1,232 @@
    10.4 +#	$NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $
    10.5 +#	NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp 
    10.6 +#	NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp 
    10.7 +
    10.8 +maxpartitions 8
    10.9 +
   10.10 +maxusers 2 16 128
   10.11 +
   10.12 +# Processor type options.
   10.13 +defflag	opt_cputype.h	I686_CPU
   10.14 +
   10.15 +# delay before cpu_reset() for reboot.
   10.16 +defparam		CPURESET_DELAY
   10.17 +
   10.18 +# No unmapped page below kernel stack
   10.19 +defflag			NOREDZONE
   10.20 +
   10.21 +# Beep on halt
   10.22 +defflag opt_beep.h		BEEP_ONHALT
   10.23 +defparam opt_beep.h		BEEP_ONHALT_COUNT
   10.24 +defparam opt_beep.h		BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD
   10.25 +
   10.26 +file	arch/xen/i386/autoconf.c
   10.27 +file	arch/i386/i386/db_dbgreg.S	ddb | kstack_check_dr0
   10.28 +file	arch/i386/i386/db_disasm.c	ddb
   10.29 +file	arch/i386/i386/db_interface.c	ddb
   10.30 +file	arch/i386/i386/db_memrw.c	ddb | kgdb
   10.31 +file	arch/i386/i386/db_trace.c	ddb
   10.32 +file	kern/subr_disk_mbr.c		disk
   10.33 +file	arch/xen/i386/gdt.c
   10.34 +file	arch/xen/i386/hypervisor_machdep.c
   10.35 +file	arch/i386/i386/in_cksum.S	inet | inet6
   10.36 +file	arch/i386/i386/ipkdb_glue.c	ipkdb
   10.37 +file	arch/i386/i386/kgdb_machdep.c	kgdb
   10.38 +file	arch/xen/i386/machdep.c
   10.39 +file	arch/xen/i386/identcpu.c
   10.40 +file	arch/i386/i386/math_emulate.c	math_emulate
   10.41 +file	arch/i386/i386/mem.c
   10.42 +file	kern/kern_microtime.c		i586_cpu | i686_cpu
   10.43 +file	arch/i386/i386/mtrr_k6.c	mtrr
   10.44 +file	netns/ns_cksum.c		ns
   10.45 +file	arch/xen/i386/pmap.c
   10.46 +file	arch/i386/i386/process_machdep.c
   10.47 +file	arch/i386/i386/procfs_machdep.c	procfs
   10.48 +file	arch/xen/i386/sys_machdep.c
   10.49 +file	arch/i386/i386/syscall.c
   10.50 +file	arch/xen/i386/trap.c
   10.51 +file	arch/i386/i386/vm_machdep.c
   10.52 +file	arch/xen/i386/xen_machdep.c
   10.53 +
   10.54 +file	arch/xen/xen/xen_debug.c
   10.55 +
   10.56 +file	arch/xen/xen/clock.c
   10.57 +file	arch/xen/xen/evtchn.c
   10.58 +file	arch/xen/xen/ctrl_if.c
   10.59 +
   10.60 +file	dev/cons.c
   10.61 +
   10.62 +file	arch/i386/i386/mptramp.S		multiprocessor
   10.63 +file    arch/i386/i386/ipifuncs.c	multiprocessor
   10.64 +
   10.65 +file	arch/i386/i386/pmc.c		perfctrs
   10.66 +
   10.67 +file	crypto/des/arch/i386/des_enc.S		des
   10.68 +file	crypto/des/arch/i386/des_cbc.S		des
   10.69 +
   10.70 +file	crypto/blowfish/arch/i386/bf_enc.S	blowfish
   10.71 +file	crypto/blowfish/arch/i386/bf_cbc.S	blowfish & !i386_cpu
   10.72 +
   10.73 +#
   10.74 +# Machine-independent SCSI drivers
   10.75 +#
   10.76 +
   10.77 +#xxx include	"dev/scsipi/files.scsipi"
   10.78 +
   10.79 +#
   10.80 +# Machine-independent ATA drivers
   10.81 +#
   10.82 +
   10.83 +#xxx include	"dev/ata/files.ata"
   10.84 +
   10.85 +# Memory Disk for install floppy
   10.86 +file	dev/md_root.c			memory_disk_hooks
   10.87 +
   10.88 +#
   10.89 +define  mainbus { [apid = -1] }
   10.90 +
   10.91 +file	arch/x86/x86/bus_dma.c
   10.92 +file	arch/xen/x86/bus_space.c
   10.93 +file	arch/x86/x86/cacheinfo.c
   10.94 +file	arch/xen/x86/consinit.c
   10.95 +file	arch/xen/x86/intr.c
   10.96 +file	arch/x86/x86/ipi.c		multiprocessor
   10.97 +file	arch/x86/x86/lock_machdep.c	lockdebug
   10.98 +file	arch/x86/x86/softintr.c
   10.99 +
  10.100 +include	"arch/xen/conf/files.compat"
  10.101 +
  10.102 +#
  10.103 +# System bus types
  10.104 +#
  10.105 +
  10.106 +device	mainbus: mainbus
  10.107 +attach	mainbus at root
  10.108 +file	arch/xen/i386/mainbus.c		mainbus
  10.109 +
  10.110 +# Xen hypervisor
  10.111 +device	hypervisor { }
  10.112 +attach	hypervisor at mainbus
  10.113 +file	arch/xen/xen/hypervisor.c	hypervisor needs-flag
  10.114 +
  10.115 +# Numeric Processing Extension; Math Co-processor
  10.116 +device	npx
  10.117 +file	arch/xen/i386/npx.c		npx needs-flag
  10.118 +
  10.119 +attach	npx at hypervisor with npx_hv
  10.120 +file	arch/xen/i386/npx_hv.c		npx_hv
  10.121 +
  10.122 +# Xen console support
  10.123 +device	xencons: tty
  10.124 +attach	xencons at hypervisor
  10.125 +file	arch/xen/xen/xencons.c		xencons needs-flag
  10.126 +
  10.127 +include	"dev/wscons/files.wscons"
  10.128 +include	"dev/wsfont/files.wsfont"
  10.129 +
  10.130 +include	"dev/pckbport/files.pckbport"
  10.131 +
  10.132 +# CPUS
  10.133 +
  10.134 +define cpu { [apid = -1] }
  10.135 +device cpu
  10.136 +attach cpu at mainbus
  10.137 +file	arch/xen/i386/cpu.c		cpu
  10.138 +
  10.139 +#
  10.140 +# Compatibility modules
  10.141 +#
  10.142 +
  10.143 +# VM86 mode
  10.144 +file	arch/i386/i386/vm86.c			vm86
  10.145 +
  10.146 +# VM86 in kernel
  10.147 +file	arch/i386/i386/kvm86.c			kvm86
  10.148 +file	arch/i386/i386/kvm86call.S		kvm86
  10.149 +
  10.150 +# Binary compatibility with previous NetBSD releases (COMPAT_XX)
  10.151 +file	arch/i386/i386/compat_13_machdep.c	compat_13 | compat_aout
  10.152 +file	arch/i386/i386/compat_16_machdep.c	compat_16 | compat_ibcs2
  10.153 +
  10.154 +# SVR4 binary compatibility (COMPAT_SVR4)
  10.155 +include	"compat/svr4/files.svr4"
  10.156 +file	arch/i386/i386/svr4_machdep.c		compat_svr4
  10.157 +file	arch/i386/i386/svr4_sigcode.S		compat_svr4
  10.158 +file	arch/i386/i386/svr4_syscall.c		compat_svr4
  10.159 +
  10.160 +# MACH binary compatibility (COMPAT_MACH)
  10.161 +include	"compat/mach/files.mach"
  10.162 +file	arch/i386/i386/mach_machdep.c		compat_mach | compat_darwin
  10.163 +file	arch/i386/i386/mach_sigcode.S		compat_mach | compat_darwin
  10.164 +file	arch/i386/i386/mach_syscall.c		compat_mach | compat_darwin
  10.165 +file	arch/i386/i386/macho_machdep.c		exec_macho
  10.166 +
  10.167 +# DARWIN binary compatibility (COMPAT_DARWIN)
  10.168 +include	"compat/darwin/files.darwin"
  10.169 +file	arch/i386/i386/darwin_machdep.c		compat_darwin
  10.170 +
  10.171 +# iBCS-2 binary compatibility (COMPAT_IBCS2)
  10.172 +include	"compat/ibcs2/files.ibcs2"
  10.173 +file	arch/i386/i386/ibcs2_machdep.c		compat_ibcs2
  10.174 +file	arch/i386/i386/ibcs2_sigcode.S		compat_ibcs2
  10.175 +file	arch/i386/i386/ibcs2_syscall.c		compat_ibcs2
  10.176 +
  10.177 +# Linux binary compatibility (COMPAT_LINUX)
  10.178 +include	"compat/linux/files.linux"
  10.179 +include	"compat/linux/arch/i386/files.linux_i386"
  10.180 +file	arch/i386/i386/linux_sigcode.S		compat_linux
  10.181 +file	arch/i386/i386/linux_syscall.c		compat_linux
  10.182 +file	arch/i386/i386/linux_trap.c		compat_linux
  10.183 +
  10.184 +# FreeBSD binary compatibility (COMPAT_FREEBSD)
  10.185 +include	"compat/freebsd/files.freebsd"
  10.186 +file	arch/i386/i386/freebsd_machdep.c	compat_freebsd
  10.187 +file	arch/i386/i386/freebsd_sigcode.S	compat_freebsd
  10.188 +file	arch/i386/i386/freebsd_syscall.c	compat_freebsd
  10.189 +
  10.190 +# a.out binary compatibility (COMPAT_AOUT)
  10.191 +include	"compat/aout/files.aout"
  10.192 +
  10.193 +# Win32 binary compatibility (COMPAT_PECOFF)
  10.194 +include	"compat/pecoff/files.pecoff"
  10.195 +
  10.196 +# OSS audio driver compatibility
  10.197 +include	"compat/ossaudio/files.ossaudio"
  10.198 +
  10.199 +# Xen devices
  10.200 +
  10.201 +# Network driver
  10.202 +device	xennet: arp, ether, ifnet
  10.203 +attach	xennet at hypervisor
  10.204 +file	arch/xen/xen/if_xennet.c	xennet needs-flag
  10.205 +
  10.206 +# Block device driver and wd/sd/cd identities
  10.207 +device	xbd: disk
  10.208 +attach	xbd at hypervisor
  10.209 +file	arch/xen/xen/xbd.c		xbd | wd | sd | cd needs-flag
  10.210 +
  10.211 +device	wd: disk
  10.212 +attach	wd at hypervisor
  10.213 +
  10.214 +device	sd: disk
  10.215 +attach	sd at hypervisor
  10.216 +
  10.217 +device	cd: disk
  10.218 +attach	cd at hypervisor
  10.219 +
  10.220 +# Keyboard
  10.221 +device	xenkbc: pckbport
  10.222 +attach	xenkbc at hypervisor
  10.223 +file	arch/xen/xen/xenkbc.c		xenkbc		needs-flag
  10.224 +
  10.225 +# Generic VGA
  10.226 +attach	vga at hypervisor with vga_xen
  10.227 +file	arch/xen/xen/vga_xen.c		vga_xen		needs-flag
  10.228 +
  10.229 +# Domain-0 operations
  10.230 +defflag	opt_xen.h			DOM0OPS
  10.231 +file	arch/xen/xen/machmem.c		dom0ops
  10.232 +file	arch/xen/xen/privcmd.c		dom0ops
  10.233 +file	arch/xen/xen/vfr.c		dom0ops
  10.234 +
  10.235 +include "arch/xen/conf/majors.i386"
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c	Mon Sep 06 19:11:17 2004 +0000
    11.3 @@ -0,0 +1,630 @@
    11.4 +/*	$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $	*/
    11.5 +/*	NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp 	*/
    11.6 +
    11.7 +/*-
    11.8 + * Copyright (c) 1990 The Regents of the University of California.
    11.9 + * All rights reserved.
   11.10 + *
   11.11 + * This code is derived from software contributed to Berkeley by
   11.12 + * William Jolitz.
   11.13 + *
   11.14 + * Redistribution and use in source and binary forms, with or without
   11.15 + * modification, are permitted provided that the following conditions
   11.16 + * are met:
   11.17 + * 1. Redistributions of source code must retain the above copyright
   11.18 + *    notice, this list of conditions and the following disclaimer.
   11.19 + * 2. Redistributions in binary form must reproduce the above copyright
   11.20 + *    notice, this list of conditions and the following disclaimer in the
   11.21 + *    documentation and/or other materials provided with the distribution.
   11.22 + * 3. Neither the name of the University nor the names of its contributors
   11.23 + *    may be used to endorse or promote products derived from this software
   11.24 + *    without specific prior written permission.
   11.25 + *
   11.26 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   11.27 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   11.28 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   11.29 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   11.30 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   11.31 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   11.32 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   11.33 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   11.34 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   11.35 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   11.36 + * SUCH DAMAGE.
   11.37 + *
   11.38 + *	@(#)autoconf.c	7.1 (Berkeley) 5/9/91
   11.39 + */
   11.40 +
   11.41 +/*
   11.42 + * Setup the system to run on the current machine.
   11.43 + *
   11.44 + * Configure() is called at boot time and initializes the vba
   11.45 + * device tables and the memory controller monitoring.  Available
   11.46 + * devices are determined (from possibilities mentioned in ioconf.c),
   11.47 + * and the drivers are initialized.
   11.48 + */
   11.49 +
   11.50 +#include <sys/cdefs.h>
   11.51 +__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $");
   11.52 +
   11.53 +#include "opt_compat_oldboot.h"
   11.54 +#include "opt_multiprocessor.h"
   11.55 +#include "opt_nfs_boot.h"
   11.56 +#include "xennet.h"
   11.57 +
   11.58 +#include <sys/param.h>
   11.59 +#include <sys/systm.h>
   11.60 +#include <sys/buf.h>
   11.61 +#include <sys/disklabel.h>
   11.62 +#include <sys/conf.h>
   11.63 +#ifdef COMPAT_OLDBOOT
   11.64 +#include <sys/reboot.h>
   11.65 +#endif
   11.66 +#include <sys/device.h>
   11.67 +#include <sys/malloc.h>
   11.68 +#include <sys/vnode.h>
   11.69 +#include <sys/fcntl.h>
   11.70 +#include <sys/dkio.h>
   11.71 +#include <sys/proc.h>
   11.72 +#include <sys/user.h>
   11.73 +
   11.74 +#ifdef NFS_BOOT_BOOTSTATIC
   11.75 +#include <net/if.h>
   11.76 +#include <net/if_ether.h>
   11.77 +#include <netinet/in.h>
   11.78 +#include <nfs/rpcv2.h>
   11.79 +#include <nfs/nfsproto.h>
   11.80 +#include <nfs/nfs.h>
   11.81 +#include <nfs/nfsmount.h>
   11.82 +#include <nfs/nfsdiskless.h>
   11.83 +#include <machine/if_xennetvar.h>
   11.84 +#endif
   11.85 +
   11.86 +#include <machine/pte.h>
   11.87 +#include <machine/cpu.h>
   11.88 +#include <machine/gdt.h>
   11.89 +#include <machine/pcb.h>
   11.90 +#include <machine/bootinfo.h>
   11.91 +
   11.92 +#include "ioapic.h"
   11.93 +#include "lapic.h"
   11.94 +
   11.95 +#if NIOAPIC > 0
   11.96 +#include <machine/i82093var.h>
   11.97 +#endif
   11.98 +
   11.99 +#if NLAPIC > 0
  11.100 +#include <machine/i82489var.h>
  11.101 +#endif
  11.102 +
  11.103 +static int match_harddisk(struct device *, struct btinfo_bootdisk *);
  11.104 +static void matchbiosdisks(void);
  11.105 +static void findroot(void);
  11.106 +static int is_valid_disk(struct device *);
  11.107 +
  11.108 +extern struct disklist *i386_alldisks;
  11.109 +extern int i386_ndisks;
  11.110 +
  11.111 +#include "bios32.h"
  11.112 +#if NBIOS32 > 0
  11.113 +#include <machine/bios32.h>
  11.114 +#endif
  11.115 +
  11.116 +#include "opt_pcibios.h"
  11.117 +#ifdef PCIBIOS
  11.118 +#include <dev/pci/pcireg.h>
  11.119 +#include <dev/pci/pcivar.h>
  11.120 +#include <i386/pci/pcibios.h>
  11.121 +#endif
  11.122 +
  11.123 +#include "opt_kvm86.h"
  11.124 +#ifdef KVM86
  11.125 +#include <machine/kvm86.h>
  11.126 +#endif
  11.127 +
  11.128 +#include "opt_xen.h"
  11.129 +
  11.130 +struct device *booted_device;
  11.131 +int booted_partition;
  11.132 +
  11.133 +/*
  11.134 + * Determine i/o configuration for a machine.
  11.135 + */
  11.136 +void
  11.137 +cpu_configure(void)
  11.138 +{
  11.139 +
  11.140 +	startrtclock();
  11.141 +
  11.142 +#if NBIOS32 > 0
  11.143 +	bios32_init();
  11.144 +#endif
  11.145 +#ifdef PCIBIOS
  11.146 +	pcibios_init();
  11.147 +#endif
  11.148 +
  11.149 +	/* kvm86 needs a TSS */
  11.150 +	i386_proc0_tss_ldt_init();
  11.151 +#ifdef KVM86
  11.152 +	kvm86_init();
  11.153 +#endif
  11.154 +
  11.155 +	if (config_rootfound("mainbus", NULL) == NULL)
  11.156 +		panic("configure: mainbus not configured");
  11.157 +
  11.158 +#ifdef INTRDEBUG
  11.159 +	intr_printconfig();
  11.160 +#endif
  11.161 +
  11.162 +#if NIOAPIC > 0
  11.163 +	lapic_set_lvt();
  11.164 +	ioapic_enable();
  11.165 +#endif
  11.166 +	/* resync cr0 after FPU configuration */
  11.167 +	lwp0.l_addr->u_pcb.pcb_cr0 = rcr0();
  11.168 +#ifdef MULTIPROCESSOR
  11.169 +	/* propagate this to the idle pcb's. */
  11.170 +	cpu_init_idle_pcbs();
  11.171 +#endif
  11.172 +
  11.173 +	spl0();
  11.174 +#if NLAPIC > 0
  11.175 +	lapic_tpr = 0;
  11.176 +#endif
  11.177 +}
  11.178 +
  11.179 +void
  11.180 +cpu_rootconf(void)
  11.181 +{
  11.182 +	findroot();
  11.183 +	matchbiosdisks();
  11.184 +
  11.185 +	printf("boot device: %s\n",
  11.186 +	    booted_device ? booted_device->dv_xname : "<unknown>");
  11.187 +
  11.188 +	setroot(booted_device, booted_partition);
  11.189 +}
  11.190 +
  11.191 +/*
  11.192 + * XXX ugly bit of code. But, this is the only safe time that the
  11.193 + * match between BIOS disks and native disks can be done.
  11.194 + */
  11.195 +static void
  11.196 +matchbiosdisks(void)
  11.197 +{
  11.198 +	struct btinfo_biosgeom *big;
  11.199 +	struct bi_biosgeom_entry *be;
  11.200 +	struct device *dv;
  11.201 +	int i, ck, error, m, n;
  11.202 +	struct vnode *tv;
  11.203 +	char mbr[DEV_BSIZE];
  11.204 +	int  dklist_size;
  11.205 +	int bmajor;
  11.206 +
  11.207 +	big = lookup_bootinfo(BTINFO_BIOSGEOM);
  11.208 +
  11.209 +	if (big == NULL)
  11.210 +		return;
  11.211 +
  11.212 +	/*
  11.213 +	 * First, count all native disks
  11.214 +	 */
  11.215 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next)
  11.216 +		if (is_valid_disk(dv))
  11.217 +			i386_ndisks++;
  11.218 +
  11.219 +	if (i386_ndisks == 0)
  11.220 +		return;
  11.221 +
  11.222 +	dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) *
  11.223 +	    sizeof (struct nativedisk_info);
  11.224 +
  11.225 +	/* XXX M_TEMP is wrong */
  11.226 +	i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT);
  11.227 +	if (i386_alldisks == NULL)
  11.228 +		return;
  11.229 +
  11.230 +	memset(i386_alldisks, 0, dklist_size);
  11.231 +
  11.232 +	i386_alldisks->dl_nnativedisks = i386_ndisks;
  11.233 +	i386_alldisks->dl_nbiosdisks = big->num;
  11.234 +	for (i = 0; i < big->num; i++) {
  11.235 +		i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev;
  11.236 +		i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec;
  11.237 +		i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head;
  11.238 +		i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl;
  11.239 +		i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec;
  11.240 +		i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags;
  11.241 +#ifdef GEOM_DEBUG
  11.242 +#ifdef NOTYET
  11.243 +		printf("disk %x: flags %x, interface %x, device %llx\n",
  11.244 +			big->disk[i].dev, big->disk[i].flags,
  11.245 +			big->disk[i].interface_path, big->disk[i].device_path);
  11.246 +#endif
  11.247 +#endif
  11.248 +	}
  11.249 +
  11.250 +	/*
  11.251 +	 * XXX code duplication from findroot()
  11.252 +	 */
  11.253 +	n = -1;
  11.254 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
  11.255 +		if (dv->dv_class != DV_DISK)
  11.256 +			continue;
  11.257 +#ifdef GEOM_DEBUG
  11.258 +		printf("matchbiosdisks: trying to match (%s) %s\n",
  11.259 +		    dv->dv_xname, dv->dv_cfdata->cf_name);
  11.260 +#endif
  11.261 +		if (is_valid_disk(dv)) {
  11.262 +			n++;
  11.263 +			sprintf(i386_alldisks->dl_nativedisks[n].ni_devname,
  11.264 +			    "%s%d", dv->dv_cfdata->cf_name,
  11.265 +			    dv->dv_unit);
  11.266 +
  11.267 +			bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
  11.268 +			if (bmajor == -1)
  11.269 +				return;
  11.270 +
  11.271 +			if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART),
  11.272 +			    &tv))
  11.273 +				panic("matchbiosdisks: can't alloc vnode");
  11.274 +
  11.275 +			error = VOP_OPEN(tv, FREAD, NOCRED, 0);
  11.276 +			if (error) {
  11.277 +				vput(tv);
  11.278 +				continue;
  11.279 +			}
  11.280 +			error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0,
  11.281 +			    UIO_SYSSPACE, 0, NOCRED, NULL, 0);
  11.282 +			VOP_CLOSE(tv, FREAD, NOCRED, 0);
  11.283 +			if (error) {
  11.284 +#ifdef GEOM_DEBUG
  11.285 +				printf("matchbiosdisks: %s: MBR read failure\n",
  11.286 +				    dv->dv_xname);
  11.287 +#endif
  11.288 +				continue;
  11.289 +			}
  11.290 +
  11.291 +			for (ck = i = 0; i < DEV_BSIZE; i++)
  11.292 +				ck += mbr[i];
  11.293 +			for (m = i = 0; i < big->num; i++) {
  11.294 +				be = &big->disk[i];
  11.295 +#ifdef GEOM_DEBUG
  11.296 +				printf("match %s with %d ", dv->dv_xname, i);
  11.297 +				printf("dev ck %x bios ck %x\n", ck, be->cksum);
  11.298 +#endif
  11.299 +				if (be->flags & BI_GEOM_INVALID)
  11.300 +					continue;
  11.301 +				if (be->cksum == ck &&
  11.302 +				    !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts,
  11.303 +					MBR_PART_COUNT *
  11.304 +					    sizeof (struct mbr_partition))) {
  11.305 +#ifdef GEOM_DEBUG
  11.306 +					printf("matched bios disk %x with %s\n",
  11.307 +					    be->dev, dv->dv_xname);
  11.308 +#endif
  11.309 +					i386_alldisks->dl_nativedisks[n].
  11.310 +					    ni_biosmatches[m++] = i;
  11.311 +				}
  11.312 +			}
  11.313 +			i386_alldisks->dl_nativedisks[n].ni_nmatches = m;
  11.314 +			vput(tv);
  11.315 +		}
  11.316 +	}
  11.317 +}
  11.318 +
  11.319 +#ifdef COMPAT_OLDBOOT
  11.320 +u_long	bootdev = 0;		/* should be dev_t, but not until 32 bits */
  11.321 +#endif
  11.322 +
  11.323 +/*
  11.324 + * helper function for "findroot()":
  11.325 + * return nonzero if disk device matches bootinfo
  11.326 + */
  11.327 +static int
  11.328 +match_harddisk(struct device *dv, struct btinfo_bootdisk *bid)
  11.329 +{
  11.330 +	struct vnode *tmpvn;
  11.331 +	int error;
  11.332 +	struct disklabel label;
  11.333 +	int found = 0;
  11.334 +	int bmajor;
  11.335 +
  11.336 +	/*
  11.337 +	 * A disklabel is required here.  The
  11.338 +	 * bootblocks don't refuse to boot from
  11.339 +	 * a disk without a label, but this is
  11.340 +	 * normally not wanted.
  11.341 +	 */
  11.342 +	if (bid->labelsector == -1)
  11.343 +		return(0);
  11.344 +
  11.345 +	/*
  11.346 +	 * lookup major number for disk block device
  11.347 +	 */
  11.348 +	bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
  11.349 +	if (bmajor == -1)
  11.350 +		return(0); /* XXX panic() ??? */
  11.351 +
  11.352 +	/*
  11.353 +	 * Fake a temporary vnode for the disk, open
  11.354 +	 * it, and read the disklabel for comparison.
  11.355 +	 */
  11.356 +	if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn))
  11.357 +		panic("findroot can't alloc vnode");
  11.358 +	error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0);
  11.359 +	if (error) {
  11.360 +#ifndef DEBUG
  11.361 +		/*
  11.362 +		 * Ignore errors caused by missing
  11.363 +		 * device, partition or medium.
  11.364 +		 */
  11.365 +		if (error != ENXIO && error != ENODEV)
  11.366 +#endif
  11.367 +			printf("findroot: can't open dev %s%c (%d)\n",
  11.368 +			       dv->dv_xname, 'a' + bid->partition, error);
  11.369 +		vput(tmpvn);
  11.370 +		return(0);
  11.371 +	}
  11.372 +	error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0);
  11.373 +	if (error) {
  11.374 +		/*
  11.375 +		 * XXX can't happen - open() would
  11.376 +		 * have errored out (or faked up one)
  11.377 +		 */
  11.378 +		printf("can't get label for dev %s%c (%d)\n",
  11.379 +		       dv->dv_xname, 'a' + bid->partition, error);
  11.380 +		goto closeout;
  11.381 +	}
  11.382 +
  11.383 +	/* compare with our data */
  11.384 +	if (label.d_type == bid->label.type &&
  11.385 +	    label.d_checksum == bid->label.checksum &&
  11.386 +	    !strncmp(label.d_packname, bid->label.packname, 16))
  11.387 +		found = 1;
  11.388 +
  11.389 +closeout:
  11.390 +	VOP_CLOSE(tmpvn, FREAD, NOCRED, 0);
  11.391 +	vput(tmpvn);
  11.392 +	return(found);
  11.393 +}
  11.394 +
  11.395 +/*
  11.396 + * Attempt to find the device from which we were booted.
  11.397 + * If we can do so, and not instructed not to do so,
  11.398 + * change rootdev to correspond to the load device.
  11.399 + */
  11.400 +void
  11.401 +findroot(void)
  11.402 +{
  11.403 +	struct btinfo_bootdisk *bid;
  11.404 +	struct device *dv;
  11.405 +	union xen_cmdline_parseinfo xcp;
  11.406 +#ifdef COMPAT_OLDBOOT
   11.407 +	int i, majdev, unit, part; const char *name;
  11.408 +	char buf[32];
  11.409 +#endif
  11.410 +
  11.411 +	if (booted_device)
  11.412 +		return;
  11.413 +
  11.414 +	if (lookup_bootinfo(BTINFO_NETIF)) {
  11.415 +		/*
  11.416 +		 * We got netboot interface information, but
  11.417 +		 * "device_register()" couldn't match it to a configured
  11.418 +		 * device. Bootdisk information cannot be present at the
  11.419 +		 * same time, so give up.
  11.420 +		 */
  11.421 +		printf("findroot: netboot interface not found\n");
  11.422 +		return;
  11.423 +	}
  11.424 +
  11.425 +	bid = lookup_bootinfo(BTINFO_BOOTDISK);
  11.426 +	if (bid) {
  11.427 +		/*
  11.428 +		 * Scan all disk devices for ones that match the passed data.
  11.429 +		 * Don't break if one is found, to get possible multiple
  11.430 +		 * matches - for problem tracking. Use the first match anyway
  11.431 +		 * because lower device numbers are more likely to be the
  11.432 +		 * boot device.
  11.433 +		 */
  11.434 +		for (dv = alldevs.tqh_first; dv != NULL;
  11.435 +		    dv = dv->dv_list.tqe_next) {
  11.436 +			if (dv->dv_class != DV_DISK)
  11.437 +				continue;
  11.438 +
  11.439 +			if (!strcmp(dv->dv_cfdata->cf_name, "fd")) {
  11.440 +				/*
  11.441 +				 * Assume the configured unit number matches
  11.442 +				 * the BIOS device number.  (This is the old
  11.443 +				 * behaviour.)  Needs some ideas how to handle
  11.444 +				 * BIOS's "swap floppy drive" options.
  11.445 +				 */
  11.446 +				if ((bid->biosdev & 0x80) ||
  11.447 +				    dv->dv_unit != bid->biosdev)
  11.448 +					continue;
  11.449 +
  11.450 +				goto found;
  11.451 +			}
  11.452 +
  11.453 +			if (is_valid_disk(dv)) {
  11.454 +				/*
  11.455 +				 * Don't trust BIOS device numbers, try
  11.456 +				 * to match the information passed by the
  11.457 +				 * bootloader instead.
  11.458 +				 */
  11.459 +				if ((bid->biosdev & 0x80) == 0 ||
  11.460 +				    !match_harddisk(dv, bid))
  11.461 +					continue;
  11.462 +
  11.463 +				goto found;
  11.464 +			}
  11.465 +
  11.466 +			/* no "fd", "wd", "sd", "ld", "ed" */
  11.467 +			continue;
  11.468 +
  11.469 +found:
  11.470 +			if (booted_device) {
  11.471 +				printf("warning: double match for boot "
  11.472 +				    "device (%s, %s)\n",
  11.473 +				    booted_device->dv_xname, dv->dv_xname);
  11.474 +				continue;
  11.475 +			}
  11.476 +			booted_device = dv;
  11.477 +			booted_partition = bid->partition;
  11.478 +		}
  11.479 +
  11.480 +		if (booted_device)
  11.481 +			return;
  11.482 +	}
  11.483 +
  11.484 +	xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
  11.485 +
  11.486 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
  11.487 +		if (is_valid_disk(dv) == 0)
  11.488 +			continue;
  11.489 +
  11.490 +		if (xcp.xcp_bootdev[0] == 0) {
  11.491 +			booted_device = dv;
  11.492 +			break;
  11.493 +		}
  11.494 +
  11.495 +		if (strncmp(xcp.xcp_bootdev, dv->dv_xname,
  11.496 +		    strlen(dv->dv_xname)))
  11.497 +			continue;
  11.498 +
  11.499 +		if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) {
  11.500 +			booted_partition = toupper(
  11.501 +				xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A';
  11.502 +		}
  11.503 +
  11.504 +		booted_device = dv;
  11.505 +		break;
  11.506 +	}
  11.507 +
  11.508 +	if (booted_device)
  11.509 +		return;
  11.510 +
  11.511 +#ifdef COMPAT_OLDBOOT
  11.512 +#if 0
  11.513 +	printf("howto %x bootdev %x ", boothowto, bootdev);
  11.514 +#endif
  11.515 +
  11.516 +	if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC)
  11.517 +		return;
  11.518 +
  11.519 +	majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK;
  11.520 +	name = devsw_blk2name(majdev);
  11.521 +	if (name == NULL)
  11.522 +		return;
  11.523 +
  11.524 +	part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK;
  11.525 +	unit = (bootdev >> B_UNITSHIFT) & B_UNITMASK;
  11.526 +
  11.527 +	sprintf(buf, "%s%d", name, unit);
  11.528 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
  11.529 +		if (strcmp(buf, dv->dv_xname) == 0) {
  11.530 +			booted_device = dv;
  11.531 +			booted_partition = part;
  11.532 +			return;
  11.533 +		}
  11.534 +	}
  11.535 +#endif
  11.536 +}
  11.537 +
  11.538 +#include "pci.h"
  11.539 +
  11.540 +#include <dev/isa/isavar.h>
  11.541 +#if NPCI > 0
  11.542 +#include <dev/pci/pcivar.h>
  11.543 +#endif
  11.544 +
  11.545 +void
  11.546 +device_register(struct device *dev, void *aux)
  11.547 +{
  11.548 +	/*
   11.549 +	 * Handle network interfaces here; the attachment information is
   11.550 +	 * not available driver-independently later.
  11.551 +	 * For disks, there is nothing useful available at attach time.
  11.552 +	 */
  11.553 +#if NXENNET > 0
  11.554 +	if (dev->dv_class == DV_IFNET) {
  11.555 +		union xen_cmdline_parseinfo xcp;
  11.556 +
  11.557 +		xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
  11.558 +		if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) {
  11.559 +#ifdef NFS_BOOT_BOOTSTATIC
  11.560 +			nfs_bootstatic_callback = xennet_bootstatic_callback;
  11.561 +#endif
  11.562 +			goto found;
  11.563 +		}
  11.564 +	}
  11.565 +#endif
  11.566 +	if (dev->dv_class == DV_IFNET) {
  11.567 +		struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF);
  11.568 +		if (bin == NULL)
  11.569 +			return;
  11.570 +
  11.571 +		/*
  11.572 +		 * We don't check the driver name against the device name
  11.573 +		 * passed by the boot ROM. The ROM should stay usable
  11.574 +		 * if the driver gets obsoleted.
  11.575 +		 * The physical attachment information (checked below)
  11.576 +		 * must be sufficient to identify the device.
  11.577 +		 */
  11.578 +
  11.579 +		if (bin->bus == BI_BUS_ISA &&
  11.580 +		    !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) {
  11.581 +			struct isa_attach_args *iaa = aux;
  11.582 +
  11.583 +			/* compare IO base address */
  11.584 +			/* XXXJRT what about multiple I/O addrs? */
  11.585 +			if (iaa->ia_nio > 0 &&
  11.586 +			    bin->addr.iobase == iaa->ia_io[0].ir_addr)
  11.587 +				goto found;
  11.588 +		}
  11.589 +#if NPCI > 0
  11.590 +		if (bin->bus == BI_BUS_PCI &&
  11.591 +		    !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) {
  11.592 +			struct pci_attach_args *paa = aux;
  11.593 +			int b, d, f;
  11.594 +
  11.595 +			/*
  11.596 +			 * Calculate BIOS representation of:
  11.597 +			 *
  11.598 +			 *	<bus,device,function>
  11.599 +			 *
  11.600 +			 * and compare.
  11.601 +			 */
  11.602 +			pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f);
  11.603 +			if (bin->addr.tag == ((b << 8) | (d << 3) | f))
  11.604 +				goto found;
  11.605 +		}
  11.606 +#endif
  11.607 +	}
  11.608 +	return;
  11.609 +
  11.610 +found:
  11.611 +	if (booted_device) {
  11.612 +		/* XXX should be a "panic()" */
  11.613 +		printf("warning: double match for boot device (%s, %s)\n",
  11.614 +		    booted_device->dv_xname, dev->dv_xname);
  11.615 +		return;
  11.616 +	}
  11.617 +	booted_device = dev;
  11.618 +}
  11.619 +
  11.620 +static int
  11.621 +is_valid_disk(struct device *dv)
  11.622 +{
  11.623 +	const char *name;
  11.624 +
  11.625 +	if (dv->dv_class != DV_DISK)
  11.626 +		return (0);
  11.627 +
  11.628 +	name = dv->dv_cfdata->cf_name;
  11.629 +
  11.630 +	return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 ||
  11.631 +	    strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 ||
  11.632 +	    strcmp(name, "xbd") == 0);
  11.633 +}
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c	Mon Sep 06 19:11:17 2004 +0000
    12.3 @@ -0,0 +1,408 @@
    12.4 +/*	$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $	*/
    12.5 +/*	NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp 	*/
    12.6 +
    12.7 +/*-
    12.8 + * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
    12.9 + * All rights reserved.
   12.10 + *
   12.11 + * This code is derived from software contributed to The NetBSD Foundation
   12.12 + * by John T. Kohl and Charles M. Hannum.
   12.13 + *
   12.14 + * Redistribution and use in source and binary forms, with or without
   12.15 + * modification, are permitted provided that the following conditions
   12.16 + * are met:
   12.17 + * 1. Redistributions of source code must retain the above copyright
   12.18 + *    notice, this list of conditions and the following disclaimer.
   12.19 + * 2. Redistributions in binary form must reproduce the above copyright
   12.20 + *    notice, this list of conditions and the following disclaimer in the
   12.21 + *    documentation and/or other materials provided with the distribution.
   12.22 + * 3. All advertising materials mentioning features or use of this software
   12.23 + *    must display the following acknowledgement:
   12.24 + *        This product includes software developed by the NetBSD
   12.25 + *        Foundation, Inc. and its contributors.
   12.26 + * 4. Neither the name of The NetBSD Foundation nor the names of its
   12.27 + *    contributors may be used to endorse or promote products derived
   12.28 + *    from this software without specific prior written permission.
   12.29 + *
   12.30 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   12.31 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   12.32 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   12.33 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   12.34 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   12.35 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   12.36 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   12.37 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   12.38 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   12.39 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   12.40 + * POSSIBILITY OF SUCH DAMAGE.
   12.41 + */
   12.42 +
   12.43 +#include <sys/cdefs.h>
   12.44 +__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $");
   12.45 +
   12.46 +#include "opt_multiprocessor.h"
   12.47 +#include "opt_xen.h"
   12.48 +
   12.49 +#include <sys/param.h>
   12.50 +#include <sys/systm.h>
   12.51 +#include <sys/proc.h>
   12.52 +#include <sys/lock.h>
   12.53 +#include <sys/user.h>
   12.54 +
   12.55 +#include <uvm/uvm.h>
   12.56 +
   12.57 +#include <machine/gdt.h>
   12.58 +
   12.59 +int gdt_size[2];	/* total number of GDT entries */
   12.60 +int gdt_count[2];	/* number of GDT entries in use */
   12.61 +int gdt_next[2];	/* next available slot for sweeping */
   12.62 +int gdt_free[2];	/* next free slot; terminated with GNULL_SEL */
   12.63 +
   12.64 +struct lock gdt_lock_store;
   12.65 +
   12.66 +static __inline void gdt_lock(void);
   12.67 +static __inline void gdt_unlock(void);
   12.68 +void gdt_init(void);
   12.69 +void gdt_grow(int);
   12.70 +int gdt_get_slot(void);
   12.71 +int gdt_get_slot1(int);
   12.72 +void gdt_put_slot(int);
   12.73 +void gdt_put_slot1(int, int);
   12.74 +
   12.75 +/*
   12.76 + * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep
   12.77 + * waiting for memory.
   12.78 + *
   12.79 + * Note that the locking done here is not sufficient for multiprocessor
   12.80 + * systems.  A freshly allocated slot will still be of type SDT_SYSNULL for
   12.81 + * some time after the GDT is unlocked, so gdt_compact() could attempt to
   12.82 + * reclaim it.
   12.83 + */
   12.84 +static __inline void
   12.85 +gdt_lock()
   12.86 +{
   12.87 +
   12.88 +	(void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL);
   12.89 +}
   12.90 +
   12.91 +static __inline void
   12.92 +gdt_unlock()
   12.93 +{
   12.94 +
   12.95 +	(void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL);
   12.96 +}
   12.97 +
   12.98 +void
   12.99 +setgdt(int sel, void *base, size_t limit,
  12.100 +    int type, int dpl, int def32, int gran)
  12.101 +{
  12.102 +	struct segment_descriptor sd;
  12.103 +	CPU_INFO_ITERATOR cii;
  12.104 +	struct cpu_info *ci;
  12.105 +
  12.106 +	if (type == SDT_SYS386TSS) {
  12.107 +		/* printk("XXX TSS descriptor not supported in GDT\n"); */
  12.108 +		return;
  12.109 +	}
  12.110 +
  12.111 +	setsegment(&sd, base, limit, type, dpl, def32, gran);
  12.112 +	for (CPU_INFO_FOREACH(cii, ci)) {
  12.113 +		if (ci->ci_gdt != NULL) {
  12.114 +#ifndef XEN
  12.115 +			ci->ci_gdt[sel].sd = sd;
  12.116 +#else
  12.117 +			xen_update_descriptor(&ci->ci_gdt[sel],
  12.118 +			    (union descriptor *)&sd);
  12.119 +#endif
  12.120 +		}
  12.121 +	}
  12.122 +}
  12.123 +
  12.124 +/*
  12.125 + * Initialize the GDT subsystem.  Called from autoconf().
  12.126 + */
  12.127 +void
  12.128 +gdt_init()
  12.129 +{
  12.130 +	size_t max_len, min_len;
  12.131 +	union descriptor *old_gdt;
  12.132 +	struct vm_page *pg;
  12.133 +	vaddr_t va;
  12.134 +	struct cpu_info *ci = &cpu_info_primary;
  12.135 +
  12.136 +	lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0);
  12.137 +
  12.138 +	max_len = MAXGDTSIZ * sizeof(gdt[0]);
  12.139 +	min_len = MINGDTSIZ * sizeof(gdt[0]);
  12.140 +
  12.141 +	gdt_size[0] = MINGDTSIZ;
  12.142 +	gdt_count[0] = NGDT;
  12.143 +	gdt_next[0] = NGDT;
  12.144 +	gdt_free[0] = GNULL_SEL;
  12.145 +
  12.146 +	gdt_size[1] = 0;
  12.147 +	gdt_count[1] = MAXGDTSIZ;
  12.148 +	gdt_next[1] = MAXGDTSIZ;
  12.149 +	gdt_free[1] = GNULL_SEL;
  12.150 +
  12.151 +	old_gdt = gdt;
  12.152 +	gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len);
  12.153 +	for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) {
  12.154 +		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
  12.155 +		if (pg == NULL) {
  12.156 +			panic("gdt_init: no pages");
  12.157 +		}
  12.158 +		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
  12.159 +		    VM_PROT_READ | VM_PROT_WRITE);
  12.160 +	}
  12.161 +	memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0]));
  12.162 +	ci->ci_gdt = gdt;
  12.163 +	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
  12.164 +	    SDT_MEMRWA, SEL_KPL, 1, 1);
  12.165 +
  12.166 +	gdt_init_cpu(ci);
  12.167 +}
  12.168 +
  12.169 +/*
  12.170 + * Allocate shadow GDT for a slave CPU.
  12.171 + */
  12.172 +void
  12.173 +gdt_alloc_cpu(struct cpu_info *ci)
  12.174 +{
  12.175 +	int max_len = MAXGDTSIZ * sizeof(gdt[0]);
  12.176 +	int min_len = MINGDTSIZ * sizeof(gdt[0]);
  12.177 +	struct vm_page *pg;
  12.178 +	vaddr_t va;
  12.179 +
  12.180 +	ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len);
  12.181 +	for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len;
  12.182 +	    va += PAGE_SIZE) {
  12.183 +		while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO))
  12.184 +		    == NULL) {
  12.185 +			uvm_wait("gdt_alloc_cpu");
  12.186 +		}
  12.187 +		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
  12.188 +		    VM_PROT_READ | VM_PROT_WRITE);
  12.189 +	}
  12.190 +	memset(ci->ci_gdt, 0, min_len);
  12.191 +	memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0]));
  12.192 +	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
  12.193 +	    SDT_MEMRWA, SEL_KPL, 1, 1);
  12.194 +}
  12.195 +
  12.196 +
  12.197 +/*
  12.198 + * Load the appropriate GDT descriptor; we had better be running on *ci
  12.199 + * (for the most part, this is how a CPU knows who it is).
  12.200 + */
  12.201 +void
  12.202 +gdt_init_cpu(struct cpu_info *ci)
  12.203 +{
  12.204 +#ifndef XEN
  12.205 +	struct region_descriptor region;
  12.206 +	size_t max_len;
  12.207 +
  12.208 +	max_len = MAXGDTSIZ * sizeof(gdt[0]);
  12.209 +	setregion(&region, ci->ci_gdt, max_len - 1);
  12.210 +	lgdt(&region);
  12.211 +#else
  12.212 +	size_t len = gdt_size[0] * sizeof(gdt[0]);
  12.213 +	unsigned long frames[len >> PAGE_SHIFT];
  12.214 +	vaddr_t va;
  12.215 +	pt_entry_t *ptp;
  12.216 +	pt_entry_t *maptp;
  12.217 +	int f;
  12.218 +
  12.219 +	for (va = (vaddr_t)ci->ci_gdt, f = 0;
  12.220 +	     va < (vaddr_t)ci->ci_gdt + len;
  12.221 +	     va += PAGE_SIZE, f++) {
  12.222 +		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
  12.223 +		ptp = kvtopte(va);
  12.224 +		frames[f] = *ptp >> PAGE_SHIFT;
  12.225 +		maptp = (pt_entry_t *)vtomach((vaddr_t)ptp);
  12.226 +		PTE_CLEARBITS(ptp, maptp, PG_RW);
  12.227 +	}
  12.228 +	PTE_UPDATES_FLUSH();
  12.229 +	/* printk("loading gdt %x, %d entries, %d pages",
  12.230 +	    frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */
  12.231 +	if (HYPERVISOR_set_gdt(frames, gdt_size[0]))
  12.232 +		panic("HYPERVISOR_set_gdt failed!\n");
  12.233 +	lgdt_finish();
  12.234 +#endif
  12.235 +}
  12.236 +
  12.237 +#ifdef MULTIPROCESSOR
  12.238 +
  12.239 +void
  12.240 +gdt_reload_cpu(struct cpu_info *ci)
  12.241 +{
  12.242 +	struct region_descriptor region;
  12.243 +	size_t max_len;
  12.244 +
  12.245 +	max_len = MAXGDTSIZ * sizeof(gdt[0]);
  12.246 +	setregion(&region, ci->ci_gdt, max_len - 1);
  12.247 +	lgdt(&region);
  12.248 +}
  12.249 +#endif
  12.250 +
  12.251 +
  12.252 +/*
  12.253 + * Grow the GDT.
  12.254 + */
  12.255 +void
  12.256 +gdt_grow(int which)
  12.257 +{
  12.258 +	size_t old_len, new_len, max_len;
  12.259 +	CPU_INFO_ITERATOR cii;
  12.260 +	struct cpu_info *ci;
  12.261 +	struct vm_page *pg;
  12.262 +	vaddr_t va;
  12.263 +
  12.264 +	old_len = gdt_size[which] * sizeof(gdt[0]);
  12.265 +	gdt_size[which] <<= 1;
  12.266 +	new_len = old_len << 1;
  12.267 +
  12.268 +	if (which != 0) {
  12.269 +		max_len = MAXGDTSIZ * sizeof(gdt[0]);
  12.270 +		if (old_len == 0) {
  12.271 +			gdt_size[which] = MINGDTSIZ;
  12.272 +			new_len = gdt_size[which] * sizeof(gdt[0]);
  12.273 +		}
  12.274 +		for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len;
  12.275 +		     va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len;
  12.276 +		     va += PAGE_SIZE) {
  12.277 +			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
  12.278 +			    NULL) {
  12.279 +				uvm_wait("gdt_grow");
  12.280 +			}
  12.281 +			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
  12.282 +			    VM_PROT_READ | VM_PROT_WRITE);
  12.283 +		}
  12.284 +		return;
  12.285 +	}
  12.286 +
  12.287 +	for (CPU_INFO_FOREACH(cii, ci)) {
  12.288 +		for (va = (vaddr_t)(ci->ci_gdt) + old_len;
  12.289 +		     va < (vaddr_t)(ci->ci_gdt) + new_len;
  12.290 +		     va += PAGE_SIZE) {
  12.291 +			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
  12.292 +			    NULL) {
  12.293 +				uvm_wait("gdt_grow");
  12.294 +			}
  12.295 +			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
  12.296 +			    VM_PROT_READ | VM_PROT_WRITE);
  12.297 +		}
  12.298 +	}
  12.299 +}
  12.300 +
  12.301 +/*
  12.302 + * Allocate a GDT slot as follows:
  12.303 + * 1) If there are entries on the free list, use those.
  12.304 + * 2) If there are fewer than gdt_size entries in use, there are free slots
  12.305 + *    near the end that we can sweep through.
  12.306 + * 3) As a last resort, we increase the size of the GDT, and sweep through
  12.307 + *    the new slots.
  12.308 + */
  12.309 +int
  12.310 +gdt_get_slot()
  12.311 +{
  12.312 +	return gdt_get_slot1(0);
  12.313 +}
  12.314 +
  12.315 +int
  12.316 +gdt_get_slot1(int which)
  12.317 +{
  12.318 +	size_t offset;
  12.319 +	int slot;
  12.320 +
  12.321 +	gdt_lock();
  12.322 +
  12.323 +	if (gdt_free[which] != GNULL_SEL) {
  12.324 +		slot = gdt_free[which];
  12.325 +		gdt_free[which] = gdt[slot].gd.gd_selector;
  12.326 +	} else {
  12.327 +		offset = which * MAXGDTSIZ * sizeof(gdt[0]);
  12.328 +		if (gdt_next[which] != gdt_count[which] + offset)
  12.329 +			panic("gdt_get_slot botch 1");
  12.330 +		if (gdt_next[which] - offset >= gdt_size[which]) {
  12.331 +			if (gdt_size[which] >= MAXGDTSIZ)
  12.332 +				panic("gdt_get_slot botch 2");
  12.333 +			gdt_grow(which);
  12.334 +		}
  12.335 +		slot = gdt_next[which]++;
  12.336 +	}
  12.337 +
  12.338 +	gdt_count[which]++;
  12.339 +	gdt_unlock();
  12.340 +	return (slot);
  12.341 +}
  12.342 +
  12.343 +/*
  12.344 + * Deallocate a GDT slot, putting it on the free list.
  12.345 + */
  12.346 +void
  12.347 +gdt_put_slot(int slot)
  12.348 +{
  12.349 +	gdt_put_slot1(slot, 0);
  12.350 +}
  12.351 +
  12.352 +void
  12.353 +gdt_put_slot1(int slot, int which)
  12.354 +{
  12.355 +
  12.356 +	gdt_lock();
  12.357 +	gdt_count[which]--;
  12.358 +
  12.359 +	gdt[slot].gd.gd_type = SDT_SYSNULL;
  12.360 +	gdt[slot].gd.gd_selector = gdt_free[which];
  12.361 +	gdt_free[which] = slot;
  12.362 +
  12.363 +	gdt_unlock();
  12.364 +}
  12.365 +
  12.366 +int
  12.367 +tss_alloc(struct pcb *pcb)
  12.368 +{
  12.369 +	int slot;
  12.370 +
  12.371 +	slot = gdt_get_slot();
  12.372 +	setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1,
  12.373 +	    SDT_SYS386TSS, SEL_KPL, 0, 0);
  12.374 +	return GSEL(slot, SEL_KPL);
  12.375 +}
  12.376 +
  12.377 +void
  12.378 +tss_free(int sel)
  12.379 +{
  12.380 +
  12.381 +	gdt_put_slot(IDXSEL(sel));
  12.382 +}
  12.383 +
  12.384 +/*
  12.385 + * Caller must have pmap locked for both of these functions.
  12.386 + */
  12.387 +void
  12.388 +ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len)
  12.389 +{
  12.390 +	int slot;
  12.391 +
  12.392 +	slot = gdt_get_slot1(1);
  12.393 +#ifndef XEN
  12.394 +	setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
  12.395 +#else
  12.396 +	cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt;
  12.397 +	cpu_info_primary.ci_gdt[slot].ld.ld_entries =
  12.398 +		len / sizeof(union descriptor);
  12.399 +#endif
  12.400 +	pmap->pm_ldt_sel = GSEL(slot, SEL_KPL);
  12.401 +}
  12.402 +
  12.403 +void
  12.404 +ldt_free(struct pmap *pmap)
  12.405 +{
  12.406 +	int slot;
  12.407 +
  12.408 +	slot = IDXSEL(pmap->pm_ldt_sel);
  12.409 +
  12.410 +	gdt_put_slot1(slot, 1);
  12.411 +}
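
One detail of the slot allocator above is easy to miss: gdt_put_slot1()
threads the free list through the descriptor table itself. A freed entry is
marked SDT_SYSNULL and its gd_selector field is reused as the next-free
index, with GNULL_SEL terminating the chain. A minimal sketch of the same
intrusive free-list idea over a plain array (all names illustrative, not
from this file):

    #define NIL (-1)                        /* terminator, like GNULL_SEL */

    struct slot { int next_free; };         /* stands in for gd_selector */

    static struct slot table[64];
    static int free_head = NIL;
    static int next_unused = 1;             /* slot 0 stays reserved */

    static int
    slot_get(void)
    {
            int s;

            if (free_head != NIL) {         /* 1) reuse a freed slot */
                    s = free_head;
                    free_head = table[s].next_free;
            } else
                    s = next_unused++;      /* 2) sweep fresh slots */
            return s;
    }

    static void
    slot_put(int s)
    {
            table[s].next_free = free_head; /* chain through the entry */
            free_head = s;
    }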
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c	Mon Sep 06 19:11:17 2004 +0000
    13.3 @@ -0,0 +1,230 @@
    13.4 +/*	$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $	*/
    13.5 +
    13.6 +/*
    13.7 + *
    13.8 + * Copyright (c) 2004 Christian Limpach.
    13.9 + * All rights reserved.
   13.10 + *
   13.11 + * Redistribution and use in source and binary forms, with or without
   13.12 + * modification, are permitted provided that the following conditions
   13.13 + * are met:
   13.14 + * 1. Redistributions of source code must retain the above copyright
   13.15 + *    notice, this list of conditions and the following disclaimer.
   13.16 + * 2. Redistributions in binary form must reproduce the above copyright
   13.17 + *    notice, this list of conditions and the following disclaimer in the
   13.18 + *    documentation and/or other materials provided with the distribution.
   13.19 + * 3. All advertising materials mentioning features or use of this software
   13.20 + *    must display the following acknowledgement:
   13.21 + *      This product includes software developed by Christian Limpach.
   13.22 + * 4. The name of the author may not be used to endorse or promote products
   13.23 + *    derived from this software without specific prior written permission.
   13.24 + *
   13.25 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   13.26 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   13.27 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   13.28 + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   13.29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   13.30 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   13.31 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   13.32 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   13.33 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   13.34 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   13.35 + */
   13.36 +
   13.37 +/******************************************************************************
   13.38 + * hypervisor.c
   13.39 + * 
   13.40 + * Communication to/from hypervisor.
   13.41 + * 
   13.42 + * Copyright (c) 2002-2004, K A Fraser
   13.43 + * 
   13.44 + * Permission is hereby granted, free of charge, to any person obtaining a copy
   13.45 + * of this software and associated documentation files (the "Software"), to
   13.46 + * deal in the Software without restriction, including without limitation the
   13.47 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
   13.48 + * sell copies of the Software, and to permit persons to whom the Software is
   13.49 + * furnished to do so, subject to the following conditions:
   13.50 + * 
   13.51 + * The above copyright notice and this permission notice shall be included in
   13.52 + * all copies or substantial portions of the Software.
   13.53 + * 
   13.54 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
   13.55 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
   13.56 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
   13.57 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
   13.58 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
   13.59 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
   13.60 + * DEALINGS IN THE SOFTWARE.
   13.61 + */
   13.62 +
   13.63 +
   13.64 +#include <sys/cdefs.h>
   13.65 +__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $");
   13.66 +
   13.67 +#include <sys/cdefs.h>
   13.68 +#include <sys/param.h>
   13.69 +#include <sys/systm.h>
   13.70 +
   13.71 +#include <machine/xen.h>
   13.72 +#include <machine/hypervisor.h>
   13.73 +#include <machine/evtchn.h>
   13.74 +
   13.75 +/*
   13.76 + * Force a proper event-channel callback from Xen after clearing the
   13.77 + * callback mask. We do this in a very simple manner, by making a call
   13.78 + * down into Xen. The pending flag will be checked by Xen on return.
   13.79 + */
   13.80 +void
   13.81 +hypervisor_force_callback(void)
   13.82 +{
   13.83 +
   13.84 +	(void)HYPERVISOR_xen_version(0);
   13.85 +}
   13.86 +
   13.87 +int stipending(void);
   13.88 +int
   13.89 +stipending()
   13.90 +{
   13.91 +	uint32_t l1;
   13.92 +	unsigned long l2;
   13.93 +	unsigned int l1i, l2i, port;
   13.94 +	int irq;
   13.95 +	shared_info_t *s = HYPERVISOR_shared_info;
   13.96 +	struct cpu_info *ci;
   13.97 +	int ret;
   13.98 +
   13.99 +	ret = 0;
  13.100 +	ci = curcpu();
  13.101 +
  13.102 +#if 0
  13.103 +	if (HYPERVISOR_shared_info->events)
  13.104 +		printf("stipending events %08lx mask %08lx ilevel %d\n",
  13.105 +		    HYPERVISOR_shared_info->events,
  13.106 +		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
  13.107 +#endif
  13.108 +
  13.109 +	/*
  13.110 +	 * We're only called after STIC, so we know that we'll have to
  13.111 +	 * STI at the end.
  13.112 +	 */
  13.113 +	cli();
  13.114 +	while (s->vcpu_data[0].evtchn_upcall_pending) {
  13.115 +		s->vcpu_data[0].evtchn_upcall_pending = 0;
  13.116 +		/* NB. No need for a barrier here -- XCHG is a barrier
  13.117 +		 * on x86. */
  13.118 +		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
  13.119 +		while ((l1i = ffs(l1)) != 0) {
  13.120 +			l1i--;
  13.121 +			l1 &= ~(1 << l1i);
  13.122 +
  13.123 +			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
  13.124 +			while ((l2i = ffs(l2)) != 0) {
  13.125 +				l2i--;
  13.126 +				l2 &= ~(1 << l2i);
  13.127 +
  13.128 +				port = (l1i << 5) + l2i;
  13.129 +				if ((irq = evtchn_to_irq[port]) != -1) {
  13.130 +					hypervisor_acknowledge_irq(irq);
  13.131 +					ci->ci_ipending |= (1 << irq);
  13.132 +					if (ret == 0 && ci->ci_ilevel <
  13.133 +					    ci->ci_isources[irq]->is_handlers
  13.134 +					    ->ih_level)
  13.135 +						ret = 1;
  13.136 +				}
  13.137 +#if 0 /* XXXcl dev/evtchn */
  13.138 +				else
  13.139 +					evtchn_device_upcall(port);
  13.140 +#endif
  13.141 +			}
  13.142 +		}
  13.143 +	}
  13.144 +	sti();
  13.145 +
  13.146 +#if 0
  13.147 +	if (ci->ci_ipending & 0x1)
  13.148 +		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
  13.149 +		    HYPERVISOR_shared_info->events,
  13.150 +		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
  13.151 +		    ci->ci_ipending);
  13.152 +#endif
  13.153 +
  13.154 +	return (ret);
  13.155 +}
  13.156 +
  13.157 +void do_hypervisor_callback(struct trapframe *regs)
  13.158 +{
  13.159 +	uint32_t l1;
  13.160 +	unsigned long l2;
  13.161 +	unsigned int l1i, l2i, port;
  13.162 +	int irq;
  13.163 +	shared_info_t *s = HYPERVISOR_shared_info;
  13.164 +	struct cpu_info *ci;
  13.165 +	int level;
  13.166 +
  13.167 +	ci = curcpu();
  13.168 +	level = ci->ci_ilevel;
  13.169 +
  13.170 +	while (s->vcpu_data[0].evtchn_upcall_pending) {
  13.171 +		s->vcpu_data[0].evtchn_upcall_pending = 0;
  13.172 +		/* NB. No need for a barrier here -- XCHG is a barrier
  13.173 +		 * on x86. */
  13.174 +		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
  13.175 +		while ((l1i = ffs(l1)) != 0) {
  13.176 +			l1i--;
  13.177 +			l1 &= ~(1 << l1i);
  13.178 +
  13.179 +			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
  13.180 +			while ((l2i = ffs(l2)) != 0) {
  13.181 +				l2i--;
  13.182 +				l2 &= ~(1 << l2i);
  13.183 +
  13.184 +				port = (l1i << 5) + l2i;
  13.185 +				if ((irq = evtchn_to_irq[port]) != -1)
  13.186 +					do_event(irq, regs);
  13.187 +#if 0 /* XXXcl dev/evtchn */
  13.188 +				else
  13.189 +					evtchn_device_upcall(port);
  13.190 +#endif
  13.191 +			}
  13.192 +		}
  13.193 +	}
  13.194 +
  13.195 +#ifdef DIAGNOSTIC
  13.196 +	if (level != ci->ci_ilevel)
  13.197 +		printf("hypervisor done %08x level %d/%d ipending %08x\n",
  13.198 +		    HYPERVISOR_shared_info->evtchn_pending_sel, level,
  13.199 +		    ci->ci_ilevel, ci->ci_ipending);
  13.200 +#endif
  13.201 +}
  13.202 +
  13.203 +void hypervisor_unmask_event(unsigned int ev)
  13.204 +{
  13.205 +	shared_info_t *s = HYPERVISOR_shared_info;
  13.206 +
  13.207 +	x86_atomic_clear_bit(&s->evtchn_mask[0], ev);
  13.208 +	/*
  13.209 +	 * The following is basically the equivalent of
  13.210 +	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
  13.211 +	 * interrupt edge' if the channel is masked.
  13.212 +	 */
  13.213 +	if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) && 
  13.214 +	    !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) {
  13.215 +		s->vcpu_data[0].evtchn_upcall_pending = 1;
  13.216 +		if (!s->vcpu_data[0].evtchn_upcall_mask)
  13.217 +			hypervisor_force_callback();
  13.218 +	}
  13.219 +}
  13.220 +
  13.221 +void hypervisor_mask_event(unsigned int ev)
  13.222 +{
  13.223 +	shared_info_t *s = HYPERVISOR_shared_info;
  13.224 +
  13.225 +	x86_atomic_set_bit(&s->evtchn_mask[0], ev);
  13.226 +}
  13.227 +
  13.228 +void hypervisor_clear_event(unsigned int ev)
  13.229 +{
  13.230 +	shared_info_t *s = HYPERVISOR_shared_info;
  13.231 +
  13.232 +	x86_atomic_clear_bit(&s->evtchn_pending[0], ev);
  13.233 +}
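
Both stipending() and do_hypervisor_callback() above walk the same two-level
structure: evtchn_pending_sel is a selector word in which bit l1i means
"word l1i of evtchn_pending[] may have bits set", and each set, unmasked bit
l2i within that word names event-channel port (l1i << 5) + l2i. A
self-contained sketch of that scan, assuming nothing beyond the layout used
above (names and the printf are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>                    /* ffs() */

    static void
    scan_pending(uint32_t sel, const uint32_t pending[32],
        const uint32_t mask[32])
    {
            uint32_t l1 = sel, l2;
            unsigned int l1i, l2i;

            while ((l1i = ffs(l1)) != 0) {
                    l1i--;
                    l1 &= ~(1U << l1i);     /* consume the selector bit */
                    /* pending and not masked, as in the code above */
                    l2 = pending[l1i] & ~mask[l1i];
                    while ((l2i = ffs(l2)) != 0) {
                            l2i--;
                            l2 &= ~(1U << l2i);
                            printf("port %u pending\n", (l1i << 5) + l2i);
                    }
            }
    }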
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S	Mon Sep 06 19:11:17 2004 +0000
    14.3 @@ -0,0 +1,2000 @@
    14.4 +/*	$NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $	*/
    14.5 +/*	NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp 	*/
    14.6 +
    14.7 +/*-
    14.8 + * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
    14.9 + * All rights reserved.
   14.10 + *
   14.11 + * This code is derived from software contributed to The NetBSD Foundation
   14.12 + * by Charles M. Hannum.
   14.13 + *
   14.14 + * Redistribution and use in source and binary forms, with or without
   14.15 + * modification, are permitted provided that the following conditions
   14.16 + * are met:
   14.17 + * 1. Redistributions of source code must retain the above copyright
   14.18 + *    notice, this list of conditions and the following disclaimer.
   14.19 + * 2. Redistributions in binary form must reproduce the above copyright
   14.20 + *    notice, this list of conditions and the following disclaimer in the
   14.21 + *    documentation and/or other materials provided with the distribution.
   14.22 + * 3. All advertising materials mentioning features or use of this software
   14.23 + *    must display the following acknowledgement:
   14.24 + *        This product includes software developed by the NetBSD
   14.25 + *        Foundation, Inc. and its contributors.
   14.26 + * 4. Neither the name of The NetBSD Foundation nor the names of its
   14.27 + *    contributors may be used to endorse or promote products derived
   14.28 + *    from this software without specific prior written permission.
   14.29 + *
   14.30 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   14.31 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   14.32 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   14.33 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   14.34 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   14.35 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   14.36 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   14.37 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   14.38 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   14.39 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   14.40 + * POSSIBILITY OF SUCH DAMAGE.
   14.41 + */
   14.42 +
   14.43 +/*-
   14.44 + * Copyright (c) 1990 The Regents of the University of California.
   14.45 + * All rights reserved.
   14.46 + *
   14.47 + * This code is derived from software contributed to Berkeley by
   14.48 + * William Jolitz.
   14.49 + *
   14.50 + * Redistribution and use in source and binary forms, with or without
   14.51 + * modification, are permitted provided that the following conditions
   14.52 + * are met:
   14.53 + * 1. Redistributions of source code must retain the above copyright
   14.54 + *    notice, this list of conditions and the following disclaimer.
   14.55 + * 2. Redistributions in binary form must reproduce the above copyright
   14.56 + *    notice, this list of conditions and the following disclaimer in the
   14.57 + *    documentation and/or other materials provided with the distribution.
   14.58 + * 3. Neither the name of the University nor the names of its contributors
   14.59 + *    may be used to endorse or promote products derived from this software
   14.60 + *    without specific prior written permission.
   14.61 + *
   14.62 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   14.63 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   14.64 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   14.65 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   14.66 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   14.67 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   14.68 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   14.69 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   14.70 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   14.71 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   14.72 + * SUCH DAMAGE.
   14.73 + *
   14.74 + *	@(#)locore.s	7.3 (Berkeley) 5/13/91
   14.75 + */
   14.76 +
   14.77 +#include "opt_compat_netbsd.h"
   14.78 +#include "opt_compat_oldboot.h"
   14.79 +#include "opt_cputype.h"
   14.80 +#include "opt_ddb.h"
   14.81 +#include "opt_ipkdb.h"
   14.82 +#include "opt_lockdebug.h"
   14.83 +#include "opt_multiprocessor.h"
   14.84 +#include "opt_realmem.h"
   14.85 +#include "opt_user_ldt.h"
   14.86 +#include "opt_vm86.h"
   14.87 +#include "opt_xen.h"
   14.88 +
   14.89 +#include "npx.h"
   14.90 +#include "assym.h"
   14.91 +#include "apm.h"
   14.92 +#include "lapic.h"
   14.93 +#include "ioapic.h"
   14.94 +#include "ksyms.h"
   14.95 +
   14.96 +#include <sys/errno.h>
   14.97 +#include <sys/syscall.h>
   14.98 +
   14.99 +#include <machine/cputypes.h>
  14.100 +#include <machine/param.h>
  14.101 +#include <machine/pte.h>
  14.102 +#include <machine/segments.h>
  14.103 +#include <machine/specialreg.h>
  14.104 +#include <machine/trap.h>
  14.105 +#include <machine/bootinfo.h>
  14.106 +
  14.107 +#if NLAPIC > 0
  14.108 +#include <machine/i82489reg.h>
  14.109 +#endif
  14.110 +
  14.111 +/* LINTSTUB: include <sys/types.h> */
  14.112 +/* LINTSTUB: include <machine/cpu.h> */
  14.113 +/* LINTSTUB: include <sys/systm.h> */
  14.114 +
  14.115 +#include <machine/asm.h>
  14.116 +
  14.117 +#if defined(MULTIPROCESSOR)
  14.118 +	
  14.119 +#define SET_CURLWP(lwp,cpu)				\
  14.120 +	movl	CPUVAR(SELF),cpu		; 	\
  14.121 +	movl	lwp,CPUVAR(CURLWP)	;	\
  14.122 +	movl	cpu,L_CPU(lwp)
  14.123 +	
  14.124 +#else
  14.125 +
  14.126 +#define SET_CURLWP(lwp,tcpu)		movl	lwp,CPUVAR(CURLWP)
  14.127 +#define GET_CURLWP(reg)			movl	CPUVAR(CURLWP),reg
  14.128 +
  14.129 +#endif
  14.130 +
  14.131 +#define GET_CURPCB(reg)			movl	CPUVAR(CURPCB),reg	
  14.132 +#define SET_CURPCB(reg)			movl	reg,CPUVAR(CURPCB)
  14.133 +
  14.134 +#define CLEAR_RESCHED(reg)		movl	reg,CPUVAR(RESCHED)
  14.135 +
  14.136 +/* XXX temporary kluge; these should not be here */
  14.137 +/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
  14.138 +#include <dev/isa/isareg.h>
  14.139 +
  14.140 +
  14.141 +/* Disallow old names for REALBASEMEM */
  14.142 +#ifdef BIOSBASEMEM
  14.143 +#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect
  14.144 +#endif
  14.145 +
  14.146 +/* Disallow old names for REALEXTMEM */
  14.147 +#ifdef EXTMEM_SIZE
  14.148 +#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
  14.149 +#endif
  14.150 +#ifdef BIOSEXTMEM
  14.151 +#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
  14.152 +#endif
  14.153 +
  14.154 +#include <machine/frameasm.h>
  14.155 +
  14.156 +
  14.157 +#ifdef MULTIPROCESSOR
  14.158 +#include <machine/i82489reg.h>
  14.159 +#endif
  14.160 +	
  14.161 +/*
  14.162 + * PTmap is the recursive pagemap at the top of the virtual address space.
  14.163 + * Within PTmap, the page directory can be found (third indirection).
  14.164 + *
  14.165 + * XXX 4 == sizeof pde
  14.166 + */
  14.167 +	.set	_C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT)
  14.168 +	.set	_C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE)
  14.169 +	.set	_C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4)
  14.170 +
  14.171 +/*
  14.172 + * APTmap and APTD form the alternate recursive pagemap,
  14.173 + * used when modifying another process's page tables.
  14.174 + *
  14.175 + * XXX 4 == sizeof pde
  14.176 + */
  14.177 +	.set	_C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT)
  14.178 +	.set	_C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE)
  14.179 +	.set	_C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4)
  14.180 +
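
The recursive PTD slot that gets installed later in this file is what makes
these PTmap addresses work: because one page-directory entry points back at
the page directory itself, the PTE mapping any virtual address va becomes
readable at a fixed virtual location, PTmap indexed by va's virtual page
number. A hedged C sketch of that address arithmetic (the constants here are
placeholders; the real values come from machine/pte.h):

    #include <stdint.h>

    #define PAGE_SHIFT      12
    #define PDSHIFT         22              /* i386: one PDE maps 4MB */
    #define PDSLOT_PTE      0x3fd           /* placeholder recursive slot */

    /* Virtual address of the PTE that maps 'va'. */
    static inline uint32_t *
    va_to_pte(uint32_t va)
    {
            uint32_t ptmap = (uint32_t)PDSLOT_PTE << PDSHIFT;   /* PTmap */

            return (uint32_t *)(ptmap +
                (va >> PAGE_SHIFT) * sizeof(uint32_t));
    }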
  14.181 +
  14.182 +/*
  14.183 + * Xen guest identifier and loader selection
  14.184 + */
  14.185 +.section __xen_guest
  14.186 +	.asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic"
  14.187 +
  14.188 +
  14.189 +/*
  14.190 + * Initialization
  14.191 + */
  14.192 +	.data
  14.193 +
  14.194 +	.globl	_C_LABEL(cpu)
  14.195 +	.globl	_C_LABEL(esym),_C_LABEL(boothowto)
  14.196 +	.globl	_C_LABEL(bootinfo),_C_LABEL(atdevbase)
  14.197 +#ifdef COMPAT_OLDBOOT
  14.198 +	.globl	_C_LABEL(bootdev)
  14.199 +#endif
  14.200 +	.globl	_C_LABEL(proc0paddr),_C_LABEL(PTDpaddr)
  14.201 +	.globl	_C_LABEL(biosbasemem),_C_LABEL(biosextmem)
  14.202 +	.globl	_C_LABEL(gdt)
  14.203 +#ifdef I586_CPU
  14.204 +	.globl	_C_LABEL(idt)
  14.205 +#endif
  14.206 +	.globl	_C_LABEL(lapic_tpr)	
  14.207 +	
  14.208 +#if NLAPIC > 0
  14.209 +#ifdef __ELF__
  14.210 +	.align	PAGE_SIZE
  14.211 +#else
  14.212 +	.align	12
  14.213 +#endif
  14.214 +	.globl _C_LABEL(local_apic), _C_LABEL(lapic_id)
  14.215 +_C_LABEL(local_apic):
  14.216 +	.space	LAPIC_ID
  14.217 +_C_LABEL(lapic_id):	
  14.218 +	.long	0x00000000
  14.219 +	.space  LAPIC_TPRI-(LAPIC_ID+4)
  14.220 +_C_LABEL(lapic_tpr):		
  14.221 +	.space  LAPIC_PPRI-LAPIC_TPRI
  14.222 +_C_LABEL(lapic_ppr):		
  14.223 +	.space	LAPIC_ISR-LAPIC_PPRI
  14.224 +_C_LABEL(lapic_isr):
  14.225 +	.space	PAGE_SIZE-LAPIC_ISR
  14.226 +#else
  14.227 +_C_LABEL(lapic_tpr):	
  14.228 +	.long 0
  14.229 +#endif
  14.230 +	
  14.231 +
  14.232 +_C_LABEL(cpu):		.long	0	# are we 386, 386sx, or 486,
  14.233 +					#   or Pentium, or..
  14.234 +_C_LABEL(esym):		.long	0	# ptr to end of syms
  14.235 +_C_LABEL(atdevbase):	.long	0	# location of start of iomem in virtual
  14.236 +_C_LABEL(proc0paddr):	.long	0
  14.237 +_C_LABEL(PTDpaddr):	.long	0	# paddr of PTD, for libkvm
  14.238 +#ifndef REALBASEMEM
  14.239 +_C_LABEL(biosbasemem):	.long	0	# base memory reported by BIOS
  14.240 +#else
  14.241 +_C_LABEL(biosbasemem):	.long	REALBASEMEM
  14.242 +#endif
  14.243 +#ifndef REALEXTMEM
  14.244 +_C_LABEL(biosextmem):	.long	0	# extended memory reported by BIOS
  14.245 +#else
  14.246 +_C_LABEL(biosextmem):	.long	REALEXTMEM
  14.247 +#endif
  14.248 +
  14.249 +#include <machine/xen.h>
  14.250 +#define __HYPERVISOR_yield		   8
  14.251 +
  14.252 +	.space 512
  14.253 +tmpstk:
  14.254 +	.long tmpstk, __KERNEL_DS
  14.255 +
  14.256 +
  14.257 +#define	_RELOC(x)	((x))
  14.258 +#define	RELOC(x)	_RELOC(_C_LABEL(x))
  14.259 +
  14.260 +/* XXX assym.h */
  14.261 +#define MOD_START   48
  14.262 +#define MOD_LEN     56
  14.263 +/* XXX assym.h */
  14.264 +
  14.265 +	.text
  14.266 +	.globl	_C_LABEL(kernel_text)
  14.267 +	.set	_C_LABEL(kernel_text),KERNTEXTOFF
  14.268 +
  14.269 +	.globl	start
  14.270 +start:
  14.271 +	cld
  14.272 +
  14.273 +	lss	tmpstk,%esp		# bootstrap stack end location
  14.274 +
  14.275 +	movl	%esi,%ebx		# save start_info pointer
  14.276 +
  14.277 +#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE)
  14.278 +	/* Save the symbol locations. */
  14.279 +	movl	MOD_START(%ebx),%esi
  14.280 +	addl	MOD_LEN(%ebx),%esi
  14.281 +	movl	%esi,RELOC(esym)
  14.282 +#endif
  14.283 +
  14.284 +        /* Clear BSS first so that there are no surprises... */
  14.285 +	xorl	%eax,%eax
  14.286 +	movl	$RELOC(__bss_start),%edi
  14.287 +	movl	$RELOC(_end),%ecx
  14.288 +	subl	%edi,%ecx
  14.289 +	rep stosb
  14.290 +
  14.291 +	movl	%ebx,RELOC(avail_start)
  14.292 +
  14.293 +	/* Copy the necessary stuff from start_info structure. */
  14.294 +        /* We need to copy shared_info early, so that sti/cli work */
  14.295 +	movl	%ebx,%esi
  14.296 +	movl	$RELOC(start_info_union),%edi
  14.297 +	movl	$128,%ecx
  14.298 +	rep movsl
  14.299 +
  14.300 +    	/* (howto, [bootdev], bootinfo, basemem, extmem). */
  14.301 +	xorl	%eax,%eax
  14.302 +	movl	%eax,RELOC(boothowto)
  14.303 +#ifdef COMPAT_OLDBOOT
  14.304 +	movl	%eax,RELOC(bootdev)
  14.305 +#endif
  14.306 +	movl	$0x20000,%eax
  14.307 +	movl	%eax,RELOC(boothowto)
  14.308 +
  14.309 +	/* First, reset the PSL. */
  14.310 +	pushl	$PSL_MBO
  14.311 +	popfl
  14.312 +
  14.313 +	/* Clear segment registers; always null in proc0. */
  14.314 +	xorl	%eax,%eax
  14.315 +	movw	%ax,%fs
  14.316 +	movw	%ax,%gs
  14.317 +	decl	%eax
  14.318 +	movl	%eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
  14.319 +
  14.320 +	xorl	%eax,%eax
  14.321 +	cpuid
  14.322 +	movl	%eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
  14.323 +
  14.324 +/*
  14.325 + * Virtual address space of kernel:
  14.326 + *
  14.327 + * text | data | bss | [syms] | page dir | proc0 kstack 
  14.328 + *			      0          1       2      3
  14.329 + */
  14.330 +#define	PROC0PDIR	((0)              * PAGE_SIZE)
  14.331 +#define	PROC0STACK	((1)              * PAGE_SIZE)
  14.332 +#define	SYSMAP		((1+UPAGES)       * PAGE_SIZE)
  14.333 +#define	TABLESIZE	((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */
  14.334 +
  14.335 +	/* Find end of kernel image. */
  14.336 +	movl	RELOC(avail_start),%edi
  14.337 +	/* Calculate where to start the bootstrap tables. */
  14.338 +	movl	%edi,%esi
  14.339 +
  14.340 +	/*
  14.341 +	 * Calculate the size of the kernel page table directory, and
  14.342 +	 * how many entries it will have.
  14.343 +	 */
  14.344 +	movl	RELOC(nkpde),%ecx		# get nkpde
  14.345 +	cmpl	$NKPTP_MIN,%ecx			# larger than min?
  14.346 +	jge	1f
  14.347 +	movl	$NKPTP_MIN,%ecx			# set at min
  14.348 +	jmp	2f
  14.349 +1:	cmpl	$NKPTP_MAX,%ecx			# larger than max?
  14.350 +	jle	2f
  14.351 +	movl	$NKPTP_MAX,%ecx
  14.352 +2:
  14.353 +
  14.354 +	/* Clear memory for bootstrap tables. */
  14.355 +	shll	$PGSHIFT,%ecx
  14.356 +	addl	$TABLESIZE,%ecx
  14.357 +	addl	%esi,%ecx			# end of tables
  14.358 +	movl	%ecx,RELOC(gdt)
  14.359 +	addl	$PAGE_SIZE,%ecx
  14.360 +	movl	%ecx,RELOC(avail_start)
  14.361 +	subl	%edi,%ecx			# size of tables
  14.362 +	shrl	$2,%ecx
  14.363 +	xorl	%eax,%eax
  14.364 +	cld
  14.365 +	rep
  14.366 +	stosl
  14.367 +
  14.368 +/*
  14.369 + * fillkpt
  14.370 + *	eax = pte (page frame | control | status)
  14.371 + *	ebx = page table address
  14.372 + *	ecx = number of pages to map
  14.373 + */
  14.374 +#define	fillkpt		\
  14.375 +1:	movl	%eax,(%ebx)	; \
  14.376 +	addl	$PAGE_SIZE,%eax	; /* increment physical address */ \
  14.377 +	addl	$4,%ebx		; /* next pte */ \
  14.378 +	loop	1b		;
  14.379 +
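
Stated in C, the fillkpt macro above is a straight PTE fill loop: %eax
carries the next PTE value (page frame ORed with the control/status bits),
%ebx the address of the next PTE, and %ecx the remaining count, with %eax
stepping by PAGE_SIZE per entry. A sketch of the equivalent loop
(illustrative only, not part of the patch):

    #include <stdint.h>

    #define PAGE_SIZE 4096

    /* Fill 'n' consecutive PTEs starting at 'pte', mapping physical
     * pages from 'pa' upward with the given control bits. */
    static void
    fillkpt_c(uint32_t *pte, uint32_t pa, uint32_t bits, int n)
    {
            while (n-- > 0) {
                    *pte++ = pa | bits;     /* page frame | control | status */
                    pa += PAGE_SIZE;        /* next physical page */
            }
    }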
  14.380 +/*
  14.381 + * Build initial page tables.
  14.382 + */
  14.383 +	/* Calculate end of text segment, rounded to a page. */
  14.384 +	leal	(RELOC(etext)+PGOFSET),%edx
  14.385 +	andl	$~PGOFSET,%edx
  14.386 +	
  14.387 +	/* Skip over the first 1MB. */
  14.388 +	movl	$KERNTEXTOFF,%eax
  14.389 +	movl	%eax,%ecx
  14.390 +	subl	$KERNBASE_LOCORE,%ecx
  14.391 +	shrl	$PGSHIFT,%ecx
  14.392 +	leal	(SYSMAP)(%esi,%ecx,4),%ebx
  14.393 +
  14.394 +	/* Map the kernel text read-only. */
  14.395 +	movl	%edx,%ecx
  14.396 +	subl	%eax,%ecx
  14.397 +	shrl	$PGSHIFT,%ecx
  14.398 +	orl	$(PG_V|PG_KR),%eax
  14.399 +	fillkpt
  14.400 +
  14.401 +	/* Map the data, BSS, and bootstrap tables read-write. */
  14.402 +	movl	RELOC(avail_start),%ecx
  14.403 +						    # end of tables
  14.404 +	subl	%edx,%ecx				# subtract end of text
  14.405 +	shrl	$PGSHIFT,%ecx
  14.406 +	leal	(PG_V|PG_KW)(%edx),%eax
  14.407 +	fillkpt
  14.408 +
  14.409 +	movl	$0xffffffff,(%ebx)
  14.410 +	addl	$4,%ebx
  14.411 +
  14.412 +/*
  14.413 + * Construct a page table directory.
  14.414 + */
  14.415 +	/* Map kernel PDEs. */
  14.416 +	movl	RELOC(nkpde),%ecx			# for this many pde s,
  14.417 +	leal	(PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx	# kernel pde offset
  14.418 +	leal	(SYSMAP+PG_V|PG_KW)(%esi),%eax		# pte for KPT in proc 0,
  14.419 +	fillkpt
  14.420 +
  14.421 +	/* Install a PDE recursively mapping page directory as a page table! */
  14.422 +	leal	(PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax	# pte for ptd
  14.423 +	movl	%eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi)	# recursive PD slot
  14.424 +
  14.425 +	/* Save phys. addr of PTD, for libkvm. */
  14.426 +	movl	%esi,RELOC(PTDpaddr)
  14.427 +
  14.428 +    	call	xpmap_init
  14.429 +
  14.430 +	/* cr0 is 0x8005003b */
  14.431 +
  14.432 +	/* Relocate atdevbase. */
  14.433 +	movl	_C_LABEL(avail_start),%edx
  14.434 +	movl	%edx,_C_LABEL(HYPERVISOR_shared_info)
  14.435 +	addl	$PAGE_SIZE,%edx			# shared_inf
  14.436 +	movl	%edx,_C_LABEL(atdevbase)
  14.437 +
  14.438 +	/* Set up bootstrap stack. */
  14.439 +	leal	(PROC0STACK)(%esi),%eax
  14.440 +	movl	%eax,_C_LABEL(proc0paddr)
  14.441 +	leal	(USPACE-FRAMESIZE)(%eax),%esp
  14.442 +	subl	$KERNBASE_LOCORE,%esi
  14.443 +	movl	%esi,PCB_CR3(%eax)	# pcb->pcb_cr3
  14.444 +	xorl	%ebp,%ebp               # mark end of frames
  14.445 +
  14.446 +	movl	_C_LABEL(atdevbase),%eax
  14.447 +	pushl	%eax
  14.448 +	call	_C_LABEL(init386)	# wire 386 chip for unix operation
  14.449 +	addl	$4,%esp
  14.450 +
  14.451 +#ifdef SAFARI_FIFO_HACK
  14.452 +	movb	$5,%al
  14.453 +	movw	$0x37b,%dx
  14.454 +	outb	%al,%dx
  14.455 +	movw	$0x37f,%dx
  14.456 +	inb	%dx,%al
  14.457 +	movb	%al,%cl
  14.458 +
  14.459 +	orb	$1,%cl
  14.460 +
  14.461 +	movb	$5,%al
  14.462 +	movw	$0x37b,%dx
  14.463 +	outb	%al,%dx
  14.464 +	movw	$0x37f,%dx
  14.465 +	movb	%cl,%al
  14.466 +	outb	%al,%dx
  14.467 +#endif /* SAFARI_FIFO_HACK */
  14.468 +
  14.469 +	call 	_C_LABEL(main)
  14.470 +
  14.471 +/*
  14.472 + * void proc_trampoline(void);
  14.473 + * This is a trampoline function pushed onto the stack of a newly created
  14.474 + * process in order to do some additional setup.  The trampoline is entered by
  14.475 + * cpu_switch()ing to the process, so we abuse the callee-saved registers used
  14.476 + * by cpu_switch() to store the information about the stub to call.
  14.477 + * NOTE: This function does not have a normal calling sequence!
  14.478 + */
  14.479 +/* LINTSTUB: Func: void proc_trampoline(void) */
  14.480 +NENTRY(proc_trampoline)
  14.481 +#ifdef MULTIPROCESSOR
  14.482 +	call	_C_LABEL(proc_trampoline_mp)
  14.483 +#endif
  14.484 +	movl	$IPL_NONE,CPUVAR(ILEVEL)
  14.485 +	pushl	%ebx
  14.486 +	call	*%esi
  14.487 +	addl	$4,%esp
  14.488 +	DO_DEFERRED_SWITCH(%eax)
  14.489 +	INTRFASTEXIT
  14.490 +	/* NOTREACHED */
  14.491 +
  14.492 +/*****************************************************************************/
  14.493 +#ifdef COMPAT_16
  14.494 +/*
  14.495 + * Signal trampoline; copied to top of user stack.
  14.496 + */
  14.497 +/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */
  14.498 +NENTRY(sigcode)
  14.499 +	/*
  14.500 +	 * Handler has returned here as if we called it.  The sigcontext
  14.501 +	 * is on the stack after the 3 args "we" pushed.
  14.502 +	 */
  14.503 +	leal	12(%esp),%eax		# get pointer to sigcontext
  14.504 +	movl	%eax,4(%esp)		# put it in the argument slot
  14.505 +					# fake return address already there
  14.506 +	movl	$SYS_compat_16___sigreturn14,%eax
  14.507 +	int	$0x80	 		# enter kernel with args on stack
  14.508 +	movl	$SYS_exit,%eax
  14.509 +	int	$0x80			# exit if sigreturn fails
  14.510 +	.globl	_C_LABEL(esigcode)
  14.511 +_C_LABEL(esigcode):
  14.512 +#endif
  14.513 +
  14.514 +/*****************************************************************************/
  14.515 +
  14.516 +/*
  14.517 + * The following primitives are used to fill and copy regions of memory.
  14.518 + */
  14.519 +
  14.520 +/*
  14.521 + * XXX No section 9 man page for fillw.
  14.522 + * fillw seems to be very sparsely used (only in pccons it seems.)
  14.523 + * One wonders if it couldn't be done without.
  14.524 + * -- Perry Metzger, May 7, 2001
  14.525 + */
  14.526 +/*
  14.527 + * void fillw(short pattern, void *addr, size_t len);
  14.528 + * Write len copies of pattern at addr.
  14.529 + */
  14.530 +/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */
  14.531 +ENTRY(fillw)
  14.532 +	pushl	%edi
  14.533 +	movl	8(%esp),%eax
  14.534 +	movl	12(%esp),%edi
  14.535 +	movw	%ax,%cx
  14.536 +	rorl	$16,%eax
  14.537 +	movw	%cx,%ax
  14.538 +	cld
  14.539 +	movl	16(%esp),%ecx
  14.540 +	shrl	%ecx			# do longwords
  14.541 +	rep
  14.542 +	stosl
  14.543 +	movl	16(%esp),%ecx
  14.544 +	andl	$1,%ecx			# do remainder
  14.545 +	rep
  14.546 +	stosw
  14.547 +	popl	%edi
  14.548 +	ret
  14.549 +
  14.550 +/*
  14.551 + * int kcopy(const void *from, void *to, size_t len);
  14.552 + * Copy len bytes, abort on fault.
  14.553 + */
  14.554 +/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */
  14.555 +ENTRY(kcopy)
  14.556 +	pushl	%esi
  14.557 +	pushl	%edi
  14.558 +	GET_CURPCB(%eax)		# load curpcb into eax and set on-fault
  14.559 +	pushl	PCB_ONFAULT(%eax)
  14.560 +	movl	$_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax)
  14.561 +
  14.562 +	movl	16(%esp),%esi
  14.563 +	movl	20(%esp),%edi
  14.564 +	movl	24(%esp),%ecx
  14.565 +	movl	%edi,%eax
  14.566 +	subl	%esi,%eax
  14.567 +	cmpl	%ecx,%eax		# overlapping?
  14.568 +	jb	1f
  14.569 +	cld				# nope, copy forward
  14.570 +	shrl	$2,%ecx			# copy by 32-bit words
  14.571 +	rep
  14.572 +	movsl
  14.573 +	movl	24(%esp),%ecx
  14.574 +	andl	$3,%ecx			# any bytes left?
  14.575 +	rep
  14.576 +	movsb
  14.577 +
  14.578 +	GET_CURPCB(%edx)		# XXX save curpcb?
  14.579 +	popl	PCB_ONFAULT(%edx)
  14.580 +	popl	%edi
  14.581 +	popl	%esi
  14.582 +	xorl	%eax,%eax
  14.583 +	ret
  14.584 +
  14.585 +	ALIGN_TEXT
  14.586 +1:	addl	%ecx,%edi		# copy backward
  14.587 +	addl	%ecx,%esi
  14.588 +	std
  14.589 +	andl	$3,%ecx			# any fractional bytes?
  14.590 +	decl	%edi
  14.591 +	decl	%esi
  14.592 +	rep
  14.593 +	movsb
  14.594 +	movl	24(%esp),%ecx		# copy remainder by 32-bit words
  14.595 +	shrl	$2,%ecx
  14.596 +	subl	$3,%esi
  14.597 +	subl	$3,%edi
  14.598 +	rep
  14.599 +	movsl
  14.600 +	cld
  14.601 +
  14.602 +	GET_CURPCB(%edx)
  14.603 +	popl	PCB_ONFAULT(%edx)
  14.604 +	popl	%edi
  14.605 +	popl	%esi
  14.606 +	xorl	%eax,%eax
  14.607 +	ret
  14.608 +
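
The forward/backward decision in kcopy above comes down to one unsigned
compare: dst - src, taken as an unsigned value, is below len exactly when
dst lies within the source run, and only then is a backward copy needed. A
small C sketch of the same test (essentially what memmove does; the function
name is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    static void
    copy_overlap_safe(void *dst, const void *src, size_t len)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;

            if ((uintptr_t)d - (uintptr_t)s < len) {
                    /* dst overlaps the tail of src: copy backward */
                    d += len;
                    s += len;
                    while (len-- > 0)
                            *--d = *--s;
            } else {
                    /* disjoint, or dst below src: forward copy is safe */
                    while (len-- > 0)
                            *d++ = *s++;
            }
    }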
  14.609 +/*****************************************************************************/
  14.610 +
  14.611 +/*
  14.612 + * The following primitives are used to copy data in and out of the user's
  14.613 + * address space.
  14.614 + */
  14.615 +
  14.616 +/*
  14.617 + * Default to the lowest-common-denominator.  We will improve it
  14.618 + * later.
  14.619 + */
  14.620 +#if defined(I386_CPU)
  14.621 +#define	DEFAULT_COPYOUT		_C_LABEL(i386_copyout)
  14.622 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
  14.623 +#elif defined(I486_CPU)
  14.624 +#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)
  14.625 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
  14.626 +#elif defined(I586_CPU)
  14.627 +#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
  14.628 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
  14.629 +#elif defined(I686_CPU)
  14.630 +#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
  14.631 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
  14.632 +#endif
  14.633 +
  14.634 +	.data
  14.635 +
  14.636 +	.globl	_C_LABEL(copyout_func)
  14.637 +_C_LABEL(copyout_func):
  14.638 +	.long	DEFAULT_COPYOUT
  14.639 +
  14.640 +	.globl	_C_LABEL(copyin_func)
  14.641 +_C_LABEL(copyin_func):
  14.642 +	.long	DEFAULT_COPYIN
  14.643 +
  14.644 +	.text
  14.645 +
  14.646 +/*
  14.647 + * int copyout(const void *from, void *to, size_t len);
  14.648 + * Copy len bytes into the user's address space.
  14.649 + * see copyout(9)
  14.650 + */
  14.651 +/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */
  14.652 +ENTRY(copyout)
  14.653 +	DO_DEFERRED_SWITCH(%eax)
  14.654 +	jmp	*_C_LABEL(copyout_func)
  14.655 +
  14.656 +#if defined(I386_CPU)
  14.657 +/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */
  14.658 +ENTRY(i386_copyout)
  14.659 +	pushl	%esi
  14.660 +	pushl	%edi
  14.661 +	pushl	$0
  14.662 +	
  14.663 +	movl	16(%esp),%esi
  14.664 +	movl	20(%esp),%edi
  14.665 +	movl	24(%esp),%eax
  14.666 +
  14.667 +	/*
  14.668 +	 * We check that the end of the destination buffer is not past the end
  14.669 +	 * of the user's address space.  If it's not, then we only need to
  14.670 +	 * check that each page is writable.  The 486 will do this for us; the
  14.671 +	 * 386 will not.  (We assume that pages in user space that are not
  14.672 +	 * writable by the user are not writable by the kernel either.)
  14.673 +	 */
  14.674 +	movl	%edi,%edx
  14.675 +	addl	%eax,%edx
  14.676 +	jc	_C_LABEL(copy_efault)
  14.677 +	cmpl	$VM_MAXUSER_ADDRESS,%edx
  14.678 +	ja	_C_LABEL(copy_efault)
  14.679 +
  14.680 +	testl	%eax,%eax		# anything to do?
  14.681 +	jz	3f
  14.682 +
  14.683 +	/*
  14.684 +	 * We have to check each PTE for (write) permission, since the CPU
  14.685 +	 * doesn't do it for us.
  14.686 +	 */
  14.687 +
  14.688 +	/* Compute number of pages. */
  14.689 +	movl	%edi,%ecx
  14.690 +	andl	$PGOFSET,%ecx
  14.691 +	addl	%eax,%ecx
  14.692 +	decl	%ecx
  14.693 +	shrl	$PGSHIFT,%ecx
  14.694 +
  14.695 +	/* Compute PTE offset for start address. */
  14.696 +	shrl	$PGSHIFT,%edi
  14.697 +
  14.698 +	GET_CURPCB(%edx)
  14.699 +	movl	$2f,PCB_ONFAULT(%edx)
  14.700 +
  14.701 +1:	/* Check PTE for each page. */
  14.702 +	testb	$PG_RW,_C_LABEL(PTmap)(,%edi,4)
  14.703 +	jz	2f
  14.704 +	
  14.705 +4:	incl	%edi
  14.706 +	decl	%ecx
  14.707 +	jns	1b
  14.708 +
  14.709 +	movl	20(%esp),%edi
  14.710 +	movl	24(%esp),%eax
  14.711 +	jmp	3f
  14.712 +	
  14.713 +2:	/* Simulate a trap. */
  14.714 +	pushl	%ecx
  14.715 +	movl	%edi,%eax
  14.716 +	shll	$PGSHIFT,%eax
  14.717 +	pushl	%eax
  14.718 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
  14.719 +	addl	$4,%esp			# pop argument
  14.720 +	popl	%ecx
  14.721 +	testl	%eax,%eax		# if not ok, return EFAULT
  14.722 +	jz	4b
  14.723 +	jmp	_C_LABEL(copy_efault)
  14.724 +
  14.725 +3:	GET_CURPCB(%edx)
  14.726 +	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
  14.727 +
  14.728 +	/* bcopy(%esi, %edi, %eax); */
  14.729 +	cld
  14.730 +	movl	%eax,%ecx
  14.731 +	shrl	$2,%ecx
  14.732 +	rep
  14.733 +	movsl
  14.734 +	movl	%eax,%ecx
  14.735 +	andl	$3,%ecx
  14.736 +	rep
  14.737 +	movsb
  14.738 +
  14.739 +	popl	PCB_ONFAULT(%edx)
  14.740 +	popl	%edi
  14.741 +	popl	%esi
  14.742 +	xorl	%eax,%eax
  14.743 +	ret
  14.744 +#endif /* I386_CPU */
  14.745 +
  14.746 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  14.747 +/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */
  14.748 +ENTRY(i486_copyout)
  14.749 +	pushl	%esi
  14.750 +	pushl	%edi
  14.751 +	pushl	$0
  14.752 +	
  14.753 +	movl	16(%esp),%esi
  14.754 +	movl	20(%esp),%edi
  14.755 +	movl	24(%esp),%eax
  14.756 +
  14.757 +	/*
  14.758 +	 * We check that the end of the destination buffer is not past the end
  14.759 +	 * of the user's address space.
  14.760 +	 */
  14.761 +	movl	%edi,%edx
  14.762 +	addl	%eax,%edx
  14.763 +	jc	_C_LABEL(copy_efault)
  14.764 +	cmpl	$VM_MAXUSER_ADDRESS,%edx
  14.765 +	ja	_C_LABEL(copy_efault)
  14.766 +
  14.767 +	GET_CURPCB(%edx)
  14.768 +	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
  14.769 +
  14.770 +	/* bcopy(%esi, %edi, %eax); */
  14.771 +	cld
  14.772 +	movl	%eax,%ecx
  14.773 +	shrl	$2,%ecx
  14.774 +	rep
  14.775 +	movsl
  14.776 +	movl	%eax,%ecx
  14.777 +	andl	$3,%ecx
  14.778 +	rep
  14.779 +	movsb
  14.780 +
  14.781 +	popl	PCB_ONFAULT(%edx)
  14.782 +	popl	%edi
  14.783 +	popl	%esi
  14.784 +	xorl	%eax,%eax
  14.785 +	ret
  14.786 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  14.787 +
  14.788 +/*
  14.789 + * int copyin(const void *from, void *to, size_t len);
  14.790 + * Copy len bytes from the user's address space.
  14.791 + * see copyin(9)
  14.792 + */
  14.793 +/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */
  14.794 +ENTRY(copyin)
  14.795 +	DO_DEFERRED_SWITCH(%eax)
  14.796 +	jmp	*_C_LABEL(copyin_func)
  14.797 +
  14.798 +#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \
  14.799 +    defined(I686_CPU)
  14.800 +/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */
  14.801 +ENTRY(i386_copyin)
  14.802 +	pushl	%esi
  14.803 +	pushl	%edi
  14.804 +	GET_CURPCB(%eax)
  14.805 +	pushl	$0
  14.806 +	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%eax)
  14.807 +	
  14.808 +	movl	16(%esp),%esi
  14.809 +	movl	20(%esp),%edi
  14.810 +	movl	24(%esp),%eax
  14.811 +
  14.812 +	/*
  14.813 +	 * We check that the end of the destination buffer is not past the end
  14.814 +	 * of the user's address space.  If it's not, then we only need to
  14.815 +	 * check that each page is readable, and the CPU will do that for us.
  14.816 +	 */
  14.817 +	movl	%esi,%edx
  14.818 +	addl	%eax,%edx
  14.819 +	jc	_C_LABEL(copy_efault)
  14.820 +	cmpl	$VM_MAXUSER_ADDRESS,%edx
  14.821 +	ja	_C_LABEL(copy_efault)
  14.822 +
  14.823 +	/* bcopy(%esi, %edi, %eax); */
  14.824 +	cld
  14.825 +	movl	%eax,%ecx
  14.826 +	shrl	$2,%ecx
  14.827 +	rep
  14.828 +	movsl
  14.829 +	movl	%eax,%ecx
  14.830 +	andl	$3,%ecx
  14.831 +	rep
  14.832 +	movsb
  14.833 +
  14.834 +	GET_CURPCB(%edx)
  14.835 +	popl	PCB_ONFAULT(%edx)
  14.836 +	popl	%edi
  14.837 +	popl	%esi
  14.838 +	xorl	%eax,%eax
  14.839 +	ret
  14.840 +#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
  14.841 +
  14.842 +/* LINTSTUB: Ignore */
  14.843 +NENTRY(copy_efault)
  14.844 +	movl	$EFAULT,%eax
  14.845 +
  14.846 +/*
  14.847 + * kcopy_fault is used by kcopy and copy_fault is used by copyin/out.
  14.848 + *
  14.849 + * They're distinguished for lazy pmap switching.  See trap().
  14.850 + */
  14.851 +/* LINTSTUB: Ignore */
  14.852 +NENTRY(kcopy_fault)
  14.853 +	GET_CURPCB(%edx)
  14.854 +	popl	PCB_ONFAULT(%edx)
  14.855 +	popl	%edi
  14.856 +	popl	%esi
  14.857 +	ret
  14.858 +
  14.859 +/* LINTSTUB: Ignore */
  14.860 +NENTRY(copy_fault)
  14.861 +	GET_CURPCB(%edx)
  14.862 +	popl	PCB_ONFAULT(%edx)
  14.863 +	popl	%edi
  14.864 +	popl	%esi
  14.865 +	ret
  14.866 +
  14.867 +/*
  14.868 + * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
  14.869 + * Copy a NUL-terminated string, at most maxlen characters long, into the
  14.870 + * user's address space.  Return the number of characters copied (including the
  14.871 + * NUL) in *lencopied.  If the string is too long, return ENAMETOOLONG; else
  14.872 + * return 0 or EFAULT.
  14.873 + * see copyoutstr(9)
  14.874 + */
  14.875 +/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */
  14.876 +ENTRY(copyoutstr)
  14.877 +	pushl	%esi
  14.878 +	pushl	%edi
  14.879 +
  14.880 +	DO_DEFERRED_SWITCH(%eax)
  14.881 +
  14.882 +	movl	12(%esp),%esi		# esi = from
  14.883 +	movl	16(%esp),%edi		# edi = to
  14.884 +	movl	20(%esp),%edx		# edx = maxlen
  14.885 +
  14.886 +#if defined(I386_CPU)
  14.887 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  14.888 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
  14.889 +	jne	5f
  14.890 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  14.891 +
  14.892 +	/* Compute number of bytes in first page. */
  14.893 +	movl	%edi,%eax
  14.894 +	andl	$PGOFSET,%eax
  14.895 +	movl	$PAGE_SIZE,%ecx
  14.896 +	subl	%eax,%ecx		# ecx = PAGE_SIZE - (src % PAGE_SIZE)
  14.897 +
  14.898 +	GET_CURPCB(%eax)
  14.899 +	movl	$6f,PCB_ONFAULT(%eax)
  14.900 +
  14.901 +1:	/*
  14.902 +	 * Once per page, check that we are still within the bounds of user
  14.903 +	 * space, and check for a write fault.
  14.904 +	 */
  14.905 +	cmpl	$VM_MAXUSER_ADDRESS,%edi
  14.906 +	jae	_C_LABEL(copystr_efault)
  14.907 +
  14.908 +	/* Compute PTE offset. */
  14.909 +	movl	%edi,%eax
  14.910 +	shrl	$PGSHIFT,%eax		# calculate pte address
  14.911 +
  14.912 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
  14.913 +	jnz	2f
  14.914 +
  14.915 +6:	/* Simulate a trap. */
  14.916 +	pushl	%edx
  14.917 +	pushl	%edi
  14.918 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
  14.919 +	addl	$4,%esp			# clear argument from stack
  14.920 +	popl	%edx
  14.921 +	testl	%eax,%eax
  14.922 +	jnz	_C_LABEL(copystr_efault)
  14.923 +
  14.924 +2:	/* Copy up to end of this page. */
  14.925 +	subl	%ecx,%edx		# predecrement total count
  14.926 +	jnc	3f
  14.927 +	addl	%edx,%ecx		# ecx += (edx - ecx) = edx
  14.928 +	xorl	%edx,%edx
  14.929 +
  14.930 +3:	decl	%ecx
  14.931 +	js	4f
  14.932 +	lodsb
  14.933 +	stosb
  14.934 +	testb	%al,%al
  14.935 +	jnz	3b
  14.936 +
  14.937 +	/* Success -- 0 byte reached. */
  14.938 +	addl	%ecx,%edx		# add back residual for this page
  14.939 +	xorl	%eax,%eax
  14.940 +	jmp	copystr_return
  14.941 +
  14.942 +4:	/* Go to next page, if any. */
  14.943 +	movl	$PAGE_SIZE,%ecx
  14.944 +	testl	%edx,%edx
  14.945 +	jnz	1b
  14.946 +
  14.947 +	/* edx is zero -- return ENAMETOOLONG. */
  14.948 +	movl	$ENAMETOOLONG,%eax
  14.949 +	jmp	copystr_return
  14.950 +#endif /* I386_CPU */
  14.951 +
  14.952 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  14.953 +5:	GET_CURPCB(%eax)
  14.954 +	movl	$_C_LABEL(copystr_fault),PCB_ONFAULT(%eax)
  14.955 +	/*
  14.956 +	 * Get min(%edx, VM_MAXUSER_ADDRESS-%edi).
  14.957 +	 */
  14.958 +	movl	$VM_MAXUSER_ADDRESS,%eax
  14.959 +	subl	%edi,%eax
  14.960 +	cmpl	%edx,%eax
  14.961 +	jae	1f
  14.962 +	movl	%eax,%edx
  14.963 +	movl	%eax,20(%esp)
  14.964 +
  14.965 +1:	incl	%edx
  14.966 +	cld
  14.967 +
  14.968 +1:	decl	%edx
  14.969 +	jz	2f
  14.970 +	lodsb
  14.971 +	stosb
  14.972 +	testb	%al,%al
  14.973 +	jnz	1b
  14.974 +
  14.975 +	/* Success -- 0 byte reached. */
  14.976 +	decl	%edx
  14.977 +	xorl	%eax,%eax
  14.978 +	jmp	copystr_return
  14.979 +
  14.980 +2:	/* edx is zero -- return EFAULT or ENAMETOOLONG. */
  14.981 +	cmpl	$VM_MAXUSER_ADDRESS,%edi
  14.982 +	jae	_C_LABEL(copystr_efault)
  14.983 +	movl	$ENAMETOOLONG,%eax
  14.984 +	jmp	copystr_return
  14.985 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  14.986 +
  14.987 +/*
  14.988 + * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
  14.989 + * Copy a NUL-terminated string, at most maxlen characters long, from the
  14.990 + * user's address space.  Return the number of characters copied (including the
  14.991 + * NUL) in *lencopied.  If the string is too long, return ENAMETOOLONG; else
  14.992 + * return 0 or EFAULT.
  14.993 + * see copyinstr(9)
  14.994 + */
  14.995 +/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */
  14.996 +ENTRY(copyinstr)
  14.997 +	pushl	%esi
  14.998 +	pushl	%edi
  14.999 +
 14.1000 +	DO_DEFERRED_SWITCH(%eax)
 14.1001 +
 14.1002 +	GET_CURPCB(%ecx)
 14.1003 +	movl	$_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx)
 14.1004 +
 14.1005 +	movl	12(%esp),%esi		# %esi = from
 14.1006 +	movl	16(%esp),%edi		# %edi = to
 14.1007 +	movl	20(%esp),%edx		# %edx = maxlen
 14.1008 +
 14.1009 +	/*
 14.1010 +	 * Get min(%edx, VM_MAXUSER_ADDRESS-%esi).
 14.1011 +	 */
 14.1012 +	movl	$VM_MAXUSER_ADDRESS,%eax
 14.1013 +	subl	%esi,%eax
 14.1014 +	cmpl	%edx,%eax
 14.1015 +	jae	1f
 14.1016 +	movl	%eax,%edx
 14.1017 +	movl	%eax,20(%esp)
 14.1018 +
 14.1019 +1:	incl	%edx
 14.1020 +	cld
 14.1021 +
 14.1022 +1:	decl	%edx
 14.1023 +	jz	2f
 14.1024 +	lodsb
 14.1025 +	stosb
 14.1026 +	testb	%al,%al
 14.1027 +	jnz	1b
 14.1028 +
 14.1029 +	/* Success -- 0 byte reached. */
 14.1030 +	decl	%edx
 14.1031 +	xorl	%eax,%eax
 14.1032 +	jmp	copystr_return
 14.1033 +
 14.1034 +2:	/* edx is zero -- return EFAULT or ENAMETOOLONG. */
 14.1035 +	cmpl	$VM_MAXUSER_ADDRESS,%esi
 14.1036 +	jae	_C_LABEL(copystr_efault)
 14.1037 +	movl	$ENAMETOOLONG,%eax
 14.1038 +	jmp	copystr_return
 14.1039 +
 14.1040 +/* LINTSTUB: Ignore */
 14.1041 +NENTRY(copystr_efault)
 14.1042 +	movl	$EFAULT,%eax
 14.1043 +
 14.1044 +/* LINTSTUB: Ignore */
 14.1045 +NENTRY(copystr_fault)
 14.1046 +copystr_return:
 14.1047 +	/* Set *lencopied and return %eax. */
 14.1048 +	GET_CURPCB(%ecx)
 14.1049 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1050 +	movl	20(%esp),%ecx
 14.1051 +	subl	%edx,%ecx
 14.1052 +	movl	24(%esp),%edx
 14.1053 +	testl	%edx,%edx
 14.1054 +	jz	8f
 14.1055 +	movl	%ecx,(%edx)
 14.1056 +
 14.1057 +8:	popl	%edi
 14.1058 +	popl	%esi
 14.1059 +	ret
 14.1060 +
 14.1061 +/*
 14.1062 + * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied);
 14.1063 + * Copy a NUL-terminated string, at most maxlen characters long.  Return the
 14.1064 + * number of characters copied (including the NUL) in *lencopied.  If the
 14.1065 + * string is too long, return ENAMETOOLONG; else return 0.
 14.1066 + * see copystr(9)
 14.1067 + */
 14.1068 +/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */
 14.1069 +ENTRY(copystr)
 14.1070 +	pushl	%esi
 14.1071 +	pushl	%edi
 14.1072 +
 14.1073 +	movl	12(%esp),%esi		# esi = from
 14.1074 +	movl	16(%esp),%edi		# edi = to
 14.1075 +	movl	20(%esp),%edx		# edx = maxlen
 14.1076 +	incl	%edx
 14.1077 +	cld
 14.1078 +
 14.1079 +1:	decl	%edx
 14.1080 +	jz	4f
 14.1081 +	lodsb
 14.1082 +	stosb
 14.1083 +	testb	%al,%al
 14.1084 +	jnz	1b
 14.1085 +
 14.1086 +	/* Success -- 0 byte reached. */
 14.1087 +	decl	%edx
 14.1088 +	xorl	%eax,%eax
 14.1089 +	jmp	6f
 14.1090 +
 14.1091 +4:	/* edx is zero -- return ENAMETOOLONG. */
 14.1092 +	movl	$ENAMETOOLONG,%eax
 14.1093 +
 14.1094 +6:	/* Set *lencopied and return %eax. */
 14.1095 +	movl	20(%esp),%ecx
 14.1096 +	subl	%edx,%ecx
 14.1097 +	movl	24(%esp),%edx
 14.1098 +	testl	%edx,%edx
 14.1099 +	jz	7f
 14.1100 +	movl	%ecx,(%edx)
 14.1101 +
 14.1102 +7:	popl	%edi
 14.1103 +	popl	%esi
 14.1104 +	ret
 14.1105 +
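/*
 * Typical in-kernel use of copyinstr(9) (illustrative sketch; the
 * SCARG() argument name is an assumption, not part of this change):
 *
 *	char path[MAXPATHLEN];
 *	size_t done;
 *	int error;
 *
 *	error = copyinstr(SCARG(uap, path), path, sizeof(path), &done);
 *	if (error != 0)
 *		return (error);		(EFAULT or ENAMETOOLONG)
 *	on success, done == strlen(path) + 1
 */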
 14.1106 +/*
 14.1107 + * long fuword(const void *uaddr);
 14.1108 + * Fetch an int from the user's address space.
 14.1109 + * see fuword(9)
 14.1110 + */
 14.1111 +/* LINTSTUB: Func: long fuword(const void *base) */
 14.1112 +ENTRY(fuword)
 14.1113 +	DO_DEFERRED_SWITCH(%eax)
 14.1114 +	movl	4(%esp),%edx
 14.1115 +	cmpl	$VM_MAXUSER_ADDRESS-4,%edx
 14.1116 +	ja	_C_LABEL(fusuaddrfault)
 14.1117 +	GET_CURPCB(%ecx)
 14.1118 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
 14.1119 +	movl	(%edx),%eax
 14.1120 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1121 +	ret
 14.1122 +	
 14.1123 +/*
 14.1124 + * int fusword(const void *uaddr);
 14.1125 + * Fetch a short from the user's address space.
 14.1126 + * see fusword(9)
 14.1127 + */
 14.1128 +/* LINTSTUB: Func: int fusword(const void *base) */
 14.1129 +ENTRY(fusword)
 14.1130 +	DO_DEFERRED_SWITCH(%eax)
 14.1131 +	movl	4(%esp),%edx
 14.1132 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
 14.1133 +	ja	_C_LABEL(fusuaddrfault)
 14.1134 +	GET_CURPCB(%ecx)
 14.1135 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
 14.1136 +	movzwl	(%edx),%eax
 14.1137 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1138 +	ret
 14.1139 +	
 14.1140 +/*
 14.1141 + * int fuswintr(const void *uaddr);
 14.1142 + * Fetch a short from the user's address space.  Can be called during an
 14.1143 + * interrupt.
 14.1144 + * see fuswintr(9)
 14.1145 + */
 14.1146 +/* LINTSTUB: Func: int fuswintr(const void *base) */
 14.1147 +ENTRY(fuswintr)
 14.1148 +	cmpl	$TLBSTATE_VALID, CPUVAR(TLBSTATE)
 14.1149 +	jnz	_C_LABEL(fusuaddrfault)
 14.1150 +	movl	4(%esp),%edx
 14.1151 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
 14.1152 +	ja	_C_LABEL(fusuaddrfault)
 14.1153 +	movl	CPUVAR(CURLWP),%ecx
 14.1154 +	movl	L_ADDR(%ecx),%ecx
 14.1155 +	movl	$_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
 14.1156 +	movzwl	(%edx),%eax
 14.1157 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1158 +	ret
 14.1159 +	
 14.1160 +/*
 14.1161 + * int fubyte(const void *uaddr);
 14.1162 + * Fetch a byte from the user's address space.
 14.1163 + * see fubyte(9)
 14.1164 + */
 14.1165 +/* LINTSTUB: Func: int fubyte(const void *base) */
 14.1166 +ENTRY(fubyte)
 14.1167 +	DO_DEFERRED_SWITCH(%eax)
 14.1168 +	movl	4(%esp),%edx
 14.1169 +	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
 14.1170 +	ja	_C_LABEL(fusuaddrfault)
 14.1171 +	GET_CURPCB(%ecx)
 14.1172 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
 14.1173 +	movzbl	(%edx),%eax
 14.1174 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1175 +	ret
 14.1176 +
 14.1177 +/*
 14.1178 + * Handle faults from [fs]u*().  Clean up and return -1.
 14.1179 + */
 14.1180 +/* LINTSTUB: Ignore */
 14.1181 +NENTRY(fusufault)
 14.1182 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1183 +	movl	$-1,%eax
 14.1184 +	ret
 14.1185 +
 14.1186 +/*
 14.1187 + * Handle faults from [fs]u*().  Clean up and return -1.  This differs from
 14.1188 + * fusufault() in that trap() will recognize it and return immediately rather
 14.1189 + * than trying to page fault.
 14.1190 + */
 14.1191 +/* LINTSTUB: Ignore */
 14.1192 +NENTRY(fusubail)
 14.1193 +	movl	$0,PCB_ONFAULT(%ecx)
 14.1194 +	movl	$-1,%eax
 14.1195 +	ret
 14.1196 +
 14.1197 +/*
 14.1198 + * Handle earlier faults from [fs]u*(), due to out-of-range addresses.
 14.1199 + */
 14.1200 +/* LINTSTUB: Ignore */
 14.1201 +NENTRY(fusuaddrfault)
 14.1202 +	movl	$-1,%eax
 14.1203 +	ret
 14.1204 +
 14.1205 +/*
 14.1206 + * int suword(void *uaddr, long x);
 14.1207 + * Store an int in the user's address space.
 14.1208 + * see suword(9)
 14.1209 + */
 14.1210 +/* LINTSTUB: Func: int suword(void *base, long c) */
 14.1211 +ENTRY(suword)
 14.1212 +	DO_DEFERRED_SWITCH(%eax)
 14.1213 +	movl	4(%esp),%edx
 14.1214 +	cmpl	$VM_MAXUSER_ADDRESS-4,%edx
 14.1215 +	ja	_C_LABEL(fusuaddrfault)
 14.1216 +
 14.1217 +#if defined(I386_CPU)
 14.1218 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 14.1219 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
 14.1220 +	jne	2f
 14.1221 +#endif /* I486_CPU || I586_CPU || I686_CPU */
 14.1222 +
 14.1223 +	GET_CURPCB(%eax)
 14.1224 +	movl	$3f,PCB_ONFAULT(%eax)
 14.1225 +
 14.1226 +	movl	%edx,%eax
 14.1227 +	shrl	$PGSHIFT,%eax		# calculate pte address
 14.1228 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
 14.1229 +	jnz	1f
 14.1230 +
 14.1231 +3:	/* Simulate a trap. */
 14.1232 +	pushl	%edx
 14.1233 +	pushl	%edx
 14.1234 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
 14.1235 +	addl	$4,%esp			# clear parameter from the stack
 14.1236 +	popl	%edx
 14.1237 +	GET_CURPCB(%ecx)
 14.1238 +	testl	%eax,%eax
 14.1239 +	jnz	_C_LABEL(fusufault)
 14.1240 +
 14.1241 +1:	/* XXX also need to check the following 3 bytes for validity! */
 14.1242 +#endif
 14.1243 +
 14.1244 +2:	GET_CURPCB(%ecx)
 14.1245 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
 14.1246 +
 14.1247 +	movl	8(%esp),%eax
 14.1248 +	movl	%eax,(%edx)
 14.1249 +	xorl	%eax,%eax
 14.1250 +	movl	%eax,PCB_ONFAULT(%ecx)
 14.1251 +	ret
 14.1252 +	
 14.1253 +/*
 14.1254 + * int susword(void *uaddr, short x);
 14.1255 + * Store a short in the user's address space.
 14.1256 + * see susword(9)
 14.1257 + */
 14.1258 +/* LINTSTUB: Func: int susword(void *base, short c) */
 14.1259 +ENTRY(susword)
 14.1260 +	DO_DEFERRED_SWITCH(%eax)
 14.1261 +	movl	4(%esp),%edx
 14.1262 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
 14.1263 +	ja	_C_LABEL(fusuaddrfault)
 14.1264 +
 14.1265 +#if defined(I386_CPU)
 14.1266 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 14.1267 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
 14.1268 +	jne	2f
 14.1269 +#endif /* I486_CPU || I586_CPU || I686_CPU */
 14.1270 +
 14.1271 +	GET_CURPCB(%eax)
 14.1272 +	movl	$3f,PCB_ONFAULT(%eax)
 14.1273 +
 14.1274 +	movl	%edx,%eax
 14.1275 +	shrl	$PGSHIFT,%eax		# calculate pte address
 14.1276 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
 14.1277 +	jnz	1f
 14.1278 +
 14.1279 +3:	/* Simulate a trap. */
 14.1280 +	pushl	%edx
 14.1281 +	pushl	%edx
 14.1282 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
 14.1283 +	addl	$4,%esp			# clear parameter from the stack
 14.1284 +	popl	%edx
 14.1285 +	GET_CURPCB(%ecx)
 14.1286 +	testl	%eax,%eax
 14.1287 +	jnz	_C_LABEL(fusufault)
 14.1288 +
 14.1289 +1:	/* XXX also need to check the following byte for validity! */
 14.1290 +#endif
 14.1291 +
 14.1292 +2:	GET_CURPCB(%ecx)
 14.1293 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
 14.1294 +
 14.1295 +	movl	8(%esp),%eax
 14.1296 +	movw	%ax,(%edx)
 14.1297 +	xorl	%eax,%eax
 14.1298 +	movl	%eax,PCB_ONFAULT(%ecx)
 14.1299 +	ret
 14.1300 +
 14.1301 +/*
 14.1302 + * int suswintr(void *uaddr, short x);
 14.1303 + * Store a short in the user's address space.  Can be called during an
 14.1304 + * interrupt.
 14.1305 + * see suswintr(9)
 14.1306 + */
 14.1307 +/* LINTSTUB: Func: int suswintr(void *base, short c) */
 14.1308 +ENTRY(suswintr)
 14.1309 +	cmpl	$TLBSTATE_VALID, CPUVAR(TLBSTATE)
 14.1310 +	jnz	_C_LABEL(fusuaddrfault)
 14.1311 +	movl	4(%esp),%edx
 14.1312 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
 14.1313 +	ja	_C_LABEL(fusuaddrfault)
 14.1314 +	movl	CPUVAR(CURLWP),%ecx
 14.1315 +	movl	L_ADDR(%ecx),%ecx
 14.1316 +	movl	$_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
 14.1317 +
 14.1318 +#if defined(I386_CPU)
 14.1319 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 14.1320 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
 14.1321 +	jne	2f
 14.1322 +#endif /* I486_CPU || I586_CPU || I686_CPU */
 14.1323 +
 14.1324 +	movl	%edx,%eax
 14.1325 +	shrl	$PGSHIFT,%eax		# calculate pte address
 14.1326 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
 14.1327 +	jnz	1f
 14.1328 +
 14.1329 +	/* Simulate a trap. */
 14.1330 +	jmp	_C_LABEL(fusubail)
 14.1331 +
 14.1332 +1:	/* XXX also need to check the following byte for validity! */
 14.1333 +#endif
 14.1334 +
 14.1335 +2:	movl	8(%esp),%eax
 14.1336 +	movw	%ax,(%edx)
 14.1337 +	xorl	%eax,%eax
 14.1338 +	movl	%eax,PCB_ONFAULT(%ecx)
 14.1339 +	ret
 14.1340 +
 14.1341 +/*
 14.1342 + * int subyte(void *uaddr, char x);
 14.1343 + * Store a byte in the user's address space.
 14.1344 + * see subyte(9)
 14.1345 + */
 14.1346 +/* LINTSTUB: Func: int subyte(void *base, int c) */
 14.1347 +ENTRY(subyte)
 14.1348 +	DO_DEFERRED_SWITCH(%eax)
 14.1349 +	movl	4(%esp),%edx
 14.1350 +	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
 14.1351 +	ja	_C_LABEL(fusuaddrfault)
 14.1352 +
 14.1353 +#if defined(I386_CPU)
 14.1354 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
 14.1355 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
 14.1356 +	jne	2f
 14.1357 +#endif /* I486_CPU || I586_CPU || I686_CPU */
 14.1358 +
 14.1359 +	GET_CURPCB(%eax)	
 14.1360 +	movl	$3f,PCB_ONFAULT(%eax)
 14.1361 +
 14.1362 +	movl	%edx,%eax
 14.1363 +	shrl	$PGSHIFT,%eax		# calculate pte address
 14.1364 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
 14.1365 +	jnz	1f
 14.1366 +
 14.1367 +3:	/* Simulate a trap. */
 14.1368 +	pushl	%edx
 14.1369 +	pushl	%edx
 14.1370 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
 14.1371 +	addl	$4,%esp			# clear parameter from the stack
 14.1372 +	popl	%edx
 14.1373 +	GET_CURPCB(%ecx)
 14.1374 +	testl	%eax,%eax
 14.1375 +	jnz	_C_LABEL(fusufault)
 14.1376 +
 14.1377 +1:
 14.1378 +#endif
 14.1379 +
 14.1380 +2:	GET_CURPCB(%ecx)
 14.1381 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
 14.1382 +
 14.1383 +	movb	8(%esp),%al
 14.1384 +	movb	%al,(%edx)
 14.1385 +	xorl	%eax,%eax
 14.1386 +	movl	%eax,PCB_ONFAULT(%ecx)
 14.1387 +	ret
 14.1388 +
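/*
 * Usage sketch for the fetch/store primitives (illustrative only).
 * Note that the -1 error return of fuword() is ambiguous when the user
 * word legitimately contains -1; callers that care use copyin() instead.
 *
 *	long v = fuword(uaddr);			(-1 on fault)
 *	if (suword(uaddr, v + 1) != 0)
 *		return (EFAULT);
 */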
 14.1389 +/*****************************************************************************/
 14.1390 +
 14.1391 +/*
 14.1392 + * The following is i386-specific nonsense.
 14.1393 + */
 14.1394 +
 14.1395 +/*
 14.1396 + * void lgdt_finish(void);
 14.1397 + * Finish loading a new GDT pointer (do any necessary cleanup).
 14.1398 + * XXX It's somewhat questionable whether reloading all the segment registers
 14.1399 + * is necessary, since the actual descriptor data is not changed except by
 14.1400 + * process creation and exit, both of which clean up via task switches.  OTOH,
 14.1401 + * this only happens at run time when the GDT is resized.
 14.1402 + */
 14.1403 +/* LINTSTUB: Func: void lgdt_finish(void) */
 14.1404 +NENTRY(lgdt_finish)
 14.1405 +	movl	$GSEL(GDATA_SEL, SEL_KPL),%eax
 14.1406 +	movw	%ax,%ds
 14.1407 +	movw	%ax,%es
 14.1408 +	movw	%ax,%gs
 14.1409 +	movw	%ax,%ss
 14.1410 +	movl	$GSEL(GCPU_SEL, SEL_KPL),%eax
 14.1411 +	movw	%ax,%fs
 14.1412 +	/* Reload code selector by doing intersegment return. */
 14.1413 +	popl	%eax
 14.1414 +	pushl	$GSEL(GCODE_SEL, SEL_KPL)
 14.1415 +	pushl	%eax
 14.1416 +	lret
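	/*
	 * (The far return is needed because %cs cannot be loaded with a
	 * plain mov; lret pops the return %eip and the new %cs pushed
	 * above in one operation.)
	 */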
 14.1417 +
 14.1418 +/*****************************************************************************/
 14.1419 +
 14.1420 +/*
 14.1421 + * These functions are primarily used by DDB.
 14.1422 + */
 14.1423 +
 14.1424 +/* LINTSTUB: Func: int setjmp (label_t *l) */
 14.1425 +ENTRY(setjmp)
 14.1426 +	movl	4(%esp),%eax
 14.1427 +	movl	%ebx,(%eax)		# save ebx
 14.1428 +	movl	%esp,4(%eax)		# save esp
 14.1429 +	movl	%ebp,8(%eax)		# save ebp
 14.1430 +	movl	%esi,12(%eax)		# save esi
 14.1431 +	movl	%edi,16(%eax)		# save edi
 14.1432 +	movl	(%esp),%edx		# get rta
 14.1433 +	movl	%edx,20(%eax)		# save eip
 14.1434 +	xorl	%eax,%eax		# return (0);
 14.1435 +	ret
 14.1436 +
 14.1437 +/* LINTSTUB: Func: void longjmp (label_t *l) */
 14.1438 +ENTRY(longjmp)
 14.1439 +	movl	4(%esp),%eax
 14.1440 +	movl	(%eax),%ebx		# restore ebx
 14.1441 +	movl	4(%eax),%esp		# restore esp
 14.1442 +	movl	8(%eax),%ebp		# restore ebp
 14.1443 +	movl	12(%eax),%esi		# restore esi
 14.1444 +	movl	16(%eax),%edi		# restore edi
 14.1445 +	movl	20(%eax),%edx		# get rta
 14.1446 +	movl	%edx,(%esp)		# put in return frame
 14.1447 +	xorl	%eax,%eax		# return (1);
 14.1448 +	incl	%eax
 14.1449 +	ret
 14.1450 +
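/*
 * Sketch of the DDB-style use of these (illustrative; the jmpbuf name
 * is an assumption):
 *
 *	static label_t db_jmpbuf;
 *
 *	if (setjmp(&db_jmpbuf) == 0)
 *		attempt the access that may fault;
 *	else
 *		the fault handler longjmp(&db_jmpbuf)'d back here.
 */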
 14.1451 +/*****************************************************************************/
 14.1452 +
 14.1453 +	.globl	_C_LABEL(sched_whichqs),_C_LABEL(sched_qs)
 14.1454 +	.globl	_C_LABEL(uvmexp),_C_LABEL(panic)
 14.1455 +
 14.1456 +#ifdef DIAGNOSTIC
 14.1457 +NENTRY(switch_error)
 14.1458 +	pushl	$1f
 14.1459 +3:	call	_C_LABEL(panic)
 14.1460 +	/* NOTREACHED */
 14.1461 +1:	.asciz	"cpu_switch"
 14.1462 +#endif /* DIAGNOSTIC */
 14.1463 +
 14.1464 +/*
 14.1465 + * void cpu_switch(struct lwp *)
 14.1466 + * Find a runnable process and switch to it.  Wait if necessary.  If the new
 14.1467 + * process is the same as the old one, we short-circuit the context save and
 14.1468 + * restore.
 14.1469 + *	
 14.1470 + * Note that the stack frame layout is known to "struct switchframe"
 14.1471 + * in <machine/frame.h> and to the code in cpu_fork() which initializes 
 14.1472 + * it for a new lwp.
 14.1473 + */
 14.1474 +ENTRY(cpu_switch)
 14.1475 +	pushl	%ebx
 14.1476 +	pushl	%esi
 14.1477 +	pushl	%edi
 14.1478 +
 14.1479 +#ifdef DEBUG
 14.1480 +	cmpl	$IPL_SCHED,CPUVAR(ILEVEL)
 14.1481 +	jae	1f
 14.1482 +	pushl	$2f
 14.1483 +	call	_C_LABEL(panic)
 14.1484 +	/* NOTREACHED */
 14.1485 +2:	.asciz	"not splsched() in cpu_switch!"
 14.1486 +1:	
 14.1487 +#endif /* DEBUG */
 14.1488 +
 14.1489 +	movl	16(%esp),%esi		# current
 14.1490 +
 14.1491 +	/*
 14.1492 +	 * Clear curlwp so that we don't accumulate system time while idle.
 14.1493 +	 * This also ensures that schedcpu() will move the old lwp to
 14.1494 +	 * the correct queue if it happens to get called from the spllower()
 14.1495 +	 * below and changes the priority.  (See corresponding comment in
 14.1496 +	 * userret()).
 14.1497 +	 */
 14.1498 +	movl	$0,CPUVAR(CURLWP)
 14.1499 +	/*
 14.1500 +	 * First phase: find new lwp.
 14.1501 +	 *
 14.1502 +	 * Registers:
 14.1503 +	 *   %eax - queue head, scratch, then zero
 14.1504 +	 *   %ebx - queue number
 14.1505 +	 *   %ecx - cached value of whichqs
 14.1506 +	 *   %edx - next lwp in queue
 14.1507 +	 *   %esi - old lwp
 14.1508 +	 *   %edi - new lwp
 14.1509 +	 */
 14.1510 +
 14.1511 +	/* Look for new lwp. */
 14.1512 +	CLI(%ecx)			# splhigh doesn't do a cli
 14.1513 +	movl	_C_LABEL(sched_whichqs),%ecx
 14.1514 +	bsfl	%ecx,%ebx		# find a full q
 14.1515 +	jnz	switch_dequeue
 14.1516 +
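/*
 * The scan above, in C (illustrative only; ffs() is 1-based where
 * bsfl is 0-based, and lower queue numbers are higher priority):
 *
 *	int qnum = ffs(sched_whichqs) - 1;
 *	if (qnum < 0)
 *		nothing runnable: idle;
 *	else
 *		take the lwp at the head of sched_qs[qnum];
 */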
 14.1517 +	/*
 14.1518 +	 * idling:	save old context.
 14.1519 +	 *
 14.1520 +	 * Registers:
 14.1521 +	 *   %eax, %ecx - scratch
 14.1522 +	 *   %esi - old lwp, then old pcb
 14.1523 +	 *   %edi - idle pcb
 14.1524 +	 */
 14.1525 +
 14.1526 +	pushl	%esi
 14.1527 +	call	_C_LABEL(pmap_deactivate2)	# pmap_deactivate(oldproc)
 14.1528 +	addl	$4,%esp
 14.1529 +
 14.1530 +	movl	L_ADDR(%esi),%esi
 14.1531 +
 14.1532 +	/* Save stack pointers. */
 14.1533 +	movl	%esp,PCB_ESP(%esi)
 14.1534 +	movl	%ebp,PCB_EBP(%esi)
 14.1535 +
 14.1536 +	/* Find idle PCB for this CPU */
 14.1537 +#ifndef MULTIPROCESSOR
 14.1538 +	movl	$_C_LABEL(lwp0),%ebx
 14.1539 +	movl	L_ADDR(%ebx),%edi
 14.1540 +	movl	L_MD_TSS_SEL(%ebx),%edx
 14.1541 +#else
 14.1542 +	movl	CPUVAR(IDLE_PCB),%edi
 14.1543 +	movl	CPUVAR(IDLE_TSS_SEL),%edx
 14.1544 +#endif
 14.1545 +	movl	$0,CPUVAR(CURLWP)		/* In case we fault... */
 14.1546 +
 14.1547 +	/* Restore the idle context (avoid interrupts) */
 14.1548 +	CLI(%ecx)
 14.1549 +
 14.1550 +	/* Restore stack pointers. */
 14.1551 +	movl	PCB_ESP(%edi),%esp
 14.1552 +	movl	PCB_EBP(%edi),%ebp
 14.1553 +
 14.1554 +	pushl	%edi
 14.1555 +	call	_C_LABEL(i386_switch_context)
 14.1556 +	addl	$4,%esp
 14.1557 +
 14.1558 +	/* Record new pcb. */
 14.1559 +	SET_CURPCB(%edi)
 14.1560 +
 14.1561 +	xorl	%esi,%esi
 14.1562 +	STI(%eax)
 14.1563 +idle_unlock:	
 14.1564 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)	
 14.1565 +	call	_C_LABEL(sched_unlock_idle)
 14.1566 +#endif
 14.1567 +	/* Interrupts are okay again. */
 14.1568 +	pushl	$IPL_NONE		# spl0()
 14.1569 +	call	_C_LABEL(Xspllower)	# process pending interrupts
 14.1570 +	addl	$4,%esp
 14.1571 +	jmp	idle_start
 14.1572 +idle_zero:		
 14.1573 +	STIC(%eax)
 14.1574 +    	jz	4f
 14.1575 +	call	_C_LABEL(stipending)
 14.1576 +	testl	%eax,%eax
 14.1577 +	jz	4f
 14.1578 +	pushl	$IPL_NONE
 14.1579 +	call	_C_LABEL(Xspllower)
 14.1580 +	addl	$4,%esp
 14.1581 +4:
 14.1582 +	call	_C_LABEL(uvm_pageidlezero)
 14.1583 +	CLI(%eax)
 14.1584 +	cmpl	$0,_C_LABEL(sched_whichqs)
 14.1585 +	jnz	idle_exit
 14.1586 +idle_loop:
 14.1587 +	/* Try to zero some pages. */
 14.1588 +	movl	_C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx
 14.1589 +	testl	%ecx,%ecx
 14.1590 +	jnz	idle_zero
 14.1591 +	STIC(%eax)
 14.1592 +    	jz	4f
 14.1593 +	call	_C_LABEL(stipending)
 14.1594 +	testl	%eax,%eax
 14.1595 +	jz	4f
 14.1596 +	pushl	$IPL_NONE
 14.1597 +	call	_C_LABEL(Xspllower)
 14.1598 +	addl	$4,%esp
 14.1599 +	jmp	idle_start
 14.1600 +4:
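	/* Nothing runnable: yield this VCPU to Xen instead of halting. */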
 14.1601 +	movl	$__HYPERVISOR_yield,%eax
 14.1602 +	TRAP_INSTR
 14.1603 +NENTRY(mpidle)
 14.1604 +idle_start:	
 14.1605 +	CLI(%eax)
 14.1606 +	cmpl	$0,_C_LABEL(sched_whichqs)
 14.1607 +	jz	idle_loop
 14.1608 +idle_exit:	
 14.1609 +	movl	$IPL_HIGH,CPUVAR(ILEVEL)		# splhigh
 14.1610 +	STI(%eax)
 14.1611 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)	
 14.1612 +	call	_C_LABEL(sched_lock_idle)
 14.1613 +#endif
 14.1614 +	movl	_C_LABEL(sched_whichqs),%ecx
 14.1615 +	bsfl	%ecx,%ebx
 14.1616 +	jz	idle_unlock
 14.1617 +
 14.1618 +#ifdef XENDEBUG_LOW
 14.1619 +	pushl	%ecx
 14.1620 +	call	_C_LABEL(xen_dbg1)
 14.1621 +	xorl	%ecx,%ecx
 14.1622 +	movl	%ecx,_C_LABEL(xen_once)
 14.1623 +	popl	%ecx
 14.1624 +#endif
 14.1625 +switch_dequeue:		
 14.1626 +	/* 
 14.1627 +	 * we're running at splhigh(), but it's otherwise okay to take
 14.1628 +	 * interrupts here. 
 14.1629 +	 */
 14.1630 +	STI(%edi)
 14.1631 +	leal	_C_LABEL(sched_qs)(,%ebx,8),%eax # select q
 14.1632 +
 14.1633 +	movl	L_FORW(%eax),%edi	# unlink from front of process q
 14.1634 +#ifdef	DIAGNOSTIC
 14.1635 +	cmpl	%edi,%eax		# linked to self (i.e. nothing queued)?
 14.1636 +	je	_C_LABEL(switch_error)	# not possible
 14.1637 +#endif /* DIAGNOSTIC */
 14.1638 +	movl	L_FORW(%edi),%edx
 14.1639 +	movl	%edx,L_FORW(%eax)
 14.1640 +	movl	%eax,L_BACK(%edx)
 14.1641 +
 14.1642 +	cmpl	%edx,%eax		# q empty?
 14.1643 +	jne	3f
 14.1644 +
 14.1645 +	btrl	%ebx,%ecx		# yes, clear to indicate empty
 14.1646 +	movl	%ecx,_C_LABEL(sched_whichqs) # update q status
 14.1647 +
 14.1648 +3:	/* We just did it. */
 14.1649 +	xorl	%eax,%eax
 14.1650 +	CLEAR_RESCHED(%eax)
 14.1651 +
 14.1652 +switch_resume:
 14.1653 +#ifdef	DIAGNOSTIC
 14.1654 +	cmpl	%eax,L_WCHAN(%edi)	# Waiting for something?
 14.1655 +	jne	_C_LABEL(switch_error)	# Yes; shouldn't be queued.
 14.1656 +	cmpb	$LSRUN,L_STAT(%edi)	# In run state?
 14.1657 +	jne	_C_LABEL(switch_error)	# No; shouldn't be queued.
 14.1658 +#endif /* DIAGNOSTIC */
 14.1659 +
 14.1660 +	/* Isolate lwp.  XXX Is this necessary? */
 14.1661 +	movl	%eax,L_BACK(%edi)
 14.1662 +
 14.1663 +	/* Record new lwp. */
 14.1664 +	movb	$LSONPROC,L_STAT(%edi)	# l->l_stat = LSONPROC
 14.1665 +	SET_CURLWP(%edi,%ecx)
 14.1666 +
 14.1667 +	/* Skip context switch if same lwp. */
 14.1668 +	xorl	%ebx,%ebx
 14.1669 +	cmpl	%edi,%esi
 14.1670 +	je	switch_return
 14.1671 +
 14.1672 +	/* If old lwp exited, don't bother. */
 14.1673 +	testl	%esi,%esi
 14.1674 +	jz	switch_exited
 14.1675 +
 14.1676 +	/*
 14.1677 +	 * Second phase: save old context.
 14.1678 +	 *
 14.1679 +	 * Registers:
 14.1680 +	 *   %eax, %ecx - scratch
 14.1681 +	 *   %esi - old lwp, then old pcb
 14.1682 +	 *   %edi - new lwp
 14.1683 +	 */
 14.1684 +
 14.1685 +	pushl	%esi
 14.1686 +	call	_C_LABEL(pmap_deactivate2)	# pmap_deactivate(oldproc)
 14.1687 +	addl	$4,%esp
 14.1688 +
 14.1689 +	movl	L_ADDR(%esi),%esi
 14.1690 +
 14.1691 +	/* Save stack pointers. */
 14.1692 +	movl	%esp,PCB_ESP(%esi)
 14.1693 +	movl	%ebp,PCB_EBP(%esi)
 14.1694 +
 14.1695 +switch_exited:
 14.1696 +	/*
 14.1697 +	 * Third phase: restore saved context.
 14.1698 +	 *
 14.1699 +	 * Registers:
 14.1700 +	 *   %eax, %ebx, %ecx, %edx - scratch
 14.1701 +	 *   %esi - new pcb
 14.1702 +	 *   %edi - new lwp
 14.1703 +	 */
 14.1704 +
 14.1705 +	/* No interrupts while loading new state. */
 14.1706 +	CLI(%eax)
 14.1707 +	movl	L_ADDR(%edi),%esi
 14.1708 +
 14.1709 +	/* Restore stack pointers. */
 14.1710 +	movl	PCB_ESP(%esi),%esp
 14.1711 +	movl	PCB_EBP(%esi),%ebp
 14.1712 +
 14.1713 +#if 0
 14.1714 +	/* Don't bother with the rest if switching to a system process. */
 14.1715 +	testl	$P_SYSTEM,L_FLAG(%edi);	XXX NJWLWP lwp's don't have P_SYSTEM!
 14.1716 +	jnz	switch_restored	; XXX skip stack_switch+pmap_activate
 14.1717 +#endif
 14.1718 +
 14.1719 +	pushl	%edi
 14.1720 +	call	_C_LABEL(pmap_activate)		# pmap_activate(p)
 14.1721 +	addl	$4,%esp
 14.1722 +
 14.1723 +	pushl	%esi
 14.1724 +	call	_C_LABEL(i386_switch_context)
 14.1725 +	addl	$4,%esp
 14.1726 +
 14.1727 +	/* Record new pcb. */
 14.1728 +	SET_CURPCB(%esi)
 14.1729 +
 14.1730 +	/* Interrupts are okay again. */
 14.1731 +	STI(%edi)
 14.1732 +
 14.1733 +/*
 14.1734 + *  Check for restartable atomic sequences (RAS)
 14.1735 + */
 14.1736 +	movl	CPUVAR(CURLWP),%edi
 14.1737 +	movl	L_PROC(%edi),%esi
 14.1738 +	cmpl	$0,P_RASLIST(%esi)
 14.1739 +	jne	2f
 14.1740 +1:
 14.1741 +	movl	$1,%ebx
 14.1742 +
 14.1743 +switch_return:
 14.1744 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)     
 14.1745 +	call    _C_LABEL(sched_unlock_idle)
 14.1746 +#endif
 14.1747 +	pushl	$IPL_NONE		# spl0()
 14.1748 +	call	_C_LABEL(Xspllower)	# process pending interrupts
 14.1749 +	addl	$4,%esp
 14.1750 +	movl	$IPL_HIGH,CPUVAR(ILEVEL)	# splhigh()
 14.1751 +
 14.1752 +	movl	%ebx,%eax
 14.1753 +
 14.1754 +	popl	%edi
 14.1755 +	popl	%esi
 14.1756 +	popl	%ebx
 14.1757 +	ret
 14.1758 +
 14.1759 +2:					# check RAS list
 14.1760 +	movl	L_MD_REGS(%edi),%ebx
 14.1761 +	movl	TF_EIP(%ebx),%eax
 14.1762 +	pushl	%eax
 14.1763 +	pushl	%esi
 14.1764 +	call	_C_LABEL(ras_lookup)
 14.1765 +	addl	$8,%esp
 14.1766 +	cmpl	$-1,%eax
 14.1767 +	je	1b
 14.1768 +	movl	%eax,TF_EIP(%ebx)
 14.1769 +	jmp	1b
 14.1770 +
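/*
 * C equivalent of the RAS fixup above (illustrative only):
 *
 *	if (p->p_raslist != NULL) {
 *		caddr_t pc = ras_lookup(p, (caddr_t)tf->tf_eip);
 *		if (pc != (caddr_t)-1)
 *			tf->tf_eip = (int)pc;
 *	}
 */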
 14.1771 +/*
 14.1772 + * void cpu_switchto(struct lwp *current, struct lwp *next)
 14.1773 + * Switch to the specified next LWP.
 14.1774 + */
 14.1775 +ENTRY(cpu_switchto)
 14.1776 +	pushl	%ebx
 14.1777 +	pushl	%esi
 14.1778 +	pushl	%edi
 14.1779 +
 14.1780 +#ifdef DEBUG
 14.1781 +	cmpl	$IPL_SCHED,CPUVAR(ILEVEL)
 14.1782 +	jae	1f
 14.1783 +	pushl	$2f
 14.1784 +	call	_C_LABEL(panic)
 14.1785 +	/* NOTREACHED */
 14.1786 +2:	.asciz	"not splsched() in cpu_switchto!"
 14.1787 +1:
 14.1788 +#endif /* DEBUG */
 14.1789 +
 14.1790 +	movl	16(%esp),%esi		# current
 14.1791 +	movl	20(%esp),%edi		# next
 14.1792 +
 14.1793 +	/*
 14.1794 +	 * Clear curlwp so that we don't accumulate system time while idle.
 14.1795 +	 * This also ensures that schedcpu() will move the old process to
 14.1796 +	 * the correct queue if it happens to get called from the spllower()
 14.1797 +	 * below and changes the priority.  (See corresponding comment in
 14.1798 +	 * userret()).
 14.1799 +	 *
 14.1800 +	 * XXX Is this necessary?  We know we won't go idle.
 14.1801 +	 */
 14.1802 +	movl	$0,CPUVAR(CURLWP)
 14.1803 +
 14.1804 +	/*
 14.1805 +	 * We're running at splhigh(), but it's otherwise okay to take
 14.1806 +	 * interrupts here.
 14.1807 +	 */
 14.1808 +	STI(%eax)
 14.1809 +
 14.1810 +	/* Jump into the middle of cpu_switch */
 14.1811 +	xorl	%eax,%eax
 14.1812 +	jmp	switch_resume
 14.1813 +
 14.1814 +/*
 14.1815 + * void cpu_exit(struct lwp *l)
 14.1816 + * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's 
 14.1817 + * if multiprocessor) and deallocate the address space and kernel stack for p. 
 14.1818 + * Then jump into cpu_switch(), as if we were in the idle proc all along.
 14.1819 + */
 14.1820 +#ifndef MULTIPROCESSOR
 14.1821 +	.globl	_C_LABEL(lwp0)
 14.1822 +#endif
 14.1823 +	.globl  _C_LABEL(uvmspace_free),_C_LABEL(kernel_map)
 14.1824 +	.globl	_C_LABEL(uvm_km_free),_C_LABEL(tss_free)
 14.1825 +/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */
 14.1826 +ENTRY(cpu_exit)
 14.1827 +	movl	4(%esp),%edi		# old process
 14.1828 +#ifndef MULTIPROCESSOR
 14.1829 +	movl	$_C_LABEL(lwp0),%ebx
 14.1830 +	movl	L_ADDR(%ebx),%esi
 14.1831 +	movl	L_MD_TSS_SEL(%ebx),%edx
 14.1832 +#else
 14.1833 +	movl	CPUVAR(IDLE_PCB),%esi
 14.1834 +	movl	CPUVAR(IDLE_TSS_SEL),%edx
 14.1835 +#endif
 14.1836 +	/* In case we fault... */
 14.1837 +	movl	$0,CPUVAR(CURLWP)
 14.1838 +
 14.1839 +	/* Restore the idle context. */
 14.1840 +	CLI(%eax)
 14.1841 +
 14.1842 +	/* Restore stack pointers. */
 14.1843 +	movl	PCB_ESP(%esi),%esp
 14.1844 +	movl	PCB_EBP(%esi),%ebp
 14.1845 +
 14.1846 +	pushl	%esi
 14.1847 +	call	_C_LABEL(i386_switch_context)
 14.1848 +	addl	$4,%esp
 14.1849 +
 14.1850 +	/* Record new pcb. */
 14.1851 +	SET_CURPCB(%esi)
 14.1852 +
 14.1853 +	/* Interrupts are okay again. */
 14.1854 +	STI(%eax)
 14.1855 +
 14.1856 +	/*
 14.1857 +	 * Schedule the dead LWP's stack to be freed.
 14.1858 +	 */
 14.1859 +	pushl	%edi
 14.1860 +	call	_C_LABEL(lwp_exit2)
 14.1861 +	addl	$4,%esp
 14.1862 +
 14.1863 +	/* Jump into cpu_switch() with the right state. */
 14.1864 +	xorl	%esi,%esi
 14.1865 +	movl	%esi,CPUVAR(CURLWP)
 14.1866 +	jmp	idle_start
 14.1867 +
 14.1868 +/*
 14.1869 + * void savectx(struct pcb *pcb);
 14.1870 + * Update pcb, saving current processor state.
 14.1871 + */
 14.1872 +/* LINTSTUB: Func: void savectx(struct pcb *pcb) */
 14.1873 +ENTRY(savectx)
 14.1874 +	movl	4(%esp),%edx		# edx = pcb
 14.1875 +  
 14.1876 +	/* Save stack pointers. */
 14.1877 +	movl	%esp,PCB_ESP(%edx)
 14.1878 +	movl	%ebp,PCB_EBP(%edx)
 14.1879 +
 14.1880 +	ret
 14.1881 +
 14.1882 +/*
 14.1883 + * Old call gate entry for syscall
 14.1884 + */
 14.1885 +/* LINTSTUB: Var: char Xosyscall[1]; */
 14.1886 +IDTVEC(osyscall)
 14.1887 +	/* Set eflags in trap frame. */
 14.1888 +	pushfl
 14.1889 +	popl	8(%esp)
 14.1890 +	pushl	$7		# size of instruction for restart
 14.1891 +	jmp	syscall1
 14.1892 +
 14.1893 +/*
 14.1894 + * Trap gate entry for syscall
 14.1895 + */
 14.1896 +/* LINTSTUB: Var: char Xsyscall[1]; */
 14.1897 +IDTVEC(syscall)
 14.1898 +	pushl	$2		# size of instruction for restart
 14.1899 +syscall1:
 14.1900 +	pushl	$T_ASTFLT	# trap # for doing ASTs
 14.1901 +	INTRENTRY
 14.1902 +
 14.1903 +#ifdef DIAGNOSTIC
 14.1904 +	cmpl    $0, CPUVAR(WANT_PMAPLOAD)
 14.1905 +	jz	1f
 14.1906 +	pushl	$6f
 14.1907 +	call	_C_LABEL(printf)
 14.1908 +	addl	$4, %esp
 14.1909 +1:
 14.1910 +	movl	CPUVAR(ILEVEL),%ebx
 14.1911 +	testl	%ebx,%ebx
 14.1912 +	jz	1f
 14.1913 +	pushl	$5f
 14.1914 +	call	_C_LABEL(printf)
 14.1915 +	addl	$4,%esp
 14.1916 +#ifdef DDB
 14.1917 +	int	$3
 14.1918 +#endif
 14.1919 +1:	
 14.1920 +#endif /* DIAGNOSTIC */
 14.1921 +	movl	CPUVAR(CURLWP),%edx
 14.1922 +	movl	%esp,L_MD_REGS(%edx)	# save pointer to frame
 14.1923 +	movl	L_PROC(%edx),%edx
 14.1924 +	pushl	%esp
 14.1925 +	call	*P_MD_SYSCALL(%edx)	# get pointer to syscall() function
 14.1926 +	addl	$4,%esp
 14.1927 +syscall_checkast:
 14.1928 +	/* Check for ASTs on exit to user mode. */
 14.1929 +	CLI(%eax)
 14.1930 +	CHECK_ASTPENDING(%eax)
 14.1931 +	je	1f
 14.1932 +	/* Always returning to user mode here. */
 14.1933 +	CLEAR_ASTPENDING(%eax)
 14.1934 +	STI(%eax)
 14.1935 +	/* Pushed T_ASTFLT into tf_trapno on entry. */
 14.1936 +	pushl	%esp
 14.1937 +	call	_C_LABEL(trap)
 14.1938 +	addl	$4,%esp
 14.1939 +	jmp	syscall_checkast
 14.1940 +1:	STI(%eax)
 14.1941 +	CHECK_DEFERRED_SWITCH(%eax)
 14.1942 +	jnz	9f
 14.1943 +#ifndef DIAGNOSTIC
 14.1944 +	INTRFASTEXIT
 14.1945 +#else /* DIAGNOSTIC */
 14.1946 +	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
 14.1947 +	jne	3f
 14.1948 +	INTRFASTEXIT
 14.1949 +3:	pushl	$4f
 14.1950 +	call	_C_LABEL(printf)
 14.1951 +	addl	$4,%esp
 14.1952 +#ifdef DDB
 14.1953 +	int	$3
 14.1954 +#endif /* DDB */
 14.1955 +	movl	$IPL_NONE,CPUVAR(ILEVEL)
 14.1956 +	jmp	syscall_checkast	# re-check ASTs
 14.1957 +4:	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n"
 14.1958 +5:	.asciz	"WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n"	
 14.1959 +6:	.asciz	"WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n"     
 14.1960 +#endif /* DIAGNOSTIC */
 14.1961 +9:	call    _C_LABEL(pmap_load)
 14.1962 +	jmp     syscall_checkast        /* re-check ASTs */
 14.1963 +
 14.1964 +#if NNPX > 0
 14.1965 +/*
 14.1966 + * Special interrupt handlers.  Someday intr0-intr15 will be used to count
 14.1967 + * interrupts.  We'll still need a special exception 16 handler.  The busy
 14.1968 + * latch stuff in probintr() can be moved to npxprobe().
 14.1969 + */
 14.1970 +
 14.1971 +/* LINTSTUB: Func: void probeintr(void) */
 14.1972 +NENTRY(probeintr)
 14.1973 +	ss
 14.1974 +	incl	_C_LABEL(npx_intrs_while_probing)
 14.1975 +	pushl	%eax
 14.1976 +	movb	$0x20,%al	# EOI (asm in strings loses cpp features)
 14.1977 +	outb	%al,$0xa0	# IO_ICU2
 14.1978 +	outb	%al,$0x20	# IO_ICU1
 14.1979 +	movb	$0,%al
 14.1980 +	outb	%al,$0xf0	# clear BUSY# latch
 14.1981 +	popl	%eax
 14.1982 +	iret
 14.1983 +
 14.1984 +/* LINTSTUB: Func: void probetrap(void) */
 14.1985 +NENTRY(probetrap)
 14.1986 +	ss
 14.1987 +	incl	_C_LABEL(npx_traps_while_probing)
 14.1988 +	fnclex
 14.1989 +	iret
 14.1990 +
 14.1991 +/* LINTSTUB: Func: int npx586bug1(int a, int b) */
 14.1992 +NENTRY(npx586bug1)
 14.1993 +	fildl	4(%esp)		# x
 14.1994 +	fildl	8(%esp)		# y
 14.1995 +	fld	%st(1)
 14.1996 +	fdiv	%st(1),%st	# x/y
 14.1997 +	fmulp	%st,%st(1)	# (x/y)*y
 14.1998 +	fsubrp	%st,%st(1)	# x-(x/y)*y
 14.1999 +	pushl	$0
 14.2000 +	fistpl	(%esp)
 14.2001 +	popl	%eax
 14.2002 +	ret
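/*
 * Illustrative caller (npxprobe-style; the exact call site is an
 * assumption): the classic Pentium FDIV test operands yield a nonzero
 * remainder only on buggy parts.
 *
 *	if (npx586bug1(4195835, 3145727) != 0)
 *		i386_fpu_fdivbug = 1;
 */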
 14.2003 +#endif /* NNPX > 0 */
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c	Mon Sep 06 19:11:17 2004 +0000
    15.3 @@ -0,0 +1,2561 @@
    15.4 +/*	$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $	*/
    15.5 +/*	NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp 	*/
    15.6 +
    15.7 +/*-
    15.8 + * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
    15.9 + * All rights reserved.
   15.10 + *
   15.11 + * This code is derived from software contributed to The NetBSD Foundation
   15.12 + * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
   15.13 + * Simulation Facility, NASA Ames Research Center.
   15.14 + *
   15.15 + * Redistribution and use in source and binary forms, with or without
   15.16 + * modification, are permitted provided that the following conditions
   15.17 + * are met:
   15.18 + * 1. Redistributions of source code must retain the above copyright
   15.19 + *    notice, this list of conditions and the following disclaimer.
   15.20 + * 2. Redistributions in binary form must reproduce the above copyright
   15.21 + *    notice, this list of conditions and the following disclaimer in the
   15.22 + *    documentation and/or other materials provided with the distribution.
   15.23 + * 3. All advertising materials mentioning features or use of this software
   15.24 + *    must display the following acknowledgement:
   15.25 + *	This product includes software developed by the NetBSD
   15.26 + *	Foundation, Inc. and its contributors.
   15.27 + * 4. Neither the name of The NetBSD Foundation nor the names of its
   15.28 + *    contributors may be used to endorse or promote products derived
   15.29 + *    from this software without specific prior written permission.
   15.30 + *
   15.31 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   15.32 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   15.33 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   15.34 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   15.35 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   15.36 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   15.37 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   15.38 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   15.39 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   15.40 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   15.41 + * POSSIBILITY OF SUCH DAMAGE.
   15.42 + */
   15.43 +
   15.44 +/*-
   15.45 + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
   15.46 + * All rights reserved.
   15.47 + *
   15.48 + * This code is derived from software contributed to Berkeley by
   15.49 + * William Jolitz.
   15.50 + *
   15.51 + * Redistribution and use in source and binary forms, with or without
   15.52 + * modification, are permitted provided that the following conditions
   15.53 + * are met:
   15.54 + * 1. Redistributions of source code must retain the above copyright
   15.55 + *    notice, this list of conditions and the following disclaimer.
   15.56 + * 2. Redistributions in binary form must reproduce the above copyright
   15.57 + *    notice, this list of conditions and the following disclaimer in the
   15.58 + *    documentation and/or other materials provided with the distribution.
   15.59 + * 3. Neither the name of the University nor the names of its contributors
   15.60 + *    may be used to endorse or promote products derived from this software
   15.61 + *    without specific prior written permission.
   15.62 + *
   15.63 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   15.64 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   15.65 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   15.66 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   15.67 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   15.68 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   15.69 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   15.70 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   15.71 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   15.72 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   15.73 + * SUCH DAMAGE.
   15.74 + *
   15.75 + *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
   15.76 + */
   15.77 +
   15.78 +#include <sys/cdefs.h>
   15.79 +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $");
   15.80 +
   15.81 +#include "opt_beep.h"
   15.82 +#include "opt_compat_ibcs2.h"
   15.83 +#include "opt_compat_mach.h"	/* need to get the right segment def */
   15.84 +#include "opt_compat_netbsd.h"
   15.85 +#include "opt_compat_svr4.h"
   15.86 +#include "opt_cpureset_delay.h"
   15.87 +#include "opt_cputype.h"
   15.88 +#include "opt_ddb.h"
   15.89 +#include "opt_ipkdb.h"
   15.90 +#include "opt_kgdb.h"
   15.91 +#include "opt_mtrr.h"
   15.92 +#include "opt_multiprocessor.h"
   15.93 +#include "opt_realmem.h"
   15.94 +#include "opt_user_ldt.h"
   15.95 +#include "opt_vm86.h"
   15.96 +#include "opt_xen.h"
   15.97 +
   15.98 +#include <sys/param.h>
   15.99 +#include <sys/systm.h>
  15.100 +#include <sys/signal.h>
  15.101 +#include <sys/signalvar.h>
  15.102 +#include <sys/kernel.h>
  15.103 +#include <sys/proc.h>
  15.104 +#include <sys/user.h>
  15.105 +#include <sys/exec.h>
  15.106 +#include <sys/buf.h>
  15.107 +#include <sys/reboot.h>
  15.108 +#include <sys/conf.h>
  15.109 +#include <sys/file.h>
  15.110 +#include <sys/malloc.h>
  15.111 +#include <sys/mbuf.h>
  15.112 +#include <sys/msgbuf.h>
  15.113 +#include <sys/mount.h>
  15.114 +#include <sys/vnode.h>
  15.115 +#include <sys/extent.h>
  15.116 +#include <sys/syscallargs.h>
  15.117 +#include <sys/core.h>
  15.118 +#include <sys/kcore.h>
  15.119 +#include <sys/ucontext.h>
  15.120 +#include <machine/kcore.h>
  15.121 +#include <sys/ras.h>
  15.122 +#include <sys/sa.h>
  15.123 +#include <sys/savar.h>
  15.124 +#include <sys/ksyms.h>
  15.125 +
  15.126 +#ifdef IPKDB
  15.127 +#include <ipkdb/ipkdb.h>
  15.128 +#endif
  15.129 +
  15.130 +#ifdef KGDB
  15.131 +#include <sys/kgdb.h>
  15.132 +#endif
  15.133 +
  15.134 +#include <dev/cons.h>
  15.135 +
  15.136 +#include <uvm/uvm_extern.h>
  15.137 +#include <uvm/uvm_page.h>
  15.138 +
  15.139 +#include <sys/sysctl.h>
  15.140 +
  15.141 +#include <machine/cpu.h>
  15.142 +#include <machine/cpufunc.h>
  15.143 +#include <machine/cpuvar.h>
  15.144 +#include <machine/gdt.h>
  15.145 +#include <machine/pio.h>
  15.146 +#include <machine/psl.h>
  15.147 +#include <machine/reg.h>
  15.148 +#include <machine/specialreg.h>
  15.149 +#include <machine/bootinfo.h>
  15.150 +#include <machine/mtrr.h>
  15.151 +#include <machine/evtchn.h>
  15.152 +
  15.153 +#include <dev/isa/isareg.h>
  15.154 +#include <machine/isa_machdep.h>
  15.155 +#include <dev/ic/i8042reg.h>
  15.156 +
  15.157 +#ifdef DDB
  15.158 +#include <machine/db_machdep.h>
  15.159 +#include <ddb/db_extern.h>
  15.160 +#endif
  15.161 +
  15.162 +#ifdef VM86
  15.163 +#include <machine/vm86.h>
  15.164 +#endif
  15.165 +
  15.166 +#include "acpi.h"
  15.167 +#include "apm.h"
  15.168 +#include "bioscall.h"
  15.169 +
  15.170 +#if NBIOSCALL > 0
  15.171 +#include <machine/bioscall.h>
  15.172 +#endif
  15.173 +
  15.174 +#if NACPI > 0
  15.175 +#include <dev/acpi/acpivar.h>
  15.176 +#define ACPI_MACHDEP_PRIVATE
  15.177 +#include <machine/acpi_machdep.h>
  15.178 +#endif
  15.179 +
  15.180 +#if NAPM > 0
  15.181 +#include <machine/apmvar.h>
  15.182 +#endif
  15.183 +
  15.184 +#include "isa.h"
  15.185 +#include "isadma.h"
  15.186 +#include "npx.h"
  15.187 +#include "ksyms.h"
  15.188 +
  15.189 +#include "mca.h"
  15.190 +#if NMCA > 0
  15.191 +#include <machine/mca_machdep.h>	/* for mca_busprobe() */
  15.192 +#endif
  15.193 +
  15.194 +#ifdef MULTIPROCESSOR		/* XXX */
  15.195 +#include <machine/mpbiosvar.h>	/* XXX */
  15.196 +#endif				/* XXX */
  15.197 +
  15.198 +#include <machine/xen.h>
  15.199 +#include <machine/hypervisor.h>
  15.200 +
  15.201 +#if defined(DDB) || defined(KGDB)
  15.202 +#include <ddb/db_interface.h>
  15.203 +#include <ddb/db_output.h>
  15.204 +
  15.205 +void ddb_trap_hook(int);
  15.206 +#endif
  15.207 +
  15.208 +/* #define	XENDEBUG */
  15.209 +/* #define	XENDEBUG_LOW */
  15.210 +
  15.211 +#ifdef XENDEBUG
  15.212 +extern void printk(char *, ...);
  15.213 +#define	XENPRINTF(x) printf x
  15.214 +#define	XENPRINTK(x) printk x
  15.215 +#else
  15.216 +#define	XENPRINTF(x)
  15.217 +#define	XENPRINTK(x)
  15.218 +#endif
  15.219 +#define	PRINTK(x) printf x
  15.220 +
  15.221 +#ifdef XENDEBUG_LOW
  15.222 +void xen_dbglow_init(void);
  15.223 +#endif
  15.224 +
  15.225 +#ifndef BEEP_ONHALT_COUNT
  15.226 +#define BEEP_ONHALT_COUNT 3
  15.227 +#endif
  15.228 +#ifndef BEEP_ONHALT_PITCH
  15.229 +#define BEEP_ONHALT_PITCH 1500
  15.230 +#endif
  15.231 +#ifndef BEEP_ONHALT_PERIOD
  15.232 +#define BEEP_ONHALT_PERIOD 250
  15.233 +#endif
  15.234 +
  15.235 +/* the following is used externally (sysctl_hw) */
  15.236 +char machine[] = "i386";		/* CPU "architecture" */
  15.237 +char machine_arch[] = "i386";		/* machine == machine_arch */
  15.238 +
  15.239 +char bootinfo[BOOTINFO_MAXSIZE];
  15.240 +
  15.241 +struct bi_devmatch *i386_alldisks = NULL;
  15.242 +int i386_ndisks = 0;
  15.243 +
  15.244 +#ifdef CPURESET_DELAY
  15.245 +int	cpureset_delay = CPURESET_DELAY;
  15.246 +#else
  15.247 +int     cpureset_delay = 2000; /* default to 2s */
  15.248 +#endif
  15.249 +
  15.250 +#ifdef MTRR
  15.251 +struct mtrr_funcs *mtrr_funcs;
  15.252 +#endif
  15.253 +
  15.254 +#ifdef COMPAT_NOMID
  15.255 +static int exec_nomid(struct proc *, struct exec_package *);
  15.256 +#endif
  15.257 +
  15.258 +int	physmem;
  15.259 +int	dumpmem_low;
  15.260 +int	dumpmem_high;
  15.261 +unsigned int cpu_feature;
  15.262 +int	cpu_class;
  15.263 +int	i386_fpu_present;
  15.264 +int	i386_fpu_exception;
  15.265 +int	i386_fpu_fdivbug;
  15.266 +
  15.267 +int	i386_use_fxsave;
  15.268 +int	i386_has_sse;
  15.269 +int	i386_has_sse2;
  15.270 +
  15.271 +int	tmx86_has_longrun;
  15.272 +
  15.273 +vaddr_t	msgbuf_vaddr;
  15.274 +paddr_t msgbuf_paddr;
  15.275 +
  15.276 +vaddr_t	idt_vaddr;
  15.277 +paddr_t	idt_paddr;
  15.278 +
  15.279 +#ifdef I586_CPU
  15.280 +vaddr_t	pentium_idt_vaddr;
  15.281 +#endif
  15.282 +
  15.283 +struct vm_map *exec_map = NULL;
  15.284 +struct vm_map *mb_map = NULL;
  15.285 +struct vm_map *phys_map = NULL;
  15.286 +
  15.287 +extern	paddr_t avail_start, avail_end;
  15.288 +extern	paddr_t pmap_pa_start, pmap_pa_end;
  15.289 +
  15.290 +#ifdef ISA_CLOCK
  15.291 +void (*delay_func)(int) = i8254_delay;
  15.292 +void (*microtime_func)(struct timeval *) = i8254_microtime;
  15.293 +void (*initclock_func)(void) = i8254_initclocks;
  15.294 +#else
  15.295 +void (*delay_func)(int) = xen_delay;
  15.296 +void (*microtime_func)(struct timeval *) = xen_microtime;
  15.297 +void (*initclock_func)(void) = xen_initclocks;
  15.298 +#endif
  15.299 +
  15.300 +void hypervisor_callback(void);
  15.301 +void failsafe_callback(void);
  15.302 +
  15.303 +/*
  15.304 + * Size of memory segments, before any memory is stolen.
  15.305 + */
  15.306 +phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
  15.307 +int	mem_cluster_cnt;
  15.308 +
  15.309 +int	cpu_dump(void);
  15.310 +int	cpu_dumpsize(void);
  15.311 +u_long	cpu_dump_mempagecnt(void);
  15.312 +void	dumpsys(void);
  15.313 +void	init386(paddr_t);
  15.314 +void	initgdt(void);
  15.315 +
  15.316 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
  15.317 +void	add_mem_cluster(u_int64_t, u_int64_t, u_int32_t);
  15.318 +#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
  15.319 +
  15.320 +extern int time_adjusted;
  15.321 +
  15.322 +/*
  15.323 + * Machine-dependent startup code
  15.324 + */
  15.325 +void
  15.326 +cpu_startup()
  15.327 +{
  15.328 +	int x;
  15.329 +	vaddr_t minaddr, maxaddr;
  15.330 +	char pbuf[9];
  15.331 +
  15.332 +	/*
  15.333 +	 * Initialize error message buffer (at end of core).
  15.334 +	 */
  15.335 +	msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE));
  15.336 +	if (msgbuf_vaddr == 0)
  15.337 +		panic("failed to valloc msgbuf_vaddr");
  15.338 +
  15.339 +	/* msgbuf_paddr was init'd in pmap */
  15.340 +	for (x = 0; x < btoc(MSGBUFSIZE); x++)
  15.341 +		pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE,
  15.342 +		    msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE);
  15.343 +	pmap_update(pmap_kernel());
  15.344 +
  15.345 +	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
  15.346 +
  15.347 +	printf("%s", version);
  15.348 +
  15.349 +#ifdef TRAPLOG
  15.350 +	/*
  15.351 +	 * Enable recording of branch from/to in MSR's
  15.352 +	 */
  15.353 +	wrmsr(MSR_DEBUGCTLMSR, 0x1);
  15.354 +#endif
  15.355 +
  15.356 +	format_bytes(pbuf, sizeof(pbuf), ptoa(physmem));
  15.357 +	printf("total memory = %s\n", pbuf);
  15.358 +
  15.359 +	minaddr = 0;
  15.360 +
  15.361 +	/*
  15.362 +	 * Allocate a submap for exec arguments.  This map effectively
  15.363 +	 * limits the number of processes exec'ing at any time.
  15.364 +	 */
  15.365 +	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
  15.366 +				   16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
  15.367 +
  15.368 +	/*
  15.369 +	 * Allocate a submap for physio
  15.370 +	 */
  15.371 +	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
  15.372 +				   VM_PHYS_SIZE, 0, FALSE, NULL);
  15.373 +
  15.374 +	/*
  15.375 +	 * Finally, allocate mbuf cluster submap.
  15.376 +	 */
  15.377 +	mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
  15.378 +	    nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL);
  15.379 +
  15.380 +	format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free));
  15.381 +	printf("avail memory = %s\n", pbuf);
  15.382 +
  15.383 +	/* Safe for i/o port / memory space allocation to use malloc now. */
  15.384 +	x86_bus_space_mallocok();
  15.385 +}
  15.386 +
  15.387 +/*
  15.388 + * Set up proc0's TSS and LDT.
  15.389 + */
  15.390 +void
  15.391 +i386_proc0_tss_ldt_init()
  15.392 +{
  15.393 +	struct pcb *pcb;
  15.394 +	int x;
  15.395 +
  15.396 +	gdt_init();
  15.397 +
  15.398 +	cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;
  15.399 +
  15.400 +	pcb->pcb_tss.tss_ioopt =
  15.401 +	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
  15.402 +		| SEL_KPL;		/* i/o pl */
  15.403 +
  15.404 +	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
  15.405 +		pcb->pcb_iomap[x] = 0xffffffff;
  15.406 +
  15.407 +	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
  15.408 +	pcb->pcb_cr0 = rcr0();
  15.409 +	pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
  15.410 +	pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16;
  15.411 +	lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
  15.412 +	lwp0.l_md.md_tss_sel = tss_alloc(pcb);
  15.413 +
  15.414 +#ifndef XEN
  15.415 +	ltr(lwp0.l_md.md_tss_sel);
  15.416 +	lldt(pcb->pcb_ldt_sel);
  15.417 +#else
  15.418 +	HYPERVISOR_fpu_taskswitch();
  15.419 +	XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
  15.420 +		      (void *)pcb->pcb_tss.tss_esp0,
  15.421 +		      pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0)));
  15.422 +	HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0);
  15.423 +#endif
  15.424 +}
  15.425 +
  15.426 +/*
  15.427 + * Set up TSS and LDT for a new PCB.
  15.428 + */
  15.429 +
  15.430 +void
  15.431 +i386_init_pcb_tss_ldt(struct cpu_info *ci)
  15.432 +{
  15.433 +	int x;
  15.434 +	struct pcb *pcb = ci->ci_idle_pcb;
  15.435 +
  15.436 +	pcb->pcb_tss.tss_ioopt =
  15.437 +	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
  15.438 +		| SEL_KPL;		/* i/o pl */
  15.439 +	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
  15.440 +		pcb->pcb_iomap[x] = 0xffffffff;
  15.441 +
  15.442 +	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
  15.443 +	pcb->pcb_cr0 = rcr0();
  15.444 +
  15.445 +	ci->ci_idle_tss_sel = tss_alloc(pcb);
  15.446 +}
  15.447 +
  15.448 +/*
  15.449 + * Switch context:
  15.450 + * - honor CR0_TS in saved CR0 and request DNA exception on FPU use
  15.451 + * - switch stack pointer for user->kernel transition
  15.452 + */
  15.453 +void
  15.454 +i386_switch_context(struct pcb *new)
  15.455 +{
  15.456 +	dom0_op_t op;
  15.457 +	struct cpu_info *ci;
  15.458 +
  15.459 +	ci = curcpu();
  15.460 +	if (ci->ci_fpused) {
  15.461 +		HYPERVISOR_fpu_taskswitch();
  15.462 +		ci->ci_fpused = 0;
  15.463 +	}
  15.464 +
  15.465 +	HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0);
  15.466 +
  15.467 +	if (xen_start_info.flags & SIF_PRIVILEGED) {
  15.468 +		op.cmd = DOM0_IOPL;
  15.469 +		op.u.iopl.domain = DOMID_SELF;
  15.470 +		op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */
  15.471 +		HYPERVISOR_dom0_op(&op);
  15.472 +	}
  15.473 +}
  15.474 +
  15.475 +/*
  15.476 + * sysctl helper routine for machdep.tm* nodes.
  15.477 + */
  15.478 +static int
  15.479 +sysctl_machdep_tm_longrun(SYSCTLFN_ARGS)
  15.480 +{
  15.481 +	struct sysctlnode node;
  15.482 +	int io, error;
  15.483 +
  15.484 +	if (!tmx86_has_longrun)
  15.485 +		return (EOPNOTSUPP);
  15.486 +
  15.487 +	node = *rnode;
  15.488 +	node.sysctl_data = &io;
  15.489 +
  15.490 +	switch (rnode->sysctl_num) {
  15.491 +	case CPU_TMLR_MODE:
  15.492 +		io = (int)(crusoe_longrun = tmx86_get_longrun_mode());
  15.493 +		break;
  15.494 +	case CPU_TMLR_FREQUENCY:
  15.495 +		tmx86_get_longrun_status_all();
  15.496 +		io = crusoe_frequency;
  15.497 +		break;
  15.498 +	case CPU_TMLR_VOLTAGE:
  15.499 +		tmx86_get_longrun_status_all();
  15.500 +		io = crusoe_voltage;
  15.501 +		break;
  15.502 +	case CPU_TMLR_PERCENTAGE:
  15.503 +		tmx86_get_longrun_status_all();
  15.504 +		io = crusoe_percentage;
  15.505 +		break;
  15.506 +	default:
  15.507 +		return (EOPNOTSUPP);
  15.508 +	}
  15.509 +
  15.510 +	error = sysctl_lookup(SYSCTLFN_CALL(&node));
  15.511 +	if (error || newp == NULL)
  15.512 +		return (error);
  15.513 +
  15.514 +	if (rnode->sysctl_num == CPU_TMLR_MODE) {
  15.515 +		if (tmx86_set_longrun_mode(io))
  15.516 +			crusoe_longrun = (u_int)io;
  15.517 +		else
  15.518 +			return (EINVAL);
  15.519 +	}
  15.520 +
  15.521 +	return (0);
  15.522 +}
  15.523 +
  15.524 +/*
  15.525 + * sysctl helper routine for machdep.booted_kernel
  15.526 + */
  15.527 +static int
  15.528 +sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
  15.529 +{
  15.530 +	struct btinfo_bootpath *bibp;
  15.531 +	struct sysctlnode node;
  15.532 +
  15.533 +	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
  15.534 +	if(!bibp)
  15.535 +		return(ENOENT); /* ??? */
  15.536 +
  15.537 +	node = *rnode;
  15.538 +	node.sysctl_data = bibp->bootpath;
  15.539 +	node.sysctl_size = sizeof(bibp->bootpath);
  15.540 +	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
  15.541 +}
  15.542 +
  15.543 +/*
  15.544 + * sysctl helper routine for machdep.diskinfo
  15.545 + */
  15.546 +static int
  15.547 +sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
  15.548 +{
  15.549 +	struct sysctlnode node;
  15.550 +
  15.551 +	node = *rnode;
  15.552 +	node.sysctl_data = i386_alldisks;
  15.553 +	node.sysctl_size = sizeof(struct disklist) +
  15.554 +	    (i386_ndisks - 1) * sizeof(struct nativedisk_info);
  15.555 +        return (sysctl_lookup(SYSCTLFN_CALL(&node)));
  15.556 +}
  15.557 +
  15.558 +/*
  15.559 + * machine dependent system variables.
  15.560 + */
  15.561 +SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
  15.562 +{
  15.563 +
  15.564 +	sysctl_createv(clog, 0, NULL, NULL,
  15.565 +		       CTLFLAG_PERMANENT,
  15.566 +		       CTLTYPE_NODE, "machdep", NULL,
  15.567 +		       NULL, 0, NULL, 0,
  15.568 +		       CTL_MACHDEP, CTL_EOL);
  15.569 +
  15.570 +	sysctl_createv(clog, 0, NULL, NULL,
  15.571 +		       CTLFLAG_PERMANENT,
  15.572 +		       CTLTYPE_STRUCT, "console_device", NULL,
  15.573 +		       sysctl_consdev, 0, NULL, sizeof(dev_t),
  15.574 +		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
  15.575 +	sysctl_createv(clog, 0, NULL, NULL,
  15.576 +		       CTLFLAG_PERMANENT,
  15.577 +		       CTLTYPE_INT, "biosbasemem", NULL,
  15.578 +		       NULL, 0, &biosbasemem, 0,
  15.579 +		       CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL);
  15.580 +	sysctl_createv(clog, 0, NULL, NULL,
  15.581 +		       CTLFLAG_PERMANENT,
  15.582 +		       CTLTYPE_INT, "biosextmem", NULL,
  15.583 +		       NULL, 0, &biosextmem, 0,
  15.584 +		       CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
  15.585 +	sysctl_createv(clog, 0, NULL, NULL,
  15.586 +		       CTLFLAG_PERMANENT,
  15.587 +		       CTLTYPE_INT, "nkpde", NULL,
  15.588 +		       NULL, 0, &nkpde, 0,
  15.589 +		       CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
  15.590 +	sysctl_createv(clog, 0, NULL, NULL,
  15.591 +		       CTLFLAG_PERMANENT,
  15.592 +		       CTLTYPE_STRING, "booted_kernel", NULL,
  15.593 +		       sysctl_machdep_booted_kernel, 0, NULL, 0,
  15.594 +		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
  15.595 +	sysctl_createv(clog, 0, NULL, NULL,
  15.596 +		       CTLFLAG_PERMANENT,
  15.597 +		       CTLTYPE_STRUCT, "diskinfo", NULL,
  15.598 +		       sysctl_machdep_diskinfo, 0, NULL, 0,
  15.599 +		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
  15.600 +	sysctl_createv(clog, 0, NULL, NULL,
  15.601 +		       CTLFLAG_PERMANENT,
  15.602 +		       CTLTYPE_INT, "fpu_present", NULL,
  15.603 +		       NULL, 0, &i386_fpu_present, 0,
  15.604 +		       CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
  15.605 +	sysctl_createv(clog, 0, NULL, NULL,
  15.606 +		       CTLFLAG_PERMANENT,
  15.607 +		       CTLTYPE_INT, "osfxsr", NULL,
  15.608 +		       NULL, 0, &i386_use_fxsave, 0,
  15.609 +		       CTL_MACHDEP, CPU_OSFXSR, CTL_EOL);
  15.610 +	sysctl_createv(clog, 0, NULL, NULL,
  15.611 +		       CTLFLAG_PERMANENT,
  15.612 +		       CTLTYPE_INT, "sse", NULL,
  15.613 +		       NULL, 0, &i386_has_sse, 0,
  15.614 +		       CTL_MACHDEP, CPU_SSE, CTL_EOL);
  15.615 +	sysctl_createv(clog, 0, NULL, NULL,
  15.616 +		       CTLFLAG_PERMANENT,
  15.617 +		       CTLTYPE_INT, "sse2", NULL,
  15.618 +		       NULL, 0, &i386_has_sse2, 0,
  15.619 +		       CTL_MACHDEP, CPU_SSE2, CTL_EOL);
  15.620 +	sysctl_createv(clog, 0, NULL, NULL,
  15.621 +		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
  15.622 +		       CTLTYPE_INT, "tm_longrun_mode", NULL,
  15.623 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
  15.624 +		       CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL);
  15.625 +	sysctl_createv(clog, 0, NULL, NULL,
  15.626 +		       CTLFLAG_PERMANENT,
  15.627 +		       CTLTYPE_INT, "tm_longrun_frequency", NULL,
  15.628 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
  15.629 +		       CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL);
  15.630 +	sysctl_createv(clog, 0, NULL, NULL,
  15.631 +		       CTLFLAG_PERMANENT,
  15.632 +		       CTLTYPE_INT, "tm_longrun_voltage", NULL,
  15.633 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
  15.634 +		       CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL);
  15.635 +	sysctl_createv(clog, 0, NULL, NULL,
  15.636 +		       CTLFLAG_PERMANENT,
  15.637 +		       CTLTYPE_INT, "tm_longrun_percentage", NULL,
  15.638 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
  15.639 +		       CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL);
  15.640 +}
  15.641 +
  15.642 +void *
  15.643 +getframe(struct lwp *l, int sig, int *onstack)
  15.644 +{
  15.645 +	struct proc *p = l->l_proc;
  15.646 +	struct sigctx *ctx = &p->p_sigctx;
  15.647 +	struct trapframe *tf = l->l_md.md_regs;
  15.648 +
  15.649 +	/* Do we need to jump onto the signal stack? */
  15.650 +	*onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
  15.651 +	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
  15.652 +	if (*onstack)
  15.653 +		return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size;
  15.654 +#ifdef VM86
  15.655 +	if (tf->tf_eflags & PSL_VM)
  15.656 +		return (void *)(tf->tf_esp + (tf->tf_ss << 4));
  15.657 +	else
  15.658 +#endif
  15.659 +		return (void *)tf->tf_esp;
  15.660 +}
  15.661 +
  15.662 +/*
  15.663 + * Build context to run handler in.  We invoke the handler
  15.664 + * directly, only returning via the trampoline.  Note the
  15.665 + * trampoline version numbers are coordinated with machine-
  15.666 + * dependent code in libc.
  15.667 + */
  15.668 +void
  15.669 +buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
  15.670 +{
  15.671 +	struct trapframe *tf = l->l_md.md_regs;
  15.672 +
  15.673 +	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
  15.674 +	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
  15.675 +	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
  15.676 +	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
  15.677 +	tf->tf_eip = (int)catcher;
  15.678 +	tf->tf_cs = GSEL(sel, SEL_UPL);
  15.679 +	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
  15.680 +	tf->tf_esp = (int)fp;
  15.681 +	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
  15.682 +}
  15.683 +
  15.684 +static void
  15.685 +sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
  15.686 +{
  15.687 +	struct lwp *l = curlwp;
  15.688 +	struct proc *p = l->l_proc;
  15.689 +	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
  15.690 +	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
  15.691 +	    GUCODEBIG_SEL : GUCODE_SEL;
  15.692 +	struct sigacts *ps = p->p_sigacts;
  15.693 +	int onstack;
  15.694 +	int sig = ksi->ksi_signo;
  15.695 +	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
  15.696 +	sig_t catcher = SIGACTION(p, sig).sa_handler;
  15.697 +	struct trapframe *tf = l->l_md.md_regs;
  15.698 +
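          +	/*
          +	 * getframe() returned the top of the stack the handler will
          +	 * run on; stepping the frame pointer back one slot reserves
          +	 * room for the sigframe_siginfo copied out below.
          +	 */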
  15.699 +	fp--;
  15.700 +
  15.701 +	/* Build stack frame for signal trampoline. */
  15.702 +	switch (ps->sa_sigdesc[sig].sd_vers) {
  15.703 +	case 0:		/* handled by sendsig_sigcontext */
  15.704 +	case 1:		/* handled by sendsig_sigcontext */
  15.705 +	default:	/* unknown version */
   15.706 +		printf("sendsig_siginfo: bad version %d\n",
  15.707 +		    ps->sa_sigdesc[sig].sd_vers);
  15.708 +		sigexit(l, SIGILL);
  15.709 +	case 2:
  15.710 +		break;
  15.711 +	}
  15.712 +
  15.713 +	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
  15.714 +	frame.sf_signum = sig;
  15.715 +	frame.sf_sip = &fp->sf_si;
  15.716 +	frame.sf_ucp = &fp->sf_uc;
  15.717 +	frame.sf_si._info = ksi->ksi_info;
  15.718 +	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
  15.719 +	frame.sf_uc.uc_sigmask = *mask;
  15.720 +	frame.sf_uc.uc_link = NULL;
  15.721 +	frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK)
  15.722 +	    ? _UC_SETSTACK : _UC_CLRSTACK;
  15.723 +	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
  15.724 +	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
  15.725 +
  15.726 +	if (tf->tf_eflags & PSL_VM)
  15.727 +		(*p->p_emul->e_syscall_intern)(p);
  15.728 +
  15.729 +	if (copyout(&frame, fp, sizeof(frame)) != 0) {
  15.730 +		/*
  15.731 +		 * Process has trashed its stack; give it an illegal
  15.732 +		 * instruction to halt it in its tracks.
  15.733 +		 */
  15.734 +		sigexit(l, SIGILL);
  15.735 +		/* NOTREACHED */
  15.736 +	}
  15.737 +
  15.738 +	buildcontext(l, sel, catcher, fp);
  15.739 +
  15.740 +	/* Remember that we're now on the signal stack. */
  15.741 +	if (onstack)
  15.742 +		p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
  15.743 +}
  15.744 +
  15.745 +void
  15.746 +sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
  15.747 +{
  15.748 +#ifdef COMPAT_16
  15.749 +	if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2)
  15.750 +		sendsig_sigcontext(ksi, mask);
  15.751 +	else
  15.752 +#endif
  15.753 +		sendsig_siginfo(ksi, mask);
  15.754 +}
  15.755 +
  15.756 +void
  15.757 +cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
  15.758 +    void *ap, void *sp, sa_upcall_t upcall)
  15.759 +{
  15.760 +	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
  15.761 +	struct saframe *sf, frame;
  15.762 +	struct trapframe *tf;
  15.763 +
  15.764 +	tf = l->l_md.md_regs;
  15.765 +
   15.766 +	/* Build the upcall frame, then copy it out below. */
  15.767 +	frame.sa_type = type;
  15.768 +	frame.sa_sas = sas;
  15.769 +	frame.sa_events = nevents;
  15.770 +	frame.sa_interrupted = ninterrupted;
  15.771 +	frame.sa_arg = ap;
  15.772 +	frame.sa_ra = 0;
  15.773 +
  15.774 +	sf = (struct saframe *)sp - 1;
  15.775 +	if (copyout(&frame, sf, sizeof(frame)) != 0) {
  15.776 +		/* Copying onto the stack didn't work. Die. */
  15.777 +		sigexit(l, SIGILL);
  15.778 +		/* NOTREACHED */
  15.779 +	}
  15.780 +
  15.781 +	tf->tf_eip = (int) upcall;
  15.782 +	tf->tf_esp = (int) sf;
  15.783 +	tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */
  15.784 +	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
  15.785 +	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
  15.786 +	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
  15.787 +	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
  15.788 +	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
  15.789 +	    GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL);
  15.790 +	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
  15.791 +	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
  15.792 +}
  15.793 +
  15.794 +int	waittime = -1;
  15.795 +struct pcb dumppcb;
  15.796 +
  15.797 +void
  15.798 +cpu_reboot(int howto, char *bootstr)
  15.799 +{
  15.800 +
  15.801 +	if (cold) {
  15.802 +		howto |= RB_HALT;
  15.803 +		goto haltsys;
  15.804 +	}
  15.805 +
  15.806 +	boothowto = howto;
  15.807 +	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
  15.808 +		waittime = 0;
  15.809 +		vfs_shutdown();
  15.810 +		/*
  15.811 +		 * If we've been adjusting the clock, the todr
  15.812 +		 * will be out of synch; adjust it now.
  15.813 +		 */
  15.814 +		if (time_adjusted != 0)
  15.815 +			resettodr();
  15.816 +	}
  15.817 +
  15.818 +	/* Disable interrupts. */
  15.819 +	splhigh();
  15.820 +
  15.821 +	/* Do a dump if requested. */
  15.822 +	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
  15.823 +		dumpsys();
  15.824 +
  15.825 +haltsys:
  15.826 +	doshutdownhooks();
  15.827 +
  15.828 +#ifdef MULTIPROCESSOR
  15.829 +	x86_broadcast_ipi(X86_IPI_HALT);
  15.830 +#endif
  15.831 +
  15.832 +	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
  15.833 +#if NACPI > 0
  15.834 +		if (acpi_softc != NULL) {
  15.835 +			delay(500000);
  15.836 +			acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5);
  15.837 +			printf("WARNING: ACPI powerdown failed!\n");
  15.838 +		}
  15.839 +#endif
  15.840 +#if NAPM > 0 && !defined(APM_NO_POWEROFF)
  15.841 +		/* turn off, if we can.  But try to turn disk off and
  15.842 +		 * wait a bit first--some disk drives are slow to clean up
  15.843 +		 * and users have reported disk corruption.
  15.844 +		 */
  15.845 +		delay(500000);
  15.846 +		apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF);
  15.847 +		delay(500000);
  15.848 +		apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF);
  15.849 +		printf("WARNING: APM powerdown failed!\n");
  15.850 +		/*
  15.851 +		 * RB_POWERDOWN implies RB_HALT... fall into it...
  15.852 +		 */
  15.853 +#endif
  15.854 +		HYPERVISOR_shutdown();
  15.855 +	}
  15.856 +
  15.857 +	if (howto & RB_HALT) {
  15.858 +		printf("\n");
  15.859 +		printf("The operating system has halted.\n");
  15.860 +		printf("Please press any key to reboot.\n\n");
  15.861 +
  15.862 +#ifdef BEEP_ONHALT
  15.863 +		{
  15.864 +			int c;
  15.865 +			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
  15.866 +				sysbeep(BEEP_ONHALT_PITCH,
  15.867 +				        BEEP_ONHALT_PERIOD * hz / 1000);
  15.868 +				delay(BEEP_ONHALT_PERIOD * 1000);
  15.869 +				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
  15.870 +				delay(BEEP_ONHALT_PERIOD * 1000);
  15.871 +			}
  15.872 +		}
  15.873 +#endif
  15.874 +
  15.875 +		cnpollc(1);	/* for proper keyboard command handling */
  15.876 +		if (cngetc() == 0) {
  15.877 +			/* no console attached, so just hlt */
  15.878 +			for(;;) {
  15.879 +				__asm __volatile("hlt");
  15.880 +			}
  15.881 +		}
  15.882 +		cnpollc(0);
  15.883 +	}
  15.884 +
  15.885 +	printf("rebooting...\n");
  15.886 +	if (cpureset_delay > 0)
  15.887 +		delay(cpureset_delay * 1000);
  15.888 +	cpu_reset();
  15.889 +	for(;;) ;
  15.890 +	/*NOTREACHED*/
  15.891 +}
  15.892 +
  15.893 +/*
  15.894 + * These variables are needed by /sbin/savecore
  15.895 + */
  15.896 +u_int32_t dumpmag = 0x8fca0101;	/* magic number */
  15.897 +int 	dumpsize = 0;		/* pages */
  15.898 +long	dumplo = 0; 		/* blocks */
  15.899 +
  15.900 +/*
  15.901 + * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
  15.902 + */
  15.903 +int
  15.904 +cpu_dumpsize()
  15.905 +{
  15.906 +	int size;
  15.907 +
  15.908 +	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
  15.909 +	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
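          +	/*
          +	 * The MD headers must fit in a single disk block (dbtob(1)
          +	 * bytes, typically 512): the dump layout below assumes a
          +	 * one-block header, so anything larger is an error.
          +	 */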
  15.910 +	if (roundup(size, dbtob(1)) != dbtob(1))
  15.911 +		return (-1);
  15.912 +
  15.913 +	return (1);
  15.914 +}
  15.915 +
  15.916 +/*
  15.917 + * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
  15.918 + */
  15.919 +u_long
  15.920 +cpu_dump_mempagecnt()
  15.921 +{
  15.922 +	u_long i, n;
  15.923 +
  15.924 +	n = 0;
  15.925 +	for (i = 0; i < mem_cluster_cnt; i++)
  15.926 +		n += atop(mem_clusters[i].size);
  15.927 +	return (n);
  15.928 +}
  15.929 +
  15.930 +/*
  15.931 + * cpu_dump: dump the machine-dependent kernel core dump headers.
  15.932 + */
  15.933 +int
  15.934 +cpu_dump()
  15.935 +{
  15.936 +	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
  15.937 +	char buf[dbtob(1)];
  15.938 +	kcore_seg_t *segp;
  15.939 +	cpu_kcore_hdr_t *cpuhdrp;
  15.940 +	phys_ram_seg_t *memsegp;
  15.941 +	const struct bdevsw *bdev;
  15.942 +	int i;
  15.943 +
  15.944 +	bdev = bdevsw_lookup(dumpdev);
  15.945 +	if (bdev == NULL)
  15.946 +		return (ENXIO);
  15.947 +	dump = bdev->d_dump;
  15.948 +
  15.949 +	memset(buf, 0, sizeof buf);
  15.950 +	segp = (kcore_seg_t *)buf;
  15.951 +	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
  15.952 +	memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) +
  15.953 +	    ALIGN(sizeof(*cpuhdrp))];
  15.954 +
  15.955 +	/*
  15.956 +	 * Generate a segment header.
  15.957 +	 */
  15.958 +	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
  15.959 +	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
  15.960 +
  15.961 +	/*
  15.962 +	 * Add the machine-dependent header info.
  15.963 +	 */
  15.964 +	cpuhdrp->ptdpaddr = PTDpaddr;
  15.965 +	cpuhdrp->nmemsegs = mem_cluster_cnt;
  15.966 +
  15.967 +	/*
  15.968 +	 * Fill in the memory segment descriptors.
  15.969 +	 */
  15.970 +	for (i = 0; i < mem_cluster_cnt; i++) {
  15.971 +		memsegp[i].start = mem_clusters[i].start;
  15.972 +		memsegp[i].size = mem_clusters[i].size;
  15.973 +	}
  15.974 +
  15.975 +	return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1)));
  15.976 +}
  15.977 +
  15.978 +/*
  15.979 + * This is called by main to set dumplo and dumpsize.
  15.980 + * Dumps always skip the first PAGE_SIZE of disk space
  15.981 + * in case there might be a disk label stored there.
  15.982 + * If there is extra space, put dump at the end to
  15.983 + * reduce the chance that swapping trashes it.
  15.984 + */
  15.985 +void
  15.986 +cpu_dumpconf()
  15.987 +{
  15.988 +	const struct bdevsw *bdev;
  15.989 +	int nblks, dumpblks;	/* size of dump area */
  15.990 +
  15.991 +	if (dumpdev == NODEV)
  15.992 +		goto bad;
  15.993 +	bdev = bdevsw_lookup(dumpdev);
  15.994 +	if (bdev == NULL)
  15.995 +		panic("dumpconf: bad dumpdev=0x%x", dumpdev);
  15.996 +	if (bdev->d_psize == NULL)
  15.997 +		goto bad;
  15.998 +	nblks = (*bdev->d_psize)(dumpdev);
  15.999 +	if (nblks <= ctod(1))
 15.1000 +		goto bad;
 15.1001 +
 15.1002 +	dumpblks = cpu_dumpsize();
 15.1003 +	if (dumpblks < 0)
 15.1004 +		goto bad;
 15.1005 +	dumpblks += ctod(cpu_dump_mempagecnt());
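          +	/*
          +	 * ctod() converts pages to disk blocks; e.g. assuming 4 KB
          +	 * pages and 512-byte blocks, 128 MB of RAM is 32768 pages,
          +	 * so dumpblks becomes 1 header block + 262144 data blocks.
          +	 */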
 15.1006 +
 15.1007 +	/* If dump won't fit (incl. room for possible label), punt. */
 15.1008 +	if (dumpblks > (nblks - ctod(1)))
 15.1009 +		goto bad;
 15.1010 +
 15.1011 +	/* Put dump at end of partition */
 15.1012 +	dumplo = nblks - dumpblks;
 15.1013 +
 15.1014 +	/* dumpsize is in page units, and doesn't include headers. */
 15.1015 +	dumpsize = cpu_dump_mempagecnt();
 15.1016 +	return;
 15.1017 +
 15.1018 + bad:
 15.1019 +	dumpsize = 0;
 15.1020 +}
 15.1021 +
 15.1022 +/*
 15.1023 + * Doadump comes here after turning off memory management and
 15.1024 + * getting on the dump stack, either when called above, or by
 15.1025 + * the auto-restart code.
 15.1026 + */
 15.1027 +#define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
 15.1028 +static vaddr_t dumpspace;
 15.1029 +
 15.1030 +vaddr_t
 15.1031 +reserve_dumppages(vaddr_t p)
 15.1032 +{
 15.1033 +
 15.1034 +	dumpspace = p;
 15.1035 +	return (p + BYTES_PER_DUMP);
 15.1036 +}
 15.1037 +
 15.1038 +void
 15.1039 +dumpsys()
 15.1040 +{
 15.1041 +	u_long totalbytesleft, bytes, i, n, memseg;
 15.1042 +	u_long maddr;
 15.1043 +	int psize;
 15.1044 +	daddr_t blkno;
 15.1045 +	const struct bdevsw *bdev;
 15.1046 +	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
 15.1047 +	int error;
 15.1048 +
 15.1049 +	/* Save registers. */
 15.1050 +	savectx(&dumppcb);
 15.1051 +
 15.1052 +	if (dumpdev == NODEV)
 15.1053 +		return;
 15.1054 +
 15.1055 +	bdev = bdevsw_lookup(dumpdev);
 15.1056 +	if (bdev == NULL || bdev->d_psize == NULL)
 15.1057 +		return;
 15.1058 +
  15.1059 +	/*
  15.1060 +	 * For dumps during autoconfiguration, the dump size may not
  15.1061 +	 * have been computed yet; do it now if necessary.
  15.1062 +	 */
 15.1063 +	if (dumpsize == 0)
 15.1064 +		cpu_dumpconf();
 15.1065 +	if (dumplo <= 0 || dumpsize == 0) {
 15.1066 +		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
 15.1067 +		    minor(dumpdev));
 15.1068 +		return;
 15.1069 +	}
 15.1070 +	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
 15.1071 +	    minor(dumpdev), dumplo);
 15.1072 +
 15.1073 +	psize = (*bdev->d_psize)(dumpdev);
 15.1074 +	printf("dump ");
 15.1075 +	if (psize == -1) {
 15.1076 +		printf("area unavailable\n");
 15.1077 +		return;
 15.1078 +	}
 15.1079 +
 15.1080 +#if 0	/* XXX this doesn't work.  grr. */
  15.1081 +	/* toss any characters present prior to dump */
  15.1082 +	while (sget() != NULL); /* syscons and pccons differ */
 15.1083 +#endif
 15.1084 +
 15.1085 +	if ((error = cpu_dump()) != 0)
 15.1086 +		goto err;
 15.1087 +
 15.1088 +	totalbytesleft = ptoa(cpu_dump_mempagecnt());
 15.1089 +	blkno = dumplo + cpu_dumpsize();
 15.1090 +	dump = bdev->d_dump;
 15.1091 +	error = 0;
 15.1092 +
 15.1093 +	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
 15.1094 +		maddr = mem_clusters[memseg].start;
 15.1095 +		bytes = mem_clusters[memseg].size;
 15.1096 +
 15.1097 +		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
 15.1098 +			/* Print out how many MBs we have left to go. */
 15.1099 +			if ((totalbytesleft % (1024*1024)) == 0)
 15.1100 +				printf("%ld ", totalbytesleft / (1024 * 1024));
 15.1101 +
 15.1102 +			/* Limit size for next transfer. */
 15.1103 +			n = bytes - i;
 15.1104 +			if (n > BYTES_PER_DUMP)
 15.1105 +				n = BYTES_PER_DUMP;
 15.1106 +
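          +			/*
          +			 * Map the next physical chunk read-only at the
          +			 * reserved dumpspace window, then hand that
          +			 * virtual address to the driver's dump routine.
          +			 */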
 15.1107 +			(void) pmap_map(dumpspace, maddr, maddr + n,
 15.1108 +			    VM_PROT_READ);
 15.1109 +
 15.1110 +			error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n);
 15.1111 +			if (error)
 15.1112 +				goto err;
 15.1113 +			maddr += n;
 15.1114 +			blkno += btodb(n);		/* XXX? */
 15.1115 +
 15.1116 +#if 0	/* XXX this doesn't work.  grr. */
 15.1117 +			/* operator aborting dump? */
 15.1118 +			if (sget() != NULL) {
 15.1119 +				error = EINTR;
 15.1120 +				break;
 15.1121 +			}
 15.1122 +#endif
 15.1123 +		}
 15.1124 +	}
 15.1125 +
 15.1126 + err:
 15.1127 +	switch (error) {
 15.1128 +
 15.1129 +	case ENXIO:
 15.1130 +		printf("device bad\n");
 15.1131 +		break;
 15.1132 +
 15.1133 +	case EFAULT:
 15.1134 +		printf("device not ready\n");
 15.1135 +		break;
 15.1136 +
 15.1137 +	case EINVAL:
 15.1138 +		printf("area improper\n");
 15.1139 +		break;
 15.1140 +
 15.1141 +	case EIO:
 15.1142 +		printf("i/o error\n");
 15.1143 +		break;
 15.1144 +
 15.1145 +	case EINTR:
 15.1146 +		printf("aborted from console\n");
 15.1147 +		break;
 15.1148 +
 15.1149 +	case 0:
 15.1150 +		printf("succeeded\n");
 15.1151 +		break;
 15.1152 +
 15.1153 +	default:
 15.1154 +		printf("error %d\n", error);
 15.1155 +		break;
 15.1156 +	}
 15.1157 +	printf("\n\n");
 15.1158 +	delay(5000000);		/* 5 seconds */
 15.1159 +}
 15.1160 +
 15.1161 +/*
 15.1162 + * Clear registers on exec
 15.1163 + */
 15.1164 +void
 15.1165 +setregs(struct lwp *l, struct exec_package *pack, u_long stack)
 15.1166 +{
 15.1167 +	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
 15.1168 +	struct pcb *pcb = &l->l_addr->u_pcb;
 15.1169 +	struct trapframe *tf;
 15.1170 +
 15.1171 +#if NNPX > 0
 15.1172 +	/* If we were using the FPU, forget about it. */
 15.1173 +	if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
 15.1174 +		npxsave_lwp(l, 0);
 15.1175 +#endif
 15.1176 +
 15.1177 +#ifdef USER_LDT
 15.1178 +	pmap_ldt_cleanup(l);
 15.1179 +#endif
 15.1180 +
 15.1181 +	l->l_md.md_flags &= ~MDL_USEDFPU;
 15.1182 +	if (i386_use_fxsave) {
 15.1183 +		pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__;
 15.1184 +		pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
 15.1185 +	} else
 15.1186 +		pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__;
 15.1187 +
 15.1188 +	tf = l->l_md.md_regs;
 15.1189 +	tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL);
 15.1190 +	tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL);
 15.1191 +	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
 15.1192 +	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
 15.1193 +	tf->tf_edi = 0;
 15.1194 +	tf->tf_esi = 0;
 15.1195 +	tf->tf_ebp = 0;
 15.1196 +	tf->tf_ebx = (int)l->l_proc->p_psstr;
 15.1197 +	tf->tf_edx = 0;
 15.1198 +	tf->tf_ecx = 0;
 15.1199 +	tf->tf_eax = 0;
 15.1200 +	tf->tf_eip = pack->ep_entry;
 15.1201 +	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
 15.1202 +	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
 15.1203 +	tf->tf_eflags = PSL_USERSET;
 15.1204 +	tf->tf_esp = stack;
 15.1205 +	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
 15.1206 +}
 15.1207 +
 15.1208 +/*
 15.1209 + * Initialize segments and descriptor tables
 15.1210 + */
 15.1211 +
 15.1212 +union	descriptor *gdt, *ldt;
 15.1213 +struct gate_descriptor *idt;
 15.1214 +char idt_allocmap[NIDT];
 15.1215 +struct simplelock idt_lock = SIMPLELOCK_INITIALIZER;
 15.1216 +#ifdef I586_CPU
 15.1217 +union	descriptor *pentium_idt;
 15.1218 +#endif
 15.1219 +extern  struct user *proc0paddr;
 15.1220 +
 15.1221 +void
 15.1222 +setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
 15.1223 +    int sel)
 15.1224 +{
 15.1225 +
 15.1226 +	gd->gd_looffset = (int)func;
 15.1227 +	gd->gd_selector = sel;
 15.1228 +	gd->gd_stkcpy = args;
 15.1229 +	gd->gd_xx = 0;
 15.1230 +	gd->gd_type = type;
 15.1231 +	gd->gd_dpl = dpl;
 15.1232 +	gd->gd_p = 1;
 15.1233 +	gd->gd_hioffset = (int)func >> 16;
 15.1234 +}
 15.1235 +
 15.1236 +void
 15.1237 +unsetgate(struct gate_descriptor *gd)
 15.1238 +{
 15.1239 +	gd->gd_p = 0;
 15.1240 +	gd->gd_hioffset = 0;
 15.1241 +	gd->gd_looffset = 0;
 15.1242 +	gd->gd_selector = 0;
 15.1243 +	gd->gd_xx = 0;
 15.1244 +	gd->gd_stkcpy = 0;
 15.1245 +	gd->gd_type = 0;
 15.1246 +	gd->gd_dpl = 0;
 15.1247 +}
 15.1248 +
 15.1249 +
 15.1250 +void
 15.1251 +setregion(struct region_descriptor *rd, void *base, size_t limit)
 15.1252 +{
 15.1253 +
 15.1254 +	rd->rd_limit = (int)limit;
 15.1255 +	rd->rd_base = (int)base;
 15.1256 +}
 15.1257 +
 15.1258 +void
 15.1259 +setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type,
 15.1260 +    int dpl, int def32, int gran)
 15.1261 +{
 15.1262 +
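          +	/*
          +	 * i386 segment descriptors scatter their fields: the 20-bit
          +	 * limit is split 16/4 and the 32-bit base 24/8, hence the
          +	 * separate shifts for the high halves below.  With gran set,
          +	 * the limit is counted in 4 KB units.
          +	 */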
 15.1263 +	sd->sd_lolimit = (int)limit;
 15.1264 +	sd->sd_lobase = (int)base;
 15.1265 +	sd->sd_type = type;
 15.1266 +	sd->sd_dpl = dpl;
 15.1267 +	sd->sd_p = 1;
 15.1268 +	sd->sd_hilimit = (int)limit >> 16;
 15.1269 +	sd->sd_xx = 0;
 15.1270 +	sd->sd_def32 = def32;
 15.1271 +	sd->sd_gran = gran;
 15.1272 +	sd->sd_hibase = (int)base >> 24;
 15.1273 +}
 15.1274 +
 15.1275 +#define	IDTVEC(name)	__CONCAT(X, name)
 15.1276 +typedef void (vector)(void);
 15.1277 +extern vector IDTVEC(syscall);
 15.1278 +extern vector IDTVEC(osyscall);
 15.1279 +extern vector *IDTVEC(exceptions)[];
 15.1280 +#ifdef COMPAT_SVR4
 15.1281 +extern vector IDTVEC(svr4_fasttrap);
 15.1282 +#endif /* COMPAT_SVR4 */
 15.1283 +#ifdef COMPAT_MACH
 15.1284 +extern vector IDTVEC(mach_trap);
 15.1285 +#endif
 15.1286 +#define MAX_XEN_IDT 128
 15.1287 +trap_info_t xen_idt[MAX_XEN_IDT];
 15.1288 +int xen_idt_idx;
 15.1289 +
 15.1290 +#define	KBTOB(x)	((size_t)(x) * 1024UL)
 15.1291 +
 15.1292 +void cpu_init_idt()
 15.1293 +{
 15.1294 +	struct region_descriptor region;
 15.1295 +
 15.1296 +	panic("cpu_init_idt");
 15.1297 +#ifdef I586_CPU
 15.1298 +	setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
 15.1299 +#else
 15.1300 +	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
 15.1301 +#endif
 15.1302 +        lidt(&region);
 15.1303 +}
 15.1304 +
 15.1305 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
 15.1306 +void
 15.1307 +add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type)
 15.1308 +{
 15.1309 +	extern struct extent *iomem_ex;
 15.1310 +	int i;
 15.1311 +
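          +	/*
          +	 * Physical addresses are 32 bits wide on i386, so BIOS map
          +	 * entries at or above 4 GB cannot be represented; they are
          +	 * reported and dropped rather than silently truncated.
          +	 */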
 15.1312 +	if (seg_end > 0x100000000ULL) {
 15.1313 +		printf("WARNING: skipping large "
 15.1314 +		    "memory map entry: "
 15.1315 +		    "0x%qx/0x%qx/0x%x\n",
 15.1316 +		    seg_start,
 15.1317 +		    (seg_end - seg_start),
 15.1318 +		    type);
 15.1319 +		return;
 15.1320 +	}
 15.1321 +
 15.1322 +	/*
 15.1323 +	 * XXX Chop the last page off the size so that
 15.1324 +	 * XXX it can fit in avail_end.
 15.1325 +	 */
 15.1326 +	if (seg_end == 0x100000000ULL)
 15.1327 +		seg_end -= PAGE_SIZE;
 15.1328 +
 15.1329 +	if (seg_end <= seg_start)
 15.1330 +		return;
 15.1331 +
 15.1332 +	for (i = 0; i < mem_cluster_cnt; i++) {
 15.1333 +		if ((mem_clusters[i].start == round_page(seg_start))
 15.1334 +		    && (mem_clusters[i].size
 15.1335 +			    == trunc_page(seg_end) - mem_clusters[i].start)) {
 15.1336 +#ifdef DEBUG_MEMLOAD
 15.1337 +			printf("WARNING: skipping duplicate segment entry\n");
 15.1338 +#endif
 15.1339 +			return;
 15.1340 +		}
 15.1341 +	}
 15.1342 +
 15.1343 +	/*
 15.1344 +	 * Allocate the physical addresses used by RAM
 15.1345 +	 * from the iomem extent map.  This is done before
 15.1346 +	 * the addresses are page rounded just to make
 15.1347 +	 * sure we get them all.
 15.1348 +	 */
 15.1349 +	if (extent_alloc_region(iomem_ex, seg_start,
 15.1350 +	    seg_end - seg_start, EX_NOWAIT)) {
 15.1351 +		/* XXX What should we do? */
 15.1352 +		printf("WARNING: CAN'T ALLOCATE "
 15.1353 +		    "MEMORY SEGMENT "
 15.1354 +		    "(0x%qx/0x%qx/0x%x) FROM "
 15.1355 +		    "IOMEM EXTENT MAP!\n",
 15.1356 +		    seg_start, seg_end - seg_start, type);
 15.1357 +		return;
 15.1358 +	}
 15.1359 +
 15.1360 +	/*
 15.1361 +	 * If it's not free memory, skip it.
 15.1362 +	 */
 15.1363 +	if (type != BIM_Memory)
 15.1364 +		return;
 15.1365 +
 15.1366 +	/* XXX XXX XXX */
 15.1367 +	if (mem_cluster_cnt >= VM_PHYSSEG_MAX)
 15.1368 +		panic("init386: too many memory segments");
 15.1369 +
 15.1370 +	seg_start = round_page(seg_start);
 15.1371 +	seg_end = trunc_page(seg_end);
 15.1372 +
 15.1373 +	if (seg_start == seg_end)
 15.1374 +		return;
 15.1375 +
 15.1376 +	mem_clusters[mem_cluster_cnt].start = seg_start;
 15.1377 +	mem_clusters[mem_cluster_cnt].size =
 15.1378 +	    seg_end - seg_start;
 15.1379 +
 15.1380 +	if (avail_end < seg_end)
 15.1381 +		avail_end = seg_end;
 15.1382 +	physmem += atop(mem_clusters[mem_cluster_cnt].size);
 15.1383 +	mem_cluster_cnt++;
 15.1384 +}
 15.1385 +#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
 15.1386 +
 15.1387 +void
 15.1388 +initgdt()
 15.1389 +{
 15.1390 +#if !defined(XEN)
 15.1391 +	struct region_descriptor region;
 15.1392 +#else
 15.1393 +	paddr_t frames[16];
 15.1394 +#endif
 15.1395 +
 15.1396 +#if !defined(XEN)
 15.1397 +	gdt = tgdt;
 15.1398 +	memset(gdt, 0, NGDT*sizeof(*gdt));
 15.1399 +#endif
 15.1400 +	/* make gdt gates and memory segments */
 15.1401 +	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1);
 15.1402 +	setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1);
 15.1403 +	setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
 15.1404 +	    SDT_MEMERA, SEL_UPL, 1, 1);
 15.1405 +	setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
 15.1406 +	    SDT_MEMERA, SEL_UPL, 1, 1);
 15.1407 +	setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
 15.1408 +	    SDT_MEMRWA, SEL_UPL, 1, 1);
 15.1409 +#ifdef COMPAT_MACH
 15.1410 +	setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1,
 15.1411 +	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
 15.1412 +#endif
 15.1413 +#if NBIOSCALL > 0
 15.1414 +	/* bios trampoline GDT entries */
 15.1415 +	setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0,
 15.1416 +	    0);
 15.1417 +	setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0,
 15.1418 +	    0);
 15.1419 +#endif
 15.1420 +	setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary,
 15.1421 +	    sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1);
 15.1422 +
 15.1423 +#if !defined(XEN)
 15.1424 +	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
 15.1425 +	lgdt(&region);
 15.1426 +#else
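          +	/*
          +	 * Xen takes the GDT as a list of machine frame numbers, and
          +	 * the backing page must be mapped read-only before the
          +	 * hypervisor will accept it; hence the xpmap_ptom()
          +	 * translation and the VM_PROT_READ re-entry below.
          +	 */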
 15.1427 +	frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
 15.1428 +	/* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */
 15.1429 +	pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE,
 15.1430 +	    VM_PROT_READ);
 15.1431 +	XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
 15.1432 +	    LAST_RESERVED_GDT_ENTRY + 1));
 15.1433 +	if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1))
 15.1434 +		panic("HYPERVISOR_set_gdt failed!\n");
 15.1435 +	lgdt_finish();
 15.1436 +#endif
 15.1437 +}
 15.1438 +
 15.1439 +void
 15.1440 +init386(paddr_t first_avail)
 15.1441 +{
 15.1442 +#if !defined(XEN)
 15.1443 +	union descriptor *tgdt;
 15.1444 +#endif
 15.1445 +	extern void consinit(void);
 15.1446 +#if !defined(XEN)
 15.1447 +	extern struct extent *iomem_ex;
 15.1448 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
 15.1449 +	struct btinfo_memmap *bim;
 15.1450 +#endif
 15.1451 +	struct region_descriptor region;
 15.1452 +#endif
 15.1453 +	int x;
 15.1454 +#if !defined(XEN)
 15.1455 +	int first16q;
 15.1456 +	u_int64_t seg_start, seg_end;
 15.1457 +	u_int64_t seg_start1, seg_end1;
 15.1458 +#endif
 15.1459 +	paddr_t realmode_reserved_start;
 15.1460 +	psize_t realmode_reserved_size;
 15.1461 +	int needs_earlier_install_pte0;
 15.1462 +#if NBIOSCALL > 0
 15.1463 +	extern int biostramp_image_size;
 15.1464 +	extern u_char biostramp_image[];
 15.1465 +#endif
 15.1466 +
 15.1467 +	XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info));
 15.1468 +#ifdef XENDEBUG_LOW
 15.1469 +	xen_dbglow_init();
 15.1470 +#endif
 15.1471 +
 15.1472 +	cpu_probe_features(&cpu_info_primary);
 15.1473 +	cpu_feature = cpu_info_primary.ci_feature_flags;
 15.1474 +
 15.1475 +	/* not on Xen... */
 15.1476 +	cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR);
 15.1477 +
 15.1478 +	lwp0.l_addr = proc0paddr;
 15.1479 +	cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;
 15.1480 +
 15.1481 +	XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n",
 15.1482 +	    proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail));
 15.1483 +	XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr,
 15.1484 +		      (void *)atdevbase));
 15.1485 +
 15.1486 +	x86_bus_space_init();
 15.1487 +	consinit();	/* XXX SHOULD NOT BE DONE HERE */
 15.1488 +	/*
  15.1489 +	 * Initialize PAGE_SIZE-dependent variables.
 15.1490 +	 */
 15.1491 +	uvm_setpagesize();
 15.1492 +
 15.1493 +	/*
 15.1494 +	 * Saving SSE registers won't work if the save area isn't
 15.1495 +	 * 16-byte aligned.
 15.1496 +	 */
 15.1497 +	if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf)
 15.1498 +		panic("init386: pcb_savefpu not 16-byte aligned");
 15.1499 +
 15.1500 +	/*
 15.1501 +	 * Start with 2 color bins -- this is just a guess to get us
 15.1502 +	 * started.  We'll recolor when we determine the largest cache
 15.1503 +	 * sizes on the system.
 15.1504 +	 */
 15.1505 +	uvmexp.ncolors = 2;
 15.1506 +
 15.1507 +#if !defined(XEN)
 15.1508 +	/*
 15.1509 +	 * BIOS leaves data in physical page 0
 15.1510 +	 * Even if it didn't, our VM system doesn't like using zero as a
 15.1511 +	 * physical page number.
 15.1512 +	 * We may also need pages in low memory (one each) for secondary CPU
 15.1513 +	 * startup, for BIOS calls, and for ACPI, plus a page table page to map
 15.1514 +	 * them into the first few pages of the kernel's pmap.
 15.1515 +	 */
 15.1516 +	avail_start = PAGE_SIZE;
 15.1517 +#else
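          +	/*
          +	 * Under Xen there is no BIOS map to parse: the domain owns a
          +	 * single contiguous pseudo-physical range whose size comes
          +	 * from xen_start_info.nr_pages, loaded as one cluster here.
          +	 */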
 15.1518 +	/* Make sure the end of the space used by the kernel is rounded. */
 15.1519 +	first_avail = round_page(first_avail);
 15.1520 +	avail_start = first_avail - KERNBASE;
 15.1521 +	avail_end = ptoa(xen_start_info.nr_pages) +
 15.1522 +		(KERNTEXTOFF - KERNBASE_LOCORE);
 15.1523 +	pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE);
 15.1524 +	pmap_pa_end = avail_end;
 15.1525 +	mem_clusters[0].start = avail_start;
 15.1526 +	mem_clusters[0].size = avail_end - avail_start;
 15.1527 +	mem_cluster_cnt++;
 15.1528 +	physmem += atop(mem_clusters[0].size);
 15.1529 +#endif
 15.1530 +
 15.1531 +	/*
 15.1532 +	 * reserve memory for real-mode call
 15.1533 +	 */
 15.1534 +	needs_earlier_install_pte0 = 0;
 15.1535 +	realmode_reserved_start = 0;
 15.1536 +	realmode_reserved_size = 0;
 15.1537 +#if NBIOSCALL > 0
 15.1538 +	/* save us a page for trampoline code */
 15.1539 +	realmode_reserved_size += PAGE_SIZE;
 15.1540 +	needs_earlier_install_pte0 = 1;
 15.1541 +#endif
 15.1542 +#ifdef MULTIPROCESSOR						 /* XXX */
 15.1543 +#if !defined(XEN)
 15.1544 +	KASSERT(avail_start == PAGE_SIZE);			 /* XXX */
 15.1545 +#endif
 15.1546 +	if (realmode_reserved_size < MP_TRAMPOLINE)		 /* XXX */
 15.1547 +		realmode_reserved_size = MP_TRAMPOLINE;		 /* XXX */
 15.1548 +	needs_earlier_install_pte0 = 1;				 /* XXX */
 15.1549 +#endif								 /* XXX */
 15.1550 +#if NACPI > 0
 15.1551 +	/* trampoline code for wake handler */
 15.1552 +	realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1);
 15.1553 +	needs_earlier_install_pte0 = 1;
 15.1554 +#endif
 15.1555 +	if (needs_earlier_install_pte0) {
 15.1556 +		/* page table for directory entry 0 */
 15.1557 +		realmode_reserved_size += PAGE_SIZE;
 15.1558 +	}
  15.1559 +	if (realmode_reserved_size > 0) {
 15.1560 +		realmode_reserved_start = avail_start;
 15.1561 +		avail_start += realmode_reserved_size;
 15.1562 +	}
 15.1563 +
 15.1564 +#ifdef DEBUG_MEMLOAD
 15.1565 +	printf("mem_cluster_count: %d\n", mem_cluster_cnt);
 15.1566 +#endif
 15.1567 +
 15.1568 +	/*
 15.1569 +	 * Call pmap initialization to make new kernel address space.
 15.1570 +	 * We must do this before loading pages into the VM system.
 15.1571 +	 */
 15.1572 +	pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);
 15.1573 +
 15.1574 +#if !defined(XEN)
 15.1575 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
 15.1576 +	/*
 15.1577 +	 * Check to see if we have a memory map from the BIOS (passed
  15.1578 +	 * to us by the boot program).
 15.1579 +	 */
 15.1580 +	bim = lookup_bootinfo(BTINFO_MEMMAP);
 15.1581 +	if (bim != NULL && bim->num > 0) {
 15.1582 +#ifdef DEBUG_MEMLOAD
 15.1583 +		printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num);
 15.1584 +#endif
 15.1585 +		for (x = 0; x < bim->num; x++) {
 15.1586 +#ifdef DEBUG_MEMLOAD
 15.1587 +			printf("    addr 0x%qx  size 0x%qx  type 0x%x\n",
 15.1588 +			    bim->entry[x].addr,
 15.1589 +			    bim->entry[x].size,
 15.1590 +			    bim->entry[x].type);
 15.1591 +#endif
 15.1592 +
 15.1593 +			/*
 15.1594 +			 * If the segment is not memory, skip it.
 15.1595 +			 */
 15.1596 +			switch (bim->entry[x].type) {
 15.1597 +			case BIM_Memory:
 15.1598 +			case BIM_ACPI:
 15.1599 +			case BIM_NVS:
 15.1600 +				break;
 15.1601 +			default:
 15.1602 +				continue;
 15.1603 +			}
 15.1604 +
 15.1605 +			/*
 15.1606 +			 * Sanity check the entry.
 15.1607 +			 * XXX Need to handle uint64_t in extent code
 15.1608 +			 * XXX and 64-bit physical addresses in i386
 15.1609 +			 * XXX port.
 15.1610 +			 */
 15.1611 +			seg_start = bim->entry[x].addr;
 15.1612 +			seg_end = bim->entry[x].addr + bim->entry[x].size;
 15.1613 +
  15.1614 +			/*
  15.1615 +			 *   Avoid compatibility holes.
  15.1616 +			 * XXX  These are holes in the memory space that
  15.1617 +			 * XXX direct accesses to the PC-compatible frame
  15.1618 +			 * XXX buffer (0xa0000-0xbffff), to adapter ROM
  15.1619 +			 * XXX space (0xc0000-0xdffff), and to system
  15.1620 +			 * XXX BIOS space (0xe0000-0xfffff).
  15.1621 +			 * XXX  Some laptops (e.g. the Toshiba Satellite
  15.1622 +			 * XXX 2550X) report this area in their memory map
  15.1623 +			 * XXX and caused problems, so we avoid it.
  15.1624 +			 */
 15.1625 +			if (seg_start < 0x100000 && seg_end > 0xa0000) {
 15.1626 +				printf("WARNING: memory map entry overlaps "
 15.1627 +				    "with ``Compatibility Holes'': "
 15.1628 +				    "0x%qx/0x%qx/0x%x\n", seg_start,
 15.1629 +				    seg_end - seg_start, bim->entry[x].type);
 15.1630 +				add_mem_cluster(seg_start, 0xa0000,
 15.1631 +				    bim->entry[x].type);
 15.1632 +				add_mem_cluster(0x100000, seg_end,
 15.1633 +				    bim->entry[x].type);
 15.1634 +			} else
 15.1635 +				add_mem_cluster(seg_start, seg_end,
 15.1636 +				    bim->entry[x].type);
 15.1637 +		}
 15.1638 +	}
 15.1639 +#endif /* ! REALBASEMEM && ! REALEXTMEM */
 15.1640 +	/*
 15.1641 +	 * If the loop above didn't find any valid segment, fall back to
 15.1642 +	 * former code.
 15.1643 +	 */
 15.1644 +	if (mem_cluster_cnt == 0) {
 15.1645 +		/*
 15.1646 +		 * Allocate the physical addresses used by RAM from the iomem
 15.1647 +		 * extent map.  This is done before the addresses are
 15.1648 +		 * page rounded just to make sure we get them all.
 15.1649 +		 */
 15.1650 +		if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem),
 15.1651 +		    EX_NOWAIT)) {
 15.1652 +			/* XXX What should we do? */
 15.1653 +			printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
 15.1654 +			    "IOMEM EXTENT MAP!\n");
 15.1655 +		}
 15.1656 +		mem_clusters[0].start = 0;
 15.1657 +		mem_clusters[0].size = trunc_page(KBTOB(biosbasemem));
 15.1658 +		physmem += atop(mem_clusters[0].size);
 15.1659 +		if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
 15.1660 +		    EX_NOWAIT)) {
 15.1661 +			/* XXX What should we do? */
 15.1662 +			printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
 15.1663 +			    "IOMEM EXTENT MAP!\n");
 15.1664 +		}
 15.1665 +#if NISADMA > 0
 15.1666 +		/*
 15.1667 +		 * Some motherboards/BIOSes remap the 384K of RAM that would
 15.1668 +		 * normally be covered by the ISA hole to the end of memory
 15.1669 +		 * so that it can be used.  However, on a 16M system, this
 15.1670 +		 * would cause bounce buffers to be allocated and used.
 15.1671 +		 * This is not desirable behaviour, as more than 384K of
 15.1672 +		 * bounce buffers might be allocated.  As a work-around,
 15.1673 +		 * we round memory down to the nearest 1M boundary if
 15.1674 +		 * we're using any isadma devices and the remapped memory
 15.1675 +		 * is what puts us over 16M.
 15.1676 +		 */
 15.1677 +		if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
 15.1678 +			char pbuf[9];
 15.1679 +
 15.1680 +			format_bytes(pbuf, sizeof(pbuf),
 15.1681 +			    biosextmem - (15*1024));
 15.1682 +			printf("Warning: ignoring %s of remapped memory\n",
 15.1683 +			    pbuf);
 15.1684 +			biosextmem = (15*1024);
 15.1685 +		}
 15.1686 +#endif
 15.1687 +		mem_clusters[1].start = IOM_END;
 15.1688 +		mem_clusters[1].size = trunc_page(KBTOB(biosextmem));
 15.1689 +		physmem += atop(mem_clusters[1].size);
 15.1690 +
 15.1691 +		mem_cluster_cnt = 2;
 15.1692 +
 15.1693 +		avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
 15.1694 +	}
 15.1695 +	/*
 15.1696 +	 * If we have 16M of RAM or less, just put it all on
 15.1697 +	 * the default free list.  Otherwise, put the first
 15.1698 +	 * 16M of RAM on a lower priority free list (so that
 15.1699 +	 * all of the ISA DMA'able memory won't be eaten up
 15.1700 +	 * first-off).
 15.1701 +	 */
 15.1702 +	if (avail_end <= (16 * 1024 * 1024))
 15.1703 +		first16q = VM_FREELIST_DEFAULT;
 15.1704 +	else
 15.1705 +		first16q = VM_FREELIST_FIRST16;
 15.1706 +
 15.1707 +	/* Make sure the end of the space used by the kernel is rounded. */
 15.1708 +	first_avail = round_page(first_avail);
 15.1709 +#endif
 15.1710 +
 15.1711 +	XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n",
 15.1712 +	    (void *)avail_start, (int)atop(avail_start),
 15.1713 +	    (void *)avail_end, (int)atop(avail_end)));
 15.1714 +	uvm_page_physload(atop(avail_start), atop(avail_end),
 15.1715 +	    atop(avail_start), atop(avail_end),
 15.1716 +	    VM_FREELIST_DEFAULT);
 15.1717 +
 15.1718 +#if !defined(XEN)
 15.1719 +
 15.1720 +	/*
 15.1721 +	 * Now, load the memory clusters (which have already been
 15.1722 +	 * rounded and truncated) into the VM system.
 15.1723 +	 *
 15.1724 +	 * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL
 15.1725 +	 * IS LOADED AT IOM_END (1M).
 15.1726 +	 */
 15.1727 +	for (x = 0; x < mem_cluster_cnt; x++) {
 15.1728 +		seg_start = mem_clusters[x].start;
 15.1729 +		seg_end = mem_clusters[x].start + mem_clusters[x].size;
 15.1730 +		seg_start1 = 0;
 15.1731 +		seg_end1 = 0;
 15.1732 +
 15.1733 +		/*
 15.1734 +		 * Skip memory before our available starting point.
 15.1735 +		 */
 15.1736 +		if (seg_end <= avail_start)
 15.1737 +			continue;
 15.1738 +
 15.1739 +		if (avail_start >= seg_start && avail_start < seg_end) {
 15.1740 +			if (seg_start != 0)
 15.1741 +				panic("init386: memory doesn't start at 0");
 15.1742 +			seg_start = avail_start;
 15.1743 +			if (seg_start == seg_end)
 15.1744 +				continue;
 15.1745 +		}
 15.1746 +
 15.1747 +		/*
 15.1748 +		 * If this segment contains the kernel, split it
 15.1749 +		 * in two, around the kernel.
 15.1750 +		 */
 15.1751 +		if (seg_start <= IOM_END && first_avail <= seg_end) {
 15.1752 +			seg_start1 = first_avail;
 15.1753 +			seg_end1 = seg_end;
 15.1754 +			seg_end = IOM_END;
 15.1755 +		}
 15.1756 +
 15.1757 +		/* First hunk */
 15.1758 +		if (seg_start != seg_end) {
 15.1759 +			if (seg_start < (16 * 1024 * 1024) &&
 15.1760 +			    first16q != VM_FREELIST_DEFAULT) {
 15.1761 +				u_int64_t tmp;
 15.1762 +
 15.1763 +				if (seg_end > (16 * 1024 * 1024))
 15.1764 +					tmp = (16 * 1024 * 1024);
 15.1765 +				else
 15.1766 +					tmp = seg_end;
 15.1767 +
 15.1768 +				if (tmp != seg_start) {
 15.1769 +#ifdef DEBUG_MEMLOAD
 15.1770 +					printf("loading 0x%qx-0x%qx "
 15.1771 +					    "(0x%lx-0x%lx)\n",
 15.1772 +				    	    seg_start, tmp,
 15.1773 +				  	    atop(seg_start), atop(tmp));
 15.1774 +#endif
 15.1775 +					uvm_page_physload(atop(seg_start),
 15.1776 +				    	    atop(tmp), atop(seg_start),
 15.1777 +				    	    atop(tmp), first16q);
 15.1778 +				}
 15.1779 +				seg_start = tmp;
 15.1780 +			}
 15.1781 +
 15.1782 +			if (seg_start != seg_end) {
 15.1783 +#ifdef DEBUG_MEMLOAD
 15.1784 +				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
 15.1785 +				    seg_start, seg_end,
 15.1786 +				    atop(seg_start), atop(seg_end));
 15.1787 +#endif
 15.1788 +				uvm_page_physload(atop(seg_start),
 15.1789 +				    atop(seg_end), atop(seg_start),
 15.1790 +				    atop(seg_end), VM_FREELIST_DEFAULT);
 15.1791 +			}
 15.1792 +		}
 15.1793 +
 15.1794 +		/* Second hunk */
 15.1795 +		if (seg_start1 != seg_end1) {
 15.1796 +			if (seg_start1 < (16 * 1024 * 1024) &&
 15.1797 +			    first16q != VM_FREELIST_DEFAULT) {
 15.1798 +				u_int64_t tmp;
 15.1799 +
 15.1800 +				if (seg_end1 > (16 * 1024 * 1024))
 15.1801 +					tmp = (16 * 1024 * 1024);
 15.1802 +				else
 15.1803 +					tmp = seg_end1;
 15.1804 +
 15.1805 +				if (tmp != seg_start1) {
 15.1806 +#ifdef DEBUG_MEMLOAD
 15.1807 +					printf("loading 0x%qx-0x%qx "
 15.1808 +					    "(0x%lx-0x%lx)\n",
 15.1809 +				    	    seg_start1, tmp,
 15.1810 +				    	    atop(seg_start1), atop(tmp));
 15.1811 +#endif
 15.1812 +					uvm_page_physload(atop(seg_start1),
 15.1813 +				    	    atop(tmp), atop(seg_start1),
 15.1814 +				    	    atop(tmp), first16q);
 15.1815 +				}
 15.1816 +				seg_start1 = tmp;
 15.1817 +			}
 15.1818 +
 15.1819 +			if (seg_start1 != seg_end1) {
 15.1820 +#ifdef DEBUG_MEMLOAD
 15.1821 +				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
 15.1822 +				    seg_start1, seg_end1,
 15.1823 +				    atop(seg_start1), atop(seg_end1));
 15.1824 +#endif
 15.1825 +				uvm_page_physload(atop(seg_start1),
 15.1826 +				    atop(seg_end1), atop(seg_start1),
 15.1827 +				    atop(seg_end1), VM_FREELIST_DEFAULT);
 15.1828 +			}
 15.1829 +		}
 15.1830 +	}
 15.1831 +#endif
 15.1832 +
 15.1833 +	/*
 15.1834 +	 * Steal memory for the message buffer (at end of core).
 15.1835 +	 */
 15.1836 +	{
 15.1837 +		struct vm_physseg *vps;
 15.1838 +		psize_t sz = round_page(MSGBUFSIZE);
 15.1839 +		psize_t reqsz = sz;
 15.1840 +
 15.1841 +		for (x = 0; x < vm_nphysseg; x++) {
 15.1842 +			vps = &vm_physmem[x];
 15.1843 +			if (ptoa(vps->avail_end) == avail_end)
 15.1844 +				goto found;
 15.1845 +		}
 15.1846 +		panic("init386: can't find end of memory");
 15.1847 +
 15.1848 +	found:
 15.1849 +		/* Shrink so it'll fit in the last segment. */
 15.1850 +		if ((vps->avail_end - vps->avail_start) < atop(sz))
 15.1851 +			sz = ptoa(vps->avail_end - vps->avail_start);
 15.1852 +
 15.1853 +		vps->avail_end -= atop(sz);
 15.1854 +		vps->end -= atop(sz);
 15.1855 +		msgbuf_paddr = ptoa(vps->avail_end);
 15.1856 +
 15.1857 +		/* Remove the last segment if it now has no pages. */
 15.1858 +		if (vps->start == vps->end) {
 15.1859 +			for (vm_nphysseg--; x < vm_nphysseg; x++)
 15.1860 +				vm_physmem[x] = vm_physmem[x + 1];
 15.1861 +		}
 15.1862 +
 15.1863 +		/* Now find where the new avail_end is. */
 15.1864 +		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
 15.1865 +			if (vm_physmem[x].avail_end > avail_end)
 15.1866 +				avail_end = vm_physmem[x].avail_end;
 15.1867 +		avail_end = ptoa(avail_end);
 15.1868 +
 15.1869 +		/* Warn if the message buffer had to be shrunk. */
 15.1870 +		if (sz != reqsz)
 15.1871 +			printf("WARNING: %ld bytes not available for msgbuf "
 15.1872 +			    "in last cluster (%ld used)\n", reqsz, sz);
 15.1873 +	}
 15.1874 +
 15.1875 +	/*
 15.1876 +	 * install PT page for the first 4M if needed.
 15.1877 +	 */
 15.1878 +	if (needs_earlier_install_pte0) {
 15.1879 +		paddr_t paddr;
 15.1880 +#ifdef DIAGNOSTIC
 15.1881 +		if (realmode_reserved_size < PAGE_SIZE) {
 15.1882 +			panic("cannot steal memory for first 4M PT page.");
 15.1883 +		}
 15.1884 +#endif
  15.1885 +		paddr = realmode_reserved_start + realmode_reserved_size
          +		    - PAGE_SIZE;
 15.1886 +		pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr,
 15.1887 +			   VM_PROT_READ|VM_PROT_WRITE,
 15.1888 +			   PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
 15.1889 +		pmap_update(pmap_kernel());
 15.1890 +		/* make sure it is clean before using */
 15.1891 +		memset(vtopte(0), 0, PAGE_SIZE);
 15.1892 +		realmode_reserved_size -= PAGE_SIZE;
 15.1893 +	}
 15.1894 +
 15.1895 +#if NBIOSCALL > 0
 15.1896 +	/*
 15.1897 +	 * this should be caught at kernel build time, but put it here
 15.1898 +	 * in case someone tries to fake it out...
 15.1899 +	 */
 15.1900 +#ifdef DIAGNOSTIC
 15.1901 +	if (realmode_reserved_start > BIOSTRAMP_BASE ||
 15.1902 +	    (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+
 15.1903 +							       PAGE_SIZE)) {
 15.1904 +	    panic("cannot steal memory for PT page of bioscall.");
 15.1905 +	}
 15.1906 +	if (biostramp_image_size > PAGE_SIZE)
 15.1907 +	    panic("biostramp_image_size too big: %x vs. %x",
 15.1908 +		  biostramp_image_size, PAGE_SIZE);
 15.1909 +#endif
 15.1910 +	pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE,	/* virtual */
 15.1911 +		       (paddr_t)BIOSTRAMP_BASE,	/* physical */
 15.1912 +		       VM_PROT_ALL);		/* protection */
 15.1913 +	pmap_update(pmap_kernel());
 15.1914 +	memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
 15.1915 +#ifdef DEBUG_BIOSCALL
 15.1916 +	printf("biostramp installed @ %x\n", BIOSTRAMP_BASE);
 15.1917 +#endif
 15.1918 +	realmode_reserved_size  -= PAGE_SIZE;
 15.1919 +	realmode_reserved_start += PAGE_SIZE;
 15.1920 +#endif
 15.1921 +
 15.1922 +#if NACPI > 0
 15.1923 +	/*
 15.1924 +	 * Steal memory for the acpi wake code
 15.1925 +	 */
 15.1926 +	{
 15.1927 +		paddr_t paddr, p;
 15.1928 +		psize_t sz;
 15.1929 +		int npg;
 15.1930 +
 15.1931 +		paddr = realmode_reserved_start;
 15.1932 +		npg = acpi_md_get_npages_of_wakecode();
 15.1933 +		sz = ptoa(npg);
 15.1934 +#ifdef DIAGNOSTIC
 15.1935 +		if (realmode_reserved_size < sz) {
 15.1936 +			panic("cannot steal memory for ACPI wake code.");
 15.1937 +		}
 15.1938 +#endif
 15.1939 +
  15.1940 +		/* identity mapping */
  15.1941 +		p = paddr;
  15.1942 +		for (x = 0; x < npg; x++) {
 15.1943 +			printf("kenter: 0x%08X\n", (unsigned)p);
 15.1944 +			pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL);
 15.1945 +			p += PAGE_SIZE;
 15.1946 +		}
 15.1947 +		pmap_update(pmap_kernel());
 15.1948 +
 15.1949 +		acpi_md_install_wakecode(paddr);
 15.1950 +
 15.1951 +		realmode_reserved_size  -= sz;
 15.1952 +		realmode_reserved_start += sz;
 15.1953 +	}
 15.1954 +#endif
 15.1955 +
 15.1956 +	pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr,
 15.1957 +	    VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
 15.1958 +	pmap_update(pmap_kernel());
 15.1959 +	memset((void *)idt_vaddr, 0, PAGE_SIZE);
 15.1960 +
 15.1961 +#if !defined(XEN)
 15.1962 +	idt = (struct gate_descriptor *)idt_vaddr;
 15.1963 +#ifdef I586_CPU
 15.1964 +	pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr,
 15.1965 +	    VM_PROT_READ, PMAP_WIRED|VM_PROT_READ);
 15.1966 +	pentium_idt = (union descriptor *)pentium_idt_vaddr;
 15.1967 +#endif
 15.1968 +#endif
 15.1969 +	pmap_update(pmap_kernel());
 15.1970 +
 15.1971 +	initgdt();
 15.1972 +
 15.1973 +	HYPERVISOR_set_callbacks(
 15.1974 +		GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
 15.1975 +		GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
 15.1976 +
 15.1977 +#if !defined(XEN)
 15.1978 +	tgdt = gdt;
 15.1979 +	gdt = (union descriptor *)
 15.1980 +		    ((char *)idt + NIDT * sizeof (struct gate_descriptor));
 15.1981 +	ldt = gdt + NGDT;
 15.1982 +
 15.1983 +	memcpy(gdt, tgdt, NGDT*sizeof(*gdt));
 15.1984 +
 15.1985 +	setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1,
 15.1986 +	    SDT_SYSLDT, SEL_KPL, 0, 0);
 15.1987 +#else
 15.1988 +	ldt = (union descriptor *)idt_vaddr;
 15.1989 +#endif
 15.1990 +
 15.1991 +	/* make ldt gates and memory segments */
 15.1992 +	setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1,
 15.1993 +	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
 15.1994 +
 15.1995 +	ldt[LUCODE_SEL] = gdt[GUCODE_SEL];
 15.1996 +	ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL];
 15.1997 +	ldt[LUDATA_SEL] = gdt[GUDATA_SEL];
 15.1998 +	ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 15.1999 +
 15.2000 +#if !defined(XEN)
 15.2001 +	/* exceptions */
 15.2002 +	for (x = 0; x < 32; x++) {
 15.2003 +		setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT,
 15.2004 +		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
 15.2005 +		    GSEL(GCODE_SEL, SEL_KPL));
 15.2006 +		idt_allocmap[x] = 1;
 15.2007 +	}
 15.2008 +
 15.2009 +	/* new-style interrupt gate for syscalls */
 15.2010 +	setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL,
 15.2011 +	    GSEL(GCODE_SEL, SEL_KPL));
 15.2012 +	idt_allocmap[128] = 1;
 15.2013 +#ifdef COMPAT_SVR4
 15.2014 +	setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT,
 15.2015 +	    SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
 15.2016 +	idt_allocmap[0xd2] = 1;
 15.2017 +#endif /* COMPAT_SVR4 */
 15.2018 +#endif
 15.2019 +
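          +	/*
          +	 * A Xen guest cannot lidt a real IDT; the exception and
          +	 * syscall vectors are instead handed to the hypervisor as a
          +	 * trap_info_t table via HYPERVISOR_set_trap_table() below.
          +	 * Traps 3 and 4 and the syscall vector stay reachable from
          +	 * user mode (SEL_UPL).
          +	 */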
 15.2020 +	memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT);
 15.2021 +	xen_idt_idx = 0;
 15.2022 +	for (x = 0; x < 32; x++) {
 15.2023 +		KASSERT(xen_idt_idx < MAX_XEN_IDT);
 15.2024 +		xen_idt[xen_idt_idx].vector = x;
 15.2025 +		xen_idt[xen_idt_idx].flags =
 15.2026 +			(x == 3 || x == 4) ? SEL_UPL : SEL_XEN;
 15.2027 +		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
 15.2028 +		xen_idt[xen_idt_idx].address =
 15.2029 +			(uint32_t)IDTVEC(exceptions)[x];
 15.2030 +		xen_idt_idx++;
 15.2031 +	}
 15.2032 +	KASSERT(xen_idt_idx < MAX_XEN_IDT);
 15.2033 +	xen_idt[xen_idt_idx].vector = 128;
 15.2034 +	xen_idt[xen_idt_idx].flags = SEL_UPL;
 15.2035 +	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
 15.2036 +	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall);
 15.2037 +	xen_idt_idx++;
 15.2038 +#ifdef COMPAT_SVR4
 15.2039 +	KASSERT(xen_idt_idx < MAX_XEN_IDT);
 15.2040 +	xen_idt[xen_idt_idx].vector = 0xd2;
 15.2041 +	xen_idt[xen_idt_idx].flags = SEL_UPL;
 15.2042 +	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
 15.2043 +	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap);
 15.2044 +	xen_idt_idx++;
 15.2045 +#endif /* COMPAT_SVR4 */
 15.2046 +
 15.2047 +#if !defined(XEN)
 15.2048 +	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
 15.2049 +	lgdt(&region);
 15.2050 +#else
 15.2051 +	lldt(GSEL(GLDT_SEL, SEL_KPL));
 15.2052 +#endif
 15.2053 +
 15.2054 +#if !defined(XEN)
 15.2055 +	cpu_init_idt();
 15.2056 +#else
 15.2057 +	db_trap_callback = ddb_trap_hook;
 15.2058 +
 15.2059 +	XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt));
 15.2060 +	if (HYPERVISOR_set_trap_table(xen_idt))
 15.2061 +		panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt);
 15.2062 +#endif
 15.2063 +
 15.2064 +#if NKSYMS || defined(DDB) || defined(LKM)
 15.2065 +	{
 15.2066 +		extern int end;
 15.2067 +		extern int *esym;
 15.2068 +		struct btinfo_symtab *symtab;
 15.2069 +
 15.2070 +#ifdef DDB
 15.2071 +		db_machine_init();
 15.2072 +#endif
 15.2073 +
 15.2074 +		symtab = lookup_bootinfo(BTINFO_SYMTAB);
 15.2075 +
 15.2076 +		if (symtab) {
 15.2077 +			symtab->ssym += KERNBASE;
 15.2078 +			symtab->esym += KERNBASE;
 15.2079 +			ksyms_init(symtab->nsym, (int *)symtab->ssym,
 15.2080 +			    (int *)symtab->esym);
  15.2081 +		} else
  15.2082 +			ksyms_init(*(int *)&end, ((int *)&end) + 1, esym);
 15.2084 +	}
 15.2085 +#endif
 15.2086 +#ifdef DDB
 15.2087 +	if (boothowto & RB_KDB)
 15.2088 +		Debugger();
 15.2089 +#endif
 15.2090 +#ifdef IPKDB
 15.2091 +	ipkdb_init();
 15.2092 +	if (boothowto & RB_KDB)
 15.2093 +		ipkdb_connect(0);
 15.2094 +#endif
 15.2095 +#ifdef KGDB
 15.2096 +	kgdb_port_init();
 15.2097 +	if (boothowto & RB_KDB) {
 15.2098 +		kgdb_debug_init = 1;
 15.2099 +		kgdb_connect(1);
 15.2100 +	}
 15.2101 +#endif
 15.2102 +
 15.2103 +#if NMCA > 0
  15.2104 +	/* Check for MCA bus; this needs to be done before the ISA
  15.2105 +	 * setup, because if MCA is detected, ISA must use
  15.2106 +	 * level-triggered interrupts by default. */
 15.2107 +	mca_busprobe();
 15.2108 +#endif
 15.2109 +
 15.2110 +#if defined(XEN)
 15.2111 +	events_default_setup();
 15.2112 +#else
 15.2113 +	intr_default_setup();
 15.2114 +#endif
 15.2115 +
 15.2116 +	/* Initialize software interrupts. */
 15.2117 +	softintr_init();
 15.2118 +
 15.2119 +	splraise(IPL_IPI);
 15.2120 +	enable_intr();
 15.2121 +
 15.2122 +	if (physmem < btoc(2 * 1024 * 1024)) {
 15.2123 +		printf("warning: too little memory available; "
 15.2124 +		       "have %lu bytes, want %lu bytes\n"
 15.2125 +		       "running in degraded mode\n"
 15.2126 +		       "press a key to confirm\n\n",
 15.2127 +		       ptoa(physmem), 2*1024*1024UL);
 15.2128 +		cngetc();
 15.2129 +	}
 15.2130 +
 15.2131 +#ifdef __HAVE_CPU_MAXPROC
 15.2132 +	/* Make sure maxproc is sane */
 15.2133 +	if (maxproc > cpu_maxproc())
 15.2134 +		maxproc = cpu_maxproc();
 15.2135 +#endif
 15.2136 +}
 15.2137 +
 15.2138 +#ifdef COMPAT_NOMID
 15.2139 +static int
 15.2140 +exec_nomid(struct proc *p, struct exec_package *epp)
 15.2141 +{
 15.2142 +	int error;
 15.2143 +	u_long midmag, magic;
 15.2144 +	u_short mid;
 15.2145 +	struct exec *execp = epp->ep_hdr;
 15.2146 +
 15.2147 +	/* check on validity of epp->ep_hdr performed by exec_out_makecmds */
 15.2148 +
 15.2149 +	midmag = ntohl(execp->a_midmag);
 15.2150 +	mid = (midmag >> 16) & 0xffff;
 15.2151 +	magic = midmag & 0xffff;
 15.2152 +
 15.2153 +	if (magic == 0) {
 15.2154 +		magic = (execp->a_midmag & 0xffff);
 15.2155 +		mid = MID_ZERO;
 15.2156 +	}
 15.2157 +
 15.2158 +	midmag = mid << 16 | magic;
 15.2159 +
 15.2160 +	switch (midmag) {
 15.2161 +	case (MID_ZERO << 16) | ZMAGIC:
 15.2162 +		/*
 15.2163 +		 * 386BSD's ZMAGIC format:
 15.2164 +		 */
 15.2165 +		error = exec_aout_prep_oldzmagic(p, epp);
 15.2166 +		break;
 15.2167 +
 15.2168 +	case (MID_ZERO << 16) | QMAGIC:
 15.2169 +		/*
 15.2170 +		 * BSDI's QMAGIC format:
 15.2171 +		 * same as new ZMAGIC format, but with different magic number
 15.2172 +		 */
 15.2173 +		error = exec_aout_prep_zmagic(p, epp);
 15.2174 +		break;
 15.2175 +
 15.2176 +	case (MID_ZERO << 16) | NMAGIC:
 15.2177 +		/*
 15.2178 +		 * BSDI's NMAGIC format:
 15.2179 +		 * same as NMAGIC format, but with different magic number
 15.2180 +		 * and with text starting at 0.
 15.2181 +		 */
 15.2182 +		error = exec_aout_prep_oldnmagic(p, epp);
 15.2183 +		break;
 15.2184 +
 15.2185 +	case (MID_ZERO << 16) | OMAGIC:
 15.2186 +		/*
 15.2187 +		 * BSDI's OMAGIC format:
 15.2188 +		 * same as OMAGIC format, but with different magic number
 15.2189 +		 * and with text starting at 0.
 15.2190 +		 */
 15.2191 +		error = exec_aout_prep_oldomagic(p, epp);
 15.2192 +		break;
 15.2193 +
 15.2194 +	default:
 15.2195 +		error = ENOEXEC;
 15.2196 +	}
 15.2197 +
 15.2198 +	return error;
 15.2199 +}
 15.2200 +#endif
 15.2201 +
 15.2202 +/*
 15.2203 + * cpu_exec_aout_makecmds():
 15.2204 + *	CPU-dependent a.out format hook for execve().
 15.2205 + *
  15.2206 + * Determine if the given exec package refers to something which we
  15.2207 + * understand and, if so, set up the vmcmds for it.
  15.2208 + *
  15.2209 + * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries
  15.2210 + * are supported if COMPAT_NOMID is given as a kernel option.
 15.2211 + */
 15.2212 +int
 15.2213 +cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp)
 15.2214 +{
 15.2215 +	int error = ENOEXEC;
 15.2216 +
 15.2217 +#ifdef COMPAT_NOMID
 15.2218 +	if ((error = exec_nomid(p, epp)) == 0)
 15.2219 +		return error;
  15.2220 +#endif /* COMPAT_NOMID */
 15.2221 +
 15.2222 +	return error;
 15.2223 +}
 15.2224 +
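          +/*
          + * The bootinfo area passed by the boot loader starts with an entry
          + * count followed by variable-length records, each beginning with a
          + * btinfo_common header giving its type and length; the list is
          + * walked by stepping help->len bytes at a time.
          + */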
 15.2225 +void *
 15.2226 +lookup_bootinfo(int type)
 15.2227 +{
  15.2228 +	struct btinfo_common *help;
  15.2229 +	int n = *(int *)bootinfo;
  15.2230 +	help = (struct btinfo_common *)(bootinfo + sizeof(int));
  15.2231 +	while (n--) {
  15.2232 +		if (help->type == type)
  15.2233 +			return (help);
  15.2234 +		help = (struct btinfo_common *)((char *)help + help->len);
  15.2235 +	}
  15.2236 +	return (NULL);
 15.2237 +}
 15.2238 +
 15.2239 +#include <dev/ic/mc146818reg.h>		/* for NVRAM POST */
 15.2240 +#include <i386/isa/nvram.h>		/* for NVRAM POST */
 15.2241 +
 15.2242 +void
 15.2243 +cpu_reset()
 15.2244 +{
 15.2245 +
 15.2246 +	disable_intr();
 15.2247 +
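          +	/*
          +	 * The native reset paths below (NVRAM reset byte, keyboard
          +	 * controller pulse) are kept for reference but disabled; in a
          +	 * Xen domain the HYPERVISOR_reboot() hypercall is what
          +	 * actually reboots us.
          +	 */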
 15.2248 +#if 0
 15.2249 +	/*
 15.2250 +	 * Ensure the NVRAM reset byte contains something vaguely sane.
 15.2251 +	 */
 15.2252 +
 15.2253 +	outb(IO_RTC, NVRAM_RESET);
 15.2254 +	outb(IO_RTC+1, NVRAM_RESET_RST);
 15.2255 +
 15.2256 +	/*
 15.2257 +	 * The keyboard controller has 4 random output pins, one of which is
 15.2258 +	 * connected to the RESET pin on the CPU in many PCs.  We tell the
 15.2259 +	 * keyboard controller to pulse this line a couple of times.
 15.2260 +	 */
 15.2261 +	outb(IO_KBD + KBCMDP, KBC_PULSE0);
 15.2262 +	delay(100000);
 15.2263 +	outb(IO_KBD + KBCMDP, KBC_PULSE0);
 15.2264 +	delay(100000);
 15.2265 +#endif
 15.2266 +
 15.2267 +	HYPERVISOR_reboot();
 15.2268 +
 15.2269 +	for (;;);
 15.2270 +}
 15.2271 +
 15.2272 +void
 15.2273 +cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
 15.2274 +{
 15.2275 +	const struct trapframe *tf = l->l_md.md_regs;
 15.2276 +	__greg_t *gr = mcp->__gregs;
 15.2277 +	__greg_t ras_eip;
 15.2278 +
 15.2279 +	/* Save register context. */
 15.2280 +#ifdef VM86
 15.2281 +	if (tf->tf_eflags & PSL_VM) {
 15.2282 +		gr[_REG_GS]  = tf->tf_vm86_gs;
 15.2283 +		gr[_REG_FS]  = tf->tf_vm86_fs;
 15.2284 +		gr[_REG_ES]  = tf->tf_vm86_es;
 15.2285 +		gr[_REG_DS]  = tf->tf_vm86_ds;
 15.2286 +		gr[_REG_EFL] = get_vflags(l);
 15.2287 +	} else
 15.2288 +#endif
 15.2289 +	{
 15.2290 +		gr[_REG_GS]  = tf->tf_gs;
 15.2291 +		gr[_REG_FS]  = tf->tf_fs;
 15.2292 +		gr[_REG_ES]  = tf->tf_es;
 15.2293 +		gr[_REG_DS]  = tf->tf_ds;
 15.2294 +		gr[_REG_EFL] = tf->tf_eflags;
 15.2295 +	}
 15.2296 +	gr[_REG_EDI]    = tf->tf_edi;
 15.2297 +	gr[_REG_ESI]    = tf->tf_esi;
 15.2298 +	gr[_REG_EBP]    = tf->tf_ebp;
 15.2299 +	gr[_REG_EBX]    = tf->tf_ebx;
 15.2300 +	gr[_REG_EDX]    = tf->tf_edx;
 15.2301 +	gr[_REG_ECX]    = tf->tf_ecx;
 15.2302 +	gr[_REG_EAX]    = tf->tf_eax;
 15.2303 +	gr[_REG_EIP]    = tf->tf_eip;
 15.2304 +	gr[_REG_CS]     = tf->tf_cs;
 15.2305 +	gr[_REG_ESP]    = tf->tf_esp;
 15.2306 +	gr[_REG_UESP]   = tf->tf_esp;
 15.2307 +	gr[_REG_SS]     = tf->tf_ss;
 15.2308 +	gr[_REG_TRAPNO] = tf->tf_trapno;
 15.2309 +	gr[_REG_ERR]    = tf->tf_err;
 15.2310 +
 15.2311 +	if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
 15.2312 +	    (caddr_t) gr[_REG_EIP])) != -1)
 15.2313 +		gr[_REG_EIP] = ras_eip;
 15.2314 +
 15.2315 +	*flags |= _UC_CPU;
 15.2316 +
 15.2317 +	/* Save floating point register context, if any. */
 15.2318 +	if ((l->l_md.md_flags & MDL_USEDFPU) != 0) {
 15.2319 +#if NNPX > 0
 15.2320 +		/*
 15.2321 +		 * If this process is the current FP owner, dump its
 15.2322 +		 * context to the PCB first.
 15.2323 +		 * XXX npxsave() also clears the FPU state; depending on the
 15.2324 +		 * XXX application this might be a penalty.
 15.2325 +		 */
 15.2326 +		if (l->l_addr->u_pcb.pcb_fpcpu) {
 15.2327 +			npxsave_lwp(l, 1);
 15.2328 +		}
 15.2329 +#endif
 15.2330 +		if (i386_use_fxsave) {
 15.2331 +			memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
 15.2332 +			    &l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
 15.2333 +			    sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm));
 15.2334 +			*flags |= _UC_FXSAVE;
 15.2335 +		} else {
 15.2336 +			memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
 15.2337 +			    &l->l_addr->u_pcb.pcb_savefpu.sv_87,
 15.2338 +			    sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state));
 15.2339 +		}
 15.2340 +#if 0
 15.2341 +		/* Apparently nothing ever touches this. */
 15.2342 +		ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc;
 15.2343 +#endif
 15.2344 +		*flags |= _UC_FPU;
 15.2345 +	}
 15.2346 +}
 15.2347 +
 15.2348 +int
 15.2349 +cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
 15.2350 +{
 15.2351 +	struct trapframe *tf = l->l_md.md_regs;
 15.2352 +	const __greg_t *gr = mcp->__gregs;
 15.2353 +
 15.2354 +	/* Restore register context, if any. */
 15.2355 +	if ((flags & _UC_CPU) != 0) {
 15.2356 +#ifdef VM86
 15.2357 +		if (gr[_REG_EFL] & PSL_VM) {
 15.2358 +			tf->tf_vm86_gs = gr[_REG_GS];
 15.2359 +			tf->tf_vm86_fs = gr[_REG_FS];
 15.2360 +			tf->tf_vm86_es = gr[_REG_ES];
 15.2361 +			tf->tf_vm86_ds = gr[_REG_DS];
 15.2362 +			set_vflags(l, gr[_REG_EFL]);
 15.2363 +			if (flags & _UC_VM) {
 15.2364 +				void syscall_vm86(struct trapframe *);
 15.2365 +				l->l_proc->p_md.md_syscall = syscall_vm86;
 15.2366 +			}
 15.2367 +		} else
 15.2368 +#endif
 15.2369 +		{
 15.2370 +			/*
 15.2371 +			 * Check for security violations.  If we're returning
 15.2372 +			 * to protected mode, the CPU will validate the segment
 15.2373 +			 * registers automatically and generate a trap on
 15.2374 +			 * violations.  We handle the trap, rather than doing
 15.2375 +			 * all of the checking here.
 15.2376 +			 */
 15.2377 +			if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
 15.2378 +			    !USERMODE(gr[_REG_CS], gr[_REG_EFL])) {
 15.2379 +				printf("cpu_setmcontext error: uc EFL: 0x%08x"
 15.2380 +				    " tf EFL: 0x%08x uc CS: 0x%x\n",
 15.2381 +				    gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]);
 15.2382 +				return (EINVAL);
 15.2383 +			}
 15.2384 +			tf->tf_gs = gr[_REG_GS];
 15.2385 +			tf->tf_fs = gr[_REG_FS];
 15.2386 +			tf->tf_es = gr[_REG_ES];
 15.2387 +			tf->tf_ds = gr[_REG_DS];
 15.2388 +			/* Only change the user-alterable part of eflags */
 15.2389 +			tf->tf_eflags &= ~PSL_USER;
 15.2390 +			tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
 15.2391 +		}
 15.2392 +		tf->tf_edi    = gr[_REG_EDI];
 15.2393 +		tf->tf_esi    = gr[_REG_ESI];
 15.2394 +		tf->tf_ebp    = gr[_REG_EBP];
 15.2395 +		tf->tf_ebx    = gr[_REG_EBX];
 15.2396 +		tf->tf_edx    = gr[_REG_EDX];
 15.2397 +		tf->tf_ecx    = gr[_REG_ECX];
 15.2398 +		tf->tf_eax    = gr[_REG_EAX];
 15.2399 +		tf->tf_eip    = gr[_REG_EIP];
 15.2400 +		tf->tf_cs     = gr[_REG_CS];
 15.2401 +		tf->tf_esp    = gr[_REG_UESP];
 15.2402 +		tf->tf_ss     = gr[_REG_SS];
 15.2403 +	}
 15.2404 +
 15.2405 +	/* Restore floating point register context, if any. */
 15.2406 +	if ((flags & _UC_FPU) != 0) {
 15.2407 +#if NNPX > 0
 15.2408 +		/*
 15.2409 +		 * If we were using the FPU, forget that we were.
 15.2410 +		 */
 15.2411 +		if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
 15.2412 +			npxsave_lwp(l, 0);
 15.2413 +#endif
 15.2414 +		if (flags & _UC_FXSAVE) {
 15.2415 +			if (i386_use_fxsave) {
 15.2416 +				memcpy(
 15.2417 +					&l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
 15.2418 +					&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
 15.2419 +					sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_xmm));
 15.2420 +			} else {
 15.2421 +				/* This is a weird corner case */
 15.2422 +				process_xmm_to_s87((struct savexmm *)
 15.2423 +				    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
 15.2424 +				    &l->l_addr->u_pcb.pcb_savefpu.sv_87);
 15.2425 +			}
 15.2426 +		} else {
 15.2427 +			if (i386_use_fxsave) {
 15.2428 +				process_s87_to_xmm((struct save87 *)
 15.2429 +				    &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
 15.2430 +				    &l->l_addr->u_pcb.pcb_savefpu.sv_xmm);
 15.2431 +			} else {
 15.2432 +				memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87,
 15.2433 +				    &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
 15.2434 +				    sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87));
 15.2435 +			}
 15.2436 +		}
 15.2437 +		/* If not set already. */
 15.2438 +		l->l_md.md_flags |= MDL_USEDFPU;
 15.2439 +#if 0
 15.2440 +		/* Apparently unused. */
 15.2441 +		l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts;
 15.2442 +#endif
 15.2443 +	}
 15.2444 +	if (flags & _UC_SETSTACK)
 15.2445 +		l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
 15.2446 +	if (flags & _UC_CLRSTACK)
 15.2447 +		l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK;
 15.2448 +	return (0);
 15.2449 +}
 15.2450 +
 15.2451 +void
 15.2452 +cpu_initclocks()
 15.2453 +{
 15.2454 +	(*initclock_func)();
 15.2455 +}
 15.2456 +
 15.2457 +#ifdef MULTIPROCESSOR
 15.2458 +void
 15.2459 +need_resched(struct cpu_info *ci)
 15.2460 +{
 15.2461 +
 15.2462 +	if (ci->ci_want_resched)
 15.2463 +		return;
 15.2464 +
 15.2465 +	ci->ci_want_resched = 1;
 15.2466 +	if (ci->ci_curlwp != NULL)
 15.2467 +		aston(ci->ci_curlwp->l_proc);
 15.2468 +	else if (ci != curcpu())
 15.2469 +		x86_send_ipi(ci, 0);
 15.2470 +}
 15.2471 +#endif
 15.2472 +
 15.2473 +/*
 15.2474 + * Allocate an IDT vector slot within the given range.
 15.2475 + * XXX needs locking to avoid MP allocation races.
 15.2476 + */
 15.2477 +
 15.2478 +int
 15.2479 +idt_vec_alloc(int low, int high)
 15.2480 +{
 15.2481 +	int vec;
 15.2482 +
 15.2483 +	simple_lock(&idt_lock);
 15.2484 +	for (vec = low; vec <= high; vec++) {
 15.2485 +		if (idt_allocmap[vec] == 0) {
 15.2486 +			idt_allocmap[vec] = 1;
 15.2487 +			simple_unlock(&idt_lock);
 15.2488 +			return vec;
 15.2489 +		}
 15.2490 +	}
 15.2491 +	simple_unlock(&idt_lock);
 15.2492 +	return 0;
 15.2493 +}
 15.2494 +
 15.2495 +void
 15.2496 +idt_vec_set(int vec, void (*function)(void))
 15.2497 +{
 15.2498 +	/*
 15.2499 +	 * Vector should be allocated, so no locking needed.
 15.2500 +	 */
 15.2501 +	KASSERT(idt_allocmap[vec] == 1);
 15.2502 +	setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
 15.2503 +	    GSEL(GCODE_SEL, SEL_KPL));
 15.2504 +}
 15.2505 +
 15.2506 +void
 15.2507 +idt_vec_free(int vec)
 15.2508 +{
 15.2509 +	simple_lock(&idt_lock);
 15.2510 +	unsetgate(&idt[vec]);
 15.2511 +	idt_allocmap[vec] = 0;
 15.2512 +	simple_unlock(&idt_lock);
 15.2513 +}
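         +
         +/*
         + * The three routines above are used as a unit; roughly (a sketch
         + * only -- the vector range and handler name here are made up):
         + *
         + *	vec = idt_vec_alloc(0x20, 0xef);
         + *	if (vec == 0)
         + *		... no free slot, fail the attach ...
         + *	idt_vec_set(vec, my_intr_stub);
         + *	...
         + *	idt_vec_free(vec);	... on detach ...
         + */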
 15.2514 +
 15.2515 +/*
 15.2516 + * Number of processes is limited by number of available GDT slots.
 15.2517 + */
 15.2518 +int
 15.2519 +cpu_maxproc(void)
 15.2520 +{
 15.2521 +#ifdef USER_LDT
 15.2522 +	return ((MAXGDTSIZ - NGDT) / 2);
 15.2523 +#else
 15.2524 +	return (MAXGDTSIZ - NGDT);
 15.2525 +#endif
 15.2526 +}
 15.2527 +
 15.2528 +#if defined(DDB) || defined(KGDB)
 15.2529 +
 15.2530 +/* 
 15.2531 + * Callback to output a backtrace when entering ddb.
 15.2532 + */
 15.2533 +void
 15.2534 +ddb_trap_hook(int where)
 15.2535 +{
 15.2536 +	static int once = 0;
 15.2537 +	db_addr_t db_dot;
 15.2538 +
 15.2539 +	if (once != 0 || where != 1)
 15.2540 +		return;
 15.2541 +	once = 1;
 15.2542 +
 15.2543 +	if (curlwp != NULL) {
 15.2544 +		db_printf("Stopped");
 15.2545 +		if (curproc == NULL)
 15.2546 +			db_printf("; curlwp = %p,"
 15.2547 +			    " curproc is NULL at\t", curlwp);
 15.2548 +		else
 15.2549 +			db_printf(" in pid %d.%d (%s) at\t", 
 15.2550 +			    curproc->p_pid, curlwp->l_lid,
 15.2551 +			    curproc->p_comm);
 15.2552 +	} else
 15.2553 +		db_printf("Stopped at\t");
 15.2554 +	db_dot = PC_REGS(DDB_REGS);
 15.2555 +	db_print_loc_and_inst(db_dot);
 15.2556 +
 15.2557 +	db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535,
 15.2558 +	    "", db_printf);
 15.2559 +#ifdef DEBUG
 15.2560 +	db_show_regs((db_expr_t) db_dot, FALSE, 65535, "");
 15.2561 +#endif
 15.2562 +}
 15.2563 +
 15.2564 +#endif /* DDB || KGDB */
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c	Mon Sep 06 19:11:17 2004 +0000
    16.3 @@ -0,0 +1,4522 @@
    16.4 +/*	$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $	*/
    16.5 +/*	NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp 	*/
    16.6 +
    16.7 +/*
    16.8 + *
    16.9 + * Copyright (c) 1997 Charles D. Cranor and Washington University.
   16.10 + * All rights reserved.
   16.11 + *
   16.12 + * Redistribution and use in source and binary forms, with or without
   16.13 + * modification, are permitted provided that the following conditions
   16.14 + * are met:
   16.15 + * 1. Redistributions of source code must retain the above copyright
   16.16 + *    notice, this list of conditions and the following disclaimer.
   16.17 + * 2. Redistributions in binary form must reproduce the above copyright
   16.18 + *    notice, this list of conditions and the following disclaimer in the
   16.19 + *    documentation and/or other materials provided with the distribution.
   16.20 + * 3. All advertising materials mentioning features or use of this software
   16.21 + *    must display the following acknowledgement:
   16.22 + *      This product includes software developed by Charles D. Cranor and
   16.23 + *      Washington University.
   16.24 + * 4. The name of the author may not be used to endorse or promote products
   16.25 + *    derived from this software without specific prior written permission.
   16.26 + *
   16.27 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   16.28 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   16.29 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   16.30 + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   16.31 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   16.32 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   16.33 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   16.34 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   16.35 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   16.36 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   16.37 + */
   16.38 +
   16.39 +/*
   16.40 + * pmap.c: i386 pmap module rewrite
   16.41 + * Chuck Cranor <chuck@ccrc.wustl.edu>
   16.42 + * 11-Aug-97
   16.43 + *
   16.44 + * history of this pmap module: in addition to my own input, i used
   16.45 + *    the following references for this rewrite of the i386 pmap:
   16.46 + *
   16.47 + * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
   16.48 + *     BSD hp300 pmap done by Mike Hibler at University of Utah.
   16.49 + *     it was then ported to the i386 by William Jolitz of UUNET
   16.50 + *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
   16.51 + *     project fixed some bugs and provided some speed ups.
   16.52 + *
   16.53 + * [2] the FreeBSD i386 pmap.   this pmap seems to be the
   16.54 + *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
   16.55 + *     and David Greenman.
   16.56 + *
   16.57 + * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
   16.58 + *     between several processors.   the VAX version was done by
   16.59 + *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
   16.60 + *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
   16.61 + *     David Golub, and Richard Draves.    the alpha version was
   16.62 + *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
   16.63 + *     (NetBSD/alpha).
   16.64 + */
   16.65 +
   16.66 +#include <sys/cdefs.h>
   16.67 +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $");
   16.68 +
   16.69 +#include "opt_cputype.h"
   16.70 +#include "opt_user_ldt.h"
   16.71 +#include "opt_largepages.h"
   16.72 +#include "opt_lockdebug.h"
   16.73 +#include "opt_multiprocessor.h"
   16.74 +#include "opt_kstack_dr0.h"
   16.75 +#include "opt_xen.h"
   16.76 +
   16.77 +#include <sys/param.h>
   16.78 +#include <sys/systm.h>
   16.79 +#include <sys/proc.h>
   16.80 +#include <sys/malloc.h>
   16.81 +#include <sys/pool.h>
   16.82 +#include <sys/user.h>
   16.83 +#include <sys/kernel.h>
   16.84 +
   16.85 +#include <uvm/uvm.h>
   16.86 +
   16.87 +#include <machine/atomic.h>
   16.88 +#include <machine/cpu.h>
   16.89 +#include <machine/specialreg.h>
   16.90 +#include <machine/gdt.h>
   16.91 +
   16.92 +#include <dev/isa/isareg.h>
   16.93 +#include <machine/isa_machdep.h>
   16.94 +
   16.95 +#include <machine/xen.h>
   16.96 +#include <machine/hypervisor.h>
   16.97 +#include <machine/xenpmap.h>
   16.98 +
   16.99 +void xpmap_find_pte(paddr_t);
  16.100 +
  16.101 +/* #define XENDEBUG */
  16.102 +
  16.103 +#ifdef XENDEBUG
  16.104 +#define	XENPRINTF(x) printf x
  16.105 +#define	XENPRINTK(x) printf x
  16.106 +#else
  16.107 +#define	XENPRINTF(x)
  16.108 +#define	XENPRINTK(x)
  16.109 +#endif
  16.110 +#define	PRINTF(x) printf x
  16.111 +#define	PRINTK(x) printf x
  16.112 +
  16.113 +
  16.114 +/*
  16.115 + * general info:
  16.116 + *
  16.117 + *  - for an explanation of how the i386 MMU hardware works see
  16.118 + *    the comments in <machine/pte.h>.
  16.119 + *
  16.120 + *  - for an explanation of the general memory structure used by
  16.121 + *    this pmap (including the recursive mapping), see the comments
  16.122 + *    in <machine/pmap.h>.
  16.123 + *
  16.124 + * this file contains the code for the "pmap module."   the module's
  16.125 + * job is to manage the hardware's virtual to physical address mappings.
  16.126 + * note that there are two levels of mapping in the VM system:
  16.127 + *
  16.128 + *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
  16.129 + *      to map ranges of virtual address space to objects/files.  for
  16.130 + *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
  16.131 + *      to the file /bin/ls starting at offset zero."   note that
  16.132 + *      the upper layer mapping is not concerned with how individual
  16.133 + *      vm_pages are mapped.
  16.134 + *
  16.135 + *  [2] the lower layer of the VM system (the pmap) maintains the mappings
  16.136 + *      from virtual addresses.   it is concerned with which vm_page is
  16.137 + *      mapped where.   for example, when you run /bin/ls and start
  16.138 + *      at page 0x1000 the fault routine may lookup the correct page
  16.139 + *      of the /bin/ls file and then ask the pmap layer to establish
  16.140 + *      a mapping for it.
  16.141 + *
  16.142 + * note that information in the lower layer of the VM system can be
  16.143 + * thrown away since it can easily be reconstructed from the info
  16.144 + * in the upper layer.
  16.145 + *
  16.146 + * data structures we use include:
  16.147 + *
  16.148 + *  - struct pmap: describes the address space of one thread
  16.149 + *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
  16.150 + *  - struct pv_head: there is one pv_head per managed page of
  16.151 + *	physical memory.   the pv_head points to a list of pv_entry
  16.152 + *	structures which describe all the <PMAP,VA> pairs that this
  16.153 + *      page is mapped in.    this is critical for page based operations
  16.154 + *      such as pmap_page_protect() [change protection on _all_ mappings
  16.155 + *      of a page]
  16.156 + *  - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's.
  16.157 + *      if we run out of pv_entry's we allocate a new pv_page and free
  16.158 + *      its pv_entrys.
  16.159 + * - pmap_remove_record: a list of virtual addresses whose mappings
  16.160 + *	have been changed.   used for TLB flushing.
  16.161 + */
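         +
         +/*
         + * to make the pv structures concrete: a physical page mapped both by
         + * the kernel and by one user process carries two pv_entry's, e.g.
         + * (values illustrative only):
         + *
         + *	pv_head for PA
         + *	  -> { pv_pmap = pmap_kernel(), pv_va = 0xc0400000 }
         + *	  -> { pv_pmap = <user pmap>,   pv_va = 0x08048000 }
         + *
         + * pmap_page_protect() visits both entries to reach every mapping of
         + * the page.
         + */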
  16.162 +
  16.163 +/*
  16.164 + * memory allocation
  16.165 + *
  16.166 + *  - there are three data structures that we must dynamically allocate:
  16.167 + *
  16.168 + * [A] new process' page directory page (PDP)
  16.169 + *	- plan 1: done at pmap_create() we use
  16.170 + *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
  16.171 + *	  allocation.
  16.172 + *
  16.173 + * if we are low in free physical memory then we sleep in
  16.174 + * uvm_km_alloc -- in this case this is ok since we are creating
  16.175 + * a new pmap and should not be holding any locks.
  16.176 + *
  16.177 + * if the kernel is totally out of virtual space
  16.178 + * (i.e. uvm_km_alloc returns NULL), then we panic.
  16.179 + *
  16.180 + * XXX: the fork code currently has no way to return an "out of
  16.181 + * memory, try again" error code since uvm_fork [fka vm_fork]
  16.182 + * is a void function.
  16.183 + *
  16.184 + * [B] new page tables pages (PTP)
  16.185 + * 	- call uvm_pagealloc()
  16.186 + * 		=> success: zero page, add to pm_pdir
  16.187 + * 		=> failure: we are out of free vm_pages, let pmap_enter()
  16.188 + *		   tell UVM about it.
  16.189 + *
  16.190 + * note: for kernel PTPs, we start with NKPTP of them.   as we map
  16.191 + * kernel memory (at uvm_map time) we check to see if we've grown
  16.192 + * the kernel pmap.   if so, we call the optional function
  16.193 + * pmap_growkernel() to grow the kernel PTPs in advance.
  16.194 + *
  16.195 + * [C] pv_entry structures
  16.196 + *	- plan 1: try to allocate one off the free list
  16.197 + *		=> success: done!
  16.198 + *		=> failure: no more free pv_entrys on the list
  16.199 + *	- plan 2: try to allocate a new pv_page to add a chunk of
  16.200 + *	pv_entrys to the free list
  16.201 + *		[a] obtain a free, unmapped, VA in kmem_map.  either
  16.202 + *		we have one saved from a previous call, or we allocate
  16.203 + *		one now using a "vm_map_lock_try" in uvm_map
  16.204 + *		=> success: we have an unmapped VA, continue to [b]
  16.205 + *		=> failure: unable to lock kmem_map or out of VA in it.
  16.206 + *			move on to plan 3.
  16.207 + *		[b] allocate a page in kmem_object for the VA
  16.208 + *		=> success: map it in, free the pv_entry's, DONE!
  16.209 + *		=> failure: kmem_object locked, no free vm_pages, etc.
  16.210 + *			save VA for later call to [a], go to plan 3.
  16.211 + *	If we fail, we simply let pmap_enter() tell UVM about it.
  16.212 + */
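         +
         +/*
         + * the pv_entry fallback chain above, compressed into pseudocode (a
         + * sketch only; the real logic is in pmap_alloc_pv/pmap_alloc_pvpage
         + * below, which also handle locking and the ALLOCPV_* modes):
         + *
         + *	pv = take one from pv_freepages;		(plan 1)
         + *	if (pv == NULL)
         + *		pv = map a new pv_page and carve it up;	(plan 2)
         + *	if (pv == NULL)
         + *		let pmap_enter() report the shortage to UVM;
         + */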
  16.213 +
  16.214 +/*
  16.215 + * locking
  16.216 + *
  16.217 + * we have the following locks that we must contend with:
  16.218 + *
  16.219 + * "normal" locks:
  16.220 + *
  16.221 + *  - pmap_main_lock
  16.222 + *    this lock is used to prevent deadlock and/or provide mutex
  16.223 + *    access to the pmap system.   most operations lock the pmap
  16.224 + *    structure first, then they lock the pv_lists (if needed).
  16.225 + *    however, some operations such as pmap_page_protect lock
  16.226 + *    the pv_lists and then lock pmaps.   in order to prevent a
  16.227 + *    cycle, we require a mutex lock when locking the pv_lists
  16.228 + *    first.   thus, the "pmap => pv_list" lockers must gain a
  16.229 + *    read-lock on pmap_main_lock before locking the pmap.   and
  16.230 + *    the "pv_list => pmap" lockers must gain a write-lock on
  16.231 + *    pmap_main_lock before locking.    since only one thread
  16.232 + *    can write-lock a lock at a time, this provides mutex.
  16.233 + *
  16.234 + * "simple" locks:
  16.235 + *
  16.236 + * - pmap lock (per pmap, part of uvm_object)
  16.237 + *   this lock protects the fields in the pmap structure including
  16.238 + *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
  16.239 + *   in the alternate PTE space (since that is determined by the
  16.240 + *   entry in the PDP).
  16.241 + *
  16.242 + * - pvh_lock (per pv_head)
  16.243 + *   this lock protects the pv_entry list which is chained off the
  16.244 + *   pv_head structure for a specific managed PA.   it is locked
  16.245 + *   when traversing the list (e.g. adding/removing mappings,
  16.246 + *   syncing R/M bits, etc.)
  16.247 + *
  16.248 + * - pvalloc_lock
  16.249 + *   this lock protects the data structures which are used to manage
  16.250 + *   the free list of pv_entry structures.
  16.251 + *
  16.252 + * - pmaps_lock
  16.253 + *   this lock protects the list of active pmaps (headed by "pmaps").
  16.254 + *   we lock it when adding or removing pmaps from this list.
  16.255 + *
  16.256 + */
  16.257 +
  16.258 +/*
  16.259 + * locking data structures
  16.260 + */
  16.261 +
  16.262 +static struct simplelock pvalloc_lock;
  16.263 +static struct simplelock pmaps_lock;
  16.264 +
  16.265 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
  16.266 +static struct lock pmap_main_lock;
  16.267 +
  16.268 +#define PMAP_MAP_TO_HEAD_LOCK() \
  16.269 +     (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL)
  16.270 +#define PMAP_MAP_TO_HEAD_UNLOCK() \
  16.271 +     (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL)
  16.272 +
  16.273 +#define PMAP_HEAD_TO_MAP_LOCK() \
  16.274 +     (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL)
  16.275 +#define PMAP_HEAD_TO_MAP_UNLOCK() \
  16.276 +     spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0)
  16.277 +
  16.278 +#else
  16.279 +
  16.280 +#define PMAP_MAP_TO_HEAD_LOCK()		/* null */
  16.281 +#define PMAP_MAP_TO_HEAD_UNLOCK()	/* null */
  16.282 +
  16.283 +#define PMAP_HEAD_TO_MAP_LOCK()		/* null */
  16.284 +#define PMAP_HEAD_TO_MAP_UNLOCK()	/* null */
  16.285 +
  16.286 +#endif
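         +
         +/*
         + * illustration of the ordering discipline above (a sketch; "pvh"
         + * stands for some struct pv_head, with its spinlock member name
         + * abridged):
         + *
         + *	PMAP_HEAD_TO_MAP_LOCK();		(pv_list => pmap)
         + *	simple_lock(&pvh->pvh_lock);
         + *	... walk the pv list, locking pmaps as needed ...
         + *	simple_unlock(&pvh->pvh_lock);
         + *	PMAP_HEAD_TO_MAP_UNLOCK();
         + */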
  16.287 +
  16.288 +#define COUNT(x)	/* nothing */
  16.289 +
  16.290 +/*
  16.291 + * TLB Shootdown:
  16.292 + *
  16.293 + * When a mapping is changed in a pmap, the TLB entry corresponding to
  16.294 + * the virtual address must be invalidated on all processors.  In order
  16.295 + * to accomplish this on systems with multiple processors, messages are
  16.296 + * sent from the processor which performs the mapping change to all
  16.297 + * processors on which the pmap is active.  For other processors, the
  16.298 + * ASN generation number for that processor is invalidated, so that
  16.299 + * the next time the pmap is activated on that processor, a new ASN
  16.300 + * will be allocated (which implicitly invalidates all TLB entries).
  16.301 + *
  16.302 + * Shootdown job queue entries are allocated using a simple special-
  16.303 + * purpose allocator for speed.
  16.304 + */
  16.305 +struct pmap_tlb_shootdown_job {
  16.306 +	TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
  16.307 +	vaddr_t pj_va;			/* virtual address */
  16.308 +	pmap_t pj_pmap;			/* the pmap which maps the address */
  16.309 +	pt_entry_t pj_pte;		/* the PTE bits */
  16.310 +	struct pmap_tlb_shootdown_job *pj_nextfree;
  16.311 +};
  16.312 +
  16.313 +#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32
  16.314 +union pmap_tlb_shootdown_job_al {
  16.315 +	struct pmap_tlb_shootdown_job pja_job;
  16.316 +	char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN];
  16.317 +};
  16.318 +
  16.319 +struct pmap_tlb_shootdown_q {
  16.320 +	TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;
  16.321 +	int pq_pte;			/* aggregate PTE bits */
  16.322 +	int pq_count;			/* number of pending requests */
  16.323 +	__cpu_simple_lock_t pq_slock;	/* spin lock on queue */
  16.324 +	int pq_flushg;		/* pending flush global */
  16.325 +	int pq_flushu;		/* pending flush user */
  16.326 +} pmap_tlb_shootdown_q[X86_MAXPROCS];
  16.327 +
  16.328 +#define	PMAP_TLB_MAXJOBS	16
  16.329 +
  16.330 +void	pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *);
  16.331 +struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
  16.332 +	   (struct pmap_tlb_shootdown_q *);
  16.333 +void	pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *,
  16.334 +	    struct pmap_tlb_shootdown_job *);
  16.335 +
  16.336 +__cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
  16.337 +union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
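         +
         +/*
         + * pj_page is a single page carved into aligned job slots threaded
         + * onto pj_free.  Roughly (a sketch; the real initialization happens
         + * later, at pmap_init() time):
         + *
         + *	for (i = 0; i < PAGE_SIZE / sizeof(*pj_page) - 1; i++)
         + *		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
         + *	pj_page[i].pja_job.pj_nextfree = NULL;
         + *	pj_free = &pj_page[0];
         + */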
  16.338 +
  16.339 +/*
  16.340 + * global data structures
  16.341 + */
  16.342 +
  16.343 +struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
  16.344 +
  16.345 +/*
  16.346 + * nkpde is the number of kernel PTPs allocated for the kernel at
  16.347 + * boot time (NKPTP is a compile time override).   this number can
  16.348 + * grow dynamically as needed (but once allocated, we never free
  16.349 + * kernel PTPs).
  16.350 + */
  16.351 +
  16.352 +int nkpde = NKPTP;
  16.353 +#ifdef NKPDE
  16.354 +#error "obsolete NKPDE: use NKPTP"
  16.355 +#endif
  16.356 +
  16.357 +/*
  16.358 + * pmap_pg_g: if our processor supports PG_G in the PTE then we
  16.359 + * set pmap_pg_g to PG_G (otherwise it is zero).
  16.360 + */
  16.361 +
  16.362 +int pmap_pg_g = 0;
  16.363 +
  16.364 +#ifdef LARGEPAGES
  16.365 +/*
  16.366 + * pmap_largepages: if our processor supports PG_PS and we are
  16.367 + * using it, this is set to TRUE.
  16.368 + */
  16.369 +
  16.370 +int pmap_largepages;
  16.371 +#endif
  16.372 +
  16.373 +/*
  16.374 + * i386 physical memory comes in a big contig chunk with a small
  16.375 + * hole toward the front of it...  the following two paddr_t's
  16.376 + * (shared with machdep.c) describe the physical address space
  16.377 + * of this machine.
  16.378 + */
  16.379 +paddr_t avail_start;	/* PA of first available physical page */
  16.380 +paddr_t avail_end;	/* PA of last available physical page */
  16.381 +
  16.382 +paddr_t pmap_pa_start;	/* PA of first physical page for this domain */
  16.383 +paddr_t pmap_pa_end;	/* PA of last physical page for this domain */
  16.384 +
  16.385 +	/* MA of last physical page of the machine */
  16.386 +paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */
  16.387 +
  16.388 +/*
  16.389 + * other data structures
  16.390 + */
  16.391 +
  16.392 +static pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
  16.393 +static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
  16.394 +
  16.395 +/*
  16.396 + * the following two vaddr_t's are used during system startup
  16.397 + * to keep track of how much of the kernel's VM space we have used.
  16.398 + * once the system is started, the management of the remaining kernel
  16.399 + * VM space is turned over to the kernel_map vm_map.
  16.400 + */
  16.401 +
  16.402 +static vaddr_t virtual_avail;	/* VA of first free KVA */
  16.403 +static vaddr_t virtual_end;	/* VA of last free KVA */
  16.404 +
  16.405 +
  16.406 +/*
  16.407 + * pv_page management structures: locked by pvalloc_lock
  16.408 + */
  16.409 +
  16.410 +TAILQ_HEAD(pv_pagelist, pv_page);
  16.411 +static struct pv_pagelist pv_freepages;	/* list of pv_pages with free entrys */
  16.412 +static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */
  16.413 +static int pv_nfpvents;			/* # of free pv entries */
  16.414 +static struct pv_page *pv_initpage;	/* bootstrap page from kernel_map */
  16.415 +static vaddr_t pv_cachedva;		/* cached VA for later use */
  16.416 +
  16.417 +#define PVE_LOWAT (PVE_PER_PVPAGE / 2)	/* free pv_entry low water mark */
  16.418 +#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
  16.419 +					/* high water mark */
  16.420 +
  16.421 +static __inline int
  16.422 +pv_compare(struct pv_entry *a, struct pv_entry *b)
  16.423 +{
  16.424 +	if (a->pv_pmap < b->pv_pmap)
  16.425 +		return (-1);
  16.426 +	else if (a->pv_pmap > b->pv_pmap)
  16.427 +		return (1);
  16.428 +	else if (a->pv_va < b->pv_va)
  16.429 +		return (-1);
  16.430 +	else if (a->pv_va > b->pv_va)
  16.431 +		return (1);
  16.432 +	else
  16.433 +		return (0);
  16.434 +}
  16.435 +
  16.436 +SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
  16.437 +SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
  16.438 +
  16.439 +/*
  16.440 + * linked list of all non-kernel pmaps
  16.441 + */
  16.442 +
  16.443 +static struct pmap_head pmaps;
  16.444 +
  16.445 +/*
  16.446 + * pool that pmap structures are allocated from
  16.447 + */
  16.448 +
  16.449 +struct pool pmap_pmap_pool;
  16.450 +
  16.451 +/*
  16.452 + * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
  16.453 + * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing
  16.454 + * due to false sharing.
  16.455 + */
  16.456 +
  16.457 +#ifdef MULTIPROCESSOR
  16.458 +#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
  16.459 +#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
  16.460 +#else
  16.461 +#define PTESLEW(pte, id) (pte)
  16.462 +#define VASLEW(va,id) (va)
  16.463 +#endif
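         +
         +/*
         + * example of the slewing arithmetic (illustrative numbers): with
         + * NPTECL == 8, CPU 2 uses VASLEW(csrcp, 2) == csrcp + 2*8*PAGE_SIZE,
         + * mapped by PTESLEW(csrc_pte, 2) == csrc_pte + 16, so each CPU's
         + * four special PTEs sit in a cache line of their own.
         + */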
  16.464 +
  16.465 +/*
  16.466 + * special VAs and the PTEs that map them
  16.467 + */
  16.468 +static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte;
  16.469 +static caddr_t csrcp, cdstp, zerop, ptpp;
  16.470 +
  16.471 +/*
  16.472 + * pool and cache that PDPs are allocated from
  16.473 + */
  16.474 +
  16.475 +struct pool pmap_pdp_pool;
  16.476 +struct pool_cache pmap_pdp_cache;
  16.477 +u_int pmap_pdp_cache_generation;
  16.478 +
  16.479 +int	pmap_pdp_ctor(void *, void *, int);
  16.480 +void	pmap_pdp_dtor(void *, void *);
  16.481 +
  16.482 +caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
  16.483 +
  16.484 +extern vaddr_t msgbuf_vaddr;
  16.485 +extern paddr_t msgbuf_paddr;
  16.486 +
  16.487 +extern vaddr_t idt_vaddr;			/* we allocate IDT early */
  16.488 +extern paddr_t idt_paddr;
  16.489 +
  16.490 +#if defined(I586_CPU)
  16.491 +/* stuff to fix the pentium f00f bug */
  16.492 +extern vaddr_t pentium_idt_vaddr;
  16.493 +#endif
  16.494 +
  16.495 +
  16.496 +/*
  16.497 + * local prototypes
  16.498 + */
  16.499 +
  16.500 +static struct pv_entry	*pmap_add_pvpage(struct pv_page *, boolean_t);
  16.501 +static struct vm_page	*pmap_alloc_ptp(struct pmap *, int);
  16.502 +static struct pv_entry	*pmap_alloc_pv(struct pmap *, int); /* see codes below */
  16.503 +#define ALLOCPV_NEED	0	/* need PV now */
  16.504 +#define ALLOCPV_TRY	1	/* just try to allocate, don't steal */
  16.505 +#define ALLOCPV_NONEED	2	/* don't need PV, just growing cache */
  16.506 +static struct pv_entry	*pmap_alloc_pvpage(struct pmap *, int);
  16.507 +static void		 pmap_enter_pv(struct pv_head *,
  16.508 +				       struct pv_entry *, struct pmap *,
  16.509 +				       vaddr_t, struct vm_page *);
  16.510 +static void		 pmap_free_pv(struct pmap *, struct pv_entry *);
  16.511 +static void		 pmap_free_pvs(struct pmap *, struct pv_entry *);
  16.512 +static void		 pmap_free_pv_doit(struct pv_entry *);
  16.513 +static void		 pmap_free_pvpage(void);
  16.514 +static struct vm_page	*pmap_get_ptp(struct pmap *, int);
  16.515 +static boolean_t	 pmap_is_curpmap(struct pmap *);
  16.516 +static boolean_t	 pmap_is_active(struct pmap *, int);
  16.517 +static pt_entry_t	*pmap_map_ptes(struct pmap *);
  16.518 +static struct pv_entry	*pmap_remove_pv(struct pv_head *, struct pmap *,
  16.519 +					vaddr_t);
  16.520 +static void		 pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
  16.521 +static boolean_t	 pmap_remove_pte(struct pmap *, struct vm_page *,
  16.522 +					 pt_entry_t *, vaddr_t, int32_t *, int);
  16.523 +static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
  16.524 +					  vaddr_t, vaddr_t, vaddr_t, int32_t *,
  16.525 +					  int);
  16.526 +#define PMAP_REMOVE_ALL		0	/* remove all mappings */
  16.527 +#define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
  16.528 +
  16.529 +static vaddr_t		 pmap_tmpmap_pa(paddr_t);
  16.530 +static pt_entry_t	*pmap_tmpmap_pvepte(struct pv_entry *);
  16.531 +static void		 pmap_tmpunmap_pa(void);
  16.532 +static void		 pmap_tmpunmap_pvepte(struct pv_entry *);
  16.533 +static void		 pmap_unmap_ptes(struct pmap *);
  16.534 +
  16.535 +static boolean_t	 pmap_reactivate(struct pmap *);
  16.536 +
  16.537 +#ifdef DEBUG
  16.538 +u_int	curapdp;
  16.539 +#endif
  16.540 +
  16.541 +/*
  16.542 + * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
  16.543 + */
  16.544 +
  16.545 +/*
  16.546 + * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
  16.547 + *		of course the kernel is always loaded
  16.548 + */
  16.549 +
  16.550 +__inline static boolean_t
  16.551 +pmap_is_curpmap(pmap)
  16.552 +	struct pmap *pmap;
  16.553 +{
  16.554 +
  16.555 +	return((pmap == pmap_kernel()) ||
  16.556 +	       (pmap == curcpu()->ci_pmap));
  16.557 +}
  16.558 +
  16.559 +/*
  16.560 + * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
  16.561 + */
  16.562 +
  16.563 +__inline static boolean_t
  16.564 +pmap_is_active(pmap, cpu_id)
  16.565 +	struct pmap *pmap;
  16.566 +	int cpu_id;
  16.567 +{
  16.568 +
  16.569 +	return (pmap == pmap_kernel() ||
  16.570 +	    (pmap->pm_cpus & (1U << cpu_id)) != 0);
  16.571 +}
  16.572 +
  16.573 +/*
  16.574 + * pmap_tmpmap_pa: map a page in for tmp usage
  16.575 + */
  16.576 +
  16.577 +__inline static vaddr_t
  16.578 +pmap_tmpmap_pa(pa)
  16.579 +	paddr_t pa;
  16.580 +{
  16.581 +#ifdef MULTIPROCESSOR
  16.582 +	int id = cpu_number();
  16.583 +#endif
  16.584 +	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
  16.585 +	pt_entry_t *maptp;
  16.586 +	caddr_t ptpva = VASLEW(ptpp, id);
  16.587 +#if defined(DIAGNOSTIC)
  16.588 +	if (*ptpte)
  16.589 +		panic("pmap_tmpmap_pa: ptp_pte in use?");
  16.590 +#endif
  16.591 +	maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
  16.592 +	PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */
  16.593 +	return((vaddr_t)ptpva);
  16.594 +}
  16.595 +
  16.596 +/*
  16.597 + * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
  16.598 + */
  16.599 +
  16.600 +__inline static void
  16.601 +pmap_tmpunmap_pa()
  16.602 +{
  16.603 +#ifdef MULTIPROCESSOR
  16.604 +	int id = cpu_number();
  16.605 +#endif
  16.606 +	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
  16.607 +	pt_entry_t *maptp;
  16.608 +	caddr_t ptpva = VASLEW(ptpp, id);
  16.609 +#if defined(DIAGNOSTIC)
  16.610 +	if (!pmap_valid_entry(*ptpte))
  16.611 +		panic("pmap_tmpunmap_pa: our pte invalid?");
  16.612 +#endif
  16.613 +	maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
  16.614 +	PTE_CLEAR(ptpte, maptp);		/* zap! */
  16.615 +	pmap_update_pg((vaddr_t)ptpva);
  16.616 +#ifdef MULTIPROCESSOR
  16.617 +	/*
  16.618 +	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
  16.619 +	 */
  16.620 +#endif
  16.621 +}
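         +
         +/*
         + * the pair above is used bracket-style, e.g. to scrub a physical
         + * page that has no permanent mapping (a sketch only):
         + *
         + *	vaddr_t va = pmap_tmpmap_pa(pa);
         + *
         + *	memset((void *)va, 0, PAGE_SIZE);
         + *	pmap_tmpunmap_pa();
         + */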
  16.622 +
  16.623 +/*
  16.624 + * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry
  16.625 + *
  16.626 + * => do NOT use this on kernel mappings [why?  because pv_ptp may be NULL]
  16.627 + */
  16.628 +
  16.629 +__inline static pt_entry_t *
  16.630 +pmap_tmpmap_pvepte(pve)
  16.631 +	struct pv_entry *pve;
  16.632 +{
  16.633 +#ifdef DIAGNOSTIC
  16.634 +	if (pve->pv_pmap == pmap_kernel())
  16.635 +		panic("pmap_tmpmap_pvepte: attempt to map kernel");
  16.636 +#endif
  16.637 +
  16.638 +	/* is it current pmap?  use direct mapping... */
  16.639 +	if (pmap_is_curpmap(pve->pv_pmap))
  16.640 +		return(vtopte(pve->pv_va));
  16.641 +
  16.642 +	return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp)))
  16.643 +	       + ptei((unsigned)pve->pv_va));
  16.644 +}
  16.645 +
  16.646 +/*
  16.647 + * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte
  16.648 + */
  16.649 +
  16.650 +__inline static void
  16.651 +pmap_tmpunmap_pvepte(pve)
  16.652 +	struct pv_entry *pve;
  16.653 +{
  16.654 +	/* was it current pmap?   if so, return */
  16.655 +	if (pmap_is_curpmap(pve->pv_pmap))
  16.656 +		return;
  16.657 +
  16.658 +	pmap_tmpunmap_pa();
  16.659 +}
  16.660 +
  16.661 +__inline static void
  16.662 +pmap_apte_flush(struct pmap *pmap)
  16.663 +{
  16.664 +#if defined(MULTIPROCESSOR)
  16.665 +	struct pmap_tlb_shootdown_q *pq;
  16.666 +	struct cpu_info *ci, *self = curcpu();
  16.667 +	CPU_INFO_ITERATOR cii;
  16.668 +	int s;
  16.669 +#endif
  16.670 +
  16.671 +	tlbflush();		/* flush TLB on current processor */
  16.672 +#if defined(MULTIPROCESSOR)
  16.673 +	/*
  16.674 +	 * Flush the APTE mapping from all other CPUs that
  16.675 +	 * are using the pmap we are using (who's APTE space
  16.676 +	 * is the one we've just modified).
  16.677 +	 *
  16.678 +	 * XXXthorpej -- find a way to defer the IPI.
  16.679 +	 */
  16.680 +	for (CPU_INFO_FOREACH(cii, ci)) {
  16.681 +		if (ci == self)
  16.682 +			continue;
  16.683 +		if (pmap_is_active(pmap, ci->ci_cpuid)) {
  16.684 +			pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
  16.685 +			s = splipi();
  16.686 +			__cpu_simple_lock(&pq->pq_slock);
  16.687 +			pq->pq_flushu++;
  16.688 +			__cpu_simple_unlock(&pq->pq_slock);
  16.689 +			splx(s);
  16.690 +			x86_send_ipi(ci, X86_IPI_TLB);
  16.691 +		}
  16.692 +	}
  16.693 +#endif
  16.694 +}
  16.695 +
  16.696 +/*
  16.697 + * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
  16.698 + *
  16.699 + * => we lock enough pmaps to keep things locked in
  16.700 + * => must be undone with pmap_unmap_ptes before returning
  16.701 + */
  16.702 +
  16.703 +__inline static pt_entry_t *
  16.704 +pmap_map_ptes(pmap)
  16.705 +	struct pmap *pmap;
  16.706 +{
  16.707 +	pd_entry_t opde;
  16.708 +	pd_entry_t *mapdp;
  16.709 +	struct pmap *ourpmap;
  16.710 +	struct cpu_info *ci;
  16.711 +
  16.712 +	/* the kernel's pmap is always accessible */
  16.713 +	if (pmap == pmap_kernel()) {
  16.714 +		return(PTE_BASE);
  16.715 +	}
  16.716 +
  16.717 +	ci = curcpu();
  16.718 +	if (ci->ci_want_pmapload &&
  16.719 +	    vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap)
  16.720 +		pmap_load();
  16.721 +
  16.722 +	/* if curpmap then we are always mapped */
  16.723 +	if (pmap_is_curpmap(pmap)) {
  16.724 +		simple_lock(&pmap->pm_obj.vmobjlock);
  16.725 +		return(PTE_BASE);
  16.726 +	}
  16.727 +
  16.728 +	ourpmap = ci->ci_pmap;
  16.729 +
  16.730 +	/* need to lock both curpmap and pmap: use ordered locking */
  16.731 +	if ((unsigned) pmap < (unsigned) ourpmap) {
  16.732 +		simple_lock(&pmap->pm_obj.vmobjlock);
  16.733 +		simple_lock(&ourpmap->pm_obj.vmobjlock);
  16.734 +	} else {
  16.735 +		simple_lock(&ourpmap->pm_obj.vmobjlock);
  16.736 +		simple_lock(&pmap->pm_obj.vmobjlock);
  16.737 +	}
  16.738 +
  16.739 +	/* need to load a new alternate pt space into curpmap? */
  16.740 +	COUNT(apdp_pde_map);
  16.741 +	opde = PDE_GET(APDP_PDE);
  16.742 +	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
  16.743 +		XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n",
  16.744 +			   pmap,
  16.745 +			   (void *)vtophys((vaddr_t)APDP_PDE),
  16.746 +			   (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
  16.747 +			   (void *)pmap->pm_pdirpa,
  16.748 +			   (void *)xpmap_ptom(pmap->pm_pdirpa)));
  16.749 +		mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
  16.750 +		PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V);
  16.751 +#ifdef DEBUG
  16.752 +		curapdp = pmap->pm_pdirpa;
  16.753 +#endif
  16.754 +		if (pmap_valid_entry(opde))
  16.755 +			pmap_apte_flush(ourpmap);
  16.756 +		XENPRINTF(("APDP_PDE set done\n"));
  16.757 +	}
  16.758 +	return(APTE_BASE);
  16.759 +}
  16.760 +
  16.761 +/*
  16.762 + * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
  16.763 + */
  16.764 +
  16.765 +__inline static void
  16.766 +pmap_unmap_ptes(pmap)
  16.767 +	struct pmap *pmap;
  16.768 +{
  16.769 +#if defined(MULTIPROCESSOR)
  16.770 +	pd_entry_t *mapdp;
  16.771 +#endif
  16.772 +
  16.773 +	if (pmap == pmap_kernel()) {
  16.774 +		return;
  16.775 +	}
  16.776 +	if (pmap_is_curpmap(pmap)) {
  16.777 +		simple_unlock(&pmap->pm_obj.vmobjlock);
  16.778 +	} else {
  16.779 +		struct pmap *ourpmap = curcpu()->ci_pmap;
  16.780 +
  16.781 +#if defined(MULTIPROCESSOR)
  16.782 +		mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
  16.783 +		PDE_CLEAR(APDP_PDE, mapdp);
  16.784 +		pmap_apte_flush(ourpmap);
  16.785 +#endif
  16.786 +#ifdef DEBUG
  16.787 +		curapdp = 0;
  16.788 +#endif
  16.789 +		XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n",
  16.790 +			   (void *)vtophys((vaddr_t)APDP_PDE),
  16.791 +			   (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
  16.792 +			   (void *)pmap->pm_pdirpa,
  16.793 +			   (void *)xpmap_ptom(pmap->pm_pdirpa)));
  16.794 +		COUNT(apdp_pde_unmap);
  16.795 +		simple_unlock(&pmap->pm_obj.vmobjlock);
  16.796 +		simple_unlock(&ourpmap->pm_obj.vmobjlock);
  16.797 +	}
  16.798 +}
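         +
         +/*
         + * pmap_map_ptes/pmap_unmap_ptes bracket every PTE walk in this
         + * file, e.g. (a sketch):
         + *
         + *	pt_entry_t *ptes = pmap_map_ptes(pmap);	(locks pmap)
         + *	opte = ptes[x86_btop(va)];
         + *	... examine or update the entry ...
         + *	pmap_unmap_ptes(pmap);			(unlocks pmap)
         + */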
  16.799 +
  16.800 +__inline static void
  16.801 +pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
  16.802 +{
  16.803 +	if (curproc == NULL || curproc->p_vmspace == NULL ||
  16.804 +	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
  16.805 +		return;
  16.806 +
  16.807 +	if ((opte ^ npte) & PG_X)
  16.808 +		pmap_update_pg(va);
  16.809 +
  16.810 +	/*
  16.811 +	 * Executability was removed on the last executable change.
  16.812 +	 * Locking constraints on the vm map keep us from computing the
  16.813 +	 * exact new limit here, so reset the code segment to something
  16.814 +	 * conservative and let the trap handler set the right limit.
  16.815 +	 */
  16.816 +
  16.817 +	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
  16.818 +		struct trapframe *tf = curlwp->l_md.md_regs;
  16.819 +		struct pcb *pcb = &curlwp->l_addr->u_pcb;
  16.820 +
  16.821 +		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
  16.822 +		pm->pm_hiexec = I386_MAX_EXE_ADDR;
  16.823 +	}
  16.824 +}
  16.825 +
  16.826 +__inline static pt_entry_t
  16.827 +pte_mtop(pt_entry_t pte)
  16.828 +{
  16.829 +	pt_entry_t ppte;
  16.830 +
  16.831 +	KDASSERT(pmap_valid_entry(pte));
  16.832 +	ppte = xpmap_mtop(pte);
  16.833 +	if ((ppte & PG_FRAME) == XPMAP_OFFSET) {
  16.834 +		XENPRINTF(("pte_mtop: null page %08x -> %08x\n",
  16.835 +		    ppte, pte));
  16.836 +		ppte = pte;
  16.837 +	}
  16.838 +
  16.839 +	return ppte;
  16.840 +}
  16.841 +
  16.842 +__inline static pt_entry_t
  16.843 +pte_get_ma(pt_entry_t *pte)
  16.844 +{
  16.845 +
  16.846 +	return *pte;
  16.847 +}
  16.848 +
  16.849 +__inline static pt_entry_t
  16.850 +pte_get(pt_entry_t *pte)
  16.851 +{
  16.852 +
  16.853 +	if (pmap_valid_entry(*pte))
  16.854 +		return pte_mtop(*pte);
  16.855 +	return *pte;
  16.856 +}
  16.857 +
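         +/*
         + * pte_atomic_update_ma: queue a PTE update through the hypervisor.
         + * machine addresses beyond pmap_mem_end (e.g. device memory seen by
         + * domain-0) are not validated as page-table entries by Xen and must
         + * use the unchecked update; when the old and new entries fall on
         + * different sides of that boundary we go through an intermediate
         + * zero entry so each update is issued in the appropriate mode.
         + */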
  16.858 +__inline static pt_entry_t
  16.859 +pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
  16.860 +{
  16.861 +	pt_entry_t opte;
  16.862 +
  16.863 +	XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n",
  16.864 +		   pte, mapte, npte));
  16.865 +	opte = PTE_GET_MA(pte);
  16.866 +	if (opte > pmap_mem_end) {
  16.867 +		/* must remove opte unchecked */
  16.868 +		if (npte > pmap_mem_end)
  16.869 +			/* must set npte unchecked */
  16.870 +			xpq_queue_unchecked_pte_update(mapte, npte);
  16.871 +		else {
  16.872 +			/* must set npte checked */
  16.873 +			xpq_queue_unchecked_pte_update(mapte, 0);
  16.874 +			xpq_queue_pte_update(mapte, npte);
  16.875 +		}
  16.876 +	} else {
  16.877 +		/* must remove opte checked */
  16.878 +		if (npte > pmap_mem_end) {
  16.879 +			/* must set npte unchecked */
  16.880 +			xpq_queue_pte_update(mapte, 0);
  16.881 +			xpq_queue_unchecked_pte_update(mapte, npte);
  16.882 +		} else
  16.883 +			/* must set npte checked */
  16.884 +			xpq_queue_pte_update(mapte, npte);
  16.885 +	}
  16.886 +	xpq_flush_queue();
  16.887 +
  16.888 +	return opte;
  16.889 +}
  16.890 +
  16.891 +__inline static pt_entry_t
  16.892 +pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
  16.893 +{
  16.894 +	pt_entry_t opte;
  16.895 +
  16.896 +	opte = pte_atomic_update_ma(pte, mapte, npte);
  16.897 +
  16.898 +	return pte_mtop(opte);
  16.899 +}
  16.900 +
  16.901 +/*
  16.902 + * Fixup the code segment to cover all potential executable mappings.
  16.903 + * returns 0 if no changes to the code segment were made.
  16.904 + */
  16.905 +
  16.906 +int
  16.907 +pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
  16.908 +{
  16.909 +	struct vm_map_entry *ent;
  16.910 +	struct pmap *pm = vm_map_pmap(map);
  16.911 +	vaddr_t va = 0;
  16.912 +
  16.913 +	vm_map_lock_read(map);
  16.914 +	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
  16.915 +
  16.916 +		/*
  16.917 +		 * This entry has greater va than the entries before.
  16.918 +		 * We need to make it point to the last page, not past it.
  16.919 +		 */
  16.920 +
  16.921 +		if (ent->protection & VM_PROT_EXECUTE)
  16.922 +			va = trunc_page(ent->end) - PAGE_SIZE;
  16.923 +	}
  16.924 +	vm_map_unlock_read(map);
  16.925 +	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
  16.926 +		return (0);
  16.927 +
  16.928 +	pm->pm_hiexec = va;
  16.929 +	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
  16.930 +		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
  16.931 +	} else {
  16.932 +		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
  16.933 +		return (0);
  16.934 +	}
  16.935 +	return (1);
  16.936 +}
  16.937 +
  16.938 +/*
  16.939 + * p m a p   k e n t e r   f u n c t i o n s
  16.940 + *
  16.941 + * functions to quickly enter/remove pages from the kernel address
  16.942 + * space.   pmap_kremove is exported to MI kernel.  we make use of
  16.943 + * the recursive PTE mappings.
  16.944 + */
  16.945 +
  16.946 +/*
  16.947 + * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
  16.948 + *
  16.949 + * => no need to lock anything, assume va is already allocated
  16.950 + * => should be faster than normal pmap enter function
  16.951 + */
  16.952 +
  16.953 +void
  16.954 +pmap_kenter_pa(va, pa, prot)
  16.955 +	vaddr_t va;
  16.956 +	paddr_t pa;
  16.957 +	vm_prot_t prot;
  16.958 +{
  16.959 +	pt_entry_t *pte, opte, npte;
  16.960 +	pt_entry_t *maptp;
  16.961 +
  16.962 +	if (va < VM_MIN_KERNEL_ADDRESS)
  16.963 +		pte = vtopte(va);
  16.964 +	else
  16.965 +		pte = kvtopte(va);
  16.966 +
  16.967 +	npte = ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
  16.968 +	     PG_V | pmap_pg_g;
  16.969 +
  16.970 +	if (pa >= pmap_pa_start && pa < pmap_pa_end) {
  16.971 +		npte |= xpmap_ptom(pa);
  16.972 +	} else {
  16.973 +		XENPRINTF(("pmap_kenter_pa: pa %08lx (va %08lx) outside pa range\n",
  16.974 +			      pa, va));
  16.975 +		npte |= pa;
  16.976 +	}
  16.977 +
  16.978 +	maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
  16.979 +	opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
  16.980 +	XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va, 
  16.981 +		      (void *)pa, pte, opte, npte));
  16.982 +#ifdef LARGEPAGES
  16.983 +	/* XXX For now... */
  16.984 +	if (opte & PG_PS)
  16.985 +		panic("pmap_kenter_pa: PG_PS");
  16.986 +#endif
  16.987 +	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
  16.988 +#if defined(MULTIPROCESSOR)
  16.989 +		int32_t cpumask = 0;
  16.990 +
  16.991 +		pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
  16.992 +		pmap_tlb_shootnow(cpumask);
  16.993 +#else
  16.994 +		/* Don't bother deferring in the single CPU case. */
  16.995 +		pmap_update_pg(va);
  16.996 +#endif
  16.997 +	}
  16.998 +}
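         +
         +/*
         + * sketch of a typical pmap_kenter_pa() caller (illustrative only):
         + * wire one page at a pre-allocated kernel VA, use it, then tear the
         + * mapping down again.
         + *
         + *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
         + *	pmap_update(pmap_kernel());
         + *	... use (void *)va ...
         + *	pmap_kremove(va, PAGE_SIZE);
         + *	pmap_update(pmap_kernel());
         + */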
  16.999 +
 16.1000 +/*
 16.1001 + * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
 16.1002 + *
 16.1003 + * => no need to lock anything, assume va is already allocated
 16.1004 + * => should be faster than normal pmap enter function
 16.1005 + */
 16.1006 +
 16.1007 +void		 pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t);
 16.1008 +
 16.1009 +void
 16.1010 +pmap_kenter_ma(va, ma, prot)
 16.1011 +	vaddr_t va;
 16.1012 +	paddr_t ma;
 16.1013 +	vm_prot_t prot;
 16.1014 +{
 16.1015 +	pt_entry_t *pte, opte, npte;
 16.1016 +	pt_entry_t *maptp;
 16.1017 +
 16.1018 +	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
 16.1019 +	pte = kvtopte(va);
 16.1020 +
 16.1021 +	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
 16.1022 +	     PG_V | pmap_pg_g;
 16.1023 +
 16.1024 +	maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
 16.1025 +	opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
 16.1026 +	XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va,
 16.1027 +		      (void *)ma, pte, opte));
 16.1028 +#ifdef LARGEPAGES
 16.1029 +	/* XXX For now... */
 16.1030 +	if (opte & PG_PS)
 16.1031 +		panic("pmap_kenter_ma: PG_PS");
 16.1032 +#endif
 16.1033 +	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
 16.1034 +#if defined(MULTIPROCESSOR)
 16.1035 +		int32_t cpumask = 0;
 16.1036 +
 16.1037 +		pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
 16.1038 +		pmap_tlb_shootnow(cpumask);
 16.1039 +#else
 16.1040 +		/* Don't bother deferring in the single CPU case. */
 16.1041 +		pmap_update_pg(va);
 16.1042 +#endif
 16.1043 +	}
 16.1044 +}
 16.1045 +
 16.1046 +/*
 16.1047 + * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
 16.1048 + *
 16.1049 + * => no need to lock anything
 16.1050 + * => caller must dispose of any vm_page mapped in the va range
 16.1051 + * => note: not an inline function
 16.1052 + * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
 16.1053 + * => we assume kernel only unmaps valid addresses and thus don't bother
 16.1054 + *    checking the valid bit before doing TLB flushing
 16.1055 + */
 16.1056 +
 16.1057 +void
 16.1058 +pmap_kremove(va, len)
 16.1059 +	vaddr_t va;
 16.1060 +	vsize_t len;
 16.1061 +{
 16.1062 +	pt_entry_t *pte, opte;
 16.1063 +	pt_entry_t *maptp;
 16.1064 +	int32_t cpumask = 0;
 16.1065 +
 16.1066 +	XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len));
 16.1067 +	len >>= PAGE_SHIFT;
 16.1068 +	for ( /* null */ ; len ; len--, va += PAGE_SIZE) {
 16.1069 +		if (va < VM_MIN_KERNEL_ADDRESS)
 16.1070 +			pte = vtopte(va);
 16.1071 +		else
 16.1072 +			pte = kvtopte(va);
 16.1073 +		maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
 16.1074 +		opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */
 16.1075 +		XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte));
 16.1076 +#ifdef LARGEPAGES
 16.1077 +		/* XXX For now... */
 16.1078 +		if (opte & PG_PS)
 16.1079 +			panic("pmap_kremove: PG_PS");
 16.1080 +#endif
 16.1081 +#ifdef DIAGNOSTIC
 16.1082 +		if (opte & PG_PVLIST)
 16.1083 +			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
 16.1084 +			      va);
 16.1085 +#endif
 16.1086 +		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
 16.1087 +			pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
 16.1088 +	}
 16.1089 +	pmap_tlb_shootnow(cpumask);
 16.1090 +}
 16.1091 +
 16.1092 +/*
 16.1093 + * p m a p   i n i t   f u n c t i o n s
 16.1094 + *
 16.1095 + * pmap_bootstrap and pmap_init are called during system startup
 16.1096 + * to init the pmap module.   pmap_bootstrap() does a low level
 16.1097 + * init just to get things rolling.   pmap_init() finishes the job.
 16.1098 + */
 16.1099 +
 16.1100 +/*
 16.1101 + * pmap_bootstrap: get the system in a state where it can run with VM
 16.1102 + *	properly enabled (called before main()).   the VM system is
 16.1103 + *      fully init'd later...
 16.1104 + *
 16.1105 + * => on i386, locore.s has already enabled the MMU by allocating
 16.1106 + *	a PDP for the kernel, and nkpde PTP's for the kernel.
 16.1107 + * => kva_start is the first free virtual address in kernel space
 16.1108 + */
 16.1109 +
 16.1110 +void
 16.1111 +pmap_bootstrap(kva_start)
 16.1112 +	vaddr_t kva_start;
 16.1113 +{
 16.1114 +	struct pmap *kpm;
 16.1115 +	vaddr_t kva;
 16.1116 +	pt_entry_t *pte;
 16.1117 +	pt_entry_t *maptp;
 16.1118 +	int i;
 16.1119 +
 16.1120 +	/*
 16.1121 +	 * set up our local static global vars that keep track of the
 16.1122 +	 * usage of KVM before kernel_map is set up
 16.1123 +	 */
 16.1124 +
 16.1125 +	virtual_avail = kva_start;		/* first free KVA */
 16.1126 +	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
 16.1127 +
 16.1128 +	/*
 16.1129 +	 * find out where physical memory ends on the real hardware.
 16.1130 +	 */
 16.1131 +
 16.1132 +	if (xen_start_info.flags & SIF_PRIVILEGED)
 16.1133 +		pmap_mem_end = find_pmap_mem_end(kva_start);
 16.1134 +
 16.1135 +	/*
 16.1136 +	 * set up protection_codes: we need to be able to convert from
 16.1137 +	 * a MI protection code (some combo of VM_PROT...) to something
 16.1138 +	 * we can jam into a i386 PTE.
 16.1139 +	 */
 16.1140 +
 16.1141 +	protection_codes[VM_PROT_NONE] = 0;  			/* --- */
 16.1142 +	protection_codes[VM_PROT_EXECUTE] = PG_X;		/* --x */
 16.1143 +	protection_codes[VM_PROT_READ] = PG_RO;			/* -r- */
 16.1144 +	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */
 16.1145 +	protection_codes[VM_PROT_WRITE] = PG_RW;		/* w-- */
 16.1146 +	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */
 16.1147 +	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW;	/* wr- */
 16.1148 +	protection_codes[VM_PROT_ALL] = PG_RW|PG_X;		/* wrx */
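         +
         +	/*
         +	 * e.g. protection_codes[VM_PROT_READ | VM_PROT_WRITE] now
         +	 * yields PG_RW; PG_X is a software-only bit here, since the
         +	 * i386 MMU has no hardware execute permission.
         +	 */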
 16.1149 +
 16.1150 +	/*
 16.1151 +	 * now we init the kernel's pmap
 16.1152 +	 *
 16.1153 +	 * the kernel pmap's pm_obj is not used for much.   however, in
 16.1154 +	 * user pmaps the pm_obj contains the list of active PTPs.
 16.1155 +	 * the pm_obj currently does not have a pager.   it might be possible
 16.1156 +	 * to add a pager that would allow a process to read-only mmap its
 16.1157 +	 * own page tables (fast user level vtophys?).   this may or may not
 16.1158 +	 * be useful.
 16.1159 +	 */
 16.1160 +
 16.1161 +	kpm = pmap_kernel();
 16.1162 +	simple_lock_init(&kpm->pm_obj.vmobjlock);
 16.1163 +	kpm->pm_obj.pgops = NULL;
 16.1164 +	TAILQ_INIT(&kpm->pm_obj.memq);
 16.1165 +	kpm->pm_obj.uo_npages = 0;
 16.1166 +	kpm->pm_obj.uo_refs = 1;
 16.1167 +	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
 16.1168 +	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
 16.1169 +	XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n",
 16.1170 +	    (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr));
 16.1171 +	kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3;
 16.1172 +	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
 16.1173 +		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
 16.1174 +
 16.1175 +	/*
 16.1176 +	 * the above is just a rough estimate and not critical to the proper
 16.1177 +	 * operation of the system.
 16.1178 +	 */
 16.1179 +
 16.1180 +	/*
 16.1181 +	 * Begin to enable global TLB entries if they are supported.
 16.1182 +	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
 16.1183 +	 * which happens in cpu_init(), which is run on each cpu
 16.1184 +	 * (and happens later)
 16.1185 +	 */
 16.1186 +
 16.1187 +	if (cpu_feature & CPUID_PGE) {
 16.1188 +		pmap_pg_g = PG_G;		/* enable software */
 16.1189 +
 16.1190 +		/* add PG_G attribute to already mapped kernel pages */
 16.1191 +		for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
 16.1192 +		     kva += PAGE_SIZE)
 16.1193 +			if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) {
 16.1194 +#if !defined(XEN)
 16.1195 +				PTE_BASE[x86_btop(kva)] |= PG_G;
 16.1196 +#else
 16.1197 +				maptp = (pt_entry_t *)vtomach(
 16.1198 +					(vaddr_t)&PTE_BASE[x86_btop(kva)]);
 16.1199 +				PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp,
 16.1200 +				    PG_G);
 16.1201 +#endif
 16.1202 +			}
 16.1203 +		PTE_UPDATES_FLUSH();	/* push queued PTE updates */
 16.1204 +	}
 16.1205 +
 16.1206 +#ifdef LARGEPAGES
 16.1207 +	/*
 16.1208 +	 * enable large pages if they are supported.
 16.1209 +	 */
 16.1210 +
 16.1211 +	if (cpu_feature & CPUID_PSE) {
 16.1212 +		paddr_t pa;
 16.1213 +		vaddr_t kva_end;
 16.1214 +		pd_entry_t *pde;
 16.1215 +		pd_entry_t *mapdp;
 16.1216 +		extern char _etext;
 16.1217 +
 16.1218 +		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
 16.1219 +		pmap_largepages = 1;	/* enable software */
 16.1220 +
 16.1221 +		/*
 16.1222 +		 * the TLB must be flushed after enabling large pages
 16.1223 +		 * on Pentium CPUs, according to section 3.6.2.2 of
 16.1224 +		 * "Intel Architecture Software Developer's Manual,
 16.1225 +		 * Volume 3: System Programming".
 16.1226 +		 */
 16.1227 +		tlbflush();
 16.1228 +
 16.1229 +		/*
 16.1230 +		 * now, remap the kernel text using large pages.  we
 16.1231 +		 * assume that the linker has properly aligned the
 16.1232 +		 * .data segment to a 4MB boundary.
 16.1233 +		 */
 16.1234 +		kva_end = roundup((vaddr_t)&_etext, NBPD);
 16.1235 +		for (pa = 0, kva = KERNBASE; kva < kva_end;
 16.1236 +		     kva += NBPD, pa += NBPD) {
 16.1237 +			pde = &kpm->pm_pdir[pdei(kva)];
 16.1238 +			mapdp = (pt_entry_t *)vtomach((vaddr_t)pde);
 16.1239 +			PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS |
 16.1240 +			    PG_KR | PG_V); /* zap! */
 16.1241 +			tlbflush();
 16.1242 +		}
 16.1243 +	}
 16.1244 +#endif /* LARGEPAGES */
 16.1245 +
 16.1246 +	/*
 16.1247 +	 * now we allocate the "special" VAs which are used for tmp mappings
 16.1248 +	 * by the pmap (and other modules).    we allocate the VAs by advancing
 16.1249 +	 * virtual_avail (note that there are no pages mapped at these VAs).
 16.1250 +	 * we find the PTE that maps the allocated VA via the linear PTE
 16.1251 +	 * mapping.
 16.1252 +	 */
 16.1253 +
 16.1254 +	pte = PTE_BASE + x86_btop(virtual_avail);
 16.1255 +
 16.1256 +#ifdef MULTIPROCESSOR
 16.1257 +	/*
 16.1258 +	 * Waste some VA space to avoid false sharing of cache lines
 16.1259 +	 * for page table pages: Give each possible CPU a cache line
 16.1260 +	 * of PTE's (8) to play with, though we only need 4.  We could
 16.1261 +	 * recycle some of this waste by putting the idle stacks here
 16.1262 +	 * as well; we could waste less space if we knew the largest
 16.1263 +	 * CPU ID beforehand.
 16.1264 +	 */
 16.1265 +	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;
 16.1266 +
 16.1267 +	cdstp = (caddr_t) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
 16.1268 +
 16.1269 +	zerop = (caddr_t) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
 16.1270 +
 16.1271 +	ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
 16.1272 +
 16.1273 +	virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
 16.1274 +	pte += X86_MAXPROCS * NPTECL;
 16.1275 +#else
 16.1276 +	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;  /* allocate */
 16.1277 +	virtual_avail += PAGE_SIZE; pte++;			     /* advance */
 16.1278 +
 16.1279 +	cdstp = (caddr_t) virtual_avail;  cdst_pte = pte;
 16.1280 +	virtual_avail += PAGE_SIZE; pte++;
 16.1281 +
 16.1282 +	zerop = (caddr_t) virtual_avail;  zero_pte = pte;
 16.1283 +	virtual_avail += PAGE_SIZE; pte++;
 16.1284 +
 16.1285 +	ptpp = (caddr_t) virtual_avail;  ptp_pte = pte;
 16.1286 +	virtual_avail += PAGE_SIZE; pte++;
 16.1287 +#endif
 16.1288 +
 16.1289 +	XENPRINTK(("pmap_bootstrap csrc_pte %p cdst_pte %p zero_pte %p ptp_pte %p\n",
 16.1290 +		      csrc_pte, cdst_pte, zero_pte, ptp_pte));
 16.1291 +	/*
 16.1292 +	 * Nothing after this point actually needs pte.
 16.1293 +	 */
 16.1294 +	pte = (void *)0xdeadbeef;
 16.1295 +
 16.1296 +	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
 16.1297 +	vmmap = (char *)virtual_avail;			/* don't need pte */
 16.1298 +	virtual_avail += PAGE_SIZE;
 16.1299 +
 16.1300 +	msgbuf_vaddr = virtual_avail;			/* don't need pte */
 16.1301 +	virtual_avail += round_page(MSGBUFSIZE);
 16.1302 +
 16.1303 +	idt_vaddr = virtual_avail;			/* don't need pte */
 16.1304 +	virtual_avail += PAGE_SIZE;
 16.1305 +	idt_paddr = avail_start;			/* steal a page */
 16.1306 +	avail_start += PAGE_SIZE;
 16.1307 +
 16.1308 +#if defined(I586_CPU)
 16.1309 +	/* pentium f00f bug stuff */
 16.1310 +	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
 16.1311 +	virtual_avail += PAGE_SIZE;
 16.1312 +#endif
 16.1313 +
 16.1314 +	/*
 16.1315 +	 * now we reserve some VM for mapping pages when doing a crash dump
 16.1316 +	 */
 16.1317 +
 16.1318 +	virtual_avail = reserve_dumppages(virtual_avail);
 16.1319 +
 16.1320 +	/*
 16.1321 +	 * init the static-global locks and global lists.
 16.1322 +	 */
 16.1323 +
 16.1324 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
 16.1325 +	spinlockinit(&pmap_main_lock, "pmaplk", 0);
 16.1326 +#endif
 16.1327 +	simple_lock_init(&pvalloc_lock);
 16.1328 +	simple_lock_init(&pmaps_lock);
 16.1329 +	LIST_INIT(&pmaps);
 16.1330 +	TAILQ_INIT(&pv_freepages);
 16.1331 +	TAILQ_INIT(&pv_unusedpgs);
 16.1332 +
 16.1333 +	/*
 16.1334 +	 * initialize the pmap pool.
 16.1335 +	 */
 16.1336 +
 16.1337 +	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
 16.1338 +	    &pool_allocator_nointr);
 16.1339 +
 16.1340 +	/*
 16.1341 +	 * Initialize the TLB shootdown queues.
 16.1342 +	 */
 16.1343 +
 16.1344 +	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
 16.1345 +
 16.1346 +	for (i = 0; i < X86_MAXPROCS; i++) {
 16.1347 +		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
 16.1348 +		__cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock);
 16.1349 +	}
 16.1350 +
 16.1351 +	/*
 16.1352 +	 * initialize the PDE pool and cache.
 16.1353 +	 */
 16.1354 +	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
 16.1355 +		  &pool_allocator_nointr);
 16.1356 +	pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool,
 16.1357 +			pmap_pdp_ctor, pmap_pdp_dtor, NULL);
 16.1358 +
 16.1359 +	/*
 16.1360 +	 * ensure the TLB is sync'd with reality by flushing it...
 16.1361 +	 */
 16.1362 +
 16.1363 +	tlbflush();
 16.1364 +}
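
A note on the protection_codes[] table built above: indexing a small array
by the MI VM_PROT_* bit combination yields the i386 PTE bits in one step,
instead of a chain of conditionals on every mapping.  A minimal standalone
sketch of the technique; the names are illustrative, and the execute cases
collapse because non-PAE i386 has no hardware execute bit:

    #include <stdio.h>

    /* stand-ins for the MI VM_PROT_* bits */
    #define PROT_READ   0x1
    #define PROT_WRITE  0x2
    #define PROT_EXEC   0x4

    /* stand-ins for the MD PTE permission bits */
    #define PTE_RO  0x000
    #define PTE_RW  0x002

    static unsigned prot_codes[8];      /* one slot per bit combination */

    static void
    init_prot_codes(void)
    {
            prot_codes[0] = 0;                                    /* --- */
            prot_codes[PROT_EXEC] = PTE_RO;                       /* --x */
            prot_codes[PROT_READ] = PTE_RO;                       /* -r- */
            prot_codes[PROT_READ|PROT_EXEC] = PTE_RO;             /* -rx */
            prot_codes[PROT_WRITE] = PTE_RW;                      /* w-- */
            prot_codes[PROT_WRITE|PROT_EXEC] = PTE_RW;            /* w-x */
            prot_codes[PROT_WRITE|PROT_READ] = PTE_RW;            /* wr- */
            prot_codes[PROT_WRITE|PROT_READ|PROT_EXEC] = PTE_RW;  /* wrx */
    }

    int
    main(void)
    {
            init_prot_codes();
            /* one array index replaces a chain of tests per mapping */
            printf("wr- -> %#x\n", prot_codes[PROT_WRITE|PROT_READ]);
            return 0;
    }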
 16.1365 +
 16.1366 +/*
 16.1367 + * pmap_init: called from uvm_init, our job is to get the pmap
 16.1368 + * system ready to manage mappings... this mainly means initing
 16.1369 + * the pv_entry stuff.
 16.1370 + */
 16.1371 +
 16.1372 +void
 16.1373 +pmap_init()
 16.1374 +{
 16.1375 +	int i;
 16.1376 +
 16.1377 +	/*
 16.1378 +	 * now we need to free enough pv_entry structures to allow us to get
 16.1379 +	 * the kmem_map/kmem_object allocated and inited (done after this
 16.1380 +	 * function is finished).  to do this we allocate one bootstrap page out
 16.1381 +	 * of kernel_map and use it to provide an initial pool of pv_entry
 16.1382 +	 * structures.   we never free this page.
 16.1383 +	 */
 16.1384 +
 16.1385 +	pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE);
 16.1386 +	if (pv_initpage == NULL)
 16.1387 +		panic("pmap_init: pv_initpage");
 16.1388 +	pv_cachedva = 0;   /* a VA we have allocated but not used yet */
 16.1389 +	pv_nfpvents = 0;
 16.1390 +	(void) pmap_add_pvpage(pv_initpage, FALSE);
 16.1391 +
 16.1392 +	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE);
 16.1393 +	if (pj_page == NULL)
 16.1394 +		panic("pmap_init: pj_page");
 16.1395 +
 16.1396 +	for (i = 0;
 16.1397 +	     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
 16.1398 +	     i++)
 16.1399 +		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
 16.1400 +	pj_page[i].pja_job.pj_nextfree = NULL;
 16.1401 +	pj_free = &pj_page[0];
 16.1402 +
 16.1403 +	/*
 16.1404 +	 * done: pmap module is up (and ready for business)
 16.1405 +	 */
 16.1406 +
 16.1407 +	pmap_initialized = TRUE;
 16.1408 +}
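
The pj_page loop above carves one page into TLB-shootdown jobs and threads
them onto a singly linked free list, so interrupt-time code can pop a job
without touching an allocator.  A self-contained sketch of the carving
technique; the struct and constants are illustrative:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SZ 4096

    /* hypothetical job record, standing in for the shootdown job union */
    struct job {
            struct job *next_free;
            int payload;
    };

    static struct job *free_head;

    /* thread every job that fits in one page onto the free list */
    static void
    init_job_page(void *page)
    {
            struct job *jobs = page;
            size_t i, n = PAGE_SZ / sizeof(struct job);

            for (i = 0; i < n - 1; i++)
                    jobs[i].next_free = &jobs[i + 1];
            jobs[n - 1].next_free = NULL;
            free_head = &jobs[0];
    }

    int
    main(void)
    {
            void *page = malloc(PAGE_SZ);
            if (page == NULL)
                    return 1;
            init_job_page(page);
            /* pop one job: O(1), no allocator call at interrupt time */
            struct job *j = free_head;
            free_head = j->next_free;
            printf("got job %p\n", (void *)j);
            free(page);
            return 0;
    }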
 16.1409 +
 16.1410 +/*
 16.1411 + * p v _ e n t r y   f u n c t i o n s
 16.1412 + */
 16.1413 +
 16.1414 +/*
 16.1415 + * pv_entry allocation functions:
 16.1416 + *   the main pv_entry allocation functions are:
 16.1417 + *     pmap_alloc_pv: allocate a pv_entry structure
 16.1418 + *     pmap_free_pv: free one pv_entry
 16.1419 + *     pmap_free_pvs: free a list of pv_entrys
 16.1420 + *
 16.1421 + * the rest are helper functions
 16.1422 + */
 16.1423 +
 16.1424 +/*
 16.1425 + * pmap_alloc_pv: inline function to allocate a pv_entry structure
 16.1426 + * => we lock pvalloc_lock
 16.1427 + * => if we fail, we call out to pmap_alloc_pvpage
 16.1428 + * => 3 modes:
 16.1429 + *    ALLOCPV_NEED   = we really need a pv_entry, even if we have to steal it
 16.1430 + *    ALLOCPV_TRY    = we want a pv_entry, but not enough to steal
 16.1431 + *    ALLOCPV_NONEED = we are trying to grow our free list, don't really need
 16.1432 + *			one now
 16.1433 + *
 16.1434 + * "try" is for optional functions like pmap_copy().
 16.1435 + */
 16.1436 +
 16.1437 +__inline static struct pv_entry *
 16.1438 +pmap_alloc_pv(pmap, mode)
 16.1439 +	struct pmap *pmap;
 16.1440 +	int mode;
 16.1441 +{
 16.1442 +	struct pv_page *pvpage;
 16.1443 +	struct pv_entry *pv;
 16.1444 +
 16.1445 +	simple_lock(&pvalloc_lock);
 16.1446 +
 16.1447 +	pvpage = TAILQ_FIRST(&pv_freepages);
 16.1448 +	if (pvpage != NULL) {
 16.1449 +		pvpage->pvinfo.pvpi_nfree--;
 16.1450 +		if (pvpage->pvinfo.pvpi_nfree == 0) {
 16.1451 +			/* nothing left in this one? */
 16.1452 +			TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list);
 16.1453 +		}
 16.1454 +		pv = pvpage->pvinfo.pvpi_pvfree;
 16.1455 +		KASSERT(pv);
 16.1456 +		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
 16.1457 +		pv_nfpvents--;  /* took one from pool */
 16.1458 +	} else {
 16.1459 +		pv = NULL;		/* need more of them */
 16.1460 +	}
 16.1461 +
 16.1462 +	/*
 16.1463 +	 * if below low water mark or we didn't get a pv_entry we try and
 16.1464 +	 * create more pv_entrys ...
 16.1465 +	 */
 16.1466 +
 16.1467 +	if (pv_nfpvents < PVE_LOWAT || pv == NULL) {
 16.1468 +		if (pv == NULL)
 16.1469 +			pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ?
 16.1470 +					       mode : ALLOCPV_NEED);
 16.1471 +		else
 16.1472 +			(void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED);
 16.1473 +	}
 16.1474 +	simple_unlock(&pvalloc_lock);
 16.1475 +	return(pv);
 16.1476 +}
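
pmap_alloc_pv() combines the allocation with a low-watermark check: whenever
pv_nfpvents drops under PVE_LOWAT it tops the pool up in the same call, so
later callers still find entries waiting.  A toy sketch of that
replenish-on-the-way-out pattern, with made-up names and numbers:

    #include <stdio.h>

    #define LOWAT   8       /* replenish threshold, like PVE_LOWAT */

    static int nfree = 10;  /* entries currently on the free list */

    static int
    pool_take(void)
    {
            int got = 0;

            if (nfree > 0) {
                    nfree--;
                    got = 1;
            }
            /* grow the pool on the way out once we dip below the
             * watermark, before it is actually empty */
            if (nfree < LOWAT)
                    nfree += 4;     /* stand-in for pmap_alloc_pvpage() */
            return got;
    }

    int
    main(void)
    {
            int i;

            for (i = 0; i < 5; i++)
                    printf("take=%d nfree=%d\n", pool_take(), nfree);
            return 0;
    }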
 16.1477 +
 16.1478 +/*
 16.1479 + * pmap_alloc_pvpage: maybe allocate a new pvpage
 16.1480 + *
 16.1481 + * if need_entry is false: try and allocate a new pv_page
 16.1482 + * if need_entry is true: try and allocate a new pv_page and return a
 16.1483 + *	new pv_entry from it.   if we are unable to allocate a pv_page
 16.1484 + *	we make a last ditch effort to steal a pv_page from some other
 16.1485 + *	mapping.    if that fails, we panic...
 16.1486 + *
 16.1487 + * => we assume that the caller holds pvalloc_lock
 16.1488 + */
 16.1489 +
 16.1490 +static struct pv_entry *
 16.1491 +pmap_alloc_pvpage(pmap, mode)
 16.1492 +	struct pmap *pmap;
 16.1493 +	int mode;
 16.1494 +{
 16.1495 +	struct vm_page *pg;
 16.1496 +	struct pv_page *pvpage;
 16.1497 +	struct pv_entry *pv;
 16.1498 +	int s;
 16.1499 +
 16.1500 +	/*
 16.1501 +	 * if we need_entry and we've got unused pv_pages, allocate from there
 16.1502 +	 */
 16.1503 +
 16.1504 +	pvpage = TAILQ_FIRST(&pv_unusedpgs);
 16.1505 +	if (mode != ALLOCPV_NONEED && pvpage != NULL) {
 16.1506 +
 16.1507 +		/* move it to pv_freepages list */
 16.1508 +		TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list);
 16.1509 +		TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list);
 16.1510 +
 16.1511 +		/* allocate a pv_entry */
 16.1512 +		pvpage->pvinfo.pvpi_nfree--;	/* can't go to zero */
 16.1513 +		pv = pvpage->pvinfo.pvpi_pvfree;
 16.1514 +		KASSERT(pv);
 16.1515 +		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
 16.1516 +		pv_nfpvents--;  /* took one from pool */
 16.1517 +		return(pv);
 16.1518 +	}
 16.1519 +
 16.1520 +	/*
 16.1521 +	 *  see if we've got a cached unmapped VA that we can map a page in.
 16.1522 +	 * if not, try to allocate one.
 16.1523 +	 */
 16.1524 +
 16.1525 +	if (pv_cachedva == 0) {
 16.1526 +		s = splvm();   /* must protect kmem_map with splvm! */
 16.1527 +		pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE,
 16.1528 +		    UVM_KMF_TRYLOCK|UVM_KMF_VALLOC);
 16.1529 +		splx(s);
 16.1530 +		if (pv_cachedva == 0) {
 16.1531 +			return (NULL);
 16.1532 +		}
 16.1533 +	}
 16.1534 +
 16.1535 +	pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL,
 16.1536 +	    UVM_PGA_USERESERVE);
 16.1537 +	if (pg == NULL)
 16.1538 +		return (NULL);
 16.1539 +	pg->flags &= ~PG_BUSY;	/* never busy */
 16.1540 +
 16.1541 +	/*
 16.1542 +	 * add a mapping for our new pv_page and free its entrys (save one!)
 16.1543 +	 *
 16.1544 +	 * NOTE: If we are allocating a PV page for the kernel pmap, the
 16.1545 +	 * pmap is already locked!  (...but entering the mapping is safe...)
 16.1546 +	 */
 16.1547 +
 16.1548 +	pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg),
 16.1549 +	    VM_PROT_READ | VM_PROT_WRITE);
 16.1550 +	pmap_update(pmap_kernel());
 16.1551 +	pvpage = (struct pv_page *) pv_cachedva;
 16.1552 +	pv_cachedva = 0;
 16.1553 +	return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED));
 16.1554 +}
 16.1555 +
 16.1556 +/*
 16.1557 + * pmap_add_pvpage: add a pv_page's pv_entrys to the free list
 16.1558 + *
 16.1559 + * => caller must hold pvalloc_lock
 16.1560 + * => if need_entry is true, we allocate and return one pv_entry
 16.1561 + */
 16.1562 +
 16.1563 +static struct pv_entry *
 16.1564 +pmap_add_pvpage(pvp, need_entry)
 16.1565 +	struct pv_page *pvp;
 16.1566 +	boolean_t need_entry;
 16.1567 +{
 16.1568 +	int tofree, lcv;
 16.1569 +
 16.1570 +	/* do we need to return one? */
 16.1571 +	tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE;
 16.1572 +
 16.1573 +	pvp->pvinfo.pvpi_pvfree = NULL;
 16.1574 +	pvp->pvinfo.pvpi_nfree = tofree;
 16.1575 +	for (lcv = 0 ; lcv < tofree ; lcv++) {
 16.1576 +		SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
 16.1577 +			pvp->pvinfo.pvpi_pvfree;
 16.1578 +		pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
 16.1579 +	}
 16.1580 +	if (need_entry)
 16.1581 +		TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list);
 16.1582 +	else
 16.1583 +		TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
 16.1584 +	pv_nfpvents += tofree;
 16.1585 +	return((need_entry) ? &pvp->pvents[lcv] : NULL);
 16.1586 +}
 16.1587 +
 16.1588 +/*
 16.1589 + * pmap_free_pv_doit: actually free a pv_entry
 16.1590 + *
 16.1591 + * => do not call this directly!  instead use either
 16.1592 + *    1. pmap_free_pv ==> free a single pv_entry
 16.1593 + *    2. pmap_free_pvs => free a list of pv_entrys
 16.1594 + * => we must be holding pvalloc_lock
 16.1595 + */
 16.1596 +
 16.1597 +__inline static void
 16.1598 +pmap_free_pv_doit(pv)
 16.1599 +	struct pv_entry *pv;
 16.1600 +{
 16.1601 +	struct pv_page *pvp;
 16.1602 +
 16.1603 +	pvp = (struct pv_page *) x86_trunc_page(pv);
 16.1604 +	pv_nfpvents++;
 16.1605 +	pvp->pvinfo.pvpi_nfree++;
 16.1606 +
 16.1607 +	/* nfree == 1 => fully allocated page just became partly allocated */
 16.1608 +	if (pvp->pvinfo.pvpi_nfree == 1) {
 16.1609 +		TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list);
 16.1610 +	}
 16.1611 +
 16.1612 +	/* free it */
 16.1613 +	SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
 16.1614 +	pvp->pvinfo.pvpi_pvfree = pv;
 16.1615 +
 16.1616 +	/*
 16.1617 +	 * are all pv_page's pv_entry's free?  move it to unused queue.
 16.1618 +	 */
 16.1619 +
 16.1620 +	if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) {
 16.1621 +		TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list);
 16.1622 +		TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
 16.1623 +	}
 16.1624 +}
 16.1625 +
 16.1626 +/*
 16.1627 + * pmap_free_pv: free a single pv_entry
 16.1628 + *
 16.1629 + * => we gain the pvalloc_lock
 16.1630 + */
 16.1631 +
 16.1632 +__inline static void
 16.1633 +pmap_free_pv(pmap, pv)
 16.1634 +	struct pmap *pmap;
 16.1635 +	struct pv_entry *pv;
 16.1636 +{
 16.1637 +	simple_lock(&pvalloc_lock);
 16.1638 +	pmap_free_pv_doit(pv);
 16.1639 +
 16.1640 +	/*
 16.1641 +	 * Can't free the PV page if the PV entries were associated with
 16.1642 +	 * the kernel pmap; the pmap is already locked.
 16.1643 +	 */
 16.1644 +	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
 16.1645 +	    pmap != pmap_kernel())
 16.1646 +		pmap_free_pvpage();
 16.1647 +
 16.1648 +	simple_unlock(&pvalloc_lock);
 16.1649 +}
 16.1650 +
 16.1651 +/*
 16.1652 + * pmap_free_pvs: free a list of pv_entrys
 16.1653 + *
 16.1654 + * => we gain the pvalloc_lock
 16.1655 + */
 16.1656 +
 16.1657 +__inline static void
 16.1658 +pmap_free_pvs(pmap, pvs)
 16.1659 +	struct pmap *pmap;
 16.1660 +	struct pv_entry *pvs;
 16.1661 +{
 16.1662 +	struct pv_entry *nextpv;
 16.1663 +
 16.1664 +	simple_lock(&pvalloc_lock);
 16.1665 +
 16.1666 +	for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
 16.1667 +		nextpv = SPLAY_RIGHT(pvs, pv_node);
 16.1668 +		pmap_free_pv_doit(pvs);
 16.1669 +	}
 16.1670 +
 16.1671 +	/*
 16.1672 +	 * Can't free the PV page if the PV entries were associated with
 16.1673 +	 * the kernel pmap; the pmap is already locked.
 16.1674 +	 */
 16.1675 +	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
 16.1676 +	    pmap != pmap_kernel())
 16.1677 +		pmap_free_pvpage();
 16.1678 +
 16.1679 +	simple_unlock(&pvalloc_lock);
 16.1680 +}
 16.1681 +
 16.1682 +
 16.1683 +/*
 16.1684 + * pmap_free_pvpage: try and free an unused pv_page structure
 16.1685 + *
 16.1686 + * => assume caller is holding the pvalloc_lock and that
 16.1687 + *	there is a page on the pv_unusedpgs list
 16.1688 + * => if we can't get a lock on the kmem_map we try again later
 16.1689 + */
 16.1690 +
 16.1691 +static void
 16.1692 +pmap_free_pvpage()
 16.1693 +{
 16.1694 +	int s;
 16.1695 +	struct vm_map *map;
 16.1696 +	struct vm_map_entry *dead_entries;
 16.1697 +	struct pv_page *pvp;
 16.1698 +
 16.1699 +	s = splvm(); /* protect kmem_map */
 16.1700 +
 16.1701 +	pvp = TAILQ_FIRST(&pv_unusedpgs);
 16.1702 +
 16.1703 +	/*
 16.1704 +	 * note: watch out for pv_initpage which is allocated out of
 16.1705 +	 * kernel_map rather than kmem_map.
 16.1706 +	 */
 16.1707 +
 16.1708 +	if (pvp == pv_initpage)
 16.1709 +		map = kernel_map;
 16.1710 +	else
 16.1711 +		map = kmem_map;
 16.1712 +	if (vm_map_lock_try(map)) {
 16.1713 +
 16.1714 +		/* remove pvp from pv_unusedpgs */
 16.1715 +		TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
 16.1716 +
 16.1717 +		/* unmap the page */
 16.1718 +		dead_entries = NULL;
 16.1719 +		uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE,
 16.1720 +		    &dead_entries);
 16.1721 +		vm_map_unlock(map);
 16.1722 +
 16.1723 +		if (dead_entries != NULL)
 16.1724 +			uvm_unmap_detach(dead_entries, 0);
 16.1725 +
 16.1726 +		pv_nfpvents -= PVE_PER_PVPAGE;  /* update free count */
 16.1727 +		if (pvp == pv_initpage)
 16.1728 +			/* no more initpage, we've freed it */
 16.1729 +			pv_initpage = NULL;
 16.1730 +	}
 16.1731 +
 16.1732 +	splx(s);
 16.1733 +}
 16.1734 +
 16.1735 +/*
 16.1736 + * pmap_lock_pvhs: Lock pvh1 and optional pvh2
 16.1737 + *                 Observe locking order when locking both pvhs
 16.1738 + */
 16.1739 +
 16.1740 +__inline static void
 16.1741 +pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
 16.1742 +{
 16.1743 +
 16.1744 +	if (pvh2 == NULL) {
 16.1745 +		simple_lock(&pvh1->pvh_lock);
 16.1746 +		return;
 16.1747 +	}
 16.1748 +
 16.1749 +	if (pvh1 < pvh2) {
 16.1750 +		simple_lock(&pvh1->pvh_lock);
 16.1751 +		simple_lock(&pvh2->pvh_lock);
 16.1752 +	} else {
 16.1753 +		simple_lock(&pvh2->pvh_lock);
 16.1754 +		simple_lock(&pvh1->pvh_lock);
 16.1755 +	}
 16.1756 +}
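
pmap_lock_pvhs() dodges deadlock on a pair of pv_head locks by always
acquiring the lower-addressed one first, giving every thread the same global
order.  The same idea as a user-level pthreads sketch (illustrative names;
like the kernel code, it assumes the two locks are distinct):

    #include <pthread.h>
    #include <stdio.h>

    struct bucket {
            pthread_mutex_t lock;
    };

    static void
    lock_pair(struct bucket *a, struct bucket *b)
    {
            /* global order = ascending address: no thread can hold one
             * lock of a pair while waiting on the other taken in the
             * reverse order (assumes a != b) */
            if (a < b) {
                    pthread_mutex_lock(&a->lock);
                    pthread_mutex_lock(&b->lock);
            } else {
                    pthread_mutex_lock(&b->lock);
                    pthread_mutex_lock(&a->lock);
            }
    }

    int
    main(void)
    {
            struct bucket x = { PTHREAD_MUTEX_INITIALIZER };
            struct bucket y = { PTHREAD_MUTEX_INITIALIZER };

            lock_pair(&x, &y);
            printf("both locked\n");
            pthread_mutex_unlock(&y.lock);
            pthread_mutex_unlock(&x.lock);
            return 0;
    }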
 16.1757 +
 16.1758 +
 16.1759 +/*
 16.1760 + * main pv_entry manipulation functions:
 16.1761 + *   pmap_enter_pv: enter a mapping onto a pv_head list
 16.1762 + *   pmap_remove_pv: remove a mapping from a pv_head list
 16.1763 + *
 16.1764 + * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 
 16.1765 + *       the pvh before calling
 16.1766 + */
 16.1767 +
 16.1768 +/*
 16.1769 + * pmap_enter_pv: enter a mapping onto a pv_head list
 16.1770 + *
 16.1771 + * => caller should hold the proper lock on pmap_main_lock
 16.1772 + * => caller should have pmap locked
 16.1773 + * => caller should have the pv_head locked
 16.1774 + * => caller should adjust ptp's wire_count before calling
 16.1775 + */
 16.1776 +
 16.1777 +__inline static void
 16.1778 +pmap_enter_pv(pvh, pve, pmap, va, ptp)
 16.1779 +	struct pv_head *pvh;
 16.1780 +	struct pv_entry *pve;	/* preallocated pve for us to use */
 16.1781 +	struct pmap *pmap;
 16.1782 +	vaddr_t va;
 16.1783 +	struct vm_page *ptp;	/* PTP in pmap that maps this VA */
 16.1784 +{
 16.1785 +	pve->pv_pmap = pmap;
 16.1786 +	pve->pv_va = va;
 16.1787 +	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
 16.1788 +	SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
 16.1789 +}
 16.1790 +
 16.1791 +/*
 16.1792 + * pmap_remove_pv: try to remove a mapping from a pv_list
 16.1793 + *
 16.1794 + * => caller should hold proper lock on pmap_main_lock
 16.1795 + * => pmap should be locked
 16.1796 + * => caller should hold lock on pv_head [so that attrs can be adjusted]
 16.1797 + * => caller should adjust ptp's wire_count and free PTP if needed
 16.1798 + * => we return the removed pve
 16.1799 + */
 16.1800 +
 16.1801 +__inline static struct pv_entry *
 16.1802 +pmap_remove_pv(pvh, pmap, va)
 16.1803 +	struct pv_head *pvh;
 16.1804 +	struct pmap *pmap;
 16.1805 +	vaddr_t va;
 16.1806 +{
 16.1807 +	struct pv_entry tmp, *pve;
 16.1808 +
 16.1809 +	tmp.pv_pmap = pmap;
 16.1810 +	tmp.pv_va = va;
 16.1811 +	pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
 16.1812 +	if (pve == NULL)
 16.1813 +		return (NULL);
 16.1814 +	SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
 16.1815 +	return(pve);				/* return removed pve */
 16.1816 +}
 16.1817 +
 16.1818 +/*
 16.1819 + * p t p   f u n c t i o n s
 16.1820 + */
 16.1821 +
 16.1822 +/*
 16.1823 + * pmap_alloc_ptp: allocate a PTP for a PMAP
 16.1824 + *
 16.1825 + * => pmap should already be locked by caller
 16.1826 + * => we use the ptp's wire_count to count the number of active mappings
 16.1827 + *	in the PTP (we start it at one to prevent any chance this PTP
 16.1828 + *	will ever leak onto the active/inactive queues)
 16.1829 + */
 16.1830 +
 16.1831 +__inline static struct vm_page *
 16.1832 +pmap_alloc_ptp(pmap, pde_index)
 16.1833 +	struct pmap *pmap;
 16.1834 +	int pde_index;
 16.1835 +{
 16.1836 +	struct vm_page *ptp;
 16.1837 +	pd_entry_t *mapdp;
 16.1838 +
 16.1839 +	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
 16.1840 +			    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
 16.1841 +	if (ptp == NULL)
 16.1842 +		return(NULL);
 16.1843 +
 16.1844 +	/* got one! */
 16.1845 +	ptp->flags &= ~PG_BUSY;	/* never busy */
 16.1846 +	ptp->wire_count = 1;	/* no mappings yet */
 16.1847 +	mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]);
 16.1848 +	PDE_SET(&pmap->pm_pdir[pde_index], mapdp,
 16.1849 +	    (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V));
 16.1850 +	pmap->pm_stats.resident_count++;	/* count PTP as resident */
 16.1851 +	pmap->pm_ptphint = ptp;
 16.1852 +	return(ptp);
 16.1853 +}
 16.1854 +
 16.1855 +/*
 16.1856 + * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
 16.1857 + *
 16.1858 + * => pmap should NOT be pmap_kernel()
 16.1859 + * => pmap should be locked
 16.1860 + */
 16.1861 +
 16.1862 +static struct vm_page *
 16.1863 +pmap_get_ptp(pmap, pde_index)
 16.1864 +	struct pmap *pmap;
 16.1865 +	int pde_index;
 16.1866 +{
 16.1867 +	struct vm_page *ptp;
 16.1868 +
 16.1869 +	if (pmap_valid_entry(pmap->pm_pdir[pde_index])) {
 16.1870 +
 16.1871 +		/* valid... check hint (saves us a PA->PG lookup) */
 16.1872 +		if (pmap->pm_ptphint &&
 16.1873 +		    (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) ==
 16.1874 +		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
 16.1875 +			return(pmap->pm_ptphint);
 16.1876 +
 16.1877 +		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
 16.1878 +#ifdef DIAGNOSTIC
 16.1879 +		if (ptp == NULL)
 16.1880 +			panic("pmap_get_ptp: unmanaged user PTP");
 16.1881 +#endif
 16.1882 +		pmap->pm_ptphint = ptp;
 16.1883 +		return(ptp);
 16.1884 +	}
 16.1885 +
 16.1886 +	/* allocate a new PTP (updates ptphint) */
 16.1887 +	return(pmap_alloc_ptp(pmap, pde_index));
 16.1888 +}
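
pmap_get_ptp() consults pm_ptphint before falling back to uvm_pagelookup(),
a one-entry cache that pays off because consecutive faults tend to land in
the same PTP.  A standalone sketch of the hint pattern; the slow lookup here
is a stand-in:

    #include <stdio.h>

    #define NSLOTS 1024

    static int table[NSLOTS];
    static int hint_key = -1;   /* one-entry cache: last key looked up */
    static int *hint_val;

    static int *
    slow_lookup(int key)
    {
            /* stand-in for uvm_pagelookup(): imagine a tree walk here */
            return &table[key % NSLOTS];
    }

    static int *
    lookup(int key)
    {
            if (key == hint_key)    /* hint hit: skip the walk entirely */
                    return hint_val;
            hint_val = slow_lookup(key);
            hint_key = key;
            return hint_val;
    }

    int
    main(void)
    {
            *lookup(7) = 42;
            printf("%d\n", *lookup(7));     /* second call hits the hint */
            return 0;
    }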
 16.1889 +
 16.1890 +/*
 16.1891 + * p m a p  l i f e c y c l e   f u n c t i o n s
 16.1892 + */
 16.1893 +
 16.1894 +/*
 16.1895 + * pmap_pdp_ctor: constructor for the PDP cache.
 16.1896 + */
 16.1897 +
 16.1898 +int
 16.1899 +pmap_pdp_ctor(void *arg, void *object, int flags)
 16.1900 +{
 16.1901 +	pd_entry_t *pdir = object;
 16.1902 +	paddr_t pdirpa;
 16.1903 +
 16.1904 +	/*
 16.1905 +	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
 16.1906 +	 * WE MUST NOT BLOCK!
 16.1907 +	 */
 16.1908 +
 16.1909 +	/* fetch the physical address of the page directory. */
 16.1910 +	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
 16.1911 +
 16.1912 +	XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa));
 16.1913 +
 16.1914 +	/* zero init area */
 16.1915 +	memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t));
 16.1916 +
 16.1917 +	/* put in recursive PDE to map the PTEs */
 16.1918 +	pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */);
 16.1919 +
 16.1920 +	/* put in kernel VM PDEs */
 16.1921 +	memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN],
 16.1922 +	    nkpde * sizeof(pd_entry_t));
 16.1923 +
 16.1924 +	/* zero the rest */
 16.1925 +	memset(&pdir[PDSLOT_KERN + nkpde], 0,
 16.1926 +	    PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
 16.1927 +
 16.1928 +	pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ,
 16.1929 +	    VM_PROT_READ);
 16.1930 +	pmap_update(pmap_kernel());
 16.1931 +
 16.1932 +	/* pin page type */
 16.1933 +	xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE);
 16.1934 +	xpq_flush_queue();
 16.1935 +
 16.1936 +	return (0);
 16.1937 +}
 16.1938 +
 16.1939 +void
 16.1940 +pmap_pdp_dtor(void *arg, void *object)
 16.1941 +{
 16.1942 +	pd_entry_t *pdir = object;
 16.1943 +	paddr_t pdirpa;
 16.1944 +
 16.1945 +	/* fetch the physical address of the page directory. */
 16.1946 +	pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME;
 16.1947 +
 16.1948 +	XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa));
 16.1949 +
 16.1950 +	/* unpin page type */
 16.1951 +	xpq_queue_unpin_table(xpmap_ptom(pdirpa));
 16.1952 +	xpq_flush_queue();
 16.1953 +}
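
pmap_pdp_ctor() installs the recursive PDE (pdir[PDSLOT_PTE] points back at
the page directory itself), which is what makes PTE_BASE and PDP_BASE
usable: one directory slot exposes every PTE, and the directory's own
entries, as ordinary virtual addresses.  A sketch of the arithmetic for
non-PAE i386 (4 KiB pages, 1024-entry tables); the slot number is
illustrative, not the kernel's PDSLOT_PTE value:

    #include <stdint.h>
    #include <stdio.h>

    #define RSLOT 0x3bfu                    /* recursive slot (example) */
    #define PTE_WINDOW (RSLOT << 22)        /* 4 MiB window of all PTEs */

    /* virtual address of the PTE that maps va: the directory walk for
     * the window lands back in the directory, then in a page table, so
     * the "data" read there is the PTE itself */
    static uint32_t
    pte_va(uint32_t va)
    {
            return PTE_WINDOW + (va >> 12) * 4;
    }

    int
    main(void)
    {
            uint32_t va = 0xc0123456;

            printf("PTE for %#x lives at %#x\n",
                (unsigned)va, (unsigned)pte_va(va));
            /* the directory maps itself, so its PDEs show up as the
             * "PTEs" of the window's own 4 MiB region */
            printf("PDEs visible at %#x\n",
                (unsigned)(PTE_WINDOW + (RSLOT << 12)));
            return 0;
    }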
 16.1954 +
 16.1955 +/*
 16.1956 + * pmap_create: create a pmap
 16.1957 + *
 16.1958 + * => note: old pmap interface took a "size" arg which allowed for
 16.1959 + *	the creation of "software only" pmaps (not in bsd).
 16.1960 + */
 16.1961 +
 16.1962 +struct pmap *
 16.1963 +pmap_create()
 16.1964 +{
 16.1965 +	struct pmap *pmap;
 16.1966 +	u_int gen;
 16.1967 +
 16.1968 +	XENPRINTF(("pmap_create\n"));
 16.1969 +	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
 16.1970 +
 16.1971 +	/* init uvm_object */
 16.1972 +	simple_lock_init(&pmap->pm_obj.vmobjlock);
 16.1973 +	pmap->pm_obj.pgops = NULL;	/* currently not a mappable object */
 16.1974 +	TAILQ_INIT(&pmap->pm_obj.memq);
 16.1975 +	pmap->pm_obj.uo_npages = 0;
 16.1976 +	pmap->pm_obj.uo_refs = 1;
 16.1977 +	pmap->pm_stats.wired_count = 0;
 16.1978 +	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
 16.1979 +	pmap->pm_ptphint = NULL;
 16.1980 +	pmap->pm_hiexec = 0;
 16.1981 +	pmap->pm_flags = 0;
 16.1982 +	pmap->pm_cpus = 0;
 16.1983 +
 16.1984 +	/* init the LDT */
 16.1985 +	pmap->pm_ldt = NULL;
 16.1986 +	pmap->pm_ldt_len = 0;
 16.1987 +	pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
 16.1988 +
 16.1989 +	/* allocate PDP */
 16.1990 +
 16.1991 +	/*
 16.1992 +	 * we need to lock pmaps_lock to prevent nkpde from changing on
 16.1993 +	 * us.  note that there is no need to splvm to protect us from
 16.1994 +	 * malloc since malloc allocates out of a submap and we should
 16.1995 +	 * have already allocated kernel PTPs to cover the range...
 16.1996 +	 *
 16.1997 +	 * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor
 16.1998 +	 * must we call pmap_growkernel() while holding it!
 16.1999 +	 */
 16.2000 +
 16.2001 + try_again:
 16.2002 +	gen = pmap_pdp_cache_generation;
 16.2003 +	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
 16.2004 +
 16.2005 +	simple_lock(&pmaps_lock);
 16.2006 +
 16.2007 +	if (gen != pmap_pdp_cache_generation) {
 16.2008 +		simple_unlock(&pmaps_lock);
 16.2009 +		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
 16.2010 +		goto try_again;
 16.2011 +	}
 16.2012 +
 16.2013 +	pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME;
 16.2014 +	XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap,
 16.2015 +		   (void *)pmap->pm_pdirpa,
 16.2016 +		   (void *)xpmap_ptom(pmap->pm_pdirpa),
 16.2017 +		   (void *)pmap->pm_pdir[PDSLOT_PTE]));
 16.2018 +
 16.2019 +	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
 16.2020 +
 16.2021 +	simple_unlock(&pmaps_lock);
 16.2022 +
 16.2023 +	return (pmap);
 16.2024 +}
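
The try_again loop above is a generation-count pattern: snapshot
pmap_pdp_cache_generation, allocate (and possibly sleep) with no lock held,
then recheck the generation under pmaps_lock and throw the object back if
pmap_growkernel() invalidated the cache in the meantime.  A compact
user-level sketch under those assumptions, with illustrative names:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
    static _Atomic unsigned g_generation;  /* bumped when cache goes stale */

    static void *cache_get(void)         { return malloc(64); } /* may sleep */
    static void  cache_destruct(void *o) { free(o); }

    static void *
    get_fresh_object(void)
    {
            void *obj;
            unsigned gen;

            for (;;) {
                    gen = atomic_load(&g_generation);  /* snapshot first */
                    obj = cache_get();                 /* lock not held */

                    pthread_mutex_lock(&g_lock);
                    if (gen == atomic_load(&g_generation))
                            break;          /* nothing changed: keep it */
                    pthread_mutex_unlock(&g_lock);
                    cache_destruct(obj);    /* built against stale state */
            }
            pthread_mutex_unlock(&g_lock);
            return obj;
    }

    int
    main(void)
    {
            free(get_fresh_object());
            return 0;
    }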
 16.2025 +
 16.2026 +/*
 16.2027 + * pmap_destroy: drop reference count on pmap.   free pmap if
 16.2028 + *	reference count goes to zero.
 16.2029 + */
 16.2030 +
 16.2031 +void
 16.2032 +pmap_destroy(pmap)
 16.2033 +	struct pmap *pmap;
 16.2034 +{
 16.2035 +	int refs;
 16.2036 +#ifdef DIAGNOSTIC
 16.2037 +	struct cpu_info *ci;
 16.2038 +	CPU_INFO_ITERATOR cii;
 16.2039 +#endif /* DIAGNOSTIC */
 16.2040 +
 16.2041 +	/*
 16.2042 +	 * drop reference count
 16.2043 +	 */
 16.2044 +
 16.2045 +	simple_lock(&pmap->pm_obj.vmobjlock);
 16.2046 +	refs = --pmap->pm_obj.uo_refs;
 16.2047 +	simple_unlock(&pmap->pm_obj.vmobjlock);
 16.2048 +	if (refs > 0) {
 16.2049 +		return;
 16.2050 +	}
 16.2051 +
 16.2052 +#ifdef DIAGNOSTIC
 16.2053 +	for (CPU_INFO_FOREACH(cii, ci))
 16.2054 +		if (ci->ci_pmap == pmap)
 16.2055 +			panic("destroying pmap being used");
 16.2056 +#endif /* DIAGNOSTIC */
 16.2057 +
 16.2058 +	/*
 16.2059 +	 * reference count is zero, free pmap resources and then free pmap.
 16.2060 +	 */
 16.2061 +
 16.2062 +	XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap,
 16.2063 +		   (void *)pmap->pm_pdirpa,
 16.2064 +		   (void *)xpmap_ptom(pmap->pm_pdirpa)));
 16.2065 +
 16.2066 +	/*
 16.2067 +	 * remove it from global list of pmaps
 16.2068 +	 */
 16.2069 +
 16.2070 +	simple_lock(&pmaps_lock);
 16.2071 +	LIST_REMOVE(pmap, pm_list);
 16.2072 +	simple_unlock(&pmaps_lock);
 16.2073 +
 16.2074 +	/*
 16.2075 +	 * destroyed pmap shouldn't have remaining PTPs
 16.2076 +	 */
 16.2077 +
 16.2078 +	KASSERT(pmap->pm_obj.uo_npages == 0);
 16.2079 +	KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq));
 16.2080 +
 16.2081 +	/*
 16.2082 +	 * MULTIPROCESSOR -- no need to flush out of other processors'
 16.2083 +	 * APTE space because we do that in pmap_unmap_ptes().
 16.2084 +	 */
 16.2085 +	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
 16.2086 +
 16.2087 +#ifdef USER_LDT
 16.2088 +	if (pmap->pm_flags & PMF_USER_LDT) {
 16.2089 +		/*
 16.2090 +		 * no need to switch the LDT; this address space is gone,
 16.2091 +		 * nothing is using it.
 16.2092 +		 *
 16.2093 +		 * No need to lock the pmap for ldt_free (or anything else),
 16.2094 +		 * we're the last one to use it.
 16.2095 +		 */
 16.2096 +		ldt_free(pmap);
 16.2097 +		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
 16.2098 +			    pmap->pm_ldt_len * sizeof(union descriptor));
 16.2099 +	}
 16.2100 +#endif
 16.2101 +
 16.2102 +	pool_put(&pmap_pmap_pool, pmap);
 16.2103 +}
 16.2104 +
 16.2105 +/*
 16.2106 + *	Add a reference to the specified pmap.
 16.2107 + */
 16.2108 +
 16.2109 +void
 16.2110 +pmap_reference(pmap)
 16.2111 +	struct pmap *pmap;
 16.2112 +{
 16.2113 +	simple_lock(&pmap->pm_obj.vmobjlock);
 16.2114 +	pmap->pm_obj.uo_refs++;
 16.2115 +	simple_unlock(&pmap->pm_obj.vmobjlock);
 16.2116 +}
 16.2117 +
 16.2118 +#if defined(PMAP_FORK)
 16.2119 +/*
 16.2120 + * pmap_fork: perform any necessary data structure manipulation when
 16.2121 + * a VM space is forked.
 16.2122 + */
 16.2123 +
 16.2124 +void
 16.2125 +pmap_fork(pmap1, pmap2)
 16.2126 +	struct pmap *pmap1, *pmap2;
 16.2127 +{
 16.2128 +	simple_lock(&pmap1->pm_obj.vmobjlock);
 16.2129 +	simple_lock(&pmap2->pm_obj.vmobjlock);
 16.2130 +
 16.2131 +#ifdef USER_LDT
 16.2132 +	/* Copy the LDT, if necessary. */
 16.2133 +	if (pmap1->pm_flags & PMF_USER_LDT) {
 16.2134 +		union descriptor *new_ldt;
 16.2135 +		size_t len;
 16.2136 +
 16.2137 +		len = pmap1->pm_ldt_len * sizeof(union descriptor);
 16.2138 +		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len);
 16.2139 +		memcpy(new_ldt, pmap1->pm_ldt, len);
 16.2140 +		pmap2->pm_ldt = new_ldt;
 16.2141 +		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
 16.2142 +		pmap2->pm_flags |= PMF_USER_LDT;
 16.2143 +		ldt_alloc(pmap2, new_ldt, len);
 16.2144 +	}
 16.2145 +#endif /* USER_LDT */
 16.2146 +
 16.2147 +	simple_unlock(&pmap2->pm_obj.vmobjlock);
 16.2148 +	simple_unlock(&pmap1->pm_obj.vmobjlock);
 16.2149 +}
 16.2150 +#endif /* PMAP_FORK */
 16.2151 +
 16.2152 +#ifdef USER_LDT
 16.2153 +/*
 16.2154 + * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
 16.2155 + * restore the default.
 16.2156 + */
 16.2157 +
 16.2158 +void
 16.2159 +pmap_ldt_cleanup(l)
 16.2160 +	struct lwp *l;
 16.2161 +{
 16.2162 +	struct pcb *pcb = &l->l_addr->u_pcb;
 16.2163 +	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
 16.2164 +	union descriptor *old_ldt = NULL;
 16.2165 +	size_t len = 0;
 16.2166 +
 16.2167 +	simple_lock(&pmap->pm_obj.vmobjlock);
 16.2168 +
 16.2169 +	if (pmap->pm_flags & PMF_USER_LDT) {
 16.2170 +		ldt_free(pmap);
 16.2171 +		pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
 16.2172 +		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
 16.2173 +		if (pcb == curpcb)
 16.2174 +			lldt(pcb->pcb_ldt_sel);
 16.2175 +		old_ldt = pmap->pm_ldt;
 16.2176 +		len = pmap->pm_ldt_len * sizeof(union descriptor);
 16.2177 +		pmap->pm_ldt = NULL;
 16.2178 +		pmap->pm_ldt_len = 0;
 16.2179 +		pmap->pm_flags &= ~PMF_USER_LDT;
 16.2180 +	}
 16.2181 +
 16.2182 +	simple_unlock(&pmap->pm_obj.vmobjlock);
 16.2183 +
 16.2184 +	if (old_ldt != NULL)
 16.2185 +		uvm_km_free(kernel_map, (vaddr_t)old_ldt, len);
 16.2186 +}
 16.2187 +#endif /* USER_LDT */
 16.2188 +
 16.2189 +/*
 16.2190 + * pmap_activate: activate a process' pmap
 16.2191 + *
 16.2192 + * => called from cpu_switch()
 16.2193 + * => if lwp is the curlwp, then set ci_want_pmapload so that
 16.2194 + *    actual MMU context switch will be done by pmap_load() later
 16.2195 + */
 16.2196 +
 16.2197 +void
 16.2198 +pmap_activate(l)
 16.2199 +	struct lwp *l;
 16.2200 +{
 16.2201 +	struct cpu_info *ci = curcpu();
 16.2202 +	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
 16.2203 +
 16.2204 +	if (l == ci->ci_curlwp) {
 16.2205 +		struct pcb *pcb;
 16.2206 +
 16.2207 +		KASSERT(ci->ci_want_pmapload == 0);
 16.2208 +		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
 16.2209 +#ifdef KSTACK_CHECK_DR0
 16.2210 +		/*
 16.2211 +		 * setup breakpoint on the top of stack
 16.2212 +		 */
 16.2213 +		if (l == &lwp0)
 16.2214 +			dr0(0, 0, 0, 0);
 16.2215 +		else
 16.2216 +			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
 16.2217 +#endif
 16.2218 +
 16.2219 +		/*
 16.2220 +		 * no need to switch to kernel vmspace because
 16.2221 +		 * it's a subset of any vmspace.
 16.2222 +		 */
 16.2223 +
 16.2224 +		if (pmap == pmap_kernel()) {
 16.2225 +			ci->ci_want_pmapload = 0;
 16.2226 +			return;
 16.2227 +		}
 16.2228 +
 16.2229 +		pcb = &l->l_addr->u_pcb;
 16.2230 +		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
 16.2231 +
 16.2232 +		ci->ci_want_pmapload = 1;
 16.2233 +	}
 16.2234 +}
 16.2235 +
 16.2236 +/*
 16.2237 + * pmap_reactivate: try to regain reference to the pmap.
 16.2238 + */
 16.2239 +
 16.2240 +static boolean_t
 16.2241 +pmap_reactivate(struct pmap *pmap)
 16.2242 +{
 16.2243 +	struct cpu_info *ci = curcpu();
 16.2244 +	u_int32_t cpumask = 1U << ci->ci_cpuid;
 16.2245 +	int s;
 16.2246 +	boolean_t result;
 16.2247 +	u_int32_t oldcpus;
 16.2248 +
 16.2249 +	/*
 16.2250 +	 * if we still have a lazy reference to this pmap,
 16.2251 +	 * we can assume that there was no tlb shootdown
 16.2252 +	 * for this pmap in the meantime.
 16.2253 +	 */
 16.2254 +
 16.2255 +	s = splipi(); /* protect from tlb shootdown ipis. */
 16.2256 +	oldcpus = pmap->pm_cpus;
 16.2257 +	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
 16.2258 +	if (oldcpus & cpumask) {
 16.2259 +		KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY);
 16.2260 +		/* got it */
 16.2261 +		result = TRUE;
 16.2262 +	} else {
 16.2263 +		KASSERT(ci->ci_tlbstate == TLBSTATE_STALE);
 16.2264 +		result = FALSE;
 16.2265 +	}
 16.2266 +	ci->ci_tlbstate = TLBSTATE_VALID;
 16.2267 +	splx(s);
 16.2268 +
 16.2269 +	return result;
 16.2270 +}
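
pmap_reactivate() hinges on the per-pmap CPU bitmask: a CPU atomically sets
its bit in pm_cpus on (re)activation, and the previous value of that bit
says whether its lazy TLB contents survived.  The same shape with C11
atomics standing in for x86_atomic_setbits_l/clearbits_l:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint32_t pm_cpus;  /* bit n set: CPU n holds a ref */

    static bool
    reactivate(unsigned cpuid)
    {
            uint32_t mask = UINT32_C(1) << cpuid;
            /* set our bit and learn, in the same atomic op, whether it
             * was already set (no shootdown cleared it meanwhile) */
            uint32_t old = atomic_fetch_or(&pm_cpus, mask);

            return (old & mask) != 0;   /* true: TLB still usable */
    }

    static void
    deactivate(unsigned cpuid)
    {
            atomic_fetch_and(&pm_cpus, ~(UINT32_C(1) << cpuid));
    }

    int
    main(void)
    {
            printf("%d\n", reactivate(2)); /* 0: first touch, flush TLB */
            printf("%d\n", reactivate(2)); /* 1: bit survived, skip flush */
            deactivate(2);
            return 0;
    }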
 16.2271 +
 16.2272 +/*
 16.2273 + * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
 16.2274 + */
 16.2275 +
 16.2276 +void
 16.2277 +pmap_load()
 16.2278 +{
 16.2279 +	struct cpu_info *ci = curcpu();
 16.2280 +	u_int32_t cpumask = 1U << ci->ci_cpuid;
 16.2281 +	struct pmap *pmap;
 16.2282 +	struct pmap *oldpmap;
 16.2283 +	struct lwp *l;
 16.2284 +	struct pcb *pcb;
 16.2285 +	pd_entry_t *mapdp;
 16.2286 +	int s;
 16.2287 +
 16.2288 +	KASSERT(ci->ci_want_pmapload);
 16.2289 +
 16.2290 +	l = ci->ci_curlwp;
 16.2291 +	KASSERT(l != NULL);
 16.2292 +	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
 16.2293 +	KASSERT(pmap != pmap_kernel());
 16.2294 +	oldpmap = ci->ci_pmap;
 16.2295 +
 16.2296 +	pcb = ci->ci_curpcb;
 16.2297 +	KASSERT(pcb == &l->l_addr->u_pcb);
 16.2298 +	/* loaded by pmap_activate */
 16.2299 +	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
 16.2300 +
 16.2301 +	if (pmap == oldpmap) {
 16.2302 +		if (!pmap_reactivate(pmap)) {
 16.2303 +
 16.2304 +			/*
 16.2305 +			 * the pmap was changed while we were deactivated;
 16.2306 +			 * our tlb may be stale.
 16.2307 +			 */
 16.2308 +
 16.2309 +			tlbflush();
 16.2310 +		}
 16.2311 +
 16.2312 +		ci->ci_want_pmapload = 0;
 16.2313 +		return;
 16.2314 +	}
 16.2315 +
 16.2316 +	/*
 16.2317 +	 * actually switch pmap.
 16.2318 +	 */
 16.2319 +
 16.2320 +	x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);
 16.2321 +
 16.2322 +	KASSERT((pmap->pm_cpus & cpumask) == 0);
 16.2323 +
 16.2324 +	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
 16.2325 +	pmap_reference(pmap);
 16.2326 +	KERNEL_UNLOCK();
 16.2327 +
 16.2328 +	/*
 16.2329 +	 * mark the pmap in use by this processor.
 16.2330 +	 */
 16.2331 +
 16.2332 +	s = splipi();
 16.2333 +	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
 16.2334 +	ci->ci_pmap = pmap;
 16.2335 +	ci->ci_tlbstate = TLBSTATE_VALID;
 16.2336 +	splx(s);
 16.2337 +
 16.2338 +	/*
 16.2339 +	 * clear apdp slot before loading %cr3 since Xen only allows
 16.2340 +	 * linear pagetable mappings in the current pagetable.
 16.2341 +	 */
 16.2342 +	KDASSERT(curapdp == 0);
 16.2343 +	mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
 16.2344 +	PDE_CLEAR(APDP_PDE, mapdp);
 16.2345 +
 16.2346 +	/*
 16.2347 +	 * update tss and load corresponding registers.
 16.2348 +	 */
 16.2349 +
 16.2350 +	lldt(pcb->pcb_ldt_sel);
 16.2351 +	pcb->pcb_cr3 = pmap->pm_pdirpa;
 16.2352 +	lcr3(pcb->pcb_cr3);
 16.2353 +
 16.2354 +	ci->ci_want_pmapload = 0;
 16.2355 +
 16.2356 +	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
 16.2357 +	pmap_destroy(oldpmap);
 16.2358 +	KERNEL_UNLOCK();
 16.2359 +}
 16.2360 +
 16.2361 +/*
 16.2362 + * pmap_deactivate: deactivate a process' pmap
 16.2363 + */
 16.2364 +
 16.2365 +void
 16.2366 +pmap_deactivate(l)
 16.2367 +	struct lwp *l;
 16.2368 +{
 16.2369 +
 16.2370 +	if (l == curlwp)
 16.2371 +		pmap_deactivate2(l);
 16.2372 +}
 16.2373 +
 16.2374 +/*
 16.2375 + * pmap_deactivate2: context switch version of pmap_deactivate.
 16.2376 + * always treat l as curlwp.
 16.2377 + */
 16.2378 +
 16.2379 +void
 16.2380 +pmap_deactivate2(l)
 16.2381 +	struct lwp *l;
 16.2382 +{
 16.2383 +	struct pmap *pmap;
 16.2384 +	struct cpu_info *ci = curcpu();
 16.2385 +
 16.2386 +	if (ci->ci_want_pmapload) {
 16.2387 +		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
 16.2388 +		    != pmap_kernel());
 16.2389 +		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
 16.2390 +		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
 16.2391 +
 16.2392 +		/*
 16.2393 +		 * userspace has not been touched.
 16.2394 +		 * nothing to do here.
 16.2395 +		 */
 16.2396 +
 16.2397 +		ci->ci_want_pmapload = 0;
 16.2398 +		return;
 16.2399 +	}
 16.2400 +
 16.2401 +	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
 16.2402 +
 16.2403 +	if (pmap == pmap_kernel()) {
 16.2404 +		return;
 16.2405 +	}
 16.2406 +
 16.2407 +	KASSERT(ci->ci_pmap == pmap);
 16.2408 +
 16.2409 +	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
 16.2410 +	ci->ci_tlbstate = TLBSTATE_LAZY;
 16.2411 +	XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n",
 16.2412 +		      l, (void *)l->l_addr->u_pcb.pcb_ebp, 
 16.2413 +		      (void *)l->l_addr->u_pcb.pcb_esp));
 16.2414 +}
 16.2415 +
 16.2416 +/*
 16.2417 + * end of lifecycle functions
 16.2418 + */
 16.2419 +
 16.2420 +/*
 16.2421 + * some misc. functions
 16.2422 + */
 16.2423 +
 16.2424 +/*
 16.2425 + * pmap_extract: extract a PA for the given VA
 16.2426 + */
 16.2427 +
 16.2428 +boolean_t
 16.2429 +pmap_extract(pmap, va, pap)
 16.2430 +	struct pmap *pmap;
 16.2431 +	vaddr_t va;
 16.2432 +	paddr_t *pap;
 16.2433 +{
 16.2434 +	pt_entry_t *ptes, pte;
 16.2435 +	pd_entry_t pde;
 16.2436 +
 16.2437 +	if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) {
 16.2438 +#ifdef LARGEPAGES
 16.2439 +		if (pde & PG_PS) {
 16.2440 +			if (pap != NULL)
 16.2441 +				*pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME);
 16.2442 +			return (TRUE);
 16.2443 +		}
 16.2444 +#endif
 16.2445 +
 16.2446 +		ptes = pmap_map_ptes(pmap);
 16.2447 +		pte = PTE_GET(&ptes[x86_btop(va)]);
 16.2448 +		pmap_unmap_ptes(pmap);
 16.2449 +
 16.2450 +		if (__predict_true((pte & PG_V) != 0)) {
 16.2451 +			if (pap != NULL)
 16.2452 +				*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
 16.2453 +			return (TRUE);
 16.2454 +		}
 16.2455 +	}
 16.2456 +	return (FALSE);
 16.2457 +}
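
pmap_extract() is a software page-table walk: the PDE index, the PTE index
and the byte offset are three bit-fields of the VA, and the PG_PS case
short-circuits the second level for 4 MB pages.  The split for non-PAE i386:

    #include <stdint.h>
    #include <stdio.h>

    #define PGSHIFT      12     /* 4 KiB pages */
    #define PDSHIFT      22     /* 4 MiB per PDE */
    #define PTES_PER_PT  1024u

    static void
    split(uint32_t va)
    {
            uint32_t pde = va >> PDSHIFT;                 /* like pdei(va) */
            uint32_t pte = (va >> PGSHIFT) & (PTES_PER_PT - 1); /* in PTP */
            uint32_t off = va & ((UINT32_C(1) << PGSHIFT) - 1); /* in page */

            printf("va %#010x -> pde %u pte %u off %#x\n",
                (unsigned)va, (unsigned)pde, (unsigned)pte, (unsigned)off);
    }

    int
    main(void)
    {
            split(UINT32_C(0xc0123456));
            return 0;
    }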
 16.2458 +
 16.2459 +
 16.2460 +/*
 16.2461 + * vtophys: virtual address to physical address.  For use by
 16.2462 + * machine-dependent code only.
 16.2463 + */
 16.2464 +
 16.2465 +paddr_t
 16.2466 +vtophys(va)
 16.2467 +	vaddr_t va;
 16.2468 +{
 16.2469 +	paddr_t pa;
 16.2470 +
 16.2471 +	if (pmap_extract(pmap_kernel(), va, &pa) == TRUE)
 16.2472 +		return (pa);
 16.2473 +	return (0);
 16.2474 +}
 16.2475 +
 16.2476 +
 16.2477 +/*
 16.2478 + * pmap_virtual_space: used during bootup [pmap_steal_memory] to
 16.2479 + *	determine the bounds of the kernel virtual address space.
 16.2480 + */
 16.2481 +
 16.2482 +void
 16.2483 +pmap_virtual_space(startp, endp)
 16.2484 +	vaddr_t *startp;
 16.2485 +	vaddr_t *endp;
 16.2486 +{
 16.2487 +	*startp = virtual_avail;
 16.2488 +	*endp = virtual_end;
 16.2489 +}
 16.2490 +
 16.2491 +/*
 16.2492 + * pmap_map: map a range of PAs into kvm
 16.2493 + *
 16.2494 + * => used during crash dump
 16.2495 + * => XXX: pmap_map() should be phased out?
 16.2496 + */
 16.2497 +
 16.2498 +vaddr_t
 16.2499 +pmap_map(va, spa, epa, prot)
 16.2500 +	vaddr_t va;
 16.2501 +	paddr_t spa, epa;
 16.2502 +	vm_prot_t prot;
 16.2503 +{
 16.2504 +	while (spa < epa) {
 16.2505 +		pmap_enter(pmap_kernel(), va, spa, prot, 0);
 16.2506 +		va += PAGE_SIZE;
 16.2507 +		spa += PAGE_SIZE;
 16.2508 +	}
 16.2509 +	pmap_update(pmap_kernel());
 16.2510 +	return va;
 16.2511 +}
 16.2512 +
 16.2513 +/*
 16.2514 + * pmap_zero_page: zero a page
 16.2515 + */
 16.2516 +
 16.2517 +void
 16.2518 +pmap_zero_page(pa)
 16.2519 +	paddr_t pa;
 16.2520 +{
 16.2521 +#ifdef MULTIPROCESSOR
 16.2522 +	int id = cpu_number();
 16.2523 +#endif
 16.2524 +	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 16.2525 +	pt_entry_t *maptp;
 16.2526 +	caddr_t zerova = VASLEW(zerop, id);
 16.2527 +
 16.2528 +#ifdef DIAGNOSTIC
 16.2529 +	if (PTE_GET(zpte))
 16.2530 +		panic("pmap_zero_page: lock botch");
 16.2531 +#endif
 16.2532 +
 16.2533 +	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
 16.2534 +	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
 16.2535 +	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
 16.2536 +
 16.2537 +	memset(zerova, 0, PAGE_SIZE);			/* zero */
 16.2538 +	PTE_CLEAR(zpte, maptp);				/* zap! */
 16.2539 +}
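
pmap_zero_page() works through a reserved per-CPU mapping window; the
PTESLEW/VASLEW macros offset each CPU's copy of the special VAs by one cache
line of PTEs (NPTECL pages of VA) so no two CPUs share a PTE or its cache
line.  A sketch of the slew arithmetic; the base address is made up:

    #include <stdio.h>

    #define PAGE_SZ 4096
    #define NPTECL  8       /* PTEs per cache line */

    /* each CPU's window sits NPTECL pages further along, so the backing
     * PTEs land in different cache lines: no false sharing */
    static unsigned long
    vaslew(unsigned long base_va, int cpu)
    {
            return base_va + (unsigned long)cpu * NPTECL * PAGE_SZ;
    }

    int
    main(void)
    {
            unsigned long zerop = 0xdeadb000UL;  /* illustrative base VA */

            printf("cpu0 %#lx cpu1 %#lx\n",
                vaslew(zerop, 0), vaslew(zerop, 1));
            return 0;
    }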
 16.2540 +
 16.2541 +/*
 16.2542 + * pmap_pageidlezero: the same, for the idle loop page zero'er.
 16.2543 + * Returns TRUE if the page was zero'd, FALSE if we aborted for
 16.2544 + * some reason.
 16.2545 + */
 16.2546 +
 16.2547 +boolean_t
 16.2548 +pmap_pageidlezero(pa)
 16.2549 +	paddr_t pa;
 16.2550 +{
 16.2551 +#ifdef MULTIPROCESSOR
 16.2552 +	int id = cpu_number();
 16.2553 +#endif
 16.2554 +	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 16.2555 +	pt_entry_t *maptp;
 16.2556 +	caddr_t zerova = VASLEW(zerop, id);
 16.2557 +	boolean_t rv = TRUE;
 16.2558 +	int i, *ptr;
 16.2559 +
 16.2560 +#ifdef DIAGNOSTIC
 16.2561 +	if (PTE_GET(zpte))
 16.2562 +		panic("pmap_pageidlezero: lock botch");
 16.2563 +#endif
 16.2564 +	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
 16.2565 +	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
 16.2566 +	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
 16.2567 +	for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) {
 16.2568 +		if (sched_whichqs != 0) {
 16.2569 +
 16.2570 +			/*
 16.2571 +			 * A process has become ready.  Abort now,
 16.2572 +			 * so we don't keep it waiting while we
 16.2573 +			 * do slow memory access to finish this
 16.2574 +			 * page.
 16.2575 +			 */
 16.2576 +
 16.2577 +			rv = FALSE;
 16.2578 +			break;
 16.2579 +		}
 16.2580 +		*ptr++ = 0;
 16.2581 +	}
 16.2582 +
 16.2583 +	PTE_CLEAR(zpte, maptp);				/* zap! */
 16.2584 +	return (rv);
 16.2585 +}
 16.2586 +
 16.2587 +/*
 16.2588 + * pmap_copy_page: copy a page
 16.2589 + */
 16.2590 +
 16.2591 +void
 16.2592 +pmap_copy_page(srcpa, dstpa)
 16.2593 +	paddr_t srcpa, dstpa;
 16.2594 +{
 16.2595 +#ifdef MULTIPROCESSOR
 16.2596 +	int id = cpu_number();
 16.2597 +#endif
 16.2598 +	pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte;
 16.2599 +	pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte;
 16.2600 +	caddr_t csrcva = VASLEW(csrcp, id);
 16.2601 +	caddr_t cdstva = VASLEW(cdstp, id);
 16.2602 +
 16.2603 +#ifdef DIAGNOSTIC
 16.2604 +	if (PTE_GET(spte) || PTE_GET(dpte))
 16.2605 +		panic("pmap_copy_page: lock botch");
 16.2606 +#endif
 16.2607 +
 16.2608 +	maspte = (pt_entry_t *)vtomach((vaddr_t)spte);
 16.2609 +	madpte = (pt_entry_t *)vtomach((vaddr_t)dpte);
 16.2610 +	PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW);
 16.2611 +	PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW);
 16.2612 +	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
 16.2613 +	memcpy(cdstva, csrcva, PAGE_SIZE);
 16.2614 +	PTE_CLEAR(spte, maspte);			/* zap! */
 16.2615 +	PTE_CLEAR(dpte, madpte);			/* zap! */
 16.2616 +}
 16.2617 +
 16.2618 +/*
 16.2619 + * p m a p   r e m o v e   f u n c t i o n s
 16.2620 + *
 16.2621 + * functions that remove mappings
 16.2622 + */
 16.2623 +
 16.2624 +/*
 16.2625 + * pmap_remove_ptes: remove PTEs from a PTP
 16.2626 + *
 16.2627 + * => must have proper locking on pmap_master_lock
 16.2628 + * => caller must hold pmap's lock
 16.2629 + * => PTP must be mapped into KVA
 16.2630 + * => PTP should be null if pmap == pmap_kernel()
 16.2631 + */
 16.2632 +
 16.2633 +static void
 16.2634 +pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags)
 16.2635 +	struct pmap *pmap;
 16.2636 +	struct vm_page *ptp;
 16.2637 +	vaddr_t ptpva;
 16.2638 +	vaddr_t startva, endva;
 16.2639 +	int32_t *cpumaskp;
 16.2640 +	int flags;
 16.2641 +{
 16.2642 +	struct pv_entry *pv_tofree = NULL;	/* list of pv_entrys to free */
 16.2643 +	struct pv_entry *pve;
 16.2644 +	pt_entry_t *pte = (pt_entry_t *) ptpva;
 16.2645 +	pt_entry_t opte;
 16.2646 +	pt_entry_t *maptp;
 16.2647 +
 16.2648 +	/*
 16.2649 +	 * note that ptpva points to the PTE that maps startva.   this may
 16.2650 +	 * or may not be the first PTE in the PTP.
 16.2651 +	 *
 16.2652 +	 * we loop through the PTP while there are still PTEs to look at
 16.2653 +	 * and the wire_count is greater than 1 (because we use the wire_count
 16.2654 +	 * to keep track of the number of real PTEs in the PTP).
 16.2655 +	 */
 16.2656 +
 16.2657 +	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1);
 16.2658 +	     pte++, startva += PAGE_SIZE) {
 16.2659 +		struct vm_page *pg;
 16.2660 +		struct vm_page_md *mdpg;
 16.2661 +
 16.2662 +		if (!pmap_valid_entry(*pte))
 16.2663 +			continue;			/* VA not mapped */
 16.2664 +		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
 16.2665 +			continue;
 16.2666 +		}
 16.2667 +
 16.2668 +		/* atomically save the old PTE and zap! it */
 16.2669 +		maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
 16.2670 +		opte = pte_atomic_update(pte, maptp, 0);
 16.2671 +		pmap_exec_account(pmap, startva, opte, 0);
 16.2672 +
 16.2673 +		if (opte & PG_W)
 16.2674 +			pmap->pm_stats.wired_count--;
 16.2675 +		pmap->pm_stats.resident_count--;
 16.2676 +
 16.2677 +		if (opte & PG_U)
 16.2678 +			pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
 16.2679 +
 16.2680 +		if (ptp) {
 16.2681 +			ptp->wire_count--;		/* dropping a PTE */
 16.2682 +			/* Make sure that the PDE is flushed */
 16.2683 +			if ((ptp->wire_count <= 1) && !(opte & PG_U))
 16.2684 +				pmap_tlb_shootdown(pmap, startva, opte,
 16.2685 +				    cpumaskp);
 16.2686 +		}
 16.2687 +
 16.2688 +		/*
 16.2689 +		 * if we are not on a pv_head list we are done.
 16.2690 +		 */
 16.2691 +
 16.2692 +		if ((opte & PG_PVLIST) == 0) {
 16.2693 +#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
 16.2694 +			if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
 16.2695 +				panic("pmap_remove_ptes: managed page without "
 16.2696 +				      "PG_PVLIST for 0x%lx", startva);
 16.2697 +#endif
 16.2698 +			continue;
 16.2699 +		}
 16.2700 +
 16.2701 +		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
 16.2702 +#ifdef DIAGNOSTIC
 16.2703 +		if (pg == NULL)
 16.2704 +			panic("pmap_remove_ptes: unmanaged page marked "
 16.2705 +			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
 16.2706 +			      startva, (u_long)(opte & PG_FRAME));
 16.2707 +#endif
 16.2708 +		mdpg = &pg->mdpage;
 16.2709 +
 16.2710 +		/* sync R/M bits */
 16.2711 +		simple_lock(&mdpg->mp_pvhead.pvh_lock);
 16.2712 +		mdpg->mp_attrs |= (opte & (PG_U|PG_M));
 16.2713 +		pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
 16.2714 +		simple_unlock(&mdpg->mp_pvhead.pvh_lock);
 16.2715 +
 16.2716 +		if (pve) {
 16.2717 +			SPLAY_RIGHT(pve, pv_node) = pv_tofree;
 16.2718 +			pv_tofree = pve;
 16.2719 +		}
 16.2720 +
 16.2721 +		/* end of "for" loop: time for next pte */
 16.2722 +	}
 16.2723 +	if (pv_tofree)
 16.2724 +		pmap_free_pvs(pmap, pv_tofree);
 16.2725 +}
 16.2726 +
 16.2727 +
 16.2728 +/*
 16.2729 + * pmap_remove_pte: remove a single PTE from a PTP
 16.2730 + *
 16.2731 + * => must have proper locking on pmap_master_lock
 16.2732 + * => caller must hold pmap's lock
 16.2733 + * => PTP must be mapped into KVA
 16.2734 + * => PTP should be null if pmap == pmap_kernel()
 16.2735 + * => returns true if we removed a mapping
 16.2736 + */
 16.2737 +
 16.2738 +static boolean_t
 16.2739 +pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags)
 16.2740 +	struct pmap *pmap;
 16.2741 +	struct vm_page *ptp;
 16.2742 +	pt_entry_t *pte;
 16.2743 +	vaddr_t va;
 16.2744 +	int32_t *cpumaskp;
 16.2745 +	int flags;
 16.2746 +{
 16.2747 +	pt_entry_t opte;
 16.2748 +	pt_entry_t *maptp;
 16.2749 +	struct pv_entry *pve;
 16.2750 +	struct vm_page *pg;
 16.2751 +	struct vm_page_md *mdpg;
 16.2752 +
 16.2753 +	if (!pmap_valid_entry(*pte))
 16.2754 +		return(FALSE);		/* VA not mapped */
 16.2755 +	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
 16.2756 +		return(FALSE);
 16.2757 +	}
 16.2758 +
 16.2759 +	/* atomically save the old PTE and zap! it */
 16.2760 +	maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
 16.2761 +	opte = pte_atomic_update(pte, maptp, 0);
 16.2762 +
 16.2763 +	XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte));
 16.2764 +	pmap_exec_account(pmap, va, opte, 0);
 16.2765 +
 16.2766 +	if (opte & PG_W)
 16.2767 +		pmap->pm_stats.wired_count--;
 16.2768 +	pmap->pm_stats.resident_count--;
 16.2769 +
 16.2770 +	if (opte & PG_U)
 16.2771 +		pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
 16.2772 +
 16.2773 +	if (ptp) {
 16.2774 +		ptp->wire_count--;		/* dropping a PTE */
 16.2775 +		/* Make sure that the PDE is flushed */
 16.2776 +		if ((ptp->wire_count <= 1) && !(opte & PG_U))
 16.2777 +			pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
 16.2778 +
 16.2779 +	}
 16.2780 +	/*
 16.2781 +	 * if we are not on a pv_head list we are done.
 16.2782 +	 */
 16.2783 +
 16.2784 +	if ((opte & PG_PVLIST) == 0) {
 16.2785 +#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
 16.2786 +		if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
 16.2787 +			panic("pmap_remove_pte: managed page without "
 16.2788 +			      "PG_PVLIST for 0x%lx", va);
 16.2789 +#endif
 16.2790 +		return(TRUE);
 16.2791 +	}
 16.2792 +
 16.2793 +	pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
 16.2794 +#ifdef DIAGNOSTIC
 16.2795 +	if (pg == NULL)
 16.2796 +		panic("pmap_remove_pte: unmanaged page marked "
 16.2797 +		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
 16.2798 +		    (u_long)(opte & PG_FRAME));
 16.2799 +#endif
 16.2800 +	mdpg = &pg->mdpage;
 16.2801 +
 16.2802 +	/* sync R/M bits */
 16.2803 +	simple_lock(&mdpg->mp_pvhead.pvh_lock);
 16.2804 +	mdpg->mp_attrs |= (opte & (PG_U|PG_M));
 16.2805 +	pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va);
 16.2806 +	simple_unlock(&mdpg->mp_pvhead.pvh_lock);
 16.2807 +
 16.2808 +	if (pve)
 16.2809 +		pmap_free_pv(pmap, pve);
 16.2810 +	return(TRUE);
 16.2811 +}
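
Both remove paths read and clear a PTE in one atomic step:
pte_atomic_update() swaps in the new value and returns the old bits, so
PG_W/PG_U/PG_M are captured exactly once for the accounting that follows.
The shape of that step with a C11 exchange (the Xen machine-address plumbing
is omitted, and the PTE bit values are stand-ins):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PG_V    0x001   /* valid */
    #define PG_W    0x200   /* wired */

    static _Atomic uint32_t pte = UINT32_C(0x12345000) | PG_V | PG_W;

    int
    main(void)
    {
            /* swap in the invalid PTE and keep the old one: no window
             * where another CPU sees a half-removed mapping, and the old
             * referenced/modified bits survive for the pv bookkeeping */
            uint32_t opte = atomic_exchange(&pte, 0);

            printf("old pte %#x, wired=%d\n",
                (unsigned)opte, (opte & PG_W) != 0);
            return 0;
    }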
 16.2812 +
 16.2813 +/*
 16.2814 + * pmap_remove: top level mapping removal function
 16.2815 + *
 16.2816 + * => caller should not be holding any pmap locks
 16.2817 + */
 16.2818 +
 16.2819 +void
 16.2820 +pmap_remove(pmap, sva, eva)
 16.2821 +	struct pmap *pmap;
 16.2822 +	vaddr_t sva, eva;
 16.2823 +{
 16.2824 +	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
 16.2825 +}
 16.2826 +
 16.2827 +/*
 16.2828 + * pmap_do_remove: mapping removal guts
 16.2829 + *
 16.2830 + * => caller should not be holding any pmap locks
 16.2831 + */
 16.2832 +
 16.2833 +static void
 16.2834 +pmap_do_remove(pmap, sva, eva, flags)
 16.2835 +	struct pmap *pmap;
 16.2836 +	vaddr_t sva, eva;
 16.2837 +	int flags;
 16.2838 +{
 16.2839 +	pt_entry_t *ptes, opte;
 16.2840 +	pt_entry_t *maptp;
 16.2841 +	boolean_t result;
 16.2842 +	paddr_t ptppa;
 16.2843 +	vaddr_t blkendva;
 16.2844 +	struct vm_page *ptp;
 16.2845 +	int32_t cpumask = 0;
 16.2846 +	TAILQ_HEAD(, vm_page) empty_ptps;
 16.2847 +	struct cpu_info *ci;
 16.2848 +	struct pmap *curpmap;
 16.2849 +
 16.2850 +	/*
 16.2851 +	 * we lock in the pmap => pv_head direction
 16.2852 +	 */
 16.2853 +
 16.2854 +	TAILQ_INIT(&empty_ptps);
 16.2855 +
 16.2856 +	PMAP_MAP_TO_HEAD_LOCK();
 16.2857 +
 16.2858 +	ptes = pmap_map_ptes(pmap);	/* locks pmap */
 16.2859 +
 16.2860 +	ci = curcpu();
 16.2861 +	curpmap = ci->ci_pmap;
 16.2862 +
 16.2863 +	/*
 16.2864 +	 * removing one page?  take shortcut function.
 16.2865 +	 */
 16.2866 +
 16.2867 +	if (sva + PAGE_SIZE == eva) {
 16.2868 +		if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) {
 16.2869 +
 16.2870 +			/* PA of the PTP */
 16.2871 +			ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME;
 16.2872 +
 16.2873 +			/* get PTP if non-kernel mapping */
 16.2874 +			if (pmap == pmap_kernel()) {
 16.2875 +				/* we never free kernel PTPs */
 16.2876 +				ptp = NULL;
 16.2877 +			} else {
 16.2878 +				if (pmap->pm_ptphint &&
 16.2879 +				    VM_PAGE_TO_PHYS(pmap->pm_ptphint) ==
 16.2880 +				    ptppa) {
 16.2881 +					ptp = pmap->pm_ptphint;
 16.2882 +				} else {
 16.2883 +					ptp = PHYS_TO_VM_PAGE(ptppa);
 16.2884 +#ifdef DIAGNOSTIC
 16.2885 +					if (ptp == NULL)
 16.2886 +						panic("pmap_remove: unmanaged "
 16.2887 +						      "PTP detected");
 16.2888 +#endif
 16.2889 +				}
 16.2890 +			}
 16.2891 +
 16.2892 +			/* do it! */
 16.2893 +			result = pmap_remove_pte(pmap, ptp,
 16.2894 +			    &ptes[x86_btop(sva)], sva, &cpumask, flags);
 16.2895 +
 16.2896 +			/*
 16.2897 +			 * if mapping removed and the PTP is no longer
 16.2898 +			 * being used, free it!
 16.2899 +			 */
 16.2900 +
 16.2901 +			if (result && ptp && ptp->wire_count <= 1) {
 16.2902 +				/* zap! */
 16.2903 +				maptp = (pt_entry_t *)vtomach(
 16.2904 +					(vaddr_t)&pmap->pm_pdir[pdei(sva)]);
 16.2905 +				PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
 16.2906 +				    maptp, opte);
 16.2907 +#if defined(MULTIPROCESSOR)
 16.2908 +				/*
 16.2909 +				 * XXXthorpej Redundant shootdown can happen
 16.2910 +				 * here if we're using APTE space.
 16.2911 +				 */
 16.2912 +#endif
 16.2913 +				pmap_tlb_shootdown(curpmap,
 16.2914 +				    ((vaddr_t)ptes) + ptp->offset, opte,
 16.2915 +				    &cpumask);
 16.2916 +#if defined(MULTIPROCESSOR)
 16.2917 +				/*
 16.2918 +				 * Always shoot down the pmap's self-mapping
 16.2919 +				 * of the PTP.
 16.2920 +				 * XXXthorpej Redundant shootdown can happen
 16.2921 +				 * here if pmap == curpmap (not APTE space).
 16.2922 +				 */
 16.2923 +				pmap_tlb_shootdown(pmap,
 16.2924 +				    ((vaddr_t)PTE_BASE) + ptp->offset, opte,
 16.2925 +				    &cpumask);
 16.2926 +#endif
 16.2927 +				pmap->pm_stats.resident_count--;
 16.2928 +				if (pmap->pm_ptphint == ptp)
 16.2929 +					pmap->pm_ptphint =
 16.2930 +					    TAILQ_FIRST(&pmap->pm_obj.memq);
 16.2931 +				ptp->wire_count = 0;
 16.2932 +				ptp->flags |= PG_ZERO;
 16.2933 +				uvm_pagerealloc(ptp, NULL, 0);
 16.2934 +				TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
 16.2935 +			}
 16.2936 +		}
 16.2937 +		pmap_tlb_shootnow(cpumask);
 16.2938 +		pmap_unmap_ptes(pmap);		/* unlock pmap */
 16.2939 +		PMAP_MAP_TO_HEAD_UNLOCK();
 16.2940 +		/* Now we can free unused ptps */
 16.2941 +		TAILQ_FOREACH(ptp, &empty_ptps, listq)
 16.2942 +			uvm_pagefree(ptp);
 16.2943 +		return;
 16.2944 +	}
 16.2945 +
 16.2946 +	cpumask = 0;
 16.2947 +
 16.2948 +	for (/* null */ ; sva < eva ; sva = blkendva) {
 16.2949 +
 16.2950 +		/* determine range of block */
 16.2951 +		blkendva = x86_round_pdr(sva+1);
 16.2952 +		if (blkendva > eva)
 16.2953 +			blkendva = eva;
 16.2954 +
 16.2955 +		/*
 16.2956 +		 * XXXCDC: our PTE mappings should never be removed
 16.2957 +		 * with pmap_remove!  if we allow this (and why would
 16.2958 +		 * we?) then we end up freeing the pmap's page
 16.2959 +		 * directory page (PDP) before we are finished using
 16.2960 +		 * it when we hit it in the recursive mapping.  this
 16.2961 +		 * is BAD.
 16.2962 +		 *
 16.2963 +		 * the long-term solution is to move the PTEs out of
 16.2964 +		 * user address space and into kernel address space
 16.2965 +		 * (up with APTE); then we can set VM_MAXUSER_ADDRESS
 16.2966 +		 * to be VM_MAX_ADDRESS.
 16.2967 +		 */
 16.2968 +
 16.2969 +		if (pdei(sva) == PDSLOT_PTE)
 16.2970 +			/* XXXCDC: ugly hack to avoid freeing PDP here */
 16.2971 +			continue;
 16.2972 +
 16.2973 +		/* empty block? */
 16.2974 +		if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
 16.2975 +			continue;
 16.2976 +
 16.2977 +		/* PA of the PTP */
 16.2978 +		ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME);
 16.2979 +
 16.2980 +		/* get PTP if non-kernel mapping */
 16.2981 +		if (pmap == pmap_kernel()) {
 16.2982 +			/* we never free kernel PTPs */
 16.2983 +			ptp = NULL;
 16.2984 +		} else {
 16.2985 +			if (pmap->pm_ptphint &&
 16.2986 +			    VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
 16.2987 +				ptp = pmap->pm_ptphint;
 16.2988 +			} else {
 16.2989 +				ptp = PHYS_TO_VM_PAGE(ptppa);
 16.2990 +#ifdef DIAGNOSTIC
 16.2991 +				if (ptp == NULL)
 16.2992 +					panic("pmap_remove: unmanaged PTP "
 16.2993 +					      "detected");
 16.2994 +#endif
 16.2995 +			}
 16.2996 +		}
 16.2997 +		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)],
 16.2998 +		    sva, blkendva, &cpumask, flags);
 16.2999 +
 16.3000 +		/* if PTP is no longer being used, free it! */
 16.3001 +		if (ptp && ptp->wire_count <= 1) {
 16.3002 +			/* zap! */
 16.3003 +			maptp = (pt_entry_t *)vtomach(
 16.3004 +				(vaddr_t)&pmap->pm_pdir[pdei(sva)]);
 16.3005 +			PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
 16.3006 +			    maptp, opte);
 16.3007 +#if defined(MULTIPROCESSOR)
 16.3008 +			/*
 16.3009 +			 * XXXthorpej Redundant shootdown can happen here
 16.3010 +			 * if we're using APTE space.
 16.3011 +			 */
 16.3012 +#endif
 16.3013 +			pmap_tlb_shootdown(curpmap,
 16.3014 +			    ((vaddr_t)ptes) + ptp->offset, opte, &cpumask);
 16.3015 +#if defined(MULTIPROCESSOR)
 16.3016 +			/*
 16.3017 +			 * Always shoot down the pmap's self-mapping
 16.3018 +			 * of the PTP.
 16.3019 +			 * XXXthorpej Redundant shootdown can happen here
 16.3020 +			 * if pmap == curpmap (not APTE space).
 16.3021 +			 */
 16.3022 +			pmap_tlb_shootdown(pmap,
 16.3023 +			    ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask);
 16.3024 +#endif
 16.3025 +			pmap->pm_stats.resident_count--;
 16.3026 +			if (pmap->pm_ptphint == ptp)	/* update hint? */
 16.3027 +				pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first;
 16.3028 +			ptp->wire_count = 0;
 16.3029 +			ptp->flags |= PG_ZERO;
 16.3030 +			/* Postpone free to shootdown */
 16.3031 +			uvm_pagerealloc(ptp, NULL, 0);
 16.3032 +			TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
 16.3033 +		}
 16.3034 +	}
 16.3035 +
 16.3036 +	pmap_tlb_shootnow(cpumask);
 16.3037 +	pmap_unmap_ptes(pmap);
 16.3038 +	PMAP_MAP_TO_HEAD_UNLOCK();
 16.3039 +	/* Now we can free unused ptps */
 16.3040 +	TAILQ_FOREACH(ptp, &empty_ptps, listq)
 16.3041 +		uvm_pagefree(ptp);
 16.3042 +}
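
Both paths of pmap_do_remove lean on the same index arithmetic: pdei()
selects the page-directory slot, x86_btop() the linear PTE index, and the
shortcut fires only when the range covers exactly one page.  A hedged
sketch of that arithmetic for the classic non-PAE i386 layout (4 KB pages,
4 MB per PDE); the constants are re-derived here, not taken from the
kernel headers:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1u << PAGE_SHIFT)
#define PDSHIFT		22		/* one PDE maps 4 MB */

static uint32_t pdei(uint32_t va)     { return va >> PDSHIFT; }
static uint32_t x86_btop(uint32_t va) { return va >> PAGE_SHIFT; }

int
main(void)
{
	uint32_t sva = 0x0804a000u, eva = sva + PAGE_SIZE;

	/* a single-page removal takes the shortcut path */
	printf("one page?  %s\n", sva + PAGE_SIZE == eva ? "yes" : "no");
	printf("PD slot %u, PTE index %u\n", pdei(sva), x86_btop(sva));
	return 0;
}
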
 16.3043 +
 16.3044 +/*
 16.3045 + * pmap_page_remove: remove a managed vm_page from all pmaps that map it
 16.3046 + *
 16.3047 + * => we set pv_head => pmap locking
 16.3048 + * => R/M bits are sync'd back to attrs
 16.3049 + */
 16.3050 +
 16.3051 +void
 16.3052 +pmap_page_remove(pg)
 16.3053 +	struct vm_page *pg;
 16.3054 +{
 16.3055 +	struct pv_head *pvh;
 16.3056 +	struct pv_entry *pve, *npve, *killlist = NULL;
 16.3057 +	pt_entry_t *ptes, opte;
 16.3058 +	pt_entry_t *maptp;
 16.3059 +	int32_t cpumask = 0;
 16.3060 +	TAILQ_HEAD(, vm_page) empty_ptps;
 16.3061 +	struct vm_page *ptp;
 16.3062 +	struct cpu_info *ci;
 16.3063 +	struct pmap *curpmap;
 16.3064 +
 16.3065 +#ifdef DIAGNOSTIC
 16.3066 +	int bank, off;
 16.3067 +
 16.3068 +	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
 16.3069 +	if (bank == -1)
 16.3070 +		panic("pmap_page_remove: unmanaged page?");
 16.3071 +#endif
 16.3072 +
 16.3073 +	pvh = &pg->mdpage.mp_pvhead;
 16.3074 +	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
 16.3075 +		return;
 16.3076 +	}
 16.3077 +
 16.3078 +	TAILQ_INIT(&empty_ptps);
 16.3079 +
 16.3080 +	/* set pv_head => pmap locking */
 16.3081 +	PMAP_HEAD_TO_MAP_LOCK();
 16.3082 +
 16.3083 +	ci = curcpu();
 16.3084 +	curpmap = ci->ci_pmap;
 16.3085 +
 16.3086 +	/* XXX: needed if we hold head->map lock? */
 16.3087 +	simple_lock(&pvh->pvh_lock);
 16.3088 +
 16.3089 +	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
 16.3090 +		npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
 16.3091 +		ptes = pmap_map_ptes(pve->pv_pmap);		/* locks pmap */
 16.3092 +
 16.3093 +#ifdef DIAGNOSTIC
 16.3094 +		if (pve->pv_ptp &&
 16.3095 +		    (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) &
 16.3096 +			PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
 16.3097 +			printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
 16.3098 +			    pg, pve->pv_va, pve->pv_ptp);
 16.3099 +			printf("pmap_page_remove: PTP's phys addr: "
 16.3100 +			    "actual=%lx, recorded=%lx\n",
 16.3101 +			    (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])
 16.3102 +				& PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
 16.3103 +			panic("pmap_page_remove: mapped managed page has "
 16.3104 +			    "invalid pv_ptp field");
 16.3105 +		}
 16.3106 +#endif
 16.3107 +
 16.3108 +		/* atomically save the old PTE and zap! it */
 16.3109 +		maptp = (pt_entry_t *)vtomach(
 16.3110 +			(vaddr_t)&ptes[x86_btop(pve->pv_va)]);
 16.3111 +		opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)],
 16.3112 +		    maptp, 0);
 16.3113 +
 16.3114 +		if (opte & PG_W)
 16.3115 +			pve->pv_pmap->pm_stats.wired_count--;
 16.3116 +		pve->pv_pmap->pm_stats.resident_count--;
 16.3117 +
 16.3118 +		/* Shootdown only if referenced */
 16.3119 +		if (opte & PG_U)
 16.3120 +			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
 16.3121 +			    &cpumask);
 16.3122 +
 16.3123 +		/* sync R/M bits */
 16.3124 +		pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M));
 16.3125 +
 16.3126 +		/* update the PTP reference count.  free if last reference. */
 16.3127 +		if (pve->pv_ptp) {
 16.3128 +			pve->pv_ptp->wire_count--;
 16.3129 +			if (pve->pv_ptp->wire_count <= 1) {
 16.3130 +				/*
 16.3131 +				 * Do we have to shoot down the page just to
 16.3132 +				 * get the pte out of the TLB?
 16.3133 +				 */
 16.3134 +				if (!(opte & PG_U))
 16.3135 +					pmap_tlb_shootdown(pve->pv_pmap,
 16.3136 +					    pve->pv_va, opte, &cpumask);
 16.3137 +
 16.3138 +				/* zap! */
 16.3139 +				maptp = (pt_entry_t *)vtomach((vaddr_t)
 16.3140 +				    &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]);
 16.3141 +				PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir
 16.3142 +				    [pdei(pve->pv_va)], maptp, opte);
 16.3143 +				pmap_tlb_shootdown(curpmap,
 16.3144 +				    ((vaddr_t)ptes) + pve->pv_ptp->offset,
 16.3145 +				    opte, &cpumask);
 16.3146 +#if defined(MULTIPROCESSOR)
 16.3147 +				/*
 16.3148 +				 * Always shoot down the other pmap's
 16.3149 +				 * self-mapping of the PTP.
 16.3150 +				 */
 16.3151 +				pmap_tlb_shootdown(pve->pv_pmap,
 16.3152 +				    ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset,
 16.3153 +				    opte, &cpumask);
 16.3154 +#endif
 16.3155 +				pve->pv_pmap->pm_stats.resident_count--;
 16.3156 +				/* update hint? */
 16.3157 +				if (pve->pv_pmap->pm_ptphint == pve->pv_ptp)
 16.3158 +					pve->pv_pmap->pm_ptphint =
 16.3159 +					    pve->pv_pmap->pm_obj.memq.tqh_first;
 16.3160 +				pve->pv_ptp->wire_count = 0;
 16.3161 +				pve->pv_ptp->flags |= PG_ZERO;
 16.3162 +				/* Free only after the shootdown */
 16.3163 +				uvm_pagerealloc(pve->pv_ptp, NULL, 0);
 16.3164 +				TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp,
 16.3165 +				    listq);
 16.3166 +			}
 16.3167 +		}
 16.3168 +		pmap_unmap_ptes(pve->pv_pmap);		/* unlocks pmap */
 16.3169 +		SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
 16.3170 +		SPLAY_RIGHT(pve, pv_node) = killlist;	/* mark it for death */
 16.3171 +		killlist = pve;
 16.3172 +	}
 16.3173 +	pmap_free_pvs(NULL, killlist);
 16.3174 +	simple_unlock(&pvh->pvh_lock);
 16.3175 +	PMAP_HEAD_TO_MAP_UNLOCK();
 16.3176 +	pmap_tlb_shootnow(cpumask);
 16.3177 +
 16.3178 +	/* Now we can free unused ptps */
 16.3179 +	TAILQ_FOREACH(ptp, &empty_ptps, listq)
 16.3180 +		uvm_pagefree(ptp);
 16.3181 +}
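
The loop above captures npve before unlinking pve, so removing the current
entry never invalidates the iterator, and dead entries are chained onto a
kill list by reusing a link field (the real code reuses SPLAY_RIGHT()).  A
stand-alone sketch of the same pattern, with a plain singly linked list
standing in for the splay tree:

#include <stdio.h>
#include <stdlib.h>

struct pv { unsigned va; struct pv *next; };

int
main(void)
{
	struct pv *head = NULL, *pve, *npve, *kill = NULL;
	unsigned i;

	for (i = 0; i < 3; i++) {
		pve = malloc(sizeof(*pve));
		pve->va = i * 0x1000;
		pve->next = head;
		head = pve;
	}
	for (pve = head; pve != NULL; pve = npve) {
		npve = pve->next;	/* grab the successor first */
		pve->next = kill;	/* mark it for death */
		kill = pve;
	}
	for (pve = kill; pve != NULL; pve = npve) {
		npve = pve->next;
		printf("freeing va %#x\n", pve->va);
		free(pve);
	}
	return 0;
}
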
 16.3182 +
 16.3183 +/*
 16.3184 + * p m a p   a t t r i b u t e   f u n c t i o n s
 16.3185 + * functions that test/change managed page's attributes
 16.3186 + * since a page can be mapped multiple times we must check each PTE that
 16.3187 + * maps it by going down the pv lists.
 16.3188 + */
 16.3189 +
 16.3190 +/*
 16.3191 + * pmap_test_attrs: test a page's attributes
 16.3192 + *
 16.3193 + * => we set pv_head => pmap locking
 16.3194 + */
 16.3195 +
 16.3196 +boolean_t
 16.3197 +pmap_test_attrs(pg, testbits)
 16.3198 +	struct vm_page *pg;
 16.3199 +	int testbits;
 16.3200 +{
 16.3201 +	struct vm_page_md *mdpg;
 16.3202 +	int *myattrs;
 16.3203 +	struct pv_head *pvh;
 16.3204 +	struct pv_entry *pve;
 16.3205 +	volatile pt_entry_t *ptes;
 16.3206 +	pt_entry_t pte;
 16.3207 +
 16.3208 +#ifdef DIAGNOSTIC
 16.3209 +	int bank, off;
 16.3210 +
 16.3211 +	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
 16.3212 +	if (bank == -1)
 16.3213 +		panic("pmap_test_attrs: unmanaged page?");
 16.3214 +#endif
 16.3215 +	mdpg = &pg->mdpage;
 16.3216 +
 16.3217 +	/*
 16.3218 +	 * before locking: see if attributes are already set and if so,
 16.3219 +	 * return!
 16.3220 +	 */
 16.3221 +
 16.3222 +	myattrs = &mdpg->mp_attrs;
 16.3223 +	if (*myattrs & testbits)
 16.3224 +		return(TRUE);
 16.3225 +
 16.3226 +	/* test to see if there is a list before bothering to lock */
 16.3227 +	pvh = &mdpg->mp_pvhead;
 16.3228 +	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
 16.3229 +		return(FALSE);
 16.3230 +	}
 16.3231 +
 16.3232 +	/* nope, gonna have to do it the hard way */
 16.3233 +	PMAP_HEAD_TO_MAP_LOCK();
 16.3234 +	/* XXX: needed if we hold head->map lock? */
 16.3235 +	simple_lock(&pvh->pvh_lock);
 16.3236 +
 16.3237 +	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
 16.3238 +	     pve != NULL && (*myattrs & testbits) == 0;
 16.3239 +	     pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
 16.3240 +		ptes = pmap_map_ptes(pve->pv_pmap);
 16.3241 +		pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */
 16.3242 +		pmap_unmap_ptes(pve->pv_pmap);
 16.3243 +		*myattrs |= pte;
 16.3244 +	}
 16.3245 +
 16.3246 +	/*
 16.3247 +	 * note that we will exit the for loop with a non-null pve if
 16.3248 +	 * we have found the bits we are testing for.
 16.3249 +	 */
 16.3250 +
 16.3251 +	simple_unlock(&pvh->pvh_lock);
 16.3252 +	PMAP_HEAD_TO_MAP_UNLOCK();
 16.3253 +	return((*myattrs & testbits) != 0);
 16.3254 +}
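
pmap_test_attrs tries two cheap exits before mapping any page tables: a
hit in the cached attribute word answers immediately, and an empty pv list
means no mapping can have set the bits.  A sketch of that ordering, with
the attribute word and the pv list modelled as plain values:

#include <stdio.h>
#include <stdbool.h>

#define PG_U 0x020
#define PG_M 0x040

static bool
test_attrs(int attrs, bool have_mappings, int testbits)
{
	if (attrs & testbits)		/* already known to be set */
		return true;
	if (!have_mappings)		/* nothing can have set them */
		return false;
	/* ... only now walk each mapping's PTE, as the loop above does */
	return false;
}

int
main(void)
{
	printf("%d\n", test_attrs(PG_M, true, PG_M));	/* 1: cached hit */
	printf("%d\n", test_attrs(0, false, PG_U));	/* 0: no mappings */
	return 0;
}
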
 16.3255 +
 16.3256 +/*
 16.3257 + * pmap_clear_attrs: clear the specified attribute for a page.
 16.3258 + *
 16.3259 + * => we set pv_head => pmap locking
 16.3260 + * => we return TRUE if we cleared one of the bits we were asked to
 16.3261 + */
 16.3262 +
 16.3263 +boolean_t
 16.3264 +pmap_clear_attrs(pg, clearbits)
 16.3265 +	struct vm_page *pg;
 16.3266 +	int clearbits;
 16.3267 +{
 16.3268 +	struct vm_page_md *mdpg;
 16.3269 +	u_int32_t result;
 16.3270 +	struct pv_head *pvh;
 16.3271 +	struct pv_entry *pve;
 16.3272 +	pt_entry_t *ptes, opte;
 16.3273 +	pt_entry_t *maptp;
 16.3274 +	int *myattrs;
 16.3275 +	int32_t cpumask = 0;
 16.3276 +
 16.3277 +#ifdef DIAGNOSTIC
 16.3278 +	int bank, off;
 16.3279 +
 16.3280 +	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
 16.3281 +	if (bank == -1)
 16.3282 +		panic("pmap_clear_attrs: unmanaged page?");
 16.3283 +#endif
 16.3284 +	mdpg = &pg->mdpage;
 16.3285 +
 16.3286 +	PMAP_HEAD_TO_MAP_LOCK();
 16.3287 +	pvh = &mdpg->mp_pvhead;
 16.3288 +	/* XXX: needed if we hold head->map lock? */
 16.3289 +	simple_lock(&pvh->pvh_lock);
 16.3290 +
 16.3291 +	myattrs = &mdpg->mp_attrs;
 16.3292 +	result = *myattrs & clearbits;
 16.3293 +	*myattrs &= ~clearbits;
 16.3294 +
 16.3295 +	SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
 16.3296 +#ifdef DIAGNOSTIC
 16.3297 +		if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]))
 16.3298 +			panic("pmap_clear_attrs: mapping without PTP "
 16.3299 +			      "detected");
 16.3300 +#endif
 16.3301 +
 16.3302 +		ptes = pmap_map_ptes(pve->pv_pmap);	/* locks pmap */
 16.3303 +		opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
 16.3304 +		if (opte & clearbits) {
 16.3305 +			/* We need to do something */
 16.3306 +			if (clearbits == PG_RW) {
 16.3307 +				result |= PG_RW;
 16.3308 +
 16.3309 +				/*
 16.3310 +				 * On write protect we might not need to flush
 16.3311 +				 * the TLB
 16.3312 +				 */
 16.3313 +
 16.3314 +				/* First zap the RW bit! */
 16.3315 +				maptp = (pt_entry_t *)vtomach(
 16.3316 +					(vaddr_t)&ptes[x86_btop(pve->pv_va)]);
 16.3317 +				PTE_ATOMIC_CLEARBITS(
 16.3318 +					&ptes[x86_btop(pve->pv_va)],
 16.3319 +					maptp, PG_RW);
 16.3320 +				opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
 16.3321 +
 16.3322 +				/*
 16.3323 +				 * Then check whether the TLB may still cache it as RW
 16.3324 +				 */
 16.3325 +				if (!(opte & PG_M))
 16.3326 +					goto no_tlb_shootdown;
 16.3327 +			}
 16.3328 +
 16.3329 +			/*
 16.3330 +			 * Since we need a shootdown we might as well
 16.3331 +			 * always clear PG_U and PG_M.
 16.3332 +			 */
 16.3333 +
 16.3334 +			/* zap! */
 16.3335 +			maptp = (pt_entry_t *)vtomach(
 16.3336 +				(vaddr_t)&ptes[x86_btop(pve->pv_va)]);
 16.3337 +			PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp,
 16.3338 +			    (opte & ~(PG_U | PG_M)), opte);
 16.3339 +
 16.3340 +			result |= (opte & clearbits);
 16.3341 +			*myattrs |= (opte & ~(clearbits));
 16.3342 +
 16.3343 +			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
 16.3344 +					   &cpumask);
 16.3345 +		}
 16.3346 +no_tlb_shootdown:
 16.3347 +		pmap_unmap_ptes(pve->pv_pmap);		/* unlocks pmap */
 16.3348 +	}
 16.3349 +
 16.3350 +	simple_unlock(&pvh->pvh_lock);
 16.3351 +	PMAP_HEAD_TO_MAP_UNLOCK();
 16.3352 +
 16.3353 +	pmap_tlb_shootnow(cpumask);
 16.3354 +	return(result != 0);
 16.3355 +}
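
The PG_RW branch above relies on a hardware invariant: a TLB can hold a
writable cached copy of an entry only if the CPU set PG_M on the first
write, so a never-dirtied entry can be write-protected with no shootdown.
A user-space sketch of that decision, assuming the i386 bit values; the
shootdown itself is reduced to a printf:

#include <stdio.h>
#include <stdint.h>

#define PG_RW 0x002u
#define PG_M  0x040u

static void
clear_rw(uint32_t *pte)
{
	uint32_t opte = *pte;

	*pte &= ~PG_RW;			/* zap the RW bit */
	if (opte & PG_M)
		printf("shootdown: entry was dirtied, may be cached RW\n");
	else
		printf("no shootdown: never written, never cached RW\n");
}

int
main(void)
{
	uint32_t clean = 0x1000u | PG_RW;
	uint32_t dirty = 0x2000u | PG_RW | PG_M;

	clear_rw(&clean);
	clear_rw(&dirty);
	return 0;
}
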
 16.3356 +
 16.3357 +
 16.3358 +/*
 16.3359 + * p m a p   p r o t e c t i o n   f u n c t i o n s
 16.3360 + */
 16.3361 +
 16.3362 +/*
 16.3363 + * pmap_page_protect: change the protection of all recorded mappings
 16.3364 + *	of a managed page
 16.3365 + *
 16.3366 + * => NOTE: this is an inline function in pmap.h
 16.3367 + */
 16.3368 +
 16.3369 +/* see pmap.h */
 16.3370 +
 16.3371 +/*
 16.3372 + * pmap_protect: set the protection of the pages in a pmap
 16.3373 + *
 16.3374 + * => NOTE: this is an inline function in pmap.h
 16.3375 + */
 16.3376 +
 16.3377 +/* see pmap.h */
 16.3378 +
 16.3379 +/*
 16.3380 + * pmap_write_protect: write-protect pages in a pmap
 16.3381 + */
 16.3382 +
 16.3383 +void
 16.3384 +pmap_write_protect(pmap, sva, eva, prot)
 16.3385 +	struct pmap *pmap;
 16.3386 +	vaddr_t sva, eva;
 16.3387 +	vm_prot_t prot;
 16.3388 +{
 16.3389 +	pt_entry_t *ptes, *epte;
 16.3390 +	pt_entry_t *maptp;
 16.3391 +#ifndef XEN
 16.3392 +	volatile
 16.3393 +#endif
 16.3394 +		pt_entry_t *spte;
 16.3395 +	vaddr_t blockend;
 16.3396 +	int32_t cpumask = 0;
 16.3397 +
 16.3398 +	ptes = pmap_map_ptes(pmap);		/* locks pmap */
 16.3399 +
 16.3400 +	/* should be ok, but just in case ... */
 16.3401 +	sva &= PG_FRAME;
 16.3402 +	eva &= PG_FRAME;
 16.3403 +
 16.3404 +	for (/* null */ ; sva < eva ; sva = blockend) {
 16.3405 +
 16.3406 +		blockend = (sva & PD_MASK) + NBPD;
 16.3407 +		if (blockend > eva)
 16.3408 +			blockend = eva;
 16.3409 +
 16.3410 +		/*
 16.3411 +		 * XXXCDC: our PTE mappings should never be write-protected!
 16.3412 +		 *
 16.3413 +		 * long term solution is to move the PTEs out of user
 16.3414 +		 * address space.  and into kernel address space (up
 16.3415 +		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
 16.3416 +		 * be VM_MAX_ADDRESS.
 16.3417 +		 */
 16.3418 +
 16.3419 +		/* XXXCDC: ugly hack to avoid freeing PDP here */
 16.3420 +		if (pdei(sva) == PDSLOT_PTE)
 16.3421 +			continue;
 16.3422 +
 16.3423 +		/* empty block? */
 16.3424 +		if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
 16.3425 +			continue;
 16.3426 +
 16.3427 +#ifdef DIAGNOSTIC
 16.3428 +		if (sva >= VM_MAXUSER_ADDRESS &&
 16.3429 +		    sva < VM_MAX_ADDRESS)
 16.3430 +			panic("pmap_write_protect: PTE space");
 16.3431 +#endif
 16.3432 +
 16.3433 +		spte = &ptes[x86_btop(sva)];
 16.3434 +		epte = &ptes[x86_btop(blockend)];
 16.3435 +
 16.3436 +		for (/* null */ ; spte < epte ; spte++) {
 16.3437 +			if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) {
 16.3438 +				maptp = (pt_entry_t *)vtomach((vaddr_t)spte);
 16.3439 +				PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW);
 16.3440 +				if (PTE_GET(spte) & PG_M)
 16.3441 +					pmap_tlb_shootdown(pmap,
 16.3442 +					    x86_ptob(spte - ptes),
 16.3443 +					    PTE_GET(spte), &cpumask);
 16.3444 +			}
 16.3445 +		}
 16.3446 +	}
 16.3447 +
 16.3448 +	/*
 16.3449 +	 * if we write-protected any cached mappings, flush them from the TLB
 16.3450 +	 */
 16.3451 +
 16.3452 +	pmap_tlb_shootnow(cpumask);
 16.3453 +	pmap_unmap_ptes(pmap);		/* unlocks pmap */
 16.3454 +}
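
pmap_write_protect advances in page-directory-sized blocks: each pass
computes blockend as the start of the next 4 MB block and clamps it to
eva.  A sketch of that walk, with the non-PAE constants re-derived
locally rather than taken from the headers:

#include <stdio.h>
#include <stdint.h>

#define NBPD	(1u << 22)		/* bytes mapped by one PDE: 4 MB */
#define PD_MASK	(~(NBPD - 1))		/* round down to the block start */

int
main(void)
{
	uint32_t sva = 0x003ff000u, eva = 0x00802000u, blockend;

	for (/* null */ ; sva < eva ; sva = blockend) {
		blockend = (sva & PD_MASK) + NBPD;
		if (blockend > eva)
			blockend = eva;
		printf("protect [%#010x, %#010x)\n", sva, blockend);
	}
	return 0;
}
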
 16.3455 +
 16.3456 +/*
 16.3457 + * end of protection functions
 16.3458 + */
 16.3459 +
 16.3460 +/*
 16.3461 + * pmap_unwire: clear the wired bit in the PTE
 16.3462 + *
 16.3463 + * => mapping should already be in map
 16.3464 + */
 16.3465 +
 16.3466 +void
 16.3467 +pmap_unwire(pmap, va)
 16.3468 +	struct pmap *pmap;
 16.3469 +	vaddr_t va;
 16.3470 +{
 16.3471 +	pt_entry_t *ptes;
 16.3472 +	pt_entry_t *maptp;