ia64/xen-unstable

changeset 2444:7432c2c8b98b

bitkeeper revision 1.1159.72.2 (413cb4b0nYQ7KFQbxIn6g-4lsRAgbQ)

Add sparse tree for NetBSD.
author cl349@labyrinth.cl.cam.ac.uk
date Mon Sep 06 19:04:16 2004 +0000 (2004-09-06)
parents e1636300e803
children 60e68411ab8a
files .rootkeys netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c netbsd-2.0-xen-sparse/sys/nfs/files.nfs
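The sparse tree carries only the Xen-specific files listed above; it is intended to be overlaid onto a pristine NetBSD 2.0 source tree rather than replacing it. A minimal sketch of that overlay step, assuming the mkbuildtree script already present in the sparse tree takes the path of an unpacked NetBSD 2.0 tree as its argument (the exact invocation is defined by the script itself):

    # Hypothetical overlay of the Xen sparse files onto a pristine tree.
    cd netbsd-2.0-xen-sparse
    ./mkbuildtree /path/to/netbsd-2.0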
line diff
     1.1 --- a/.rootkeys	Mon Sep 06 18:52:41 2004 +0000
     1.2 +++ b/.rootkeys	Mon Sep 06 19:04:16 2004 +0000
     1.3 @@ -263,6 +263,33 @@ 413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.
     1.4  413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
     1.5  413cb1e5kY_Zil7-b0kI6hvCIxBEYg netbsd-2.0-xen-sparse/nbconfig-xen
     1.6  413cb1e5-58q5doPifcE1Q8ZAgm-JQ netbsd-2.0-xen-sparse/nbmake-xen
     1.7 +413cb3b3Cmp02Gj87f3wwu2W9y0gBg netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
     1.8 +413cb3b3aUP9GmUWqHWQ2SRp1qXnqQ netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
     1.9 +413cb3b3pZuLKElEpQwX1C-3hLW4qA netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
    1.10 +413cb3b34ui1cCGaSqIeLiBgMp-PDw netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
    1.11 +413cb3b3i11i2GVGn0YGlRbM3ifbPQ netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
    1.12 +413cb3b3FgMboWw-Pm3XdbBFSlZl_g netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
    1.13 +413cb3b4ABCSfkHRmbsWfnZNG28nBA netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
    1.14 +413cb3b4bvVJ7UlliMSH60J4uIb9kA netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
    1.15 +413cb3b4aKd9SUY-OzUiTF0Gb9ve9w netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
    1.16 +413cb3b4jUtWl-sP493PvB27o-Iltw netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
    1.17 +413cb3b4ElwwoJEmmzflV0HgK5Qxcg netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
    1.18 +413cb3b4k9OVRCxuSdhKt-2baTp_Yg netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
    1.19 +413cb3b4bRsqiHQLTKEZk4-zOksf8A netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
    1.20 +413cb3b4OqY83qI8GztIZGADpvrpSw netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
    1.21 +413cb3b42GG0LffraTnpZKlSUq57wg netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
    1.22 +413cb3b4F0ArkWVBRyspkw7ivfXihg netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
    1.23 +413cb3b4ullQud70n4JClwoEEUBh8Q netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
    1.24 +413cb3b4y1Ffq8BOhbdSpn-fGmKuEg netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
    1.25 +413cb3b4uXOFcT56QuLt1fcDrB-4Zg netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
    1.26 +413cb3b4hIffjrKn3zhVqJmH6ueB3Q netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
    1.27 +413cb3b4eNdRIasCoQIuX4Nu39Dlqw netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
    1.28 +413cb3b40DLJLbX_ZUIULB0JFjBuaw netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
    1.29 +413cb3b46JnvK1UurZAubeQoFg1W-w netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
    1.30 +413cb3b5rIKB3TbyhK3pbNyVkYysqA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
    1.31 +413cb3b5eKxnzoodEqaWn2wrPnHWnA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
    1.32 +413cb3b5F56TvQWAmO5TsuzhtzLFPQ netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
    1.33 +413cb3b53nyOv1OIeDSsCXhBFDXvJA netbsd-2.0-xen-sparse/sys/nfs/files.nfs
    1.34  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.35  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
    1.36  4124b307nRyK3dhn1hAsvrY76NuV3g tools/check/Makefile
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN	Mon Sep 06 19:04:16 2004 +0000
     2.3 @@ -0,0 +1,176 @@
     2.4 +# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $
     2.5 +
     2.6 +include 	"arch/xen/conf/std.xen"
     2.7 +
     2.8 +options 	INCLUDE_CONFIG_FILE	# embed config file in kernel binary
     2.9 +
    2.10 +#options		UVMHIST
    2.11 +#options		UVMHIST_PRINT
    2.12 +#options		SYSCALL_DEBUG
    2.13 +
    2.14 +maxusers	32		# estimated number of users
    2.15 +
    2.16 +#
    2.17 +options		XEN
    2.18 +#options		DOM0OPS
    2.19 +options		HZ=50
    2.20 +
    2.21 +#options 	I586_CPU
    2.22 +options 	I686_CPU
    2.23 +
    2.24 +#options 	VM86		# virtual 8086 emulation
    2.25 +#options 	USER_LDT	# user-settable LDT; used by WINE
    2.26 +
    2.27 +#options 	MTRR		# memory-type range register syscall support
    2.28 +
    2.29 +#options 	CONSDEVNAME="\"xencons\""
    2.30 +#options 	CONS_OVERRIDE
    2.31 +
    2.32 +options		INSECURE	# disable kernel security levels - X needs this
    2.33 +
    2.34 +options 	RTC_OFFSET=0	# hardware clock is this many mins. west of GMT
    2.35 +#options 	NTP		# NTP phase/frequency locked loop
    2.36 +
    2.37 +options 	KTRACE		# system call tracing via ktrace(1)
    2.38 +#options 	SYSTRACE	# system call vetting via systrace(1)
    2.39 +
    2.40 +options 	SYSVMSG		# System V-like message queues
    2.41 +options 	SYSVSEM		# System V-like semaphores
    2.42 +#options 	SEMMNI=10	# number of semaphore identifiers
    2.43 +#options 	SEMMNS=60	# number of semaphores in system
    2.44 +#options 	SEMUME=10	# max number of undo entries per process
    2.45 +#options 	SEMMNU=30	# number of undo structures in system
    2.46 +options 	SYSVSHM		# System V-like memory sharing
    2.47 +#options 	SHMMAXPGS=2048	# 2048 pages is the default
    2.48 +options 	P1003_1B_SEMAPHORE	# p1003.1b semaphore support
    2.49 +
    2.50 +options 	LKM		# loadable kernel modules
    2.51 +
    2.52 +options 	USERCONF	# userconf(4) support
    2.53 +options 	SYSCTL_INCLUDE_DESCR	# Include sysctl descriptions in kernel
    2.54 +
    2.55 +# Diagnostic/debugging support options
    2.56 +options 	DIAGNOSTIC	# expensive kernel consistency checks
    2.57 +options 	DEBUG		# expensive debugging checks/support 
    2.58 +options 	KMEMSTATS	# kernel memory statistics (vmstat -m)
    2.59 +options 	DDB		# in-kernel debugger
    2.60 +options		DDB_ONPANIC=1	# see also sysctl(8): `ddb.onpanic'
    2.61 +options 	DDB_HISTORY_SIZE=512	# enable history editing in DDB
    2.62 +#options 	KGDB		# remote debugger
    2.63 +#options 	KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600
    2.64 +makeoptions	DEBUG="-g"	# compile full symbol table
    2.65 +
    2.66 +#options 	COMPAT_14	# NetBSD 1.4
    2.67 +#options 	COMPAT_15	# NetBSD 1.5
    2.68 +options 	COMPAT_16	# NetBSD 1.6
    2.69 +
    2.70 +##options 	COMPAT_LINUX	# binary compatibility with Linux
    2.71 +#options 	COMPAT_FREEBSD	# binary compatibility with FreeBSD
    2.72 +#options 	COMPAT_MACH	# binary compatibility with Mach binaries
    2.73 +#options	COMPAT_DARWIN	# binary compatibility with Darwin binaries
    2.74 +#options 	EXEC_MACHO	# exec MACH-O binaries
    2.75 +#options 	COMPAT_PECOFF	# kernel support to run Win32 apps
    2.76 +
    2.77 +file-system 	FFS		# UFS
    2.78 +file-system 	EXT2FS		# second extended file system (linux)
    2.79 +#file-system 	LFS		# log-structured file system
    2.80 +#file-system 	MFS		# memory file system
    2.81 +file-system 	NFS		# Network File System client
    2.82 +#file-system 	NTFS		# Windows/NT file system (experimental)
    2.83 +#file-system 	CD9660		# ISO 9660 + Rock Ridge file system
    2.84 +#file-system 	MSDOSFS		# MS-DOS file system
    2.85 +file-system 	FDESC		# /dev/fd
    2.86 +file-system 	KERNFS		# /kern
    2.87 +file-system 	NULLFS		# loopback file system
    2.88 +#file-system 	OVERLAY		# overlay file system
    2.89 +#file-system 	PORTAL		# portal filesystem (still experimental)
    2.90 +file-system 	PROCFS		# /proc
    2.91 +#file-system 	UMAPFS		# NULLFS + uid and gid remapping
    2.92 +#file-system 	UNION		# union file system
    2.93 +#file-system	SMBFS		# experimental - CIFS; also needs nsmb (below)
    2.94 +
    2.95 +#options 	QUOTA		# UFS quotas
    2.96 +#options 	SOFTDEP		# FFS soft updates support.
    2.97 +#options 	NFSSERVER	# Network File System server
    2.98 +
    2.99 +options 	GATEWAY		# packet forwarding
   2.100 +options 	INET		# IP + ICMP + TCP + UDP
   2.101 +options 	INET6		# IPV6
   2.102 +options 	IPSEC		# IP security
   2.103 +options 	IPSEC_ESP	# IP security (encryption part; define w/IPSEC)
   2.104 +options 	MROUTING	# IP multicast routing
   2.105 +options 	PFIL_HOOKS	# pfil(9) packet filter hooks
   2.106 +options 	IPFILTER_LOG	# ipmon(8) log support
   2.107 +
   2.108 +options 	NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC
   2.109 +#options 	NFS_BOOTSTATIC_MYIP="\"169.254.1.2\""
   2.110 +#options 	NFS_BOOTSTATIC_GWIP="\"169.254.1.1\""
   2.111 +#options 	NFS_BOOTSTATIC_MASK="\"255.255.255.0\""
   2.112 +#options 	NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\""
   2.113 +#options 	NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\""
   2.114 +
   2.115 +options 	WSEMUL_VT100		# VT100 / VT220 emulation
   2.116 +options 	WS_KERNEL_FG=WSCOL_GREEN
   2.117 +options 	WSDISPLAY_COMPAT_PCVT		# emulate some ioctls
   2.118 +options 	WSDISPLAY_COMPAT_SYSCONS	# emulate some ioctls
   2.119 +options 	WSDISPLAY_COMPAT_USL		# VT handling
   2.120 +options 	WSDISPLAY_COMPAT_RAWKBD		# can get raw scancodes
   2.121 +options 	WSDISPLAY_DEFAULTSCREENS=4
   2.122 +options 	PCDISPLAY_SOFTCURSOR
   2.123 +
   2.124 +config		netbsd	root on ? type ?
   2.125 +#config		netbsd	root on wd0a type ffs
   2.126 +#config		netbsd	root on xennet0 type nfs
   2.127 +
   2.128 +mainbus0 at root
   2.129 +
   2.130 +cpu* at mainbus?
   2.131 +
   2.132 +hypervisor*	at mainbus?		# Xen hypervisor
   2.133 +
   2.134 +npx0		at hypervisor?		# x86 math coprocessor
   2.135 +
   2.136 +xencons*	at hypervisor?		# Xen virtual console
   2.137 +xennet* 	at hypervisor?		# Xen virtual network interface
   2.138 +
   2.139 +#xbd*		at hypervisor?		# Xen virtual block device
   2.140 +#wd*		at hypervisor?		# Xen vbd (wd identity)
   2.141 +#sd*		at hypervisor?		# Xen vbd (sd identity)
   2.142 +#cd*		at hypervisor?		# Xen vbd (cd identity)
   2.143 +
   2.144 +#xenkbc* 	at hypervisor?		# Xen Keyboard/Mouse Interface
   2.145 +#pckbd*		at xenkbc?		# Keyboard
   2.146 +#vga*		at hypervisor?		# Xen VGA display
   2.147 +#pms*		at xenkbc?		# PS/2 Mouse for wsmouse
   2.148 +
   2.149 +#wskbd*		at pckbd? console ?
   2.150 +#wsdisplay*	at vga? console ?
   2.151 +#wsmouse*	at pms? mux 0
   2.152 +
   2.153 +
   2.154 +include	"arch/xen/conf/GENERIC.local"
   2.155 +
   2.156 +
   2.157 +pseudo-device	ccd		4	# concatenated/striped disk devices
   2.158 +#pseudo-device	cgd		4	# cryptographic disk devices
   2.159 +#pseudo-device	md		1	# memory disk device (ramdisk)
   2.160 +#pseudo-device	vnd		4	# disk-like interface to files
   2.161 +
   2.162 +pseudo-device	bpfilter	8	# Berkeley packet filter
   2.163 +pseudo-device	ipfilter		# IP filter (firewall) and NAT
   2.164 +pseudo-device	loop			# network loopback
   2.165 +#pseudo-device	tun		2	# network tunneling over tty
   2.166 +#pseudo-device	gre		2	# generic L3 over IP tunnel
   2.167 +#pseudo-device	gif		4	# IPv[46] over IPv[46] tunnel (RFC1933)
   2.168 +#pseudo-device	faith		1	# IPv[46] tcp relay translation i/f
   2.169 +#pseudo-device	stf		1	# 6to4 IPv6 over IPv4 encapsulation
   2.170 +#pseudo-device	vlan			# IEEE 802.1q encapsulation
   2.171 +#pseudo-device	bridge			# simple inter-network bridging
   2.172 +
   2.173 +pseudo-device	pty			# pseudo-terminals
   2.174 +pseudo-device	rnd			# /dev/random and in-kernel generator
   2.175 +pseudo-device	clockctl		# user control of clock subsystem
   2.176 +
   2.177 +pseudo-device	wsmux			# mouse & keyboard multiplexor
   2.178 +pseudo-device	wsfont
   2.179 +pseudo-device	ksyms			# /dev/ksyms
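Once the sparse files are in place, this XEN configuration is built like any other NetBSD kernel configuration. A sketch using the nbconfig-xen and nbmake-xen wrappers named in the file list above; their exact paths and options are assumptions here, and native config(8) plus make(1) follow the same steps:

    # Illustrative build of the XEN kernel from the configured tree.
    cd sys/arch/xen/conf
    nbconfig-xen XEN                  # generates ../compile/XEN
    cd ../compile/XEN
    nbmake-xen depend && nbmake-xen   # produces the netbsd kernel image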
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen	Mon Sep 06 19:04:16 2004 +0000
     3.3 @@ -0,0 +1,232 @@
     3.4 +#	$NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $
     3.5 +#	NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp 
     3.6 +#	NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp 
     3.7 +
     3.8 +maxpartitions 8
     3.9 +
    3.10 +maxusers 2 16 128
    3.11 +
    3.12 +# Processor type options.
    3.13 +defflag	opt_cputype.h	I686_CPU
    3.14 +
    3.15 +# delay before cpu_reset() for reboot.
    3.16 +defparam		CPURESET_DELAY
    3.17 +
    3.18 +# No unmapped page below kernel stack
    3.19 +defflag			NOREDZONE
    3.20 +
    3.21 +# Beep on halt
    3.22 +defflag opt_beep.h		BEEP_ONHALT
    3.23 +defparam opt_beep.h		BEEP_ONHALT_COUNT
    3.24 +defparam opt_beep.h		BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD
    3.25 +
    3.26 +file	arch/xen/i386/autoconf.c
    3.27 +file	arch/i386/i386/db_dbgreg.S	ddb | kstack_check_dr0
    3.28 +file	arch/i386/i386/db_disasm.c	ddb
    3.29 +file	arch/i386/i386/db_interface.c	ddb
    3.30 +file	arch/i386/i386/db_memrw.c	ddb | kgdb
    3.31 +file	arch/i386/i386/db_trace.c	ddb
    3.32 +file	kern/subr_disk_mbr.c		disk
    3.33 +file	arch/xen/i386/gdt.c
    3.34 +file	arch/xen/i386/hypervisor_machdep.c
    3.35 +file	arch/i386/i386/in_cksum.S	inet | inet6
    3.36 +file	arch/i386/i386/ipkdb_glue.c	ipkdb
    3.37 +file	arch/i386/i386/kgdb_machdep.c	kgdb
    3.38 +file	arch/xen/i386/machdep.c
    3.39 +file	arch/xen/i386/identcpu.c
    3.40 +file	arch/i386/i386/math_emulate.c	math_emulate
    3.41 +file	arch/i386/i386/mem.c
    3.42 +file	kern/kern_microtime.c		i586_cpu | i686_cpu
    3.43 +file	arch/i386/i386/mtrr_k6.c	mtrr
    3.44 +file	netns/ns_cksum.c		ns
    3.45 +file	arch/xen/i386/pmap.c
    3.46 +file	arch/i386/i386/process_machdep.c
    3.47 +file	arch/i386/i386/procfs_machdep.c	procfs
    3.48 +file	arch/xen/i386/sys_machdep.c
    3.49 +file	arch/i386/i386/syscall.c
    3.50 +file	arch/xen/i386/trap.c
    3.51 +file	arch/i386/i386/vm_machdep.c
    3.52 +file	arch/xen/i386/xen_machdep.c
    3.53 +
    3.54 +file	arch/xen/xen/xen_debug.c
    3.55 +
    3.56 +file	arch/xen/xen/clock.c
    3.57 +file	arch/xen/xen/evtchn.c
    3.58 +file	arch/xen/xen/ctrl_if.c
    3.59 +
    3.60 +file	dev/cons.c
    3.61 +
    3.62 +file	arch/i386/i386/mptramp.S		multiprocessor
    3.63 +file    arch/i386/i386/ipifuncs.c	multiprocessor
    3.64 +
    3.65 +file	arch/i386/i386/pmc.c		perfctrs
    3.66 +
    3.67 +file	crypto/des/arch/i386/des_enc.S		des
    3.68 +file	crypto/des/arch/i386/des_cbc.S		des
    3.69 +
    3.70 +file	crypto/blowfish/arch/i386/bf_enc.S	blowfish
    3.71 +file	crypto/blowfish/arch/i386/bf_cbc.S	blowfish & !i386_cpu
    3.72 +
    3.73 +#
    3.74 +# Machine-independent SCSI drivers
    3.75 +#
    3.76 +
    3.77 +#xxx include	"dev/scsipi/files.scsipi"
    3.78 +
    3.79 +#
    3.80 +# Machine-independent ATA drivers
    3.81 +#
    3.82 +
    3.83 +#xxx include	"dev/ata/files.ata"
    3.84 +
    3.85 +# Memory Disk for install floppy
    3.86 +file	dev/md_root.c			memory_disk_hooks
    3.87 +
    3.88 +#
    3.89 +define  mainbus { [apid = -1] }
    3.90 +
    3.91 +file	arch/x86/x86/bus_dma.c
    3.92 +file	arch/xen/x86/bus_space.c
    3.93 +file	arch/x86/x86/cacheinfo.c
    3.94 +file	arch/xen/x86/consinit.c
    3.95 +file	arch/xen/x86/intr.c
    3.96 +file	arch/x86/x86/ipi.c		multiprocessor
    3.97 +file	arch/x86/x86/lock_machdep.c	lockdebug
    3.98 +file	arch/x86/x86/softintr.c
    3.99 +
   3.100 +include	"arch/xen/conf/files.compat"
   3.101 +
   3.102 +#
   3.103 +# System bus types
   3.104 +#
   3.105 +
   3.106 +device	mainbus: mainbus
   3.107 +attach	mainbus at root
   3.108 +file	arch/xen/i386/mainbus.c		mainbus
   3.109 +
   3.110 +# Xen hypervisor
   3.111 +device	hypervisor { }
   3.112 +attach	hypervisor at mainbus
   3.113 +file	arch/xen/xen/hypervisor.c	hypervisor needs-flag
   3.114 +
   3.115 +# Numeric Processing Extension; Math Co-processor
   3.116 +device	npx
   3.117 +file	arch/xen/i386/npx.c		npx needs-flag
   3.118 +
   3.119 +attach	npx at hypervisor with npx_hv
   3.120 +file	arch/xen/i386/npx_hv.c		npx_hv
   3.121 +
   3.122 +# Xen console support
   3.123 +device	xencons: tty
   3.124 +attach	xencons at hypervisor
   3.125 +file	arch/xen/xen/xencons.c		xencons needs-flag
   3.126 +
   3.127 +include	"dev/wscons/files.wscons"
   3.128 +include	"dev/wsfont/files.wsfont"
   3.129 +
   3.130 +include	"dev/pckbport/files.pckbport"
   3.131 +
   3.132 +# CPUS
   3.133 +
   3.134 +define cpu { [apid = -1] }
   3.135 +device cpu
   3.136 +attach cpu at mainbus
   3.137 +file	arch/xen/i386/cpu.c		cpu
   3.138 +
   3.139 +#
   3.140 +# Compatibility modules
   3.141 +#
   3.142 +
   3.143 +# VM86 mode
   3.144 +file	arch/i386/i386/vm86.c			vm86
   3.145 +
   3.146 +# VM86 in kernel
   3.147 +file	arch/i386/i386/kvm86.c			kvm86
   3.148 +file	arch/i386/i386/kvm86call.S		kvm86
   3.149 +
   3.150 +# Binary compatibility with previous NetBSD releases (COMPAT_XX)
   3.151 +file	arch/i386/i386/compat_13_machdep.c	compat_13 | compat_aout
   3.152 +file	arch/i386/i386/compat_16_machdep.c	compat_16 | compat_ibcs2
   3.153 +
   3.154 +# SVR4 binary compatibility (COMPAT_SVR4)
   3.155 +include	"compat/svr4/files.svr4"
   3.156 +file	arch/i386/i386/svr4_machdep.c		compat_svr4
   3.157 +file	arch/i386/i386/svr4_sigcode.S		compat_svr4
   3.158 +file	arch/i386/i386/svr4_syscall.c		compat_svr4
   3.159 +
   3.160 +# MACH binary compatibility (COMPAT_MACH)
   3.161 +include	"compat/mach/files.mach"
   3.162 +file	arch/i386/i386/mach_machdep.c		compat_mach | compat_darwin
   3.163 +file	arch/i386/i386/mach_sigcode.S		compat_mach | compat_darwin
   3.164 +file	arch/i386/i386/mach_syscall.c		compat_mach | compat_darwin
   3.165 +file	arch/i386/i386/macho_machdep.c		exec_macho
   3.166 +
   3.167 +# DARWIN binary compatibility (COMPAT_DARWIN)
   3.168 +include	"compat/darwin/files.darwin"
   3.169 +file	arch/i386/i386/darwin_machdep.c		compat_darwin
   3.170 +
   3.171 +# iBCS-2 binary compatibility (COMPAT_IBCS2)
   3.172 +include	"compat/ibcs2/files.ibcs2"
   3.173 +file	arch/i386/i386/ibcs2_machdep.c		compat_ibcs2
   3.174 +file	arch/i386/i386/ibcs2_sigcode.S		compat_ibcs2
   3.175 +file	arch/i386/i386/ibcs2_syscall.c		compat_ibcs2
   3.176 +
   3.177 +# Linux binary compatibility (COMPAT_LINUX)
   3.178 +include	"compat/linux/files.linux"
   3.179 +include	"compat/linux/arch/i386/files.linux_i386"
   3.180 +file	arch/i386/i386/linux_sigcode.S		compat_linux
   3.181 +file	arch/i386/i386/linux_syscall.c		compat_linux
   3.182 +file	arch/i386/i386/linux_trap.c		compat_linux
   3.183 +
   3.184 +# FreeBSD binary compatibility (COMPAT_FREEBSD)
   3.185 +include	"compat/freebsd/files.freebsd"
   3.186 +file	arch/i386/i386/freebsd_machdep.c	compat_freebsd
   3.187 +file	arch/i386/i386/freebsd_sigcode.S	compat_freebsd
   3.188 +file	arch/i386/i386/freebsd_syscall.c	compat_freebsd
   3.189 +
   3.190 +# a.out binary compatibility (COMPAT_AOUT)
   3.191 +include	"compat/aout/files.aout"
   3.192 +
   3.193 +# Win32 binary compatibility (COMPAT_PECOFF)
   3.194 +include	"compat/pecoff/files.pecoff"
   3.195 +
   3.196 +# OSS audio driver compatibility
   3.197 +include	"compat/ossaudio/files.ossaudio"
   3.198 +
   3.199 +# Xen devices
   3.200 +
   3.201 +# Network driver
   3.202 +device	xennet: arp, ether, ifnet
   3.203 +attach	xennet at hypervisor
   3.204 +file	arch/xen/xen/if_xennet.c	xennet needs-flag
   3.205 +
   3.206 +# Block device driver and wd/sd/cd identities
   3.207 +device	xbd: disk
   3.208 +attach	xbd at hypervisor
   3.209 +file	arch/xen/xen/xbd.c		xbd | wd | sd | cd needs-flag
   3.210 +
   3.211 +device	wd: disk
   3.212 +attach	wd at hypervisor
   3.213 +
   3.214 +device	sd: disk
   3.215 +attach	sd at hypervisor
   3.216 +
   3.217 +device	cd: disk
   3.218 +attach	cd at hypervisor
   3.219 +
   3.220 +# Keyboard
   3.221 +device	xenkbc: pckbport
   3.222 +attach	xenkbc at hypervisor
   3.223 +file	arch/xen/xen/xenkbc.c		xenkbc		needs-flag
   3.224 +
   3.225 +# Generic VGA
   3.226 +attach	vga at hypervisor with vga_xen
   3.227 +file	arch/xen/xen/vga_xen.c		vga_xen		needs-flag
   3.228 +
   3.229 +# Domain-0 operations
   3.230 +defflag	opt_xen.h			DOM0OPS
   3.231 +file	arch/xen/xen/machmem.c		dom0ops
   3.232 +file	arch/xen/xen/privcmd.c		dom0ops
   3.233 +file	arch/xen/xen/vfr.c		dom0ops
   3.234 +
   3.235 +include "arch/xen/conf/majors.i386"
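The needs-flag entries above make config(8) emit small per-device headers into the compile directory, and those headers are what the Xen drivers and the autoconf.c added below rely on (it includes "xennet.h" and tests NXENNET, for example). An illustrative check after configuring; the exact values depend on which devices the XEN configuration enables:

    cd sys/arch/xen/compile/XEN
    cat xennet.h        # expected to contain e.g. #define NXENNET 1
    cat hypervisor.h    # expected to contain e.g. #define NHYPERVISOR 1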
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c	Mon Sep 06 19:04:16 2004 +0000
     4.3 @@ -0,0 +1,630 @@
     4.4 +/*	$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $	*/
     4.5 +/*	NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp 	*/
     4.6 +
     4.7 +/*-
     4.8 + * Copyright (c) 1990 The Regents of the University of California.
     4.9 + * All rights reserved.
    4.10 + *
    4.11 + * This code is derived from software contributed to Berkeley by
    4.12 + * William Jolitz.
    4.13 + *
    4.14 + * Redistribution and use in source and binary forms, with or without
    4.15 + * modification, are permitted provided that the following conditions
    4.16 + * are met:
    4.17 + * 1. Redistributions of source code must retain the above copyright
    4.18 + *    notice, this list of conditions and the following disclaimer.
    4.19 + * 2. Redistributions in binary form must reproduce the above copyright
    4.20 + *    notice, this list of conditions and the following disclaimer in the
    4.21 + *    documentation and/or other materials provided with the distribution.
    4.22 + * 3. Neither the name of the University nor the names of its contributors
    4.23 + *    may be used to endorse or promote products derived from this software
    4.24 + *    without specific prior written permission.
    4.25 + *
    4.26 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
    4.27 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    4.28 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    4.29 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    4.30 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    4.31 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    4.32 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    4.33 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    4.34 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    4.35 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    4.36 + * SUCH DAMAGE.
    4.37 + *
    4.38 + *	@(#)autoconf.c	7.1 (Berkeley) 5/9/91
    4.39 + */
    4.40 +
    4.41 +/*
    4.42 + * Setup the system to run on the current machine.
    4.43 + *
    4.44 + * Configure() is called at boot time and initializes the vba
    4.45 + * device tables and the memory controller monitoring.  Available
    4.46 + * devices are determined (from possibilities mentioned in ioconf.c),
    4.47 + * and the drivers are initialized.
    4.48 + */
    4.49 +
    4.50 +#include <sys/cdefs.h>
    4.51 +__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $");
    4.52 +
    4.53 +#include "opt_compat_oldboot.h"
    4.54 +#include "opt_multiprocessor.h"
    4.55 +#include "opt_nfs_boot.h"
    4.56 +#include "xennet.h"
    4.57 +
    4.58 +#include <sys/param.h>
    4.59 +#include <sys/systm.h>
    4.60 +#include <sys/buf.h>
    4.61 +#include <sys/disklabel.h>
    4.62 +#include <sys/conf.h>
    4.63 +#ifdef COMPAT_OLDBOOT
    4.64 +#include <sys/reboot.h>
    4.65 +#endif
    4.66 +#include <sys/device.h>
    4.67 +#include <sys/malloc.h>
    4.68 +#include <sys/vnode.h>
    4.69 +#include <sys/fcntl.h>
    4.70 +#include <sys/dkio.h>
    4.71 +#include <sys/proc.h>
    4.72 +#include <sys/user.h>
    4.73 +
    4.74 +#ifdef NFS_BOOT_BOOTSTATIC
    4.75 +#include <net/if.h>
    4.76 +#include <net/if_ether.h>
    4.77 +#include <netinet/in.h>
    4.78 +#include <nfs/rpcv2.h>
    4.79 +#include <nfs/nfsproto.h>
    4.80 +#include <nfs/nfs.h>
    4.81 +#include <nfs/nfsmount.h>
    4.82 +#include <nfs/nfsdiskless.h>
    4.83 +#include <machine/if_xennetvar.h>
    4.84 +#endif
    4.85 +
    4.86 +#include <machine/pte.h>
    4.87 +#include <machine/cpu.h>
    4.88 +#include <machine/gdt.h>
    4.89 +#include <machine/pcb.h>
    4.90 +#include <machine/bootinfo.h>
    4.91 +
    4.92 +#include "ioapic.h"
    4.93 +#include "lapic.h"
    4.94 +
    4.95 +#if NIOAPIC > 0
    4.96 +#include <machine/i82093var.h>
    4.97 +#endif
    4.98 +
    4.99 +#if NLAPIC > 0
   4.100 +#include <machine/i82489var.h>
   4.101 +#endif
   4.102 +
   4.103 +static int match_harddisk(struct device *, struct btinfo_bootdisk *);
   4.104 +static void matchbiosdisks(void);
   4.105 +static void findroot(void);
   4.106 +static int is_valid_disk(struct device *);
   4.107 +
   4.108 +extern struct disklist *i386_alldisks;
   4.109 +extern int i386_ndisks;
   4.110 +
   4.111 +#include "bios32.h"
   4.112 +#if NBIOS32 > 0
   4.113 +#include <machine/bios32.h>
   4.114 +#endif
   4.115 +
   4.116 +#include "opt_pcibios.h"
   4.117 +#ifdef PCIBIOS
   4.118 +#include <dev/pci/pcireg.h>
   4.119 +#include <dev/pci/pcivar.h>
   4.120 +#include <i386/pci/pcibios.h>
   4.121 +#endif
   4.122 +
   4.123 +#include "opt_kvm86.h"
   4.124 +#ifdef KVM86
   4.125 +#include <machine/kvm86.h>
   4.126 +#endif
   4.127 +
   4.128 +#include "opt_xen.h"
   4.129 +
   4.130 +struct device *booted_device;
   4.131 +int booted_partition;
   4.132 +
   4.133 +/*
   4.134 + * Determine i/o configuration for a machine.
   4.135 + */
   4.136 +void
   4.137 +cpu_configure(void)
   4.138 +{
   4.139 +
   4.140 +	startrtclock();
   4.141 +
   4.142 +#if NBIOS32 > 0
   4.143 +	bios32_init();
   4.144 +#endif
   4.145 +#ifdef PCIBIOS
   4.146 +	pcibios_init();
   4.147 +#endif
   4.148 +
   4.149 +	/* kvm86 needs a TSS */
   4.150 +	i386_proc0_tss_ldt_init();
   4.151 +#ifdef KVM86
   4.152 +	kvm86_init();
   4.153 +#endif
   4.154 +
   4.155 +	if (config_rootfound("mainbus", NULL) == NULL)
   4.156 +		panic("configure: mainbus not configured");
   4.157 +
   4.158 +#ifdef INTRDEBUG
   4.159 +	intr_printconfig();
   4.160 +#endif
   4.161 +
   4.162 +#if NIOAPIC > 0
   4.163 +	lapic_set_lvt();
   4.164 +	ioapic_enable();
   4.165 +#endif
   4.166 +	/* resync cr0 after FPU configuration */
   4.167 +	lwp0.l_addr->u_pcb.pcb_cr0 = rcr0();
   4.168 +#ifdef MULTIPROCESSOR
   4.169 +	/* propagate this to the idle pcb's. */
   4.170 +	cpu_init_idle_pcbs();
   4.171 +#endif
   4.172 +
   4.173 +	spl0();
   4.174 +#if NLAPIC > 0
   4.175 +	lapic_tpr = 0;
   4.176 +#endif
   4.177 +}
   4.178 +
   4.179 +void
   4.180 +cpu_rootconf(void)
   4.181 +{
   4.182 +	findroot();
   4.183 +	matchbiosdisks();
   4.184 +
   4.185 +	printf("boot device: %s\n",
   4.186 +	    booted_device ? booted_device->dv_xname : "<unknown>");
   4.187 +
   4.188 +	setroot(booted_device, booted_partition);
   4.189 +}
   4.190 +
   4.191 +/*
   4.192 + * XXX ugly bit of code. But, this is the only safe time that the
   4.193 + * match between BIOS disks and native disks can be done.
   4.194 + */
   4.195 +static void
   4.196 +matchbiosdisks(void)
   4.197 +{
   4.198 +	struct btinfo_biosgeom *big;
   4.199 +	struct bi_biosgeom_entry *be;
   4.200 +	struct device *dv;
   4.201 +	int i, ck, error, m, n;
   4.202 +	struct vnode *tv;
   4.203 +	char mbr[DEV_BSIZE];
   4.204 +	int  dklist_size;
   4.205 +	int bmajor;
   4.206 +
   4.207 +	big = lookup_bootinfo(BTINFO_BIOSGEOM);
   4.208 +
   4.209 +	if (big == NULL)
   4.210 +		return;
   4.211 +
   4.212 +	/*
   4.213 +	 * First, count all native disks
   4.214 +	 */
   4.215 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next)
   4.216 +		if (is_valid_disk(dv))
   4.217 +			i386_ndisks++;
   4.218 +
   4.219 +	if (i386_ndisks == 0)
   4.220 +		return;
   4.221 +
   4.222 +	dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) *
   4.223 +	    sizeof (struct nativedisk_info);
   4.224 +
   4.225 +	/* XXX M_TEMP is wrong */
   4.226 +	i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT);
   4.227 +	if (i386_alldisks == NULL)
   4.228 +		return;
   4.229 +
   4.230 +	memset(i386_alldisks, 0, dklist_size);
   4.231 +
   4.232 +	i386_alldisks->dl_nnativedisks = i386_ndisks;
   4.233 +	i386_alldisks->dl_nbiosdisks = big->num;
   4.234 +	for (i = 0; i < big->num; i++) {
   4.235 +		i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev;
   4.236 +		i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec;
   4.237 +		i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head;
   4.238 +		i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl;
   4.239 +		i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec;
   4.240 +		i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags;
   4.241 +#ifdef GEOM_DEBUG
   4.242 +#ifdef NOTYET
   4.243 +		printf("disk %x: flags %x, interface %x, device %llx\n",
   4.244 +			big->disk[i].dev, big->disk[i].flags,
   4.245 +			big->disk[i].interface_path, big->disk[i].device_path);
   4.246 +#endif
   4.247 +#endif
   4.248 +	}
   4.249 +
   4.250 +	/*
   4.251 +	 * XXX code duplication from findroot()
   4.252 +	 */
   4.253 +	n = -1;
   4.254 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
   4.255 +		if (dv->dv_class != DV_DISK)
   4.256 +			continue;
   4.257 +#ifdef GEOM_DEBUG
   4.258 +		printf("matchbiosdisks: trying to match (%s) %s\n",
   4.259 +		    dv->dv_xname, dv->dv_cfdata->cf_name);
   4.260 +#endif
   4.261 +		if (is_valid_disk(dv)) {
   4.262 +			n++;
   4.263 +			sprintf(i386_alldisks->dl_nativedisks[n].ni_devname,
   4.264 +			    "%s%d", dv->dv_cfdata->cf_name,
   4.265 +			    dv->dv_unit);
   4.266 +
   4.267 +			bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
   4.268 +			if (bmajor == -1)
   4.269 +				return;
   4.270 +
   4.271 +			if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART),
   4.272 +			    &tv))
   4.273 +				panic("matchbiosdisks: can't alloc vnode");
   4.274 +
   4.275 +			error = VOP_OPEN(tv, FREAD, NOCRED, 0);
   4.276 +			if (error) {
   4.277 +				vput(tv);
   4.278 +				continue;
   4.279 +			}
   4.280 +			error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0,
   4.281 +			    UIO_SYSSPACE, 0, NOCRED, NULL, 0);
   4.282 +			VOP_CLOSE(tv, FREAD, NOCRED, 0);
   4.283 +			if (error) {
   4.284 +#ifdef GEOM_DEBUG
   4.285 +				printf("matchbiosdisks: %s: MBR read failure\n",
   4.286 +				    dv->dv_xname);
   4.287 +#endif
   4.288 +				continue;
   4.289 +			}
   4.290 +
   4.291 +			for (ck = i = 0; i < DEV_BSIZE; i++)
   4.292 +				ck += mbr[i];
   4.293 +			for (m = i = 0; i < big->num; i++) {
   4.294 +				be = &big->disk[i];
   4.295 +#ifdef GEOM_DEBUG
   4.296 +				printf("match %s with %d ", dv->dv_xname, i);
   4.297 +				printf("dev ck %x bios ck %x\n", ck, be->cksum);
   4.298 +#endif
   4.299 +				if (be->flags & BI_GEOM_INVALID)
   4.300 +					continue;
   4.301 +				if (be->cksum == ck &&
   4.302 +				    !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts,
   4.303 +					MBR_PART_COUNT *
   4.304 +					    sizeof (struct mbr_partition))) {
   4.305 +#ifdef GEOM_DEBUG
   4.306 +					printf("matched bios disk %x with %s\n",
   4.307 +					    be->dev, dv->dv_xname);
   4.308 +#endif
   4.309 +					i386_alldisks->dl_nativedisks[n].
   4.310 +					    ni_biosmatches[m++] = i;
   4.311 +				}
   4.312 +			}
   4.313 +			i386_alldisks->dl_nativedisks[n].ni_nmatches = m;
   4.314 +			vput(tv);
   4.315 +		}
   4.316 +	}
   4.317 +}
   4.318 +
   4.319 +#ifdef COMPAT_OLDBOOT
   4.320 +u_long	bootdev = 0;		/* should be dev_t, but not until 32 bits */
   4.321 +#endif
   4.322 +
   4.323 +/*
   4.324 + * helper function for "findroot()":
   4.325 + * return nonzero if disk device matches bootinfo
   4.326 + */
   4.327 +static int
   4.328 +match_harddisk(struct device *dv, struct btinfo_bootdisk *bid)
   4.329 +{
   4.330 +	struct vnode *tmpvn;
   4.331 +	int error;
   4.332 +	struct disklabel label;
   4.333 +	int found = 0;
   4.334 +	int bmajor;
   4.335 +
   4.336 +	/*
   4.337 +	 * A disklabel is required here.  The
   4.338 +	 * bootblocks don't refuse to boot from
   4.339 +	 * a disk without a label, but this is
   4.340 +	 * normally not wanted.
   4.341 +	 */
   4.342 +	if (bid->labelsector == -1)
   4.343 +		return(0);
   4.344 +
   4.345 +	/*
   4.346 +	 * lookup major number for disk block device
   4.347 +	 */
   4.348 +	bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
   4.349 +	if (bmajor == -1)
   4.350 +		return(0); /* XXX panic() ??? */
   4.351 +
   4.352 +	/*
   4.353 +	 * Fake a temporary vnode for the disk, open
   4.354 +	 * it, and read the disklabel for comparison.
   4.355 +	 */
   4.356 +	if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn))
   4.357 +		panic("findroot can't alloc vnode");
   4.358 +	error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0);
   4.359 +	if (error) {
   4.360 +#ifndef DEBUG
   4.361 +		/*
   4.362 +		 * Ignore errors caused by missing
   4.363 +		 * device, partition or medium.
   4.364 +		 */
   4.365 +		if (error != ENXIO && error != ENODEV)
   4.366 +#endif
   4.367 +			printf("findroot: can't open dev %s%c (%d)\n",
   4.368 +			       dv->dv_xname, 'a' + bid->partition, error);
   4.369 +		vput(tmpvn);
   4.370 +		return(0);
   4.371 +	}
   4.372 +	error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0);
   4.373 +	if (error) {
   4.374 +		/*
   4.375 +		 * XXX can't happen - open() would
   4.376 +		 * have errored out (or faked up one)
   4.377 +		 */
   4.378 +		printf("can't get label for dev %s%c (%d)\n",
   4.379 +		       dv->dv_xname, 'a' + bid->partition, error);
   4.380 +		goto closeout;
   4.381 +	}
   4.382 +
   4.383 +	/* compare with our data */
   4.384 +	if (label.d_type == bid->label.type &&
   4.385 +	    label.d_checksum == bid->label.checksum &&
   4.386 +	    !strncmp(label.d_packname, bid->label.packname, 16))
   4.387 +		found = 1;
   4.388 +
   4.389 +closeout:
   4.390 +	VOP_CLOSE(tmpvn, FREAD, NOCRED, 0);
   4.391 +	vput(tmpvn);
   4.392 +	return(found);
   4.393 +}
   4.394 +
   4.395 +/*
   4.396 + * Attempt to find the device from which we were booted.
   4.397 + * If we can do so, and not instructed not to do so,
   4.398 + * change rootdev to correspond to the load device.
   4.399 + */
   4.400 +void
   4.401 +findroot(void)
   4.402 +{
   4.403 +	struct btinfo_bootdisk *bid;
   4.404 +	struct device *dv;
   4.405 +	union xen_cmdline_parseinfo xcp;
   4.406 +#ifdef COMPAT_OLDBOOT
   4.407 +	int i, majdev, unit, part;
   4.408 +	char buf[32];
   4.409 +#endif
   4.410 +
   4.411 +	if (booted_device)
   4.412 +		return;
   4.413 +
   4.414 +	if (lookup_bootinfo(BTINFO_NETIF)) {
   4.415 +		/*
   4.416 +		 * We got netboot interface information, but
   4.417 +		 * "device_register()" couldn't match it to a configured
   4.418 +		 * device. Bootdisk information cannot be present at the
   4.419 +		 * same time, so give up.
   4.420 +		 */
   4.421 +		printf("findroot: netboot interface not found\n");
   4.422 +		return;
   4.423 +	}
   4.424 +
   4.425 +	bid = lookup_bootinfo(BTINFO_BOOTDISK);
   4.426 +	if (bid) {
   4.427 +		/*
   4.428 +		 * Scan all disk devices for ones that match the passed data.
   4.429 +		 * Don't break if one is found, to get possible multiple
   4.430 +		 * matches - for problem tracking. Use the first match anyway
   4.431 +		 * because lower device numbers are more likely to be the
   4.432 +		 * boot device.
   4.433 +		 */
   4.434 +		for (dv = alldevs.tqh_first; dv != NULL;
   4.435 +		    dv = dv->dv_list.tqe_next) {
   4.436 +			if (dv->dv_class != DV_DISK)
   4.437 +				continue;
   4.438 +
   4.439 +			if (!strcmp(dv->dv_cfdata->cf_name, "fd")) {
   4.440 +				/*
   4.441 +				 * Assume the configured unit number matches
   4.442 +				 * the BIOS device number.  (This is the old
   4.443 +				 * behaviour.)  Needs some ideas how to handle
   4.444 +				 * BIOS's "swap floppy drive" options.
   4.445 +				 */
   4.446 +				if ((bid->biosdev & 0x80) ||
   4.447 +				    dv->dv_unit != bid->biosdev)
   4.448 +					continue;
   4.449 +
   4.450 +				goto found;
   4.451 +			}
   4.452 +
   4.453 +			if (is_valid_disk(dv)) {
   4.454 +				/*
   4.455 +				 * Don't trust BIOS device numbers, try
   4.456 +				 * to match the information passed by the
   4.457 +				 * bootloader instead.
   4.458 +				 */
   4.459 +				if ((bid->biosdev & 0x80) == 0 ||
   4.460 +				    !match_harddisk(dv, bid))
   4.461 +					continue;
   4.462 +
   4.463 +				goto found;
   4.464 +			}
   4.465 +
   4.466 +			/* no "fd", "wd", "sd", "ld", "ed" */
   4.467 +			continue;
   4.468 +
   4.469 +found:
   4.470 +			if (booted_device) {
   4.471 +				printf("warning: double match for boot "
   4.472 +				    "device (%s, %s)\n",
   4.473 +				    booted_device->dv_xname, dv->dv_xname);
   4.474 +				continue;
   4.475 +			}
   4.476 +			booted_device = dv;
   4.477 +			booted_partition = bid->partition;
   4.478 +		}
   4.479 +
   4.480 +		if (booted_device)
   4.481 +			return;
   4.482 +	}
   4.483 +
   4.484 +	xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
   4.485 +
   4.486 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
   4.487 +		if (is_valid_disk(dv) == 0)
   4.488 +			continue;
   4.489 +
   4.490 +		if (xcp.xcp_bootdev[0] == 0) {
   4.491 +			booted_device = dv;
   4.492 +			break;
   4.493 +		}
   4.494 +
   4.495 +		if (strncmp(xcp.xcp_bootdev, dv->dv_xname,
   4.496 +		    strlen(dv->dv_xname)))
   4.497 +			continue;
   4.498 +
   4.499 +		if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) {
   4.500 +			booted_partition = toupper(
   4.501 +				xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A';
   4.502 +		}
   4.503 +
   4.504 +		booted_device = dv;
   4.505 +		break;
   4.506 +	}
   4.507 +
   4.508 +	if (booted_device)
   4.509 +		return;
   4.510 +
   4.511 +#ifdef COMPAT_OLDBOOT
   4.512 +#if 0
   4.513 +	printf("howto %x bootdev %x ", boothowto, bootdev);
   4.514 +#endif
   4.515 +
   4.516 +	if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC)
   4.517 +		return;
   4.518 +
   4.519 +	majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK;
   4.520 +	name = devsw_blk2name(majdev);
   4.521 +	if (name == NULL)
   4.522 +		return;
   4.523 +
   4.524 +	part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK;
   4.525 +	unit = (bootdev >> B_UNITSHIFT) & B_UNITMASK;
   4.526 +
   4.527 +	sprintf(buf, "%s%d", name, unit);
   4.528 +	for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
   4.529 +		if (strcmp(buf, dv->dv_xname) == 0) {
   4.530 +			booted_device = dv;
   4.531 +			booted_partition = part;
   4.532 +			return;
   4.533 +		}
   4.534 +	}
   4.535 +#endif
   4.536 +}
   4.537 +
   4.538 +#include "pci.h"
   4.539 +
   4.540 +#include <dev/isa/isavar.h>
   4.541 +#if NPCI > 0
   4.542 +#include <dev/pci/pcivar.h>
   4.543 +#endif
   4.544 +
   4.545 +void
   4.546 +device_register(struct device *dev, void *aux)
   4.547 +{
   4.548 +	/*
   4.549 +	 * Handle network interfaces here, the attachment information is
    4.550 +	 * not available driver independently later.
   4.551 +	 * For disks, there is nothing useful available at attach time.
   4.552 +	 */
   4.553 +#if NXENNET > 0
   4.554 +	if (dev->dv_class == DV_IFNET) {
   4.555 +		union xen_cmdline_parseinfo xcp;
   4.556 +
   4.557 +		xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
   4.558 +		if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) {
   4.559 +#ifdef NFS_BOOT_BOOTSTATIC
   4.560 +			nfs_bootstatic_callback = xennet_bootstatic_callback;
   4.561 +#endif
   4.562 +			goto found;
   4.563 +		}
   4.564 +	}
   4.565 +#endif
   4.566 +	if (dev->dv_class == DV_IFNET) {
   4.567 +		struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF);
   4.568 +		if (bin == NULL)
   4.569 +			return;
   4.570 +
   4.571 +		/*
   4.572 +		 * We don't check the driver name against the device name
   4.573 +		 * passed by the boot ROM. The ROM should stay usable
   4.574 +		 * if the driver gets obsoleted.
   4.575 +		 * The physical attachment information (checked below)
   4.576 +		 * must be sufficient to identify the device.
   4.577 +		 */
   4.578 +
   4.579 +		if (bin->bus == BI_BUS_ISA &&
   4.580 +		    !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) {
   4.581 +			struct isa_attach_args *iaa = aux;
   4.582 +
   4.583 +			/* compare IO base address */
   4.584 +			/* XXXJRT what about multiple I/O addrs? */
   4.585 +			if (iaa->ia_nio > 0 &&
   4.586 +			    bin->addr.iobase == iaa->ia_io[0].ir_addr)
   4.587 +				goto found;
   4.588 +		}
   4.589 +#if NPCI > 0
   4.590 +		if (bin->bus == BI_BUS_PCI &&
   4.591 +		    !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) {
   4.592 +			struct pci_attach_args *paa = aux;
   4.593 +			int b, d, f;
   4.594 +
   4.595 +			/*
   4.596 +			 * Calculate BIOS representation of:
   4.597 +			 *
   4.598 +			 *	<bus,device,function>
   4.599 +			 *
   4.600 +			 * and compare.
   4.601 +			 */
   4.602 +			pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f);
   4.603 +			if (bin->addr.tag == ((b << 8) | (d << 3) | f))
   4.604 +				goto found;
   4.605 +		}
   4.606 +#endif
   4.607 +	}
   4.608 +	return;
   4.609 +
   4.610 +found:
   4.611 +	if (booted_device) {
   4.612 +		/* XXX should be a "panic()" */
   4.613 +		printf("warning: double match for boot device (%s, %s)\n",
   4.614 +		    booted_device->dv_xname, dev->dv_xname);
   4.615 +		return;
   4.616 +	}
   4.617 +	booted_device = dev;
   4.618 +}
   4.619 +
   4.620 +static int
   4.621 +is_valid_disk(struct device *dv)
   4.622 +{
   4.623 +	const char *name;
   4.624 +
   4.625 +	if (dv->dv_class != DV_DISK)
   4.626 +		return (0);
   4.627 +
   4.628 +	name = dv->dv_cfdata->cf_name;
   4.629 +
   4.630 +	return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 ||
   4.631 +	    strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 ||
   4.632 +	    strcmp(name, "xbd") == 0);
   4.633 +}
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c	Mon Sep 06 19:04:16 2004 +0000
     5.3 @@ -0,0 +1,408 @@
     5.4 +/*	$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $	*/
     5.5 +/*	NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp 	*/
     5.6 +
     5.7 +/*-
     5.8 + * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
     5.9 + * All rights reserved.
    5.10 + *
    5.11 + * This code is derived from software contributed to The NetBSD Foundation
    5.12 + * by John T. Kohl and Charles M. Hannum.
    5.13 + *
    5.14 + * Redistribution and use in source and binary forms, with or without
    5.15 + * modification, are permitted provided that the following conditions
    5.16 + * are met:
    5.17 + * 1. Redistributions of source code must retain the above copyright
    5.18 + *    notice, this list of conditions and the following disclaimer.
    5.19 + * 2. Redistributions in binary form must reproduce the above copyright
    5.20 + *    notice, this list of conditions and the following disclaimer in the
    5.21 + *    documentation and/or other materials provided with the distribution.
    5.22 + * 3. All advertising materials mentioning features or use of this software
    5.23 + *    must display the following acknowledgement:
    5.24 + *        This product includes software developed by the NetBSD
    5.25 + *        Foundation, Inc. and its contributors.
    5.26 + * 4. Neither the name of The NetBSD Foundation nor the names of its
    5.27 + *    contributors may be used to endorse or promote products derived
    5.28 + *    from this software without specific prior written permission.
    5.29 + *
    5.30 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
    5.31 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
    5.32 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    5.33 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
    5.34 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    5.35 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    5.36 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    5.37 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    5.38 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    5.39 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    5.40 + * POSSIBILITY OF SUCH DAMAGE.
    5.41 + */
    5.42 +
    5.43 +#include <sys/cdefs.h>
    5.44 +__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $");
    5.45 +
    5.46 +#include "opt_multiprocessor.h"
    5.47 +#include "opt_xen.h"
    5.48 +
    5.49 +#include <sys/param.h>
    5.50 +#include <sys/systm.h>
    5.51 +#include <sys/proc.h>
    5.52 +#include <sys/lock.h>
    5.53 +#include <sys/user.h>
    5.54 +
    5.55 +#include <uvm/uvm.h>
    5.56 +
    5.57 +#include <machine/gdt.h>
    5.58 +
    5.59 +int gdt_size[2];	/* total number of GDT entries */
    5.60 +int gdt_count[2];	/* number of GDT entries in use */
    5.61 +int gdt_next[2];	/* next available slot for sweeping */
    5.62 +int gdt_free[2];	/* next free slot; terminated with GNULL_SEL */
    5.63 +
    5.64 +struct lock gdt_lock_store;
    5.65 +
    5.66 +static __inline void gdt_lock(void);
    5.67 +static __inline void gdt_unlock(void);
    5.68 +void gdt_init(void);
    5.69 +void gdt_grow(int);
    5.70 +int gdt_get_slot(void);
    5.71 +int gdt_get_slot1(int);
    5.72 +void gdt_put_slot(int);
    5.73 +void gdt_put_slot1(int, int);
    5.74 +
    5.75 +/*
    5.76 + * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep
    5.77 + * waiting for memory.
    5.78 + *
    5.79 + * Note that the locking done here is not sufficient for multiprocessor
    5.80 + * systems.  A freshly allocated slot will still be of type SDT_SYSNULL for
    5.81 + * some time after the GDT is unlocked, so gdt_compact() could attempt to
    5.82 + * reclaim it.
    5.83 + */
    5.84 +static __inline void
    5.85 +gdt_lock()
    5.86 +{
    5.87 +
    5.88 +	(void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL);
    5.89 +}
    5.90 +
    5.91 +static __inline void
    5.92 +gdt_unlock()
    5.93 +{
    5.94 +
    5.95 +	(void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL);
    5.96 +}
    5.97 +
    5.98 +void
    5.99 +setgdt(int sel, void *base, size_t limit,
   5.100 +    int type, int dpl, int def32, int gran)
   5.101 +{
   5.102 +	struct segment_descriptor sd;
   5.103 +	CPU_INFO_ITERATOR cii;
   5.104 +	struct cpu_info *ci;
   5.105 +
   5.106 +	if (type == SDT_SYS386TSS) {
   5.107 +		/* printk("XXX TSS descriptor not supported in GDT\n"); */
   5.108 +		return;
   5.109 +	}
   5.110 +
   5.111 +	setsegment(&sd, base, limit, type, dpl, def32, gran);
   5.112 +	for (CPU_INFO_FOREACH(cii, ci)) {
   5.113 +		if (ci->ci_gdt != NULL) {
   5.114 +#ifndef XEN
   5.115 +			ci->ci_gdt[sel].sd = sd;
   5.116 +#else
   5.117 +			xen_update_descriptor(&ci->ci_gdt[sel],
   5.118 +			    (union descriptor *)&sd);
   5.119 +#endif
   5.120 +		}
   5.121 +	}
   5.122 +}
   5.123 +
   5.124 +/*
   5.125 + * Initialize the GDT subsystem.  Called from autoconf().
   5.126 + */
   5.127 +void
   5.128 +gdt_init()
   5.129 +{
   5.130 +	size_t max_len, min_len;
   5.131 +	union descriptor *old_gdt;
   5.132 +	struct vm_page *pg;
   5.133 +	vaddr_t va;
   5.134 +	struct cpu_info *ci = &cpu_info_primary;
   5.135 +
   5.136 +	lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0);
   5.137 +
   5.138 +	max_len = MAXGDTSIZ * sizeof(gdt[0]);
   5.139 +	min_len = MINGDTSIZ * sizeof(gdt[0]);
   5.140 +
   5.141 +	gdt_size[0] = MINGDTSIZ;
   5.142 +	gdt_count[0] = NGDT;
   5.143 +	gdt_next[0] = NGDT;
   5.144 +	gdt_free[0] = GNULL_SEL;
   5.145 +
   5.146 +	gdt_size[1] = 0;
   5.147 +	gdt_count[1] = MAXGDTSIZ;
   5.148 +	gdt_next[1] = MAXGDTSIZ;
   5.149 +	gdt_free[1] = GNULL_SEL;
   5.150 +
   5.151 +	old_gdt = gdt;
   5.152 +	gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len);
   5.153 +	for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) {
   5.154 +		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
   5.155 +		if (pg == NULL) {
   5.156 +			panic("gdt_init: no pages");
   5.157 +		}
   5.158 +		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
   5.159 +		    VM_PROT_READ | VM_PROT_WRITE);
   5.160 +	}
   5.161 +	memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0]));
   5.162 +	ci->ci_gdt = gdt;
   5.163 +	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
   5.164 +	    SDT_MEMRWA, SEL_KPL, 1, 1);
   5.165 +
   5.166 +	gdt_init_cpu(ci);
   5.167 +}
   5.168 +
   5.169 +/*
   5.170 + * Allocate shadow GDT for a slave CPU.
   5.171 + */
   5.172 +void
   5.173 +gdt_alloc_cpu(struct cpu_info *ci)
   5.174 +{
   5.175 +	int max_len = MAXGDTSIZ * sizeof(gdt[0]);
   5.176 +	int min_len = MINGDTSIZ * sizeof(gdt[0]);
   5.177 +	struct vm_page *pg;
   5.178 +	vaddr_t va;
   5.179 +
   5.180 +	ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len);
   5.181 +	for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len;
   5.182 +	    va += PAGE_SIZE) {
   5.183 +		while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO))
   5.184 +		    == NULL) {
   5.185 +			uvm_wait("gdt_alloc_cpu");
   5.186 +		}
   5.187 +		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
   5.188 +		    VM_PROT_READ | VM_PROT_WRITE);
   5.189 +	}
   5.190 +	memset(ci->ci_gdt, 0, min_len);
   5.191 +	memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0]));
   5.192 +	setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
   5.193 +	    SDT_MEMRWA, SEL_KPL, 1, 1);
   5.194 +}
   5.195 +
   5.196 +
   5.197 +/*
   5.198 + * Load appropriate gdt descriptor; we better be running on *ci
   5.199 + * (for the most part, this is how a CPU knows who it is).
   5.200 + */
   5.201 +void
   5.202 +gdt_init_cpu(struct cpu_info *ci)
   5.203 +{
   5.204 +#ifndef XEN
   5.205 +	struct region_descriptor region;
   5.206 +	size_t max_len;
   5.207 +
   5.208 +	max_len = MAXGDTSIZ * sizeof(gdt[0]);
   5.209 +	setregion(&region, ci->ci_gdt, max_len - 1);
   5.210 +	lgdt(&region);
   5.211 +#else
   5.212 +	size_t len = gdt_size[0] * sizeof(gdt[0]);
   5.213 +	unsigned long frames[len >> PAGE_SHIFT];
   5.214 +	vaddr_t va;
   5.215 +	pt_entry_t *ptp;
   5.216 +	pt_entry_t *maptp;
   5.217 +	int f;
   5.218 +
   5.219 +	for (va = (vaddr_t)ci->ci_gdt, f = 0;
   5.220 +	     va < (vaddr_t)ci->ci_gdt + len;
   5.221 +	     va += PAGE_SIZE, f++) {
   5.222 +		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
   5.223 +		ptp = kvtopte(va);
   5.224 +		frames[f] = *ptp >> PAGE_SHIFT;
   5.225 +		maptp = (pt_entry_t *)vtomach((vaddr_t)ptp);
   5.226 +		PTE_CLEARBITS(ptp, maptp, PG_RW);
   5.227 +	}
   5.228 +	PTE_UPDATES_FLUSH();
   5.229 +	/* printk("loading gdt %x, %d entries, %d pages", */
   5.230 +	    /* frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */
   5.231 +	if (HYPERVISOR_set_gdt(frames, gdt_size[0]))
   5.232 +		panic("HYPERVISOR_set_gdt failed!\n");
   5.233 +	lgdt_finish();
   5.234 +#endif
   5.235 +}
   5.236 +
   5.237 +#ifdef MULTIPROCESSOR
   5.238 +
   5.239 +void
   5.240 +gdt_reload_cpu(struct cpu_info *ci)
   5.241 +{
   5.242 +	struct region_descriptor region;
   5.243 +	size_t max_len;
   5.244 +
   5.245 +	max_len = MAXGDTSIZ * sizeof(gdt[0]);
   5.246 +	setregion(&region, ci->ci_gdt, max_len - 1);
   5.247 +	lgdt(&region);
   5.248 +}
   5.249 +#endif
   5.250 +
   5.251 +
   5.252 +/*
   5.253 + * Grow the GDT.
   5.254 + */
   5.255 +void
   5.256 +gdt_grow(int which)
   5.257 +{
   5.258 +	size_t old_len, new_len, max_len;
   5.259 +	CPU_INFO_ITERATOR cii;
   5.260 +	struct cpu_info *ci;
   5.261 +	struct vm_page *pg;
   5.262 +	vaddr_t va;
   5.263 +
   5.264 +	old_len = gdt_size[which] * sizeof(gdt[0]);
   5.265 +	gdt_size[which] <<= 1;
   5.266 +	new_len = old_len << 1;
   5.267 +
   5.268 +	if (which != 0) {
   5.269 +		max_len = MAXGDTSIZ * sizeof(gdt[0]);
   5.270 +		if (old_len == 0) {
   5.271 +			gdt_size[which] = MINGDTSIZ;
   5.272 +			new_len = gdt_size[which] * sizeof(gdt[0]);
   5.273 +		}
   5.274 +		for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len;
   5.275 +		     va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len;
   5.276 +		     va += PAGE_SIZE) {
   5.277 +			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
   5.278 +			    NULL) {
   5.279 +				uvm_wait("gdt_grow");
   5.280 +			}
   5.281 +			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
   5.282 +			    VM_PROT_READ | VM_PROT_WRITE);
   5.283 +		}
   5.284 +		return;
   5.285 +	}
   5.286 +
   5.287 +	for (CPU_INFO_FOREACH(cii, ci)) {
   5.288 +		for (va = (vaddr_t)(ci->ci_gdt) + old_len;
   5.289 +		     va < (vaddr_t)(ci->ci_gdt) + new_len;
   5.290 +		     va += PAGE_SIZE) {
   5.291 +			while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
   5.292 +			    NULL) {
   5.293 +				uvm_wait("gdt_grow");
   5.294 +			}
   5.295 +			pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
   5.296 +			    VM_PROT_READ | VM_PROT_WRITE);
   5.297 +		}
   5.298 +	}
   5.299 +}
   5.300 +
   5.301 +/*
   5.302 + * Allocate a GDT slot as follows:
   5.303 + * 1) If there are entries on the free list, use those.
   5.304 + * 2) If there are fewer than gdt_size entries in use, there are free slots
   5.305 + *    near the end that we can sweep through.
   5.306 + * 3) As a last resort, we increase the size of the GDT, and sweep through
   5.307 + *    the new slots.
   5.308 + */
   5.309 +int
   5.310 +gdt_get_slot()
   5.311 +{
   5.312 +	return gdt_get_slot1(0);
   5.313 +}
   5.314 +
   5.315 +int
   5.316 +gdt_get_slot1(int which)
   5.317 +{
   5.318 +	size_t offset;
   5.319 +	int slot;
   5.320 +
   5.321 +	gdt_lock();
   5.322 +
   5.323 +	if (gdt_free[which] != GNULL_SEL) {
   5.324 +		slot = gdt_free[which];
   5.325 +		gdt_free[which] = gdt[slot].gd.gd_selector;
   5.326 +	} else {
   5.327 +		offset = which * MAXGDTSIZ * sizeof(gdt[0]);
   5.328 +		if (gdt_next[which] != gdt_count[which] + offset)
   5.329 +			panic("gdt_get_slot botch 1");
   5.330 +		if (gdt_next[which] - offset >= gdt_size[which]) {
   5.331 +			if (gdt_size[which] >= MAXGDTSIZ)
   5.332 +				panic("gdt_get_slot botch 2");
   5.333 +			gdt_grow(which);
   5.334 +		}
   5.335 +		slot = gdt_next[which]++;
   5.336 +	}
   5.337 +
   5.338 +	gdt_count[which]++;
   5.339 +	gdt_unlock();
   5.340 +	return (slot);
   5.341 +}
   5.342 +
   5.343 +/*
   5.344 + * Deallocate a GDT slot, putting it on the free list.
   5.345 + */
   5.346 +void
   5.347 +gdt_put_slot(int slot)
   5.348 +{
   5.349 +	gdt_put_slot1(slot, 0);
   5.350 +}
   5.351 +
   5.352 +void
   5.353 +gdt_put_slot1(int slot, int which)
   5.354 +{
   5.355 +
   5.356 +	gdt_lock();
   5.357 +	gdt_count[which]--;
   5.358 +
   5.359 +	gdt[slot].gd.gd_type = SDT_SYSNULL;
   5.360 +	gdt[slot].gd.gd_selector = gdt_free[which];
   5.361 +	gdt_free[which] = slot;
   5.362 +
   5.363 +	gdt_unlock();
   5.364 +}
   5.365 +
   5.366 +int
   5.367 +tss_alloc(struct pcb *pcb)
   5.368 +{
   5.369 +	int slot;
   5.370 +
   5.371 +	slot = gdt_get_slot();
   5.372 +	setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1,
   5.373 +	    SDT_SYS386TSS, SEL_KPL, 0, 0);
   5.374 +	return GSEL(slot, SEL_KPL);
   5.375 +}
   5.376 +
   5.377 +void
   5.378 +tss_free(int sel)
   5.379 +{
   5.380 +
   5.381 +	gdt_put_slot(IDXSEL(sel));
   5.382 +}
   5.383 +
   5.384 +/*
   5.385 + * Caller must have pmap locked for both of these functions.
   5.386 + */
   5.387 +void
   5.388 +ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len)
   5.389 +{
   5.390 +	int slot;
   5.391 +
   5.392 +	slot = gdt_get_slot1(1);
   5.393 +#ifndef XEN
   5.394 +	setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
   5.395 +#else
   5.396 +	cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt;
   5.397 +	cpu_info_primary.ci_gdt[slot].ld.ld_entries =
   5.398 +		len / sizeof(union descriptor);
   5.399 +#endif
   5.400 +	pmap->pm_ldt_sel = GSEL(slot, SEL_KPL);
   5.401 +}
   5.402 +
   5.403 +void
   5.404 +ldt_free(struct pmap *pmap)
   5.405 +{
   5.406 +	int slot;
   5.407 +
   5.408 +	slot = IDXSEL(pmap->pm_ldt_sel);
   5.409 +
   5.410 +	gdt_put_slot1(slot, 1);
   5.411 +}
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c	Mon Sep 06 19:04:16 2004 +0000
     6.3 @@ -0,0 +1,230 @@
     6.4 +/*	$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $	*/
     6.5 +
     6.6 +/*
     6.7 + *
     6.8 + * Copyright (c) 2004 Christian Limpach.
     6.9 + * All rights reserved.
    6.10 + *
    6.11 + * Redistribution and use in source and binary forms, with or without
    6.12 + * modification, are permitted provided that the following conditions
    6.13 + * are met:
    6.14 + * 1. Redistributions of source code must retain the above copyright
    6.15 + *    notice, this list of conditions and the following disclaimer.
    6.16 + * 2. Redistributions in binary form must reproduce the above copyright
    6.17 + *    notice, this list of conditions and the following disclaimer in the
    6.18 + *    documentation and/or other materials provided with the distribution.
    6.19 + * 3. All advertising materials mentioning features or use of this software
    6.20 + *    must display the following acknowledgement:
    6.21 + *      This product includes software developed by Christian Limpach.
    6.22 + * 4. The name of the author may not be used to endorse or promote products
    6.23 + *    derived from this software without specific prior written permission.
    6.24 + *
    6.25 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
    6.26 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
    6.27 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
    6.28 + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
    6.29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
    6.30 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    6.31 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    6.32 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    6.33 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
    6.34 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    6.35 + */
    6.36 +
    6.37 +/******************************************************************************
    6.38 + * hypervisor.c
    6.39 + * 
    6.40 + * Communication to/from hypervisor.
    6.41 + * 
    6.42 + * Copyright (c) 2002-2004, K A Fraser
    6.43 + * 
    6.44 + * Permission is hereby granted, free of charge, to any person obtaining a copy
    6.45 + * of this software and associated documentation files (the "Software"), to
    6.46 + * deal in the Software without restriction, including without limitation the
    6.47 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    6.48 + * sell copies of the Software, and to permit persons to whom the Software is
    6.49 + * furnished to do so, subject to the following conditions:
    6.50 + * 
    6.51 + * The above copyright notice and this permission notice shall be included in
    6.52 + * all copies or substantial portions of the Software.
    6.53 + * 
    6.54 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
    6.55 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
    6.56 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
    6.57 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
    6.58 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
    6.59 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
    6.60 + * DEALINGS IN THE SOFTWARE.
    6.61 + */
    6.62 +
    6.63 +
    6.64 +#include <sys/cdefs.h>
    6.65 +__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $");
    6.66 +
    6.67 +#include <sys/cdefs.h>
    6.68 +#include <sys/param.h>
    6.69 +#include <sys/systm.h>
    6.70 +
    6.71 +#include <machine/xen.h>
    6.72 +#include <machine/hypervisor.h>
    6.73 +#include <machine/evtchn.h>
    6.74 +
    6.75 +/*
    6.76 + * Force a proper event-channel callback from Xen after clearing the
    6.77 + * callback mask. We do this in a very simple manner, by making a call
    6.78 + * down into Xen. The pending flag will be checked by Xen on return.
    6.79 + */
    6.80 +void
    6.81 +hypervisor_force_callback(void)
    6.82 +{
    6.83 +
    6.84 +	(void)HYPERVISOR_xen_version(0);
    6.85 +}
    6.86 +
    6.87 +int stipending(void);
    6.88 +int
    6.89 +stipending()
    6.90 +{
    6.91 +	uint32_t l1;
    6.92 +	unsigned long l2;
    6.93 +	unsigned int l1i, l2i, port;
    6.94 +	int irq;
    6.95 +	shared_info_t *s = HYPERVISOR_shared_info;
    6.96 +	struct cpu_info *ci;
    6.97 +	int ret;
    6.98 +
    6.99 +	ret = 0;
   6.100 +	ci = curcpu();
   6.101 +
   6.102 +#if 0
   6.103 +	if (HYPERVISOR_shared_info->events)
   6.104 +		printf("stipending events %08lx mask %08lx ilevel %d\n",
   6.105 +		    HYPERVISOR_shared_info->events,
   6.106 +		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
   6.107 +#endif
   6.108 +
   6.109 +	/*
   6.110 +	 * we're only called after STIC, so we know that we'll have to
   6.111 +	 * STI at the end
   6.112 +	 */
   6.113 +	cli();
   6.114 +	while (s->vcpu_data[0].evtchn_upcall_pending) {
   6.115 +		s->vcpu_data[0].evtchn_upcall_pending = 0;
   6.116 +		/* NB. No need for a barrier here -- XCHG is a barrier
   6.117 +		 * on x86. */
   6.118 +		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
   6.119 +		while ((l1i = ffs(l1)) != 0) {
   6.120 +			l1i--;
   6.121 +			l1 &= ~(1 << l1i);
   6.122 +
   6.123 +			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
   6.124 +			while ((l2i = ffs(l2)) != 0) {
   6.125 +				l2i--;
   6.126 +				l2 &= ~(1 << l2i);
   6.127 +
   6.128 +				port = (l1i << 5) + l2i;
   6.129 +				if ((irq = evtchn_to_irq[port]) != -1) {
   6.130 +					hypervisor_acknowledge_irq(irq);
   6.131 +					ci->ci_ipending |= (1 << irq);
   6.132 +					if (ret == 0 && ci->ci_ilevel <
   6.133 +					    ci->ci_isources[irq]->is_handlers
   6.134 +					    ->ih_level)
   6.135 +						ret = 1;
   6.136 +				}
   6.137 +#if 0 /* XXXcl dev/evtchn */
   6.138 +				else
   6.139 +					evtchn_device_upcall(port);
   6.140 +#endif
   6.141 +			}
   6.142 +		}
   6.143 +	}
   6.144 +	sti();
   6.145 +
   6.146 +#if 0
   6.147 +	if (ci->ci_ipending & 0x1)
   6.148 +		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
   6.149 +		    HYPERVISOR_shared_info->events,
   6.150 +		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
   6.151 +		    ci->ci_ipending);
   6.152 +#endif
   6.153 +
   6.154 +	return (ret);
   6.155 +}
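
Both stipending() above and do_hypervisor_callback() below walk Xen's two-level pending bitmap: evtchn_pending_sel carries one bit per 32-bit word of evtchn_pending[], and a port number packs the word index into its upper bits and the bit index into its low five bits (port = (l1i << 5) + l2i above). A small illustrative sketch of the mapping in the other direction:

    /* Illustrative helpers only: how a port number maps back onto the
     * two-level bitmap scanned above (32 ports per selector bit). */
    static inline unsigned int port_to_word(unsigned int port) { return port >> 5; }
    static inline unsigned int port_to_bit(unsigned int port)  { return port & 31; }

    /* e.g. port 37: word 1, bit 5 -- delivered only if bit 1 of
     * evtchn_pending_sel and bit 5 of evtchn_pending[1] are both set
     * (and the port is not masked in evtchn_mask[1]). */
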
   6.156 +
   6.157 +void do_hypervisor_callback(struct trapframe *regs)
   6.158 +{
   6.159 +	uint32_t l1;
   6.160 +	unsigned long l2;
   6.161 +	unsigned int l1i, l2i, port;
   6.162 +	int irq;
   6.163 +	shared_info_t *s = HYPERVISOR_shared_info;
   6.164 +	struct cpu_info *ci;
   6.165 +	int level;
   6.166 +
   6.167 +	ci = curcpu();
   6.168 +	level = ci->ci_ilevel;
   6.169 +
   6.170 +	while (s->vcpu_data[0].evtchn_upcall_pending) {
   6.171 +		s->vcpu_data[0].evtchn_upcall_pending = 0;
   6.172 +		/* NB. No need for a barrier here -- XCHG is a barrier
   6.173 +		 * on x86. */
   6.174 +		l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
   6.175 +		while ((l1i = ffs(l1)) != 0) {
   6.176 +			l1i--;
   6.177 +			l1 &= ~(1 << l1i);
   6.178 +
   6.179 +			l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
   6.180 +			while ((l2i = ffs(l2)) != 0) {
   6.181 +				l2i--;
   6.182 +				l2 &= ~(1 << l2i);
   6.183 +
   6.184 +				port = (l1i << 5) + l2i;
   6.185 +				if ((irq = evtchn_to_irq[port]) != -1)
   6.186 +					do_event(irq, regs);
   6.187 +#if 0 /* XXXcl dev/evtchn */
   6.188 +				else
   6.189 +					evtchn_device_upcall(port);
   6.190 +#endif
   6.191 +			}
   6.192 +		}
   6.193 +	}
   6.194 +
   6.195 +#ifdef DIAGNOSTIC
   6.196 +	if (level != ci->ci_ilevel)
   6.197 +		printf("hypervisor done %08x level %d/%d ipending %08x\n",
   6.198 +		    HYPERVISOR_shared_info->evtchn_pending_sel, level,
   6.199 +		    ci->ci_ilevel, ci->ci_ipending);
   6.200 +#endif
   6.201 +}
   6.202 +
   6.203 +void hypervisor_unmask_event(unsigned int ev)
   6.204 +{
   6.205 +	shared_info_t *s = HYPERVISOR_shared_info;
   6.206 +
   6.207 +	x86_atomic_clear_bit(&s->evtchn_mask[0], ev);
   6.208 +	/*
   6.209 +	 * The following is basically the equivalent of
   6.210 +	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
   6.211 +	 * interrupt edge' if the channel is masked.
   6.212 +	 */
   6.213 +	if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) && 
   6.214 +	    !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) {
   6.215 +		s->vcpu_data[0].evtchn_upcall_pending = 1;
   6.216 +		if (!s->vcpu_data[0].evtchn_upcall_mask)
   6.217 +			hypervisor_force_callback();
   6.218 +	}
   6.219 +}
   6.220 +
   6.221 +void hypervisor_mask_event(unsigned int ev)
   6.222 +{
   6.223 +	shared_info_t *s = HYPERVISOR_shared_info;
   6.224 +
   6.225 +	x86_atomic_set_bit(&s->evtchn_mask[0], ev);
   6.226 +}
   6.227 +
   6.228 +void hypervisor_clear_event(unsigned int ev)
   6.229 +{
   6.230 +	shared_info_t *s = HYPERVISOR_shared_info;
   6.231 +
   6.232 +	x86_atomic_clear_bit(&s->evtchn_pending[0], ev);
   6.233 +}
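
Taken together, hypervisor_mask_event(), hypervisor_clear_event() and hypervisor_unmask_event() give the usual mask/clear/handle/unmask sequence for an event channel. A hypothetical caller (the handler name is illustrative and not part of this file):

    /* Hypothetical driver-side use of the helpers defined above. */
    void
    example_handle_event(unsigned int ev)
    {
            hypervisor_mask_event(ev);      /* suppress further upcalls */
            hypervisor_clear_event(ev);     /* acknowledge the pending bit */

            /* ... service whatever is attached to this port ... */

            hypervisor_unmask_event(ev);    /* may force a callback if the
                                               port became pending meanwhile */
    }
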
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S	Mon Sep 06 19:04:16 2004 +0000
     7.3 @@ -0,0 +1,2000 @@
     7.4 +/*	$NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $	*/
     7.5 +/*	NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp 	*/
     7.6 +
     7.7 +/*-
     7.8 + * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
     7.9 + * All rights reserved.
    7.10 + *
    7.11 + * This code is derived from software contributed to The NetBSD Foundation
    7.12 + * by Charles M. Hannum.
    7.13 + *
    7.14 + * Redistribution and use in source and binary forms, with or without
    7.15 + * modification, are permitted provided that the following conditions
    7.16 + * are met:
    7.17 + * 1. Redistributions of source code must retain the above copyright
    7.18 + *    notice, this list of conditions and the following disclaimer.
    7.19 + * 2. Redistributions in binary form must reproduce the above copyright
    7.20 + *    notice, this list of conditions and the following disclaimer in the
    7.21 + *    documentation and/or other materials provided with the distribution.
    7.22 + * 3. All advertising materials mentioning features or use of this software
    7.23 + *    must display the following acknowledgement:
    7.24 + *        This product includes software developed by the NetBSD
    7.25 + *        Foundation, Inc. and its contributors.
    7.26 + * 4. Neither the name of The NetBSD Foundation nor the names of its
    7.27 + *    contributors may be used to endorse or promote products derived
    7.28 + *    from this software without specific prior written permission.
    7.29 + *
    7.30 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
    7.31 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
    7.32 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    7.33 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
    7.34 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    7.35 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    7.36 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    7.37 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    7.38 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    7.39 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    7.40 + * POSSIBILITY OF SUCH DAMAGE.
    7.41 + */
    7.42 +
    7.43 +/*-
    7.44 + * Copyright (c) 1990 The Regents of the University of California.
    7.45 + * All rights reserved.
    7.46 + *
    7.47 + * This code is derived from software contributed to Berkeley by
    7.48 + * William Jolitz.
    7.49 + *
    7.50 + * Redistribution and use in source and binary forms, with or without
    7.51 + * modification, are permitted provided that the following conditions
    7.52 + * are met:
    7.53 + * 1. Redistributions of source code must retain the above copyright
    7.54 + *    notice, this list of conditions and the following disclaimer.
    7.55 + * 2. Redistributions in binary form must reproduce the above copyright
    7.56 + *    notice, this list of conditions and the following disclaimer in the
    7.57 + *    documentation and/or other materials provided with the distribution.
    7.58 + * 3. Neither the name of the University nor the names of its contributors
    7.59 + *    may be used to endorse or promote products derived from this software
    7.60 + *    without specific prior written permission.
    7.61 + *
    7.62 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
    7.63 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    7.64 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    7.65 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    7.66 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    7.67 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    7.68 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    7.69 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    7.70 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    7.71 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    7.72 + * SUCH DAMAGE.
    7.73 + *
    7.74 + *	@(#)locore.s	7.3 (Berkeley) 5/13/91
    7.75 + */
    7.76 +
    7.77 +#include "opt_compat_netbsd.h"
    7.78 +#include "opt_compat_oldboot.h"
    7.79 +#include "opt_cputype.h"
    7.80 +#include "opt_ddb.h"
    7.81 +#include "opt_ipkdb.h"
    7.82 +#include "opt_lockdebug.h"
    7.83 +#include "opt_multiprocessor.h"
    7.84 +#include "opt_realmem.h"
    7.85 +#include "opt_user_ldt.h"
    7.86 +#include "opt_vm86.h"
    7.87 +#include "opt_xen.h"
    7.88 +
    7.89 +#include "npx.h"
    7.90 +#include "assym.h"
    7.91 +#include "apm.h"
    7.92 +#include "lapic.h"
    7.93 +#include "ioapic.h"
    7.94 +#include "ksyms.h"
    7.95 +
    7.96 +#include <sys/errno.h>
    7.97 +#include <sys/syscall.h>
    7.98 +
    7.99 +#include <machine/cputypes.h>
   7.100 +#include <machine/param.h>
   7.101 +#include <machine/pte.h>
   7.102 +#include <machine/segments.h>
   7.103 +#include <machine/specialreg.h>
   7.104 +#include <machine/trap.h>
   7.105 +#include <machine/bootinfo.h>
   7.106 +
   7.107 +#if NLAPIC > 0
   7.108 +#include <machine/i82489reg.h>
   7.109 +#endif
   7.110 +
   7.111 +/* LINTSTUB: include <sys/types.h> */
   7.112 +/* LINTSTUB: include <machine/cpu.h> */
   7.113 +/* LINTSTUB: include <sys/systm.h> */
   7.114 +
   7.115 +#include <machine/asm.h>
   7.116 +
   7.117 +#if defined(MULTIPROCESSOR)
   7.118 +	
   7.119 +#define SET_CURLWP(lwp,cpu)				\
   7.120 +	movl	CPUVAR(SELF),cpu		; 	\
   7.121 +	movl	lwp,CPUVAR(CURLWP)	;	\
   7.122 +	movl	cpu,L_CPU(lwp)
   7.123 +	
   7.124 +#else
   7.125 +
   7.126 +#define SET_CURLWP(lwp,tcpu)		movl	lwp,CPUVAR(CURLWP)
   7.127 +#define GET_CURLWP(reg)			movl	CPUVAR(CURLWP),reg
   7.128 +
   7.129 +#endif
   7.130 +
   7.131 +#define GET_CURPCB(reg)			movl	CPUVAR(CURPCB),reg	
   7.132 +#define SET_CURPCB(reg)			movl	reg,CPUVAR(CURPCB)
   7.133 +
   7.134 +#define CLEAR_RESCHED(reg)		movl	reg,CPUVAR(RESCHED)
   7.135 +
   7.136 +/* XXX temporary kluge; these should not be here */
   7.137 +/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
   7.138 +#include <dev/isa/isareg.h>
   7.139 +
   7.140 +
   7.141 +/* Disallow old names for REALBASEMEM */
   7.142 +#ifdef BIOSBASEMEM
   7.143 +#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect
   7.144 +#endif
   7.145 +
   7.146 +/* Disallow old names for REALEXTMEM */
   7.147 +#ifdef EXTMEM_SIZE
   7.148 +#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
   7.149 +#endif
   7.150 +#ifdef BIOSEXTMEM
   7.151 +#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
   7.152 +#endif
   7.153 +
   7.154 +#include <machine/frameasm.h>
   7.155 +
   7.156 +
   7.157 +#ifdef MULTIPROCESSOR
   7.158 +#include <machine/i82489reg.h>
   7.159 +#endif
   7.160 +	
   7.161 +/*
    7.162 + * PTmap is the recursive pagemap at the top of the virtual address space.
   7.163 + * Within PTmap, the page directory can be found (third indirection).
   7.164 + *
   7.165 + * XXX 4 == sizeof pde
   7.166 + */
   7.167 +	.set	_C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT)
   7.168 +	.set	_C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE)
   7.169 +	.set	_C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4)
   7.170 +
   7.171 +/*
   7.172 + * APTmap, APTD is the alternate recursive pagemap.
   7.173 + * It's used when modifying another process's page tables.
   7.174 + *
   7.175 + * XXX 4 == sizeof pde
   7.176 + */
   7.177 +	.set	_C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT)
   7.178 +	.set	_C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE)
   7.179 +	.set	_C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4)
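
Because the page directory is installed as one of its own entries (the "third indirection" mentioned above), every PTE becomes reachable as ordinary memory: the PTE for a virtual address va lives at PTmap plus (va >> PGSHIFT) four-byte entries, which is exactly how the copy and fusu routines later index _C_LABEL(PTmap)(,%eax,4). A sketch of that arithmetic in C, mirroring the .set definitions (type and constant names assumed from the i386 pmap headers):

    /* Illustrative: the recursive-mapping arithmetic behind PTmap. */
    #define PTE_BASE ((pt_entry_t *)(PDSLOT_PTE << PDSHIFT))   /* == PTmap */

    static inline pt_entry_t *
    vtopte(vaddr_t va)
    {
            return &PTE_BASE[va >> PGSHIFT];    /* one 4-byte PTE per page */
    }
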
   7.180 +
   7.181 +
   7.182 +/*
   7.183 + * Xen guest identifier and loader selection
   7.184 + */
   7.185 +.section __xen_guest
   7.186 +	.asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic"
   7.187 +
   7.188 +
   7.189 +/*
   7.190 + * Initialization
   7.191 + */
   7.192 +	.data
   7.193 +
   7.194 +	.globl	_C_LABEL(cpu)
   7.195 +	.globl	_C_LABEL(esym),_C_LABEL(boothowto)
   7.196 +	.globl	_C_LABEL(bootinfo),_C_LABEL(atdevbase)
   7.197 +#ifdef COMPAT_OLDBOOT
   7.198 +	.globl	_C_LABEL(bootdev)
   7.199 +#endif
   7.200 +	.globl	_C_LABEL(proc0paddr),_C_LABEL(PTDpaddr)
   7.201 +	.globl	_C_LABEL(biosbasemem),_C_LABEL(biosextmem)
   7.202 +	.globl	_C_LABEL(gdt)
   7.203 +#ifdef I586_CPU
   7.204 +	.globl	_C_LABEL(idt)
   7.205 +#endif
   7.206 +	.globl	_C_LABEL(lapic_tpr)	
   7.207 +	
   7.208 +#if NLAPIC > 0
   7.209 +#ifdef __ELF__
   7.210 +	.align	PAGE_SIZE
   7.211 +#else
   7.212 +	.align	12
   7.213 +#endif
   7.214 +	.globl _C_LABEL(local_apic), _C_LABEL(lapic_id)
   7.215 +_C_LABEL(local_apic):
   7.216 +	.space	LAPIC_ID
   7.217 +_C_LABEL(lapic_id):	
   7.218 +	.long	0x00000000
   7.219 +	.space  LAPIC_TPRI-(LAPIC_ID+4)
   7.220 +_C_LABEL(lapic_tpr):		
   7.221 +	.space  LAPIC_PPRI-LAPIC_TPRI
   7.222 +_C_LABEL(lapic_ppr):		
   7.223 +	.space	LAPIC_ISR-LAPIC_PPRI
   7.224 +_C_LABEL(lapic_isr):
   7.225 +	.space	PAGE_SIZE-LAPIC_ISR
   7.226 +#else
   7.227 +_C_LABEL(lapic_tpr):	
   7.228 +	.long 0
   7.229 +#endif
   7.230 +	
   7.231 +
   7.232 +_C_LABEL(cpu):		.long	0	# are we 386, 386sx, or 486,
   7.233 +					#   or Pentium, or..
   7.234 +_C_LABEL(esym):		.long	0	# ptr to end of syms
   7.235 +_C_LABEL(atdevbase):	.long	0	# location of start of iomem in virtual
   7.236 +_C_LABEL(proc0paddr):	.long	0
   7.237 +_C_LABEL(PTDpaddr):	.long	0	# paddr of PTD, for libkvm
   7.238 +#ifndef REALBASEMEM
   7.239 +_C_LABEL(biosbasemem):	.long	0	# base memory reported by BIOS
   7.240 +#else
   7.241 +_C_LABEL(biosbasemem):	.long	REALBASEMEM
   7.242 +#endif
   7.243 +#ifndef REALEXTMEM
   7.244 +_C_LABEL(biosextmem):	.long	0	# extended memory reported by BIOS
   7.245 +#else
   7.246 +_C_LABEL(biosextmem):	.long	REALEXTMEM
   7.247 +#endif
   7.248 +
   7.249 +#include <machine/xen.h>
   7.250 +#define __HYPERVISOR_yield		   8
   7.251 +
   7.252 +	.space 512
   7.253 +tmpstk:
   7.254 +	.long tmpstk, __KERNEL_DS
   7.255 +
   7.256 +
   7.257 +#define	_RELOC(x)	((x))
   7.258 +#define	RELOC(x)	_RELOC(_C_LABEL(x))
   7.259 +
   7.260 +/* XXX assym.h */
   7.261 +#define MOD_START   48
   7.262 +#define MOD_LEN     56
   7.263 +/* XXX assym.h */
   7.264 +
   7.265 +	.text
   7.266 +	.globl	_C_LABEL(kernel_text)
   7.267 +	.set	_C_LABEL(kernel_text),KERNTEXTOFF
   7.268 +
   7.269 +	.globl	start
   7.270 +start:
   7.271 +	cld
   7.272 +
   7.273 +	lss	tmpstk,%esp		# bootstrap stack end location
   7.274 +
   7.275 +	movl	%esi,%ebx		# save start_info pointer
   7.276 +
   7.277 +#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE)
   7.278 +	/* Save the symbol locations. */
   7.279 +	movl	MOD_START(%ebx),%esi
   7.280 +	addl	MOD_LEN(%ebx),%esi
   7.281 +	movl	%esi,RELOC(esym)
   7.282 +#endif
   7.283 +
   7.284 +        /* Clear BSS first so that there are no surprises... */
   7.285 +	xorl	%eax,%eax
   7.286 +	movl	$RELOC(__bss_start),%edi
   7.287 +	movl	$RELOC(_end),%ecx
   7.288 +	subl	%edi,%ecx
   7.289 +	rep stosb
   7.290 +
   7.291 +	movl	%ebx,RELOC(avail_start)
   7.292 +
   7.293 +	/* Copy the necessary stuff from start_info structure. */
   7.294 +        /* We need to copy shared_info early, so that sti/cli work */
   7.295 +	movl	%ebx,%esi
   7.296 +	movl	$RELOC(start_info_union),%edi
   7.297 +	movl	$128,%ecx
   7.298 +	rep movsl
   7.299 +
   7.300 +    	/* (howto, [bootdev], bootinfo, basemem, extmem). */
   7.301 +	xorl	%eax,%eax
   7.302 +	movl	%eax,RELOC(boothowto)
   7.303 +#ifdef COMPAT_OLDBOOT
   7.304 +	movl	%eax,RELOC(bootdev)
   7.305 +#endif
   7.306 +	movl	$0x20000,%eax
   7.307 +	movl	%eax,RELOC(boothowto)
   7.308 +
   7.309 +	/* First, reset the PSL. */
   7.310 +	pushl	$PSL_MBO
   7.311 +	popfl
   7.312 +
   7.313 +	/* Clear segment registers; always null in proc0. */
   7.314 +	xorl	%eax,%eax
   7.315 +	movw	%ax,%fs
   7.316 +	movw	%ax,%gs
   7.317 +	decl	%eax
   7.318 +	movl	%eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
   7.319 +
   7.320 +	xorl	%eax,%eax
   7.321 +	cpuid
   7.322 +	movl	%eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
   7.323 +
   7.324 +/*
   7.325 + * Virtual address space of kernel:
   7.326 + *
   7.327 + * text | data | bss | [syms] | page dir | proc0 kstack 
   7.328 + *			      0          1       2      3
   7.329 + */
   7.330 +#define	PROC0PDIR	((0)              * PAGE_SIZE)
   7.331 +#define	PROC0STACK	((1)              * PAGE_SIZE)
   7.332 +#define	SYSMAP		((1+UPAGES)       * PAGE_SIZE)
   7.333 +#define	TABLESIZE	((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */
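
For concreteness, the bootstrap area laid out from here starts with the page directory, then proc0's kernel stack, then the kernel page-table pages, and finally (reserved just below) one page for the GDT; avail_start is then advanced past all of it. A worked layout, assuming UPAGES is 2 and nkpde is 8 purely for illustration:

    /*
     * Offset from the start of the bootstrap area (%esi):
     *   0x0000  page directory      (PROC0PDIR,  1 page)
     *   0x1000  proc0 kernel stack  (PROC0STACK, UPAGES pages)
     *   0x3000  kernel page tables  (SYSMAP,     nkpde pages)
     *   0xb000  GDT                 (1 page; avail_start points past it)
     */
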
   7.334 +
   7.335 +	/* Find end of kernel image. */
   7.336 +	movl	RELOC(avail_start),%edi
   7.337 +	/* Calculate where to start the bootstrap tables. */
   7.338 +	movl	%edi,%esi
   7.339 +
   7.340 +	/*
   7.341 +	 * Calculate the size of the kernel page table directory, and
   7.342 +	 * how many entries it will have.
   7.343 +	 */
   7.344 +	movl	RELOC(nkpde),%ecx		# get nkpde
   7.345 +	cmpl	$NKPTP_MIN,%ecx			# larger than min?
   7.346 +	jge	1f
   7.347 +	movl	$NKPTP_MIN,%ecx			# set at min
   7.348 +	jmp	2f
   7.349 +1:	cmpl	$NKPTP_MAX,%ecx			# larger than max?
   7.350 +	jle	2f
   7.351 +	movl	$NKPTP_MAX,%ecx
   7.352 +2:
   7.353 +
   7.354 +	/* Clear memory for bootstrap tables. */
   7.355 +	shll	$PGSHIFT,%ecx
   7.356 +	addl	$TABLESIZE,%ecx
   7.357 +	addl	%esi,%ecx			# end of tables
   7.358 +	movl	%ecx,RELOC(gdt)
   7.359 +	addl	$PAGE_SIZE,%ecx
   7.360 +	movl	%ecx,RELOC(avail_start)
   7.361 +	subl	%edi,%ecx			# size of tables
   7.362 +	shrl	$2,%ecx
   7.363 +	xorl	%eax,%eax
   7.364 +	cld
   7.365 +	rep
   7.366 +	stosl
   7.367 +
   7.368 +/*
   7.369 + * fillkpt
   7.370 + *	eax = pte (page frame | control | status)
   7.371 + *	ebx = page table address
   7.372 + *	ecx = number of pages to map
   7.373 + */
   7.374 +#define	fillkpt		\
   7.375 +1:	movl	%eax,(%ebx)	; \
   7.376 +	addl	$PAGE_SIZE,%eax	; /* increment physical address */ \
   7.377 +	addl	$4,%ebx		; /* next pte */ \
   7.378 +	loop	1b		;
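
In C terms, fillkpt is simply a loop that installs consecutive page-frame mappings, with the caller preloading the first PTE value, the table address and the page count. A rough equivalent (illustrative only):

    /* Rough C equivalent of the fillkpt macro above. */
    static void
    fillkpt(pt_entry_t pte, pt_entry_t *ptep, size_t npages)
    {
            while (npages-- > 0) {
                    *ptep++ = pte;      /* install the mapping */
                    pte += PAGE_SIZE;   /* next physical page, same flag bits */
            }
    }
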
   7.379 +
   7.380 +/*
   7.381 + * Build initial page tables.
   7.382 + */
   7.383 +	/* Calculate end of text segment, rounded to a page. */
   7.384 +	leal	(RELOC(etext)+PGOFSET),%edx
   7.385 +	andl	$~PGOFSET,%edx
   7.386 +	
   7.387 +	/* Skip over the first 1MB. */
   7.388 +	movl	$KERNTEXTOFF,%eax
   7.389 +	movl	%eax,%ecx
   7.390 +	subl	$KERNBASE_LOCORE,%ecx
   7.391 +	shrl	$PGSHIFT,%ecx
   7.392 +	leal	(SYSMAP)(%esi,%ecx,4),%ebx
   7.393 +
   7.394 +	/* Map the kernel text read-only. */
   7.395 +	movl	%edx,%ecx
   7.396 +	subl	%eax,%ecx
   7.397 +	shrl	$PGSHIFT,%ecx
   7.398 +	orl	$(PG_V|PG_KR),%eax
   7.399 +	fillkpt
   7.400 +
   7.401 +	/* Map the data, BSS, and bootstrap tables read-write. */
   7.402 +	movl	RELOC(avail_start),%ecx
   7.403 +						    # end of tables
   7.404 +	subl	%edx,%ecx				# subtract end of text
   7.405 +	shrl	$PGSHIFT,%ecx
   7.406 +	leal	(PG_V|PG_KW)(%edx),%eax
   7.407 +	fillkpt
   7.408 +
   7.409 +	movl	$0xffffffff,(%ebx)
   7.410 +	addl	$4,%ebx
   7.411 +
   7.412 +/*
   7.413 + * Construct a page table directory.
   7.414 + */
   7.415 +	/* Map kernel PDEs. */
   7.416 +	movl	RELOC(nkpde),%ecx			# for this many pde s,
   7.417 +	leal	(PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx	# kernel pde offset
   7.418 +	leal	(SYSMAP+PG_V|PG_KW)(%esi),%eax		# pte for KPT in proc 0,
   7.419 +	fillkpt
   7.420 +
   7.421 +	/* Install a PDE recursively mapping page directory as a page table! */
   7.422 +	leal	(PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax	# pte for ptd
   7.423 +	movl	%eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi)	# recursive PD slot
   7.424 +
   7.425 +	/* Save phys. addr of PTD, for libkvm. */
   7.426 +	movl	%esi,RELOC(PTDpaddr)
   7.427 +
   7.428 +    	call	xpmap_init
   7.429 +
   7.430 +	/* cr0 is 0x8005003b */
   7.431 +
   7.432 +	/* Relocate atdevbase. */
   7.433 +	movl	_C_LABEL(avail_start),%edx
   7.434 +	movl	%edx,_C_LABEL(HYPERVISOR_shared_info)
   7.435 +	addl	$PAGE_SIZE,%edx			# shared_inf
   7.436 +	movl	%edx,_C_LABEL(atdevbase)
   7.437 +
   7.438 +	/* Set up bootstrap stack. */
   7.439 +	leal	(PROC0STACK)(%esi),%eax
   7.440 +	movl	%eax,_C_LABEL(proc0paddr)
   7.441 +	leal	(USPACE-FRAMESIZE)(%eax),%esp
   7.442 +	subl	$KERNBASE_LOCORE,%esi
   7.443 +	movl	%esi,PCB_CR3(%eax)	# pcb->pcb_cr3
   7.444 +	xorl	%ebp,%ebp               # mark end of frames
   7.445 +
   7.446 +	movl	_C_LABEL(atdevbase),%eax
   7.447 +	pushl	%eax
   7.448 +	call	_C_LABEL(init386)	# wire 386 chip for unix operation
   7.449 +	addl	$4,%esp
   7.450 +
   7.451 +#ifdef SAFARI_FIFO_HACK
   7.452 +	movb	$5,%al
   7.453 +	movw	$0x37b,%dx
   7.454 +	outb	%al,%dx
   7.455 +	movw	$0x37f,%dx
   7.456 +	inb	%dx,%al
   7.457 +	movb	%al,%cl
   7.458 +
   7.459 +	orb	$1,%cl
   7.460 +
   7.461 +	movb	$5,%al
   7.462 +	movw	$0x37b,%dx
   7.463 +	outb	%al,%dx
   7.464 +	movw	$0x37f,%dx
   7.465 +	movb	%cl,%al
   7.466 +	outb	%al,%dx
   7.467 +#endif /* SAFARI_FIFO_HACK */
   7.468 +
   7.469 +	call 	_C_LABEL(main)
   7.470 +
   7.471 +/*
   7.472 + * void proc_trampoline(void);
   7.473 + * This is a trampoline function pushed onto the stack of a newly created
   7.474 + * process in order to do some additional setup.  The trampoline is entered by
   7.475 + * cpu_switch()ing to the process, so we abuse the callee-saved registers used
   7.476 + * by cpu_switch() to store the information about the stub to call.
   7.477 + * NOTE: This function does not have a normal calling sequence!
   7.478 + */
   7.479 +/* LINTSTUB: Func: void proc_trampoline(void) */
   7.480 +NENTRY(proc_trampoline)
   7.481 +#ifdef MULTIPROCESSOR
   7.482 +	call	_C_LABEL(proc_trampoline_mp)
   7.483 +#endif
   7.484 +	movl	$IPL_NONE,CPUVAR(ILEVEL)
   7.485 +	pushl	%ebx
   7.486 +	call	*%esi
   7.487 +	addl	$4,%esp
   7.488 +	DO_DEFERRED_SWITCH(%eax)
   7.489 +	INTRFASTEXIT
   7.490 +	/* NOTREACHED */
   7.491 +
   7.492 +/*****************************************************************************/
   7.493 +#ifdef COMPAT_16
   7.494 +/*
   7.495 + * Signal trampoline; copied to top of user stack.
   7.496 + */
   7.497 +/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */
   7.498 +NENTRY(sigcode)
   7.499 +	/*
   7.500 +	 * Handler has returned here as if we called it.  The sigcontext
   7.501 +	 * is on the stack after the 3 args "we" pushed.
   7.502 +	 */
   7.503 +	leal	12(%esp),%eax		# get pointer to sigcontext
   7.504 +	movl	%eax,4(%esp)		# put it in the argument slot
   7.505 +					# fake return address already there
   7.506 +	movl	$SYS_compat_16___sigreturn14,%eax
   7.507 +	int	$0x80	 		# enter kernel with args on stack
   7.508 +	movl	$SYS_exit,%eax
   7.509 +	int	$0x80			# exit if sigreturn fails
   7.510 +	.globl	_C_LABEL(esigcode)
   7.511 +_C_LABEL(esigcode):
   7.512 +#endif
   7.513 +
   7.514 +/*****************************************************************************/
   7.515 +
   7.516 +/*
   7.517 + * The following primitives are used to fill and copy regions of memory.
   7.518 + */
   7.519 +
   7.520 +/*
   7.521 + * XXX No section 9 man page for fillw.
   7.522 + * fillw seems to be very sparsely used (only in pccons it seems.)
   7.523 + * One wonders if it couldn't be done without.
   7.524 + * -- Perry Metzger, May 7, 2001
   7.525 + */
   7.526 +/*
   7.527 + * void fillw(short pattern, void *addr, size_t len);
   7.528 + * Write len copies of pattern at addr.
   7.529 + */
   7.530 +/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */
   7.531 +ENTRY(fillw)
   7.532 +	pushl	%edi
   7.533 +	movl	8(%esp),%eax
   7.534 +	movl	12(%esp),%edi
   7.535 +	movw	%ax,%cx
   7.536 +	rorl	$16,%eax
   7.537 +	movw	%cx,%ax
   7.538 +	cld
   7.539 +	movl	16(%esp),%ecx
   7.540 +	shrl	%ecx			# do longwords
   7.541 +	rep
   7.542 +	stosl
   7.543 +	movl	16(%esp),%ecx
   7.544 +	andl	$1,%ecx			# do remainder
   7.545 +	rep
   7.546 +	stosw
   7.547 +	popl	%edi
   7.548 +	ret
   7.549 +
   7.550 +/*
   7.551 + * int kcopy(const void *from, void *to, size_t len);
   7.552 + * Copy len bytes, abort on fault.
   7.553 + */
   7.554 +/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */
   7.555 +ENTRY(kcopy)
   7.556 +	pushl	%esi
   7.557 +	pushl	%edi
   7.558 +	GET_CURPCB(%eax)		# load curpcb into eax and set on-fault
   7.559 +	pushl	PCB_ONFAULT(%eax)
   7.560 +	movl	$_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax)
   7.561 +
   7.562 +	movl	16(%esp),%esi
   7.563 +	movl	20(%esp),%edi
   7.564 +	movl	24(%esp),%ecx
   7.565 +	movl	%edi,%eax
   7.566 +	subl	%esi,%eax
   7.567 +	cmpl	%ecx,%eax		# overlapping?
   7.568 +	jb	1f
   7.569 +	cld				# nope, copy forward
   7.570 +	shrl	$2,%ecx			# copy by 32-bit words
   7.571 +	rep
   7.572 +	movsl
   7.573 +	movl	24(%esp),%ecx
   7.574 +	andl	$3,%ecx			# any bytes left?
   7.575 +	rep
   7.576 +	movsb
   7.577 +
   7.578 +	GET_CURPCB(%edx)		# XXX save curpcb?
   7.579 +	popl	PCB_ONFAULT(%edx)
   7.580 +	popl	%edi
   7.581 +	popl	%esi
   7.582 +	xorl	%eax,%eax
   7.583 +	ret
   7.584 +
   7.585 +	ALIGN_TEXT
   7.586 +1:	addl	%ecx,%edi		# copy backward
   7.587 +	addl	%ecx,%esi
   7.588 +	std
   7.589 +	andl	$3,%ecx			# any fractional bytes?
   7.590 +	decl	%edi
   7.591 +	decl	%esi
   7.592 +	rep
   7.593 +	movsb
   7.594 +	movl	24(%esp),%ecx		# copy remainder by 32-bit words
   7.595 +	shrl	$2,%ecx
   7.596 +	subl	$3,%esi
   7.597 +	subl	$3,%edi
   7.598 +	rep
   7.599 +	movsl
   7.600 +	cld
   7.601 +
   7.602 +	GET_CURPCB(%edx)
   7.603 +	popl	PCB_ONFAULT(%edx)
   7.604 +	popl	%edi
   7.605 +	popl	%esi
   7.606 +	xorl	%eax,%eax
   7.607 +	ret
   7.608 +
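
The forward/backward decision in kcopy relies on unsigned wraparound: copying backward is required only when the destination starts inside the source region, i.e. 0 <= to - from < len; when to lies below from, the subtraction wraps to a huge unsigned value and the forward path is taken. The test, spelled out in C (illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    /* True when a simple ascending copy would overwrite source bytes
     * before they have been read (same test as kcopy above). */
    static int
    must_copy_backward(uintptr_t from, uintptr_t to, size_t len)
    {
            return (size_t)(to - from) < len;
    }
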
   7.609 +/*****************************************************************************/
   7.610 +
   7.611 +/*
   7.612 + * The following primitives are used to copy data in and out of the user's
   7.613 + * address space.
   7.614 + */
   7.615 +
   7.616 +/*
   7.617 + * Default to the lowest-common-denominator.  We will improve it
   7.618 + * later.
   7.619 + */
   7.620 +#if defined(I386_CPU)
   7.621 +#define	DEFAULT_COPYOUT		_C_LABEL(i386_copyout)
   7.622 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
   7.623 +#elif defined(I486_CPU)
   7.624 +#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)
   7.625 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)
   7.626 +#elif defined(I586_CPU)
   7.627 +#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
   7.628 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
   7.629 +#elif defined(I686_CPU)
   7.630 +#define	DEFAULT_COPYOUT		_C_LABEL(i486_copyout)	/* XXX */
   7.631 +#define	DEFAULT_COPYIN		_C_LABEL(i386_copyin)	/* XXX */
   7.632 +#endif
   7.633 +
   7.634 +	.data
   7.635 +
   7.636 +	.globl	_C_LABEL(copyout_func)
   7.637 +_C_LABEL(copyout_func):
   7.638 +	.long	DEFAULT_COPYOUT
   7.639 +
   7.640 +	.globl	_C_LABEL(copyin_func)
   7.641 +_C_LABEL(copyin_func):
   7.642 +	.long	DEFAULT_COPYIN
   7.643 +
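
copyout() and copyin() dispatch through these pointers so that one kernel can use the cheap routine on CPUs that enforce write protection in supervisor mode and the page-by-page checking routine on the 386. The same pattern in C (illustrative; the real selection happens during CPU identification, outside this file):

    typedef int (*copyfn_t)(const void *, void *, size_t);

    static int plain_copyout(const void *k, void *u, size_t len) { /* ... */ return 0; }

    copyfn_t copyout_func = plain_copyout;      /* overwritten at boot */

    int
    copyout(const void *kaddr, void *uaddr, size_t len)
    {
            return (*copyout_func)(kaddr, uaddr, len);
    }
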
   7.644 +	.text
   7.645 +
   7.646 +/*
   7.647 + * int copyout(const void *from, void *to, size_t len);
   7.648 + * Copy len bytes into the user's address space.
   7.649 + * see copyout(9)
   7.650 + */
   7.651 +/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */
   7.652 +ENTRY(copyout)
   7.653 +	DO_DEFERRED_SWITCH(%eax)
   7.654 +	jmp	*_C_LABEL(copyout_func)
   7.655 +
   7.656 +#if defined(I386_CPU)
   7.657 +/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */
   7.658 +ENTRY(i386_copyout)
   7.659 +	pushl	%esi
   7.660 +	pushl	%edi
   7.661 +	pushl	$0
   7.662 +	
   7.663 +	movl	16(%esp),%esi
   7.664 +	movl	20(%esp),%edi
   7.665 +	movl	24(%esp),%eax
   7.666 +
   7.667 +	/*
   7.668 +	 * We check that the end of the destination buffer is not past the end
   7.669 +	 * of the user's address space.  If it's not, then we only need to
   7.670 +	 * check that each page is writable.  The 486 will do this for us; the
   7.671 +	 * 386 will not.  (We assume that pages in user space that are not
   7.672 +	 * writable by the user are not writable by the kernel either.)
   7.673 +	 */
   7.674 +	movl	%edi,%edx
   7.675 +	addl	%eax,%edx
   7.676 +	jc	_C_LABEL(copy_efault)
   7.677 +	cmpl	$VM_MAXUSER_ADDRESS,%edx
   7.678 +	ja	_C_LABEL(copy_efault)
   7.679 +
   7.680 +	testl	%eax,%eax		# anything to do?
   7.681 +	jz	3f
   7.682 +
   7.683 +	/*
   7.684 +	 * We have to check each PTE for (write) permission, since the CPU
   7.685 +	 * doesn't do it for us.
   7.686 +	 */
   7.687 +
   7.688 +	/* Compute number of pages. */
   7.689 +	movl	%edi,%ecx
   7.690 +	andl	$PGOFSET,%ecx
   7.691 +	addl	%eax,%ecx
   7.692 +	decl	%ecx
   7.693 +	shrl	$PGSHIFT,%ecx
   7.694 +
   7.695 +	/* Compute PTE offset for start address. */
   7.696 +	shrl	$PGSHIFT,%edi
   7.697 +
   7.698 +	GET_CURPCB(%edx)
   7.699 +	movl	$2f,PCB_ONFAULT(%edx)
   7.700 +
   7.701 +1:	/* Check PTE for each page. */
   7.702 +	testb	$PG_RW,_C_LABEL(PTmap)(,%edi,4)
   7.703 +	jz	2f
   7.704 +	
   7.705 +4:	incl	%edi
   7.706 +	decl	%ecx
   7.707 +	jns	1b
   7.708 +
   7.709 +	movl	20(%esp),%edi
   7.710 +	movl	24(%esp),%eax
   7.711 +	jmp	3f
   7.712 +	
   7.713 +2:	/* Simulate a trap. */
   7.714 +	pushl	%ecx
   7.715 +	movl	%edi,%eax
   7.716 +	shll	$PGSHIFT,%eax
   7.717 +	pushl	%eax
   7.718 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
   7.719 +	addl	$4,%esp			# pop argument
   7.720 +	popl	%ecx
   7.721 +	testl	%eax,%eax		# if not ok, return EFAULT
   7.722 +	jz	4b
   7.723 +	jmp	_C_LABEL(copy_efault)
   7.724 +
   7.725 +3:	GET_CURPCB(%edx)
   7.726 +	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
   7.727 +
   7.728 +	/* bcopy(%esi, %edi, %eax); */
   7.729 +	cld
   7.730 +	movl	%eax,%ecx
   7.731 +	shrl	$2,%ecx
   7.732 +	rep
   7.733 +	movsl
   7.734 +	movl	%eax,%ecx
   7.735 +	andl	$3,%ecx
   7.736 +	rep
   7.737 +	movsb
   7.738 +
   7.739 +	popl	PCB_ONFAULT(%edx)
   7.740 +	popl	%edi
   7.741 +	popl	%esi
   7.742 +	xorl	%eax,%eax
   7.743 +	ret
   7.744 +#endif /* I386_CPU */
   7.745 +
   7.746 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
   7.747 +/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */
   7.748 +ENTRY(i486_copyout)
   7.749 +	pushl	%esi
   7.750 +	pushl	%edi
   7.751 +	pushl	$0
   7.752 +	
   7.753 +	movl	16(%esp),%esi
   7.754 +	movl	20(%esp),%edi
   7.755 +	movl	24(%esp),%eax
   7.756 +
   7.757 +	/*
   7.758 +	 * We check that the end of the destination buffer is not past the end
   7.759 +	 * of the user's address space.
   7.760 +	 */
   7.761 +	movl	%edi,%edx
   7.762 +	addl	%eax,%edx
   7.763 +	jc	_C_LABEL(copy_efault)
   7.764 +	cmpl	$VM_MAXUSER_ADDRESS,%edx
   7.765 +	ja	_C_LABEL(copy_efault)
   7.766 +
   7.767 +	GET_CURPCB(%edx)
   7.768 +	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
   7.769 +
   7.770 +	/* bcopy(%esi, %edi, %eax); */
   7.771 +	cld
   7.772 +	movl	%eax,%ecx
   7.773 +	shrl	$2,%ecx
   7.774 +	rep
   7.775 +	movsl
   7.776 +	movl	%eax,%ecx
   7.777 +	andl	$3,%ecx
   7.778 +	rep
   7.779 +	movsb
   7.780 +
   7.781 +	popl	PCB_ONFAULT(%edx)
   7.782 +	popl	%edi
   7.783 +	popl	%esi
   7.784 +	xorl	%eax,%eax
   7.785 +	ret
   7.786 +#endif /* I486_CPU || I586_CPU || I686_CPU */
   7.787 +
   7.788 +/*
   7.789 + * int copyin(const void *from, void *to, size_t len);
   7.790 + * Copy len bytes from the user's address space.
   7.791 + * see copyin(9)
   7.792 + */
   7.793 +/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */
   7.794 +ENTRY(copyin)
   7.795 +	DO_DEFERRED_SWITCH(%eax)
   7.796 +	jmp	*_C_LABEL(copyin_func)
   7.797 +
   7.798 +#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \
   7.799 +    defined(I686_CPU)
   7.800 +/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */
   7.801 +ENTRY(i386_copyin)
   7.802 +	pushl	%esi
   7.803 +	pushl	%edi
   7.804 +	GET_CURPCB(%eax)
   7.805 +	pushl	$0
   7.806 +	movl	$_C_LABEL(copy_fault),PCB_ONFAULT(%eax)
   7.807 +	
   7.808 +	movl	16(%esp),%esi
   7.809 +	movl	20(%esp),%edi
   7.810 +	movl	24(%esp),%eax
   7.811 +
   7.812 +	/*
   7.813 +	 * We check that the end of the destination buffer is not past the end
   7.814 +	 * of the user's address space.  If it's not, then we only need to
   7.815 +	 * check that each page is readable, and the CPU will do that for us.
   7.816 +	 */
   7.817 +	movl	%esi,%edx
   7.818 +	addl	%eax,%edx
   7.819 +	jc	_C_LABEL(copy_efault)
   7.820 +	cmpl	$VM_MAXUSER_ADDRESS,%edx
   7.821 +	ja	_C_LABEL(copy_efault)
   7.822 +
   7.823 +	/* bcopy(%esi, %edi, %eax); */
   7.824 +	cld
   7.825 +	movl	%eax,%ecx
   7.826 +	shrl	$2,%ecx
   7.827 +	rep
   7.828 +	movsl
   7.829 +	movl	%eax,%ecx
   7.830 +	andl	$3,%ecx
   7.831 +	rep
   7.832 +	movsb
   7.833 +
   7.834 +	GET_CURPCB(%edx)
   7.835 +	popl	PCB_ONFAULT(%edx)
   7.836 +	popl	%edi
   7.837 +	popl	%esi
   7.838 +	xorl	%eax,%eax
   7.839 +	ret
   7.840 +#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
   7.841 +
   7.842 +/* LINTSTUB: Ignore */
   7.843 +NENTRY(copy_efault)
   7.844 +	movl	$EFAULT,%eax
   7.845 +
   7.846 +/*
   7.847 + * kcopy_fault is used by kcopy and copy_fault is used by copyin/out.
   7.848 + *
   7.849 + * they're distinguished for lazy pmap switching.  see trap().
   7.850 + */
   7.851 +/* LINTSTUB: Ignore */
   7.852 +NENTRY(kcopy_fault)
   7.853 +	GET_CURPCB(%edx)
   7.854 +	popl	PCB_ONFAULT(%edx)
   7.855 +	popl	%edi
   7.856 +	popl	%esi
   7.857 +	ret
   7.858 +
   7.859 +/* LINTSTUB: Ignore */
   7.860 +NENTRY(copy_fault)
   7.861 +	GET_CURPCB(%edx)
   7.862 +	popl	PCB_ONFAULT(%edx)
   7.863 +	popl	%edi
   7.864 +	popl	%esi
   7.865 +	ret
   7.866 +
   7.867 +/*
   7.868 + * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
   7.869 + * Copy a NUL-terminated string, at most maxlen characters long, into the
   7.870 + * user's address space.  Return the number of characters copied (including the
   7.871 + * NUL) in *lencopied.  If the string is too long, return ENAMETOOLONG; else
   7.872 + * return 0 or EFAULT.
   7.873 + * see copyoutstr(9)
   7.874 + */
   7.875 +/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */
   7.876 +ENTRY(copyoutstr)
   7.877 +	pushl	%esi
   7.878 +	pushl	%edi
   7.879 +
   7.880 +	DO_DEFERRED_SWITCH(%eax)
   7.881 +
   7.882 +	movl	12(%esp),%esi		# esi = from
   7.883 +	movl	16(%esp),%edi		# edi = to
   7.884 +	movl	20(%esp),%edx		# edx = maxlen
   7.885 +
   7.886 +#if defined(I386_CPU)
   7.887 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
   7.888 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
   7.889 +	jne	5f
   7.890 +#endif /* I486_CPU || I586_CPU || I686_CPU */
   7.891 +
   7.892 +	/* Compute number of bytes in first page. */
   7.893 +	movl	%edi,%eax
   7.894 +	andl	$PGOFSET,%eax
   7.895 +	movl	$PAGE_SIZE,%ecx
   7.896 +	subl	%eax,%ecx		# ecx = PAGE_SIZE - (src % PAGE_SIZE)
   7.897 +
   7.898 +	GET_CURPCB(%eax)
   7.899 +	movl	$6f,PCB_ONFAULT(%eax)
   7.900 +
   7.901 +1:	/*
   7.902 +	 * Once per page, check that we are still within the bounds of user
   7.903 +	 * space, and check for a write fault.
   7.904 +	 */
   7.905 +	cmpl	$VM_MAXUSER_ADDRESS,%edi
   7.906 +	jae	_C_LABEL(copystr_efault)
   7.907 +
   7.908 +	/* Compute PTE offset. */
   7.909 +	movl	%edi,%eax
   7.910 +	shrl	$PGSHIFT,%eax		# calculate pte address
   7.911 +
   7.912 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
   7.913 +	jnz	2f
   7.914 +
   7.915 +6:	/* Simulate a trap. */
   7.916 +	pushl	%edx
   7.917 +	pushl	%edi
   7.918 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
   7.919 +	addl	$4,%esp			# clear argument from stack
   7.920 +	popl	%edx
   7.921 +	testl	%eax,%eax
   7.922 +	jnz	_C_LABEL(copystr_efault)
   7.923 +
   7.924 +2:	/* Copy up to end of this page. */
   7.925 +	subl	%ecx,%edx		# predecrement total count
   7.926 +	jnc	3f
   7.927 +	addl	%edx,%ecx		# ecx += (edx - ecx) = edx
   7.928 +	xorl	%edx,%edx
   7.929 +
   7.930 +3:	decl	%ecx
   7.931 +	js	4f
   7.932 +	lodsb
   7.933 +	stosb
   7.934 +	testb	%al,%al
   7.935 +	jnz	3b
   7.936 +
   7.937 +	/* Success -- 0 byte reached. */
   7.938 +	addl	%ecx,%edx		# add back residual for this page
   7.939 +	xorl	%eax,%eax
   7.940 +	jmp	copystr_return
   7.941 +
   7.942 +4:	/* Go to next page, if any. */
   7.943 +	movl	$PAGE_SIZE,%ecx
   7.944 +	testl	%edx,%edx
   7.945 +	jnz	1b
   7.946 +
   7.947 +	/* edx is zero -- return ENAMETOOLONG. */
   7.948 +	movl	$ENAMETOOLONG,%eax
   7.949 +	jmp	copystr_return
   7.950 +#endif /* I386_CPU */
   7.951 +
   7.952 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
   7.953 +5:	GET_CURPCB(%eax)
   7.954 +	movl	$_C_LABEL(copystr_fault),PCB_ONFAULT(%eax)
   7.955 +	/*
   7.956 +	 * Get min(%edx, VM_MAXUSER_ADDRESS-%edi).
   7.957 +	 */
   7.958 +	movl	$VM_MAXUSER_ADDRESS,%eax
   7.959 +	subl	%edi,%eax
   7.960 +	cmpl	%edx,%eax
   7.961 +	jae	1f
   7.962 +	movl	%eax,%edx
   7.963 +	movl	%eax,20(%esp)
   7.964 +
   7.965 +1:	incl	%edx
   7.966 +	cld
   7.967 +
   7.968 +1:	decl	%edx
   7.969 +	jz	2f
   7.970 +	lodsb
   7.971 +	stosb
   7.972 +	testb	%al,%al
   7.973 +	jnz	1b
   7.974 +
   7.975 +	/* Success -- 0 byte reached. */
   7.976 +	decl	%edx
   7.977 +	xorl	%eax,%eax
   7.978 +	jmp	copystr_return
   7.979 +
   7.980 +2:	/* edx is zero -- return EFAULT or ENAMETOOLONG. */
   7.981 +	cmpl	$VM_MAXUSER_ADDRESS,%edi
   7.982 +	jae	_C_LABEL(copystr_efault)
   7.983 +	movl	$ENAMETOOLONG,%eax
   7.984 +	jmp	copystr_return
   7.985 +#endif /* I486_CPU || I586_CPU || I686_CPU */
   7.986 +
   7.987 +/*
   7.988 + * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
   7.989 + * Copy a NUL-terminated string, at most maxlen characters long, from the
   7.990 + * user's address space.  Return the number of characters copied (including the
   7.991 + * NUL) in *lencopied.  If the string is too long, return ENAMETOOLONG; else
   7.992 + * return 0 or EFAULT.
   7.993 + * see copyinstr(9)
   7.994 + */
   7.995 +/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */
   7.996 +ENTRY(copyinstr)
   7.997 +	pushl	%esi
   7.998 +	pushl	%edi
   7.999 +
  7.1000 +	DO_DEFERRED_SWITCH(%eax)
  7.1001 +
  7.1002 +	GET_CURPCB(%ecx)
  7.1003 +	movl	$_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx)
  7.1004 +
  7.1005 +	movl	12(%esp),%esi		# %esi = from
  7.1006 +	movl	16(%esp),%edi		# %edi = to
  7.1007 +	movl	20(%esp),%edx		# %edx = maxlen
  7.1008 +
  7.1009 +	/*
  7.1010 +	 * Get min(%edx, VM_MAXUSER_ADDRESS-%esi).
  7.1011 +	 */
  7.1012 +	movl	$VM_MAXUSER_ADDRESS,%eax
  7.1013 +	subl	%esi,%eax
  7.1014 +	cmpl	%edx,%eax
  7.1015 +	jae	1f
  7.1016 +	movl	%eax,%edx
  7.1017 +	movl	%eax,20(%esp)
  7.1018 +
  7.1019 +1:	incl	%edx
  7.1020 +	cld
  7.1021 +
  7.1022 +1:	decl	%edx
  7.1023 +	jz	2f
  7.1024 +	lodsb
  7.1025 +	stosb
  7.1026 +	testb	%al,%al
  7.1027 +	jnz	1b
  7.1028 +
  7.1029 +	/* Success -- 0 byte reached. */
  7.1030 +	decl	%edx
  7.1031 +	xorl	%eax,%eax
  7.1032 +	jmp	copystr_return
  7.1033 +
  7.1034 +2:	/* edx is zero -- return EFAULT or ENAMETOOLONG. */
  7.1035 +	cmpl	$VM_MAXUSER_ADDRESS,%esi
  7.1036 +	jae	_C_LABEL(copystr_efault)
  7.1037 +	movl	$ENAMETOOLONG,%eax
  7.1038 +	jmp	copystr_return
  7.1039 +
  7.1040 +/* LINTSTUB: Ignore */
  7.1041 +NENTRY(copystr_efault)
  7.1042 +	movl	$EFAULT,%eax
  7.1043 +
  7.1044 +/* LINTSTUB: Ignore */
  7.1045 +NENTRY(copystr_fault)
  7.1046 +copystr_return:
  7.1047 +	/* Set *lencopied and return %eax. */
  7.1048 +	GET_CURPCB(%ecx)
  7.1049 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1050 +	movl	20(%esp),%ecx
  7.1051 +	subl	%edx,%ecx
  7.1052 +	movl	24(%esp),%edx
  7.1053 +	testl	%edx,%edx
  7.1054 +	jz	8f
  7.1055 +	movl	%ecx,(%edx)
  7.1056 +
  7.1057 +8:	popl	%edi
  7.1058 +	popl	%esi
  7.1059 +	ret
  7.1060 +
  7.1061 +/*
  7.1062 + * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied);
  7.1063 + * Copy a NUL-terminated string, at most maxlen characters long.  Return the
  7.1064 + * number of characters copied (including the NUL) in *lencopied.  If the
  7.1065 + * string is too long, return ENAMETOOLONG; else return 0.
  7.1066 + * see copystr(9)
  7.1067 + */
  7.1068 +/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */
  7.1069 +ENTRY(copystr)
  7.1070 +	pushl	%esi
  7.1071 +	pushl	%edi
  7.1072 +
  7.1073 +	movl	12(%esp),%esi		# esi = from
  7.1074 +	movl	16(%esp),%edi		# edi = to
  7.1075 +	movl	20(%esp),%edx		# edx = maxlen
  7.1076 +	incl	%edx
  7.1077 +	cld
  7.1078 +
  7.1079 +1:	decl	%edx
  7.1080 +	jz	4f
  7.1081 +	lodsb
  7.1082 +	stosb
  7.1083 +	testb	%al,%al
  7.1084 +	jnz	1b
  7.1085 +
  7.1086 +	/* Success -- 0 byte reached. */
  7.1087 +	decl	%edx
  7.1088 +	xorl	%eax,%eax
  7.1089 +	jmp	6f
  7.1090 +
  7.1091 +4:	/* edx is zero -- return ENAMETOOLONG. */
  7.1092 +	movl	$ENAMETOOLONG,%eax
  7.1093 +
  7.1094 +6:	/* Set *lencopied and return %eax. */
  7.1095 +	movl	20(%esp),%ecx
  7.1096 +	subl	%edx,%ecx
  7.1097 +	movl	24(%esp),%edx
  7.1098 +	testl	%edx,%edx
  7.1099 +	jz	7f
  7.1100 +	movl	%ecx,(%edx)
  7.1101 +
  7.1102 +7:	popl	%edi
  7.1103 +	popl	%esi
  7.1104 +	ret
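
All three string copiers share the contract spelled out in the comments above: 0 on success, ENAMETOOLONG if no NUL was found within maxlen bytes, EFAULT for a bad user address (copyinstr/copyoutstr only), and the byte count including the NUL stored through lencopied. A hypothetical caller:

    /* Hypothetical caller illustrating the copyinstr(9) contract;
     * "userpath" is an illustrative user-space pointer. */
    int
    example_fetch_path(const char *userpath, char *buf, size_t buflen)
    {
            size_t done;
            int error;

            error = copyinstr(userpath, buf, buflen, &done);
            if (error)
                    return error;   /* EFAULT or ENAMETOOLONG */
            /* success: 'done' bytes copied, including the NUL */
            return 0;
    }
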
  7.1105 +
  7.1106 +/*
  7.1107 + * long fuword(const void *uaddr);
  7.1108 + * Fetch an int from the user's address space.
  7.1109 + * see fuword(9)
  7.1110 + */
  7.1111 +/* LINTSTUB: Func: long fuword(const void *base) */
  7.1112 +ENTRY(fuword)
  7.1113 +	DO_DEFERRED_SWITCH(%eax)
  7.1114 +	movl	4(%esp),%edx
  7.1115 +	cmpl	$VM_MAXUSER_ADDRESS-4,%edx
  7.1116 +	ja	_C_LABEL(fusuaddrfault)
  7.1117 +	GET_CURPCB(%ecx)
  7.1118 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
  7.1119 +	movl	(%edx),%eax
  7.1120 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1121 +	ret
  7.1122 +	
  7.1123 +/*
  7.1124 + * int fusword(const void *uaddr);
  7.1125 + * Fetch a short from the user's address space.
  7.1126 + * see fusword(9)
  7.1127 + */
  7.1128 +/* LINTSTUB: Func: int fusword(const void *base) */
  7.1129 +ENTRY(fusword)
  7.1130 +	DO_DEFERRED_SWITCH(%eax)
  7.1131 +	movl	4(%esp),%edx
  7.1132 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
  7.1133 +	ja	_C_LABEL(fusuaddrfault)
  7.1134 +	GET_CURPCB(%ecx)
  7.1135 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
  7.1136 +	movzwl	(%edx),%eax
  7.1137 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1138 +	ret
  7.1139 +	
  7.1140 +/*
  7.1141 + * int fuswintr(const void *uaddr);
  7.1142 + * Fetch a short from the user's address space.  Can be called during an
  7.1143 + * interrupt.
  7.1144 + * see fuswintr(9)
  7.1145 + */
  7.1146 +/* LINTSTUB: Func: int fuswintr(const void *base) */
  7.1147 +ENTRY(fuswintr)
  7.1148 +	cmpl	$TLBSTATE_VALID, CPUVAR(TLBSTATE)
  7.1149 +	jnz	_C_LABEL(fusuaddrfault)
  7.1150 +	movl	4(%esp),%edx
  7.1151 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
  7.1152 +	ja	_C_LABEL(fusuaddrfault)
  7.1153 +	movl	CPUVAR(CURLWP),%ecx
  7.1154 +	movl	L_ADDR(%ecx),%ecx
  7.1155 +	movl	$_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
  7.1156 +	movzwl	(%edx),%eax
  7.1157 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1158 +	ret
  7.1159 +	
  7.1160 +/*
  7.1161 + * int fubyte(const void *uaddr);
  7.1162 + * Fetch a byte from the user's address space.
  7.1163 + * see fubyte(9)
  7.1164 + */
  7.1165 +/* LINTSTUB: Func: int fubyte(const void *base) */
  7.1166 +ENTRY(fubyte)
  7.1167 +	DO_DEFERRED_SWITCH(%eax)
  7.1168 +	movl	4(%esp),%edx
  7.1169 +	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
  7.1170 +	ja	_C_LABEL(fusuaddrfault)
  7.1171 +	GET_CURPCB(%ecx)
  7.1172 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
  7.1173 +	movzbl	(%edx),%eax
  7.1174 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1175 +	ret
  7.1176 +
  7.1177 +/*
  7.1178 + * Handle faults from [fs]u*().  Clean up and return -1.
  7.1179 + */
  7.1180 +/* LINTSTUB: Ignore */
  7.1181 +NENTRY(fusufault)
  7.1182 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1183 +	movl	$-1,%eax
  7.1184 +	ret
  7.1185 +
  7.1186 +/*
  7.1187 + * Handle faults from [fs]u*().  Clean up and return -1.  This differs from
  7.1188 + * fusufault() in that trap() will recognize it and return immediately rather
  7.1189 + * than trying to page fault.
  7.1190 + */
  7.1191 +/* LINTSTUB: Ignore */
  7.1192 +NENTRY(fusubail)
  7.1193 +	movl	$0,PCB_ONFAULT(%ecx)
  7.1194 +	movl	$-1,%eax
  7.1195 +	ret
  7.1196 +
  7.1197 +/*
   7.1198 + * Handle earlier faults from [fs]u*(), due to out-of-range addresses.
  7.1199 + */
  7.1200 +/* LINTSTUB: Ignore */
  7.1201 +NENTRY(fusuaddrfault)
  7.1202 +	movl	$-1,%eax
  7.1203 +	ret
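
All of the fetch/store primitives report failure as -1 via the fault handlers above, so a caller cannot distinguish a fault from a stored value of -1; code that needs that distinction should use copyin()/copyout() instead. A hypothetical caller:

    /* Hypothetical use of fuword()/suword(); the address is assumed to
     * be a user pointer already range-checked by the caller. */
    int
    example_set_user_flag(void *uaddr)
    {
            if (fuword(uaddr) == -1)
                    return EFAULT;      /* most likely a fault */
            if (suword(uaddr, 1) == -1)
                    return EFAULT;
            return 0;
    }
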
  7.1204 +
  7.1205 +/*
  7.1206 + * int suword(void *uaddr, long x);
  7.1207 + * Store an int in the user's address space.
  7.1208 + * see suword(9)
  7.1209 + */
  7.1210 +/* LINTSTUB: Func: int suword(void *base, long c) */
  7.1211 +ENTRY(suword)
  7.1212 +	DO_DEFERRED_SWITCH(%eax)
  7.1213 +	movl	4(%esp),%edx
  7.1214 +	cmpl	$VM_MAXUSER_ADDRESS-4,%edx
  7.1215 +	ja	_C_LABEL(fusuaddrfault)
  7.1216 +
  7.1217 +#if defined(I386_CPU)
  7.1218 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  7.1219 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
  7.1220 +	jne	2f
  7.1221 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  7.1222 +
  7.1223 +	GET_CURPCB(%eax)
  7.1224 +	movl	$3f,PCB_ONFAULT(%eax)
  7.1225 +
  7.1226 +	movl	%edx,%eax
  7.1227 +	shrl	$PGSHIFT,%eax		# calculate pte address
  7.1228 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
  7.1229 +	jnz	1f
  7.1230 +
  7.1231 +3:	/* Simulate a trap. */
  7.1232 +	pushl	%edx
  7.1233 +	pushl	%edx
  7.1234 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
  7.1235 +	addl	$4,%esp			# clear parameter from the stack
  7.1236 +	popl	%edx
  7.1237 +	GET_CURPCB(%ecx)
  7.1238 +	testl	%eax,%eax
  7.1239 +	jnz	_C_LABEL(fusufault)
  7.1240 +
  7.1241 +1:	/* XXX also need to check the following 3 bytes for validity! */
  7.1242 +#endif
  7.1243 +
  7.1244 +2:	GET_CURPCB(%ecx)
  7.1245 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
  7.1246 +
  7.1247 +	movl	8(%esp),%eax
  7.1248 +	movl	%eax,(%edx)
  7.1249 +	xorl	%eax,%eax
  7.1250 +	movl	%eax,PCB_ONFAULT(%ecx)
  7.1251 +	ret
  7.1252 +	
  7.1253 +/*
  7.1254 + * int susword(void *uaddr, short x);
  7.1255 + * Store a short in the user's address space.
  7.1256 + * see susword(9)
  7.1257 + */
  7.1258 +/* LINTSTUB: Func: int susword(void *base, short c) */
  7.1259 +ENTRY(susword)
  7.1260 +	DO_DEFERRED_SWITCH(%eax)
  7.1261 +	movl	4(%esp),%edx
  7.1262 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
  7.1263 +	ja	_C_LABEL(fusuaddrfault)
  7.1264 +
  7.1265 +#if defined(I386_CPU)
  7.1266 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  7.1267 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
  7.1268 +	jne	2f
  7.1269 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  7.1270 +
  7.1271 +	GET_CURPCB(%eax)
  7.1272 +	movl	$3f,PCB_ONFAULT(%eax)
  7.1273 +
  7.1274 +	movl	%edx,%eax
  7.1275 +	shrl	$PGSHIFT,%eax		# calculate pte address
  7.1276 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
  7.1277 +	jnz	1f
  7.1278 +
  7.1279 +3:	/* Simulate a trap. */
  7.1280 +	pushl	%edx
  7.1281 +	pushl	%edx
  7.1282 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
  7.1283 +	addl	$4,%esp			# clear parameter from the stack
  7.1284 +	popl	%edx
  7.1285 +	GET_CURPCB(%ecx)
  7.1286 +	testl	%eax,%eax
  7.1287 +	jnz	_C_LABEL(fusufault)
  7.1288 +
  7.1289 +1:	/* XXX also need to check the following byte for validity! */
  7.1290 +#endif
  7.1291 +
  7.1292 +2:	GET_CURPCB(%ecx)
  7.1293 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
  7.1294 +
  7.1295 +	movl	8(%esp),%eax
  7.1296 +	movw	%ax,(%edx)
  7.1297 +	xorl	%eax,%eax
  7.1298 +	movl	%eax,PCB_ONFAULT(%ecx)
  7.1299 +	ret
  7.1300 +
  7.1301 +/*
  7.1302 + * int suswintr(void *uaddr, short x);
  7.1303 + * Store a short in the user's address space.  Can be called during an
  7.1304 + * interrupt.
  7.1305 + * see suswintr(9)
  7.1306 + */
  7.1307 +/* LINTSTUB: Func: int suswintr(void *base, short c) */
  7.1308 +ENTRY(suswintr)
  7.1309 +	cmpl	$TLBSTATE_VALID, CPUVAR(TLBSTATE)
  7.1310 +	jnz	_C_LABEL(fusuaddrfault)
  7.1311 +	movl	4(%esp),%edx
  7.1312 +	cmpl	$VM_MAXUSER_ADDRESS-2,%edx
  7.1313 +	ja	_C_LABEL(fusuaddrfault)
  7.1314 +	movl	CPUVAR(CURLWP),%ecx
  7.1315 +	movl	L_ADDR(%ecx),%ecx
  7.1316 +	movl	$_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
  7.1317 +
  7.1318 +#if defined(I386_CPU)
  7.1319 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  7.1320 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
  7.1321 +	jne	2f
  7.1322 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  7.1323 +
  7.1324 +	movl	%edx,%eax
  7.1325 +	shrl	$PGSHIFT,%eax		# calculate pte address
  7.1326 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
  7.1327 +	jnz	1f
  7.1328 +
  7.1329 +	/* Simulate a trap. */
  7.1330 +	jmp	_C_LABEL(fusubail)
  7.1331 +
  7.1332 +1:	/* XXX also need to check the following byte for validity! */
  7.1333 +#endif
  7.1334 +
  7.1335 +2:	movl	8(%esp),%eax
  7.1336 +	movw	%ax,(%edx)
  7.1337 +	xorl	%eax,%eax
  7.1338 +	movl	%eax,PCB_ONFAULT(%ecx)
  7.1339 +	ret
  7.1340 +
  7.1341 +/*
  7.1342 + * int subyte(void *uaddr, char x);
  7.1343 + * Store a byte in the user's address space.
  7.1344 + * see subyte(9)
  7.1345 + */
  7.1346 +/* LINTSTUB: Func: int subyte(void *base, int c) */
  7.1347 +ENTRY(subyte)
  7.1348 +	DO_DEFERRED_SWITCH(%eax)
  7.1349 +	movl	4(%esp),%edx
  7.1350 +	cmpl	$VM_MAXUSER_ADDRESS-1,%edx
  7.1351 +	ja	_C_LABEL(fusuaddrfault)
  7.1352 +
  7.1353 +#if defined(I386_CPU)
  7.1354 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
  7.1355 +	cmpl	$CPUCLASS_386,_C_LABEL(cpu_class)
  7.1356 +	jne	2f
  7.1357 +#endif /* I486_CPU || I586_CPU || I686_CPU */
  7.1358 +
  7.1359 +	GET_CURPCB(%eax)	
  7.1360 +	movl	$3f,PCB_ONFAULT(%eax)
  7.1361 +
  7.1362 +	movl	%edx,%eax
  7.1363 +	shrl	$PGSHIFT,%eax		# calculate pte address
  7.1364 +	testb	$PG_RW,_C_LABEL(PTmap)(,%eax,4)
  7.1365 +	jnz	1f
  7.1366 +
  7.1367 +3:	/* Simulate a trap. */
  7.1368 +	pushl	%edx
  7.1369 +	pushl	%edx
  7.1370 +	call	_C_LABEL(trapwrite)	# trapwrite(addr)
  7.1371 +	addl	$4,%esp			# clear parameter from the stack
  7.1372 +	popl	%edx
  7.1373 +	GET_CURPCB(%ecx)
  7.1374 +	testl	%eax,%eax
  7.1375 +	jnz	_C_LABEL(fusufault)
  7.1376 +
  7.1377 +1:
  7.1378 +#endif
  7.1379 +
  7.1380 +2:	GET_CURPCB(%ecx)
  7.1381 +	movl	$_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
  7.1382 +
  7.1383 +	movb	8(%esp),%al
  7.1384 +	movb	%al,(%edx)
  7.1385 +	xorl	%eax,%eax
  7.1386 +	movl	%eax,PCB_ONFAULT(%ecx)
  7.1387 +	ret
  7.1388 +
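The susword()/subyte() entry points above follow the usual NetBSD fetch(9)/store(9) convention: 0 is returned on success, and the fusuaddrfault/fusufault paths return -1 when the target address lies beyond VM_MAXUSER_ADDRESS or faults. A minimal, hypothetical caller sketch (not part of this changeset; susword() is assumed to be declared in <sys/systm.h>):

/*
 * Hypothetical illustration only: store a 16-bit value into user space
 * and translate the fusu-style -1 failure into an errno value.
 */
#include <sys/param.h>
#include <sys/systm.h>		/* susword() prototype (assumption) */
#include <sys/errno.h>

static int
store_user_short(void *uaddr, short val)
{
	if (susword(uaddr, val) == -1)
		return (EFAULT);	/* out-of-range or unmapped user address */
	return (0);
}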
  7.1389 +/*****************************************************************************/
  7.1390 +
  7.1391 +/*
  7.1392 + * The following is i386-specific nonsense.
  7.1393 + */
  7.1394 +
  7.1395 +/*
  7.1396 + * void lgdt_finish(void);
   7.1397 + * Finish loading a new GDT pointer (do any necessary cleanup).
  7.1398 + * XXX It's somewhat questionable whether reloading all the segment registers
  7.1399 + * is necessary, since the actual descriptor data is not changed except by
  7.1400 + * process creation and exit, both of which clean up via task switches.  OTOH,
  7.1401 + * this only happens at run time when the GDT is resized.
  7.1402 + */
  7.1403 +/* LINTSTUB: Func: void lgdt_finish(void) */
  7.1404 +NENTRY(lgdt_finish)
  7.1405 +	movl	$GSEL(GDATA_SEL, SEL_KPL),%eax
  7.1406 +	movw	%ax,%ds
  7.1407 +	movw	%ax,%es
  7.1408 +	movw	%ax,%gs
  7.1409 +	movw	%ax,%ss
  7.1410 +	movl	$GSEL(GCPU_SEL, SEL_KPL),%eax
  7.1411 +	movw	%ax,%fs
  7.1412 +	/* Reload code selector by doing intersegment return. */
  7.1413 +	popl	%eax
  7.1414 +	pushl	$GSEL(GCODE_SEL, SEL_KPL)
  7.1415 +	pushl	%eax
  7.1416 +	lret
  7.1417 +
  7.1418 +/*****************************************************************************/
  7.1419 +
  7.1420 +/*
  7.1421 + * These functions are primarily used by DDB.
  7.1422 + */
  7.1423 +
  7.1424 +/* LINTSTUB: Func: int setjmp (label_t *l) */
  7.1425 +ENTRY(setjmp)
  7.1426 +	movl	4(%esp),%eax
  7.1427 +	movl	%ebx,(%eax)		# save ebx
  7.1428 +	movl	%esp,4(%eax)		# save esp
  7.1429 +	movl	%ebp,8(%eax)		# save ebp
  7.1430 +	movl	%esi,12(%eax)		# save esi
  7.1431 +	movl	%edi,16(%eax)		# save edi
  7.1432 +	movl	(%esp),%edx		# get rta
  7.1433 +	movl	%edx,20(%eax)		# save eip
  7.1434 +	xorl	%eax,%eax		# return (0);
  7.1435 +	ret
  7.1436 +
  7.1437 +/* LINTSTUB: Func: void longjmp (label_t *l) */
  7.1438 +ENTRY(longjmp)
  7.1439 +	movl	4(%esp),%eax
  7.1440 +	movl	(%eax),%ebx		# restore ebx
  7.1441 +	movl	4(%eax),%esp		# restore esp
  7.1442 +	movl	8(%eax),%ebp		# restore ebp
  7.1443 +	movl	12(%eax),%esi		# restore esi
  7.1444 +	movl	16(%eax),%edi		# restore edi
  7.1445 +	movl	20(%eax),%edx		# get rta
  7.1446 +	movl	%edx,(%esp)		# put in return frame
  7.1447 +	xorl	%eax,%eax		# return (1);
  7.1448 +	incl	%eax
  7.1449 +	ret
  7.1450 +
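As the register save/restore above makes explicit, the in-kernel setjmp() returns 0 on its initial call and returns 1 when re-entered through longjmp(); DDB uses this pair to recover from faults it provokes while examining memory. A hypothetical usage sketch (the names and the probed operation are illustrative only, not part of this changeset):

#include <sys/types.h>		/* label_t (assumption: pulled in from <machine/types.h>) */
#include <sys/systm.h>		/* printf() */

/* Prototypes as given by the LINTSTUB annotations above. */
int	setjmp(label_t *);
void	longjmp(label_t *);

extern void example_probe(void);	/* hypothetical access that may fault */

static label_t example_jmpbuf;

static void
example_guarded_access(void)
{
	if (setjmp(&example_jmpbuf) == 0) {
		/* Initial return (0): attempt the faultable operation. */
		example_probe();
	} else {
		/* Re-entered (return value 1) via longjmp() from a fault handler. */
		printf("probe faulted; recovered\n");
	}
}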
  7.1451 +/*****************************************************************************/
  7.1452 +
  7.1453 +	.globl	_C_LABEL(sched_whichqs),_C_LABEL(sched_qs)
  7.1454 +	.globl	_C_LABEL(uvmexp),_C_LABEL(panic)
  7.1455 +
  7.1456 +#ifdef DIAGNOSTIC
  7.1457 +NENTRY(switch_error)
  7.1458 +	pushl	$1f
  7.1459 +3:	call	_C_LABEL(panic)
  7.1460 +	/* NOTREACHED */
  7.1461 +1:	.asciz	"cpu_switch"
  7.1462 +#endif /* DIAGNOSTIC */
  7.1463 +
  7.1464 +/*
  7.1465 + * void cpu_switch(struct lwp *)
  7.1466 + * Find a runnable process and switch to it.  Wait if necessary.  If the new
  7.1467 + * process is the same as the old one, we short-circuit the context save and
  7.1468 + * restore.
  7.1469 + *	
  7.1470 + * Note that the stack frame layout is known to "struct switchframe"
  7.1471 + * in <machine/frame.h> and to the code in cpu_fork() which initializes 
  7.1472 + * it for a new lwp.
  7.1473 + */
  7.1474 +ENTRY(cpu_switch)
  7.1475 +	pushl	%ebx
  7.1476 +	pushl	%esi
  7.1477 +	pushl	%edi
  7.1478 +
  7.1479 +#ifdef DEBUG
  7.1480 +	cmpl	$IPL_SCHED,CPUVAR(ILEVEL)
  7.1481 +	jae	1f
  7.1482 +	pushl	$2f
  7.1483 +	call	_C_LABEL(panic)
  7.1484 +	/* NOTREACHED */
  7.1485 +2:	.asciz	"not splsched() in cpu_switch!"
  7.1486 +1:	
  7.1487 +#endif /* DEBUG */
  7.1488 +
  7.1489 +	movl	16(%esp),%esi		# current
  7.1490 +
  7.1491 +	/*
  7.1492 +	 * Clear curlwp so that we don't accumulate system time while idle.
  7.1493 +	 * This also insures that schedcpu() will move the old lwp to
  7.1494 +	 * the correct queue if it happens to get called from the spllower()
  7.1495 +	 * below and changes the priority.  (See corresponding comment in
  7.1496 +	 * userret()).
  7.1497 +	 */
  7.1498 +	movl	$0,CPUVAR(CURLWP)
  7.1499 +	/*
  7.1500 +	 * First phase: find new lwp.
  7.1501 +	 *
  7.1502 +	 * Registers:
  7.1503 +	 *   %eax - queue head, scratch, then zero
  7.1504 +	 *   %ebx - queue number
  7.1505 +	 *   %ecx - cached value of whichqs
  7.1506 +	 *   %edx - next lwp in queue
  7.1507 +	 *   %esi - old lwp
  7.1508 +	 *   %edi - new lwp
  7.1509 +	 */
  7.1510 +
  7.1511 +	/* Look for new lwp. */
  7.1512 +	CLI(%ecx)			# splhigh doesn't do a cli
  7.1513 +	movl	_C_LABEL(sched_whichqs),%ecx
  7.1514 +	bsfl	%ecx,%ebx		# find a full q
  7.1515 +	jnz	switch_dequeue
  7.1516 +
  7.1517 +	/*
  7.1518 +	 * idling:	save old context.
  7.1519 +	 *
  7.1520 +	 * Registers:
  7.1521 +	 *   %eax, %ecx - scratch
  7.1522 +	 *   %esi - old lwp, then old pcb
  7.1523 +	 *   %edi - idle pcb
  7.1524 +	 */
  7.1525 +
  7.1526 +	pushl	%esi
  7.1527 +	call	_C_LABEL(pmap_deactivate2)	# pmap_deactivate(oldproc)
  7.1528 +	addl	$4,%esp
  7.1529 +
  7.1530 +	movl	L_ADDR(%esi),%esi
  7.1531 +
  7.1532 +	/* Save stack pointers. */
  7.1533 +	movl	%esp,PCB_ESP(%esi)
  7.1534 +	movl	%ebp,PCB_EBP(%esi)
  7.1535 +
  7.1536 +	/* Find idle PCB for this CPU */
  7.1537 +#ifndef MULTIPROCESSOR
  7.1538 +	movl	$_C_LABEL(lwp0),%ebx
  7.1539 +	movl	L_ADDR(%ebx),%edi
  7.1540 +	movl	L_MD_TSS_SEL(%ebx),%edx
  7.1541 +#else
  7.1542 +	movl	CPUVAR(IDLE_PCB),%edi
  7.1543 +	movl	CPUVAR(IDLE_TSS_SEL),%edx
  7.1544 +#endif
  7.1545 +	movl	$0,CPUVAR(CURLWP)		/* In case we fault... */
  7.1546 +
  7.1547 +	/* Restore the idle context (avoid interrupts) */
  7.1548 +	CLI(%ecx)
  7.1549 +
  7.1550 +	/* Restore stack pointers. */
  7.1551 +	movl	PCB_ESP(%edi),%esp
  7.1552 +	movl	PCB_EBP(%edi),%ebp
  7.1553 +
  7.1554 +	pushl	%edi
  7.1555 +	call	_C_LABEL(i386_switch_context)
  7.1556 +	addl	$4,%esp
  7.1557 +
  7.1558 +	/* Record new pcb. */
  7.1559 +	SET_CURPCB(%edi)
  7.1560 +
  7.1561 +	xorl	%esi,%esi
  7.1562 +	STI(%eax)
  7.1563 +idle_unlock:	
  7.1564 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)	
  7.1565 +	call	_C_LABEL(sched_unlock_idle)
  7.1566 +#endif
  7.1567 +	/* Interrupts are okay again. */
  7.1568 +	pushl	$IPL_NONE		# spl0()
  7.1569 +	call	_C_LABEL(Xspllower)	# process pending interrupts
  7.1570 +	addl	$4,%esp
  7.1571 +	jmp	idle_start
  7.1572 +idle_zero:		
  7.1573 +	STIC(%eax)
  7.1574 +    	jz	4f
  7.1575 +	call	_C_LABEL(stipending)
  7.1576 +	testl	%eax,%eax
  7.1577 +	jz	4f
  7.1578 +	pushl	$IPL_NONE
  7.1579 +	call	_C_LABEL(Xspllower)
  7.1580 +	addl	$4,%esp
  7.1581 +4:
  7.1582 +	call	_C_LABEL(uvm_pageidlezero)
  7.1583 +	CLI(%eax)
  7.1584 +	cmpl	$0,_C_LABEL(sched_whichqs)
  7.1585 +	jnz	idle_exit
  7.1586 +idle_loop:
  7.1587 +	/* Try to zero some pages. */
  7.1588 +	movl	_C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx
  7.1589 +	testl	%ecx,%ecx
  7.1590 +	jnz	idle_zero
  7.1591 +	STIC(%eax)
  7.1592 +    	jz	4f
  7.1593 +	call	_C_LABEL(stipending)
  7.1594 +	testl	%eax,%eax
  7.1595 +	jz	4f
  7.1596 +	pushl	$IPL_NONE
  7.1597 +	call	_C_LABEL(Xspllower)
  7.1598 +	addl	$4,%esp
  7.1599 +	jmp	idle_start
  7.1600 +4:
  7.1601 +	movl	$__HYPERVISOR_yield,%eax
  7.1602 +	TRAP_INSTR
  7.1603 +NENTRY(mpidle)
  7.1604 +idle_start:	
  7.1605 +	CLI(%eax)
  7.1606 +	cmpl	$0,_C_LABEL(sched_whichqs)
  7.1607 +	jz	idle_loop
  7.1608 +idle_exit:	
  7.1609 +	movl	$IPL_HIGH,CPUVAR(ILEVEL)		# splhigh
  7.1610 +	STI(%eax)
  7.1611 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)	
  7.1612 +	call	_C_LABEL(sched_lock_idle)
  7.1613 +#endif
  7.1614 +	movl	_C_LABEL(sched_whichqs),%ecx
  7.1615 +	bsfl	%ecx,%ebx
  7.1616 +	jz	idle_unlock
  7.1617 +
  7.1618 +#ifdef XENDEBUG_LOW
  7.1619 +	pushl	%ecx
  7.1620 +	call	_C_LABEL(xen_dbg1)
  7.1621 +	xorl	%ecx,%ecx
  7.1622 +	movl	%ecx,_C_LABEL(xen_once)
  7.1623 +	popl	%ecx
  7.1624 +#endif
  7.1625 +switch_dequeue:		
  7.1626 +	/* 
  7.1627 +	 * we're running at splhigh(), but it's otherwise okay to take
  7.1628 +	 * interrupts here. 
  7.1629 +	 */
  7.1630 +	STI(%edi)
  7.1631 +	leal	_C_LABEL(sched_qs)(,%ebx,8),%eax # select q
  7.1632 +
  7.1633 +	movl	L_FORW(%eax),%edi	# unlink from front of process q
  7.1634 +#ifdef	DIAGNOSTIC
  7.1635 +	cmpl	%edi,%eax		# linked to self (i.e. nothing queued)?
  7.1636 +	je	_C_LABEL(switch_error)	# not possible
  7.1637 +#endif /* DIAGNOSTIC */
  7.1638 +	movl	L_FORW(%edi),%edx
  7.1639 +	movl	%edx,L_FORW(%eax)
  7.1640 +	movl	%eax,L_BACK(%edx)
  7.1641 +
  7.1642 +	cmpl	%edx,%eax		# q empty?
  7.1643 +	jne	3f
  7.1644 +
  7.1645 +	btrl	%ebx,%ecx		# yes, clear to indicate empty
  7.1646 +	movl	%ecx,_C_LABEL(sched_whichqs) # update q status
  7.1647 +
  7.1648 +3:	/* We just did it. */
  7.1649 +	xorl	%eax,%eax
  7.1650 +	CLEAR_RESCHED(%eax)
  7.1651 +
  7.1652 +switch_resume:
  7.1653 +#ifdef	DIAGNOSTIC
  7.1654 +	cmpl	%eax,L_WCHAN(%edi)	# Waiting for something?
  7.1655 +	jne	_C_LABEL(switch_error)	# Yes; shouldn't be queued.
  7.1656 +	cmpb	$LSRUN,L_STAT(%edi)	# In run state?
  7.1657 +	jne	_C_LABEL(switch_error)	# No; shouldn't be queued.
  7.1658 +#endif /* DIAGNOSTIC */
  7.1659 +
  7.1660 +	/* Isolate lwp.  XXX Is this necessary? */
  7.1661 +	movl	%eax,L_BACK(%edi)
  7.1662 +
  7.1663 +	/* Record new lwp. */
  7.1664 +	movb	$LSONPROC,L_STAT(%edi)	# l->l_stat = LSONPROC
  7.1665 +	SET_CURLWP(%edi,%ecx)
  7.1666 +
  7.1667 +	/* Skip context switch if same lwp. */
  7.1668 +	xorl	%ebx,%ebx
  7.1669 +	cmpl	%edi,%esi
  7.1670 +	je	switch_return
  7.1671 +
  7.1672 +	/* If old lwp exited, don't bother. */
  7.1673 +	testl	%esi,%esi
  7.1674 +	jz	switch_exited
  7.1675 +
  7.1676 +	/*
  7.1677 +	 * Second phase: save old context.
  7.1678 +	 *
  7.1679 +	 * Registers:
  7.1680 +	 *   %eax, %ecx - scratch
  7.1681 +	 *   %esi - old lwp, then old pcb
  7.1682 +	 *   %edi - new lwp
  7.1683 +	 */
  7.1684 +
  7.1685 +	pushl	%esi
  7.1686 +	call	_C_LABEL(pmap_deactivate2)	# pmap_deactivate(oldproc)
  7.1687 +	addl	$4,%esp
  7.1688 +
  7.1689 +	movl	L_ADDR(%esi),%esi
  7.1690 +
  7.1691 +	/* Save stack pointers. */
  7.1692 +	movl	%esp,PCB_ESP(%esi)
  7.1693 +	movl	%ebp,PCB_EBP(%esi)
  7.1694 +
  7.1695 +switch_exited:
  7.1696 +	/*
  7.1697 +	 * Third phase: restore saved context.
  7.1698 +	 *
  7.1699 +	 * Registers:
  7.1700 +	 *   %eax, %ebx, %ecx, %edx - scratch
  7.1701 +	 *   %esi - new pcb
  7.1702 +	 *   %edi - new lwp
  7.1703 +	 */
  7.1704 +
  7.1705 +	/* No interrupts while loading new state. */
  7.1706 +	CLI(%eax)
  7.1707 +	movl	L_ADDR(%edi),%esi
  7.1708 +
  7.1709 +	/* Restore stack pointers. */
  7.1710 +	movl	PCB_ESP(%esi),%esp
  7.1711 +	movl	PCB_EBP(%esi),%ebp
  7.1712 +
  7.1713 +#if 0
  7.1714 +	/* Don't bother with the rest if switching to a system process. */
  7.1715 +	testl	$P_SYSTEM,L_FLAG(%edi);	XXX NJWLWP lwp's don't have P_SYSTEM!
  7.1716 +	jnz	switch_restored	; XXX skip stack_switch+pmap_activate
  7.1717 +#endif
  7.1718 +
  7.1719 +	pushl	%edi
  7.1720 +	call	_C_LABEL(pmap_activate)		# pmap_activate(p)
  7.1721 +	addl	$4,%esp
  7.1722 +
  7.1723 +	pushl	%esi
  7.1724 +	call	_C_LABEL(i386_switch_context)
  7.1725 +	addl	$4,%esp
  7.1726 +
  7.1727 +	/* Record new pcb. */
  7.1728 +	SET_CURPCB(%esi)
  7.1729 +
  7.1730 +	/* Interrupts are okay again. */
  7.1731 +	STI(%edi)
  7.1732 +
  7.1733 +/*
  7.1734 + *  Check for restartable atomic sequences (RAS)
  7.1735 + */
  7.1736 +	movl	CPUVAR(CURLWP),%edi
  7.1737 +	movl	L_PROC(%edi),%esi
  7.1738 +	cmpl	$0,P_RASLIST(%esi)
  7.1739 +	jne	2f
  7.1740 +1:
  7.1741 +	movl	$1,%ebx
  7.1742 +
  7.1743 +switch_return:
  7.1744 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)     
  7.1745 +	call    _C_LABEL(sched_unlock_idle)
  7.1746 +#endif
  7.1747 +	pushl	$IPL_NONE		# spl0()
  7.1748 +	call	_C_LABEL(Xspllower)	# process pending interrupts
  7.1749 +	addl	$4,%esp
  7.1750 +	movl	$IPL_HIGH,CPUVAR(ILEVEL)	# splhigh()
  7.1751 +
  7.1752 +	movl	%ebx,%eax
  7.1753 +
  7.1754 +	popl	%edi
  7.1755 +	popl	%esi
  7.1756 +	popl	%ebx
  7.1757 +	ret
  7.1758 +
  7.1759 +2:					# check RAS list
  7.1760 +	movl	L_MD_REGS(%edi),%ebx
  7.1761 +	movl	TF_EIP(%ebx),%eax
  7.1762 +	pushl	%eax
  7.1763 +	pushl	%esi
  7.1764 +	call	_C_LABEL(ras_lookup)
  7.1765 +	addl	$8,%esp
  7.1766 +	cmpl	$-1,%eax
  7.1767 +	je	1b
  7.1768 +	movl	%eax,TF_EIP(%ebx)
  7.1769 +	jmp	1b
  7.1770 +
  7.1771 +/*
  7.1772 + * void cpu_switchto(struct lwp *current, struct lwp *next)
  7.1773 + * Switch to the specified next LWP.
  7.1774 + */
  7.1775 +ENTRY(cpu_switchto)
  7.1776 +	pushl	%ebx
  7.1777 +	pushl	%esi
  7.1778 +	pushl	%edi
  7.1779 +
  7.1780 +#ifdef DEBUG
  7.1781 +	cmpl	$IPL_SCHED,CPUVAR(ILEVEL)
  7.1782 +	jae	1f
  7.1783 +	pushl	$2f
  7.1784 +	call	_C_LABEL(panic)
  7.1785 +	/* NOTREACHED */
  7.1786 +2:	.asciz	"not splsched() in cpu_switchto!"
  7.1787 +1:
  7.1788 +#endif /* DEBUG */
  7.1789 +
  7.1790 +	movl	16(%esp),%esi		# current
  7.1791 +	movl	20(%esp),%edi		# next
  7.1792 +
  7.1793 +	/*
  7.1794 +	 * Clear curlwp so that we don't accumulate system time while idle.
  7.1795 +	 * This also insures that schedcpu() will move the old process to
  7.1796 +	 * the correct queue if it happens to get called from the spllower()
  7.1797 +	 * below and changes the priority.  (See corresponding comment in
   7.1798 +	 * userret()).
  7.1799 +	 *
  7.1800 +	 * XXX Is this necessary?  We know we won't go idle.
  7.1801 +	 */
  7.1802 +	movl	$0,CPUVAR(CURLWP)
  7.1803 +
  7.1804 +	/*
  7.1805 +	 * We're running at splhigh(), but it's otherwise okay to take
  7.1806 +	 * interrupts here.
  7.1807 +	 */
  7.1808 +	STI(%eax)
  7.1809 +
  7.1810 +	/* Jump into the middle of cpu_switch */
  7.1811 +	xorl	%eax,%eax
  7.1812 +	jmp	switch_resume
  7.1813 +
  7.1814 +/*
  7.1815 + * void cpu_exit(struct lwp *l)
  7.1816 + * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's 
  7.1817 + * if multiprocessor) and deallocate the address space and kernel stack for p. 
  7.1818 + * Then jump into cpu_switch(), as if we were in the idle proc all along.
  7.1819 + */
  7.1820 +#ifndef MULTIPROCESSOR
  7.1821 +	.globl	_C_LABEL(lwp0)
  7.1822 +#endif
  7.1823 +	.globl  _C_LABEL(uvmspace_free),_C_LABEL(kernel_map)
  7.1824 +	.globl	_C_LABEL(uvm_km_free),_C_LABEL(tss_free)
  7.1825 +/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */
  7.1826 +ENTRY(cpu_exit)
  7.1827 +	movl	4(%esp),%edi		# old process
  7.1828 +#ifndef MULTIPROCESSOR
  7.1829 +	movl	$_C_LABEL(lwp0),%ebx
  7.1830 +	movl	L_ADDR(%ebx),%esi
  7.1831 +	movl	L_MD_TSS_SEL(%ebx),%edx
  7.1832 +#else
  7.1833 +	movl	CPUVAR(IDLE_PCB),%esi
  7.1834 +	movl	CPUVAR(IDLE_TSS_SEL),%edx
  7.1835 +#endif
  7.1836 +	/* In case we fault... */
  7.1837 +	movl	$0,CPUVAR(CURLWP)
  7.1838 +
  7.1839 +	/* Restore the idle context. */
  7.1840 +	CLI(%eax)
  7.1841 +
  7.1842 +	/* Restore stack pointers. */
  7.1843 +	movl	PCB_ESP(%esi),%esp
  7.1844 +	movl	PCB_EBP(%esi),%ebp
  7.1845 +
  7.1846 +	pushl	%esi
  7.1847 +	call	_C_LABEL(i386_switch_context)
  7.1848 +	addl	$4,%esp
  7.1849 +
  7.1850 +	/* Record new pcb. */
  7.1851 +	SET_CURPCB(%esi)
  7.1852 +
  7.1853 +	/* Interrupts are okay again. */
  7.1854 +	STI(%eax)
  7.1855 +
  7.1856 +	/*
  7.1857 +	 * Schedule the dead LWP's stack to be freed.
  7.1858 +	 */
  7.1859 +	pushl	%edi
  7.1860 +	call	_C_LABEL(lwp_exit2)
  7.1861 +	addl	$4,%esp
  7.1862 +
  7.1863 +	/* Jump into cpu_switch() with the right state. */
  7.1864 +	xorl	%esi,%esi
  7.1865 +	movl	%esi,CPUVAR(CURLWP)
  7.1866 +	jmp	idle_start
  7.1867 +
  7.1868 +/*
  7.1869 + * void savectx(struct pcb *pcb);
  7.1870 + * Update pcb, saving current processor state.
  7.1871 + */
  7.1872 +/* LINTSTUB: Func: void savectx(struct pcb *pcb) */
  7.1873 +ENTRY(savectx)
  7.1874 +	movl	4(%esp),%edx		# edx = p->p_addr
  7.1875 +  
  7.1876 +	/* Save stack pointers. */
  7.1877 +	movl	%esp,PCB_ESP(%edx)
  7.1878 +	movl	%ebp,PCB_EBP(%edx)
  7.1879 +
  7.1880 +	ret
  7.1881 +
  7.1882 +/*
  7.1883 + * Old call gate entry for syscall
  7.1884 + */
  7.1885 +/* LINTSTUB: Var: char Xosyscall[1]; */
  7.1886 +IDTVEC(osyscall)
  7.1887 +	/* Set eflags in trap frame. */
  7.1888 +	pushfl
  7.1889 +	popl	8(%esp)
  7.1890 +	pushl	$7		# size of instruction for restart
  7.1891 +	jmp	syscall1
  7.1892 +
  7.1893 +/*
  7.1894 + * Trap gate entry for syscall
  7.1895 + */
  7.1896 +/* LINTSTUB: Var: char Xsyscall[1]; */
  7.1897 +IDTVEC(syscall)
  7.1898 +	pushl	$2		# size of instruction for restart
  7.1899 +syscall1:
  7.1900 +	pushl	$T_ASTFLT	# trap # for doing ASTs
  7.1901 +	INTRENTRY
  7.1902 +
  7.1903 +#ifdef DIAGNOSTIC
  7.1904 +	cmpl    $0, CPUVAR(WANT_PMAPLOAD)
  7.1905 +	jz	1f
  7.1906 +	pushl	$6f
  7.1907 +	call	_C_LABEL(printf)
  7.1908 +	addl	$4, %esp
  7.1909 +1:
  7.1910 +	movl	CPUVAR(ILEVEL),%ebx
  7.1911 +	testl	%ebx,%ebx
  7.1912 +	jz	1f
  7.1913 +	pushl	$5f
  7.1914 +	call	_C_LABEL(printf)
  7.1915 +	addl	$4,%esp
  7.1916 +#ifdef DDB
  7.1917 +	int	$3
  7.1918 +#endif
  7.1919 +1:	
  7.1920 +#endif /* DIAGNOSTIC */
  7.1921 +	movl	CPUVAR(CURLWP),%edx
  7.1922 +	movl	%esp,L_MD_REGS(%edx)	# save pointer to frame
  7.1923 +	movl	L_PROC(%edx),%edx
  7.1924 +	pushl	%esp
  7.1925 +	call	*P_MD_SYSCALL(%edx)	# get pointer to syscall() function
  7.1926 +	addl	$4,%esp
  7.1927 +syscall_checkast:
  7.1928 +	/* Check for ASTs on exit to user mode. */
  7.1929 +	CLI(%eax)
  7.1930 +	CHECK_ASTPENDING(%eax)
  7.1931 +	je	1f
  7.1932 +	/* Always returning to user mode here. */
  7.1933 +	CLEAR_ASTPENDING(%eax)
  7.1934 +	STI(%eax)
  7.1935 +	/* Pushed T_ASTFLT into tf_trapno on entry. */
  7.1936 +	pushl	%esp
  7.1937 +	call	_C_LABEL(trap)
  7.1938 +	addl	$4,%esp
  7.1939 +	jmp	syscall_checkast
  7.1940 +1:	STI(%eax)
  7.1941 +	CHECK_DEFERRED_SWITCH(%eax)
  7.1942 +	jnz	9f
  7.1943 +#ifndef DIAGNOSTIC
  7.1944 +	INTRFASTEXIT
  7.1945 +#else /* DIAGNOSTIC */
  7.1946 +	cmpl	$IPL_NONE,CPUVAR(ILEVEL)
  7.1947 +	jne	3f
  7.1948 +	INTRFASTEXIT
  7.1949 +3:	pushl	$4f
  7.1950 +	call	_C_LABEL(printf)
  7.1951 +	addl	$4,%esp
  7.1952 +#ifdef DDB
  7.1953 +	int	$3
  7.1954 +#endif /* DDB */
  7.1955 +	movl	$IPL_NONE,CPUVAR(ILEVEL)
  7.1956 +	jmp	2b
  7.1957 +4:	.asciz	"WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n"
  7.1958 +5:	.asciz	"WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n"	
  7.1959 +6:	.asciz	"WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n"     
  7.1960 +#endif /* DIAGNOSTIC */
  7.1961 +9:	call    _C_LABEL(pmap_load)
  7.1962 +	jmp     syscall_checkast        /* re-check ASTs */
  7.1963 +
  7.1964 +#if NNPX > 0
  7.1965 +/*
  7.1966 + * Special interrupt handlers.  Someday intr0-intr15 will be used to count
  7.1967 + * interrupts.  We'll still need a special exception 16 handler.  The busy
  7.1968 + * latch stuff in probintr() can be moved to npxprobe().
  7.1969 + */
  7.1970 +
  7.1971 +/* LINTSTUB: Func: void probeintr(void) */
  7.1972 +NENTRY(probeintr)
  7.1973 +	ss
  7.1974 +	incl	_C_LABEL(npx_intrs_while_probing)
  7.1975 +	pushl	%eax
  7.1976 +	movb	$0x20,%al	# EOI (asm in strings loses cpp features)
  7.1977 +	outb	%al,$0xa0	# IO_ICU2
  7.1978 +	outb	%al,$0x20	# IO_ICU1
  7.1979 +	movb	$0,%al
  7.1980 +	outb	%al,$0xf0	# clear BUSY# latch
  7.1981 +	popl	%eax
  7.1982 +	iret
  7.1983 +
  7.1984 +/* LINTSTUB: Func: void probetrap(void) */
  7.1985 +NENTRY(probetrap)
  7.1986 +	ss
  7.1987 +	incl	_C_LABEL(npx_traps_while_probing)
  7.1988 +	fnclex
  7.1989 +	iret
  7.1990 +
  7.1991 +/* LINTSTUB: Func: int npx586bug1(int a, int b) */
  7.1992 +NENTRY(npx586bug1)
  7.1993 +	fildl	4(%esp)		# x
  7.1994 +	fildl	8(%esp)		# y
  7.1995 +	fld	%st(1)
  7.1996 +	fdiv	%st(1),%st	# x/y
  7.1997 +	fmulp	%st,%st(1)	# (x/y)*y
  7.1998 +	fsubrp	%st,%st(1)	# x-(x/y)*y
  7.1999 +	pushl	$0
  7.2000 +	fistpl	(%esp)
  7.2001 +	popl	%eax
  7.2002 +	ret
  7.2003 +#endif /* NNPX > 0 */
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c	Mon Sep 06 19:04:16 2004 +0000
     8.3 @@ -0,0 +1,2561 @@
     8.4 +/*	$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $	*/
     8.5 +/*	NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp 	*/
     8.6 +
     8.7 +/*-
     8.8 + * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
     8.9 + * All rights reserved.
    8.10 + *
    8.11 + * This code is derived from software contributed to The NetBSD Foundation
    8.12 + * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
    8.13 + * Simulation Facility, NASA Ames Research Center.
    8.14 + *
    8.15 + * Redistribution and use in source and binary forms, with or without
    8.16 + * modification, are permitted provided that the following conditions
    8.17 + * are met:
    8.18 + * 1. Redistributions of source code must retain the above copyright
    8.19 + *    notice, this list of conditions and the following disclaimer.
    8.20 + * 2. Redistributions in binary form must reproduce the above copyright
    8.21 + *    notice, this list of conditions and the following disclaimer in the
    8.22 + *    documentation and/or other materials provided with the distribution.
    8.23 + * 3. All advertising materials mentioning features or use of this software
    8.24 + *    must display the following acknowledgement:
    8.25 + *	This product includes software developed by the NetBSD
    8.26 + *	Foundation, Inc. and its contributors.
    8.27 + * 4. Neither the name of The NetBSD Foundation nor the names of its
    8.28 + *    contributors may be used to endorse or promote products derived
    8.29 + *    from this software without specific prior written permission.
    8.30 + *
    8.31 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
    8.32 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
    8.33 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    8.34 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
    8.35 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    8.36 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    8.37 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    8.38 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    8.39 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    8.40 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    8.41 + * POSSIBILITY OF SUCH DAMAGE.
    8.42 + */
    8.43 +
    8.44 +/*-
    8.45 + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    8.46 + * All rights reserved.
    8.47 + *
    8.48 + * This code is derived from software contributed to Berkeley by
    8.49 + * William Jolitz.
    8.50 + *
    8.51 + * Redistribution and use in source and binary forms, with or without
    8.52 + * modification, are permitted provided that the following conditions
    8.53 + * are met:
    8.54 + * 1. Redistributions of source code must retain the above copyright
    8.55 + *    notice, this list of conditions and the following disclaimer.
    8.56 + * 2. Redistributions in binary form must reproduce the above copyright
    8.57 + *    notice, this list of conditions and the following disclaimer in the
    8.58 + *    documentation and/or other materials provided with the distribution.
    8.59 + * 3. Neither the name of the University nor the names of its contributors
    8.60 + *    may be used to endorse or promote products derived from this software
    8.61 + *    without specific prior written permission.
    8.62 + *
    8.63 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
    8.64 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    8.65 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    8.66 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    8.67 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    8.68 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    8.69 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    8.70 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    8.71 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    8.72 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    8.73 + * SUCH DAMAGE.
    8.74 + *
    8.75 + *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
    8.76 + */
    8.77 +
    8.78 +#include <sys/cdefs.h>
    8.79 +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $");
    8.80 +
    8.81 +#include "opt_beep.h"
    8.82 +#include "opt_compat_ibcs2.h"
    8.83 +#include "opt_compat_mach.h"	/* need to get the right segment def */
    8.84 +#include "opt_compat_netbsd.h"
    8.85 +#include "opt_compat_svr4.h"
    8.86 +#include "opt_cpureset_delay.h"
    8.87 +#include "opt_cputype.h"
    8.88 +#include "opt_ddb.h"
    8.89 +#include "opt_ipkdb.h"
    8.90 +#include "opt_kgdb.h"
    8.91 +#include "opt_mtrr.h"
    8.92 +#include "opt_multiprocessor.h"
    8.93 +#include "opt_realmem.h"
    8.94 +#include "opt_user_ldt.h"
    8.95 +#include "opt_vm86.h"
    8.96 +#include "opt_xen.h"
    8.97 +
    8.98 +#include <sys/param.h>
    8.99 +#include <sys/systm.h>
   8.100 +#include <sys/signal.h>
   8.101 +#include <sys/signalvar.h>
   8.102 +#include <sys/kernel.h>
   8.103 +#include <sys/proc.h>
   8.104 +#include <sys/user.h>
   8.105 +#include <sys/exec.h>
   8.106 +#include <sys/buf.h>
   8.107 +#include <sys/reboot.h>
   8.108 +#include <sys/conf.h>
   8.109 +#include <sys/file.h>
   8.110 +#include <sys/malloc.h>
   8.111 +#include <sys/mbuf.h>
   8.112 +#include <sys/msgbuf.h>
   8.113 +#include <sys/mount.h>
   8.114 +#include <sys/vnode.h>
   8.115 +#include <sys/extent.h>
   8.116 +#include <sys/syscallargs.h>
   8.117 +#include <sys/core.h>
   8.118 +#include <sys/kcore.h>
   8.119 +#include <sys/ucontext.h>
   8.120 +#include <machine/kcore.h>
   8.121 +#include <sys/ras.h>
   8.122 +#include <sys/sa.h>
   8.123 +#include <sys/savar.h>
   8.124 +#include <sys/ksyms.h>
   8.125 +
   8.126 +#ifdef IPKDB
   8.127 +#include <ipkdb/ipkdb.h>
   8.128 +#endif
   8.129 +
   8.130 +#ifdef KGDB
   8.131 +#include <sys/kgdb.h>
   8.132 +#endif
   8.133 +
   8.134 +#include <dev/cons.h>
   8.135 +
   8.136 +#include <uvm/uvm_extern.h>
   8.137 +#include <uvm/uvm_page.h>
   8.138 +
   8.139 +#include <sys/sysctl.h>
   8.140 +
   8.141 +#include <machine/cpu.h>
   8.142 +#include <machine/cpufunc.h>
   8.143 +#include <machine/cpuvar.h>
   8.144 +#include <machine/gdt.h>
   8.145 +#include <machine/pio.h>
   8.146 +#include <machine/psl.h>
   8.147 +#include <machine/reg.h>
   8.148 +#include <machine/specialreg.h>
   8.149 +#include <machine/bootinfo.h>
   8.150 +#include <machine/mtrr.h>
   8.151 +#include <machine/evtchn.h>
   8.152 +
   8.153 +#include <dev/isa/isareg.h>
   8.154 +#include <machine/isa_machdep.h>
   8.155 +#include <dev/ic/i8042reg.h>
   8.156 +
   8.157 +#ifdef DDB
   8.158 +#include <machine/db_machdep.h>
   8.159 +#include <ddb/db_extern.h>
   8.160 +#endif
   8.161 +
   8.162 +#ifdef VM86
   8.163 +#include <machine/vm86.h>
   8.164 +#endif
   8.165 +
   8.166 +#include "acpi.h"
   8.167 +#include "apm.h"
   8.168 +#include "bioscall.h"
   8.169 +
   8.170 +#if NBIOSCALL > 0
   8.171 +#include <machine/bioscall.h>
   8.172 +#endif
   8.173 +
   8.174 +#if NACPI > 0
   8.175 +#include <dev/acpi/acpivar.h>
   8.176 +#define ACPI_MACHDEP_PRIVATE
   8.177 +#include <machine/acpi_machdep.h>
   8.178 +#endif
   8.179 +
   8.180 +#if NAPM > 0
   8.181 +#include <machine/apmvar.h>
   8.182 +#endif
   8.183 +
   8.184 +#include "isa.h"
   8.185 +#include "isadma.h"
   8.186 +#include "npx.h"
   8.187 +#include "ksyms.h"
   8.188 +
   8.189 +#include "mca.h"
   8.190 +#if NMCA > 0
   8.191 +#include <machine/mca_machdep.h>	/* for mca_busprobe() */
   8.192 +#endif
   8.193 +
   8.194 +#ifdef MULTIPROCESSOR		/* XXX */
   8.195 +#include <machine/mpbiosvar.h>	/* XXX */
   8.196 +#endif				/* XXX */
   8.197 +
   8.198 +#include <machine/xen.h>
   8.199 +#include <machine/hypervisor.h>
   8.200 +
   8.201 +#if defined(DDB) || defined(KGDB)
   8.202 +#include <ddb/db_interface.h>
   8.203 +#include <ddb/db_output.h>
   8.204 +
   8.205 +void ddb_trap_hook(int);
   8.206 +#endif
   8.207 +
   8.208 +/* #define	XENDEBUG */
   8.209 +/* #define	XENDEBUG_LOW */
   8.210 +
   8.211 +#ifdef XENDEBUG
   8.212 +extern void printk(char *, ...);
   8.213 +#define	XENPRINTF(x) printf x
   8.214 +#define	XENPRINTK(x) printk x
   8.215 +#else
   8.216 +#define	XENPRINTF(x)
   8.217 +#define	XENPRINTK(x)
   8.218 +#endif
   8.219 +#define	PRINTK(x) printf x
   8.220 +
   8.221 +#ifdef XENDEBUG_LOW
   8.222 +void xen_dbglow_init(void);
   8.223 +#endif
   8.224 +
   8.225 +#ifndef BEEP_ONHALT_COUNT
   8.226 +#define BEEP_ONHALT_COUNT 3
   8.227 +#endif
   8.228 +#ifndef BEEP_ONHALT_PITCH
   8.229 +#define BEEP_ONHALT_PITCH 1500
   8.230 +#endif
   8.231 +#ifndef BEEP_ONHALT_PERIOD
   8.232 +#define BEEP_ONHALT_PERIOD 250
   8.233 +#endif
   8.234 +
   8.235 +/* the following is used externally (sysctl_hw) */
   8.236 +char machine[] = "i386";		/* CPU "architecture" */
   8.237 +char machine_arch[] = "i386";		/* machine == machine_arch */
   8.238 +
   8.239 +char bootinfo[BOOTINFO_MAXSIZE];
   8.240 +
   8.241 +struct bi_devmatch *i386_alldisks = NULL;
   8.242 +int i386_ndisks = 0;
   8.243 +
   8.244 +#ifdef CPURESET_DELAY
   8.245 +int	cpureset_delay = CPURESET_DELAY;
   8.246 +#else
   8.247 +int     cpureset_delay = 2000; /* default to 2s */
   8.248 +#endif
   8.249 +
   8.250 +#ifdef MTRR
   8.251 +struct mtrr_funcs *mtrr_funcs;
   8.252 +#endif
   8.253 +
   8.254 +#ifdef COMPAT_NOMID
   8.255 +static int exec_nomid(struct proc *, struct exec_package *);
   8.256 +#endif
   8.257 +
   8.258 +int	physmem;
   8.259 +int	dumpmem_low;
   8.260 +int	dumpmem_high;
   8.261 +unsigned int cpu_feature;
   8.262 +int	cpu_class;
   8.263 +int	i386_fpu_present;
   8.264 +int	i386_fpu_exception;
   8.265 +int	i386_fpu_fdivbug;
   8.266 +
   8.267 +int	i386_use_fxsave;
   8.268 +int	i386_has_sse;
   8.269 +int	i386_has_sse2;
   8.270 +
   8.271 +int	tmx86_has_longrun;
   8.272 +
   8.273 +vaddr_t	msgbuf_vaddr;
   8.274 +paddr_t msgbuf_paddr;
   8.275 +
   8.276 +vaddr_t	idt_vaddr;
   8.277 +paddr_t	idt_paddr;
   8.278 +
   8.279 +#ifdef I586_CPU
   8.280 +vaddr_t	pentium_idt_vaddr;
   8.281 +#endif
   8.282 +
   8.283 +struct vm_map *exec_map = NULL;
   8.284 +struct vm_map *mb_map = NULL;
   8.285 +struct vm_map *phys_map = NULL;
   8.286 +
   8.287 +extern	paddr_t avail_start, avail_end;
   8.288 +extern	paddr_t pmap_pa_start, pmap_pa_end;
   8.289 +
   8.290 +#ifdef ISA_CLOCK
   8.291 +void (*delay_func)(int) = i8254_delay;
   8.292 +void (*microtime_func)(struct timeval *) = i8254_microtime;
   8.293 +void (*initclock_func)(void) = i8254_initclocks;
   8.294 +#else
   8.295 +void (*delay_func)(int) = xen_delay;
   8.296 +void (*microtime_func)(struct timeval *) = xen_microtime;
   8.297 +void (*initclock_func)(void) = xen_initclocks;
   8.298 +#endif
   8.299 +
   8.300 +void hypervisor_callback(void);
   8.301 +void failsafe_callback(void);
   8.302 +
   8.303 +/*
   8.304 + * Size of memory segments, before any memory is stolen.
   8.305 + */
   8.306 +phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
   8.307 +int	mem_cluster_cnt;
   8.308 +
   8.309 +int	cpu_dump(void);
   8.310 +int	cpu_dumpsize(void);
   8.311 +u_long	cpu_dump_mempagecnt(void);
   8.312 +void	dumpsys(void);
   8.313 +void	init386(paddr_t);
   8.314 +void	initgdt(void);
   8.315 +
   8.316 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
   8.317 +void	add_mem_cluster(u_int64_t, u_int64_t, u_int32_t);
    8.318 +#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
   8.319 +
   8.320 +extern int time_adjusted;
   8.321 +
   8.322 +/*
   8.323 + * Machine-dependent startup code
   8.324 + */
   8.325 +void
   8.326 +cpu_startup()
   8.327 +{
   8.328 +	int x;
   8.329 +	vaddr_t minaddr, maxaddr;
   8.330 +	char pbuf[9];
   8.331 +
   8.332 +	/*
    8.333 +	 * Initialize error message buffer (at end of core).
   8.334 +	 */
   8.335 +	msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE));
   8.336 +	if (msgbuf_vaddr == 0)
   8.337 +		panic("failed to valloc msgbuf_vaddr");
   8.338 +
   8.339 +	/* msgbuf_paddr was init'd in pmap */
   8.340 +	for (x = 0; x < btoc(MSGBUFSIZE); x++)
   8.341 +		pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE,
   8.342 +		    msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE);
   8.343 +	pmap_update(pmap_kernel());
   8.344 +
   8.345 +	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
   8.346 +
   8.347 +	printf("%s", version);
   8.348 +
   8.349 +#ifdef TRAPLOG
   8.350 +	/*
   8.351 +	 * Enable recording of branch from/to in MSR's
   8.352 +	 */
   8.353 +	wrmsr(MSR_DEBUGCTLMSR, 0x1);
   8.354 +#endif
   8.355 +
   8.356 +	format_bytes(pbuf, sizeof(pbuf), ptoa(physmem));
   8.357 +	printf("total memory = %s\n", pbuf);
   8.358 +
   8.359 +	minaddr = 0;
   8.360 +
   8.361 +	/*
   8.362 +	 * Allocate a submap for exec arguments.  This map effectively
   8.363 +	 * limits the number of processes exec'ing at any time.
   8.364 +	 */
   8.365 +	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
   8.366 +				   16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
   8.367 +
   8.368 +	/*
   8.369 +	 * Allocate a submap for physio
   8.370 +	 */
   8.371 +	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
   8.372 +				   VM_PHYS_SIZE, 0, FALSE, NULL);
   8.373 +
   8.374 +	/*
   8.375 +	 * Finally, allocate mbuf cluster submap.
   8.376 +	 */
   8.377 +	mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
   8.378 +	    nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL);
   8.379 +
   8.380 +	format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free));
   8.381 +	printf("avail memory = %s\n", pbuf);
   8.382 +
   8.383 +	/* Safe for i/o port / memory space allocation to use malloc now. */
   8.384 +	x86_bus_space_mallocok();
   8.385 +}
   8.386 +
   8.387 +/*
   8.388 + * Set up proc0's TSS and LDT.
   8.389 + */
   8.390 +void
   8.391 +i386_proc0_tss_ldt_init()
   8.392 +{
   8.393 +	struct pcb *pcb;
   8.394 +	int x;
   8.395 +
   8.396 +	gdt_init();
   8.397 +
   8.398 +	cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;
   8.399 +
   8.400 +	pcb->pcb_tss.tss_ioopt =
   8.401 +	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
   8.402 +		| SEL_KPL;		/* i/o pl */
   8.403 +
   8.404 +	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
   8.405 +		pcb->pcb_iomap[x] = 0xffffffff;
   8.406 +
   8.407 +	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
   8.408 +	pcb->pcb_cr0 = rcr0();
   8.409 +	pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
   8.410 +	pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16;
   8.411 +	lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
   8.412 +	lwp0.l_md.md_tss_sel = tss_alloc(pcb);
   8.413 +
   8.414 +#ifndef XEN
   8.415 +	ltr(lwp0.l_md.md_tss_sel);
   8.416 +	lldt(pcb->pcb_ldt_sel);
   8.417 +#else
   8.418 +	HYPERVISOR_fpu_taskswitch();
   8.419 +	XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
   8.420 +		      (void *)pcb->pcb_tss.tss_esp0,
   8.421 +		      pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0)));
   8.422 +	HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0);
   8.423 +#endif
   8.424 +}
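The tss_ioopt value set above packs two things: the byte offset of the I/O permission bitmap (pcb_iomap) from the start of the TSS in its upper 16 bits, and the I/O privilege level (here SEL_KPL) in its low bits, which i386_switch_context() below extracts with SEL_RPL for the DOM0_IOPL hypercall. Two hypothetical inline helpers, only to make that packing explicit (not part of this changeset):

/* Hypothetical accessors; they mirror the encoding used just above. */
static __inline int
tss_iomap_offset(const struct pcb *pcb)
{
	/* Upper 16 bits: offset of pcb_iomap from the start of pcb_tss. */
	return (pcb->pcb_tss.tss_ioopt >> 16);
}

static __inline int
tss_iopl(const struct pcb *pcb)
{
	/* Low bits: I/O privilege level, masked as in i386_switch_context(). */
	return (pcb->pcb_tss.tss_ioopt & SEL_RPL);
}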
   8.425 +
   8.426 +/*
   8.427 + * Set up TSS and LDT for a new PCB.
   8.428 + */
   8.429 +
   8.430 +void
   8.431 +i386_init_pcb_tss_ldt(struct cpu_info *ci)
   8.432 +{
   8.433 +	int x;
   8.434 +	struct pcb *pcb = ci->ci_idle_pcb;
   8.435 +
   8.436 +	pcb->pcb_tss.tss_ioopt =
   8.437 +	    ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
   8.438 +		| SEL_KPL;		/* i/o pl */
   8.439 +	for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
   8.440 +		pcb->pcb_iomap[x] = 0xffffffff;
   8.441 +
   8.442 +	pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
   8.443 +	pcb->pcb_cr0 = rcr0();
   8.444 +
   8.445 +	ci->ci_idle_tss_sel = tss_alloc(pcb);
   8.446 +}
   8.447 +
   8.448 +/*
   8.449 + * Switch context:
   8.450 + * - honor CR0_TS in saved CR0 and request DNA exception on FPU use
   8.451 + * - switch stack pointer for user->kernel transition
   8.452 + */
   8.453 +void
   8.454 +i386_switch_context(struct pcb *new)
   8.455 +{
   8.456 +	dom0_op_t op;
   8.457 +	struct cpu_info *ci;
   8.458 +
   8.459 +	ci = curcpu();
   8.460 +	if (ci->ci_fpused) {
   8.461 +		HYPERVISOR_fpu_taskswitch();
   8.462 +		ci->ci_fpused = 0;
   8.463 +	}
   8.464 +
   8.465 +	HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0);
   8.466 +
   8.467 +	if (xen_start_info.flags & SIF_PRIVILEGED) {
   8.468 +		op.cmd = DOM0_IOPL;
   8.469 +		op.u.iopl.domain = DOMID_SELF;
   8.470 +		op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */
   8.471 +		HYPERVISOR_dom0_op(&op);
   8.472 +	}
   8.473 +}
   8.474 +
   8.475 +/*
   8.476 + * sysctl helper routine for machdep.tm* nodes.
   8.477 + */
   8.478 +static int
   8.479 +sysctl_machdep_tm_longrun(SYSCTLFN_ARGS)
   8.480 +{
   8.481 +	struct sysctlnode node;
   8.482 +	int io, error;
   8.483 +
   8.484 +	if (!tmx86_has_longrun)
   8.485 +		return (EOPNOTSUPP);
   8.486 +
   8.487 +	node = *rnode;
   8.488 +	node.sysctl_data = &io;
   8.489 +
   8.490 +	switch (rnode->sysctl_num) {
   8.491 +	case CPU_TMLR_MODE:
   8.492 +		io = (int)(crusoe_longrun = tmx86_get_longrun_mode());
   8.493 +		break;
   8.494 +	case CPU_TMLR_FREQUENCY:
   8.495 +		tmx86_get_longrun_status_all();
   8.496 +		io = crusoe_frequency;
   8.497 +		break;
   8.498 +	case CPU_TMLR_VOLTAGE:
   8.499 +		tmx86_get_longrun_status_all();
   8.500 +		io = crusoe_voltage;
   8.501 +		break;
   8.502 +	case CPU_TMLR_PERCENTAGE:
   8.503 +		tmx86_get_longrun_status_all();
   8.504 +		io = crusoe_percentage;
   8.505 +		break;
   8.506 +	default:
   8.507 +		return (EOPNOTSUPP);
   8.508 +	}
   8.509 +
   8.510 +	error = sysctl_lookup(SYSCTLFN_CALL(&node));
   8.511 +	if (error || newp == NULL)
   8.512 +		return (error);
   8.513 +
   8.514 +	if (rnode->sysctl_num == CPU_TMLR_MODE) {
   8.515 +		if (tmx86_set_longrun_mode(io))
   8.516 +			crusoe_longrun = (u_int)io;
   8.517 +		else
   8.518 +			return (EINVAL);
   8.519 +	}
   8.520 +
   8.521 +	return (0);
   8.522 +}
   8.523 +
   8.524 +/*
   8.525 + * sysctl helper routine for machdep.booted_kernel
   8.526 + */
   8.527 +static int
   8.528 +sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
   8.529 +{
   8.530 +	struct btinfo_bootpath *bibp;
   8.531 +	struct sysctlnode node;
   8.532 +
   8.533 +	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
   8.534 +	if(!bibp)
   8.535 +		return(ENOENT); /* ??? */
   8.536 +
   8.537 +	node = *rnode;
   8.538 +	node.sysctl_data = bibp->bootpath;
   8.539 +	node.sysctl_size = sizeof(bibp->bootpath);
   8.540 +	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
   8.541 +}
   8.542 +
   8.543 +/*
   8.544 + * sysctl helper routine for machdep.diskinfo
   8.545 + */
   8.546 +static int
   8.547 +sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
   8.548 +{
   8.549 +	struct sysctlnode node;
   8.550 +
   8.551 +	node = *rnode;
   8.552 +	node.sysctl_data = i386_alldisks;
   8.553 +	node.sysctl_size = sizeof(struct disklist) +
   8.554 +	    (i386_ndisks - 1) * sizeof(struct nativedisk_info);
   8.555 +        return (sysctl_lookup(SYSCTLFN_CALL(&node)));
   8.556 +}
   8.557 +
   8.558 +/*
   8.559 + * machine dependent system variables.
   8.560 + */
   8.561 +SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
   8.562 +{
   8.563 +
   8.564 +	sysctl_createv(clog, 0, NULL, NULL,
   8.565 +		       CTLFLAG_PERMANENT,
   8.566 +		       CTLTYPE_NODE, "machdep", NULL,
   8.567 +		       NULL, 0, NULL, 0,
   8.568 +		       CTL_MACHDEP, CTL_EOL);
   8.569 +
   8.570 +	sysctl_createv(clog, 0, NULL, NULL,
   8.571 +		       CTLFLAG_PERMANENT,
   8.572 +		       CTLTYPE_STRUCT, "console_device", NULL,
   8.573 +		       sysctl_consdev, 0, NULL, sizeof(dev_t),
   8.574 +		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
   8.575 +	sysctl_createv(clog, 0, NULL, NULL,
   8.576 +		       CTLFLAG_PERMANENT,
   8.577 +		       CTLTYPE_INT, "biosbasemem", NULL,
   8.578 +		       NULL, 0, &biosbasemem, 0,
   8.579 +		       CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL);
   8.580 +	sysctl_createv(clog, 0, NULL, NULL,
   8.581 +		       CTLFLAG_PERMANENT,
   8.582 +		       CTLTYPE_INT, "biosextmem", NULL,
   8.583 +		       NULL, 0, &biosextmem, 0,
   8.584 +		       CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
   8.585 +	sysctl_createv(clog, 0, NULL, NULL,
   8.586 +		       CTLFLAG_PERMANENT,
   8.587 +		       CTLTYPE_INT, "nkpde", NULL,
   8.588 +		       NULL, 0, &nkpde, 0,
   8.589 +		       CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
   8.590 +	sysctl_createv(clog, 0, NULL, NULL,
   8.591 +		       CTLFLAG_PERMANENT,
   8.592 +		       CTLTYPE_STRING, "booted_kernel", NULL,
   8.593 +		       sysctl_machdep_booted_kernel, 0, NULL, 0,
   8.594 +		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
   8.595 +	sysctl_createv(clog, 0, NULL, NULL,
   8.596 +		       CTLFLAG_PERMANENT,
   8.597 +		       CTLTYPE_STRUCT, "diskinfo", NULL,
   8.598 +		       sysctl_machdep_diskinfo, 0, NULL, 0,
   8.599 +		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
   8.600 +	sysctl_createv(clog, 0, NULL, NULL,
   8.601 +		       CTLFLAG_PERMANENT,
   8.602 +		       CTLTYPE_INT, "fpu_present", NULL,
   8.603 +		       NULL, 0, &i386_fpu_present, 0,
   8.604 +		       CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
   8.605 +	sysctl_createv(clog, 0, NULL, NULL,
   8.606 +		       CTLFLAG_PERMANENT,
   8.607 +		       CTLTYPE_INT, "osfxsr", NULL,
   8.608 +		       NULL, 0, &i386_use_fxsave, 0,
   8.609 +		       CTL_MACHDEP, CPU_OSFXSR, CTL_EOL);
   8.610 +	sysctl_createv(clog, 0, NULL, NULL,
   8.611 +		       CTLFLAG_PERMANENT,
   8.612 +		       CTLTYPE_INT, "sse", NULL,
   8.613 +		       NULL, 0, &i386_has_sse, 0,
   8.614 +		       CTL_MACHDEP, CPU_SSE, CTL_EOL);
   8.615 +	sysctl_createv(clog, 0, NULL, NULL,
   8.616 +		       CTLFLAG_PERMANENT,
   8.617 +		       CTLTYPE_INT, "sse2", NULL,
   8.618 +		       NULL, 0, &i386_has_sse2, 0,
   8.619 +		       CTL_MACHDEP, CPU_SSE2, CTL_EOL);
   8.620 +	sysctl_createv(clog, 0, NULL, NULL,
   8.621 +		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
   8.622 +		       CTLTYPE_INT, "tm_longrun_mode", NULL,
   8.623 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
   8.624 +		       CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL);
   8.625 +	sysctl_createv(clog, 0, NULL, NULL,
   8.626 +		       CTLFLAG_PERMANENT,
   8.627 +		       CTLTYPE_INT, "tm_longrun_frequency", NULL,
   8.628 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
   8.629 +		       CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL);
   8.630 +	sysctl_createv(clog, 0, NULL, NULL,
   8.631 +		       CTLFLAG_PERMANENT,
   8.632 +		       CTLTYPE_INT, "tm_longrun_voltage", NULL,
   8.633 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
   8.634 +		       CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL);
   8.635 +	sysctl_createv(clog, 0, NULL, NULL,
   8.636 +		       CTLFLAG_PERMANENT,
   8.637 +		       CTLTYPE_INT, "tm_longrun_percentage", NULL,
   8.638 +		       sysctl_machdep_tm_longrun, 0, NULL, 0,
   8.639 +		       CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL);
   8.640 +}
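The nodes registered above appear under machdep.* and can be read from userland with sysctl(3), using the same CTL_MACHDEP/CPU_* constants passed to sysctl_createv() here. A hypothetical userland sketch for machdep.booted_kernel (assuming CPU_BOOTED_KERNEL is visible via <machine/cpu.h> on i386):

/* Hypothetical userland example; error handling kept minimal. */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>	/* CPU_BOOTED_KERNEL (assumption) */
#include <stdio.h>

int
main(void)
{
	int mib[2] = { CTL_MACHDEP, CPU_BOOTED_KERNEL };
	char path[MAXPATHLEN];
	size_t len = sizeof(path);

	if (sysctl(mib, 2, path, &len, NULL, 0) == -1) {
		perror("sysctl machdep.booted_kernel");
		return 1;
	}
	printf("booted kernel: %s\n", path);
	return 0;
}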
   8.641 +
   8.642 +void *
   8.643 +getframe(struct lwp *l, int sig, int *onstack)
   8.644 +{
   8.645 +	struct proc *p = l->l_proc;
   8.646 +	struct sigctx *ctx = &p->p_sigctx;
   8.647 +	struct trapframe *tf = l->l_md.md_regs;
   8.648 +
   8.649 +	/* Do we need to jump onto the signal stack? */
   8.650 +	*onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
   8.651 +	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
   8.652 +	if (*onstack)
   8.653 +		return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size;
   8.654 +#ifdef VM86
   8.655 +	if (tf->tf_eflags & PSL_VM)
   8.656 +		return (void *)(tf->tf_esp + (tf->tf_ss << 4));
   8.657 +	else
   8.658 +#endif
   8.659 +		return (void *)tf->tf_esp;
   8.660 +}
   8.661 +
   8.662 +/*
   8.663 + * Build context to run handler in.  We invoke the handler
   8.664 + * directly, only returning via the trampoline.  Note the
   8.665 + * trampoline version numbers are coordinated with machine-
   8.666 + * dependent code in libc.
   8.667 + */
   8.668 +void
   8.669 +buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
   8.670 +{
   8.671 +	struct trapframe *tf = l->l_md.md_regs;
   8.672 +
   8.673 +	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
   8.674 +	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
   8.675 +	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
   8.676 +	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
   8.677 +	tf->tf_eip = (int)catcher;
   8.678 +	tf->tf_cs = GSEL(sel, SEL_UPL);
   8.679 +	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
   8.680 +	tf->tf_esp = (int)fp;
   8.681 +	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
   8.682 +}
   8.683 +
   8.684 +static void
   8.685 +sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
   8.686 +{
   8.687 +	struct lwp *l = curlwp;
   8.688 +	struct proc *p = l->l_proc;
   8.689 +	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
   8.690 +	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
   8.691 +	    GUCODEBIG_SEL : GUCODE_SEL;
   8.692 +	struct sigacts *ps = p->p_sigacts;
   8.693 +	int onstack;
   8.694 +	int sig = ksi->ksi_signo;
   8.695 +	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
   8.696 +	sig_t catcher = SIGACTION(p, sig).sa_handler;
   8.697 +	struct trapframe *tf = l->l_md.md_regs;
   8.698 +
   8.699 +	fp--;
   8.700 +
   8.701 +	/* Build stack frame for signal trampoline. */
   8.702 +	switch (ps->sa_sigdesc[sig].sd_vers) {
   8.703 +	case 0:		/* handled by sendsig_sigcontext */
   8.704 +	case 1:		/* handled by sendsig_sigcontext */
   8.705 +	default:	/* unknown version */
   8.706 +		printf("nsendsig: bad version %d\n",
   8.707 +		    ps->sa_sigdesc[sig].sd_vers);
   8.708 +		sigexit(l, SIGILL);
   8.709 +	case 2:
   8.710 +		break;
   8.711 +	}
   8.712 +
   8.713 +	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
   8.714 +	frame.sf_signum = sig;
   8.715 +	frame.sf_sip = &fp->sf_si;
   8.716 +	frame.sf_ucp = &fp->sf_uc;
   8.717 +	frame.sf_si._info = ksi->ksi_info;
   8.718 +	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
   8.719 +	frame.sf_uc.uc_sigmask = *mask;
   8.720 +	frame.sf_uc.uc_link = NULL;
   8.721 +	frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK)
   8.722 +	    ? _UC_SETSTACK : _UC_CLRSTACK;
   8.723 +	memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
   8.724 +	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
   8.725 +
   8.726 +	if (tf->tf_eflags & PSL_VM)
   8.727 +		(*p->p_emul->e_syscall_intern)(p);
   8.728 +
   8.729 +	if (copyout(&frame, fp, sizeof(frame)) != 0) {
   8.730 +		/*
   8.731 +		 * Process has trashed its stack; give it an illegal
   8.732 +		 * instruction to halt it in its tracks.
   8.733 +		 */
   8.734 +		sigexit(l, SIGILL);
   8.735 +		/* NOTREACHED */
   8.736 +	}
   8.737 +
   8.738 +	buildcontext(l, sel, catcher, fp);
   8.739 +
   8.740 +	/* Remember that we're now on the signal stack. */
   8.741 +	if (onstack)
   8.742 +		p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
   8.743 +}
   8.744 +
   8.745 +void
   8.746 +sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
   8.747 +{
   8.748 +#ifdef COMPAT_16
   8.749 +	if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2)
   8.750 +		sendsig_sigcontext(ksi, mask);
   8.751 +	else
   8.752 +#endif
   8.753 +		sendsig_siginfo(ksi, mask);
   8.754 +}
   8.755 +
   8.756 +void
   8.757 +cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
   8.758 +    void *ap, void *sp, sa_upcall_t upcall)
   8.759 +{
   8.760 +	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
   8.761 +	struct saframe *sf, frame;
   8.762 +	struct trapframe *tf;
   8.763 +
   8.764 +	tf = l->l_md.md_regs;
   8.765 +
   8.766 +	/* Finally, copy out the rest of the frame. */
   8.767 +	frame.sa_type = type;
   8.768 +	frame.sa_sas = sas;
   8.769 +	frame.sa_events = nevents;
   8.770 +	frame.sa_interrupted = ninterrupted;
   8.771 +	frame.sa_arg = ap;
   8.772 +	frame.sa_ra = 0;
   8.773 +
   8.774 +	sf = (struct saframe *)sp - 1;
   8.775 +	if (copyout(&frame, sf, sizeof(frame)) != 0) {
   8.776 +		/* Copying onto the stack didn't work. Die. */
   8.777 +		sigexit(l, SIGILL);
   8.778 +		/* NOTREACHED */
   8.779 +	}
   8.780 +
   8.781 +	tf->tf_eip = (int) upcall;
   8.782 +	tf->tf_esp = (int) sf;
   8.783 +	tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */
   8.784 +	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
   8.785 +	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
   8.786 +	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
   8.787 +	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
   8.788 +	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
   8.789 +	    GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL);
   8.790 +	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
   8.791 +	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
   8.792 +}
   8.793 +
   8.794 +int	waittime = -1;
   8.795 +struct pcb dumppcb;
   8.796 +
   8.797 +void
   8.798 +cpu_reboot(int howto, char *bootstr)
   8.799 +{
   8.800 +
   8.801 +	if (cold) {
   8.802 +		howto |= RB_HALT;
   8.803 +		goto haltsys;
   8.804 +	}
   8.805 +
   8.806 +	boothowto = howto;
   8.807 +	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
   8.808 +		waittime = 0;
   8.809 +		vfs_shutdown();
   8.810 +		/*
   8.811 +		 * If we've been adjusting the clock, the todr
   8.812 +		 * will be out of synch; adjust it now.
   8.813 +		 */
   8.814 +		if (time_adjusted != 0)
   8.815 +			resettodr();
   8.816 +	}
   8.817 +
   8.818 +	/* Disable interrupts. */
   8.819 +	splhigh();
   8.820 +
   8.821 +	/* Do a dump if requested. */
   8.822 +	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
   8.823 +		dumpsys();
   8.824 +
   8.825 +haltsys:
   8.826 +	doshutdownhooks();
   8.827 +
   8.828 +#ifdef MULTIPROCESSOR
   8.829 +	x86_broadcast_ipi(X86_IPI_HALT);
   8.830 +#endif
   8.831 +
   8.832 +	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
   8.833 +#if NACPI > 0
   8.834 +		if (acpi_softc != NULL) {
   8.835 +			delay(500000);
   8.836 +			acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5);
   8.837 +			printf("WARNING: ACPI powerdown failed!\n");
   8.838 +		}
   8.839 +#endif
   8.840 +#if NAPM > 0 && !defined(APM_NO_POWEROFF)
   8.841 +		/* turn off, if we can.  But try to turn disk off and
   8.842 +		 * wait a bit first--some disk drives are slow to clean up
   8.843 +		 * and users have reported disk corruption.
   8.844 +		 */
   8.845 +		delay(500000);
   8.846 +		apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF);
   8.847 +		delay(500000);
   8.848 +		apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF);
   8.849 +		printf("WARNING: APM powerdown failed!\n");
   8.850 +		/*
   8.851 +		 * RB_POWERDOWN implies RB_HALT... fall into it...
   8.852 +		 */
   8.853 +#endif
   8.854 +		HYPERVISOR_shutdown();
   8.855 +	}
   8.856 +
   8.857 +	if (howto & RB_HALT) {
   8.858 +		printf("\n");
   8.859 +		printf("The operating system has halted.\n");
   8.860 +		printf("Please press any key to reboot.\n\n");
   8.861 +
   8.862 +#ifdef BEEP_ONHALT
   8.863 +		{
   8.864 +			int c;
   8.865 +			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
   8.866 +				sysbeep(BEEP_ONHALT_PITCH,
   8.867 +				        BEEP_ONHALT_PERIOD * hz / 1000);
   8.868 +				delay(BEEP_ONHALT_PERIOD * 1000);
   8.869 +				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
   8.870 +				delay(BEEP_ONHALT_PERIOD * 1000);
   8.871 +			}
   8.872 +		}
   8.873 +#endif
   8.874 +
   8.875 +		cnpollc(1);	/* for proper keyboard command handling */
   8.876 +		if (cngetc() == 0) {
   8.877 +			/* no console attached, so just hlt */
   8.878 +			for(;;) {
   8.879 +				__asm __volatile("hlt");
   8.880 +			}
   8.881 +		}
   8.882 +		cnpollc(0);
   8.883 +	}
   8.884 +
   8.885 +	printf("rebooting...\n");
   8.886 +	if (cpureset_delay > 0)
   8.887 +		delay(cpureset_delay * 1000);
   8.888 +	cpu_reset();
   8.889 +	for(;;) ;
   8.890 +	/*NOTREACHED*/
   8.891 +}
   8.892 +
   8.893 +/*
   8.894 + * These variables are needed by /sbin/savecore
   8.895 + */
   8.896 +u_int32_t dumpmag = 0x8fca0101;	/* magic number */
   8.897 +int 	dumpsize = 0;		/* pages */
   8.898 +long	dumplo = 0; 		/* blocks */
   8.899 +
   8.900 +/*
   8.901 + * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
   8.902 + */
   8.903 +int
   8.904 +cpu_dumpsize()
   8.905 +{
   8.906 +	int size;
   8.907 +
   8.908 +	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
   8.909 +	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
   8.910 +	if (roundup(size, dbtob(1)) != dbtob(1))
   8.911 +		return (-1);
   8.912 +
   8.913 +	return (1);
   8.914 +}
   8.915 +
   8.916 +/*
   8.917 + * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
   8.918 + */
   8.919 +u_long
   8.920 +cpu_dump_mempagecnt()
   8.921 +{
   8.922 +	u_long i, n;
   8.923 +
   8.924 +	n = 0;
   8.925 +	for (i = 0; i < mem_cluster_cnt; i++)
   8.926 +		n += atop(mem_clusters[i].size);
   8.927 +	return (n);
   8.928 +}
   8.929 +
   8.930 +/*
   8.931 + * cpu_dump: dump the machine-dependent kernel core dump headers.
   8.932 + */
   8.933 +int
   8.934 +cpu_dump()
   8.935 +{
   8.936 +	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
   8.937 +	char buf[dbtob(1)];
   8.938 +	kcore_seg_t *segp;
   8.939 +	cpu_kcore_hdr_t *cpuhdrp;
   8.940 +	phys_ram_seg_t *memsegp;
   8.941 +	const struct bdevsw *bdev;
   8.942 +	int i;
   8.943 +
   8.944 +	bdev = bdevsw_lookup(dumpdev);
   8.945 +	if (bdev == NULL)
   8.946 +		return (ENXIO);
   8.947 +	dump = bdev->d_dump;
   8.948 +
   8.949 +	memset(buf, 0, sizeof buf);
   8.950 +	segp = (kcore_seg_t *)buf;
   8.951 +	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
   8.952 +	memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) +
   8.953 +	    ALIGN(sizeof(*cpuhdrp))];
   8.954 +
   8.955 +	/*
   8.956 +	 * Generate a segment header.
   8.957 +	 */
   8.958 +	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
   8.959 +	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
   8.960 +
   8.961 +	/*
   8.962 +	 * Add the machine-dependent header info.
   8.963 +	 */
   8.964 +	cpuhdrp->ptdpaddr = PTDpaddr;
   8.965 +	cpuhdrp->nmemsegs = mem_cluster_cnt;
   8.966 +
   8.967 +	/*
   8.968 +	 * Fill in the memory segment descriptors.
   8.969 +	 */
   8.970 +	for (i = 0; i < mem_cluster_cnt; i++) {
   8.971 +		memsegp[i].start = mem_clusters[i].start;
   8.972 +		memsegp[i].size = mem_clusters[i].size;
   8.973 +	}
   8.974 +
   8.975 +	return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1)));
   8.976 +}
   8.977 +
   8.978 +/*
   8.979 + * This is called by main to set dumplo and dumpsize.
   8.980 + * Dumps always skip the first PAGE_SIZE of disk space
   8.981 + * in case there might be a disk label stored there.
   8.982 + * If there is extra space, put dump at the end to
   8.983 + * reduce the chance that swapping trashes it.
   8.984 + */
   8.985 +void
   8.986 +cpu_dumpconf()
   8.987 +{
   8.988 +	const struct bdevsw *bdev;
   8.989 +	int nblks, dumpblks;	/* size of dump area */
   8.990 +
   8.991 +	if (dumpdev == NODEV)
   8.992 +		goto bad;
   8.993 +	bdev = bdevsw_lookup(dumpdev);
   8.994 +	if (bdev == NULL)
   8.995 +		panic("dumpconf: bad dumpdev=0x%x", dumpdev);
   8.996 +	if (bdev->d_psize == NULL)
   8.997 +		goto bad;
   8.998 +	nblks = (*bdev->d_psize)(dumpdev);
   8.999 +	if (nblks <= ctod(1))
  8.1000 +		goto bad;
  8.1001 +
  8.1002 +	dumpblks = cpu_dumpsize();
  8.1003 +	if (dumpblks < 0)
  8.1004 +		goto bad;
  8.1005 +	dumpblks += ctod(cpu_dump_mempagecnt());
  8.1006 +
  8.1007 +	/* If dump won't fit (incl. room for possible label), punt. */
  8.1008 +	if (dumpblks > (nblks - ctod(1)))
  8.1009 +		goto bad;
  8.1010 +
  8.1011 +	/* Put dump at end of partition */
  8.1012 +	dumplo = nblks - dumpblks;
  8.1013 +
  8.1014 +	/* dumpsize is in page units, and doesn't include headers. */
  8.1015 +	dumpsize = cpu_dump_mempagecnt();
  8.1016 +	return;
  8.1017 +
  8.1018 + bad:
  8.1019 +	dumpsize = 0;
  8.1020 +}
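
[Editorial sketch, not part of this changeset.] cpu_dumpconf() above sizes the dump as one block of headers (cpu_dumpsize() returns 1 only if the headers fit in a single disk block) plus ctod(cpu_dump_mempagecnt()) blocks of RAM, keeps ctod(1) blocks free at the front for a possible disklabel, and places the dump at the end of the partition. A minimal standalone sketch of that arithmetic, assuming the usual i386 values of 512-byte disk blocks and 4096-byte pages (the partition and memory sizes below are hypothetical):

	#include <stdio.h>

	/* Hypothetical values: 512-byte disk blocks, 4 KiB pages. */
	#define BLKSIZE	512
	#define PGSIZE	4096
	#define CTOD(x)	((x) * (PGSIZE / BLKSIZE))	/* pages -> disk blocks */

	int
	main(void)
	{
		long nblks = 262144;		/* 128 MB dump partition, in blocks */
		long mempages = 16384;		/* 64 MB of RAM, in pages */
		long dumpblks = 1 + CTOD(mempages);	/* header block + memory */
		long dumplo;

		if (dumpblks > nblks - CTOD(1)) {	/* keep room for a disklabel */
			printf("dump does not fit\n");
			return 1;
		}
		dumplo = nblks - dumpblks;	/* place dump at end of partition */
		printf("dumplo = %ld blocks, dumpsize = %ld pages\n",
		    dumplo, mempages);
		return 0;
	}
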
  8.1021 +
  8.1022 +/*
  8.1023 + * Doadump comes here after turning off memory management and
  8.1024 + * getting on the dump stack, either when called above, or by
  8.1025 + * the auto-restart code.
  8.1026 + */
  8.1027 +#define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
  8.1028 +static vaddr_t dumpspace;
  8.1029 +
  8.1030 +vaddr_t
  8.1031 +reserve_dumppages(vaddr_t p)
  8.1032 +{
  8.1033 +
  8.1034 +	dumpspace = p;
  8.1035 +	return (p + BYTES_PER_DUMP);
  8.1036 +}
  8.1037 +
  8.1038 +void
  8.1039 +dumpsys()
  8.1040 +{
  8.1041 +	u_long totalbytesleft, bytes, i, n, memseg;
  8.1042 +	u_long maddr;
  8.1043 +	int psize;
  8.1044 +	daddr_t blkno;
  8.1045 +	const struct bdevsw *bdev;
  8.1046 +	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
  8.1047 +	int error;
  8.1048 +
  8.1049 +	/* Save registers. */
  8.1050 +	savectx(&dumppcb);
  8.1051 +
  8.1052 +	if (dumpdev == NODEV)
  8.1053 +		return;
  8.1054 +
  8.1055 +	bdev = bdevsw_lookup(dumpdev);
  8.1056 +	if (bdev == NULL || bdev->d_psize == NULL)
  8.1057 +		return;
  8.1058 +
  8.1059 +	/*
  8.1060 +	 * For dumps during autoconfiguration,
   8.1061 +	 * if the dump device has already been configured...
  8.1062 +	 */
  8.1063 +	if (dumpsize == 0)
  8.1064 +		cpu_dumpconf();
  8.1065 +	if (dumplo <= 0 || dumpsize == 0) {
  8.1066 +		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
  8.1067 +		    minor(dumpdev));
  8.1068 +		return;
  8.1069 +	}
  8.1070 +	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
  8.1071 +	    minor(dumpdev), dumplo);
  8.1072 +
  8.1073 +	psize = (*bdev->d_psize)(dumpdev);
  8.1074 +	printf("dump ");
  8.1075 +	if (psize == -1) {
  8.1076 +		printf("area unavailable\n");
  8.1077 +		return;
  8.1078 +	}
  8.1079 +
  8.1080 +#if 0	/* XXX this doesn't work.  grr. */
  8.1081 +        /* toss any characters present prior to dump */
  8.1082 +	while (sget() != NULL); /*syscons and pccons differ */
  8.1083 +#endif
  8.1084 +
  8.1085 +	if ((error = cpu_dump()) != 0)
  8.1086 +		goto err;
  8.1087 +
  8.1088 +	totalbytesleft = ptoa(cpu_dump_mempagecnt());
  8.1089 +	blkno = dumplo + cpu_dumpsize();
  8.1090 +	dump = bdev->d_dump;
  8.1091 +	error = 0;
  8.1092 +
  8.1093 +	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
  8.1094 +		maddr = mem_clusters[memseg].start;
  8.1095 +		bytes = mem_clusters[memseg].size;
  8.1096 +
  8.1097 +		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
  8.1098 +			/* Print out how many MBs we have left to go. */
  8.1099 +			if ((totalbytesleft % (1024*1024)) == 0)
  8.1100 +				printf("%ld ", totalbytesleft / (1024 * 1024));
  8.1101 +
  8.1102 +			/* Limit size for next transfer. */
  8.1103 +			n = bytes - i;
  8.1104 +			if (n > BYTES_PER_DUMP)
  8.1105 +				n = BYTES_PER_DUMP;
  8.1106 +
  8.1107 +			(void) pmap_map(dumpspace, maddr, maddr + n,
  8.1108 +			    VM_PROT_READ);
  8.1109 +
  8.1110 +			error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n);
  8.1111 +			if (error)
  8.1112 +				goto err;
  8.1113 +			maddr += n;
  8.1114 +			blkno += btodb(n);		/* XXX? */
  8.1115 +
  8.1116 +#if 0	/* XXX this doesn't work.  grr. */
  8.1117 +			/* operator aborting dump? */
  8.1118 +			if (sget() != NULL) {
  8.1119 +				error = EINTR;
  8.1120 +				break;
  8.1121 +			}
  8.1122 +#endif
  8.1123 +		}
  8.1124 +	}
  8.1125 +
  8.1126 + err:
  8.1127 +	switch (error) {
  8.1128 +
  8.1129 +	case ENXIO:
  8.1130 +		printf("device bad\n");
  8.1131 +		break;
  8.1132 +
  8.1133 +	case EFAULT:
  8.1134 +		printf("device not ready\n");
  8.1135 +		break;
  8.1136 +
  8.1137 +	case EINVAL:
  8.1138 +		printf("area improper\n");
  8.1139 +		break;
  8.1140 +
  8.1141 +	case EIO:
  8.1142 +		printf("i/o error\n");
  8.1143 +		break;
  8.1144 +
  8.1145 +	case EINTR:
  8.1146 +		printf("aborted from console\n");
  8.1147 +		break;
  8.1148 +
  8.1149 +	case 0:
  8.1150 +		printf("succeeded\n");
  8.1151 +		break;
  8.1152 +
  8.1153 +	default:
  8.1154 +		printf("error %d\n", error);
  8.1155 +		break;
  8.1156 +	}
  8.1157 +	printf("\n\n");
  8.1158 +	delay(5000000);		/* 5 seconds */
  8.1159 +}
  8.1160 +
  8.1161 +/*
  8.1162 + * Clear registers on exec
  8.1163 + */
  8.1164 +void
  8.1165 +setregs(struct lwp *l, struct exec_package *pack, u_long stack)
  8.1166 +{
  8.1167 +	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
  8.1168 +	struct pcb *pcb = &l->l_addr->u_pcb;
  8.1169 +	struct trapframe *tf;
  8.1170 +
  8.1171 +#if NNPX > 0
  8.1172 +	/* If we were using the FPU, forget about it. */
  8.1173 +	if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
  8.1174 +		npxsave_lwp(l, 0);
  8.1175 +#endif
  8.1176 +
  8.1177 +#ifdef USER_LDT
  8.1178 +	pmap_ldt_cleanup(l);
  8.1179 +#endif
  8.1180 +
  8.1181 +	l->l_md.md_flags &= ~MDL_USEDFPU;
  8.1182 +	if (i386_use_fxsave) {
  8.1183 +		pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__;
  8.1184 +		pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
  8.1185 +	} else
  8.1186 +		pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__;
  8.1187 +
  8.1188 +	tf = l->l_md.md_regs;
  8.1189 +	tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL);
  8.1190 +	tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL);
  8.1191 +	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
  8.1192 +	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
  8.1193 +	tf->tf_edi = 0;
  8.1194 +	tf->tf_esi = 0;
  8.1195 +	tf->tf_ebp = 0;
  8.1196 +	tf->tf_ebx = (int)l->l_proc->p_psstr;
  8.1197 +	tf->tf_edx = 0;
  8.1198 +	tf->tf_ecx = 0;
  8.1199 +	tf->tf_eax = 0;
  8.1200 +	tf->tf_eip = pack->ep_entry;
  8.1201 +	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
  8.1202 +	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
  8.1203 +	tf->tf_eflags = PSL_USERSET;
  8.1204 +	tf->tf_esp = stack;
  8.1205 +	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
  8.1206 +}
  8.1207 +
  8.1208 +/*
  8.1209 + * Initialize segments and descriptor tables
  8.1210 + */
  8.1211 +
  8.1212 +union	descriptor *gdt, *ldt;
  8.1213 +struct gate_descriptor *idt;
  8.1214 +char idt_allocmap[NIDT];
  8.1215 +struct simplelock idt_lock = SIMPLELOCK_INITIALIZER;
  8.1216 +#ifdef I586_CPU
  8.1217 +union	descriptor *pentium_idt;
  8.1218 +#endif
  8.1219 +extern  struct user *proc0paddr;
  8.1220 +
  8.1221 +void
  8.1222 +setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
  8.1223 +    int sel)
  8.1224 +{
  8.1225 +
  8.1226 +	gd->gd_looffset = (int)func;
  8.1227 +	gd->gd_selector = sel;
  8.1228 +	gd->gd_stkcpy = args;
  8.1229 +	gd->gd_xx = 0;
  8.1230 +	gd->gd_type = type;
  8.1231 +	gd->gd_dpl = dpl;
  8.1232 +	gd->gd_p = 1;
  8.1233 +	gd->gd_hioffset = (int)func >> 16;
  8.1234 +}
  8.1235 +
  8.1236 +void
  8.1237 +unsetgate(struct gate_descriptor *gd)
  8.1238 +{
  8.1239 +	gd->gd_p = 0;
  8.1240 +	gd->gd_hioffset = 0;
  8.1241 +	gd->gd_looffset = 0;
  8.1242 +	gd->gd_selector = 0;
  8.1243 +	gd->gd_xx = 0;
  8.1244 +	gd->gd_stkcpy = 0;
  8.1245 +	gd->gd_type = 0;
  8.1246 +	gd->gd_dpl = 0;
  8.1247 +}
  8.1248 +
  8.1249 +
  8.1250 +void
  8.1251 +setregion(struct region_descriptor *rd, void *base, size_t limit)
  8.1252 +{
  8.1253 +
  8.1254 +	rd->rd_limit = (int)limit;
  8.1255 +	rd->rd_base = (int)base;
  8.1256 +}
  8.1257 +
  8.1258 +void
  8.1259 +setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type,
  8.1260 +    int dpl, int def32, int gran)
  8.1261 +{
  8.1262 +
  8.1263 +	sd->sd_lolimit = (int)limit;
  8.1264 +	sd->sd_lobase = (int)base;
  8.1265 +	sd->sd_type = type;
  8.1266 +	sd->sd_dpl = dpl;
  8.1267 +	sd->sd_p = 1;
  8.1268 +	sd->sd_hilimit = (int)limit >> 16;
  8.1269 +	sd->sd_xx = 0;
  8.1270 +	sd->sd_def32 = def32;
  8.1271 +	sd->sd_gran = gran;
  8.1272 +	sd->sd_hibase = (int)base >> 24;
  8.1273 +}
  8.1274 +
  8.1275 +#define	IDTVEC(name)	__CONCAT(X, name)
  8.1276 +typedef void (vector)(void);
  8.1277 +extern vector IDTVEC(syscall);
  8.1278 +extern vector IDTVEC(osyscall);
  8.1279 +extern vector *IDTVEC(exceptions)[];
  8.1280 +#ifdef COMPAT_SVR4
  8.1281 +extern vector IDTVEC(svr4_fasttrap);
  8.1282 +#endif /* COMPAT_SVR4 */
  8.1283 +#ifdef COMPAT_MACH
  8.1284 +extern vector IDTVEC(mach_trap);
  8.1285 +#endif
  8.1286 +#define MAX_XEN_IDT 128
  8.1287 +trap_info_t xen_idt[MAX_XEN_IDT];
  8.1288 +int xen_idt_idx;
  8.1289 +
  8.1290 +#define	KBTOB(x)	((size_t)(x) * 1024UL)
  8.1291 +
  8.1292 +void cpu_init_idt()
  8.1293 +{
  8.1294 +	struct region_descriptor region;
  8.1295 +
  8.1296 +	panic("cpu_init_idt");
  8.1297 +#ifdef I586_CPU
  8.1298 +	setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
  8.1299 +#else
  8.1300 +	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
  8.1301 +#endif
  8.1302 +        lidt(&region);
  8.1303 +}
  8.1304 +
  8.1305 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
  8.1306 +void
  8.1307 +add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type)
  8.1308 +{
  8.1309 +	extern struct extent *iomem_ex;
  8.1310 +	int i;
  8.1311 +
  8.1312 +	if (seg_end > 0x100000000ULL) {
  8.1313 +		printf("WARNING: skipping large "
  8.1314 +		    "memory map entry: "
  8.1315 +		    "0x%qx/0x%qx/0x%x\n",
  8.1316 +		    seg_start,
  8.1317 +		    (seg_end - seg_start),
  8.1318 +		    type);
  8.1319 +		return;
  8.1320 +	}
  8.1321 +
  8.1322 +	/*
  8.1323 +	 * XXX Chop the last page off the size so that
  8.1324 +	 * XXX it can fit in avail_end.
  8.1325 +	 */
  8.1326 +	if (seg_end == 0x100000000ULL)
  8.1327 +		seg_end -= PAGE_SIZE;
  8.1328 +
  8.1329 +	if (seg_end <= seg_start)
  8.1330 +		return;
  8.1331 +
  8.1332 +	for (i = 0; i < mem_cluster_cnt; i++) {
  8.1333 +		if ((mem_clusters[i].start == round_page(seg_start))
  8.1334 +		    && (mem_clusters[i].size
  8.1335 +			    == trunc_page(seg_end) - mem_clusters[i].start)) {
  8.1336 +#ifdef DEBUG_MEMLOAD
  8.1337 +			printf("WARNING: skipping duplicate segment entry\n");
  8.1338 +#endif
  8.1339 +			return;
  8.1340 +		}
  8.1341 +	}
  8.1342 +
  8.1343 +	/*
  8.1344 +	 * Allocate the physical addresses used by RAM
  8.1345 +	 * from the iomem extent map.  This is done before
  8.1346 +	 * the addresses are page rounded just to make
  8.1347 +	 * sure we get them all.
  8.1348 +	 */
  8.1349 +	if (extent_alloc_region(iomem_ex, seg_start,
  8.1350 +	    seg_end - seg_start, EX_NOWAIT)) {
  8.1351 +		/* XXX What should we do? */
  8.1352 +		printf("WARNING: CAN'T ALLOCATE "
  8.1353 +		    "MEMORY SEGMENT "
  8.1354 +		    "(0x%qx/0x%qx/0x%x) FROM "
  8.1355 +		    "IOMEM EXTENT MAP!\n",
  8.1356 +		    seg_start, seg_end - seg_start, type);
  8.1357 +		return;
  8.1358 +	}
  8.1359 +
  8.1360 +	/*
  8.1361 +	 * If it's not free memory, skip it.
  8.1362 +	 */
  8.1363 +	if (type != BIM_Memory)
  8.1364 +		return;
  8.1365 +
  8.1366 +	/* XXX XXX XXX */
  8.1367 +	if (mem_cluster_cnt >= VM_PHYSSEG_MAX)
  8.1368 +		panic("init386: too many memory segments");
  8.1369 +
  8.1370 +	seg_start = round_page(seg_start);
  8.1371 +	seg_end = trunc_page(seg_end);
  8.1372 +
  8.1373 +	if (seg_start == seg_end)
  8.1374 +		return;
  8.1375 +
  8.1376 +	mem_clusters[mem_cluster_cnt].start = seg_start;
  8.1377 +	mem_clusters[mem_cluster_cnt].size =
  8.1378 +	    seg_end - seg_start;
  8.1379 +
  8.1380 +	if (avail_end < seg_end)
  8.1381 +		avail_end = seg_end;
  8.1382 +	physmem += atop(mem_clusters[mem_cluster_cnt].size);
  8.1383 +	mem_cluster_cnt++;
  8.1384 +}
  8.1385 +#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
  8.1386 +
  8.1387 +void
  8.1388 +initgdt()
  8.1389 +{
  8.1390 +#if !defined(XEN)
  8.1391 +	struct region_descriptor region;
  8.1392 +#else
  8.1393 +	paddr_t frames[16];
  8.1394 +#endif
  8.1395 +
  8.1396 +#if !defined(XEN)
  8.1397 +	gdt = tgdt;
  8.1398 +	memset(gdt, 0, NGDT*sizeof(*gdt));
  8.1399 +#endif
  8.1400 +	/* make gdt gates and memory segments */
  8.1401 +	setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1);
  8.1402 +	setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1);
  8.1403 +	setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
  8.1404 +	    SDT_MEMERA, SEL_UPL, 1, 1);
  8.1405 +	setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
  8.1406 +	    SDT_MEMERA, SEL_UPL, 1, 1);
  8.1407 +	setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
  8.1408 +	    SDT_MEMRWA, SEL_UPL, 1, 1);
  8.1409 +#ifdef COMPAT_MACH
  8.1410 +	setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1,
  8.1411 +	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
  8.1412 +#endif
  8.1413 +#if NBIOSCALL > 0
  8.1414 +	/* bios trampoline GDT entries */
  8.1415 +	setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0,
  8.1416 +	    0);
  8.1417 +	setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0,
  8.1418 +	    0);
  8.1419 +#endif
  8.1420 +	setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary,
  8.1421 +	    sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1);
  8.1422 +
  8.1423 +#if !defined(XEN)
  8.1424 +	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
  8.1425 +	lgdt(&region);
  8.1426 +#else
  8.1427 +	frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
  8.1428 +	/* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */
  8.1429 +	pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE,
  8.1430 +	    VM_PROT_READ);
  8.1431 +	XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
  8.1432 +	    LAST_RESERVED_GDT_ENTRY + 1));
  8.1433 +	if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1))
  8.1434 +		panic("HYPERVISOR_set_gdt failed!\n");
  8.1435 +	lgdt_finish();
  8.1436 +#endif
  8.1437 +}
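
[Editorial sketch, not part of this changeset.] initgdt() above builds the flat kernel code/data descriptors with a limit field of 0xfc3ff and page granularity rather than the usual 0xfffff, so the segments stop roughly 60 MB short of the top of the 32-bit address space (presumably keeping them out of the region reserved for the hypervisor). A small sketch of how that limit decodes, assuming 4 KiB granularity units:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long limit_field = 0xfc3ff;	/* value used in initgdt() above */
		unsigned long gran = 4096;		/* sd_gran = 1 => 4 KiB units */
		unsigned long top = (limit_field + 1) * gran - 1;

		/* Prints 0xfc3fffff: the flat segments end well below 4 GB. */
		printf("segment covers 0x00000000 - 0x%08lx\n", top);
		return 0;
	}
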
  8.1438 +
  8.1439 +void
  8.1440 +init386(paddr_t first_avail)
  8.1441 +{
  8.1442 +#if !defined(XEN)
  8.1443 +	union descriptor *tgdt;
  8.1444 +#endif
  8.1445 +	extern void consinit(void);
  8.1446 +#if !defined(XEN)
  8.1447 +	extern struct extent *iomem_ex;
  8.1448 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
  8.1449 +	struct btinfo_memmap *bim;
  8.1450 +#endif
  8.1451 +	struct region_descriptor region;
  8.1452 +#endif
  8.1453 +	int x;
  8.1454 +#if !defined(XEN)
  8.1455 +	int first16q;
  8.1456 +	u_int64_t seg_start, seg_end;
  8.1457 +	u_int64_t seg_start1, seg_end1;
  8.1458 +#endif
  8.1459 +	paddr_t realmode_reserved_start;
  8.1460 +	psize_t realmode_reserved_size;
  8.1461 +	int needs_earlier_install_pte0;
  8.1462 +#if NBIOSCALL > 0
  8.1463 +	extern int biostramp_image_size;
  8.1464 +	extern u_char biostramp_image[];
  8.1465 +#endif
  8.1466 +
  8.1467 +	XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info));
  8.1468 +#ifdef XENDEBUG_LOW
  8.1469 +	xen_dbglow_init();
  8.1470 +#endif
  8.1471 +
  8.1472 +	cpu_probe_features(&cpu_info_primary);
  8.1473 +	cpu_feature = cpu_info_primary.ci_feature_flags;
  8.1474 +
  8.1475 +	/* not on Xen... */
  8.1476 +	cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR);
  8.1477 +
  8.1478 +	lwp0.l_addr = proc0paddr;
  8.1479 +	cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;
  8.1480 +
  8.1481 +	XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n",
  8.1482 +	    proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail));
  8.1483 +	XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr,
  8.1484 +		      (void *)atdevbase));
  8.1485 +
  8.1486 +	x86_bus_space_init();
  8.1487 +	consinit();	/* XXX SHOULD NOT BE DONE HERE */
  8.1488 +	/*
   8.1489 +	 * Initialize PAGE_SIZE-dependent variables.
  8.1490 +	 */
  8.1491 +	uvm_setpagesize();
  8.1492 +
  8.1493 +	/*
  8.1494 +	 * Saving SSE registers won't work if the save area isn't
  8.1495 +	 * 16-byte aligned.
  8.1496 +	 */
  8.1497 +	if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf)
  8.1498 +		panic("init386: pcb_savefpu not 16-byte aligned");
  8.1499 +
  8.1500 +	/*
  8.1501 +	 * Start with 2 color bins -- this is just a guess to get us
  8.1502 +	 * started.  We'll recolor when we determine the largest cache
  8.1503 +	 * sizes on the system.
  8.1504 +	 */
  8.1505 +	uvmexp.ncolors = 2;
  8.1506 +
  8.1507 +#if !defined(XEN)
  8.1508 +	/*
   8.1509 +	 * BIOS leaves data in physical page 0.
  8.1510 +	 * Even if it didn't, our VM system doesn't like using zero as a
  8.1511 +	 * physical page number.
  8.1512 +	 * We may also need pages in low memory (one each) for secondary CPU
  8.1513 +	 * startup, for BIOS calls, and for ACPI, plus a page table page to map
  8.1514 +	 * them into the first few pages of the kernel's pmap.
  8.1515 +	 */
  8.1516 +	avail_start = PAGE_SIZE;
  8.1517 +#else
  8.1518 +	/* Make sure the end of the space used by the kernel is rounded. */
  8.1519 +	first_avail = round_page(first_avail);
  8.1520 +	avail_start = first_avail - KERNBASE;
  8.1521 +	avail_end = ptoa(xen_start_info.nr_pages) +
  8.1522 +		(KERNTEXTOFF - KERNBASE_LOCORE);
  8.1523 +	pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE);
  8.1524 +	pmap_pa_end = avail_end;
  8.1525 +	mem_clusters[0].start = avail_start;
  8.1526 +	mem_clusters[0].size = avail_end - avail_start;
  8.1527 +	mem_cluster_cnt++;
  8.1528 +	physmem += atop(mem_clusters[0].size);
  8.1529 +#endif
  8.1530 +
  8.1531 +	/*
  8.1532 +	 * reserve memory for real-mode call
  8.1533 +	 */
  8.1534 +	needs_earlier_install_pte0 = 0;
  8.1535 +	realmode_reserved_start = 0;
  8.1536 +	realmode_reserved_size = 0;
  8.1537 +#if NBIOSCALL > 0
  8.1538 +	/* save us a page for trampoline code */
  8.1539 +	realmode_reserved_size += PAGE_SIZE;
  8.1540 +	needs_earlier_install_pte0 = 1;
  8.1541 +#endif
  8.1542 +#ifdef MULTIPROCESSOR						 /* XXX */
  8.1543 +#if !defined(XEN)
  8.1544 +	KASSERT(avail_start == PAGE_SIZE);			 /* XXX */
  8.1545 +#endif
  8.1546 +	if (realmode_reserved_size < MP_TRAMPOLINE)		 /* XXX */
  8.1547 +		realmode_reserved_size = MP_TRAMPOLINE;		 /* XXX */
  8.1548 +	needs_earlier_install_pte0 = 1;				 /* XXX */
  8.1549 +#endif								 /* XXX */
  8.1550 +#if NACPI > 0
  8.1551 +	/* trampoline code for wake handler */
  8.1552 +	realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1);
  8.1553 +	needs_earlier_install_pte0 = 1;
  8.1554 +#endif
  8.1555 +	if (needs_earlier_install_pte0) {
  8.1556 +		/* page table for directory entry 0 */
  8.1557 +		realmode_reserved_size += PAGE_SIZE;
  8.1558 +	}
  8.1559 +	if (realmode_reserved_size>0) {
  8.1560 +		realmode_reserved_start = avail_start;
  8.1561 +		avail_start += realmode_reserved_size;
  8.1562 +	}
  8.1563 +
  8.1564 +#ifdef DEBUG_MEMLOAD
  8.1565 +	printf("mem_cluster_count: %d\n", mem_cluster_cnt);
  8.1566 +#endif
  8.1567 +
  8.1568 +	/*
  8.1569 +	 * Call pmap initialization to make new kernel address space.
  8.1570 +	 * We must do this before loading pages into the VM system.
  8.1571 +	 */
  8.1572 +	pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);
  8.1573 +
  8.1574 +#if !defined(XEN)
  8.1575 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
  8.1576 +	/*
  8.1577 +	 * Check to see if we have a memory map from the BIOS (passed
   8.1578 +	 * to us by the boot program).
  8.1579 +	 */
  8.1580 +	bim = lookup_bootinfo(BTINFO_MEMMAP);
  8.1581 +	if (bim != NULL && bim->num > 0) {
  8.1582 +#ifdef DEBUG_MEMLOAD
  8.1583 +		printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num);
  8.1584 +#endif
  8.1585 +		for (x = 0; x < bim->num; x++) {
  8.1586 +#ifdef DEBUG_MEMLOAD
  8.1587 +			printf("    addr 0x%qx  size 0x%qx  type 0x%x\n",
  8.1588 +			    bim->entry[x].addr,
  8.1589 +			    bim->entry[x].size,
  8.1590 +			    bim->entry[x].type);
  8.1591 +#endif
  8.1592 +
  8.1593 +			/*
  8.1594 +			 * If the segment is not memory, skip it.
  8.1595 +			 */
  8.1596 +			switch (bim->entry[x].type) {
  8.1597 +			case BIM_Memory:
  8.1598 +			case BIM_ACPI:
  8.1599 +			case BIM_NVS:
  8.1600 +				break;
  8.1601 +			default:
  8.1602 +				continue;
  8.1603 +			}
  8.1604 +
  8.1605 +			/*
  8.1606 +			 * Sanity check the entry.
  8.1607 +			 * XXX Need to handle uint64_t in extent code
  8.1608 +			 * XXX and 64-bit physical addresses in i386
  8.1609 +			 * XXX port.
  8.1610 +			 */
  8.1611 +			seg_start = bim->entry[x].addr;
  8.1612 +			seg_end = bim->entry[x].addr + bim->entry[x].size;
  8.1613 +
  8.1614 +			/*
  8.1615 +			 *   Avoid Compatibility Holes.
  8.1616 +			 * XXX  Holes within memory space that allow access
  8.1617 +			 * XXX to be directed to the PC-compatible frame buffer
   8.1618 +			 * XXX (0xa0000-0xbffff), to adapter ROM space
  8.1619 +			 * XXX (0xc0000-0xdffff), and to system BIOS space
  8.1620 +			 * XXX (0xe0000-0xfffff).
   8.1621 +			 * XXX  Some laptops (for example, the Toshiba
   8.1622 +			 * XXX  Satellite 2550X) report this area and problems
   8.1623 +			 * XXX  have occurred, so we avoid it.
  8.1624 +			 */
  8.1625 +			if (seg_start < 0x100000 && seg_end > 0xa0000) {
  8.1626 +				printf("WARNING: memory map entry overlaps "
  8.1627 +				    "with ``Compatibility Holes'': "
  8.1628 +				    "0x%qx/0x%qx/0x%x\n", seg_start,
  8.1629 +				    seg_end - seg_start, bim->entry[x].type);
  8.1630 +				add_mem_cluster(seg_start, 0xa0000,
  8.1631 +				    bim->entry[x].type);
  8.1632 +				add_mem_cluster(0x100000, seg_end,
  8.1633 +				    bim->entry[x].type);
  8.1634 +			} else
  8.1635 +				add_mem_cluster(seg_start, seg_end,
  8.1636 +				    bim->entry[x].type);
  8.1637 +		}
  8.1638 +	}
  8.1639 +#endif /* ! REALBASEMEM && ! REALEXTMEM */
  8.1640 +	/*
  8.1641 +	 * If the loop above didn't find any valid segment, fall back to
   8.1642 +	 * the old code that uses biosbasemem and biosextmem.
  8.1643 +	 */
  8.1644 +	if (mem_cluster_cnt == 0) {
  8.1645 +		/*
  8.1646 +		 * Allocate the physical addresses used by RAM from the iomem
  8.1647 +		 * extent map.  This is done before the addresses are
  8.1648 +		 * page rounded just to make sure we get them all.
  8.1649 +		 */
  8.1650 +		if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem),
  8.1651 +		    EX_NOWAIT)) {
  8.1652 +			/* XXX What should we do? */
  8.1653 +			printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
  8.1654 +			    "IOMEM EXTENT MAP!\n");
  8.1655 +		}
  8.1656 +		mem_clusters[0].start = 0;
  8.1657 +		mem_clusters[0].size = trunc_page(KBTOB(biosbasemem));
  8.1658 +		physmem += atop(mem_clusters[0].size);
  8.1659 +		if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
  8.1660 +		    EX_NOWAIT)) {
  8.1661 +			/* XXX What should we do? */
  8.1662 +			printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
  8.1663 +			    "IOMEM EXTENT MAP!\n");
  8.1664 +		}
  8.1665 +#if NISADMA > 0
  8.1666 +		/*
  8.1667 +		 * Some motherboards/BIOSes remap the 384K of RAM that would
  8.1668 +		 * normally be covered by the ISA hole to the end of memory
  8.1669 +		 * so that it can be used.  However, on a 16M system, this
  8.1670 +		 * would cause bounce buffers to be allocated and used.
  8.1671 +		 * This is not desirable behaviour, as more than 384K of
  8.1672 +		 * bounce buffers might be allocated.  As a work-around,
  8.1673 +		 * we round memory down to the nearest 1M boundary if
  8.1674 +		 * we're using any isadma devices and the remapped memory
  8.1675 +		 * is what puts us over 16M.
  8.1676 +		 */
  8.1677 +		if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
  8.1678 +			char pbuf[9];
  8.1679 +
  8.1680 +			format_bytes(pbuf, sizeof(pbuf),
  8.1681 +			    biosextmem - (15*1024));
  8.1682 +			printf("Warning: ignoring %s of remapped memory\n",
  8.1683 +			    pbuf);
  8.1684 +			biosextmem = (15*1024);
  8.1685 +		}
  8.1686 +#endif
  8.1687 +		mem_clusters[1].start = IOM_END;
  8.1688 +		mem_clusters[1].size = trunc_page(KBTOB(biosextmem));
  8.1689 +		physmem += atop(mem_clusters[1].size);
  8.1690 +
  8.1691 +		mem_cluster_cnt = 2;
  8.1692 +
  8.1693 +		avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
  8.1694 +	}
  8.1695 +	/*
  8.1696 +	 * If we have 16M of RAM or less, just put it all on
  8.1697 +	 * the default free list.  Otherwise, put the first
  8.1698 +	 * 16M of RAM on a lower priority free list (so that
  8.1699 +	 * all of the ISA DMA'able memory won't be eaten up
  8.1700 +	 * first-off).
  8.1701 +	 */
  8.1702 +	if (avail_end <= (16 * 1024 * 1024))
  8.1703 +		first16q = VM_FREELIST_DEFAULT;
  8.1704 +	else
  8.1705 +		first16q = VM_FREELIST_FIRST16;
  8.1706 +
  8.1707 +	/* Make sure the end of the space used by the kernel is rounded. */
  8.1708 +	first_avail = round_page(first_avail);
  8.1709 +#endif
  8.1710 +
  8.1711 +	XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n",
  8.1712 +	    (void *)avail_start, (int)atop(avail_start),
  8.1713 +	    (void *)avail_end, (int)atop(avail_end)));
  8.1714 +	uvm_page_physload(atop(avail_start), atop(avail_end),
  8.1715 +	    atop(avail_start), atop(avail_end),
  8.1716 +	    VM_FREELIST_DEFAULT);
  8.1717 +
  8.1718 +#if !defined(XEN)
  8.1719 +
  8.1720 +	/*
  8.1721 +	 * Now, load the memory clusters (which have already been
  8.1722 +	 * rounded and truncated) into the VM system.
  8.1723 +	 *
  8.1724 +	 * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL
  8.1725 +	 * IS LOADED AT IOM_END (1M).
  8.1726 +	 */
  8.1727 +	for (x = 0; x < mem_cluster_cnt; x++) {
  8.1728 +		seg_start = mem_clusters[x].start;
  8.1729 +		seg_end = mem_clusters[x].start + mem_clusters[x].size;
  8.1730 +		seg_start1 = 0;
  8.1731 +		seg_end1 = 0;
  8.1732 +
  8.1733 +		/*
  8.1734 +		 * Skip memory before our available starting point.
  8.1735 +		 */
  8.1736 +		if (seg_end <= avail_start)
  8.1737 +			continue;
  8.1738 +
  8.1739 +		if (avail_start >= seg_start && avail_start < seg_end) {
  8.1740 +			if (seg_start != 0)
  8.1741 +				panic("init386: memory doesn't start at 0");
  8.1742 +			seg_start = avail_start;
  8.1743 +			if (seg_start == seg_end)
  8.1744 +				continue;
  8.1745 +		}
  8.1746 +
  8.1747 +		/*
  8.1748 +		 * If this segment contains the kernel, split it
  8.1749 +		 * in two, around the kernel.
  8.1750 +		 */
  8.1751 +		if (seg_start <= IOM_END && first_avail <= seg_end) {
  8.1752 +			seg_start1 = first_avail;
  8.1753 +			seg_end1 = seg_end;
  8.1754 +			seg_end = IOM_END;
  8.1755 +		}
  8.1756 +
  8.1757 +		/* First hunk */
  8.1758 +		if (seg_start != seg_end) {
  8.1759 +			if (seg_start < (16 * 1024 * 1024) &&
  8.1760 +			    first16q != VM_FREELIST_DEFAULT) {
  8.1761 +				u_int64_t tmp;
  8.1762 +
  8.1763 +				if (seg_end > (16 * 1024 * 1024))
  8.1764 +					tmp = (16 * 1024 * 1024);
  8.1765 +				else
  8.1766 +					tmp = seg_end;
  8.1767 +
  8.1768 +				if (tmp != seg_start) {
  8.1769 +#ifdef DEBUG_MEMLOAD
  8.1770 +					printf("loading 0x%qx-0x%qx "
  8.1771 +					    "(0x%lx-0x%lx)\n",
  8.1772 +				    	    seg_start, tmp,
  8.1773 +				  	    atop(seg_start), atop(tmp));
  8.1774 +#endif
  8.1775 +					uvm_page_physload(atop(seg_start),
  8.1776 +				    	    atop(tmp), atop(seg_start),
  8.1777 +				    	    atop(tmp), first16q);
  8.1778 +				}
  8.1779 +				seg_start = tmp;
  8.1780 +			}
  8.1781 +
  8.1782 +			if (seg_start != seg_end) {
  8.1783 +#ifdef DEBUG_MEMLOAD
  8.1784 +				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
  8.1785 +				    seg_start, seg_end,
  8.1786 +				    atop(seg_start), atop(seg_end));
  8.1787 +#endif
  8.1788 +				uvm_page_physload(atop(seg_start),
  8.1789 +				    atop(seg_end), atop(seg_start),
  8.1790 +				    atop(seg_end), VM_FREELIST_DEFAULT);
  8.1791 +			}
  8.1792 +		}
  8.1793 +
  8.1794 +		/* Second hunk */
  8.1795 +		if (seg_start1 != seg_end1) {
  8.1796 +			if (seg_start1 < (16 * 1024 * 1024) &&
  8.1797 +			    first16q != VM_FREELIST_DEFAULT) {
  8.1798 +				u_int64_t tmp;
  8.1799 +
  8.1800 +				if (seg_end1 > (16 * 1024 * 1024))
  8.1801 +					tmp = (16 * 1024 * 1024);
  8.1802 +				else
  8.1803 +					tmp = seg_end1;
  8.1804 +
  8.1805 +				if (tmp != seg_start1) {
  8.1806 +#ifdef DEBUG_MEMLOAD
  8.1807 +					printf("loading 0x%qx-0x%qx "
  8.1808 +					    "(0x%lx-0x%lx)\n",
  8.1809 +				    	    seg_start1, tmp,
  8.1810 +				    	    atop(seg_start1), atop(tmp));
  8.1811 +#endif
  8.1812 +					uvm_page_physload(atop(seg_start1),
  8.1813 +				    	    atop(tmp), atop(seg_start1),
  8.1814 +				    	    atop(tmp), first16q);
  8.1815 +				}
  8.1816 +				seg_start1 = tmp;
  8.1817 +			}
  8.1818 +
  8.1819 +			if (seg_start1 != seg_end1) {
  8.1820 +#ifdef DEBUG_MEMLOAD
  8.1821 +				printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
  8.1822 +				    seg_start1, seg_end1,
  8.1823 +				    atop(seg_start1), atop(seg_end1));
  8.1824 +#endif
  8.1825 +				uvm_page_physload(atop(seg_start1),
  8.1826 +				    atop(seg_end1), atop(seg_start1),
  8.1827 +				    atop(seg_end1), VM_FREELIST_DEFAULT);
  8.1828 +			}
  8.1829 +		}
  8.1830 +	}
  8.1831 +#endif
  8.1832 +
  8.1833 +	/*
  8.1834 +	 * Steal memory for the message buffer (at end of core).
  8.1835 +	 */
  8.1836 +	{
  8.1837 +		struct vm_physseg *vps;
  8.1838 +		psize_t sz = round_page(MSGBUFSIZE);
  8.1839 +		psize_t reqsz = sz;
  8.1840 +
  8.1841 +		for (x = 0; x < vm_nphysseg; x++) {
  8.1842 +			vps = &vm_physmem[x];
  8.1843 +			if (ptoa(vps->avail_end) == avail_end)
  8.1844 +				goto found;
  8.1845 +		}
  8.1846 +		panic("init386: can't find end of memory");
  8.1847 +
  8.1848 +	found:
  8.1849 +		/* Shrink so it'll fit in the last segment. */
  8.1850 +		if ((vps->avail_end - vps->avail_start) < atop(sz))
  8.1851 +			sz = ptoa(vps->avail_end - vps->avail_start);
  8.1852 +
  8.1853 +		vps->avail_end -= atop(sz);
  8.1854 +		vps->end -= atop(sz);
  8.1855 +		msgbuf_paddr = ptoa(vps->avail_end);
  8.1856 +
  8.1857 +		/* Remove the last segment if it now has no pages. */
  8.1858 +		if (vps->start == vps->end) {
  8.1859 +			for (vm_nphysseg--; x < vm_nphysseg; x++)
  8.1860 +				vm_physmem[x] = vm_physmem[x + 1];
  8.1861 +		}
  8.1862 +
  8.1863 +		/* Now find where the new avail_end is. */
  8.1864 +		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
  8.1865 +			if (vm_physmem[x].avail_end > avail_end)
  8.1866 +				avail_end = vm_physmem[x].avail_end;
  8.1867 +		avail_end = ptoa(avail_end);
  8.1868 +
  8.1869 +		/* Warn if the message buffer had to be shrunk. */
  8.1870 +		if (sz != reqsz)
  8.1871 +			printf("WARNING: %ld bytes not available for msgbuf "
  8.1872 +			    "in last cluster (%ld used)\n", reqsz, sz);
  8.1873 +	}
  8.1874 +
  8.1875 +	/*
  8.1876 +	 * install PT page for the first 4M if needed.
  8.1877 +	 */
  8.1878 +	if (needs_earlier_install_pte0) {
  8.1879 +		paddr_t paddr;
  8.1880 +#ifdef DIAGNOSTIC
  8.1881 +		if (realmode_reserved_size < PAGE_SIZE) {
  8.1882 +			panic("cannot steal memory for first 4M PT page.");
  8.1883 +		}
  8.1884 +#endif
   8.1885 +		paddr = realmode_reserved_start + realmode_reserved_size - PAGE_SIZE;
  8.1886 +		pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr,
  8.1887 +			   VM_PROT_READ|VM_PROT_WRITE,
  8.1888 +			   PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
  8.1889 +		pmap_update(pmap_kernel());
  8.1890 +		/* make sure it is clean before using */
  8.1891 +		memset(vtopte(0), 0, PAGE_SIZE);
  8.1892 +		realmode_reserved_size -= PAGE_SIZE;
  8.1893 +	}
  8.1894 +
  8.1895 +#if NBIOSCALL > 0
  8.1896 +	/*
  8.1897 +	 * this should be caught at kernel build time, but put it here
  8.1898 +	 * in case someone tries to fake it out...
  8.1899 +	 */
  8.1900 +#ifdef DIAGNOSTIC
  8.1901 +	if (realmode_reserved_start > BIOSTRAMP_BASE ||
  8.1902 +	    (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+
  8.1903 +							       PAGE_SIZE)) {
  8.1904 +	    panic("cannot steal memory for PT page of bioscall.");
  8.1905 +	}
  8.1906 +	if (biostramp_image_size > PAGE_SIZE)
  8.1907 +	    panic("biostramp_image_size too big: %x vs. %x",
  8.1908 +		  biostramp_image_size, PAGE_SIZE);
  8.1909 +#endif
  8.1910 +	pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE,	/* virtual */
  8.1911 +		       (paddr_t)BIOSTRAMP_BASE,	/* physical */
  8.1912 +		       VM_PROT_ALL);		/* protection */
  8.1913 +	pmap_update(pmap_kernel());
  8.1914 +	memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
  8.1915 +#ifdef DEBUG_BIOSCALL
  8.1916 +	printf("biostramp installed @ %x\n", BIOSTRAMP_BASE);
  8.1917 +#endif
  8.1918 +	realmode_reserved_size  -= PAGE_SIZE;
  8.1919 +	realmode_reserved_start += PAGE_SIZE;
  8.1920 +#endif
  8.1921 +
  8.1922 +#if NACPI > 0
  8.1923 +	/*
  8.1924 +	 * Steal memory for the acpi wake code
  8.1925 +	 */
  8.1926 +	{
  8.1927 +		paddr_t paddr, p;
  8.1928 +		psize_t sz;
  8.1929 +		int npg;
  8.1930 +
  8.1931 +		paddr = realmode_reserved_start;
  8.1932 +		npg = acpi_md_get_npages_of_wakecode();
  8.1933 +		sz = ptoa(npg);
  8.1934 +#ifdef DIAGNOSTIC
  8.1935 +		if (realmode_reserved_size < sz) {
  8.1936 +			panic("cannot steal memory for ACPI wake code.");
  8.1937 +		}
  8.1938 +#endif
  8.1939 +
  8.1940 +		/* identical mapping */
  8.1941 +		p = paddr;
  8.1942 +		for (x=0; x<npg; x++) {
  8.1943 +			printf("kenter: 0x%08X\n", (unsigned)p);
  8.1944 +			pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL);
  8.1945 +			p += PAGE_SIZE;
  8.1946 +		}
  8.1947 +		pmap_update(pmap_kernel());
  8.1948 +
  8.1949 +		acpi_md_install_wakecode(paddr);
  8.1950 +
  8.1951 +		realmode_reserved_size  -= sz;
  8.1952 +		realmode_reserved_start += sz;
  8.1953 +	}
  8.1954 +#endif
  8.1955 +
  8.1956 +	pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr,
  8.1957 +	    VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
  8.1958 +	pmap_update(pmap_kernel());
  8.1959 +	memset((void *)idt_vaddr, 0, PAGE_SIZE);
  8.1960 +
  8.1961 +#if !defined(XEN)
  8.1962 +	idt = (struct gate_descriptor *)idt_vaddr;
  8.1963 +#ifdef I586_CPU
  8.1964 +	pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr,
  8.1965 +	    VM_PROT_READ, PMAP_WIRED|VM_PROT_READ);
  8.1966 +	pentium_idt = (union descriptor *)pentium_idt_vaddr;
  8.1967 +#endif
  8.1968 +#endif
  8.1969 +	pmap_update(pmap_kernel());
  8.1970 +
  8.1971 +	initgdt();
  8.1972 +
  8.1973 +	HYPERVISOR_set_callbacks(
  8.1974 +		GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
  8.1975 +		GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
  8.1976 +
  8.1977 +#if !defined(XEN)
  8.1978 +	tgdt = gdt;
  8.1979 +	gdt = (union descriptor *)
  8.1980 +		    ((char *)idt + NIDT * sizeof (struct gate_descriptor));
  8.1981 +	ldt = gdt + NGDT;
  8.1982 +
  8.1983 +	memcpy(gdt, tgdt, NGDT*sizeof(*gdt));
  8.1984 +
  8.1985 +	setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1,
  8.1986 +	    SDT_SYSLDT, SEL_KPL, 0, 0);
  8.1987 +#else
  8.1988 +	ldt = (union descriptor *)idt_vaddr;
  8.1989 +#endif
  8.1990 +
  8.1991 +	/* make ldt gates and memory segments */
  8.1992 +	setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1,
  8.1993 +	    SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
  8.1994 +
  8.1995 +	ldt[LUCODE_SEL] = gdt[GUCODE_SEL];
  8.1996 +	ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL];
  8.1997 +	ldt[LUDATA_SEL] = gdt[GUDATA_SEL];
  8.1998 +	ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
  8.1999 +
  8.2000 +#if !defined(XEN)
  8.2001 +	/* exceptions */
  8.2002 +	for (x = 0; x < 32; x++) {
  8.2003 +		setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT,
  8.2004 +		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
  8.2005 +		    GSEL(GCODE_SEL, SEL_KPL));
  8.2006 +		idt_allocmap[x] = 1;
  8.2007 +	}
  8.2008 +
  8.2009 +	/* new-style interrupt gate for syscalls */
  8.2010 +	setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL,
  8.2011 +	    GSEL(GCODE_SEL, SEL_KPL));
  8.2012 +	idt_allocmap[128] = 1;
  8.2013 +#ifdef COMPAT_SVR4
  8.2014 +	setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT,
  8.2015 +	    SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
  8.2016 +	idt_allocmap[0xd2] = 1;
  8.2017 +#endif /* COMPAT_SVR4 */
  8.2018 +#endif
  8.2019 +
  8.2020 +	memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT);
  8.2021 +	xen_idt_idx = 0;
  8.2022 +	for (x = 0; x < 32; x++) {
  8.2023 +		KASSERT(xen_idt_idx < MAX_XEN_IDT);
  8.2024 +		xen_idt[xen_idt_idx].vector = x;
  8.2025 +		xen_idt[xen_idt_idx].flags =
  8.2026 +			(x == 3 || x == 4) ? SEL_UPL : SEL_XEN;
  8.2027 +		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
  8.2028 +		xen_idt[xen_idt_idx].address =
  8.2029 +			(uint32_t)IDTVEC(exceptions)[x];
  8.2030 +		xen_idt_idx++;
  8.2031 +	}
  8.2032 +	KASSERT(xen_idt_idx < MAX_XEN_IDT);
  8.2033 +	xen_idt[xen_idt_idx].vector = 128;
  8.2034 +	xen_idt[xen_idt_idx].flags = SEL_UPL;
  8.2035 +	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
  8.2036 +	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall);
  8.2037 +	xen_idt_idx++;
  8.2038 +#ifdef COMPAT_SVR4
  8.2039 +	KASSERT(xen_idt_idx < MAX_XEN_IDT);
  8.2040 +	xen_idt[xen_idt_idx].vector = 0xd2;
  8.2041 +	xen_idt[xen_idt_idx].flags = SEL_UPL;
  8.2042 +	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
  8.2043 +	xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap);
  8.2044 +	xen_idt_idx++;
  8.2045 +#endif /* COMPAT_SVR4 */
  8.2046 +
  8.2047 +#if !defined(XEN)
  8.2048 +	setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
  8.2049 +	lgdt(&region);
  8.2050 +#else
  8.2051 +	lldt(GSEL(GLDT_SEL, SEL_KPL));
  8.2052 +#endif
  8.2053 +
  8.2054 +#if !defined(XEN)
  8.2055 +	cpu_init_idt();
  8.2056 +#else
  8.2057 +	db_trap_callback = ddb_trap_hook;
  8.2058 +
  8.2059 +	XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt));
  8.2060 +	if (HYPERVISOR_set_trap_table(xen_idt))
  8.2061 +		panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt);
  8.2062 +#endif
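
[Editorial sketch, not part of this changeset.] Instead of loading an IDT with lidt(), the Xen path above describes each trap in a trap_info_t record (vector, privilege flags, code selector, handler address) and hands the whole xen_idt[] array to HYPERVISOR_set_trap_table(). The repeated four-assignment pattern could be factored into a small helper along these lines, using only the fields shown above (xen_idt_add is a hypothetical name):

	/*
	 * Hypothetical helper mirroring the pattern used above: append one
	 * entry to xen_idt[] and bump xen_idt_idx.  Sketch only.
	 */
	static void
	xen_idt_add(int vector, int flags, void (*handler)(void))
	{
		KASSERT(xen_idt_idx < MAX_XEN_IDT);
		xen_idt[xen_idt_idx].vector = vector;
		xen_idt[xen_idt_idx].flags = flags;
		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
		xen_idt[xen_idt_idx].address = (uint32_t)handler;
		xen_idt_idx++;
	}

	/* e.g. xen_idt_add(128, SEL_UPL, &IDTVEC(syscall)); */
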
  8.2063 +
  8.2064 +#if NKSYMS || defined(DDB) || defined(LKM)
  8.2065 +	{
  8.2066 +		extern int end;
  8.2067 +		extern int *esym;
  8.2068 +		struct btinfo_symtab *symtab;
  8.2069 +
  8.2070 +#ifdef DDB
  8.2071 +		db_machine_init();
  8.2072 +#endif
  8.2073 +
  8.2074 +		symtab = lookup_bootinfo(BTINFO_SYMTAB);
  8.2075 +
  8.2076 +		if (symtab) {
  8.2077 +			symtab->ssym += KERNBASE;
  8.2078 +			symtab->esym += KERNBASE;
  8.2079 +			ksyms_init(symtab->nsym, (int *)symtab->ssym,
  8.2080 +			    (int *)symtab->esym);
  8.2081 +		}
  8.2082 +		else
  8.2083 +			ksyms_init(*(int *)&end, ((int *)&end) + 1, esym);
  8.2084 +	}
  8.2085 +#endif
  8.2086 +#ifdef DDB
  8.2087 +	if (boothowto & RB_KDB)
  8.2088 +		Debugger();
  8.2089 +#endif
  8.2090 +#ifdef IPKDB
  8.2091 +	ipkdb_init();
  8.2092 +	if (boothowto & RB_KDB)
  8.2093 +		ipkdb_connect(0);
  8.2094 +#endif
  8.2095 +#ifdef KGDB
  8.2096 +	kgdb_port_init();
  8.2097 +	if (boothowto & RB_KDB) {
  8.2098 +		kgdb_debug_init = 1;
  8.2099 +		kgdb_connect(1);
  8.2100 +	}
  8.2101 +#endif
  8.2102 +
  8.2103 +#if NMCA > 0
  8.2104 +	/* check for MCA bus, needed to be done before ISA stuff - if
  8.2105 +	 * MCA is detected, ISA needs to use level triggered interrupts
  8.2106 +	 * by default */
  8.2107 +	mca_busprobe();
  8.2108 +#endif
  8.2109 +
  8.2110 +#if defined(XEN)
  8.2111 +	events_default_setup();
  8.2112 +#else
  8.2113 +	intr_default_setup();
  8.2114 +#endif
  8.2115 +
  8.2116 +	/* Initialize software interrupts. */
  8.2117 +	softintr_init();
  8.2118 +
  8.2119 +	splraise(IPL_IPI);
  8.2120 +	enable_intr();
  8.2121 +
  8.2122 +	if (physmem < btoc(2 * 1024 * 1024)) {
  8.2123 +		printf("warning: too little memory available; "
  8.2124 +		       "have %lu bytes, want %lu bytes\n"
  8.2125 +		       "running in degraded mode\n"
  8.2126 +		       "press a key to confirm\n\n",
  8.2127 +		       ptoa(physmem), 2*1024*1024UL);
  8.2128 +		cngetc();
  8.2129 +	}
  8.2130 +
  8.2131 +#ifdef __HAVE_CPU_MAXPROC
  8.2132 +	/* Make sure maxproc is sane */
  8.2133 +	if (maxproc > cpu_maxproc())
  8.2134 +		maxproc = cpu_maxproc();
  8.2135 +#endif
  8.2136 +}
  8.2137 +
  8.2138 +#ifdef COMPAT_NOMID
  8.2139 +static int
  8.2140 +exec_nomid(struct proc *p, struct exec_package *epp)
  8.2141 +{
  8.2142 +	int error;
  8.2143 +	u_long midmag, magic;
  8.2144 +	u_short mid;
  8.2145 +	struct exec *execp = epp->ep_hdr;
  8.2146 +
  8.2147 +	/* check on validity of epp->ep_hdr performed by exec_out_makecmds */
  8.2148 +
  8.2149 +	midmag = ntohl(execp->a_midmag);
  8.2150 +	mid = (midmag >> 16) & 0xffff;
  8.2151 +	magic = midmag & 0xffff;
  8.2152 +
  8.2153 +	if (magic == 0) {
  8.2154 +		magic = (execp->a_midmag & 0xffff);
  8.2155 +		mid = MID_ZERO;
  8.2156 +	}
  8.2157 +
  8.2158 +	midmag = mid << 16 | magic;
  8.2159 +
  8.2160 +	switch (midmag) {
  8.2161 +	case (MID_ZERO << 16) | ZMAGIC:
  8.2162 +		/*
  8.2163 +		 * 386BSD's ZMAGIC format:
  8.2164 +		 */
  8.2165 +		error = exec_aout_prep_oldzmagic(p, epp);
  8.2166 +		break;
  8.2167 +
  8.2168 +	case (MID_ZERO << 16) | QMAGIC:
  8.2169 +		/*
  8.2170 +		 * BSDI's QMAGIC format:
  8.2171 +		 * same as new ZMAGIC format, but with different magic number
  8.2172 +		 */
  8.2173 +		error = exec_aout_prep_zmagic(p, epp);
  8.2174 +		break;
  8.2175 +
  8.2176 +	case (MID_ZERO << 16) | NMAGIC:
  8.2177 +		/*
  8.2178 +		 * BSDI's NMAGIC format:
  8.2179 +		 * same as NMAGIC format, but with different magic number
  8.2180 +		 * and with text starting at 0.
  8.2181 +		 */
  8.2182 +		error = exec_aout_prep_oldnmagic(p, epp);
  8.2183 +		break;
  8.2184 +
  8.2185 +	case (MID_ZERO << 16) | OMAGIC:
  8.2186 +		/*
  8.2187 +		 * BSDI's OMAGIC format:
  8.2188 +		 * same as OMAGIC format, but with different magic number
  8.2189 +		 * and with text starting at 0.
  8.2190 +		 */
  8.2191 +		error = exec_aout_prep_oldomagic(p, epp);
  8.2192 +		break;
  8.2193 +
  8.2194 +	default:
  8.2195 +		error = ENOEXEC;
  8.2196 +	}
  8.2197 +
  8.2198 +	return error;
  8.2199 +}
  8.2200 +#endif
  8.2201 +
  8.2202 +/*
  8.2203 + * cpu_exec_aout_makecmds():
  8.2204 + *	CPU-dependent a.out format hook for execve().
  8.2205 + *
   8.2206 + * Determine whether the given exec package refers to something which we
  8.2207 + * understand and, if so, set up the vmcmds for it.
  8.2208 + *
  8.2209 + * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries
   8.2210 + * are understood if COMPAT_NOMID is given as a kernel option.
  8.2211 + */
  8.2212 +int
  8.2213 +cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp)
  8.2214 +{
  8.2215 +	int error = ENOEXEC;
  8.2216 +
  8.2217 +#ifdef COMPAT_NOMID
  8.2218 +	if ((error = exec_nomid(p, epp)) == 0)
  8.2219 +		return error;
   8.2220 +#endif /* COMPAT_NOMID */
  8.2221 +
  8.2222 +	return error;
  8.2223 +}
  8.2224 +
  8.2225 +void *
  8.2226 +lookup_bootinfo(int type)
  8.2227 +{
   8.2228 +	struct btinfo_common *help;
   8.2229 +	int n = *(int *)bootinfo;
   8.2230 +	help = (struct btinfo_common *)(bootinfo + sizeof(int));
   8.2231 +	while (n--) {
   8.2232 +		if (help->type == type)
   8.2233 +			return (help);
   8.2234 +		help = (struct btinfo_common *)((char *)help + help->len);
   8.2235 +	}
   8.2236 +	return (0);
  8.2237 +}
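
[Editorial note, not part of this changeset.] lookup_bootinfo() walks the bootinfo area — an entry count followed by variable-length records, each starting with a btinfo_common header carrying type and len — and returns a pointer to the first record of the requested type, or 0 if none is present. Typical use, as in init386() above (callers must check for a null return):

	struct btinfo_memmap *bim;

	bim = lookup_bootinfo(BTINFO_MEMMAP);
	if (bim != NULL && bim->num > 0) {
		/* walk bim->entry[0 .. bim->num - 1] */
	}
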
  8.2238 +
  8.2239 +#include <dev/ic/mc146818reg.h>		/* for NVRAM POST */
  8.2240 +#include <i386/isa/nvram.h>		/* for NVRAM POST */
  8.2241 +
  8.2242 +void
  8.2243 +cpu_reset()
  8.2244 +{
  8.2245 +
  8.2246 +	disable_intr();
  8.2247 +
  8.2248 +#if 0
  8.2249 +	/*
  8.2250 +	 * Ensure the NVRAM reset byte contains something vaguely sane.
  8.2251 +	 */
  8.2252 +
  8.2253 +	outb(IO_RTC, NVRAM_RESET);
  8.2254 +	outb(IO_RTC+1, NVRAM_RESET_RST);
  8.2255 +
  8.2256 +	/*
  8.2257 +	 * The keyboard controller has 4 random output pins, one of which is
  8.2258 +	 * connected to the RESET pin on the CPU in many PCs.  We tell the
  8.2259 +	 * keyboard controller to pulse this line a couple of times.
  8.2260 +	 */
  8.2261 +	outb(IO_KBD + KBCMDP, KBC_PULSE0);
  8.2262 +	delay(100000);
  8.2263 +	outb(IO_KBD + KBCMDP, KBC_PULSE0);
  8.2264 +	delay(100000);
  8.2265 +#endif
  8.2266 +
  8.2267 +	HYPERVISOR_reboot();
  8.2268 +
  8.2269 +	for (;;);
  8.2270 +}
  8.2271 +
  8.2272 +void
  8.2273 +cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
  8.2274 +{
  8.2275 +	const struct trapframe *tf = l->l_md.md_regs;
  8.2276 +	__greg_t *gr = mcp->__gregs;
  8.2277 +	__greg_t ras_eip;
  8.2278 +
  8.2279 +	/* Save register context. */
  8.2280 +#ifdef VM86
  8.2281 +	if (tf->tf_eflags & PSL_VM) {
  8.2282 +		gr[_REG_GS]  = tf->tf_vm86_gs;
  8.2283 +		gr[_REG_FS]  = tf->tf_vm86_fs;
  8.2284 +		gr[_REG_ES]  = tf->tf_vm86_es;
  8.2285 +		gr[_REG_DS]  = tf->tf_vm86_ds;
  8.2286 +		gr[_REG_EFL] = get_vflags(l);
  8.2287 +	} else
  8.2288 +#endif
  8.2289 +	{
  8.2290 +		gr[_REG_GS]  = tf->tf_gs;
  8.2291 +		gr[_REG_FS]  = tf->tf_fs;
  8.2292 +		gr[_REG_ES]  = tf->tf_es;
  8.2293 +		gr[_REG_DS]  = tf->tf_ds;
  8.2294 +		gr[_REG_EFL] = tf->tf_eflags;
  8.2295 +	}
  8.2296 +	gr[_REG_EDI]    = tf->tf_edi;
  8.2297 +	gr[_REG_ESI]    = tf->tf_esi;
  8.2298 +	gr[_REG_EBP]    = tf->tf_ebp;
  8.2299 +	gr[_REG_EBX]    = tf->tf_ebx;
  8.2300 +	gr[_REG_EDX]    = tf->tf_edx;
  8.2301 +	gr[_REG_ECX]    = tf->tf_ecx;
  8.2302 +	gr[_REG_EAX]    = tf->tf_eax;
  8.2303 +	gr[_REG_EIP]    = tf->tf_eip;
  8.2304 +	gr[_REG_CS]     = tf->tf_cs;
  8.2305 +	gr[_REG_ESP]    = tf->tf_esp;
  8.2306 +	gr[_REG_UESP]   = tf->tf_esp;
  8.2307 +	gr[_REG_SS]     = tf->tf_ss;
  8.2308 +	gr[_REG_TRAPNO] = tf->tf_trapno;
  8.2309 +	gr[_REG_ERR]    = tf->tf_err;
  8.2310 +
  8.2311 +	if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
  8.2312 +	    (caddr_t) gr[_REG_EIP])) != -1)
  8.2313 +		gr[_REG_EIP] = ras_eip;
  8.2314 +
  8.2315 +	*flags |= _UC_CPU;
  8.2316 +
  8.2317 +	/* Save floating point register context, if any. */
  8.2318 +	if ((l->l_md.md_flags & MDL_USEDFPU) != 0) {
  8.2319 +#if NNPX > 0
  8.2320 +		/*
  8.2321 +		 * If this process is the current FP owner, dump its
  8.2322 +		 * context to the PCB first.
  8.2323 +		 * XXX npxsave() also clears the FPU state; depending on the
  8.2324 +		 * XXX application this might be a penalty.
  8.2325 +		 */
  8.2326 +		if (l->l_addr->u_pcb.pcb_fpcpu) {
  8.2327 +			npxsave_lwp(l, 1);
  8.2328 +		}
  8.2329 +#endif
  8.2330 +		if (i386_use_fxsave) {
  8.2331 +			memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
  8.2332 +			    &l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
  8.2333 +			    sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm));
  8.2334 +			*flags |= _UC_FXSAVE;
  8.2335 +		} else {
  8.2336 +			memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
  8.2337 +			    &l->l_addr->u_pcb.pcb_savefpu.sv_87,
  8.2338 +			    sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state));
  8.2339 +		}
  8.2340 +#if 0
  8.2341 +		/* Apparently nothing ever touches this. */
  8.2342 +		ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc;
  8.2343 +#endif
  8.2344 +		*flags |= _UC_FPU;
  8.2345 +	}
  8.2346 +}
  8.2347 +
  8.2348 +int
  8.2349 +cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
  8.2350 +{
  8.2351 +	struct trapframe *tf = l->l_md.md_regs;
  8.2352 +	__greg_t *gr = mcp->__gregs;
  8.2353 +
  8.2354 +	/* Restore register context, if any. */
  8.2355 +	if ((flags & _UC_CPU) != 0) {
  8.2356 +#ifdef VM86
  8.2357 +		if (gr[_REG_EFL] & PSL_VM) {
  8.2358 +			tf->tf_vm86_gs = gr[_REG_GS];
  8.2359 +			tf->tf_vm86_fs = gr[_REG_FS];
  8.2360 +			tf->tf_vm86_es = gr[_REG_ES];
  8.2361 +			tf->tf_vm86_ds = gr[_REG_DS];
  8.2362 +			set_vflags(l, gr[_REG_EFL]);
  8.2363 +			if (flags & _UC_VM) {
  8.2364 +				void syscall_vm86(struct trapframe *);
  8.2365 +				l->l_proc->p_md.md_syscall = syscall_vm86;
  8.2366 +			}
  8.2367 +		} else
  8.2368 +#endif
  8.2369 +		{
  8.2370 +			/*
  8.2371 +			 * Check for security violations.  If we're returning
  8.2372 +			 * to protected mode, the CPU will validate the segment
  8.2373 +			 * registers automatically and generate a trap on
  8.2374 +			 * violations.  We handle the trap, rather than doing
  8.2375 +			 * all of the checking here.
  8.2376 +			 */
  8.2377 +			if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
  8.2378 +			    !USERMODE(gr[_REG_CS], gr[_REG_EFL])) {
  8.2379 +				printf("cpu_setmcontext error: uc EFL: 0x%08x"
  8.2380 +				    " tf EFL: 0x%08x uc CS: 0x%x\n",
  8.2381 +				    gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]);
  8.2382 +				return (EINVAL);
  8.2383 +			}
  8.2384 +			tf->tf_gs = gr[_REG_GS];
  8.2385 +			tf->tf_fs = gr[_REG_FS];
  8.2386 +			tf->tf_es = gr[_REG_ES];
  8.2387 +			tf->tf_ds = gr[_REG_DS];
  8.2388 +			/* Only change the user-alterable part of eflags */
  8.2389 +			tf->tf_eflags &= ~PSL_USER;
  8.2390 +			tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
  8.2391 +		}
  8.2392 +		tf->tf_edi    = gr[_REG_EDI];
  8.2393 +		tf->tf_esi    = gr[_REG_ESI];
  8.2394 +		tf->tf_ebp    = gr[_REG_EBP];
  8.2395 +		tf->tf_ebx    = gr[_REG_EBX];
  8.2396 +		tf->tf_edx    = gr[_REG_EDX];
  8.2397 +		tf->tf_ecx    = gr[_REG_ECX];
  8.2398 +		tf->tf_eax    = gr[_REG_EAX];
  8.2399 +		tf->tf_eip    = gr[_REG_EIP];
  8.2400 +		tf->tf_cs     = gr[_REG_CS];
  8.2401 +		tf->tf_esp    = gr[_REG_UESP];
  8.2402 +		tf->tf_ss     = gr[_REG_SS];
  8.2403 +	}
  8.2404 +
  8.2405 +	/* Restore floating point register context, if any. */
  8.2406 +	if ((flags & _UC_FPU) != 0) {
  8.2407 +#if NNPX > 0
  8.2408 +		/*
  8.2409 +		 * If we were using the FPU, forget that we were.
  8.2410 +		 */
  8.2411 +		if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
  8.2412 +			npxsave_lwp(l, 0);
  8.2413 +#endif
  8.2414 +		if (flags & _UC_FXSAVE) {
  8.2415 +			if (i386_use_fxsave) {
  8.2416 +				memcpy(
  8.2417 +					&l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
  8.2418 +					&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
   8.2419 +					sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_xmm));
  8.2420 +			} else {
  8.2421 +				/* This is a weird corner case */
  8.2422 +				process_xmm_to_s87((struct savexmm *)
  8.2423 +				    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
  8.2424 +				    &l->l_addr->u_pcb.pcb_savefpu.sv_87);
  8.2425 +			}
  8.2426 +		} else {
  8.2427 +			if (i386_use_fxsave) {
  8.2428 +				process_s87_to_xmm((struct save87 *)
  8.2429 +				    &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
  8.2430 +				    &l->l_addr->u_pcb.pcb_savefpu.sv_xmm);
  8.2431 +			} else {
  8.2432 +				memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87,
  8.2433 +				    &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
  8.2434 +				    sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87));
  8.2435 +			}
  8.2436 +		}
  8.2437 +		/* If not set already. */
  8.2438 +		l->l_md.md_flags |= MDL_USEDFPU;
  8.2439 +#if 0
  8.2440 +		/* Apparently unused. */
  8.2441 +		l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts;
  8.2442 +#endif
  8.2443 +	}
  8.2444 +	if (flags & _UC_SETSTACK)
  8.2445 +		l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
  8.2446 +	if (flags & _UC_CLRSTACK)
  8.2447 +		l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK;
  8.2448 +	return (0);
  8.2449 +}
  8.2450 +
  8.2451 +void
  8.2452 +cpu_initclocks()
  8.2453 +{
  8.2454 +	(*initclock_func)();
  8.2455 +}
  8.2456 +
  8.2457 +#ifdef MULTIPROCESSOR
  8.2458 +void
  8.2459 +need_resched(struct cpu_info *ci)
  8.2460 +{
  8.2461 +
  8.2462 +	if (ci->ci_want_resched)
  8.2463 +		return;
  8.2464 +
  8.2465 +	ci->ci_want_resched = 1;
  8.2466 +	if ((ci)->ci_curlwp != NULL)
  8.2467 +		aston((ci)->ci_curlwp->l_proc);
  8.2468 +	else if (ci != curcpu())
  8.2469 +		x86_send_ipi(ci, 0);
  8.2470 +}
  8.2471 +#endif
  8.2472 +
  8.2473 +/*
  8.2474 + * Allocate an IDT vector slot within the given range.
  8.2475 + * XXX needs locking to avoid MP allocation races.
  8.2476 + */
  8.2477 +
  8.2478 +int
  8.2479 +idt_vec_alloc(int low, int high)
  8.2480 +{
  8.2481 +	int vec;
  8.2482 +
  8.2483 +	simple_lock(&idt_lock);
  8.2484 +	for (vec = low; vec <= high; vec++) {
  8.2485 +		if (idt_allocmap[vec] == 0) {
  8.2486 +			idt_allocmap[vec] = 1;
  8.2487 +			simple_unlock(&idt_lock);
  8.2488 +			return vec;
  8.2489 +		}
  8.2490 +	}
  8.2491 +	simple_unlock(&idt_lock);
  8.2492 +	return 0;
  8.2493 +}
  8.2494 +
  8.2495 +void
  8.2496 +idt_vec_set(int vec, void (*function)(void))
  8.2497 +{
  8.2498 +	/*
  8.2499 +	 * Vector should be allocated, so no locking needed.
  8.2500 +	 */
  8.2501 +	KASSERT(idt_allocmap[vec] == 1);
  8.2502 +	setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
  8.2503 +	    GSEL(GCODE_SEL, SEL_KPL));
  8.2504 +}
  8.2505 +
  8.2506 +void
  8.2507 +idt_vec_free(int vec)
  8.2508 +{
  8.2509 +	simple_lock(&idt_lock);
  8.2510 +	unsetgate(&idt[vec]);
  8.2511 +	idt_allocmap[vec] = 0;
  8.2512 +	simple_unlock(&idt_lock);
  8.2513 +}
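
[Editorial sketch, not part of this changeset.] idt_vec_alloc(), idt_vec_set() and idt_vec_free() above manage dynamic IDT vectors through the idt_allocmap[] array under idt_lock: alloc claims the first free slot in [low, high] (returning 0 when none is free), set installs the handler gate, and free clears the slot again. A typical call sequence might look like this (the vector range and my_intr_stub handler are hypothetical):

	int vec;

	/* Pick a free vector in a device-interrupt range. */
	vec = idt_vec_alloc(0x20, 0xef);
	if (vec == 0)
		panic("no free IDT vectors");
	idt_vec_set(vec, my_intr_stub);		/* install interrupt gate */
	/* ... later, when the handler is torn down ... */
	idt_vec_free(vec);
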
  8.2514 +
  8.2515 +/*
   8.2516 + * The number of processes is limited by the number of available GDT slots.
  8.2517 + */
  8.2518 +int
  8.2519 +cpu_maxproc(void)
  8.2520 +{
  8.2521 +#ifdef USER_LDT
  8.2522 +	return ((MAXGDTSIZ - NGDT) / 2);
  8.2523 +#else
  8.2524 +	return (MAXGDTSIZ - NGDT);
  8.2525 +#endif
  8.2526 +}
  8.2527 +
  8.2528 +#if defined(DDB) || defined(KGDB)
  8.2529 +
  8.2530 +/* 
  8.2531 + * Callback to output a backtrace when entering ddb.
  8.2532 + */
  8.2533 +void
  8.2534 +ddb_trap_hook(int where)
  8.2535 +{
  8.2536 +	static int once = 0;
  8.2537 +	db_addr_t db_dot;
  8.2538 +
  8.2539 +	if (once != 0 || where != 1)
  8.2540 +		return;
  8.2541 +	once = 1;
  8.2542 +
  8.2543 +	if (curlwp != NULL) {
  8.2544 +		db_printf("Stopped");
  8.2545 +		if (curproc == NULL)
  8.2546 +			db_printf("; curlwp = %p,"
  8.2547 +			    " curproc is NULL at\t", curlwp);
  8.2548 +		else
  8.2549 +			db_printf(" in pid %d.%d (%s) at\t", 
  8.2550 +			    curproc->p_pid, curlwp->l_lid,
  8.2551 +			    curproc->p_comm);
  8.2552 +	} else
  8.2553 +		db_printf("Stopped at\t");
  8.2554 +	db_dot = PC_REGS(DDB_REGS);
  8.2555 +	db_print_loc_and_inst(db_dot);
  8.2556 +
  8.2557 +	db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535,
  8.2558 +	    "", db_printf);
  8.2559 +#ifdef DEBUG
  8.2560 +	db_show_regs((db_expr_t) db_dot, FALSE, 65535, "");
  8.2561 +#endif
  8.2562 +}
  8.2563 +
  8.2564 +#endif /* DDB || KGDB */
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c	Mon Sep 06 19:04:16 2004 +0000
     9.3 @@ -0,0 +1,4522 @@
     9.4 +/*	$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $	*/
     9.5 +/*	NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp 	*/
     9.6 +
     9.7 +/*
     9.8 + *
     9.9 + * Copyright (c) 1997 Charles D. Cranor and Washington University.
    9.10 + * All rights reserved.
    9.11 + *
    9.12 + * Redistribution and use in source and binary forms, with or without
    9.13 + * modification, are permitted provided that the following conditions
    9.14 + * are met:
    9.15 + * 1. Redistributions of source code must retain the above copyright
    9.16 + *    notice, this list of conditions and the following disclaimer.
    9.17 + * 2. Redistributions in binary form must reproduce the above copyright
    9.18 + *    notice, this list of conditions and the following disclaimer in the
    9.19 + *    documentation and/or other materials provided with the distribution.
    9.20 + * 3. All advertising materials mentioning features or use of this software
    9.21 + *    must display the following acknowledgement:
    9.22 + *      This product includes software developed by Charles D. Cranor and
    9.23 + *      Washington University.
    9.24 + * 4. The name of the author may not be used to endorse or promote products
    9.25 + *    derived from this software without specific prior written permission.
    9.26 + *
    9.27 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
    9.28 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
    9.29 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
    9.30 + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
    9.31 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
    9.32 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    9.33 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    9.34 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    9.35 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
    9.36 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    9.37 + */
    9.38 +
    9.39 +/*
    9.40 + * pmap.c: i386 pmap module rewrite
    9.41 + * Chuck Cranor <chuck@ccrc.wustl.edu>
    9.42 + * 11-Aug-97
    9.43 + *
    9.44 + * history of this pmap module: in addition to my own input, i used
    9.45 + *    the following references for this rewrite of the i386 pmap:
    9.46 + *
    9.47 + * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
    9.48 + *     BSD hp300 pmap done by Mike Hibler at University of Utah.
    9.49 + *     it was then ported to the i386 by William Jolitz of UUNET
    9.50 + *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
    9.51 + *     project fixed some bugs and provided some speed ups.
    9.52 + *
    9.53 + * [2] the FreeBSD i386 pmap.   this pmap seems to be the
    9.54 + *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
    9.55 + *     and David Greenman.
    9.56 + *
    9.57 + * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
    9.58 + *     between several processors.   the VAX version was done by
    9.59 + *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
    9.60 + *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
    9.61 + *     David Golub, and Richard Draves.    the alpha version was
    9.62 + *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
    9.63 + *     (NetBSD/alpha).
    9.64 + */
    9.65 +
    9.66 +#include <sys/cdefs.h>
    9.67 +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $");
    9.68 +
    9.69 +#include "opt_cputype.h"
    9.70 +#include "opt_user_ldt.h"
    9.71 +#include "opt_largepages.h"
    9.72 +#include "opt_lockdebug.h"
    9.73 +#include "opt_multiprocessor.h"
    9.74 +#include "opt_kstack_dr0.h"
    9.75 +#include "opt_xen.h"
    9.76 +
    9.77 +#include <sys/param.h>
    9.78 +#include <sys/systm.h>
    9.79 +#include <sys/proc.h>
    9.80 +#include <sys/malloc.h>
    9.81 +#include <sys/pool.h>
    9.82 +#include <sys/user.h>
    9.83 +#include <sys/kernel.h>
    9.84 +
    9.85 +#include <uvm/uvm.h>
    9.86 +
    9.87 +#include <machine/atomic.h>
    9.88 +#include <machine/cpu.h>
    9.89 +#include <machine/specialreg.h>
    9.90 +#include <machine/gdt.h>
    9.91 +
    9.92 +#include <dev/isa/isareg.h>
    9.93 +#include <machine/isa_machdep.h>
    9.94 +
    9.95 +#include <machine/xen.h>
    9.96 +#include <machine/hypervisor.h>
    9.97 +#include <machine/xenpmap.h>
    9.98 +
    9.99 +void xpmap_find_pte(paddr_t);
   9.100 +
   9.101 +/* #define XENDEBUG */
   9.102 +
   9.103 +#ifdef XENDEBUG
   9.104 +#define	XENPRINTF(x) printf x
   9.105 +#define	XENPRINTK(x) printf x
   9.106 +#else
   9.107 +#define	XENPRINTF(x)
   9.108 +#define	XENPRINTK(x)
   9.109 +#endif
   9.110 +#define	PRINTF(x) printf x
   9.111 +#define	PRINTK(x) printf x
   9.112 +
   9.113 +
   9.114 +/*
   9.115 + * general info:
   9.116 + *
   9.117 + *  - for an explanation of how the i386 MMU hardware works see
   9.118 + *    the comments in <machine/pte.h>.
   9.119 + *
   9.120 + *  - for an explanation of the general memory structure used by
   9.121 + *    this pmap (including the recursive mapping), see the comments
   9.122 + *    in <machine/pmap.h>.
   9.123 + *
   9.124 + * this file contains the code for the "pmap module."   the module's
   9.125 + * job is to manage the hardware's virtual to physical address mappings.
   9.126 + * note that there are two levels of mapping in the VM system:
   9.127 + *
   9.128 + *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
   9.129 + *      to map ranges of virtual address space to objects/files.  for
   9.130 + *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
   9.131 + *      to the file /bin/ls starting at offset zero."   note that
   9.132 + *      the upper layer mapping is not concerned with how individual
   9.133 + *      vm_pages are mapped.
   9.134 + *
   9.135 + *  [2] the lower layer of the VM system (the pmap) maintains the mappings
   9.136 + *      from virtual addresses.   it is concerned with which vm_page is
   9.137 + *      mapped where.   for example, when you run /bin/ls and start
   9.138 + *      at page 0x1000 the fault routine may lookup the correct page
   9.139 + *      of the /bin/ls file and then ask the pmap layer to establish
   9.140 + *      a mapping for it.
   9.141 + *
   9.142 + * note that information in the lower layer of the VM system can be
   9.143 + * thrown away since it can easily be reconstructed from the info
   9.144 + * in the upper layer.
   9.145 + *
   9.146 + * data structures we use include:
   9.147 + *
   9.148 + *  - struct pmap: describes the address space of one thread
   9.149 + *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
   9.150 + *  - struct pv_head: there is one pv_head per managed page of
   9.151 + *	physical memory.   the pv_head points to a list of pv_entry
   9.152 + *	structures which describe all the <PMAP,VA> pairs that this
   9.153 + *      page is mapped in.    this is critical for page based operations
   9.154 + *      such as pmap_page_protect() [change protection on _all_ mappings
   9.155 + *      of a page]
   9.156 + *  - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's.
   9.157 + *      if we run out of pv_entry's we allocate a new pv_page and free
   9.158 + *      its pv_entrys.
   9.159 + * - pmap_remove_record: a list of virtual addresses whose mappings
   9.160 + *	have been changed.   used for TLB flushing.
   9.161 + */
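
/*
 * A purely illustrative sketch of the pv_head/pv_entry relationship
 * described above.  The "example_" struct names and the "ph_" fields are
 * placeholders, not the real definitions: the real structures live in
 * <machine/pmap.h> and chain the pv_entrys in a splay tree keyed by
 * <pmap,va> (see pv_compare/SPLAY_GENERATE below) rather than a plain list.
 */
struct example_pv_entry {			/* one <PMAP,VA> mapping of a PA */
	struct pmap *pv_pmap;			/* pmap owning this mapping */
	vaddr_t pv_va;				/* VA the page is mapped at */
	struct vm_page *pv_ptp;			/* PTP holding the PTE (NULL for kernel) */
	struct example_pv_entry *pv_next;	/* next mapping of the same PA */
};

struct example_pv_head {			/* one per managed physical page */
	struct simplelock ph_lock;		/* the per-page "pvh_lock" */
	struct example_pv_entry *ph_list;	/* every <PMAP,VA> this PA is mapped at */
};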
   9.162 +
   9.163 +/*
   9.164 + * memory allocation
   9.165 + *
   9.166 + *  - there are three data structures that we must dynamically allocate:
   9.167 + *
   9.168 + * [A] new process' page directory page (PDP)
   9.169 + *	- plan 1: done at pmap_create() we use
   9.170 + *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
   9.171 + *	  allocation.
   9.172 + *
   9.173 + * if we are low in free physical memory then we sleep in
   9.174 + * uvm_km_alloc -- in this case this is ok since we are creating
   9.175 + * a new pmap and should not be holding any locks.
   9.176 + *
   9.177 + * if the kernel is totally out of virtual space
   9.178 + * (i.e. uvm_km_alloc returns NULL), then we panic.
   9.179 + *
   9.180 + * XXX: the fork code currently has no way to return an "out of
   9.181 + * memory, try again" error code since uvm_fork [fka vm_fork]
   9.182 + * is a void function.
   9.183 + *
   9.184 + * [B] new page tables pages (PTP)
   9.185 + * 	- call uvm_pagealloc()
   9.186 + * 		=> success: zero page, add to pm_pdir
   9.187 + * 		=> failure: we are out of free vm_pages, let pmap_enter()
   9.188 + *		   tell UVM about it.
   9.189 + *
   9.190 + * note: for kernel PTPs, we start with NKPTP of them.   as we map
   9.191 + * kernel memory (at uvm_map time) we check to see if we've grown
   9.192 + * the kernel pmap.   if so, we call the optional function
   9.193 + * pmap_growkernel() to grow the kernel PTPs in advance.
   9.194 + *
   9.195 + * [C] pv_entry structures
   9.196 + *	- plan 1: try to allocate one off the free list
   9.197 + *		=> success: done!
   9.198 + *		=> failure: no more free pv_entrys on the list
   9.199 + *	- plan 2: try to allocate a new pv_page to add a chunk of
   9.200 + *	pv_entrys to the free list
   9.201 + *		[a] obtain a free, unmapped, VA in kmem_map.  either
   9.202 + *		we have one saved from a previous call, or we allocate
   9.203 + *		one now using a "vm_map_lock_try" in uvm_map
   9.204 + *		=> success: we have an unmapped VA, continue to [b]
   9.205 + *		=> failure: unable to lock kmem_map or out of VA in it.
   9.206 + *			move on to plan 3.
   9.207 + *		[b] allocate a page in kmem_object for the VA
   9.208 + *		=> success: map it in, free the pv_entry's, DONE!
   9.209 + *		=> failure: kmem_object locked, no free vm_pages, etc.
   9.210 + *			save VA for later call to [a], go to plan 3.
   9.211 + *	If we fail, we simply let pmap_enter() tell UVM about it.
   9.212 + */
   9.213 +
   9.214 +/*
   9.215 + * locking
   9.216 + *
   9.217 + * we have the following locks that we must contend with:
   9.218 + *
   9.219 + * "normal" locks:
   9.220 + *
   9.221 + *  - pmap_main_lock
   9.222 + *    this lock is used to prevent deadlock and/or provide mutex
   9.223 + *    access to the pmap system.   most operations lock the pmap
   9.224 + *    structure first, then they lock the pv_lists (if needed).
   9.225 + *    however, some operations such as pmap_page_protect lock
   9.226 + *    the pv_lists and then lock pmaps.   in order to prevent a
   9.227 + *    cycle, we require a mutex lock when locking the pv_lists
    9.228 + *    first.   thus, the "pmap => pv_list" lockers must gain a
   9.229 + *    read-lock on pmap_main_lock before locking the pmap.   and
   9.230 + *    the "pv_list => pmap" lockers must gain a write-lock on
   9.231 + *    pmap_main_lock before locking.    since only one thread
   9.232 + *    can write-lock a lock at a time, this provides mutex.
   9.233 + *
   9.234 + * "simple" locks:
   9.235 + *
   9.236 + * - pmap lock (per pmap, part of uvm_object)
   9.237 + *   this lock protects the fields in the pmap structure including
   9.238 + *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
   9.239 + *   in the alternate PTE space (since that is determined by the
   9.240 + *   entry in the PDP).
   9.241 + *
   9.242 + * - pvh_lock (per pv_head)
   9.243 + *   this lock protects the pv_entry list which is chained off the
   9.244 + *   pv_head structure for a specific managed PA.   it is locked
   9.245 + *   when traversing the list (e.g. adding/removing mappings,
   9.246 + *   syncing R/M bits, etc.)
   9.247 + *
   9.248 + * - pvalloc_lock
   9.249 + *   this lock protects the data structures which are used to manage
   9.250 + *   the free list of pv_entry structures.
   9.251 + *
   9.252 + * - pmaps_lock
   9.253 + *   this lock protects the list of active pmaps (headed by "pmaps").
   9.254 + *   we lock it when adding or removing pmaps from this list.
   9.255 + *
   9.256 + */
   9.257 +
   9.258 +/*
   9.259 + * locking data structures
   9.260 + */
   9.261 +
   9.262 +static struct simplelock pvalloc_lock;
   9.263 +static struct simplelock pmaps_lock;
   9.264 +
   9.265 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
   9.266 +static struct lock pmap_main_lock;
   9.267 +
   9.268 +#define PMAP_MAP_TO_HEAD_LOCK() \
   9.269 +     (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL)
   9.270 +#define PMAP_MAP_TO_HEAD_UNLOCK() \
   9.271 +     (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL)
   9.272 +
   9.273 +#define PMAP_HEAD_TO_MAP_LOCK() \
   9.274 +     (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL)
   9.275 +#define PMAP_HEAD_TO_MAP_UNLOCK() \
   9.276 +     spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0)
   9.277 +
   9.278 +#else
   9.279 +
   9.280 +#define PMAP_MAP_TO_HEAD_LOCK()		/* null */
   9.281 +#define PMAP_MAP_TO_HEAD_UNLOCK()	/* null */
   9.282 +
   9.283 +#define PMAP_HEAD_TO_MAP_LOCK()		/* null */
   9.284 +#define PMAP_HEAD_TO_MAP_UNLOCK()	/* null */
   9.285 +
   9.286 +#endif
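
/*
 * A minimal sketch of the two lock orderings described in the locking
 * notes above, using the PMAP_*_LOCK macros just defined.  The two
 * functions and the "pvh" argument are illustrative placeholders only;
 * they are not part of this pmap.
 */
static __inline void
example_pmap_then_pvlist(struct pmap *pmap, struct pv_head *pvh)
{

	PMAP_MAP_TO_HEAD_LOCK();		/* "pmap => pv_list": shared lock */
	simple_lock(&pmap->pm_obj.vmobjlock);	/* pmap first */
	simple_lock(&pvh->pvh_lock);		/* then the page's pv list */
	/* ... add/remove a mapping ... */
	simple_unlock(&pvh->pvh_lock);
	simple_unlock(&pmap->pm_obj.vmobjlock);
	PMAP_MAP_TO_HEAD_UNLOCK();
}

static __inline void
example_pvlist_then_pmap(struct pmap *pmap, struct pv_head *pvh)
{

	PMAP_HEAD_TO_MAP_LOCK();		/* "pv_list => pmap": exclusive lock */
	simple_lock(&pvh->pvh_lock);		/* pv list first */
	simple_lock(&pmap->pm_obj.vmobjlock);	/* then each pmap it names */
	/* ... pmap_page_protect-style work ... */
	simple_unlock(&pmap->pm_obj.vmobjlock);
	simple_unlock(&pvh->pvh_lock);
	PMAP_HEAD_TO_MAP_UNLOCK();
}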
   9.287 +
   9.288 +#define COUNT(x)	/* nothing */
   9.289 +
   9.290 +/*
   9.291 + * TLB Shootdown:
   9.292 + *
   9.293 + * When a mapping is changed in a pmap, the TLB entry corresponding to
   9.294 + * the virtual address must be invalidated on all processors.  In order
   9.295 + * to accomplish this on systems with multiple processors, messages are
   9.296 + * sent from the processor which performs the mapping change to all
   9.297 + * processors on which the pmap is active.  For other processors, the
    9.298 + * ASN generation number for that processor is invalidated, so that
   9.299 + * the next time the pmap is activated on that processor, a new ASN
   9.300 + * will be allocated (which implicitly invalidates all TLB entries).
   9.301 + *
   9.302 + * Shootdown job queue entries are allocated using a simple special-
   9.303 + * purpose allocator for speed.
   9.304 + */
   9.305 +struct pmap_tlb_shootdown_job {
   9.306 +	TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
   9.307 +	vaddr_t pj_va;			/* virtual address */
   9.308 +	pmap_t pj_pmap;			/* the pmap which maps the address */
   9.309 +	pt_entry_t pj_pte;		/* the PTE bits */
   9.310 +	struct pmap_tlb_shootdown_job *pj_nextfree;
   9.311 +};
   9.312 +
   9.313 +#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32
   9.314 +union pmap_tlb_shootdown_job_al {
   9.315 +	struct pmap_tlb_shootdown_job pja_job;
   9.316 +	char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN];
   9.317 +};
   9.318 +
   9.319 +struct pmap_tlb_shootdown_q {
   9.320 +	TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;
   9.321 +	int pq_pte;			/* aggregate PTE bits */
   9.322 +	int pq_count;			/* number of pending requests */
   9.323 +	__cpu_simple_lock_t pq_slock;	/* spin lock on queue */
   9.324 +	int pq_flushg;		/* pending flush global */
   9.325 +	int pq_flushu;		/* pending flush user */
   9.326 +} pmap_tlb_shootdown_q[X86_MAXPROCS];
   9.327 +
   9.328 +#define	PMAP_TLB_MAXJOBS	16
   9.329 +
   9.330 +void	pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *);
   9.331 +struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
   9.332 +	   (struct pmap_tlb_shootdown_q *);
   9.333 +void	pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *,
   9.334 +	    struct pmap_tlb_shootdown_job *);
   9.335 +
   9.336 +__cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
   9.337 +union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
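
/*
 * A minimal sketch of how the queue above is fed; illustrative only.
 * The real producer is pmap_tlb_shootdown() later in this file, and the
 * helper name "example_queue_shootdown" is a placeholder.
 */
static void
example_queue_shootdown(struct cpu_info *ci, struct pmap *pmap, vaddr_t va,
    pt_entry_t pte)
{
	struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
	struct pmap_tlb_shootdown_job *pj;
	int s;

	s = splipi();				/* block IPIs while we queue */
	__cpu_simple_lock(&pq->pq_slock);
	pj = pmap_tlb_shootdown_job_get(pq);	/* NULL if the queue is full */
	if (pj == NULL) {
		pq->pq_flushu++;		/* overflow: fall back to a full flush */
	} else {
		pj->pj_pmap = pmap;
		pj->pj_va = va;
		pj->pj_pte = pte;
		TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
		pq->pq_pte |= pte;		/* aggregate the PTE bits */
	}
	__cpu_simple_unlock(&pq->pq_slock);
	splx(s);
	x86_send_ipi(ci, X86_IPI_TLB);		/* tell the target CPU to drain */
}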
   9.338 +
   9.339 +/*
   9.340 + * global data structures
   9.341 + */
   9.342 +
   9.343 +struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
   9.344 +
   9.345 +/*
   9.346 + * nkpde is the number of kernel PTPs allocated for the kernel at
   9.347 + * boot time (NKPTP is a compile time override).   this number can
   9.348 + * grow dynamically as needed (but once allocated, we never free
   9.349 + * kernel PTPs).
   9.350 + */
   9.351 +
   9.352 +int nkpde = NKPTP;
   9.353 +#ifdef NKPDE
   9.354 +#error "obsolete NKPDE: use NKPTP"
   9.355 +#endif
   9.356 +
   9.357 +/*
   9.358 + * pmap_pg_g: if our processor supports PG_G in the PTE then we
   9.359 + * set pmap_pg_g to PG_G (otherwise it is zero).
   9.360 + */
   9.361 +
   9.362 +int pmap_pg_g = 0;
   9.363 +
   9.364 +#ifdef LARGEPAGES
   9.365 +/*
   9.366 + * pmap_largepages: if our processor supports PG_PS and we are
   9.367 + * using it, this is set to TRUE.
   9.368 + */
   9.369 +
   9.370 +int pmap_largepages;
   9.371 +#endif
   9.372 +
   9.373 +/*
   9.374 + * i386 physical memory comes in a big contig chunk with a small
   9.375 + * hole toward the front of it...  the following two paddr_t's
   9.376 + * (shared with machdep.c) describe the physical address space
   9.377 + * of this machine.
   9.378 + */
   9.379 +paddr_t avail_start;	/* PA of first available physical page */
   9.380 +paddr_t avail_end;	/* PA of last available physical page */
   9.381 +
   9.382 +paddr_t pmap_pa_start;	/* PA of first physical page for this domain */
   9.383 +paddr_t pmap_pa_end;	/* PA of last physical page for this domain */
   9.384 +
   9.385 +	/* MA of last physical page of the machine */
   9.386 +paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */
   9.387 +
   9.388 +/*
   9.389 + * other data structures
   9.390 + */
   9.391 +
   9.392 +static pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
   9.393 +static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
   9.394 +
   9.395 +/*
   9.396 + * the following two vaddr_t's are used during system startup
   9.397 + * to keep track of how much of the kernel's VM space we have used.
   9.398 + * once the system is started, the management of the remaining kernel
   9.399 + * VM space is turned over to the kernel_map vm_map.
   9.400 + */
   9.401 +
   9.402 +static vaddr_t virtual_avail;	/* VA of first free KVA */
   9.403 +static vaddr_t virtual_end;	/* VA of last free KVA */
   9.404 +
   9.405 +
   9.406 +/*
   9.407 + * pv_page management structures: locked by pvalloc_lock
   9.408 + */
   9.409 +
   9.410 +TAILQ_HEAD(pv_pagelist, pv_page);
   9.411 +static struct pv_pagelist pv_freepages;	/* list of pv_pages with free entrys */
   9.412 +static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */
   9.413 +static int pv_nfpvents;			/* # of free pv entries */
   9.414 +static struct pv_page *pv_initpage;	/* bootstrap page from kernel_map */
   9.415 +static vaddr_t pv_cachedva;		/* cached VA for later use */
   9.416 +
   9.417 +#define PVE_LOWAT (PVE_PER_PVPAGE / 2)	/* free pv_entry low water mark */
   9.418 +#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
   9.419 +					/* high water mark */
   9.420 +
   9.421 +static __inline int
   9.422 +pv_compare(struct pv_entry *a, struct pv_entry *b)
   9.423 +{
   9.424 +	if (a->pv_pmap < b->pv_pmap)
   9.425 +		return (-1);
   9.426 +	else if (a->pv_pmap > b->pv_pmap)
   9.427 +		return (1);
   9.428 +	else if (a->pv_va < b->pv_va)
   9.429 +		return (-1);
   9.430 +	else if (a->pv_va > b->pv_va)
   9.431 +		return (1);
   9.432 +	else
   9.433 +		return (0);
   9.434 +}
   9.435 +
   9.436 +SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
   9.437 +SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
   9.438 +
   9.439 +/*
   9.440 + * linked list of all non-kernel pmaps
   9.441 + */
   9.442 +
   9.443 +static struct pmap_head pmaps;
   9.444 +
   9.445 +/*
   9.446 + * pool that pmap structures are allocated from
   9.447 + */
   9.448 +
   9.449 +struct pool pmap_pmap_pool;
   9.450 +
   9.451 +/*
   9.452 + * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
   9.453 + * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing
   9.454 + * due to false sharing.
   9.455 + */
   9.456 +
   9.457 +#ifdef MULTIPROCESSOR
   9.458 +#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
   9.459 +#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
   9.460 +#else
   9.461 +#define PTESLEW(pte, id) (pte)
   9.462 +#define VASLEW(va,id) (va)
   9.463 +#endif
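
/*
 * Worked example of the slewing above: in the MULTIPROCESSOR case, with
 * NPTECL PTEs (one cache line's worth) reserved per CPU, CPU 2 uses
 * PTESLEW(csrc_pte, 2) == csrc_pte + 2*NPTECL and the matching VA
 * VASLEW(csrcp, 2) == csrcp + 2*NPTECL*PAGE_SIZE, so no two CPUs ever
 * write PTEs that share a cache line.
 */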
   9.464 +
   9.465 +/*
   9.466 + * special VAs and the PTEs that map them
   9.467 + */
   9.468 +static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte;
   9.469 +static caddr_t csrcp, cdstp, zerop, ptpp;
   9.470 +
   9.471 +/*
   9.472 + * pool and cache that PDPs are allocated from
   9.473 + */
   9.474 +
   9.475 +struct pool pmap_pdp_pool;
   9.476 +struct pool_cache pmap_pdp_cache;
   9.477 +u_int pmap_pdp_cache_generation;
   9.478 +
   9.479 +int	pmap_pdp_ctor(void *, void *, int);
   9.480 +void	pmap_pdp_dtor(void *, void *);
   9.481 +
   9.482 +caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
   9.483 +
   9.484 +extern vaddr_t msgbuf_vaddr;
   9.485 +extern paddr_t msgbuf_paddr;
   9.486 +
   9.487 +extern vaddr_t idt_vaddr;			/* we allocate IDT early */
   9.488 +extern paddr_t idt_paddr;
   9.489 +
   9.490 +#if defined(I586_CPU)
   9.491 +/* stuff to fix the pentium f00f bug */
   9.492 +extern vaddr_t pentium_idt_vaddr;
   9.493 +#endif
   9.494 +
   9.495 +
   9.496 +/*
   9.497 + * local prototypes
   9.498 + */
   9.499 +
   9.500 +static struct pv_entry	*pmap_add_pvpage(struct pv_page *, boolean_t);
   9.501 +static struct vm_page	*pmap_alloc_ptp(struct pmap *, int);
   9.502 +static struct pv_entry	*pmap_alloc_pv(struct pmap *, int); /* see codes below */
   9.503 +#define ALLOCPV_NEED	0	/* need PV now */
   9.504 +#define ALLOCPV_TRY	1	/* just try to allocate, don't steal */
   9.505 +#define ALLOCPV_NONEED	2	/* don't need PV, just growing cache */
   9.506 +static struct pv_entry	*pmap_alloc_pvpage(struct pmap *, int);
   9.507 +static void		 pmap_enter_pv(struct pv_head *,
   9.508 +				       struct pv_entry *, struct pmap *,
   9.509 +				       vaddr_t, struct vm_page *);
   9.510 +static void		 pmap_free_pv(struct pmap *, struct pv_entry *);
   9.511 +static void		 pmap_free_pvs(struct pmap *, struct pv_entry *);
   9.512 +static void		 pmap_free_pv_doit(struct pv_entry *);
   9.513 +static void		 pmap_free_pvpage(void);
   9.514 +static struct vm_page	*pmap_get_ptp(struct pmap *, int);
   9.515 +static boolean_t	 pmap_is_curpmap(struct pmap *);
   9.516 +static boolean_t	 pmap_is_active(struct pmap *, int);
   9.517 +static pt_entry_t	*pmap_map_ptes(struct pmap *);
   9.518 +static struct pv_entry	*pmap_remove_pv(struct pv_head *, struct pmap *,
   9.519 +					vaddr_t);
   9.520 +static void		 pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
   9.521 +static boolean_t	 pmap_remove_pte(struct pmap *, struct vm_page *,
   9.522 +					 pt_entry_t *, vaddr_t, int32_t *, int);
   9.523 +static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
   9.524 +					  vaddr_t, vaddr_t, vaddr_t, int32_t *,
   9.525 +					  int);
   9.526 +#define PMAP_REMOVE_ALL		0	/* remove all mappings */
   9.527 +#define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
   9.528 +
   9.529 +static vaddr_t		 pmap_tmpmap_pa(paddr_t);
   9.530 +static pt_entry_t	*pmap_tmpmap_pvepte(struct pv_entry *);
   9.531 +static void		 pmap_tmpunmap_pa(void);
   9.532 +static void		 pmap_tmpunmap_pvepte(struct pv_entry *);
   9.533 +static void		 pmap_unmap_ptes(struct pmap *);
   9.534 +
   9.535 +static boolean_t	 pmap_reactivate(struct pmap *);
   9.536 +
   9.537 +#ifdef DEBUG
   9.538 +u_int	curapdp;
   9.539 +#endif
   9.540 +
   9.541 +/*
   9.542 + * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
   9.543 + */
   9.544 +
   9.545 +/*
   9.546 + * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
   9.547 + *		of course the kernel is always loaded
   9.548 + */
   9.549 +
   9.550 +__inline static boolean_t
   9.551 +pmap_is_curpmap(pmap)
   9.552 +	struct pmap *pmap;
   9.553 +{
   9.554 +
   9.555 +	return((pmap == pmap_kernel()) ||
   9.556 +	       (pmap == curcpu()->ci_pmap));
   9.557 +}
   9.558 +
   9.559 +/*
   9.560 + * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
   9.561 + */
   9.562 +
   9.563 +__inline static boolean_t
   9.564 +pmap_is_active(pmap, cpu_id)
   9.565 +	struct pmap *pmap;
   9.566 +	int cpu_id;
   9.567 +{
   9.568 +
   9.569 +	return (pmap == pmap_kernel() ||
   9.570 +	    (pmap->pm_cpus & (1U << cpu_id)) != 0);
   9.571 +}
   9.572 +
   9.573 +/*
   9.574 + * pmap_tmpmap_pa: map a page in for tmp usage
   9.575 + */
   9.576 +
   9.577 +__inline static vaddr_t
   9.578 +pmap_tmpmap_pa(pa)
   9.579 +	paddr_t pa;
   9.580 +{
   9.581 +#ifdef MULTIPROCESSOR
   9.582 +	int id = cpu_number();
   9.583 +#endif
   9.584 +	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
   9.585 +	pt_entry_t *maptp;
   9.586 +	caddr_t ptpva = VASLEW(ptpp, id);
   9.587 +#if defined(DIAGNOSTIC)
   9.588 +	if (*ptpte)
   9.589 +		panic("pmap_tmpmap_pa: ptp_pte in use?");
   9.590 +#endif
   9.591 +	maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
   9.592 +	PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */
   9.593 +	return((vaddr_t)ptpva);
   9.594 +}
   9.595 +
   9.596 +/*
   9.597 + * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
   9.598 + */
   9.599 +
   9.600 +__inline static void
   9.601 +pmap_tmpunmap_pa()
   9.602 +{
   9.603 +#ifdef MULTIPROCESSOR
   9.604 +	int id = cpu_number();
   9.605 +#endif
   9.606 +	pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
   9.607 +	pt_entry_t *maptp;
   9.608 +	caddr_t ptpva = VASLEW(ptpp, id);
   9.609 +#if defined(DIAGNOSTIC)
   9.610 +	if (!pmap_valid_entry(*ptp_pte))
   9.611 +		panic("pmap_tmpunmap_pa: our pte invalid?");
   9.612 +#endif
   9.613 +	maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
   9.614 +	PTE_CLEAR(ptpte, maptp);		/* zap! */
   9.615 +	pmap_update_pg((vaddr_t)ptpva);
   9.616 +#ifdef MULTIPROCESSOR
   9.617 +	/*
   9.618 +	 * No need for tlb shootdown here, since ptp_pte is per-CPU.
   9.619 +	 */
   9.620 +#endif
   9.621 +}
   9.622 +
   9.623 +/*
   9.624 + * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry
   9.625 + *
   9.626 + * => do NOT use this on kernel mappings [why?  because pv_ptp may be NULL]
   9.627 + */
   9.628 +
   9.629 +__inline static pt_entry_t *
   9.630 +pmap_tmpmap_pvepte(pve)
   9.631 +	struct pv_entry *pve;
   9.632 +{
   9.633 +#ifdef DIAGNOSTIC
   9.634 +	if (pve->pv_pmap == pmap_kernel())
   9.635 +		panic("pmap_tmpmap_pvepte: attempt to map kernel");
   9.636 +#endif
   9.637 +
   9.638 +	/* is it current pmap?  use direct mapping... */
   9.639 +	if (pmap_is_curpmap(pve->pv_pmap))
   9.640 +		return(vtopte(pve->pv_va));
   9.641 +
   9.642 +	return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp)))
   9.643 +	       + ptei((unsigned)pve->pv_va));
   9.644 +}
   9.645 +
   9.646 +/*
   9.647 + * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte
   9.648 + */
   9.649 +
   9.650 +__inline static void
   9.651 +pmap_tmpunmap_pvepte(pve)
   9.652 +	struct pv_entry *pve;
   9.653 +{
   9.654 +	/* was it current pmap?   if so, return */
   9.655 +	if (pmap_is_curpmap(pve->pv_pmap))
   9.656 +		return;
   9.657 +
   9.658 +	pmap_tmpunmap_pa();
   9.659 +}
   9.660 +
   9.661 +__inline static void
   9.662 +pmap_apte_flush(struct pmap *pmap)
   9.663 +{
   9.664 +#if defined(MULTIPROCESSOR)
   9.665 +	struct pmap_tlb_shootdown_q *pq;
   9.666 +	struct cpu_info *ci, *self = curcpu();
   9.667 +	CPU_INFO_ITERATOR cii;
   9.668 +	int s;
   9.669 +#endif
   9.670 +
   9.671 +	tlbflush();		/* flush TLB on current processor */
   9.672 +#if defined(MULTIPROCESSOR)
   9.673 +	/*
   9.674 +	 * Flush the APTE mapping from all other CPUs that
    9.675 + * are using the pmap we are using (whose APTE space
   9.676 +	 * is the one we've just modified).
   9.677 +	 *
   9.678 +	 * XXXthorpej -- find a way to defer the IPI.
   9.679 +	 */
   9.680 +	for (CPU_INFO_FOREACH(cii, ci)) {
   9.681 +		if (ci == self)
   9.682 +			continue;
   9.683 +		if (pmap_is_active(pmap, ci->ci_cpuid)) {
   9.684 +			pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
   9.685 +			s = splipi();
   9.686 +			__cpu_simple_lock(&pq->pq_slock);
   9.687 +			pq->pq_flushu++;
   9.688 +			__cpu_simple_unlock(&pq->pq_slock);
   9.689 +			splx(s);
   9.690 +			x86_send_ipi(ci, X86_IPI_TLB);
   9.691 +		}
   9.692 +	}
   9.693 +#endif
   9.694 +}
   9.695 +
   9.696 +/*
   9.697 + * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
   9.698 + *
   9.699 + * => we lock enough pmaps to keep things locked in
   9.700 + * => must be undone with pmap_unmap_ptes before returning
   9.701 + */
   9.702 +
   9.703 +__inline static pt_entry_t *
   9.704 +pmap_map_ptes(pmap)
   9.705 +	struct pmap *pmap;
   9.706 +{
   9.707 +	pd_entry_t opde;
   9.708 +	pd_entry_t *mapdp;
   9.709 +	struct pmap *ourpmap;
   9.710 +	struct cpu_info *ci;
   9.711 +
   9.712 +	/* the kernel's pmap is always accessible */
   9.713 +	if (pmap == pmap_kernel()) {
   9.714 +		return(PTE_BASE);
   9.715 +	}
   9.716 +
   9.717 +	ci = curcpu();
   9.718 +	if (ci->ci_want_pmapload &&
   9.719 +	    vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap)
   9.720 +		pmap_load();
   9.721 +
   9.722 +	/* if curpmap then we are always mapped */
   9.723 +	if (pmap_is_curpmap(pmap)) {
   9.724 +		simple_lock(&pmap->pm_obj.vmobjlock);
   9.725 +		return(PTE_BASE);
   9.726 +	}
   9.727 +
   9.728 +	ourpmap = ci->ci_pmap;
   9.729 +
   9.730 +	/* need to lock both curpmap and pmap: use ordered locking */
   9.731 +	if ((unsigned) pmap < (unsigned) ourpmap) {
   9.732 +		simple_lock(&pmap->pm_obj.vmobjlock);
   9.733 +		simple_lock(&ourpmap->pm_obj.vmobjlock);
   9.734 +	} else {
   9.735 +		simple_lock(&ourpmap->pm_obj.vmobjlock);
   9.736 +		simple_lock(&pmap->pm_obj.vmobjlock);
   9.737 +	}
   9.738 +
   9.739 +	/* need to load a new alternate pt space into curpmap? */
   9.740 +	COUNT(apdp_pde_map);
   9.741 +	opde = PDE_GET(APDP_PDE);
   9.742 +	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
   9.743 +		XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n",
   9.744 +			   pmap,
   9.745 +			   (void *)vtophys((vaddr_t)APDP_PDE),
   9.746 +			   (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
   9.747 +			   (void *)pmap->pm_pdirpa,
   9.748 +			   (void *)xpmap_ptom(pmap->pm_pdirpa)));
   9.749 +		mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
   9.750 +		PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V);
   9.751 +#ifdef DEBUG
   9.752 +		curapdp = pmap->pm_pdirpa;
   9.753 +#endif
   9.754 +		if (pmap_valid_entry(opde))
   9.755 +			pmap_apte_flush(ourpmap);
   9.756 +		XENPRINTF(("APDP_PDE set done\n"));
   9.757 +	}
   9.758 +	return(APTE_BASE);
   9.759 +}
   9.760 +
   9.761 +/*
   9.762 + * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
   9.763 + */
   9.764 +
   9.765 +__inline static void
   9.766 +pmap_unmap_ptes(pmap)
   9.767 +	struct pmap *pmap;
   9.768 +{
   9.769 +#if defined(MULTIPROCESSOR)
   9.770 +	pd_entry_t *mapdp;
   9.771 +#endif
   9.772 +
   9.773 +	if (pmap == pmap_kernel()) {
   9.774 +		return;
   9.775 +	}
   9.776 +	if (pmap_is_curpmap(pmap)) {
   9.777 +		simple_unlock(&pmap->pm_obj.vmobjlock);
   9.778 +	} else {
   9.779 +		struct pmap *ourpmap = curcpu()->ci_pmap;
   9.780 +
   9.781 +#if defined(MULTIPROCESSOR)
   9.782 +		mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
   9.783 +		PDE_CLEAR(APDP_PDE, mapdp);
   9.784 +		pmap_apte_flush(ourpmap);
   9.785 +#endif
   9.786 +#ifdef DEBUG
   9.787 +		curapdp = 0;
   9.788 +#endif
   9.789 +		XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n",
   9.790 +			   (void *)vtophys((vaddr_t)APDP_PDE),
   9.791 +			   (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
   9.792 +			   (void *)pmap->pm_pdirpa,
   9.793 +			   (void *)xpmap_ptom(pmap->pm_pdirpa)));
   9.794 +		COUNT(apdp_pde_unmap);
   9.795 +		simple_unlock(&pmap->pm_obj.vmobjlock);
   9.796 +		simple_unlock(&ourpmap->pm_obj.vmobjlock);
   9.797 +	}
   9.798 +}
   9.799 +
   9.800 +__inline static void
   9.801 +pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
   9.802 +{
   9.803 +	if (curproc == NULL || curproc->p_vmspace == NULL ||
   9.804 +	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
   9.805 +		return;
   9.806 +
   9.807 +	if ((opte ^ npte) & PG_X)
   9.808 +		pmap_update_pg(va);
   9.809 +
   9.810 +	/*
   9.811 +	 * Executability was removed on the last executable change.
   9.812 +	 * Reset the code segment to something conservative and
   9.813 +	 * let the trap handler deal with setting the right limit.
   9.814 +	 * We can't do that because of locking constraints on the vm map.
   9.815 +	 */
   9.816 +
   9.817 +	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
   9.818 +		struct trapframe *tf = curlwp->l_md.md_regs;
   9.819 +		struct pcb *pcb = &curlwp->l_addr->u_pcb;
   9.820 +
   9.821 +		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
   9.822 +		pm->pm_hiexec = I386_MAX_EXE_ADDR;
   9.823 +	}
   9.824 +}
   9.825 +
   9.826 +__inline static pt_entry_t
   9.827 +pte_mtop(pt_entry_t pte)
   9.828 +{
   9.829 +	pt_entry_t ppte;
   9.830 +
   9.831 +	KDASSERT(pmap_valid_entry(pte));
   9.832 +	ppte = xpmap_mtop(pte);
   9.833 +	if ((ppte & PG_FRAME) == XPMAP_OFFSET) {
   9.834 +		XENPRINTF(("pte_mtop: null page %08x -> %08x\n",
   9.835 +		    ppte, pte));
   9.836 +		ppte = pte;
   9.837 +	}
   9.838 +
   9.839 +	return ppte;
   9.840 +}
   9.841 +
   9.842 +__inline static pt_entry_t
   9.843 +pte_get_ma(pt_entry_t *pte)
   9.844 +{
   9.845 +
   9.846 +	return *pte;
   9.847 +}
   9.848 +
   9.849 +__inline static pt_entry_t
   9.850 +pte_get(pt_entry_t *pte)
   9.851 +{
   9.852 +
   9.853 +	if (pmap_valid_entry(*pte))
   9.854 +		return pte_mtop(*pte);
   9.855 +	return *pte;
   9.856 +}
   9.857 +
   9.858 +__inline static pt_entry_t
   9.859 +pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
   9.860 +{
   9.861 +	pt_entry_t opte;
   9.862 +
   9.863 +	XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n",
   9.864 +		   pte, mapte, npte));
   9.865 +	opte = PTE_GET_MA(pte);
   9.866 +	if (opte > pmap_mem_end) {
   9.867 +		/* must remove opte unchecked */
   9.868 +		if (npte > pmap_mem_end)
   9.869 +			/* must set npte unchecked */
   9.870 +			xpq_queue_unchecked_pte_update(mapte, npte);
   9.871 +		else {
   9.872 +			/* must set npte checked */
   9.873 +			xpq_queue_unchecked_pte_update(mapte, 0);
   9.874 +			xpq_queue_pte_update(mapte, npte);
   9.875 +		}
   9.876 +	} else {
   9.877 +		/* must remove opte checked */
   9.878 +		if (npte > pmap_mem_end) {
   9.879 +			/* must set npte unchecked */
   9.880 +			xpq_queue_pte_update(mapte, 0);
   9.881 +			xpq_queue_unchecked_pte_update(mapte, npte);
   9.882 +		} else
   9.883 +			/* must set npte checked */
   9.884 +			xpq_queue_pte_update(mapte, npte);
   9.885 +	}
   9.886 +	xpq_flush_queue();
   9.887 +
   9.888 +	return opte;
   9.889 +}
   9.890 +
   9.891 +__inline static pt_entry_t
   9.892 +pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
   9.893 +{
   9.894 +	pt_entry_t opte;
   9.895 +
   9.896 +	opte = pte_atomic_update_ma(pte, mapte, npte);
   9.897 +
   9.898 +	return pte_mtop(opte);
   9.899 +}
   9.900 +
   9.901 +/*
   9.902 + * Fixup the code segment to cover all potential executable mappings.
   9.903 + * returns 0 if no changes to the code segment were made.
   9.904 + */
   9.905 +
   9.906 +int
   9.907 +pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
   9.908 +{
   9.909 +	struct vm_map_entry *ent;
   9.910 +	struct pmap *pm = vm_map_pmap(map);
   9.911 +	vaddr_t va = 0;
   9.912 +
   9.913 +	vm_map_lock_read(map);
   9.914 +	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
   9.915 +
   9.916 +		/*
   9.917 +		 * This entry has greater va than the entries before.
   9.918 +		 * We need to make it point to the last page, not past it.
   9.919 +		 */
   9.920 +
   9.921 +		if (ent->protection & VM_PROT_EXECUTE)
   9.922 +			va = trunc_page(ent->end) - PAGE_SIZE;
   9.923 +	}
   9.924 +	vm_map_unlock_read(map);
   9.925 +	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
   9.926 +		return (0);
   9.927 +
   9.928 +	pm->pm_hiexec = va;
   9.929 +	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
   9.930 +		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
   9.931 +	} else {
   9.932 +		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
   9.933 +		return (0);
   9.934 +	}
   9.935 +	return (1);
   9.936 +}
   9.937 +
   9.938 +/*
   9.939 + * p m a p   k e n t e r   f u n c t i o n s
   9.940 + *
   9.941 + * functions to quickly enter/remove pages from the kernel address
   9.942 + * space.   pmap_kremove is exported to MI kernel.  we make use of
   9.943 + * the recursive PTE mappings.
   9.944 + */
   9.945 +
   9.946 +/*
   9.947 + * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
   9.948 + *
   9.949 + * => no need to lock anything, assume va is already allocated
   9.950 + * => should be faster than normal pmap enter function
   9.951 + */
   9.952 +
   9.953 +void
   9.954 +pmap_kenter_pa(va, pa, prot)
   9.955 +	vaddr_t va;
   9.956 +	paddr_t pa;
   9.957 +	vm_prot_t prot;
   9.958 +{
   9.959 +	pt_entry_t *pte, opte, npte;
   9.960 +	pt_entry_t *maptp;
   9.961 +
   9.962 +	if (va < VM_MIN_KERNEL_ADDRESS)
   9.963 +		pte = vtopte(va);
   9.964 +	else
   9.965 +		pte = kvtopte(va);
   9.966 +
   9.967 +	npte = ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
   9.968 +	     PG_V | pmap_pg_g;
   9.969 +
   9.970 +	if (pa >= pmap_pa_start && pa < pmap_pa_end) {
   9.971 +		npte |= xpmap_ptom(pa);
   9.972 +	} else {
   9.973 +		XENPRINTF(("pmap_kenter: va %08lx outside pa range %08lx\n",
   9.974 +			      va, pa));
   9.975 +		npte |= pa;
   9.976 +	}
   9.977 +
   9.978 +	maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
   9.979 +	opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
   9.980 +	XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va, 
   9.981 +		      (void *)pa, pte, opte, npte));
   9.982 +#ifdef LARGEPAGES
   9.983 +	/* XXX For now... */
   9.984 +	if (opte & PG_PS)
   9.985 +		panic("pmap_kenter_pa: PG_PS");
   9.986 +#endif
   9.987 +	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
   9.988 +#if defined(MULTIPROCESSOR)
   9.989 +		int32_t cpumask = 0;
   9.990 +
   9.991 +		pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
   9.992 +		pmap_tlb_shootnow(cpumask);
   9.993 +#else
   9.994 +		/* Don't bother deferring in the single CPU case. */
   9.995 +		pmap_update_pg(va);
   9.996 +#endif
   9.997 +	}
   9.998 +}
   9.999 +
  9.1000 +/*
  9.1001 + * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
  9.1002 + *
  9.1003 + * => no need to lock anything, assume va is already allocated
  9.1004 + * => should be faster than normal pmap enter function
  9.1005 + */
  9.1006 +
  9.1007 +void		 pmap_kenter_ma __P((vaddr_t, paddr_t, vm_prot_t));
  9.1008 +
  9.1009 +void
  9.1010 +pmap_kenter_ma(va, ma, prot)
  9.1011 +	vaddr_t va;
  9.1012 +	paddr_t ma;
  9.1013 +	vm_prot_t prot;
  9.1014 +{
  9.1015 +	pt_entry_t *pte, opte, npte;
  9.1016 +	pt_entry_t *maptp;
  9.1017 +
  9.1018 +	KASSERT (va >= VM_MIN_KERNEL_ADDRESS);
  9.1019 +	pte = kvtopte(va);
  9.1020 +
  9.1021 +	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
  9.1022 +	     PG_V | pmap_pg_g;
  9.1023 +
  9.1024 +	maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
  9.1025 +	opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
  9.1026 +	XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va,
  9.1027 +		      (void *)ma, pte, opte));
  9.1028 +#ifdef LARGEPAGES
  9.1029 +	/* XXX For now... */
  9.1030 +	if (opte & PG_PS)
  9.1031 +		panic("pmap_kenter_ma: PG_PS");
  9.1032 +#endif
  9.1033 +	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
  9.1034 +#if defined(MULTIPROCESSOR)
  9.1035 +		int32_t cpumask = 0;
  9.1036 +
  9.1037 +		pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
  9.1038 +		pmap_tlb_shootnow(cpumask);
  9.1039 +#else
  9.1040 +		/* Don't bother deferring in the single CPU case. */
  9.1041 +		pmap_update_pg(va);
  9.1042 +#endif
  9.1043 +	}
  9.1044 +}
  9.1045 +
  9.1046 +/*
  9.1047 + * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
  9.1048 + *
  9.1049 + * => no need to lock anything
  9.1050 + * => caller must dispose of any vm_page mapped in the va range
  9.1051 + * => note: not an inline function
  9.1052 + * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
  9.1053 + * => we assume kernel only unmaps valid addresses and thus don't bother
  9.1054 + *    checking the valid bit before doing TLB flushing
  9.1055 + */
  9.1056 +
  9.1057 +void
  9.1058 +pmap_kremove(va, len)
  9.1059 +	vaddr_t va;
  9.1060 +	vsize_t len;
  9.1061 +{
  9.1062 +	pt_entry_t *pte, opte;
  9.1063 +	pt_entry_t *maptp;
  9.1064 +	int32_t cpumask = 0;
  9.1065 +
  9.1066 +	XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len));
  9.1067 +	len >>= PAGE_SHIFT;
  9.1068 +	for ( /* null */ ; len ; len--, va += PAGE_SIZE) {
  9.1069 +		if (va < VM_MIN_KERNEL_ADDRESS)
  9.1070 +			pte = vtopte(va);
  9.1071 +		else
  9.1072 +			pte = kvtopte(va);
  9.1073 +		maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
  9.1074 +		opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */
  9.1075 +		XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte));
  9.1076 +#ifdef LARGEPAGES
  9.1077 +		/* XXX For now... */
  9.1078 +		if (opte & PG_PS)
  9.1079 +			panic("pmap_kremove: PG_PS");
  9.1080 +#endif
  9.1081 +#ifdef DIAGNOSTIC
  9.1082 +		if (opte & PG_PVLIST)
  9.1083 +			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
  9.1084 +			      va);
  9.1085 +#endif
  9.1086 +		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
  9.1087 +			pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
  9.1088 +	}
  9.1089 +	pmap_tlb_shootnow(cpumask);
  9.1090 +}
  9.1091 +
  9.1092 +/*
  9.1093 + * p m a p   i n i t   f u n c t i o n s
  9.1094 + *
  9.1095 + * pmap_bootstrap and pmap_init are called during system startup
  9.1096 + * to init the pmap module.   pmap_bootstrap() does a low level
  9.1097 + * init just to get things rolling.   pmap_init() finishes the job.
  9.1098 + */
  9.1099 +
  9.1100 +/*
  9.1101 + * pmap_bootstrap: get the system in a state where it can run with VM
  9.1102 + *	properly enabled (called before main()).   the VM system is
  9.1103 + *      fully init'd later...
  9.1104 + *
  9.1105 + * => on i386, locore.s has already enabled the MMU by allocating
  9.1106 + *	a PDP for the kernel, and nkpde PTP's for the kernel.
  9.1107 + * => kva_start is the first free virtual address in kernel space
  9.1108 + */
  9.1109 +
  9.1110 +void
  9.1111 +pmap_bootstrap(kva_start)
  9.1112 +	vaddr_t kva_start;
  9.1113 +{
  9.1114 +	struct pmap *kpm;
  9.1115 +	vaddr_t kva;
  9.1116 +	pt_entry_t *pte;
  9.1117 +	pt_entry_t *maptp;
  9.1118 +	int i;
  9.1119 +
  9.1120 +	/*
  9.1121 +	 * set up our local static global vars that keep track of the
  9.1122 +	 * usage of KVM before kernel_map is set up
  9.1123 +	 */
  9.1124 +
  9.1125 +	virtual_avail = kva_start;		/* first free KVA */
  9.1126 +	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
  9.1127 +
  9.1128 +	/*
  9.1129 +	 * find out where physical memory ends on the real hardware.
  9.1130 +	 */
  9.1131 +
  9.1132 +	if (xen_start_info.flags & SIF_PRIVILEGED)
  9.1133 +		pmap_mem_end = find_pmap_mem_end(kva_start);
  9.1134 +
  9.1135 +	/*
  9.1136 +	 * set up protection_codes: we need to be able to convert from
  9.1137 +	 * a MI protection code (some combo of VM_PROT...) to something
  9.1138 +	 * we can jam into a i386 PTE.
  9.1139 +	 */
  9.1140 +
  9.1141 +	protection_codes[VM_PROT_NONE] = 0;  			/* --- */
  9.1142 +	protection_codes[VM_PROT_EXECUTE] = PG_X;		/* --x */
  9.1143 +	protection_codes[VM_PROT_READ] = PG_RO;			/* -r- */
  9.1144 +	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */
  9.1145 +	protection_codes[VM_PROT_WRITE] = PG_RW;		/* w-- */
  9.1146 +	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */
  9.1147 +	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW;	/* wr- */
  9.1148 +	protection_codes[VM_PROT_ALL] = PG_RW|PG_X;		/* wrx */
  9.1149 +
  9.1150 +	/*
  9.1151 +	 * now we init the kernel's pmap
  9.1152 +	 *
  9.1153 +	 * the kernel pmap's pm_obj is not used for much.   however, in
  9.1154 +	 * user pmaps the pm_obj contains the list of active PTPs.
  9.1155 +	 * the pm_obj currently does not have a pager.   it might be possible
  9.1156 +	 * to add a pager that would allow a process to read-only mmap its
  9.1157 +	 * own page tables (fast user level vtophys?).   this may or may not
  9.1158 +	 * be useful.
  9.1159 +	 */
  9.1160 +
  9.1161 +	kpm = pmap_kernel();
  9.1162 +	simple_lock_init(&kpm->pm_obj.vmobjlock);
  9.1163 +	kpm->pm_obj.pgops = NULL;
  9.1164 +	TAILQ_INIT(&kpm->pm_obj.memq);
  9.1165 +	kpm->pm_obj.uo_npages = 0;
  9.1166 +	kpm->pm_obj.uo_refs = 1;
  9.1167 +	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
  9.1168 +	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
  9.1169 +	XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n",
  9.1170 +	    (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr));
  9.1171 +	kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3;
  9.1172 +	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
  9.1173 +		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
  9.1174 +
  9.1175 +	/*
  9.1176 +	 * the above is just a rough estimate and not critical to the proper
  9.1177 +	 * operation of the system.
  9.1178 +	 */
  9.1179 +
  9.1180 +	/*
  9.1181 +	 * Begin to enable global TLB entries if they are supported.
  9.1182 +	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
  9.1183 +	 * which happens in cpu_init(), which is run on each cpu
  9.1184 +	 * (and happens later)
  9.1185 +	 */
  9.1186 +
  9.1187 +	if (cpu_feature & CPUID_PGE) {
  9.1188 +		pmap_pg_g = PG_G;		/* enable software */
  9.1189 +
  9.1190 +		/* add PG_G attribute to already mapped kernel pages */
  9.1191 +		for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
  9.1192 +		     kva += PAGE_SIZE)
  9.1193 +			if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) {
  9.1194 +#if !defined(XEN)
  9.1195 +				PTE_BASE[x86_btop(kva)] |= PG_G;
  9.1196 +#else
  9.1197 +				maptp = (pt_entry_t *)vtomach(
  9.1198 +					(vaddr_t)&PTE_BASE[x86_btop(kva)]);
  9.1199 +				PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp,
  9.1200 +				    PG_G);
  9.1201 +			}
  9.1202 +		PTE_UPDATES_FLUSH();
  9.1203 +#endif
  9.1204 +	}
  9.1205 +
  9.1206 +#ifdef LARGEPAGES
  9.1207 +	/*
  9.1208 +	 * enable large pages if they are supported.
  9.1209 +	 */
  9.1210 +
  9.1211 +	if (cpu_feature & CPUID_PSE) {
  9.1212 +		paddr_t pa;
  9.1213 +		vaddr_t kva_end;
  9.1214 +		pd_entry_t *pde;
  9.1215 +		pd_entry_t *mapdp;
  9.1216 +		extern char _etext;
  9.1217 +
  9.1218 +		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
  9.1219 +		pmap_largepages = 1;	/* enable software */
  9.1220 +
  9.1221 +		/*
  9.1222 +		 * the TLB must be flushed after enabling large pages
  9.1223 +		 * on Pentium CPUs, according to section 3.6.2.2 of
  9.1224 +		 * "Intel Architecture Software Developer's Manual,
  9.1225 +		 * Volume 3: System Programming".
  9.1226 +		 */
  9.1227 +		tlbflush();
  9.1228 +
  9.1229 +		/*
  9.1230 +		 * now, remap the kernel text using large pages.  we
  9.1231 +		 * assume that the linker has properly aligned the
  9.1232 +		 * .data segment to a 4MB boundary.
  9.1233 +		 */
  9.1234 +		kva_end = roundup((vaddr_t)&_etext, NBPD);
  9.1235 +		for (pa = 0, kva = KERNBASE; kva < kva_end;
  9.1236 +		     kva += NBPD, pa += NBPD) {
  9.1237 +			pde = &kpm->pm_pdir[pdei(kva)];
  9.1238 +			mapdp = (pt_entry_t *)vtomach((vaddr_t)pde);
  9.1239 +			PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS |
  9.1240 +			    PG_KR | PG_V); /* zap! */
  9.1241 +			tlbflush();
  9.1242 +		}
  9.1243 +	}
  9.1244 +#endif /* LARGEPAGES */
  9.1245 +
  9.1246 +	/*
  9.1247 +	 * now we allocate the "special" VAs which are used for tmp mappings
  9.1248 +	 * by the pmap (and other modules).    we allocate the VAs by advancing
  9.1249 +	 * virtual_avail (note that there are no pages mapped at these VAs).
  9.1250 +	 * we find the PTE that maps the allocated VA via the linear PTE
  9.1251 +	 * mapping.
  9.1252 +	 */
  9.1253 +
  9.1254 +	pte = PTE_BASE + x86_btop(virtual_avail);
  9.1255 +
  9.1256 +#ifdef MULTIPROCESSOR
  9.1257 +	/*
  9.1258 +	 * Waste some VA space to avoid false sharing of cache lines
  9.1259 +	 * for page table pages: Give each possible CPU a cache line
  9.1260 +	 * of PTE's (8) to play with, though we only need 4.  We could
  9.1261 +	 * recycle some of this waste by putting the idle stacks here
  9.1262 +	 * as well; we could waste less space if we knew the largest
  9.1263 +	 * CPU ID beforehand.
  9.1264 +	 */
  9.1265 +	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;
  9.1266 +
  9.1267 +	cdstp = (caddr_t) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
  9.1268 +
  9.1269 +	zerop = (caddr_t) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
  9.1270 +
  9.1271 +	ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
  9.1272 +
  9.1273 +	virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
  9.1274 +	pte += X86_MAXPROCS * NPTECL;
  9.1275 +#else
  9.1276 +	csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;  /* allocate */
  9.1277 +	virtual_avail += PAGE_SIZE; pte++;			     /* advance */
  9.1278 +
  9.1279 +	cdstp = (caddr_t) virtual_avail;  cdst_pte = pte;
  9.1280 +	virtual_avail += PAGE_SIZE; pte++;
  9.1281 +
  9.1282 +	zerop = (caddr_t) virtual_avail;  zero_pte = pte;
  9.1283 +	virtual_avail += PAGE_SIZE; pte++;
  9.1284 +
  9.1285 +	ptpp = (caddr_t) virtual_avail;  ptp_pte = pte;
  9.1286 +	virtual_avail += PAGE_SIZE; pte++;
  9.1287 +#endif
  9.1288 +
  9.1289 +	XENPRINTK(("pmap_bootstrap csrcp %p cdstp %p zerop %p ptpp %p\n", 
  9.1290 +		      csrc_pte, cdst_pte, zero_pte, ptp_pte));
  9.1291 +	/*
  9.1292 +	 * Nothing after this point actually needs pte;
  9.1293 +	 */
  9.1294 +	pte = (void *)0xdeadbeef;
  9.1295 +
  9.1296 +	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
  9.1297 +	vmmap = (char *)virtual_avail;			/* don't need pte */
  9.1298 +	virtual_avail += PAGE_SIZE;
  9.1299 +
  9.1300 +	msgbuf_vaddr = virtual_avail;			/* don't need pte */
  9.1301 +	virtual_avail += round_page(MSGBUFSIZE);
  9.1302 +
  9.1303 +	idt_vaddr = virtual_avail;			/* don't need pte */
  9.1304 +	virtual_avail += PAGE_SIZE;
  9.1305 +	idt_paddr = avail_start;			/* steal a page */
  9.1306 +	avail_start += PAGE_SIZE;
  9.1307 +
  9.1308 +#if defined(I586_CPU)
  9.1309 +	/* pentium f00f bug stuff */
  9.1310 +	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
  9.1311 +	virtual_avail += PAGE_SIZE;
  9.1312 +#endif
  9.1313 +
  9.1314 +	/*
  9.1315 +	 * now we reserve some VM for mapping pages when doing a crash dump
  9.1316 +	 */
  9.1317 +
  9.1318 +	virtual_avail = reserve_dumppages(virtual_avail);
  9.1319 +
  9.1320 +	/*
  9.1321 +	 * init the static-global locks and global lists.
  9.1322 +	 */
  9.1323 +
  9.1324 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
  9.1325 +	spinlockinit(&pmap_main_lock, "pmaplk", 0);
  9.1326 +#endif
  9.1327 +	simple_lock_init(&pvalloc_lock);
  9.1328 +	simple_lock_init(&pmaps_lock);
  9.1329 +	LIST_INIT(&pmaps);
  9.1330 +	TAILQ_INIT(&pv_freepages);
  9.1331 +	TAILQ_INIT(&pv_unusedpgs);
  9.1332 +
  9.1333 +	/*
  9.1334 +	 * initialize the pmap pool.
  9.1335 +	 */
  9.1336 +
  9.1337 +	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
  9.1338 +	    &pool_allocator_nointr);
  9.1339 +
  9.1340 +	/*
  9.1341 +	 * Initialize the TLB shootdown queues.
  9.1342 +	 */
  9.1343 +
  9.1344 +	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
  9.1345 +
  9.1346 +	for (i = 0; i < X86_MAXPROCS; i++) {
  9.1347 +		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
  9.1348 +		__cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock);
  9.1349 +	}
  9.1350 +
  9.1351 +	/*
  9.1352 +	 * initialize the PDE pool and cache.
  9.1353 +	 */
  9.1354 +	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
  9.1355 +		  &pool_allocator_nointr);
  9.1356 +	pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool,
  9.1357 +			pmap_pdp_ctor, pmap_pdp_dtor, NULL);
  9.1358 +
  9.1359 +	/*
  9.1360 +	 * ensure the TLB is sync'd with reality by flushing it...
  9.1361 +	 */
  9.1362 +
  9.1363 +	tlbflush();
  9.1364 +}
  9.1365 +
  9.1366 +/*
  9.1367 + * pmap_init: called from uvm_init, our job is to get the pmap
  9.1368 + * system ready to manage mappings... this mainly means initing
  9.1369 + * the pv_entry stuff.
  9.1370 + */
  9.1371 +
  9.1372 +void
  9.1373 +pmap_init()
  9.1374 +{
  9.1375 +	int i;
  9.1376 +
  9.1377 +	/*
  9.1378 +	 * now we need to free enough pv_entry structures to allow us to get
  9.1379 +	 * the kmem_map/kmem_object allocated and inited (done after this
  9.1380 +	 * function is finished).  to do this we allocate one bootstrap page out
  9.1381 +	 * of kernel_map and use it to provide an initial pool of pv_entry
  9.1382 +	 * structures.   we never free this page.
  9.1383 +	 */
  9.1384 +
  9.1385 +	pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE);
  9.1386 +	if (pv_initpage == NULL)
  9.1387 +		panic("pmap_init: pv_initpage");
  9.1388 +	pv_cachedva = 0;   /* a VA we have allocated but not used yet */
  9.1389 +	pv_nfpvents = 0;
  9.1390 +	(void) pmap_add_pvpage(pv_initpage, FALSE);
  9.1391 +
  9.1392 +	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE);
  9.1393 +	if (pj_page == NULL)
  9.1394 +		panic("pmap_init: pj_page");
  9.1395 +
  9.1396 +	for (i = 0;
  9.1397 +	     i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
  9.1398 +	     i++)
  9.1399 +		pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
  9.1400 +	pj_page[i].pja_job.pj_nextfree = NULL;
  9.1401 +	pj_free = &pj_page[0];
  9.1402 +
  9.1403 +	/*
  9.1404 +	 * done: pmap module is up (and ready for business)
  9.1405 +	 */
  9.1406 +
  9.1407 +	pmap_initialized = TRUE;
  9.1408 +}
  9.1409 +
  9.1410 +/*
  9.1411 + * p v _ e n t r y   f u n c t i o n s
  9.1412 + */
  9.1413 +
  9.1414 +/*
  9.1415 + * pv_entry allocation functions:
  9.1416 + *   the main pv_entry allocation functions are:
  9.1417 + *     pmap_alloc_pv: allocate a pv_entry structure
  9.1418 + *     pmap_free_pv: free one pv_entry
  9.1419 + *     pmap_free_pvs: free a list of pv_entrys
  9.1420 + *
  9.1421 + * the rest are helper functions
  9.1422 + */
  9.1423 +
  9.1424 +/*
  9.1425 + * pmap_alloc_pv: inline function to allocate a pv_entry structure
  9.1426 + * => we lock pvalloc_lock
  9.1427 + * => if we fail, we call out to pmap_alloc_pvpage
  9.1428 + * => 3 modes:
  9.1429 + *    ALLOCPV_NEED   = we really need a pv_entry, even if we have to steal it
  9.1430 + *    ALLOCPV_TRY    = we want a pv_entry, but not enough to steal
  9.1431 + *    ALLOCPV_NONEED = we are trying to grow our free list, don't really need
  9.1432 + *			one now
  9.1433 + *
  9.1434 + * "try" is for optional functions like pmap_copy().
  9.1435 + */
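
/*
 * A minimal usage sketch of the caller-visible modes; illustrative only,
 * the function name "example_alloc_pv_modes" is a placeholder and not
 * part of this pmap.
 */
static void
example_alloc_pv_modes(struct pmap *pmap)
{
	struct pv_entry *pve;

	/* pmap_enter-style path: we really want one, steal if need be */
	pve = pmap_alloc_pv(pmap, ALLOCPV_NEED);
	if (pve != NULL)
		pmap_free_pv(pmap, pve);

	/* pmap_copy-style path: optional, give up rather than steal */
	pve = pmap_alloc_pv(pmap, ALLOCPV_TRY);
	if (pve != NULL)
		pmap_free_pv(pmap, pve);
}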
  9.1436 +
  9.1437 +__inline static struct pv_entry *
  9.1438 +pmap_alloc_pv(pmap, mode)
  9.1439 +	struct pmap *pmap;
  9.1440 +	int mode;
  9.1441 +{
  9.1442 +	struct pv_page *pvpage;
  9.1443 +	struct pv_entry *pv;
  9.1444 +
  9.1445 +	simple_lock(&pvalloc_lock);
  9.1446 +
  9.1447 +	pvpage = TAILQ_FIRST(&pv_freepages);
  9.1448 +	if (pvpage != NULL) {
  9.1449 +		pvpage->pvinfo.pvpi_nfree--;
  9.1450 +		if (pvpage->pvinfo.pvpi_nfree == 0) {
  9.1451 +			/* nothing left in this one? */
  9.1452 +			TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list);
  9.1453 +		}
  9.1454 +		pv = pvpage->pvinfo.pvpi_pvfree;
  9.1455 +		KASSERT(pv);
  9.1456 +		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
  9.1457 +		pv_nfpvents--;  /* took one from pool */
  9.1458 +	} else {
  9.1459 +		pv = NULL;		/* need more of them */
  9.1460 +	}
  9.1461 +
  9.1462 +	/*
  9.1463 +	 * if below low water mark or we didn't get a pv_entry we try and
  9.1464 +	 * create more pv_entrys ...
  9.1465 +	 */
  9.1466 +
  9.1467 +	if (pv_nfpvents < PVE_LOWAT || pv == NULL) {
  9.1468 +		if (pv == NULL)
  9.1469 +			pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ?
  9.1470 +					       mode : ALLOCPV_NEED);
  9.1471 +		else
  9.1472 +			(void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED);
  9.1473 +	}
  9.1474 +	simple_unlock(&pvalloc_lock);
  9.1475 +	return(pv);
  9.1476 +}
  9.1477 +
  9.1478 +/*
  9.1479 + * pmap_alloc_pvpage: maybe allocate a new pvpage
  9.1480 + *
  9.1481 + * if need_entry is false: try and allocate a new pv_page
  9.1482 + * if need_entry is true: try and allocate a new pv_page and return a
  9.1483 + *	new pv_entry from it.   if we are unable to allocate a pv_page
  9.1484 + *	we make a last ditch effort to steal a pv_page from some other
  9.1485 + *	mapping.    if that fails, we panic...
  9.1486 + *
  9.1487 + * => we assume that the caller holds pvalloc_lock
  9.1488 + */
  9.1489 +
  9.1490 +static struct pv_entry *
  9.1491 +pmap_alloc_pvpage(pmap, mode)
  9.1492 +	struct pmap *pmap;
  9.1493 +	int mode;
  9.1494 +{
  9.1495 +	struct vm_page *pg;
  9.1496 +	struct pv_page *pvpage;
  9.1497 +	struct pv_entry *pv;
  9.1498 +	int s;
  9.1499 +
  9.1500 +	/*
  9.1501 +	 * if we need_entry and we've got unused pv_pages, allocate from there
  9.1502 +	 */
  9.1503 +
  9.1504 +	pvpage = TAILQ_FIRST(&pv_unusedpgs);
  9.1505 +	if (mode != ALLOCPV_NONEED && pvpage != NULL) {
  9.1506 +
  9.1507 +		/* move it to pv_freepages list */
  9.1508 +		TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list);
  9.1509 +		TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list);
  9.1510 +
  9.1511 +		/* allocate a pv_entry */
  9.1512 +		pvpage->pvinfo.pvpi_nfree--;	/* can't go to zero */
  9.1513 +		pv = pvpage->pvinfo.pvpi_pvfree;
  9.1514 +		KASSERT(pv);
  9.1515 +		pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
  9.1516 +		pv_nfpvents--;  /* took one from pool */
  9.1517 +		return(pv);
  9.1518 +	}
  9.1519 +
  9.1520 +	/*
  9.1521 +	 *  see if we've got a cached unmapped VA that we can map a page in.
  9.1522 +	 * if not, try to allocate one.
  9.1523 +	 */
  9.1524 +
  9.1525 +	if (pv_cachedva == 0) {
  9.1526 +		s = splvm();   /* must protect kmem_map with splvm! */
  9.1527 +		pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE,
  9.1528 +		    UVM_KMF_TRYLOCK|UVM_KMF_VALLOC);
  9.1529 +		splx(s);
  9.1530 +		if (pv_cachedva == 0) {
  9.1531 +			return (NULL);
  9.1532 +		}
  9.1533 +	}
  9.1534 +
  9.1535 +	pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL,
  9.1536 +	    UVM_PGA_USERESERVE);
  9.1537 +	if (pg == NULL)
  9.1538 +		return (NULL);
  9.1539 +	pg->flags &= ~PG_BUSY;	/* never busy */
  9.1540 +
  9.1541 +	/*
  9.1542 +	 * add a mapping for our new pv_page and free its entrys (save one!)
  9.1543 +	 *
  9.1544 +	 * NOTE: If we are allocating a PV page for the kernel pmap, the
  9.1545 +	 * pmap is already locked!  (...but entering the mapping is safe...)
  9.1546 +	 */
  9.1547 +
  9.1548 +	pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg),
  9.1549 +	    VM_PROT_READ | VM_PROT_WRITE);
  9.1550 +	pmap_update(pmap_kernel());
  9.1551 +	pvpage = (struct pv_page *) pv_cachedva;
  9.1552 +	pv_cachedva = 0;
  9.1553 +	return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED));
  9.1554 +}
  9.1555 +
  9.1556 +/*
  9.1557 + * pmap_add_pvpage: add a pv_page's pv_entrys to the free list
  9.1558 + *
  9.1559 + * => caller must hold pvalloc_lock
  9.1560 + * => if need_entry is true, we allocate and return one pv_entry
  9.1561 + */
  9.1562 +
  9.1563 +static struct pv_entry *
  9.1564 +pmap_add_pvpage(pvp, need_entry)
  9.1565 +	struct pv_page *pvp;
  9.1566 +	boolean_t need_entry;
  9.1567 +{
  9.1568 +	int tofree, lcv;
  9.1569 +
  9.1570 +	/* do we need to return one? */
  9.1571 +	tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE;
  9.1572 +
  9.1573 +	pvp->pvinfo.pvpi_pvfree = NULL;
  9.1574 +	pvp->pvinfo.pvpi_nfree = tofree;
  9.1575 +	for (lcv = 0 ; lcv < tofree ; lcv++) {
  9.1576 +		SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
  9.1577 +			pvp->pvinfo.pvpi_pvfree;
  9.1578 +		pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
  9.1579 +	}
  9.1580 +	if (need_entry)
  9.1581 +		TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list);
  9.1582 +	else
  9.1583 +		TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
  9.1584 +	pv_nfpvents += tofree;
  9.1585 +	return((need_entry) ? &pvp->pvents[lcv] : NULL);
  9.1586 +}
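/*
 * Editor's note -- a minimal standalone sketch (not part of the patch) of
 * the free-list trick used by pmap_add_pvpage() above: free pv_entrys are
 * threaded into a singly linked list by reusing the splay-tree right
 * pointer, so no separate "next" field is needed while an entry sits on
 * the free list.  The struct and names below are hypothetical stand-ins.
 */
#include <stdio.h>

struct entry {
	struct entry *right;	/* plays the role of SPLAY_RIGHT(pv, pv_node) */
	int payload;
};

int
main(void)
{
	struct entry page[8];		/* stands in for pvp->pvents[] */
	struct entry *freelist = NULL;	/* stands in for pvpi_pvfree */
	int lcv, nfree = 0;

	/* thread every entry onto the free list, as pmap_add_pvpage() does */
	for (lcv = 0; lcv < 8; lcv++) {
		page[lcv].right = freelist;
		freelist = &page[lcv];
		nfree++;
	}

	/* allocation pops the head, as pmap_alloc_pv()/pmap_alloc_pvpage() do */
	struct entry *e = freelist;
	freelist = e->right;
	nfree--;
	printf("allocated slot %td, %d still free\n", e - page, nfree);
	return 0;
}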
  9.1587 +
  9.1588 +/*
  9.1589 + * pmap_free_pv_doit: actually free a pv_entry
  9.1590 + *
  9.1591 + * => do not call this directly!  instead use either
  9.1592 + *    1. pmap_free_pv ==> free a single pv_entry
   9.1593 + *    2. pmap_free_pvs ==> free a list of pv_entrys
  9.1594 + * => we must be holding pvalloc_lock
  9.1595 + */
  9.1596 +
  9.1597 +__inline static void
  9.1598 +pmap_free_pv_doit(pv)
  9.1599 +	struct pv_entry *pv;
  9.1600 +{
  9.1601 +	struct pv_page *pvp;
  9.1602 +
  9.1603 +	pvp = (struct pv_page *) x86_trunc_page(pv);
  9.1604 +	pv_nfpvents++;
  9.1605 +	pvp->pvinfo.pvpi_nfree++;
  9.1606 +
  9.1607 +	/* nfree == 1 => fully allocated page just became partly allocated */
  9.1608 +	if (pvp->pvinfo.pvpi_nfree == 1) {
  9.1609 +		TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list);
  9.1610 +	}
  9.1611 +
  9.1612 +	/* free it */
  9.1613 +	SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
  9.1614 +	pvp->pvinfo.pvpi_pvfree = pv;
  9.1615 +
  9.1616 +	/*
  9.1617 +	 * are all pv_page's pv_entry's free?  move it to unused queue.
  9.1618 +	 */
  9.1619 +
  9.1620 +	if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) {
  9.1621 +		TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list);
  9.1622 +		TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
  9.1623 +	}
  9.1624 +}
  9.1625 +
  9.1626 +/*
  9.1627 + * pmap_free_pv: free a single pv_entry
  9.1628 + *
  9.1629 + * => we gain the pvalloc_lock
  9.1630 + */
  9.1631 +
  9.1632 +__inline static void
  9.1633 +pmap_free_pv(pmap, pv)
  9.1634 +	struct pmap *pmap;
  9.1635 +	struct pv_entry *pv;
  9.1636 +{
  9.1637 +	simple_lock(&pvalloc_lock);
  9.1638 +	pmap_free_pv_doit(pv);
  9.1639 +
  9.1640 +	/*
  9.1641 +	 * Can't free the PV page if the PV entries were associated with
  9.1642 +	 * the kernel pmap; the pmap is already locked.
  9.1643 +	 */
  9.1644 +	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
  9.1645 +	    pmap != pmap_kernel())
  9.1646 +		pmap_free_pvpage();
  9.1647 +
  9.1648 +	simple_unlock(&pvalloc_lock);
  9.1649 +}
  9.1650 +
  9.1651 +/*
  9.1652 + * pmap_free_pvs: free a list of pv_entrys
  9.1653 + *
  9.1654 + * => we gain the pvalloc_lock
  9.1655 + */
  9.1656 +
  9.1657 +__inline static void
  9.1658 +pmap_free_pvs(pmap, pvs)
  9.1659 +	struct pmap *pmap;
  9.1660 +	struct pv_entry *pvs;
  9.1661 +{
  9.1662 +	struct pv_entry *nextpv;
  9.1663 +
  9.1664 +	simple_lock(&pvalloc_lock);
  9.1665 +
  9.1666 +	for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
  9.1667 +		nextpv = SPLAY_RIGHT(pvs, pv_node);
  9.1668 +		pmap_free_pv_doit(pvs);
  9.1669 +	}
  9.1670 +
  9.1671 +	/*
  9.1672 +	 * Can't free the PV page if the PV entries were associated with
  9.1673 +	 * the kernel pmap; the pmap is already locked.
  9.1674 +	 */
  9.1675 +	if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
  9.1676 +	    pmap != pmap_kernel())
  9.1677 +		pmap_free_pvpage();
  9.1678 +
  9.1679 +	simple_unlock(&pvalloc_lock);
  9.1680 +}
  9.1681 +
  9.1682 +
  9.1683 +/*
  9.1684 + * pmap_free_pvpage: try and free an unused pv_page structure
  9.1685 + *
  9.1686 + * => assume caller is holding the pvalloc_lock and that
  9.1687 + *	there is a page on the pv_unusedpgs list
  9.1688 + * => if we can't get a lock on the kmem_map we try again later
  9.1689 + */
  9.1690 +
  9.1691 +static void
  9.1692 +pmap_free_pvpage()
  9.1693 +{
  9.1694 +	int s;
  9.1695 +	struct vm_map *map;
  9.1696 +	struct vm_map_entry *dead_entries;
  9.1697 +	struct pv_page *pvp;
  9.1698 +
  9.1699 +	s = splvm(); /* protect kmem_map */
  9.1700 +
  9.1701 +	pvp = TAILQ_FIRST(&pv_unusedpgs);
  9.1702 +
  9.1703 +	/*
  9.1704 +	 * note: watch out for pv_initpage which is allocated out of
  9.1705 +	 * kernel_map rather than kmem_map.
  9.1706 +	 */
  9.1707 +
  9.1708 +	if (pvp == pv_initpage)
  9.1709 +		map = kernel_map;
  9.1710 +	else
  9.1711 +		map = kmem_map;
  9.1712 +	if (vm_map_lock_try(map)) {
  9.1713 +
  9.1714 +		/* remove pvp from pv_unusedpgs */
  9.1715 +		TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
  9.1716 +
  9.1717 +		/* unmap the page */
  9.1718 +		dead_entries = NULL;
  9.1719 +		uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE,
  9.1720 +		    &dead_entries);
  9.1721 +		vm_map_unlock(map);
  9.1722 +
  9.1723 +		if (dead_entries != NULL)
  9.1724 +			uvm_unmap_detach(dead_entries, 0);
  9.1725 +
  9.1726 +		pv_nfpvents -= PVE_PER_PVPAGE;  /* update free count */
  9.1727 +	}
  9.1728 +	if (pvp == pv_initpage)
  9.1729 +		/* no more initpage, we've freed it */
  9.1730 +		pv_initpage = NULL;
  9.1731 +
  9.1732 +	splx(s);
  9.1733 +}
  9.1734 +
  9.1735 +/*
   9.1736 + * pmap_lock_pvhs: Lock pvh1 and, optionally, pvh2
   9.1737 + *                 Observe the locking order (by address) when locking both pvhs
  9.1738 + */
  9.1739 +
  9.1740 +__inline static void
  9.1741 +pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
  9.1742 +{
  9.1743 +
  9.1744 +	if (pvh2 == NULL) {
  9.1745 +		simple_lock(&pvh1->pvh_lock);
  9.1746 +		return;
  9.1747 +	}
  9.1748 +
  9.1749 +	if (pvh1 < pvh2) {
  9.1750 +		simple_lock(&pvh1->pvh_lock);
  9.1751 +		simple_lock(&pvh2->pvh_lock);
  9.1752 +	} else {
  9.1753 +		simple_lock(&pvh2->pvh_lock);
  9.1754 +		simple_lock(&pvh1->pvh_lock);
  9.1755 +	}
  9.1756 +}
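/*
 * Editor's note -- a hedged sketch (not part of the patch) of the
 * deadlock-avoidance rule pmap_lock_pvhs() follows: when two pv_head
 * locks must be held at once, always take them in address order, so two
 * CPUs locking the same pair can never wait on each other.  POSIX
 * mutexes stand in here for the kernel's simple_locks.
 */
#include <pthread.h>

static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (b == NULL) {
		pthread_mutex_lock(a);
		return;
	}
	if (a < b) {		/* lower address first, always */
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}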
  9.1757 +
  9.1758 +
  9.1759 +/*
  9.1760 + * main pv_entry manipulation functions:
  9.1761 + *   pmap_enter_pv: enter a mapping onto a pv_head list
   9.1762 + *   pmap_remove_pv: remove a mapping from a pv_head list
  9.1763 + *
  9.1764 + * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 
  9.1765 + *       the pvh before calling
  9.1766 + */
  9.1767 +
  9.1768 +/*
   9.1769 + * pmap_enter_pv: enter a mapping onto a pv_head list
  9.1770 + *
  9.1771 + * => caller should hold the proper lock on pmap_main_lock
  9.1772 + * => caller should have pmap locked
  9.1773 + * => caller should have the pv_head locked
  9.1774 + * => caller should adjust ptp's wire_count before calling
  9.1775 + */
  9.1776 +
  9.1777 +__inline static void
  9.1778 +pmap_enter_pv(pvh, pve, pmap, va, ptp)
  9.1779 +	struct pv_head *pvh;
  9.1780 +	struct pv_entry *pve;	/* preallocated pve for us to use */
  9.1781 +	struct pmap *pmap;
  9.1782 +	vaddr_t va;
  9.1783 +	struct vm_page *ptp;	/* PTP in pmap that maps this VA */
  9.1784 +{
  9.1785 +	pve->pv_pmap = pmap;
  9.1786 +	pve->pv_va = va;
  9.1787 +	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
  9.1788 +	SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
  9.1789 +}
  9.1790 +
  9.1791 +/*
  9.1792 + * pmap_remove_pv: try to remove a mapping from a pv_list
  9.1793 + *
  9.1794 + * => caller should hold proper lock on pmap_main_lock
  9.1795 + * => pmap should be locked
  9.1796 + * => caller should hold lock on pv_head [so that attrs can be adjusted]
  9.1797 + * => caller should adjust ptp's wire_count and free PTP if needed
  9.1798 + * => we return the removed pve
  9.1799 + */
  9.1800 +
  9.1801 +__inline static struct pv_entry *
  9.1802 +pmap_remove_pv(pvh, pmap, va)
  9.1803 +	struct pv_head *pvh;
  9.1804 +	struct pmap *pmap;
  9.1805 +	vaddr_t va;
  9.1806 +{
  9.1807 +	struct pv_entry tmp, *pve;
  9.1808 +
  9.1809 +	tmp.pv_pmap = pmap;
  9.1810 +	tmp.pv_va = va;
  9.1811 +	pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
  9.1812 +	if (pve == NULL)
  9.1813 +		return (NULL);
  9.1814 +	SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
  9.1815 +	return(pve);				/* return removed pve */
  9.1816 +}
  9.1817 +
  9.1818 +/*
  9.1819 + * p t p   f u n c t i o n s
  9.1820 + */
  9.1821 +
  9.1822 +/*
  9.1823 + * pmap_alloc_ptp: allocate a PTP for a PMAP
  9.1824 + *
  9.1825 + * => pmap should already be locked by caller
  9.1826 + * => we use the ptp's wire_count to count the number of active mappings
  9.1827 + *	in the PTP (we start it at one to prevent any chance this PTP
  9.1828 + *	will ever leak onto the active/inactive queues)
  9.1829 + */
  9.1830 +
  9.1831 +__inline static struct vm_page *
  9.1832 +pmap_alloc_ptp(pmap, pde_index)
  9.1833 +	struct pmap *pmap;
  9.1834 +	int pde_index;
  9.1835 +{
  9.1836 +	struct vm_page *ptp;
  9.1837 +	pd_entry_t *mapdp;
  9.1838 +
  9.1839 +	ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
  9.1840 +			    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
  9.1841 +	if (ptp == NULL)
  9.1842 +		return(NULL);
  9.1843 +
  9.1844 +	/* got one! */
  9.1845 +	ptp->flags &= ~PG_BUSY;	/* never busy */
  9.1846 +	ptp->wire_count = 1;	/* no mappings yet */
  9.1847 +	mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]);
  9.1848 +	PDE_SET(&pmap->pm_pdir[pde_index], mapdp,
  9.1849 +	    (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V));
  9.1850 +	pmap->pm_stats.resident_count++;	/* count PTP as resident */
  9.1851 +	pmap->pm_ptphint = ptp;
  9.1852 +	return(ptp);
  9.1853 +}
  9.1854 +
  9.1855 +/*
  9.1856 + * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
  9.1857 + *
  9.1858 + * => pmap should NOT be pmap_kernel()
  9.1859 + * => pmap should be locked
  9.1860 + */
  9.1861 +
  9.1862 +static struct vm_page *
  9.1863 +pmap_get_ptp(pmap, pde_index)
  9.1864 +	struct pmap *pmap;
  9.1865 +	int pde_index;
  9.1866 +{
  9.1867 +	struct vm_page *ptp;
  9.1868 +
  9.1869 +	if (pmap_valid_entry(pmap->pm_pdir[pde_index])) {
  9.1870 +
  9.1871 +		/* valid... check hint (saves us a PA->PG lookup) */
  9.1872 +		if (pmap->pm_ptphint &&
  9.1873 +		    (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) ==
  9.1874 +		    VM_PAGE_TO_PHYS(pmap->pm_ptphint))
  9.1875 +			return(pmap->pm_ptphint);
  9.1876 +
  9.1877 +		ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
  9.1878 +#ifdef DIAGNOSTIC
  9.1879 +		if (ptp == NULL)
  9.1880 +			panic("pmap_get_ptp: unmanaged user PTP");
  9.1881 +#endif
  9.1882 +		pmap->pm_ptphint = ptp;
  9.1883 +		return(ptp);
  9.1884 +	}
  9.1885 +
  9.1886 +	/* allocate a new PTP (updates ptphint) */
  9.1887 +	return(pmap_alloc_ptp(pmap, pde_index));
  9.1888 +}
  9.1889 +
  9.1890 +/*
  9.1891 + * p m a p  l i f e c y c l e   f u n c t i o n s
  9.1892 + */
  9.1893 +
  9.1894 +/*
  9.1895 + * pmap_pdp_ctor: constructor for the PDP cache.
  9.1896 + */
  9.1897 +
  9.1898 +int
  9.1899 +pmap_pdp_ctor(void *arg, void *object, int flags)
  9.1900 +{
  9.1901 +	pd_entry_t *pdir = object;
  9.1902 +	paddr_t pdirpa;
  9.1903 +
  9.1904 +	/*
  9.1905 +	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
  9.1906 +	 * WE MUST NOT BLOCK!
  9.1907 +	 */
  9.1908 +
  9.1909 +	/* fetch the physical address of the page directory. */
  9.1910 +	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
  9.1911 +
  9.1912 +	XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa));
  9.1913 +
  9.1914 +	/* zero init area */
  9.1915 +	memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t));
  9.1916 +
  9.1917 +	/* put in recursive PDE to map the PTEs */
  9.1918 +	pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */);
  9.1919 +
  9.1920 +	/* put in kernel VM PDEs */
  9.1921 +	memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN],
  9.1922 +	    nkpde * sizeof(pd_entry_t));
  9.1923 +
  9.1924 +	/* zero the rest */
  9.1925 +	memset(&pdir[PDSLOT_KERN + nkpde], 0,
  9.1926 +	    PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
  9.1927 +
  9.1928 +	pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ,
  9.1929 +	    VM_PROT_READ);
  9.1930 +	pmap_update(pmap_kernel());
  9.1931 +
  9.1932 +	/* pin page type */
  9.1933 +	xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE);
  9.1934 +	xpq_flush_queue();
  9.1935 +
  9.1936 +	return (0);
  9.1937 +}
  9.1938 +
  9.1939 +void
  9.1940 +pmap_pdp_dtor(void *arg, void *object)
  9.1941 +{
  9.1942 +	pd_entry_t *pdir = object;
  9.1943 +	paddr_t pdirpa;
  9.1944 +
  9.1945 +	/* fetch the physical address of the page directory. */
  9.1946 +	pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME;
  9.1947 +
  9.1948 +	XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa));
  9.1949 +
  9.1950 +	/* unpin page type */
  9.1951 +	xpq_queue_unpin_table(xpmap_ptom(pdirpa));
  9.1952 +	xpq_flush_queue();
  9.1953 +}
  9.1954 +
  9.1955 +/*
  9.1956 + * pmap_create: create a pmap
  9.1957 + *
   9.1958 + * => note: the old pmap interface took a "size" arg which allowed for
  9.1959 + *	the creation of "software only" pmaps (not in bsd).
  9.1960 + */
  9.1961 +
  9.1962 +struct pmap *
  9.1963 +pmap_create()
  9.1964 +{
  9.1965 +	struct pmap *pmap;
  9.1966 +	u_int gen;
  9.1967 +
  9.1968 +	XENPRINTF(("pmap_create\n"));
  9.1969 +	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
  9.1970 +
  9.1971 +	/* init uvm_object */
  9.1972 +	simple_lock_init(&pmap->pm_obj.vmobjlock);
  9.1973 +	pmap->pm_obj.pgops = NULL;	/* currently not a mappable object */
  9.1974 +	TAILQ_INIT(&pmap->pm_obj.memq);
  9.1975 +	pmap->pm_obj.uo_npages = 0;
  9.1976 +	pmap->pm_obj.uo_refs = 1;
  9.1977 +	pmap->pm_stats.wired_count = 0;
  9.1978 +	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
  9.1979 +	pmap->pm_ptphint = NULL;
  9.1980 +	pmap->pm_hiexec = 0;
  9.1981 +	pmap->pm_flags = 0;
  9.1982 +	pmap->pm_cpus = 0;
  9.1983 +
  9.1984 +	/* init the LDT */
  9.1985 +	pmap->pm_ldt = NULL;
  9.1986 +	pmap->pm_ldt_len = 0;
  9.1987 +	pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
  9.1988 +
  9.1989 +	/* allocate PDP */
  9.1990 +
  9.1991 +	/*
  9.1992 +	 * we need to lock pmaps_lock to prevent nkpde from changing on
  9.1993 +	 * us.  note that there is no need to splvm to protect us from
  9.1994 +	 * malloc since malloc allocates out of a submap and we should
  9.1995 +	 * have already allocated kernel PTPs to cover the range...
  9.1996 +	 *
  9.1997 +	 * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor
  9.1998 +	 * must we call pmap_growkernel() while holding it!
  9.1999 +	 */
  9.2000 +
  9.2001 + try_again:
  9.2002 +	gen = pmap_pdp_cache_generation;
  9.2003 +	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
  9.2004 +
  9.2005 +	simple_lock(&pmaps_lock);
  9.2006 +
  9.2007 +	if (gen != pmap_pdp_cache_generation) {
  9.2008 +		simple_unlock(&pmaps_lock);
  9.2009 +		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
  9.2010 +		goto try_again;
  9.2011 +	}
  9.2012 +
  9.2013 +	pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME;
  9.2014 +	XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap,
  9.2015 +		   (void *)pmap->pm_pdirpa,
  9.2016 +		   (void *)xpmap_ptom(pmap->pm_pdirpa),
  9.2017 +		   (void *)pmap->pm_pdir[PDSLOT_PTE]));
  9.2018 +
  9.2019 +	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
  9.2020 +
  9.2021 +	simple_unlock(&pmaps_lock);
  9.2022 +
  9.2023 +	return (pmap);
  9.2024 +}
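/*
 * Editor's note -- a hedged sketch (not part of the patch) of the retry
 * pattern in pmap_create() above: snapshot a generation counter before a
 * possibly-sleeping cache allocation, and if the counter moved while we
 * slept (the cached PDPs were rebuilt, e.g. by pmap_growkernel()), discard
 * the object and try again.  Names below are hypothetical.
 */
static volatile unsigned int cache_generation;

static void *
get_fresh_object(void *(*cache_get)(void), void (*cache_destruct)(void *))
{
	unsigned int gen;
	void *obj;

	for (;;) {
		gen = cache_generation;	/* snapshot before we may sleep */
		obj = cache_get();	/* may block; generation may move */
		if (gen == cache_generation)
			return obj;	/* still current, safe to use */
		cache_destruct(obj);	/* stale; throw it away and retry */
	}
}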
  9.2025 +
  9.2026 +/*
  9.2027 + * pmap_destroy: drop reference count on pmap.   free pmap if
  9.2028 + *	reference count goes to zero.
  9.2029 + */
  9.2030 +
  9.2031 +void
  9.2032 +pmap_destroy(pmap)
  9.2033 +	struct pmap *pmap;
  9.2034 +{
  9.2035 +	int refs;
  9.2036 +#ifdef DIAGNOSTIC
  9.2037 +	struct cpu_info *ci;
  9.2038 +	CPU_INFO_ITERATOR cii;
  9.2039 +#endif /* DIAGNOSTIC */
  9.2040 +
  9.2041 +	/*
  9.2042 +	 * drop reference count
  9.2043 +	 */
  9.2044 +
  9.2045 +	simple_lock(&pmap->pm_obj.vmobjlock);
  9.2046 +	refs = --pmap->pm_obj.uo_refs;
  9.2047 +	simple_unlock(&pmap->pm_obj.vmobjlock);
  9.2048 +	if (refs > 0) {
  9.2049 +		return;
  9.2050 +	}
  9.2051 +
  9.2052 +#ifdef DIAGNOSTIC
  9.2053 +	for (CPU_INFO_FOREACH(cii, ci))
  9.2054 +		if (ci->ci_pmap == pmap)
  9.2055 +			panic("destroying pmap being used");
  9.2056 +#endif /* DIAGNOSTIC */
  9.2057 +
  9.2058 +	/*
  9.2059 +	 * reference count is zero, free pmap resources and then free pmap.
  9.2060 +	 */
  9.2061 +
  9.2062 +	XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap,
  9.2063 +		   (void *)pmap->pm_pdirpa,
  9.2064 +		   (void *)xpmap_ptom(pmap->pm_pdirpa)));
  9.2065 +
  9.2066 +	/*
  9.2067 +	 * remove it from global list of pmaps
  9.2068 +	 */
  9.2069 +
  9.2070 +	simple_lock(&pmaps_lock);
  9.2071 +	LIST_REMOVE(pmap, pm_list);
  9.2072 +	simple_unlock(&pmaps_lock);
  9.2073 +
  9.2074 +	/*
  9.2075 +	 * destroyed pmap shouldn't have remaining PTPs
  9.2076 +	 */
  9.2077 +
  9.2078 +	KASSERT(pmap->pm_obj.uo_npages == 0);
  9.2079 +	KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq));
  9.2080 +
  9.2081 +	/*
  9.2082 +	 * MULTIPROCESSOR -- no need to flush out of other processors'
  9.2083 +	 * APTE space because we do that in pmap_unmap_ptes().
  9.2084 +	 */
  9.2085 +	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
  9.2086 +
  9.2087 +#ifdef USER_LDT
  9.2088 +	if (pmap->pm_flags & PMF_USER_LDT) {
  9.2089 +		/*
  9.2090 +		 * no need to switch the LDT; this address space is gone,
  9.2091 +		 * nothing is using it.
  9.2092 +		 *
  9.2093 +		 * No need to lock the pmap for ldt_free (or anything else),
  9.2094 +		 * we're the last one to use it.
  9.2095 +		 */
  9.2096 +		ldt_free(pmap);
  9.2097 +		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
  9.2098 +			    pmap->pm_ldt_len * sizeof(union descriptor));
  9.2099 +	}
  9.2100 +#endif
  9.2101 +
  9.2102 +	pool_put(&pmap_pmap_pool, pmap);
  9.2103 +}
  9.2104 +
  9.2105 +/*
  9.2106 + *	Add a reference to the specified pmap.
  9.2107 + */
  9.2108 +
  9.2109 +void
  9.2110 +pmap_reference(pmap)
  9.2111 +	struct pmap *pmap;
  9.2112 +{
  9.2113 +	simple_lock(&pmap->pm_obj.vmobjlock);
  9.2114 +	pmap->pm_obj.uo_refs++;
  9.2115 +	simple_unlock(&pmap->pm_obj.vmobjlock);
  9.2116 +}
  9.2117 +
  9.2118 +#if defined(PMAP_FORK)
  9.2119 +/*
  9.2120 + * pmap_fork: perform any necessary data structure manipulation when
  9.2121 + * a VM space is forked.
  9.2122 + */
  9.2123 +
  9.2124 +void
  9.2125 +pmap_fork(pmap1, pmap2)
  9.2126 +	struct pmap *pmap1, *pmap2;
  9.2127 +{
  9.2128 +	simple_lock(&pmap1->pm_obj.vmobjlock);
  9.2129 +	simple_lock(&pmap2->pm_obj.vmobjlock);
  9.2130 +
  9.2131 +#ifdef USER_LDT
  9.2132 +	/* Copy the LDT, if necessary. */
  9.2133 +	if (pmap1->pm_flags & PMF_USER_LDT) {
  9.2134 +		union descriptor *new_ldt;
  9.2135 +		size_t len;
  9.2136 +
  9.2137 +		len = pmap1->pm_ldt_len * sizeof(union descriptor);
  9.2138 +		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len);
  9.2139 +		memcpy(new_ldt, pmap1->pm_ldt, len);
  9.2140 +		pmap2->pm_ldt = new_ldt;
  9.2141 +		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
  9.2142 +		pmap2->pm_flags |= PMF_USER_LDT;
  9.2143 +		ldt_alloc(pmap2, new_ldt, len);
  9.2144 +	}
  9.2145 +#endif /* USER_LDT */
  9.2146 +
  9.2147 +	simple_unlock(&pmap2->pm_obj.vmobjlock);
  9.2148 +	simple_unlock(&pmap1->pm_obj.vmobjlock);
  9.2149 +}
  9.2150 +#endif /* PMAP_FORK */
  9.2151 +
  9.2152 +#ifdef USER_LDT
  9.2153 +/*
  9.2154 + * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
  9.2155 + * restore the default.
  9.2156 + */
  9.2157 +
  9.2158 +void
  9.2159 +pmap_ldt_cleanup(l)
  9.2160 +	struct lwp *l;
  9.2161 +{
  9.2162 +	struct pcb *pcb = &l->l_addr->u_pcb;
  9.2163 +	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
  9.2164 +	union descriptor *old_ldt = NULL;
  9.2165 +	size_t len = 0;
  9.2166 +
  9.2167 +	simple_lock(&pmap->pm_obj.vmobjlock);
  9.2168 +
  9.2169 +	if (pmap->pm_flags & PMF_USER_LDT) {
  9.2170 +		ldt_free(pmap);
  9.2171 +		pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
  9.2172 +		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
  9.2173 +		if (pcb == curpcb)
  9.2174 +			lldt(pcb->pcb_ldt_sel);
  9.2175 +		old_ldt = pmap->pm_ldt;
  9.2176 +		len = pmap->pm_ldt_len * sizeof(union descriptor);
  9.2177 +		pmap->pm_ldt = NULL;
  9.2178 +		pmap->pm_ldt_len = 0;
  9.2179 +		pmap->pm_flags &= ~PMF_USER_LDT;
  9.2180 +	}
  9.2181 +
  9.2182 +	simple_unlock(&pmap->pm_obj.vmobjlock);
  9.2183 +
  9.2184 +	if (old_ldt != NULL)
  9.2185 +		uvm_km_free(kernel_map, (vaddr_t)old_ldt, len);
  9.2186 +}
  9.2187 +#endif /* USER_LDT */
  9.2188 +
  9.2189 +/*
  9.2190 + * pmap_activate: activate a process' pmap
  9.2191 + *
  9.2192 + * => called from cpu_switch()
  9.2193 + * => if lwp is the curlwp, then set ci_want_pmapload so that
  9.2194 + *    actual MMU context switch will be done by pmap_load() later
  9.2195 + */
  9.2196 +
  9.2197 +void
  9.2198 +pmap_activate(l)
  9.2199 +	struct lwp *l;
  9.2200 +{
  9.2201 +	struct cpu_info *ci = curcpu();
  9.2202 +	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
  9.2203 +
  9.2204 +	if (l == ci->ci_curlwp) {
  9.2205 +		struct pcb *pcb;
  9.2206 +
  9.2207 +		KASSERT(ci->ci_want_pmapload == 0);
  9.2208 +		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
  9.2209 +#ifdef KSTACK_CHECK_DR0
  9.2210 +		/*
  9.2211 +		 * setup breakpoint on the top of stack
  9.2212 +		 */
  9.2213 +		if (l == &lwp0)
  9.2214 +			dr0(0, 0, 0, 0);
  9.2215 +		else
  9.2216 +			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
  9.2217 +#endif
  9.2218 +
  9.2219 +		/*
  9.2220 +		 * no need to switch to kernel vmspace because
  9.2221 +		 * it's a subset of any vmspace.
  9.2222 +		 */
  9.2223 +
  9.2224 +		if (pmap == pmap_kernel()) {
  9.2225 +			ci->ci_want_pmapload = 0;
  9.2226 +			return;
  9.2227 +		}
  9.2228 +
  9.2229 +		pcb = &l->l_addr->u_pcb;
  9.2230 +		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
  9.2231 +
  9.2232 +		ci->ci_want_pmapload = 1;
  9.2233 +	}
  9.2234 +}
  9.2235 +
  9.2236 +/*
  9.2237 + * pmap_reactivate: try to regain reference to the pmap.
  9.2238 + */
  9.2239 +
  9.2240 +static boolean_t
  9.2241 +pmap_reactivate(struct pmap *pmap)
  9.2242 +{
  9.2243 +	struct cpu_info *ci = curcpu();
  9.2244 +	u_int32_t cpumask = 1U << ci->ci_cpuid;
  9.2245 +	int s;
  9.2246 +	boolean_t result;
  9.2247 +	u_int32_t oldcpus;
  9.2248 +
  9.2249 +	/*
  9.2250 +	 * if we still have a lazy reference to this pmap,
  9.2251 +	 * we can assume that there was no tlb shootdown
  9.2252 +	 * for this pmap in the meantime.
  9.2253 +	 */
  9.2254 +
  9.2255 +	s = splipi(); /* protect from tlb shootdown ipis. */
  9.2256 +	oldcpus = pmap->pm_cpus;
  9.2257 +	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
  9.2258 +	if (oldcpus & cpumask) {
  9.2259 +		KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY);
  9.2260 +		/* got it */
  9.2261 +		result = TRUE;
  9.2262 +	} else {
  9.2263 +		KASSERT(ci->ci_tlbstate == TLBSTATE_STALE);
  9.2264 +		result = FALSE;
  9.2265 +	}
  9.2266 +	ci->ci_tlbstate = TLBSTATE_VALID;
  9.2267 +	splx(s);
  9.2268 +
  9.2269 +	return result;
  9.2270 +}
  9.2271 +
  9.2272 +/*
  9.2273 + * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
  9.2274 + */
  9.2275 +
  9.2276 +void
  9.2277 +pmap_load()
  9.2278 +{
  9.2279 +	struct cpu_info *ci = curcpu();
  9.2280 +	u_int32_t cpumask = 1U << ci->ci_cpuid;
  9.2281 +	struct pmap *pmap;
  9.2282 +	struct pmap *oldpmap;
  9.2283 +	struct lwp *l;
  9.2284 +	struct pcb *pcb;
  9.2285 +	pd_entry_t *mapdp;
  9.2286 +	int s;
  9.2287 +
  9.2288 +	KASSERT(ci->ci_want_pmapload);
  9.2289 +
  9.2290 +	l = ci->ci_curlwp;
  9.2291 +	KASSERT(l != NULL);
  9.2292 +	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
  9.2293 +	KASSERT(pmap != pmap_kernel());
  9.2294 +	oldpmap = ci->ci_pmap;
  9.2295 +
  9.2296 +	pcb = ci->ci_curpcb;
  9.2297 +	KASSERT(pcb == &l->l_addr->u_pcb);
  9.2298 +	/* loaded by pmap_activate */
  9.2299 +	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
  9.2300 +
  9.2301 +	if (pmap == oldpmap) {
  9.2302 +		if (!pmap_reactivate(pmap)) {
  9.2303 +
  9.2304 +			/*
   9.2305 +			 * the pmap has been changed while it was deactivated.
  9.2306 +			 * our tlb may be stale.
  9.2307 +			 */
  9.2308 +
  9.2309 +			tlbflush();
  9.2310 +		}
  9.2311 +
  9.2312 +		ci->ci_want_pmapload = 0;
  9.2313 +		return;
  9.2314 +	}
  9.2315 +
  9.2316 +	/*
  9.2317 +	 * actually switch pmap.
  9.2318 +	 */
  9.2319 +
  9.2320 +	x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);
  9.2321 +
  9.2322 +	KASSERT((pmap->pm_cpus & cpumask) == 0);
  9.2323 +
  9.2324 +	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
  9.2325 +	pmap_reference(pmap);
  9.2326 +	KERNEL_UNLOCK();
  9.2327 +
  9.2328 +	/*
  9.2329 +	 * mark the pmap in use by this processor.
  9.2330 +	 */
  9.2331 +
  9.2332 +	s = splipi();
  9.2333 +	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
  9.2334 +	ci->ci_pmap = pmap;
  9.2335 +	ci->ci_tlbstate = TLBSTATE_VALID;
  9.2336 +	splx(s);
  9.2337 +
  9.2338 +	/*
  9.2339 +	 * clear apdp slot before loading %cr3 since Xen only allows
  9.2340 +	 * linear pagetable mappings in the current pagetable.
  9.2341 +	 */
  9.2342 +	KDASSERT(curapdp == 0);
  9.2343 +	mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
  9.2344 +	PDE_CLEAR(APDP_PDE, mapdp);
  9.2345 +
  9.2346 +	/*
  9.2347 +	 * update tss and load corresponding registers.
  9.2348 +	 */
  9.2349 +
  9.2350 +	lldt(pcb->pcb_ldt_sel);
  9.2351 +	pcb->pcb_cr3 = pmap->pm_pdirpa;
  9.2352 +	lcr3(pcb->pcb_cr3);
  9.2353 +
  9.2354 +	ci->ci_want_pmapload = 0;
  9.2355 +
  9.2356 +	KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
  9.2357 +	pmap_destroy(oldpmap);
  9.2358 +	KERNEL_UNLOCK();
  9.2359 +}
  9.2360 +
  9.2361 +/*
  9.2362 + * pmap_deactivate: deactivate a process' pmap
  9.2363 + */
  9.2364 +
  9.2365 +void
  9.2366 +pmap_deactivate(l)
  9.2367 +	struct lwp *l;
  9.2368 +{
  9.2369 +
  9.2370 +	if (l == curlwp)
  9.2371 +		pmap_deactivate2(l);
  9.2372 +}
  9.2373 +
  9.2374 +/*
  9.2375 + * pmap_deactivate2: context switch version of pmap_deactivate.
  9.2376 + * always treat l as curlwp.
  9.2377 + */
  9.2378 +
  9.2379 +void
  9.2380 +pmap_deactivate2(l)
  9.2381 +	struct lwp *l;
  9.2382 +{
  9.2383 +	struct pmap *pmap;
  9.2384 +	struct cpu_info *ci = curcpu();
  9.2385 +
  9.2386 +	if (ci->ci_want_pmapload) {
  9.2387 +		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
  9.2388 +		    != pmap_kernel());
  9.2389 +		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
  9.2390 +		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
  9.2391 +
  9.2392 +		/*
  9.2393 +		 * userspace has not been touched.
  9.2394 +		 * nothing to do here.
  9.2395 +		 */
  9.2396 +
  9.2397 +		ci->ci_want_pmapload = 0;
  9.2398 +		return;
  9.2399 +	}
  9.2400 +
  9.2401 +	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
  9.2402 +
  9.2403 +	if (pmap == pmap_kernel()) {
  9.2404 +		return;
  9.2405 +	}
  9.2406 +
  9.2407 +	KASSERT(ci->ci_pmap == pmap);
  9.2408 +
  9.2409 +	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
  9.2410 +	ci->ci_tlbstate = TLBSTATE_LAZY;
  9.2411 +	XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n",
  9.2412 +		      l, (void *)l->l_addr->u_pcb.pcb_ebp, 
  9.2413 +		      (void *)l->l_addr->u_pcb.pcb_esp));
  9.2414 +}
  9.2415 +
  9.2416 +/*
  9.2417 + * end of lifecycle functions
  9.2418 + */
  9.2419 +
  9.2420 +/*
  9.2421 + * some misc. functions
  9.2422 + */
  9.2423 +
  9.2424 +/*
  9.2425 + * pmap_extract: extract a PA for the given VA
  9.2426 + */
  9.2427 +
  9.2428 +boolean_t
  9.2429 +pmap_extract(pmap, va, pap)
  9.2430 +	struct pmap *pmap;
  9.2431 +	vaddr_t va;
  9.2432 +	paddr_t *pap;
  9.2433 +{
  9.2434 +	pt_entry_t *ptes, pte;
  9.2435 +	pd_entry_t pde;
  9.2436 +
  9.2437 +	if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) {
  9.2438 +#ifdef LARGEPAGES
  9.2439 +		if (pde & PG_PS) {
  9.2440 +			if (pap != NULL)
  9.2441 +				*pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME);
  9.2442 +			return (TRUE);
  9.2443 +		}
  9.2444 +#endif
  9.2445 +
  9.2446 +		ptes = pmap_map_ptes(pmap);
  9.2447 +		pte = PTE_GET(&ptes[x86_btop(va)]);
  9.2448 +		pmap_unmap_ptes(pmap);
  9.2449 +
  9.2450 +		if (__predict_true((pte & PG_V) != 0)) {
  9.2451 +			if (pap != NULL)
  9.2452 +				*pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
  9.2453 +			return (TRUE);
  9.2454 +		}
  9.2455 +	}
  9.2456 +	return (FALSE);
  9.2457 +}
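/*
 * Editor's note -- a hedged, self-contained sketch (not part of the patch)
 * of the two-level lookup pmap_extract() performs: consult the page
 * directory entry first, then the page table entry, and splice the page
 * offset back onto the physical frame.  Constants and names below are
 * illustrative 32-bit/4KB values, not the kernel's.
 */
#include <stdint.h>

#define SK_PAGE_SHIFT	12
#define SK_PAGE_MASK	((1u << SK_PAGE_SHIFT) - 1)
#define SK_PG_V		0x001u
#define SK_PG_FRAME	(~SK_PAGE_MASK)

static int
sketch_extract(const uint32_t *pdir, uint32_t *const *ptbl,
    uint32_t va, uint32_t *pap)
{
	uint32_t pde, pte;

	pde = pdir[va >> 22];			/* top 10 bits index the PD */
	if ((pde & SK_PG_V) == 0)
		return 0;			/* no page table here */
	pte = ptbl[va >> 22][(va >> SK_PAGE_SHIFT) & 0x3ff];
	if ((pte & SK_PG_V) == 0)
		return 0;			/* VA not mapped */
	*pap = (pte & SK_PG_FRAME) | (va & SK_PAGE_MASK);
	return 1;
}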
  9.2458 +
  9.2459 +
  9.2460 +/*
  9.2461 + * vtophys: virtual address to physical address.  For use by
  9.2462 + * machine-dependent code only.
  9.2463 + */
  9.2464 +
  9.2465 +paddr_t
  9.2466 +vtophys(va)
  9.2467 +	vaddr_t va;
  9.2468 +{
  9.2469 +	paddr_t pa;
  9.2470 +
  9.2471 +	if (pmap_extract(pmap_kernel(), va, &pa) == TRUE)
  9.2472 +		return (pa);
  9.2473 +	return (0);
  9.2474 +}
  9.2475 +
  9.2476 +
  9.2477 +/*
  9.2478 + * pmap_virtual_space: used during bootup [pmap_steal_memory] to
   9.2479 + *	determine the bounds of the kernel virtual address space.
  9.2480 + */
  9.2481 +
  9.2482 +void
  9.2483 +pmap_virtual_space(startp, endp)
  9.2484 +	vaddr_t *startp;
  9.2485 +	vaddr_t *endp;
  9.2486 +{
  9.2487 +	*startp = virtual_avail;
  9.2488 +	*endp = virtual_end;
  9.2489 +}
  9.2490 +
  9.2491 +/*
  9.2492 + * pmap_map: map a range of PAs into kvm
  9.2493 + *
  9.2494 + * => used during crash dump
  9.2495 + * => XXX: pmap_map() should be phased out?
  9.2496 + */
  9.2497 +
  9.2498 +vaddr_t
  9.2499 +pmap_map(va, spa, epa, prot)
  9.2500 +	vaddr_t va;
  9.2501 +	paddr_t spa, epa;
  9.2502 +	vm_prot_t prot;
  9.2503 +{
  9.2504 +	while (spa < epa) {
  9.2505 +		pmap_enter(pmap_kernel(), va, spa, prot, 0);
  9.2506 +		va += PAGE_SIZE;
  9.2507 +		spa += PAGE_SIZE;
  9.2508 +	}
  9.2509 +	pmap_update(pmap_kernel());
  9.2510 +	return va;
  9.2511 +}
  9.2512 +
  9.2513 +/*
  9.2514 + * pmap_zero_page: zero a page
  9.2515 + */
  9.2516 +
  9.2517 +void
  9.2518 +pmap_zero_page(pa)
  9.2519 +	paddr_t pa;
  9.2520 +{
  9.2521 +#ifdef MULTIPROCESSOR
  9.2522 +	int id = cpu_number();
  9.2523 +#endif
  9.2524 +	pt_entry_t *zpte = PTESLEW(zero_pte, id);
  9.2525 +	pt_entry_t *maptp;
  9.2526 +	caddr_t zerova = VASLEW(zerop, id);
  9.2527 +
  9.2528 +#ifdef DIAGNOSTIC
  9.2529 +	if (PTE_GET(zpte))
  9.2530 +		panic("pmap_zero_page: lock botch");
  9.2531 +#endif
  9.2532 +
  9.2533 +	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
  9.2534 +	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
  9.2535 +	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
  9.2536 +
  9.2537 +	memset(zerova, 0, PAGE_SIZE);			/* zero */
  9.2538 +	PTE_CLEAR(zpte, maptp);				/* zap! */
  9.2539 +}
  9.2540 +
  9.2541 +/*
   9.2542 + * pmap_pageidlezero: the same, for the idle loop page zero'er.
  9.2543 + * Returns TRUE if the page was zero'd, FALSE if we aborted for
  9.2544 + * some reason.
  9.2545 + */
  9.2546 +
  9.2547 +boolean_t
  9.2548 +pmap_pageidlezero(pa)
  9.2549 +	paddr_t pa;
  9.2550 +{
  9.2551 +#ifdef MULTIPROCESSOR
  9.2552 +	int id = cpu_number();
  9.2553 +#endif
  9.2554 +	pt_entry_t *zpte = PTESLEW(zero_pte, id);
  9.2555 +	pt_entry_t *maptp;
  9.2556 +	caddr_t zerova = VASLEW(zerop, id);
  9.2557 +	boolean_t rv = TRUE;
  9.2558 +	int i, *ptr;
  9.2559 +
  9.2560 +#ifdef DIAGNOSTIC
  9.2561 +	if (PTE_GET(zpte))
   9.2562 +		panic("pmap_pageidlezero: lock botch");
  9.2563 +#endif
  9.2564 +	maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
  9.2565 +	PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);	/* map in */
  9.2566 +	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
  9.2567 +	for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) {
  9.2568 +		if (sched_whichqs != 0) {
  9.2569 +
  9.2570 +			/*
  9.2571 +			 * A process has become ready.  Abort now,
  9.2572 +			 * so we don't keep it waiting while we
  9.2573 +			 * do slow memory access to finish this
  9.2574 +			 * page.
  9.2575 +			 */
  9.2576 +
  9.2577 +			rv = FALSE;
  9.2578 +			break;
  9.2579 +		}
  9.2580 +		*ptr++ = 0;
  9.2581 +	}
  9.2582 +
  9.2583 +	PTE_CLEAR(zpte, maptp);				/* zap! */
  9.2584 +	return (rv);
  9.2585 +}
  9.2586 +
  9.2587 +/*
  9.2588 + * pmap_copy_page: copy a page
  9.2589 + */
  9.2590 +
  9.2591 +void
  9.2592 +pmap_copy_page(srcpa, dstpa)
  9.2593 +	paddr_t srcpa, dstpa;
  9.2594 +{
  9.2595 +#ifdef MULTIPROCESSOR
  9.2596 +	int id = cpu_number();
  9.2597 +#endif
  9.2598 +	pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte;
  9.2599 +	pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte;
  9.2600 +	caddr_t csrcva = VASLEW(csrcp, id);
  9.2601 +	caddr_t cdstva = VASLEW(cdstp, id);
  9.2602 +
  9.2603 +#ifdef DIAGNOSTIC
  9.2604 +	if (PTE_GET(spte) || PTE_GET(dpte))
  9.2605 +		panic("pmap_copy_page: lock botch");
  9.2606 +#endif
  9.2607 +
  9.2608 +	maspte = (pt_entry_t *)vtomach((vaddr_t)spte);
  9.2609 +	madpte = (pt_entry_t *)vtomach((vaddr_t)dpte);
  9.2610 +	PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW);
  9.2611 +	PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW);
  9.2612 +	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
  9.2613 +	memcpy(cdstva, csrcva, PAGE_SIZE);
  9.2614 +	PTE_CLEAR(spte, maspte);			/* zap! */
  9.2615 +	PTE_CLEAR(dpte, madpte);			/* zap! */
  9.2616 +}
  9.2617 +
  9.2618 +/*
  9.2619 + * p m a p   r e m o v e   f u n c t i o n s
  9.2620 + *
  9.2621 + * functions that remove mappings
  9.2622 + */
  9.2623 +
  9.2624 +/*
  9.2625 + * pmap_remove_ptes: remove PTEs from a PTP
  9.2626 + *
  9.2627 + * => must have proper locking on pmap_master_lock
  9.2628 + * => caller must hold pmap's lock
  9.2629 + * => PTP must be mapped into KVA
  9.2630 + * => PTP should be null if pmap == pmap_kernel()
  9.2631 + */
  9.2632 +
  9.2633 +static void
  9.2634 +pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags)
  9.2635 +	struct pmap *pmap;
  9.2636 +	struct vm_page *ptp;
  9.2637 +	vaddr_t ptpva;
  9.2638 +	vaddr_t startva, endva;
  9.2639 +	int32_t *cpumaskp;
  9.2640 +	int flags;
  9.2641 +{
  9.2642 +	struct pv_entry *pv_tofree = NULL;	/* list of pv_entrys to free */
  9.2643 +	struct pv_entry *pve;
  9.2644 +	pt_entry_t *pte = (pt_entry_t *) ptpva;
  9.2645 +	pt_entry_t opte;
  9.2646 +	pt_entry_t *maptp;
  9.2647 +
  9.2648 +	/*
  9.2649 +	 * note that ptpva points to the PTE that maps startva.   this may
  9.2650 +	 * or may not be the first PTE in the PTP.
  9.2651 +	 *
  9.2652 +	 * we loop through the PTP while there are still PTEs to look at
  9.2653 +	 * and the wire_count is greater than 1 (because we use the wire_count
  9.2654 +	 * to keep track of the number of real PTEs in the PTP).
  9.2655 +	 */
  9.2656 +
  9.2657 +	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
  9.2658 +			     ; pte++, startva += PAGE_SIZE) {
  9.2659 +		struct vm_page *pg;
  9.2660 +		struct vm_page_md *mdpg;
  9.2661 +
  9.2662 +		if (!pmap_valid_entry(*pte))
  9.2663 +			continue;			/* VA not mapped */
  9.2664 +		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
  9.2665 +			continue;
  9.2666 +		}
  9.2667 +
  9.2668 +		/* atomically save the old PTE and zap! it */
  9.2669 +		maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
  9.2670 +		opte = pte_atomic_update(pte, maptp, 0);
  9.2671 +		pmap_exec_account(pmap, startva, opte, 0);
  9.2672 +
  9.2673 +		if (opte & PG_W)
  9.2674 +			pmap->pm_stats.wired_count--;
  9.2675 +		pmap->pm_stats.resident_count--;
  9.2676 +
  9.2677 +		if (opte & PG_U)
  9.2678 +			pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
  9.2679 +
  9.2680 +		if (ptp) {
  9.2681 +			ptp->wire_count--;		/* dropping a PTE */
  9.2682 +			/* Make sure that the PDE is flushed */
  9.2683 +			if ((ptp->wire_count <= 1) && !(opte & PG_U))
  9.2684 +				pmap_tlb_shootdown(pmap, startva, opte,
  9.2685 +				    cpumaskp);
  9.2686 +		}
  9.2687 +
  9.2688 +		/*
  9.2689 +		 * if we are not on a pv_head list we are done.
  9.2690 +		 */
  9.2691 +
  9.2692 +		if ((opte & PG_PVLIST) == 0) {
  9.2693 +#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
  9.2694 +			if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
  9.2695 +				panic("pmap_remove_ptes: managed page without "
  9.2696 +				      "PG_PVLIST for 0x%lx", startva);
  9.2697 +#endif
  9.2698 +			continue;
  9.2699 +		}
  9.2700 +
  9.2701 +		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
  9.2702 +#ifdef DIAGNOSTIC
  9.2703 +		if (pg == NULL)
  9.2704 +			panic("pmap_remove_ptes: unmanaged page marked "
  9.2705 +			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
  9.2706 +			      startva, (u_long)(opte & PG_FRAME));
  9.2707 +#endif
  9.2708 +		mdpg = &pg->mdpage;
  9.2709 +
  9.2710 +		/* sync R/M bits */
  9.2711 +		simple_lock(&mdpg->mp_pvhead.pvh_lock);
  9.2712 +		mdpg->mp_attrs |= (opte & (PG_U|PG_M));
  9.2713 +		pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
  9.2714 +		simple_unlock(&mdpg->mp_pvhead.pvh_lock);
  9.2715 +
  9.2716 +		if (pve) {
  9.2717 +			SPLAY_RIGHT(pve, pv_node) = pv_tofree;
  9.2718 +			pv_tofree = pve;
  9.2719 +		}
  9.2720 +
  9.2721 +		/* end of "for" loop: time for next pte */
  9.2722 +	}
  9.2723 +	if (pv_tofree)
  9.2724 +		pmap_free_pvs(pmap, pv_tofree);
  9.2725 +}
  9.2726 +
  9.2727 +
  9.2728 +/*
  9.2729 + * pmap_remove_pte: remove a single PTE from a PTP
  9.2730 + *
  9.2731 + * => must have proper locking on pmap_master_lock
  9.2732 + * => caller must hold pmap's lock
  9.2733 + * => PTP must be mapped into KVA
  9.2734 + * => PTP should be null if pmap == pmap_kernel()
  9.2735 + * => returns true if we removed a mapping
  9.2736 + */
  9.2737 +
  9.2738 +static boolean_t
  9.2739 +pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags)
  9.2740 +	struct pmap *pmap;
  9.2741 +	struct vm_page *ptp;
  9.2742 +	pt_entry_t *pte;
  9.2743 +	vaddr_t va;
  9.2744 +	int32_t *cpumaskp;
  9.2745 +	int flags;
  9.2746 +{
  9.2747 +	pt_entry_t opte;
  9.2748 +	pt_entry_t *maptp;
  9.2749 +	struct pv_entry *pve;
  9.2750 +	struct vm_page *pg;
  9.2751 +	struct vm_page_md *mdpg;
  9.2752 +
  9.2753 +	if (!pmap_valid_entry(*pte))
  9.2754 +		return(FALSE);		/* VA not mapped */
  9.2755 +	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
  9.2756 +		return(FALSE);
  9.2757 +	}
  9.2758 +
  9.2759 +	/* atomically save the old PTE and zap! it */
  9.2760 +	maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
  9.2761 +	opte = pte_atomic_update(pte, maptp, 0);
  9.2762 +
  9.2763 +	XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte));
  9.2764 +	pmap_exec_account(pmap, va, opte, 0);
  9.2765 +
  9.2766 +	if (opte & PG_W)
  9.2767 +		pmap->pm_stats.wired_count--;
  9.2768 +	pmap->pm_stats.resident_count--;
  9.2769 +
  9.2770 +	if (opte & PG_U)
  9.2771 +		pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
  9.2772 +
  9.2773 +	if (ptp) {
  9.2774 +		ptp->wire_count--;		/* dropping a PTE */
  9.2775 +		/* Make sure that the PDE is flushed */
  9.2776 +		if ((ptp->wire_count <= 1) && !(opte & PG_U))
  9.2777 +			pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
  9.2778 +
  9.2779 +	}
  9.2780 +	/*
  9.2781 +	 * if we are not on a pv_head list we are done.
  9.2782 +	 */
  9.2783 +
  9.2784 +	if ((opte & PG_PVLIST) == 0) {
  9.2785 +#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
  9.2786 +		if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
  9.2787 +			panic("pmap_remove_pte: managed page without "
  9.2788 +			      "PG_PVLIST for 0x%lx", va);
  9.2789 +#endif
  9.2790 +		return(TRUE);
  9.2791 +	}
  9.2792 +
  9.2793 +	pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
  9.2794 +#ifdef DIAGNOSTIC
  9.2795 +	if (pg == NULL)
  9.2796 +		panic("pmap_remove_pte: unmanaged page marked "
  9.2797 +		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
  9.2798 +		    (u_long)(opte & PG_FRAME));
  9.2799 +#endif
  9.2800 +	mdpg = &pg->mdpage;
  9.2801 +
  9.2802 +	/* sync R/M bits */
  9.2803 +	simple_lock(&mdpg->mp_pvhead.pvh_lock);
  9.2804 +	mdpg->mp_attrs |= (opte & (PG_U|PG_M));
  9.2805 +	pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va);
  9.2806 +	simple_unlock(&mdpg->mp_pvhead.pvh_lock);
  9.2807 +
  9.2808 +	if (pve)
  9.2809 +		pmap_free_pv(pmap, pve);
  9.2810 +	return(TRUE);
  9.2811 +}
  9.2812 +
  9.2813 +/*
  9.2814 + * pmap_remove: top level mapping removal function
  9.2815 + *
  9.2816 + * => caller should not be holding any pmap locks
  9.2817 + */
  9.2818 +
  9.2819 +void
  9.2820 +pmap_remove(pmap, sva, eva)
  9.2821 +	struct pmap *pmap;
  9.2822 +	vaddr_t sva, eva;
  9.2823 +{
  9.2824 +	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
  9.2825 +}
  9.2826 +
  9.2827 +/*
  9.2828 + * pmap_do_remove: mapping removal guts
  9.2829 + *
  9.2830 + * => caller should not be holding any pmap locks
  9.2831 + */
  9.2832 +
  9.2833 +static void
  9.2834 +pmap_do_remove(pmap, sva, eva, flags)
  9.2835 +	struct pmap *pmap;
  9.2836 +	vaddr_t sva, eva;
  9.2837 +	int flags;
  9.2838 +{
  9.2839 +	pt_entry_t *ptes, opte;
  9.2840 +	pt_entry_t *maptp;
  9.2841 +	boolean_t result;
  9.2842 +	paddr_t ptppa;
  9.2843 +	vaddr_t blkendva;
  9.2844 +	struct vm_page *ptp;
  9.2845 +	int32_t cpumask = 0;
  9.2846 +	TAILQ_HEAD(, vm_page) empty_ptps;
  9.2847 +	struct cpu_info *ci;
  9.2848 +	struct pmap *curpmap;
  9.2849 +
  9.2850 +	/*
  9.2851 +	 * we lock in the pmap => pv_head direction
  9.2852 +	 */
  9.2853 +
  9.2854 +	TAILQ_INIT(&empty_ptps);
  9.2855 +
  9.2856 +	PMAP_MAP_TO_HEAD_LOCK();
  9.2857 +
  9.2858 +	ptes = pmap_map_ptes(pmap);	/* locks pmap */
  9.2859 +
  9.2860 +	ci = curcpu();
  9.2861 +	curpmap = ci->ci_pmap;
  9.2862 +
  9.2863 +	/*
  9.2864 +	 * removing one page?  take shortcut function.
  9.2865 +	 */
  9.2866 +
  9.2867 +	if (sva + PAGE_SIZE == eva) {
  9.2868 +		if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) {
  9.2869 +
  9.2870 +			/* PA of the PTP */
  9.2871 +			ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME;
  9.2872 +
  9.2873 +			/* get PTP if non-kernel mapping */
  9.2874 +			if (pmap == pmap_kernel()) {
  9.2875 +				/* we never free kernel PTPs */
  9.2876 +				ptp = NULL;
  9.2877 +			} else {
  9.2878 +				if (pmap->pm_ptphint &&
  9.2879 +				    VM_PAGE_TO_PHYS(pmap->pm_ptphint) ==
  9.2880 +				    ptppa) {
  9.2881 +					ptp = pmap->pm_ptphint;
  9.2882 +				} else {
  9.2883 +					ptp = PHYS_TO_VM_PAGE(ptppa);
  9.2884 +#ifdef DIAGNOSTIC
  9.2885 +					if (ptp == NULL)
  9.2886 +						panic("pmap_remove: unmanaged "
  9.2887 +						      "PTP detected");
  9.2888 +#endif
  9.2889 +				}
  9.2890 +			}
  9.2891 +
  9.2892 +			/* do it! */
  9.2893 +			result = pmap_remove_pte(pmap, ptp,
  9.2894 +			    &ptes[x86_btop(sva)], sva, &cpumask, flags);
  9.2895 +
  9.2896 +			/*
  9.2897 +			 * if mapping removed and the PTP is no longer
  9.2898 +			 * being used, free it!
  9.2899 +			 */
  9.2900 +
  9.2901 +			if (result && ptp && ptp->wire_count <= 1) {
  9.2902 +				/* zap! */
  9.2903 +				maptp = (pt_entry_t *)vtomach(
  9.2904 +					(vaddr_t)&pmap->pm_pdir[pdei(sva)]);
  9.2905 +				PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
  9.2906 +				    maptp, opte);
  9.2907 +#if defined(MULTIPROCESSOR)
  9.2908 +				/*
  9.2909 +				 * XXXthorpej Redundant shootdown can happen
  9.2910 +				 * here if we're using APTE space.
  9.2911 +				 */
  9.2912 +#endif
  9.2913 +				pmap_tlb_shootdown(curpmap,
  9.2914 +				    ((vaddr_t)ptes) + ptp->offset, opte,
  9.2915 +				    &cpumask);
  9.2916 +#if defined(MULTIPROCESSOR)
  9.2917 +				/*
  9.2918 +				 * Always shoot down the pmap's self-mapping
  9.2919 +				 * of the PTP.
  9.2920 +				 * XXXthorpej Redundant shootdown can happen
  9.2921 +				 * here if pmap == curpmap (not APTE space).
  9.2922 +				 */
  9.2923 +				pmap_tlb_shootdown(pmap,
  9.2924 +				    ((vaddr_t)PTE_BASE) + ptp->offset, opte,
  9.2925 +				    &cpumask);
  9.2926 +#endif
  9.2927 +				pmap->pm_stats.resident_count--;
  9.2928 +				if (pmap->pm_ptphint == ptp)
  9.2929 +					pmap->pm_ptphint =
  9.2930 +					    TAILQ_FIRST(&pmap->pm_obj.memq);
  9.2931 +				ptp->wire_count = 0;
  9.2932 +				ptp->flags |= PG_ZERO;
  9.2933 +				uvm_pagerealloc(ptp, NULL, 0);
  9.2934 +				TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
  9.2935 +			}
  9.2936 +		}
  9.2937 +		pmap_tlb_shootnow(cpumask);
  9.2938 +		pmap_unmap_ptes(pmap);		/* unlock pmap */
  9.2939 +		PMAP_MAP_TO_HEAD_UNLOCK();
  9.2940 +		/* Now we can free unused ptps */
  9.2941 +		TAILQ_FOREACH(ptp, &empty_ptps, listq)
  9.2942 +			uvm_pagefree(ptp);
  9.2943 +		return;
  9.2944 +	}
  9.2945 +
  9.2946 +	cpumask = 0;
  9.2947 +
  9.2948 +	for (/* null */ ; sva < eva ; sva = blkendva) {
  9.2949 +
  9.2950 +		/* determine range of block */
  9.2951 +		blkendva = x86_round_pdr(sva+1);
  9.2952 +		if (blkendva > eva)
  9.2953 +			blkendva = eva;
  9.2954 +
  9.2955 +		/*
  9.2956 +		 * XXXCDC: our PTE mappings should never be removed
  9.2957 +		 * with pmap_remove!  if we allow this (and why would
  9.2958 +		 * we?) then we end up freeing the pmap's page
  9.2959 +		 * directory page (PDP) before we are finished using
   9.2960 +		 * it when we hit it in the recursive mapping.  this
  9.2961 +		 * is BAD.
  9.2962 +		 *
   9.2963 +		 * the long term solution is to move the PTEs out of user
   9.2964 +		 * address space and into kernel address space (up
  9.2965 +		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
  9.2966 +		 * be VM_MAX_ADDRESS.
  9.2967 +		 */
  9.2968 +
  9.2969 +		if (pdei(sva) == PDSLOT_PTE)
  9.2970 +			/* XXXCDC: ugly hack to avoid freeing PDP here */
  9.2971 +			continue;
  9.2972 +
  9.2973 +		if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
  9.2974 +			/* valid block? */
  9.2975 +			continue;
  9.2976 +
  9.2977 +		/* PA of the PTP */
  9.2978 +		ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME);
  9.2979 +
  9.2980 +		/* get PTP if non-kernel mapping */
  9.2981 +		if (pmap == pmap_kernel()) {
  9.2982 +			/* we never free kernel PTPs */
  9.2983 +			ptp = NULL;
  9.2984 +		} else {
  9.2985 +			if (pmap->pm_ptphint &&
  9.2986 +			    VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
  9.2987 +				ptp = pmap->pm_ptphint;
  9.2988 +			} else {
  9.2989 +				ptp = PHYS_TO_VM_PAGE(ptppa);
  9.2990 +#ifdef DIAGNOSTIC
  9.2991 +				if (ptp == NULL)
  9.2992 +					panic("pmap_remove: unmanaged PTP "
  9.2993 +					      "detected");
  9.2994 +#endif
  9.2995 +			}
  9.2996 +		}
  9.2997 +		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)],
  9.2998 +		    sva, blkendva, &cpumask, flags);
  9.2999 +
  9.3000 +		/* if PTP is no longer being used, free it! */
  9.3001 +		if (ptp && ptp->wire_count <= 1) {
  9.3002 +			/* zap! */
  9.3003 +			maptp = (pt_entry_t *)vtomach(
  9.3004 +				(vaddr_t)&pmap->pm_pdir[pdei(sva)]);
  9.3005 +			PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
  9.3006 +			    maptp, opte);
  9.3007 +#if defined(MULTIPROCESSOR)
  9.3008 +			/*
  9.3009 +			 * XXXthorpej Redundant shootdown can happen here
  9.3010 +			 * if we're using APTE space.
  9.3011 +			 */
  9.3012 +#endif
  9.3013 +			pmap_tlb_shootdown(curpmap,
  9.3014 +			    ((vaddr_t)ptes) + ptp->offset, opte, &cpumask);
  9.3015 +#if defined(MULTIPROCESSOR)
  9.3016 +			/*
  9.3017 +			 * Always shoot down the pmap's self-mapping
  9.3018 +			 * of the PTP.
  9.3019 +			 * XXXthorpej Redundant shootdown can happen here
  9.3020 +			 * if pmap == curpmap (not APTE space).
  9.3021 +			 */
  9.3022 +			pmap_tlb_shootdown(pmap,
  9.3023 +			    ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask);
  9.3024 +#endif
  9.3025 +			pmap->pm_stats.resident_count--;
  9.3026 +			if (pmap->pm_ptphint == ptp)	/* update hint? */
  9.3027 +				pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first;
  9.3028 +			ptp->wire_count = 0;
  9.3029 +			ptp->flags |= PG_ZERO;
  9.3030 +			/* Postpone free to shootdown */
  9.3031 +			uvm_pagerealloc(ptp, NULL, 0);
  9.3032 +			TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
  9.3033 +		}
  9.3034 +	}
  9.3035 +
  9.3036 +	pmap_tlb_shootnow(cpumask);
  9.3037 +	pmap_unmap_ptes(pmap);
  9.3038 +	PMAP_MAP_TO_HEAD_UNLOCK();
  9.3039 +	/* Now we can free unused ptps */
  9.3040 +	TAILQ_FOREACH(ptp, &empty_ptps, listq)
  9.3041 +		uvm_pagefree(ptp);
  9.3042 +}
  9.3043 +
  9.3044 +/*
  9.3045 + * pmap_page_remove: remove a managed vm_page from all pmaps that map it
  9.3046 + *
  9.3047 + * => we set pv_head => pmap locking
  9.3048 + * => R/M bits are sync'd back to attrs
  9.3049 + */
  9.3050 +
  9.3051 +void
  9.3052 +pmap_page_remove(pg)
  9.3053 +	struct vm_page *pg;
  9.3054 +{
  9.3055 +	struct pv_head *pvh;
  9.3056 +	struct pv_entry *pve, *npve, *killlist = NULL;
  9.3057 +	pt_entry_t *ptes, opte;
  9.3058 +	pt_entry_t *maptp;
  9.3059 +	int32_t cpumask = 0;
  9.3060 +	TAILQ_HEAD(, vm_page) empty_ptps;
  9.3061 +	struct vm_page *ptp;
  9.3062 +	struct cpu_info *ci;
  9.3063 +	struct pmap *curpmap;
  9.3064 +
  9.3065 +#ifdef DIAGNOSTIC
  9.3066 +	int bank, off;
  9.3067 +
  9.3068 +	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
  9.3069 +	if (bank == -1)
  9.3070 +		panic("pmap_page_remove: unmanaged page?");
  9.3071 +#endif
  9.3072 +
  9.3073 +	pvh = &pg->mdpage.mp_pvhead;
  9.3074 +	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
  9.3075 +		return;
  9.3076 +	}
  9.3077 +
  9.3078 +	TAILQ_INIT(&empty_ptps);
  9.3079 +
  9.3080 +	/* set pv_head => pmap locking */
  9.3081 +	PMAP_HEAD_TO_MAP_LOCK();
  9.3082 +
  9.3083 +	ci = curcpu();
  9.3084 +	curpmap = ci->ci_pmap;
  9.3085 +
  9.3086 +	/* XXX: needed if we hold head->map lock? */
  9.3087 +	simple_lock(&pvh->pvh_lock);
  9.3088 +
  9.3089 +	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
  9.3090 +		npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
  9.3091 +		ptes = pmap_map_ptes(pve->pv_pmap);		/* locks pmap */
  9.3092 +
  9.3093 +#ifdef DIAGNOSTIC
  9.3094 +		if (pve->pv_ptp &&
  9.3095 +		    (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) &
  9.3096 +			PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
  9.3097 +			printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
  9.3098 +			    pg, pve->pv_va, pve->pv_ptp);
  9.3099 +			printf("pmap_page_remove: PTP's phys addr: "
  9.3100 +			    "actual=%lx, recorded=%lx\n",
  9.3101 +			    (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])
  9.3102 +				& PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
  9.3103 +			panic("pmap_page_remove: mapped managed page has "
  9.3104 +			    "invalid pv_ptp field");
  9.3105 +		}
  9.3106 +#endif
  9.3107 +
  9.3108 +		/* atomically save the old PTE and zap! it */
  9.3109 +		maptp = (pt_entry_t *)vtomach(
  9.3110 +			(vaddr_t)&ptes[x86_btop(pve->pv_va)]);
  9.3111 +		opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)],
  9.3112 +		    maptp, 0);
  9.3113 +
  9.3114 +		if (opte & PG_W)
  9.3115 +			pve->pv_pmap->pm_stats.wired_count--;
  9.3116 +		pve->pv_pmap->pm_stats.resident_count--;
  9.3117 +
  9.3118 +		/* Shootdown only if referenced */
  9.3119 +		if (opte & PG_U)
  9.3120 +			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
  9.3121 +			    &cpumask);
  9.3122 +
  9.3123 +		/* sync R/M bits */
  9.3124 +		pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M));
  9.3125 +
  9.3126 +		/* update the PTP reference count.  free if last reference. */
  9.3127 +		if (pve->pv_ptp) {
  9.3128 +			pve->pv_ptp->wire_count--;
  9.3129 +			if (pve->pv_ptp->wire_count <= 1) {
  9.3130 +				/*
   9.3131 +				 * Do we have to shoot down the page just to
   9.3132 +				 * get the pte out of the TLB?
  9.3133 +				 */
  9.3134 +				if(!(opte & PG_U))
  9.3135 +					pmap_tlb_shootdown(pve->pv_pmap,
  9.3136 +					    pve->pv_va, opte, &cpumask);
  9.3137 +
  9.3138 +				/* zap! */
  9.3139 +				maptp = (pt_entry_t *)vtomach((vaddr_t)
  9.3140 +				    &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]);
  9.3141 +				PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir
  9.3142 +				    [pdei(pve->pv_va)], maptp, opte);
  9.3143 +				pmap_tlb_shootdown(curpmap,
  9.3144 +				    ((vaddr_t)ptes) + pve->pv_ptp->offset,
  9.3145 +				    opte, &cpumask);
  9.3146 +#if defined(MULTIPROCESSOR)
  9.3147 +				/*
  9.3148 +				 * Always shoot down the other pmap's
  9.3149 +				 * self-mapping of the PTP.
  9.3150 +				 */
  9.3151 +				pmap_tlb_shootdown(pve->pv_pmap,
  9.3152 +				    ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset,
  9.3153 +				    opte, &cpumask);
  9.3154 +#endif
  9.3155 +				pve->pv_pmap->pm_stats.resident_count--;
  9.3156 +				/* update hint? */
  9.3157 +				if (pve->pv_pmap->pm_ptphint == pve->pv_ptp)
  9.3158 +					pve->pv_pmap->pm_ptphint =
  9.3159 +					    pve->pv_pmap->pm_obj.memq.tqh_first;
  9.3160 +				pve->pv_ptp->wire_count = 0;
  9.3161 +				pve->pv_ptp->flags |= PG_ZERO;
  9.3162 +				/* Free only after the shootdown */
  9.3163 +				uvm_pagerealloc(pve->pv_ptp, NULL, 0);
  9.3164 +				TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp,
  9.3165 +				    listq);
  9.3166 +			}
  9.3167 +		}
  9.3168 +		pmap_unmap_ptes(pve->pv_pmap);		/* unlocks pmap */
  9.3169 +		SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
  9.3170 +		SPLAY_RIGHT(pve, pv_node) = killlist;	/* mark it for death */
  9.3171 +		killlist = pve;
  9.3172 +	}
  9.3173 +	pmap_free_pvs(NULL, killlist);
  9.3174 +	simple_unlock(&pvh->pvh_lock);
  9.3175 +	PMAP_HEAD_TO_MAP_UNLOCK();
  9.3176 +	pmap_tlb_shootnow(cpumask);
  9.3177 +
  9.3178 +	/* Now we can free unused ptps */
  9.3179 +	TAILQ_FOREACH(ptp, &empty_ptps, listq)
  9.3180 +		uvm_pagefree(ptp);
  9.3181 +}
  9.3182 +
  9.3183 +/*
  9.3184 + * p m a p   a t t r i b u t e  f u n c t i o n s
  9.3185 + * functions that test/change managed page's attributes
  9.3186 + * since a page can be mapped multiple times we must check each PTE that
  9.3187 + * maps it by going down the pv lists.
  9.3188 + */
  9.3189 +
  9.3190 +/*
  9.3191 + * pmap_test_attrs: test a page's attributes
  9.3192 + *
  9.3193 + * => we set pv_head => pmap locking
  9.3194 + */
  9.3195 +
  9.3196 +boolean_t
  9.3197 +pmap_test_attrs(pg, testbits)
  9.3198 +	struct vm_page *pg;
  9.3199 +	int testbits;
  9.3200 +{
  9.3201 +	struct vm_page_md *mdpg;
  9.3202 +	int *myattrs;
  9.3203 +	struct pv_head *pvh;
  9.3204 +	struct pv_entry *pve;
  9.3205 +	volatile pt_entry_t *ptes;
  9.3206 +	pt_entry_t pte;
  9.3207 +
   9.3208 +#ifdef DIAGNOSTIC
  9.3209 +	int bank, off;
  9.3210 +
  9.3211 +	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
  9.3212 +	if (bank == -1)
  9.3213 +		panic("pmap_test_attrs: unmanaged page?");
  9.3214 +#endif
  9.3215 +	mdpg = &pg->mdpage;
  9.3216 +
  9.3217 +	/*
  9.3218 +	 * before locking: see if attributes are already set and if so,
  9.3219 +	 * return!
  9.3220 +	 */
  9.3221 +
  9.3222 +	myattrs = &mdpg->mp_attrs;
  9.3223 +	if (*myattrs & testbits)
  9.3224 +		return(TRUE);
  9.3225 +
  9.3226 +	/* test to see if there is a list before bothering to lock */
  9.3227 +	pvh = &mdpg->mp_pvhead;
  9.3228 +	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
  9.3229 +		return(FALSE);
  9.3230 +	}
  9.3231 +
  9.3232 +	/* nope, gonna have to do it the hard way */
  9.3233 +	PMAP_HEAD_TO_MAP_LOCK();
  9.3234 +	/* XXX: needed if we hold head->map lock? */
  9.3235 +	simple_lock(&pvh->pvh_lock);
  9.3236 +
  9.3237 +	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
  9.3238 +	     pve != NULL && (*myattrs & testbits) == 0;
  9.3239 +	     pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
  9.3240 +		ptes = pmap_map_ptes(pve->pv_pmap);
  9.3241 +		pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */
  9.3242 +		pmap_unmap_ptes(pve->pv_pmap);
  9.3243 +		*myattrs |= pte;
  9.3244 +	}
  9.3245 +
  9.3246 +	/*
  9.3247 +	 * note that we will exit the for loop with a non-null pve if
  9.3248 +	 * we have found the bits we are testing for.
  9.3249 +	 */
  9.3250 +
  9.3251 +	simple_unlock(&pvh->pvh_lock);
  9.3252 +	PMAP_HEAD_TO_MAP_UNLOCK();
  9.3253 +	return((*myattrs & testbits) != 0);
  9.3254 +}
  9.3255 +
  9.3256 +/*
  9.3257 + * pmap_clear_attrs: clear the specified attribute for a page.
  9.3258 + *
  9.3259 + * => we set pv_head => pmap locking
  9.3260 + * => we return TRUE if we cleared one of the bits we were asked to
  9.3261 + */
  9.3262 +
  9.3263 +boolean_t
  9.3264 +pmap_clear_attrs(pg, clearbits)
  9.3265 +	struct vm_page *pg;
  9.3266 +	int clearbits;
  9.3267 +{
  9.3268 +	struct vm_page_md *mdpg;
  9.3269 +	u_int32_t result;
  9.3270 +	struct pv_head *pvh;
  9.3271 +	struct pv_entry *pve;
  9.3272 +	pt_entry_t *ptes, opte;
  9.3273 +	pt_entry_t *maptp;
  9.3274 +	int *myattrs;
  9.3275 +	int32_t cpumask = 0;
  9.3276 +
  9.3277 +#ifdef DIAGNOSTIC
  9.3278 +	int bank, off;
  9.3279 +
  9.3280 +	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
  9.3281 +	if (bank == -1)
  9.3282 +		panic("pmap_clear_attrs: unmanaged page?");
  9.3283 +#endif
  9.3284 +	mdpg = &pg->mdpage;
  9.3285 +
  9.3286 +	PMAP_HEAD_TO_MAP_LOCK();
  9.3287 +	pvh = &mdpg->mp_pvhead;
  9.3288 +	/* XXX: needed if we hold head->map lock? */
  9.3289 +	simple_lock(&pvh->pvh_lock);
  9.3290 +
  9.3291 +	myattrs = &mdpg->mp_attrs;
  9.3292 +	result = *myattrs & clearbits;
  9.3293 +	*myattrs &= ~clearbits;
  9.3294 +
  9.3295 +	SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
  9.3296 +#ifdef DIAGNOSTIC
  9.3297 +		if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]))
  9.3298 +			panic("pmap_clear_attrs: mapping without PTP "
  9.3299 +			      "detected");
  9.3300 +#endif
  9.3301 +
  9.3302 +		ptes = pmap_map_ptes(pve->pv_pmap);	/* locks pmap */
  9.3303 +		opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
  9.3304 +		if (opte & clearbits) {
  9.3305 +			/* We need to do something */
  9.3306 +			if (clearbits == PG_RW) {
  9.3307 +				result |= PG_RW;
  9.3308 +
  9.3309 +				/*
  9.3310 +				 * On write protect we might not need to flush 
  9.3311 +				 * the TLB
  9.3312 +				 */
  9.3313 +
  9.3314 +				/* First zap the RW bit! */
  9.3315 +				maptp = (pt_entry_t *)vtomach(
  9.3316 +					(vaddr_t)&ptes[x86_btop(pve->pv_va)]);
  9.3317 +				PTE_ATOMIC_CLEARBITS(
  9.3318 +					&ptes[x86_btop(pve->pv_va)],
  9.3319 +					maptp, PG_RW);
  9.3320 +				opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
  9.3321 +
  9.3322 +				/*
  9.3323 +				 * Then check whether it may still be cached as RW in the TLB
  9.3324 +				 */
  9.3325 +				if (!(opte & PG_M))
  9.3326 +					goto no_tlb_shootdown;
  9.3327 +			}
  9.3328 +
  9.3329 +			/*
  9.3330 +			 * Since we need a shootdown anyway we might as well
  9.3331 +			 * always clear PG_U and PG_M.
  9.3332 +			 */
  9.3333 +
  9.3334 +			/* zap! */
  9.3335 +			maptp = (pt_entry_t *)vtomach(
  9.3336 +				(vaddr_t)&ptes[x86_btop(pve->pv_va)]);
  9.3337 +			PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp,
  9.3338 +			    (opte & ~(PG_U | PG_M)), opte);
  9.3339 +
  9.3340 +			result |= (opte & clearbits);
  9.3341 +			*myattrs |= (opte & ~(clearbits));
  9.3342 +
  9.3343 +			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
  9.3344 +					   &cpumask);
  9.3345 +		}
  9.3346 +no_tlb_shootdown:
  9.3347 +		pmap_unmap_ptes(pve->pv_pmap);		/* unlocks pmap */
  9.3348 +	}
  9.3349 +
  9.3350 +	simple_unlock(&pvh->pvh_lock);
  9.3351 +	PMAP_HEAD_TO_MAP_UNLOCK();
  9.3352 +
  9.3353 +	pmap_tlb_shootnow(cpumask);
  9.3354 +	return(result != 0);
  9.3355 +}
  9.3356 +
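
For reference, the machine-independent referenced/modified interface is normally layered directly on the two routines above; a minimal sketch, assuming the usual NetBSD pmap.h macro names and that PG_U/PG_M carry the referenced/modified attributes:

	/* sketch: MI wrappers expected to sit on top of test/clear attrs */
	#define pmap_is_referenced(pg)		pmap_test_attrs((pg), PG_U)
	#define pmap_is_modified(pg)		pmap_test_attrs((pg), PG_M)
	#define pmap_clear_reference(pg)	pmap_clear_attrs((pg), PG_U)
	#define pmap_clear_modify(pg)		pmap_clear_attrs((pg), PG_M)
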
  9.3357 +
  9.3358 +/*
  9.3359 + * p m a p   p r o t e c t i o n   f u n c t i o n s
  9.3360 + */
  9.3361 +
  9.3362 +/*
  9.3363 + * pmap_page_protect: change the protection of all recorded mappings
  9.3364 + *	of a managed page
  9.3365 + *
  9.3366 + * => NOTE: this is an inline function in pmap.h
  9.3367 + */
  9.3368 +
  9.3369 +/* see pmap.h */
  9.3370 +
  9.3371 +/*
  9.3372 + * pmap_protect: set the protection of a range of pages in a pmap
  9.3373 + *
  9.3374 + * => NOTE: this is an inline function in pmap.h
  9.3375 + */
  9.3376 +
  9.3377 +/* see pmap.h */
  9.3378 +
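
Roughly, both wrappers reduce a protection change to either a write-protect or a removal; a hedged sketch of the expected shape of the pmap.h inline for pmap_protect (pmap_page_protect presumably dispatches the same way, to pmap_clear_attrs(pg, PG_RW) or pmap_page_remove()):

	/* sketch: how pmap_protect() is expected to dispatch in pmap.h */
	__inline static void
	pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
	{
		if ((prot & VM_PROT_WRITE) == 0) {
			if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
				pmap_write_protect(pmap, sva, eva, prot);
			else
				pmap_remove(pmap, sva, eva);
		}
	}
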
  9.3379 +/*
  9.3380 + * pmap_write_protect: write-protect pages in a pmap
  9.3381 + */
  9.3382 +
  9.3383 +void
  9.3384 +pmap_write_protect(pmap, sva, eva, prot)
  9.3385 +	struct pmap *pmap;
  9.3386 +	vaddr_t sva, eva;
  9.3387 +	vm_prot_t prot;
  9.3388 +{
  9.3389 +	pt_entry_t *ptes, *epte;
  9.3390 +	pt_entry_t *maptp;
  9.3391 +#ifndef XEN
  9.3392 +	volatile
  9.3393 +#endif
  9.3394 +		pt_entry_t *spte;
  9.3395 +	vaddr_t blockend;
  9.3396 +	int32_t cpumask = 0;
  9.3397 +
  9.3398 +	ptes = pmap_map_ptes(pmap);		/* locks pmap */
  9.3399 +
  9.3400 +	/* should be ok, but just in case ... */
  9.3401 +	sva &= PG_FRAME;
  9.3402 +	eva &= PG_FRAME;
  9.3403 +
  9.3404 +	for (/* null */ ; sva < eva ; sva = blockend) {
  9.3405 +
  9.3406 +		blockend = (sva & PD_MASK) + NBPD;
  9.3407 +		if (blockend > eva)
  9.3408 +			blockend = eva;
  9.3409 +
  9.3410 +		/*
  9.3411 +		 * XXXCDC: our PTE mappings should never be write-protected!
  9.3412 +		 *
  9.3413 +		 * the long term solution is to move the PTEs out of user
  9.3414 +		 * address space and into kernel address space (up
  9.3415 +		 * with APTE); then we can set VM_MAXUSER_ADDRESS to
  9.3416 +		 * be VM_MAX_ADDRESS.
  9.3417 +		 */
  9.3418 +
  9.3419 +		/* XXXCDC: ugly hack to avoid freeing PDP here */
  9.3420 +		if (pdei(sva) == PDSLOT_PTE)
  9.3421 +			continue;
  9.3422 +
  9.3423 +		/* empty block? */
  9.3424 +		if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
  9.3425 +			continue;
  9.3426 +
  9.3427 +#ifdef DIAGNOSTIC
  9.3428 +		if (sva >= VM_MAXUSER_ADDRESS &&
  9.3429 +		    sva < VM_MAX_ADDRESS)
  9.3430 +			panic("pmap_write_protect: PTE space");
  9.3431 +#endif
  9.3432 +
  9.3433 +		spte = &ptes[x86_btop(sva)];
  9.3434 +		epte = &ptes[x86_btop(blockend)];
  9.3435 +
  9.3436 +		for (/*null */; spte < epte ; spte++) {
  9.3437 +			if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) {
  9.3438 +				maptp = (pt_entry_t *)vtomach((vaddr_t)spte);
  9.3439 +				PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW);
  9.3440 +				if (PTE_GET(spte) & PG_M)
  9.3441 +					pmap_tlb_shootdown(pmap,
  9.3442 +					    x86_ptob(spte - ptes),
  9.3443 +					    PTE_GET(spte), &cpumask);
  9.3444 +			}
  9.3445 +		}
  9.3446 +	}
  9.3447 +
  9.3448 +	/*
  9.3449 +	 * flush any TLB entries that may still cache the old (writable) PTEs
  9.3450 +	 */
  9.3451 +
  9.3452 +	pmap_tlb_shootnow(cpumask);
  9.3453 +	pmap_unmap_ptes(pmap);		/* unlocks pmap */
  9.3454 +}
  9.3455 +
  9.3456 +/*
  9.3457 + * end of protection functions
  9.3458 + */
  9.3459 +
  9.3460 +/*
  9.3461 + * pmap_unwire: clear the wired bit in the PTE
  9.3462 + *
  9.3463 + * => mapping should already be in map
  9.3464 + */
  9.3465 +
  9.3466 +void
  9.3467 +pmap_unwire(pmap, va)
  9.3468 +	struct pmap *pmap;
  9.3469 +	vaddr_t va;
  9.3470 +{
  9.3471 +	pt_entry_t *ptes;
  9.3472 +	pt_entry_t *maptp;
  9.3473 +
  9.3474 +	if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) {
  9.3475 +		ptes = pmap_map_ptes(pmap);		/* locks pmap */
  9.3476 +
  9.3477 +#ifdef DIAGNOSTIC
  9.3478 +		if (!pmap_valid_entry(ptes[x86_btop(va)]))
  9.3479 +			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
  9.3480 +#endif
  9.3481 +		if ((ptes[x86_btop(va)] & PG_W) != 0) {
  9.3482 +			maptp = (pt_entry_t *)vtomach(
  9.3483 +				(vaddr_t)&ptes[x86_btop(va)]);
  9.3484 +			PTE_ATOMIC_CLEARBITS(&ptes[x86_btop(va)], maptp, PG_W);
  9.3485 +			pmap->pm_stats.wired_count--;
  9.3486 +		}
  9.3487 +#ifdef DIAGNOSTIC
  9.3488 +		else {
  9.3489 +			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
  9.3490 +			       "didn't change!\n", pmap, va);
  9.3491 +		}
  9.3492 +#endif
  9.3493 +		pmap_unmap_ptes(pmap);		/* unlocks map */
  9.3494 +	}
  9.3495 +#ifdef DIAGNOSTIC
  9.3496 +	else {
  9.3497 +		panic("pmap_unwire: invalid PDE");
  9.3498 +	}
  9.3499 +#endif
  9.3500 +}
  9.3501 +
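
A caller is expected to unwire every page of a previously wired range, one page at a time; a hedged sketch of that loop (the helper name here is hypothetical, the real work is done by UVM's fault-unwire path):

	/* sketch: dropping the wiring on a VA range, page by page */
	static void
	example_unwire_range(struct pmap *pmap, vaddr_t start, vaddr_t end)
	{
		vaddr_t va;

		for (va = start; va < end; va += PAGE_SIZE)
			pmap_unwire(pmap, va);	/* clears PG_W, adjusts wired_count */
	}
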
  9.3502 +/*
  9.3503 + * pmap_collect: free resources held by a pmap
  9.3504 + *
  9.3505 + * => optional function.
  9.3506 + * => called when a process is swapped out to free memory.
  9.3507 + */
  9.3508 +
  9.3509 +void
  9.3510 +pmap_collect(pmap)
  9.3511 +	struct pmap *pmap;
  9.3512 +{
  9.3513 +	/*
  9.3514 +	 * free all of the pt pages by removing the physical mappings
  9.3515 +	 * for its entire address space.
  9.3516 +	 */
  9.3517 +
  9.3518 +	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
  9.3519 +	    PMAP_REMOVE_SKIPWIRED);
  9.3520 +}
  9.3521 +
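
pmap_collect() is meant to be called on the pmap of a process that has just been swapped out; a hedged sketch of such a call site (function name hypothetical, simplified from the usual swap-out logic):

	/* sketch: releasing page-table resources of a swapped-out process */
	static void
	example_swapout_collect(struct proc *p)
	{
		pmap_collect(vm_map_pmap(&p->p_vmspace->vm_map));
	}
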
  9.3522 +/*
  9.3523 + * pmap_copy: copy mappings from one pmap to another
  9.3524 + *
  9.3525 + * => optional function
  9.3526 + * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
  9.3527 + */
  9.3528 +
  9.3529 +/*
  9.3530 + * defined as macro in pmap.h
  9.3531 + */
  9.3532 +
  9.3533 +/*
  9.3534 + * pmap_enter: enter a mapping into a pmap
  9.3535 + *
  9.3536 + * => must be done "now" ... no lazy-evaluation
  9.3537 + * => we set pmap => pv_head locking
  9.3538 + */
  9.3539 +
  9.3540 +int
  9.3541 +pmap_enter(pmap, va, pa, prot, flags)
  9.3542 +	struct pmap *pmap;
  9.3543 +	vaddr_t va;
  9.3544 +	paddr_t pa;
  9.3545 +	vm_prot_t prot;
  9.3546 +	int flags;
  9.3547 +{
  9.3548 +	pt_entry_t *ptes, opte, npte;
  9.3549 +	struct vm_page *ptp, *pg;
  9.3550 +	struct vm_page_md *mdpg;
  9.3551 +	struct pv_head *old_pvh, *new_pvh;
  9.3552 +	struct pv_entry *pve = NULL; /* XXX gcc */
  9.3553 +	int error;
  9.3554 +	boolean_t wired = (flags & PMAP_WIRED) != 0;
  9.3555 +	pt_entry_t *maptp;
  9.3556 +
  9.3557 +	XENPRINTK(("pmap_enter(%p, %p, %p, %08x, %08x)\n",
  9.3558 +	    pmap, (void *)va, (void *)pa, prot, flags));
  9.3559 +
  9.3560 +#ifdef DIAGNOSTIC
  9.3561 +	/* sanity check: totally out of range? */
  9.3562 +	if (va >= VM_MAX_KERNEL_ADDRESS)
  9.3563 +		panic("pmap_enter: too big");
  9.3564 +
  9.3565 +	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
  9.3566 +		panic("pmap_enter: trying to map over PDP/APDP!");
  9.3567 +
  9.3568 +	/* sanity check: kernel PTPs should already have been pre-allocated */
  9.3569 +	if (va >= VM_MIN_KERNEL_ADDRESS &&
  9.3570 +	    !pmap_valid_entry(pmap->pm_pdir[pdei(va)]))
  9.3571 +		panic("pmap_enter: missing kernel PTP!");
  9.3572 +#endif
  9.3573 +
  9.3574 +	npte = protection_codes[prot] | PG_V;
  9.3575 +
  9.3576 +	if (pa >= pmap_pa_start && pa < pmap_pa_end)
  9.3577 +		npte |= xpmap_ptom(pa);
  9.3578 +	else {
  9.3579 +		XENPRINTF(("pmap_enter: pa %08lx for va %08lx outside pa range\n",
  9.3580 +		    pa, va));
  9.3581 +		npte |= pa;
  9.3582 +	}
  9.3583 +
  9.3584 +	/* XENPRINTK(("npte %p\n", npte)); */
  9.3585 +
  9.3586 +	if (wired)
  9.3587 +	        npte |= PG_W;
  9.3588 +
  9.3589 +	if (va < VM_MAXUSER_ADDRESS)
  9.3590 +		npte |= PG_u;
  9.3591 +	else if (va < VM_MAX_ADDRESS)
  9.3592 +		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
  9.3593 +	if (pmap == pmap_kernel())
  9.3594 +		npte |= pmap_pg_g;
  9.3595 +
  9.3596 +	/* get lock */
  9.3597 +	PMAP_MAP_TO_HEAD_LOCK();
  9.3598 +
  9.3599 +	ptes = pmap_map_ptes(pmap);		/* locks pmap */
  9.3600 +	if (pmap == pmap_kernel()) {
  9.3601 +		ptp = NULL;
  9.3602 +	} else {
  9.3603 +		ptp = pmap_get_ptp(pmap, pdei(va));
  9.3604 +		if (ptp == NULL) {
  9.3605 +			if (flags & PMAP_CANFAIL) {
  9.3606 +				error = ENOMEM;
  9.3607 +				goto out;
  9.3608 +			}
  9.3609 +			panic("pmap_enter: get ptp failed");
  9.3610 +		}
  9.3611 +	}
  9.3612 +
  9.3613 +	/*
  9.3614 +	 * Get a first view of the old PTE:
  9.3615 +	 * on SMP the PTE might gain PG_U and PG_M flags
  9.3616 +	 * before we zap it below
  9.3617 +	 */
  9.3618 +	opte = pte_get(&ptes[x86_btop(va)]);		/* old PTE */
  9.3619 +	XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", 
  9.3620 +		      (void *)npte, (void *)opte, ptes, x86_btop(va)));
  9.3621 +
  9.3622 +	/*
  9.3623 +	 * is there currently a valid mapping at our VA and does it
  9.3624 +	 * map to the same PA as the one we want to map ?
  9.3625 +	 */
  9.3626 +
  9.3627 +	if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
  9.3628 +
  9.3629 +		/*
  9.3630 +		 * first, calculate pm_stats updates.  resident count will not
  9.3631 +		 * change since we are replacing/changing a valid mapping.
  9.3632 +		 * wired count might change...
  9.3633 +		 */
  9.3634 +		pmap->pm_stats.wired_count +=
  9.3635 +		    ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
  9.3636 +
  9.3637 +		npte |= (opte & PG_PVLIST);
  9.3638 +
  9.3639 +		XENPRINTK(("pmap update opte == pa"));
  9.3640 +		/* zap! */
  9.3641 +		maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
  9.3642 +		opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte);
  9.3643 +
  9.3644 +		/*
  9.3645 +		 * Any change in the protection level that the CPU
  9.3646 +		 * should know about ? 
  9.3647 +		 */
  9.3648 +		if ((npte & PG_RW)
  9.3649 +		     || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) {
  9.3650 +			XENPRINTK(("pmap update opte == pa, prot change"));
  9.3651 +			/*
  9.3652 +			 * No need to flush the TLB.
  9.3653 +			 * Just add old PG_M, ... flags in new entry.
  9.3654 +			 */
  9.3655 +			PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp,
  9.3656 +			    opte & (PG_M | PG_U));
  9.3657 +			goto out_ok;
  9.3658 +		}
  9.3659 +
  9.3660 +		/*
  9.3661 +		 * Might be cached in the TLB as being writable
  9.3662 +		 * if this is on the PVLIST, sync R/M bit
  9.3663 +		 */
  9.3664 +		if (opte & PG_PVLIST) {
  9.3665 +			pg = PHYS_TO_VM_PAGE(pa);
  9.3666 +#ifdef DIAGNOSTIC
  9.3667 +			if (pg == NULL)
  9.3668 +				panic("pmap_enter: same pa PG_PVLIST "
  9.3669 +				      "mapping with unmanaged page "
  9.3670 +				      "pa = 0x%lx (0x%lx)", pa,
  9.3671 +				      atop(pa));
  9.3672 +#endif
  9.3673 +			mdpg = &pg->mdpage;
  9.3674 +			old_pvh = &mdpg->mp_pvhead;
  9.3675 +			simple_lock(&old_pvh->pvh_lock);
  9.3676 +			mdpg->mp_attrs |= opte;
  9.3677 +			simple_unlock(&old_pvh->pvh_lock);
  9.3678 +		}
  9.3679 +		goto shootdown_now;
  9.3680 +	}
  9.3681 +
  9.3682 +	pg = PHYS_TO_VM_PAGE(pa);
  9.3683 +	XENPRINTK(("pg %p from %p, init %d\n", pg, (void *)pa,
  9.3684 +		      pmap_initialized));
  9.3685 +	if (pmap_initialized && pg != NULL) {
  9.3686 +		/* This is a managed page */
  9.3687 +		npte |= PG_PVLIST;
  9.3688 +		mdpg = &pg->mdpage;
  9.3689 +		new_pvh = &mdpg->mp_pvhead;
  9.3690 +		if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) {
  9.3691 +			/* We can not steal a pve - allocate one */
  9.3692 +			pve = pmap_alloc_pv(pmap, ALLOCPV_NEED);
  9.3693 +			if (pve == NULL) {
  9.3694 +				if (!(flags & PMAP_CANFAIL))
  9.3695 +					panic("pmap_enter: "
  9.3696 +					    "no pv entries available");
  9.3697 +				error = ENOMEM;
  9.3698 +				goto out;
  9.3699 +  			}
  9.3700 +  		}
  9.3701 +	} else {
  9.3702 +		new_pvh = NULL;
  9.3703 +	}
  9.3704 +
  9.3705 +	/*
  9.3706 +	 * is there currently a valid mapping at our VA?
  9.3707 +	 */
  9.3708 +
  9.3709 +	if (pmap_valid_entry(opte)) {
  9.3710 +
  9.3711 +		/*
  9.3712 +		 * changing PAs: we must remove the old one first
  9.3713 +		 */
  9.3714 +
  9.3715 +		/*
  9.3716 +		 * first, calculate pm_stats updates.  resident count will not
  9.3717 +		 * change since we are replacing/changing a valid mapping.
  9.3718 +		 * wired count might change...
  9.3719 +		 */
  9.3720 +		pmap->pm_stats.wired_count +=
  9.3721 +		    ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
  9.3722 +
  9.3723 +		if (opte & PG_PVLIST) {
  9.3724 +			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
  9.3725 +#ifdef DIAGNOSTIC
  9.3726 +			if (pg == NULL)
  9.3727 +				panic("pmap_enter: PG_PVLIST mapping with "
  9.3728 +				      "unmanaged page "
  9.3729 +				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
  9.3730 +#endif
  9.3731 +			mdpg = &pg->mdpage;
  9.3732 +			old_pvh = &mdpg->mp_pvhead;
  9.3733 +
  9.3734 +			/* new_pvh is NULL if page will not be managed */
  9.3735 +			pmap_lock_pvhs(old_pvh, new_pvh);
  9.3736 +
  9.3737 +			XENPRINTK(("pmap change pa"));
  9.3738 +			/* zap! */
  9.3739 +			maptp = (pt_entry_t *)vtomach(
  9.3740 +				(vaddr_t)&ptes[x86_btop(va)]);
  9.3741 +			opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp,
  9.3742 +						    npte);
  9.3743 +
  9.3744 +			pve = pmap_remove_pv(old_pvh, pmap, va);
  9.3745 +			KASSERT(pve != 0);
  9.3746 +			mdpg->mp_attrs |= opte;
  9.3747 +
  9.3748 +			if (new_pvh) {
  9.3749 +				pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
  9.3750 +				simple_unlock(&new_pvh->pvh_lock);
  9.3751 +			} else
  9.3752 +				pmap_free_pv(pmap, pve);
  9.3753 +			simple_unlock(&old_pvh->pvh_lock);
  9.3754 +
  9.3755 +			goto shootdown_test;
  9.3756 +		}
  9.3757 +	} else {	/* opte not valid */
  9.3758 +		pmap->pm_stats.resident_count++;
  9.3759 +		if (wired) 
  9.3760 +			pmap->pm_stats.wired_count++;
  9.3761 +		if (ptp)
  9.3762 +			ptp->wire_count++;
  9.3763 +	}
  9.3764 +
  9.3765 +	if (new_pvh) {
  9.3766 +		simple_lock(&new_pvh->pvh_lock);
  9.3767 +		pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
  9.3768 +		simple_unlock(&new_pvh->pvh_lock);
  9.3769 +	}
  9.3770 +
  9.3771 +	XENPRINTK(("pmap initial setup\n"));
  9.3772 +	maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
  9.3773 +	opte = pte_atomic_update_ma(&ptes[x86_btop(va)],
  9.3774 +	    maptp, npte); /* zap! */
  9.3775 +
  9.3776 +shootdown_test:
  9.3777 +	/* Update page attributes if needed */
  9.3778 +	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
  9.3779 +#if defined(MULTIPROCESSOR)
  9.3780 +		int32_t cpumask = 0;
  9.3781 +#endif
  9.3782 +shootdown_now:
  9.3783 +#if defined(MULTIPROCESSOR)
  9.3784 +		pmap_tlb_shootdown(pmap, va, opte, &cpumask);
  9.3785 +		pmap_tlb_shootnow(cpumask);
  9.3786 +#else
  9.3787 +		/* Don't bother deferring in the single CPU case. */
  9.3788 +		if (pmap_is_curpmap(pmap))
  9.3789 +			pmap_update_pg(va);
  9.3790 +#endif
  9.3791 +	}
  9.3792 +
  9.3793 +out_ok:
  9.3794 +	error = 0;
  9.3795 +
  9.3796 +out:
  9.3797 +	pmap_unmap_ptes(pmap);
  9.3798 +	PMAP_MAP_TO_HEAD_UNLOCK();
  9.3799 +
  9.3800 +	XENPRINTK(("pmap_enter: %d\n", error));
  9.3801 +	return error;
  9.3802 +}
  9.3803 +
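
A caller that passes PMAP_CANFAIL must be prepared for ENOMEM (no PTP or pv entry available) and typically waits for free pages before retrying; a hedged sketch of that pattern (helper name hypothetical, uvm_wait() is the usual NetBSD idiom):

	/* sketch: entering a mapping while tolerating allocation failure */
	static int
	example_enter_retry(struct pmap *pmap, vaddr_t va, paddr_t pa,
	    vm_prot_t prot, boolean_t wired)
	{
		int flags = PMAP_CANFAIL | (wired ? PMAP_WIRED : 0);

		while (pmap_enter(pmap, va, pa, prot, flags) == ENOMEM)
			uvm_wait("pmapenter");	/* sleep until pages are freed */
		return (0);
	}
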
  9.3804 +/*
  9.3805 + * pmap_enter_ma: enter a mapping into a pmap
  9.3806 + *
  9.3807 + * => must be done "now" ... no lazy-evaluation
  9.3808 + * => we set pmap => pv_head locking
  9.3809 + */
  9.3810 +
  9.3811 +int
  9.3812 +pmap_enter_ma(pmap, va, pa, prot, flags)
  9.3813 +	struct pmap *pmap;
  9.3814 +	vaddr_t va;
  9.3815 +	paddr_t pa;
  9.3816 +	vm_prot_t prot;
  9.3817 +	int flags;
  9.3818 +{
  9.3819 +	pt_entry_t *ptes, opte, npte;
  9.3820 +	pt_entry_t *maptp;
  9.3821 +	struct vm_page *ptp, *pg;
  9.3822 +	struct vm_page_md *mdpg;
  9.3823 +	struct pv_head *old_pvh;
  9.3824 +	struct pv_entry *pve = NULL; /* XXX gcc */
  9.3825 +	int error;
  9.3826 +	boolean_t wired = (flags & PMAP_WIRED) != 0;
  9.3827 +
  9.3828 +	XENPRINTK(("pmap_enter_ma(%p, %p, %p, %08x, %08x)\n",
  9.3829 +	    pmap, (void *)va, (void *)pa, prot, flags));
  9.3830 +
  9.3831 +#ifdef DIAGNOSTIC
  9.3832 +	/* sanity check: totally out of range? */
  9.3833 +	if (va >= VM_MAX_KERNEL_ADDRESS)
  9.3834 +		panic("pmap_enter: too big");
  9.3835 +
  9.3836 +	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
  9.3837 +		panic("pmap_enter: trying to map over PDP/APDP!");
  9.3838 +
  9.3839 +	/* sanity check: kernel PTPs should already have been pre-allocated */
  9.3840 +	if (va >= VM_MIN_KERNEL_ADDRESS &&
  9.3841 +	    !pmap_valid_entry(pmap->pm_pdir[pdei(va)]))
  9.3842 +		panic("pmap_enter: missing kernel PTP!");
  9.3843 +#endif
  9.3844 +
  9.3845 +	npte = pa | protection_codes[prot] | PG_V;
  9.3846 +	/* XENPRINTK(("npte %p\n", npte)); */
  9.3847 +
  9.3848 +	if (wired)
  9.3849 +	        npte |= PG_W;
  9.3850 +
  9.3851 +	if (va < VM_MAXUSER_ADDRESS)
  9.3852 +		npte |= PG_u;
  9.3853 +	else if (va < VM_MAX_ADDRESS)
  9.3854 +		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
  9.3855 +	if (pmap == pmap_kernel())
  9.3856 +		npte |= pmap_pg_g;
  9.3857 +
  9.3858 +	/* get lock */
  9.3859 +	PMAP_MAP_TO_HEAD_LOCK();
  9.3860 +
  9.3861 +	ptes = pmap_map_ptes(pmap);		/* locks pmap */
  9.3862 +	if (pmap == pmap_kernel()) {
  9.3863 +		ptp = NULL;
  9.3864 +	} else {
  9.3865 +		ptp = pmap_get_ptp(pmap, pdei(va));
  9.3866 +		if (ptp == NULL) {
  9.3867 +			if (flags & PMAP_CANFAIL) {
  9.3868 +				error = ENOMEM;
  9.3869 +				goto out;
  9.3870 +			}
  9.3871 +			panic("pmap_enter: get ptp failed");
  9.3872 +		}
  9.3873 +	}
  9.3874 +
  9.3875 +	/*
  9.3876 +	 * Get a first view of the old PTE:
  9.3877 +	 * on SMP the PTE might gain PG_U and PG_M flags
  9.3878 +	 * before we zap it below
  9.3879 +	 */
  9.3880 +	opte = pte_get_ma(&ptes[x86_btop(va)]);		/* old PTE */
  9.3881 +	XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", 
  9.3882 +		      (void *)npte, (void *)opte, ptes, x86_btop(va)));
  9.3883 +	XENPRINTF(("pmap_enter_ma pa %08lx va %08lx opte %08x npte %08x "
  9.3884 +	    "wired %d count %ld\n", pa, va, opte, npte, wired,
  9.3885 +	    pmap->pm_stats.wired_count));
  9.3886 +
  9.3887 +	/*
  9.3888 +	 * is there currently a valid mapping at our VA and does it
  9.3889 +	 * map to the same MA as the one we want to map ?
  9.3890 +	 */
  9.3891 +
  9.3892 +	if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
  9.3893 +
  9.3894 +		/*
  9.3895 +		 * first, calculate pm_stats updates.  resident count will not
  9.3896 +		 * change since we are replacing/changing a valid mapping.
  9.3897 +		 * wired count might change...
  9.3898 +		 */
  9.3899 +		pmap->pm_stats.wired_count +=
  9.3900 +		    ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
  9.3901 +
  9.3902 +		XENPRINTK(("pmap update opte == pa"));
  9.3903 +		/* zap! */
  9.3904 +		maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
  9.3905 +		opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte);
  9.3906 +
  9.3907 +		/*
  9.3908 +		 * Any change in the protection level that the CPU
  9.3909 +		 * should know about ? 
  9.3910 +		 */
  9.3911 +		if ((npte & PG_RW)
  9.3912 +		     || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) {
  9.3913 +			XENPRINTK(("pmap update opte == pa, prot change"));
  9.3914 +			/*
  9.3915 +			 * No need to flush the TLB.
  9.3916 +			 * Just add old PG_M, ... flags in new entry.
  9.3917 +			 */
  9.3918 +			PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp,
  9.3919 +			    opte & (PG_M | PG_U));
  9.3920 +			goto out_ok;
  9.3921 +		}
  9.3922 +
  9.3923 +		/*
  9.3924 +		 * Might be cached in the TLB as being writable
  9.3925 +		 * if this is on the PVLIST, sync R/M bit
  9.3926 +		 */
  9.3927 +		KDASSERT((opte & PG_PVLIST) == 0);
  9.3928 +		goto shootdown_now;
  9.3929 +	}
  9.3930 +
  9.3931 +	/* 
  9.3932 +	 * no managed mapping for pages mapped through pmap_enter_ma.
  9.3933 +	 */
  9.3934 +
  9.3935 +	/*
  9.3936 +	 * is there currently a valid mapping at our VA?
  9.3937 +	 */
  9.3938 +
  9.3939 +	if (pmap_valid_entry(opte)) {
  9.3940 +
  9.3941 +		/*
  9.3942 +		 * changing PAs: we must remove the old one first
  9.3943 +		 */
  9.3944 +
  9.3945 +		/*
  9.3946 +		 * first, calculate pm_stats updates.  resident count will not
  9.3947 +		 * change since we are replacing/changing a valid mapping.
  9.3948 +		 * wired count might change...
  9.3949 +		 */
  9.3950 +		pmap->pm_stats.wired_count +=
  9.3951 +		    ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
  9.3952 +
  9.3953 +		if (opte & PG_PVLIST) {
  9.3954 +			opte = xpmap_mtop(opte);
  9.3955 +			KDASSERT((opte & PG_FRAME) !=
  9.3956 +			    (KERNTEXTOFF - KERNBASE_LOCORE));
  9.3957 +
  9.3958 +			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
  9.3959 +#ifdef DIAGNOSTIC
  9.3960 +			if (pg == NULL)
  9.3961 +				panic("pmap_enter: PG_PVLIST mapping with "
  9.3962 +				      "unmanaged page "
  9.3963 +				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
  9.3964 +#endif
  9.3965 +			mdpg = &pg->mdpage;
  9.3966 +			old_pvh = &mdpg->mp_pvhead;
  9.3967 +
  9.3968 +			/* NULL new_pvh since page will not be managed */
  9.3969 +			pmap_lock_pvhs(old_pvh, NULL);
  9.3970 +
  9.3971 +			XENPRINTK(("pmap change pa"));
  9.3972 +			/* zap! */
  9.3973 +			maptp = (pt_entry_t *)vtomach(
  9.3974 +				(vaddr_t)&ptes[x86_btop(va)]);
  9.3975 +			opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp,
  9.3976 +						    npte);