ia64/xen-unstable
changeset 2444:7432c2c8b98b
bitkeeper revision 1.1159.72.2 (413cb4b0nYQ7KFQbxIn6g-4lsRAgbQ)
Add sparse tree for NetBSD.
--- a/.rootkeys Mon Sep 06 18:52:41 2004 +0000
+++ b/.rootkeys Mon Sep 06 19:04:16 2004 +0000
@@ -263,6 +263,33 @@ 413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.
 413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
 413cb1e5kY_Zil7-b0kI6hvCIxBEYg netbsd-2.0-xen-sparse/nbconfig-xen
 413cb1e5-58q5doPifcE1Q8ZAgm-JQ netbsd-2.0-xen-sparse/nbmake-xen
+413cb3b3Cmp02Gj87f3wwu2W9y0gBg netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
+413cb3b3aUP9GmUWqHWQ2SRp1qXnqQ netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
+413cb3b3pZuLKElEpQwX1C-3hLW4qA netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
+413cb3b34ui1cCGaSqIeLiBgMp-PDw netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
+413cb3b3i11i2GVGn0YGlRbM3ifbPQ netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
+413cb3b3FgMboWw-Pm3XdbBFSlZl_g netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
+413cb3b4ABCSfkHRmbsWfnZNG28nBA netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
+413cb3b4bvVJ7UlliMSH60J4uIb9kA netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
+413cb3b4aKd9SUY-OzUiTF0Gb9ve9w netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
+413cb3b4jUtWl-sP493PvB27o-Iltw netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
+413cb3b4ElwwoJEmmzflV0HgK5Qxcg netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
+413cb3b4k9OVRCxuSdhKt-2baTp_Yg netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
+413cb3b4bRsqiHQLTKEZk4-zOksf8A netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
+413cb3b4OqY83qI8GztIZGADpvrpSw netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
+413cb3b42GG0LffraTnpZKlSUq57wg netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
+413cb3b4F0ArkWVBRyspkw7ivfXihg netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
+413cb3b4ullQud70n4JClwoEEUBh8Q netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
+413cb3b4y1Ffq8BOhbdSpn-fGmKuEg netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
+413cb3b4uXOFcT56QuLt1fcDrB-4Zg netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
+413cb3b4hIffjrKn3zhVqJmH6ueB3Q netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
+413cb3b4eNdRIasCoQIuX4Nu39Dlqw netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
+413cb3b40DLJLbX_ZUIULB0JFjBuaw netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
+413cb3b46JnvK1UurZAubeQoFg1W-w netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
+413cb3b5rIKB3TbyhK3pbNyVkYysqA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
+413cb3b5eKxnzoodEqaWn2wrPnHWnA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
+413cb3b5F56TvQWAmO5TsuzhtzLFPQ netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
+413cb3b53nyOv1OIeDSsCXhBFDXvJA netbsd-2.0-xen-sparse/sys/nfs/files.nfs
 40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
 4124b307nRyK3dhn1hAsvrY76NuV3g tools/check/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN Mon Sep 06 19:04:16 2004 +0000
@@ -0,0 +1,176 @@
+# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $
+
+include "arch/xen/conf/std.xen"
+
+options INCLUDE_CONFIG_FILE # embed config file in kernel binary
+
+#options UVMHIST
+#options UVMHIST_PRINT
+#options SYSCALL_DEBUG
+
+maxusers 32 # estimated number of users
+
+#
+options XEN
+#options DOM0OPS
+options HZ=50
+
+#options I586_CPU
+options I686_CPU
+
+#options VM86 # virtual 8086 emulation
+#options USER_LDT # user-settable LDT; used by WINE
+
+#options MTRR # memory-type range register syscall support
+
+#options CONSDEVNAME="\"xencons\""
+#options CONS_OVERRIDE
+
+options INSECURE # disable kernel security levels - X needs this
+
+options RTC_OFFSET=0 # hardware clock is this many mins. west of GMT
+#options NTP # NTP phase/frequency locked loop
+
+options KTRACE # system call tracing via ktrace(1)
+#options SYSTRACE # system call vetting via systrace(1)
+
+options SYSVMSG # System V-like message queues
+options SYSVSEM # System V-like semaphores
+#options SEMMNI=10 # number of semaphore identifiers
+#options SEMMNS=60 # number of semaphores in system
+#options SEMUME=10 # max number of undo entries per process
+#options SEMMNU=30 # number of undo structures in system
+options SYSVSHM # System V-like memory sharing
+#options SHMMAXPGS=2048 # 2048 pages is the default
+options P1003_1B_SEMAPHORE # p1003.1b semaphore support
+
+options LKM # loadable kernel modules
+
+options USERCONF # userconf(4) support
+options SYSCTL_INCLUDE_DESCR # Include sysctl descriptions in kernel
+
+# Diagnostic/debugging support options
+options DIAGNOSTIC # expensive kernel consistency checks
+options DEBUG # expensive debugging checks/support
+options KMEMSTATS # kernel memory statistics (vmstat -m)
+options DDB # in-kernel debugger
+options DDB_ONPANIC=1 # see also sysctl(8): `ddb.onpanic'
+options DDB_HISTORY_SIZE=512 # enable history editing in DDB
+#options KGDB # remote debugger
+#options KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600
+makeoptions DEBUG="-g" # compile full symbol table
+
+#options COMPAT_14 # NetBSD 1.4
+#options COMPAT_15 # NetBSD 1.5
+options COMPAT_16 # NetBSD 1.6
+
+##options COMPAT_LINUX # binary compatibility with Linux
+#options COMPAT_FREEBSD # binary compatibility with FreeBSD
+#options COMPAT_MACH # binary compatibility with Mach binaries
+#options COMPAT_DARWIN # binary compatibility with Darwin binaries
+#options EXEC_MACHO # exec MACH-O binaries
+#options COMPAT_PECOFF # kernel support to run Win32 apps
+
+file-system FFS # UFS
+file-system EXT2FS # second extended file system (linux)
+#file-system LFS # log-structured file system
+#file-system MFS # memory file system
+file-system NFS # Network File System client
+#file-system NTFS # Windows/NT file system (experimental)
+#file-system CD9660 # ISO 9660 + Rock Ridge file system
+#file-system MSDOSFS # MS-DOS file system
+file-system FDESC # /dev/fd
+file-system KERNFS # /kern
+file-system NULLFS # loopback file system
+#file-system OVERLAY # overlay file system
+#file-system PORTAL # portal filesystem (still experimental)
+file-system PROCFS # /proc
+#file-system UMAPFS # NULLFS + uid and gid remapping
+#file-system UNION # union file system
+#file-system SMBFS # experimental - CIFS; also needs nsmb (below)
+
+#options QUOTA # UFS quotas
+#options SOFTDEP # FFS soft updates support.
+#options NFSSERVER # Network File System server
+
+options GATEWAY # packet forwarding
+options INET # IP + ICMP + TCP + UDP
+options INET6 # IPV6
+options IPSEC # IP security
+options IPSEC_ESP # IP security (encryption part; define w/IPSEC)
+options MROUTING # IP multicast routing
+options PFIL_HOOKS # pfil(9) packet filter hooks
+options IPFILTER_LOG # ipmon(8) log support
+
+options NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC
+#options NFS_BOOTSTATIC_MYIP="\"169.254.1.2\""
+#options NFS_BOOTSTATIC_GWIP="\"169.254.1.1\""
+#options NFS_BOOTSTATIC_MASK="\"255.255.255.0\""
+#options NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\""
+#options NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\""
+
+options WSEMUL_VT100 # VT100 / VT220 emulation
+options WS_KERNEL_FG=WSCOL_GREEN
+options WSDISPLAY_COMPAT_PCVT # emulate some ioctls
+options WSDISPLAY_COMPAT_SYSCONS # emulate some ioctls
+options WSDISPLAY_COMPAT_USL # VT handling
+options WSDISPLAY_COMPAT_RAWKBD # can get raw scancodes
+options WSDISPLAY_DEFAULTSCREENS=4
+options PCDISPLAY_SOFTCURSOR
+
+config netbsd root on ? type ?
+#config netbsd root on wd0a type ffs
+#config netbsd root on xennet0 type nfs
+
+mainbus0 at root
+
+cpu* at mainbus?
+
+hypervisor* at mainbus? # Xen hypervisor
+
+npx0 at hypervisor? # x86 math coprocessor
+
+xencons* at hypervisor? # Xen virtual console
+xennet* at hypervisor? # Xen virtual network interface
+
+#xbd* at hypervisor? # Xen virtual block device
+#wd* at hypervisor? # Xen vbd (wd identity)
+#sd* at hypervisor? # Xen vbd (sd identity)
+#cd* at hypervisor? # Xen vbd (cd identity)
+
+#xenkbc* at hypervisor? # Xen Keyboard/Mouse Interface
+#pckbd* at xenkbc? # Keyboard
+#vga* at hypervisor? # Xen VGA display
+#pms* at xenkbc? # PS/2 Mouse for wsmouse
+
+#wskbd* at pckbd? console ?
+#wsdisplay* at vga? console ?
+#wsmouse* at pms? mux 0
+
+
+include "arch/xen/conf/GENERIC.local"
+
+
+pseudo-device ccd 4 # concatenated/striped disk devices
+#pseudo-device cgd 4 # cryptographic disk devices
+#pseudo-device md 1 # memory disk device (ramdisk)
+#pseudo-device vnd 4 # disk-like interface to files
+
+pseudo-device bpfilter 8 # Berkeley packet filter
+pseudo-device ipfilter # IP filter (firewall) and NAT
+pseudo-device loop # network loopback
+#pseudo-device tun 2 # network tunneling over tty
+#pseudo-device gre 2 # generic L3 over IP tunnel
+#pseudo-device gif 4 # IPv[46] over IPv[46] tunnel (RFC1933)
+#pseudo-device faith 1 # IPv[46] tcp relay translation i/f
+#pseudo-device stf 1 # 6to4 IPv6 over IPv4 encapsulation
+#pseudo-device vlan # IEEE 802.1q encapsulation
+#pseudo-device bridge # simple inter-network bridging
+
+pseudo-device pty # pseudo-terminals
+pseudo-device rnd # /dev/random and in-kernel generator
+pseudo-device clockctl # user control of clock subsystem
+
+pseudo-device wsmux # mouse & keyboard multiplexor
+pseudo-device wsfont
+pseudo-device ksyms # /dev/ksyms
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen Mon Sep 06 19:04:16 2004 +0000 3.3 @@ -0,0 +1,232 @@ 3.4 +# $NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $ 3.5 +# NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp 3.6 +# NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp 3.7 + 3.8 +maxpartitions 8 3.9 + 3.10 +maxusers 2 16 128 3.11 + 3.12 +# Processor type options. 3.13 +defflag opt_cputype.h I686_CPU 3.14 + 3.15 +# delay before cpu_reset() for reboot. 3.16 +defparam CPURESET_DELAY 3.17 + 3.18 +# No unmapped page below kernel stack 3.19 +defflag NOREDZONE 3.20 + 3.21 +# Beep on halt 3.22 +defflag opt_beep.h BEEP_ONHALT 3.23 +defparam opt_beep.h BEEP_ONHALT_COUNT 3.24 +defparam opt_beep.h BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD 3.25 + 3.26 +file arch/xen/i386/autoconf.c 3.27 +file arch/i386/i386/db_dbgreg.S ddb | kstack_check_dr0 3.28 +file arch/i386/i386/db_disasm.c ddb 3.29 +file arch/i386/i386/db_interface.c ddb 3.30 +file arch/i386/i386/db_memrw.c ddb | kgdb 3.31 +file arch/i386/i386/db_trace.c ddb 3.32 +file kern/subr_disk_mbr.c disk 3.33 +file arch/xen/i386/gdt.c 3.34 +file arch/xen/i386/hypervisor_machdep.c 3.35 +file arch/i386/i386/in_cksum.S inet | inet6 3.36 +file arch/i386/i386/ipkdb_glue.c ipkdb 3.37 +file arch/i386/i386/kgdb_machdep.c kgdb 3.38 +file arch/xen/i386/machdep.c 3.39 +file arch/xen/i386/identcpu.c 3.40 +file arch/i386/i386/math_emulate.c math_emulate 3.41 +file arch/i386/i386/mem.c 3.42 +file kern/kern_microtime.c i586_cpu | i686_cpu 3.43 +file arch/i386/i386/mtrr_k6.c mtrr 3.44 +file netns/ns_cksum.c ns 3.45 +file arch/xen/i386/pmap.c 3.46 +file arch/i386/i386/process_machdep.c 3.47 +file arch/i386/i386/procfs_machdep.c procfs 3.48 +file arch/xen/i386/sys_machdep.c 3.49 +file arch/i386/i386/syscall.c 3.50 +file arch/xen/i386/trap.c 3.51 +file arch/i386/i386/vm_machdep.c 3.52 +file arch/xen/i386/xen_machdep.c 3.53 + 3.54 +file arch/xen/xen/xen_debug.c 3.55 + 3.56 +file arch/xen/xen/clock.c 3.57 +file arch/xen/xen/evtchn.c 3.58 +file arch/xen/xen/ctrl_if.c 3.59 + 3.60 +file dev/cons.c 3.61 + 3.62 +file arch/i386/i386/mptramp.S multiprocessor 3.63 +file arch/i386/i386/ipifuncs.c multiprocessor 3.64 + 3.65 +file arch/i386/i386/pmc.c perfctrs 3.66 + 3.67 +file crypto/des/arch/i386/des_enc.S des 3.68 +file crypto/des/arch/i386/des_cbc.S des 3.69 + 3.70 +file crypto/blowfish/arch/i386/bf_enc.S blowfish 3.71 +file crypto/blowfish/arch/i386/bf_cbc.S blowfish & !i386_cpu 3.72 + 3.73 +# 3.74 +# Machine-independent SCSI drivers 3.75 +# 3.76 + 3.77 +#xxx include "dev/scsipi/files.scsipi" 3.78 + 3.79 +# 3.80 +# Machine-independent ATA drivers 3.81 +# 3.82 + 3.83 +#xxx include "dev/ata/files.ata" 3.84 + 3.85 +# Memory Disk for install floppy 3.86 +file dev/md_root.c memory_disk_hooks 3.87 + 3.88 +# 3.89 +define mainbus { [apid = -1] } 3.90 + 3.91 +file arch/x86/x86/bus_dma.c 3.92 +file arch/xen/x86/bus_space.c 3.93 +file arch/x86/x86/cacheinfo.c 3.94 +file arch/xen/x86/consinit.c 3.95 +file arch/xen/x86/intr.c 3.96 +file arch/x86/x86/ipi.c multiprocessor 3.97 +file arch/x86/x86/lock_machdep.c lockdebug 3.98 +file arch/x86/x86/softintr.c 3.99 + 3.100 +include "arch/xen/conf/files.compat" 3.101 + 3.102 +# 3.103 +# System bus types 3.104 +# 3.105 + 3.106 +device mainbus: mainbus 3.107 +attach mainbus at root 3.108 +file arch/xen/i386/mainbus.c mainbus 3.109 + 3.110 +# Xen hypervisor 3.111 +device hypervisor { } 3.112 +attach hypervisor at mainbus 3.113 +file arch/xen/xen/hypervisor.c hypervisor needs-flag 
3.114 + 3.115 +# Numeric Processing Extension; Math Co-processor 3.116 +device npx 3.117 +file arch/xen/i386/npx.c npx needs-flag 3.118 + 3.119 +attach npx at hypervisor with npx_hv 3.120 +file arch/xen/i386/npx_hv.c npx_hv 3.121 + 3.122 +# Xen console support 3.123 +device xencons: tty 3.124 +attach xencons at hypervisor 3.125 +file arch/xen/xen/xencons.c xencons needs-flag 3.126 + 3.127 +include "dev/wscons/files.wscons" 3.128 +include "dev/wsfont/files.wsfont" 3.129 + 3.130 +include "dev/pckbport/files.pckbport" 3.131 + 3.132 +# CPUS 3.133 + 3.134 +define cpu { [apid = -1] } 3.135 +device cpu 3.136 +attach cpu at mainbus 3.137 +file arch/xen/i386/cpu.c cpu 3.138 + 3.139 +# 3.140 +# Compatibility modules 3.141 +# 3.142 + 3.143 +# VM86 mode 3.144 +file arch/i386/i386/vm86.c vm86 3.145 + 3.146 +# VM86 in kernel 3.147 +file arch/i386/i386/kvm86.c kvm86 3.148 +file arch/i386/i386/kvm86call.S kvm86 3.149 + 3.150 +# Binary compatibility with previous NetBSD releases (COMPAT_XX) 3.151 +file arch/i386/i386/compat_13_machdep.c compat_13 | compat_aout 3.152 +file arch/i386/i386/compat_16_machdep.c compat_16 | compat_ibcs2 3.153 + 3.154 +# SVR4 binary compatibility (COMPAT_SVR4) 3.155 +include "compat/svr4/files.svr4" 3.156 +file arch/i386/i386/svr4_machdep.c compat_svr4 3.157 +file arch/i386/i386/svr4_sigcode.S compat_svr4 3.158 +file arch/i386/i386/svr4_syscall.c compat_svr4 3.159 + 3.160 +# MACH binary compatibility (COMPAT_MACH) 3.161 +include "compat/mach/files.mach" 3.162 +file arch/i386/i386/mach_machdep.c compat_mach | compat_darwin 3.163 +file arch/i386/i386/mach_sigcode.S compat_mach | compat_darwin 3.164 +file arch/i386/i386/mach_syscall.c compat_mach | compat_darwin 3.165 +file arch/i386/i386/macho_machdep.c exec_macho 3.166 + 3.167 +# DARWIN binary compatibility (COMPAT_DARWIN) 3.168 +include "compat/darwin/files.darwin" 3.169 +file arch/i386/i386/darwin_machdep.c compat_darwin 3.170 + 3.171 +# iBCS-2 binary compatibility (COMPAT_IBCS2) 3.172 +include "compat/ibcs2/files.ibcs2" 3.173 +file arch/i386/i386/ibcs2_machdep.c compat_ibcs2 3.174 +file arch/i386/i386/ibcs2_sigcode.S compat_ibcs2 3.175 +file arch/i386/i386/ibcs2_syscall.c compat_ibcs2 3.176 + 3.177 +# Linux binary compatibility (COMPAT_LINUX) 3.178 +include "compat/linux/files.linux" 3.179 +include "compat/linux/arch/i386/files.linux_i386" 3.180 +file arch/i386/i386/linux_sigcode.S compat_linux 3.181 +file arch/i386/i386/linux_syscall.c compat_linux 3.182 +file arch/i386/i386/linux_trap.c compat_linux 3.183 + 3.184 +# FreeBSD binary compatibility (COMPAT_FREEBSD) 3.185 +include "compat/freebsd/files.freebsd" 3.186 +file arch/i386/i386/freebsd_machdep.c compat_freebsd 3.187 +file arch/i386/i386/freebsd_sigcode.S compat_freebsd 3.188 +file arch/i386/i386/freebsd_syscall.c compat_freebsd 3.189 + 3.190 +# a.out binary compatibility (COMPAT_AOUT) 3.191 +include "compat/aout/files.aout" 3.192 + 3.193 +# Win32 binary compatibility (COMPAT_PECOFF) 3.194 +include "compat/pecoff/files.pecoff" 3.195 + 3.196 +# OSS audio driver compatibility 3.197 +include "compat/ossaudio/files.ossaudio" 3.198 + 3.199 +# Xen devices 3.200 + 3.201 +# Network driver 3.202 +device xennet: arp, ether, ifnet 3.203 +attach xennet at hypervisor 3.204 +file arch/xen/xen/if_xennet.c xennet needs-flag 3.205 + 3.206 +# Block device driver and wd/sd/cd identities 3.207 +device xbd: disk 3.208 +attach xbd at hypervisor 3.209 +file arch/xen/xen/xbd.c xbd | wd | sd | cd needs-flag 3.210 + 3.211 +device wd: disk 3.212 +attach wd at hypervisor 3.213 + 3.214 +device sd: 
disk 3.215 +attach sd at hypervisor 3.216 + 3.217 +device cd: disk 3.218 +attach cd at hypervisor 3.219 + 3.220 +# Keyboard 3.221 +device xenkbc: pckbport 3.222 +attach xenkbc at hypervisor 3.223 +file arch/xen/xen/xenkbc.c xenkbc needs-flag 3.224 + 3.225 +# Generic VGA 3.226 +attach vga at hypervisor with vga_xen 3.227 +file arch/xen/xen/vga_xen.c vga_xen needs-flag 3.228 + 3.229 +# Domain-0 operations 3.230 +defflag opt_xen.h DOM0OPS 3.231 +file arch/xen/xen/machmem.c dom0ops 3.232 +file arch/xen/xen/privcmd.c dom0ops 3.233 +file arch/xen/xen/vfr.c dom0ops 3.234 + 3.235 +include "arch/xen/conf/majors.i386"
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c Mon Sep 06 19:04:16 2004 +0000 4.3 @@ -0,0 +1,630 @@ 4.4 +/* $NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $ */ 4.5 +/* NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp */ 4.6 + 4.7 +/*- 4.8 + * Copyright (c) 1990 The Regents of the University of California. 4.9 + * All rights reserved. 4.10 + * 4.11 + * This code is derived from software contributed to Berkeley by 4.12 + * William Jolitz. 4.13 + * 4.14 + * Redistribution and use in source and binary forms, with or without 4.15 + * modification, are permitted provided that the following conditions 4.16 + * are met: 4.17 + * 1. Redistributions of source code must retain the above copyright 4.18 + * notice, this list of conditions and the following disclaimer. 4.19 + * 2. Redistributions in binary form must reproduce the above copyright 4.20 + * notice, this list of conditions and the following disclaimer in the 4.21 + * documentation and/or other materials provided with the distribution. 4.22 + * 3. Neither the name of the University nor the names of its contributors 4.23 + * may be used to endorse or promote products derived from this software 4.24 + * without specific prior written permission. 4.25 + * 4.26 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 4.27 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 4.28 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 4.29 + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 4.30 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 4.31 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 4.32 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 4.33 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 4.34 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 4.35 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 4.36 + * SUCH DAMAGE. 4.37 + * 4.38 + * @(#)autoconf.c 7.1 (Berkeley) 5/9/91 4.39 + */ 4.40 + 4.41 +/* 4.42 + * Setup the system to run on the current machine. 4.43 + * 4.44 + * Configure() is called at boot time and initializes the vba 4.45 + * device tables and the memory controller monitoring. Available 4.46 + * devices are determined (from possibilities mentioned in ioconf.c), 4.47 + * and the drivers are initialized. 
4.48 + */ 4.49 + 4.50 +#include <sys/cdefs.h> 4.51 +__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $"); 4.52 + 4.53 +#include "opt_compat_oldboot.h" 4.54 +#include "opt_multiprocessor.h" 4.55 +#include "opt_nfs_boot.h" 4.56 +#include "xennet.h" 4.57 + 4.58 +#include <sys/param.h> 4.59 +#include <sys/systm.h> 4.60 +#include <sys/buf.h> 4.61 +#include <sys/disklabel.h> 4.62 +#include <sys/conf.h> 4.63 +#ifdef COMPAT_OLDBOOT 4.64 +#include <sys/reboot.h> 4.65 +#endif 4.66 +#include <sys/device.h> 4.67 +#include <sys/malloc.h> 4.68 +#include <sys/vnode.h> 4.69 +#include <sys/fcntl.h> 4.70 +#include <sys/dkio.h> 4.71 +#include <sys/proc.h> 4.72 +#include <sys/user.h> 4.73 + 4.74 +#ifdef NFS_BOOT_BOOTSTATIC 4.75 +#include <net/if.h> 4.76 +#include <net/if_ether.h> 4.77 +#include <netinet/in.h> 4.78 +#include <nfs/rpcv2.h> 4.79 +#include <nfs/nfsproto.h> 4.80 +#include <nfs/nfs.h> 4.81 +#include <nfs/nfsmount.h> 4.82 +#include <nfs/nfsdiskless.h> 4.83 +#include <machine/if_xennetvar.h> 4.84 +#endif 4.85 + 4.86 +#include <machine/pte.h> 4.87 +#include <machine/cpu.h> 4.88 +#include <machine/gdt.h> 4.89 +#include <machine/pcb.h> 4.90 +#include <machine/bootinfo.h> 4.91 + 4.92 +#include "ioapic.h" 4.93 +#include "lapic.h" 4.94 + 4.95 +#if NIOAPIC > 0 4.96 +#include <machine/i82093var.h> 4.97 +#endif 4.98 + 4.99 +#if NLAPIC > 0 4.100 +#include <machine/i82489var.h> 4.101 +#endif 4.102 + 4.103 +static int match_harddisk(struct device *, struct btinfo_bootdisk *); 4.104 +static void matchbiosdisks(void); 4.105 +static void findroot(void); 4.106 +static int is_valid_disk(struct device *); 4.107 + 4.108 +extern struct disklist *i386_alldisks; 4.109 +extern int i386_ndisks; 4.110 + 4.111 +#include "bios32.h" 4.112 +#if NBIOS32 > 0 4.113 +#include <machine/bios32.h> 4.114 +#endif 4.115 + 4.116 +#include "opt_pcibios.h" 4.117 +#ifdef PCIBIOS 4.118 +#include <dev/pci/pcireg.h> 4.119 +#include <dev/pci/pcivar.h> 4.120 +#include <i386/pci/pcibios.h> 4.121 +#endif 4.122 + 4.123 +#include "opt_kvm86.h" 4.124 +#ifdef KVM86 4.125 +#include <machine/kvm86.h> 4.126 +#endif 4.127 + 4.128 +#include "opt_xen.h" 4.129 + 4.130 +struct device *booted_device; 4.131 +int booted_partition; 4.132 + 4.133 +/* 4.134 + * Determine i/o configuration for a machine. 4.135 + */ 4.136 +void 4.137 +cpu_configure(void) 4.138 +{ 4.139 + 4.140 + startrtclock(); 4.141 + 4.142 +#if NBIOS32 > 0 4.143 + bios32_init(); 4.144 +#endif 4.145 +#ifdef PCIBIOS 4.146 + pcibios_init(); 4.147 +#endif 4.148 + 4.149 + /* kvm86 needs a TSS */ 4.150 + i386_proc0_tss_ldt_init(); 4.151 +#ifdef KVM86 4.152 + kvm86_init(); 4.153 +#endif 4.154 + 4.155 + if (config_rootfound("mainbus", NULL) == NULL) 4.156 + panic("configure: mainbus not configured"); 4.157 + 4.158 +#ifdef INTRDEBUG 4.159 + intr_printconfig(); 4.160 +#endif 4.161 + 4.162 +#if NIOAPIC > 0 4.163 + lapic_set_lvt(); 4.164 + ioapic_enable(); 4.165 +#endif 4.166 + /* resync cr0 after FPU configuration */ 4.167 + lwp0.l_addr->u_pcb.pcb_cr0 = rcr0(); 4.168 +#ifdef MULTIPROCESSOR 4.169 + /* propagate this to the idle pcb's. */ 4.170 + cpu_init_idle_pcbs(); 4.171 +#endif 4.172 + 4.173 + spl0(); 4.174 +#if NLAPIC > 0 4.175 + lapic_tpr = 0; 4.176 +#endif 4.177 +} 4.178 + 4.179 +void 4.180 +cpu_rootconf(void) 4.181 +{ 4.182 + findroot(); 4.183 + matchbiosdisks(); 4.184 + 4.185 + printf("boot device: %s\n", 4.186 + booted_device ? 
booted_device->dv_xname : "<unknown>"); 4.187 + 4.188 + setroot(booted_device, booted_partition); 4.189 +} 4.190 + 4.191 +/* 4.192 + * XXX ugly bit of code. But, this is the only safe time that the 4.193 + * match between BIOS disks and native disks can be done. 4.194 + */ 4.195 +static void 4.196 +matchbiosdisks(void) 4.197 +{ 4.198 + struct btinfo_biosgeom *big; 4.199 + struct bi_biosgeom_entry *be; 4.200 + struct device *dv; 4.201 + int i, ck, error, m, n; 4.202 + struct vnode *tv; 4.203 + char mbr[DEV_BSIZE]; 4.204 + int dklist_size; 4.205 + int bmajor; 4.206 + 4.207 + big = lookup_bootinfo(BTINFO_BIOSGEOM); 4.208 + 4.209 + if (big == NULL) 4.210 + return; 4.211 + 4.212 + /* 4.213 + * First, count all native disks 4.214 + */ 4.215 + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) 4.216 + if (is_valid_disk(dv)) 4.217 + i386_ndisks++; 4.218 + 4.219 + if (i386_ndisks == 0) 4.220 + return; 4.221 + 4.222 + dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) * 4.223 + sizeof (struct nativedisk_info); 4.224 + 4.225 + /* XXX M_TEMP is wrong */ 4.226 + i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT); 4.227 + if (i386_alldisks == NULL) 4.228 + return; 4.229 + 4.230 + memset(i386_alldisks, 0, dklist_size); 4.231 + 4.232 + i386_alldisks->dl_nnativedisks = i386_ndisks; 4.233 + i386_alldisks->dl_nbiosdisks = big->num; 4.234 + for (i = 0; i < big->num; i++) { 4.235 + i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev; 4.236 + i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec; 4.237 + i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head; 4.238 + i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl; 4.239 + i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec; 4.240 + i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags; 4.241 +#ifdef GEOM_DEBUG 4.242 +#ifdef NOTYET 4.243 + printf("disk %x: flags %x, interface %x, device %llx\n", 4.244 + big->disk[i].dev, big->disk[i].flags, 4.245 + big->disk[i].interface_path, big->disk[i].device_path); 4.246 +#endif 4.247 +#endif 4.248 + } 4.249 + 4.250 + /* 4.251 + * XXX code duplication from findroot() 4.252 + */ 4.253 + n = -1; 4.254 + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { 4.255 + if (dv->dv_class != DV_DISK) 4.256 + continue; 4.257 +#ifdef GEOM_DEBUG 4.258 + printf("matchbiosdisks: trying to match (%s) %s\n", 4.259 + dv->dv_xname, dv->dv_cfdata->cf_name); 4.260 +#endif 4.261 + if (is_valid_disk(dv)) { 4.262 + n++; 4.263 + sprintf(i386_alldisks->dl_nativedisks[n].ni_devname, 4.264 + "%s%d", dv->dv_cfdata->cf_name, 4.265 + dv->dv_unit); 4.266 + 4.267 + bmajor = devsw_name2blk(dv->dv_xname, NULL, 0); 4.268 + if (bmajor == -1) 4.269 + return; 4.270 + 4.271 + if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART), 4.272 + &tv)) 4.273 + panic("matchbiosdisks: can't alloc vnode"); 4.274 + 4.275 + error = VOP_OPEN(tv, FREAD, NOCRED, 0); 4.276 + if (error) { 4.277 + vput(tv); 4.278 + continue; 4.279 + } 4.280 + error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0, 4.281 + UIO_SYSSPACE, 0, NOCRED, NULL, 0); 4.282 + VOP_CLOSE(tv, FREAD, NOCRED, 0); 4.283 + if (error) { 4.284 +#ifdef GEOM_DEBUG 4.285 + printf("matchbiosdisks: %s: MBR read failure\n", 4.286 + dv->dv_xname); 4.287 +#endif 4.288 + continue; 4.289 + } 4.290 + 4.291 + for (ck = i = 0; i < DEV_BSIZE; i++) 4.292 + ck += mbr[i]; 4.293 + for (m = i = 0; i < big->num; i++) { 4.294 + be = &big->disk[i]; 4.295 +#ifdef GEOM_DEBUG 4.296 + printf("match %s with %d ", dv->dv_xname, i); 4.297 + printf("dev ck %x bios ck 
%x\n", ck, be->cksum); 4.298 +#endif 4.299 + if (be->flags & BI_GEOM_INVALID) 4.300 + continue; 4.301 + if (be->cksum == ck && 4.302 + !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts, 4.303 + MBR_PART_COUNT * 4.304 + sizeof (struct mbr_partition))) { 4.305 +#ifdef GEOM_DEBUG 4.306 + printf("matched bios disk %x with %s\n", 4.307 + be->dev, dv->dv_xname); 4.308 +#endif 4.309 + i386_alldisks->dl_nativedisks[n]. 4.310 + ni_biosmatches[m++] = i; 4.311 + } 4.312 + } 4.313 + i386_alldisks->dl_nativedisks[n].ni_nmatches = m; 4.314 + vput(tv); 4.315 + } 4.316 + } 4.317 +} 4.318 + 4.319 +#ifdef COMPAT_OLDBOOT 4.320 +u_long bootdev = 0; /* should be dev_t, but not until 32 bits */ 4.321 +#endif 4.322 + 4.323 +/* 4.324 + * helper function for "findroot()": 4.325 + * return nonzero if disk device matches bootinfo 4.326 + */ 4.327 +static int 4.328 +match_harddisk(struct device *dv, struct btinfo_bootdisk *bid) 4.329 +{ 4.330 + struct vnode *tmpvn; 4.331 + int error; 4.332 + struct disklabel label; 4.333 + int found = 0; 4.334 + int bmajor; 4.335 + 4.336 + /* 4.337 + * A disklabel is required here. The 4.338 + * bootblocks don't refuse to boot from 4.339 + * a disk without a label, but this is 4.340 + * normally not wanted. 4.341 + */ 4.342 + if (bid->labelsector == -1) 4.343 + return(0); 4.344 + 4.345 + /* 4.346 + * lookup major number for disk block device 4.347 + */ 4.348 + bmajor = devsw_name2blk(dv->dv_xname, NULL, 0); 4.349 + if (bmajor == -1) 4.350 + return(0); /* XXX panic() ??? */ 4.351 + 4.352 + /* 4.353 + * Fake a temporary vnode for the disk, open 4.354 + * it, and read the disklabel for comparison. 4.355 + */ 4.356 + if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn)) 4.357 + panic("findroot can't alloc vnode"); 4.358 + error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0); 4.359 + if (error) { 4.360 +#ifndef DEBUG 4.361 + /* 4.362 + * Ignore errors caused by missing 4.363 + * device, partition or medium. 4.364 + */ 4.365 + if (error != ENXIO && error != ENODEV) 4.366 +#endif 4.367 + printf("findroot: can't open dev %s%c (%d)\n", 4.368 + dv->dv_xname, 'a' + bid->partition, error); 4.369 + vput(tmpvn); 4.370 + return(0); 4.371 + } 4.372 + error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0); 4.373 + if (error) { 4.374 + /* 4.375 + * XXX can't happen - open() would 4.376 + * have errored out (or faked up one) 4.377 + */ 4.378 + printf("can't get label for dev %s%c (%d)\n", 4.379 + dv->dv_xname, 'a' + bid->partition, error); 4.380 + goto closeout; 4.381 + } 4.382 + 4.383 + /* compare with our data */ 4.384 + if (label.d_type == bid->label.type && 4.385 + label.d_checksum == bid->label.checksum && 4.386 + !strncmp(label.d_packname, bid->label.packname, 16)) 4.387 + found = 1; 4.388 + 4.389 +closeout: 4.390 + VOP_CLOSE(tmpvn, FREAD, NOCRED, 0); 4.391 + vput(tmpvn); 4.392 + return(found); 4.393 +} 4.394 + 4.395 +/* 4.396 + * Attempt to find the device from which we were booted. 4.397 + * If we can do so, and not instructed not to do so, 4.398 + * change rootdev to correspond to the load device. 
4.399 + */ 4.400 +void 4.401 +findroot(void) 4.402 +{ 4.403 + struct btinfo_bootdisk *bid; 4.404 + struct device *dv; 4.405 + union xen_cmdline_parseinfo xcp; 4.406 +#ifdef COMPAT_OLDBOOT 4.407 + int i, majdev, unit, part; 4.408 + char buf[32]; 4.409 +#endif 4.410 + 4.411 + if (booted_device) 4.412 + return; 4.413 + 4.414 + if (lookup_bootinfo(BTINFO_NETIF)) { 4.415 + /* 4.416 + * We got netboot interface information, but 4.417 + * "device_register()" couldn't match it to a configured 4.418 + * device. Bootdisk information cannot be present at the 4.419 + * same time, so give up. 4.420 + */ 4.421 + printf("findroot: netboot interface not found\n"); 4.422 + return; 4.423 + } 4.424 + 4.425 + bid = lookup_bootinfo(BTINFO_BOOTDISK); 4.426 + if (bid) { 4.427 + /* 4.428 + * Scan all disk devices for ones that match the passed data. 4.429 + * Don't break if one is found, to get possible multiple 4.430 + * matches - for problem tracking. Use the first match anyway 4.431 + * because lower device numbers are more likely to be the 4.432 + * boot device. 4.433 + */ 4.434 + for (dv = alldevs.tqh_first; dv != NULL; 4.435 + dv = dv->dv_list.tqe_next) { 4.436 + if (dv->dv_class != DV_DISK) 4.437 + continue; 4.438 + 4.439 + if (!strcmp(dv->dv_cfdata->cf_name, "fd")) { 4.440 + /* 4.441 + * Assume the configured unit number matches 4.442 + * the BIOS device number. (This is the old 4.443 + * behaviour.) Needs some ideas how to handle 4.444 + * BIOS's "swap floppy drive" options. 4.445 + */ 4.446 + if ((bid->biosdev & 0x80) || 4.447 + dv->dv_unit != bid->biosdev) 4.448 + continue; 4.449 + 4.450 + goto found; 4.451 + } 4.452 + 4.453 + if (is_valid_disk(dv)) { 4.454 + /* 4.455 + * Don't trust BIOS device numbers, try 4.456 + * to match the information passed by the 4.457 + * bootloader instead. 
4.458 + */ 4.459 + if ((bid->biosdev & 0x80) == 0 || 4.460 + !match_harddisk(dv, bid)) 4.461 + continue; 4.462 + 4.463 + goto found; 4.464 + } 4.465 + 4.466 + /* no "fd", "wd", "sd", "ld", "ed" */ 4.467 + continue; 4.468 + 4.469 +found: 4.470 + if (booted_device) { 4.471 + printf("warning: double match for boot " 4.472 + "device (%s, %s)\n", 4.473 + booted_device->dv_xname, dv->dv_xname); 4.474 + continue; 4.475 + } 4.476 + booted_device = dv; 4.477 + booted_partition = bid->partition; 4.478 + } 4.479 + 4.480 + if (booted_device) 4.481 + return; 4.482 + } 4.483 + 4.484 + xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); 4.485 + 4.486 + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { 4.487 + if (is_valid_disk(dv) == 0) 4.488 + continue; 4.489 + 4.490 + if (xcp.xcp_bootdev[0] == 0) { 4.491 + booted_device = dv; 4.492 + break; 4.493 + } 4.494 + 4.495 + if (strncmp(xcp.xcp_bootdev, dv->dv_xname, 4.496 + strlen(dv->dv_xname))) 4.497 + continue; 4.498 + 4.499 + if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) { 4.500 + booted_partition = toupper( 4.501 + xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A'; 4.502 + } 4.503 + 4.504 + booted_device = dv; 4.505 + break; 4.506 + } 4.507 + 4.508 + if (booted_device) 4.509 + return; 4.510 + 4.511 +#ifdef COMPAT_OLDBOOT 4.512 +#if 0 4.513 + printf("howto %x bootdev %x ", boothowto, bootdev); 4.514 +#endif 4.515 + 4.516 + if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC) 4.517 + return; 4.518 + 4.519 + majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK; 4.520 + name = devsw_blk2name(majdev); 4.521 + if (name == NULL) 4.522 + return; 4.523 + 4.524 + part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK; 4.525 + unit = (bootdev >> B_UNITSHIFT) & B_UNITMASK; 4.526 + 4.527 + sprintf(buf, "%s%d", name, unit); 4.528 + for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) { 4.529 + if (strcmp(buf, dv->dv_xname) == 0) { 4.530 + booted_device = dv; 4.531 + booted_partition = part; 4.532 + return; 4.533 + } 4.534 + } 4.535 +#endif 4.536 +} 4.537 + 4.538 +#include "pci.h" 4.539 + 4.540 +#include <dev/isa/isavar.h> 4.541 +#if NPCI > 0 4.542 +#include <dev/pci/pcivar.h> 4.543 +#endif 4.544 + 4.545 +void 4.546 +device_register(struct device *dev, void *aux) 4.547 +{ 4.548 + /* 4.549 + * Handle network interfaces here, the attachment information is 4.550 + * not available driver independantly later. 4.551 + * For disks, there is nothing useful available at attach time. 4.552 + */ 4.553 +#if NXENNET > 0 4.554 + if (dev->dv_class == DV_IFNET) { 4.555 + union xen_cmdline_parseinfo xcp; 4.556 + 4.557 + xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp); 4.558 + if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) { 4.559 +#ifdef NFS_BOOT_BOOTSTATIC 4.560 + nfs_bootstatic_callback = xennet_bootstatic_callback; 4.561 +#endif 4.562 + goto found; 4.563 + } 4.564 + } 4.565 +#endif 4.566 + if (dev->dv_class == DV_IFNET) { 4.567 + struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF); 4.568 + if (bin == NULL) 4.569 + return; 4.570 + 4.571 + /* 4.572 + * We don't check the driver name against the device name 4.573 + * passed by the boot ROM. The ROM should stay usable 4.574 + * if the driver gets obsoleted. 4.575 + * The physical attachment information (checked below) 4.576 + * must be sufficient to identify the device. 
4.577 + */ 4.578 + 4.579 + if (bin->bus == BI_BUS_ISA && 4.580 + !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) { 4.581 + struct isa_attach_args *iaa = aux; 4.582 + 4.583 + /* compare IO base address */ 4.584 + /* XXXJRT what about multiple I/O addrs? */ 4.585 + if (iaa->ia_nio > 0 && 4.586 + bin->addr.iobase == iaa->ia_io[0].ir_addr) 4.587 + goto found; 4.588 + } 4.589 +#if NPCI > 0 4.590 + if (bin->bus == BI_BUS_PCI && 4.591 + !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) { 4.592 + struct pci_attach_args *paa = aux; 4.593 + int b, d, f; 4.594 + 4.595 + /* 4.596 + * Calculate BIOS representation of: 4.597 + * 4.598 + * <bus,device,function> 4.599 + * 4.600 + * and compare. 4.601 + */ 4.602 + pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f); 4.603 + if (bin->addr.tag == ((b << 8) | (d << 3) | f)) 4.604 + goto found; 4.605 + } 4.606 +#endif 4.607 + } 4.608 + return; 4.609 + 4.610 +found: 4.611 + if (booted_device) { 4.612 + /* XXX should be a "panic()" */ 4.613 + printf("warning: double match for boot device (%s, %s)\n", 4.614 + booted_device->dv_xname, dev->dv_xname); 4.615 + return; 4.616 + } 4.617 + booted_device = dev; 4.618 +} 4.619 + 4.620 +static int 4.621 +is_valid_disk(struct device *dv) 4.622 +{ 4.623 + const char *name; 4.624 + 4.625 + if (dv->dv_class != DV_DISK) 4.626 + return (0); 4.627 + 4.628 + name = dv->dv_cfdata->cf_name; 4.629 + 4.630 + return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 || 4.631 + strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 || 4.632 + strcmp(name, "xbd") == 0); 4.633 +}
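Note on the autoconf.c hunk above: findroot() derives the boot partition from the Xen command line by treating any character following the matched device name in xcp_bootdev as a partition letter (booted_partition = toupper(...) - 'A'). A minimal standalone sketch of that string handling, with hypothetical device names and no kernel headers:

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /*
     * Sketch only: mirror how findroot() matches xcp_bootdev ("xbd0a")
     * against a device name ("xbd0") and turns the trailing letter into
     * a partition index.  Returns -1 when the device name does not match.
     */
    static int
    bootdev_partition(const char *bootdev, const char *dv_xname)
    {
    	size_t len = strlen(dv_xname);

    	if (strncmp(bootdev, dv_xname, len) != 0)
    		return -1;
    	if (strlen(bootdev) > len)
    		return toupper((unsigned char)bootdev[len]) - 'A';
    	return 0;	/* bare device name: partition "a" */
    }

    int
    main(void)
    {
    	printf("%d\n", bootdev_partition("xbd0a", "xbd0"));	/* 0 */
    	printf("%d\n", bootdev_partition("xbd0e", "xbd0"));	/* 4 */
    	printf("%d\n", bootdev_partition("wd0a", "xbd0"));	/* -1 */
    	return 0;
    }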
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c Mon Sep 06 19:04:16 2004 +0000 5.3 @@ -0,0 +1,408 @@ 5.4 +/* $NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $ */ 5.5 +/* NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp */ 5.6 + 5.7 +/*- 5.8 + * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc. 5.9 + * All rights reserved. 5.10 + * 5.11 + * This code is derived from software contributed to The NetBSD Foundation 5.12 + * by John T. Kohl and Charles M. Hannum. 5.13 + * 5.14 + * Redistribution and use in source and binary forms, with or without 5.15 + * modification, are permitted provided that the following conditions 5.16 + * are met: 5.17 + * 1. Redistributions of source code must retain the above copyright 5.18 + * notice, this list of conditions and the following disclaimer. 5.19 + * 2. Redistributions in binary form must reproduce the above copyright 5.20 + * notice, this list of conditions and the following disclaimer in the 5.21 + * documentation and/or other materials provided with the distribution. 5.22 + * 3. All advertising materials mentioning features or use of this software 5.23 + * must display the following acknowledgement: 5.24 + * This product includes software developed by the NetBSD 5.25 + * Foundation, Inc. and its contributors. 5.26 + * 4. Neither the name of The NetBSD Foundation nor the names of its 5.27 + * contributors may be used to endorse or promote products derived 5.28 + * from this software without specific prior written permission. 5.29 + * 5.30 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 5.31 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 5.32 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 5.33 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 5.34 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 5.35 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 5.36 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 5.37 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 5.38 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 5.39 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 5.40 + * POSSIBILITY OF SUCH DAMAGE. 5.41 + */ 5.42 + 5.43 +#include <sys/cdefs.h> 5.44 +__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $"); 5.45 + 5.46 +#include "opt_multiprocessor.h" 5.47 +#include "opt_xen.h" 5.48 + 5.49 +#include <sys/param.h> 5.50 +#include <sys/systm.h> 5.51 +#include <sys/proc.h> 5.52 +#include <sys/lock.h> 5.53 +#include <sys/user.h> 5.54 + 5.55 +#include <uvm/uvm.h> 5.56 + 5.57 +#include <machine/gdt.h> 5.58 + 5.59 +int gdt_size[2]; /* total number of GDT entries */ 5.60 +int gdt_count[2]; /* number of GDT entries in use */ 5.61 +int gdt_next[2]; /* next available slot for sweeping */ 5.62 +int gdt_free[2]; /* next free slot; terminated with GNULL_SEL */ 5.63 + 5.64 +struct lock gdt_lock_store; 5.65 + 5.66 +static __inline void gdt_lock(void); 5.67 +static __inline void gdt_unlock(void); 5.68 +void gdt_init(void); 5.69 +void gdt_grow(int); 5.70 +int gdt_get_slot(void); 5.71 +int gdt_get_slot1(int); 5.72 +void gdt_put_slot(int); 5.73 +void gdt_put_slot1(int, int); 5.74 + 5.75 +/* 5.76 + * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep 5.77 + * waiting for memory. 
5.78 + * 5.79 + * Note that the locking done here is not sufficient for multiprocessor 5.80 + * systems. A freshly allocated slot will still be of type SDT_SYSNULL for 5.81 + * some time after the GDT is unlocked, so gdt_compact() could attempt to 5.82 + * reclaim it. 5.83 + */ 5.84 +static __inline void 5.85 +gdt_lock() 5.86 +{ 5.87 + 5.88 + (void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL); 5.89 +} 5.90 + 5.91 +static __inline void 5.92 +gdt_unlock() 5.93 +{ 5.94 + 5.95 + (void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL); 5.96 +} 5.97 + 5.98 +void 5.99 +setgdt(int sel, void *base, size_t limit, 5.100 + int type, int dpl, int def32, int gran) 5.101 +{ 5.102 + struct segment_descriptor sd; 5.103 + CPU_INFO_ITERATOR cii; 5.104 + struct cpu_info *ci; 5.105 + 5.106 + if (type == SDT_SYS386TSS) { 5.107 + /* printk("XXX TSS descriptor not supported in GDT\n"); */ 5.108 + return; 5.109 + } 5.110 + 5.111 + setsegment(&sd, base, limit, type, dpl, def32, gran); 5.112 + for (CPU_INFO_FOREACH(cii, ci)) { 5.113 + if (ci->ci_gdt != NULL) { 5.114 +#ifndef XEN 5.115 + ci->ci_gdt[sel].sd = sd; 5.116 +#else 5.117 + xen_update_descriptor(&ci->ci_gdt[sel], 5.118 + (union descriptor *)&sd); 5.119 +#endif 5.120 + } 5.121 + } 5.122 +} 5.123 + 5.124 +/* 5.125 + * Initialize the GDT subsystem. Called from autoconf(). 5.126 + */ 5.127 +void 5.128 +gdt_init() 5.129 +{ 5.130 + size_t max_len, min_len; 5.131 + union descriptor *old_gdt; 5.132 + struct vm_page *pg; 5.133 + vaddr_t va; 5.134 + struct cpu_info *ci = &cpu_info_primary; 5.135 + 5.136 + lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0); 5.137 + 5.138 + max_len = MAXGDTSIZ * sizeof(gdt[0]); 5.139 + min_len = MINGDTSIZ * sizeof(gdt[0]); 5.140 + 5.141 + gdt_size[0] = MINGDTSIZ; 5.142 + gdt_count[0] = NGDT; 5.143 + gdt_next[0] = NGDT; 5.144 + gdt_free[0] = GNULL_SEL; 5.145 + 5.146 + gdt_size[1] = 0; 5.147 + gdt_count[1] = MAXGDTSIZ; 5.148 + gdt_next[1] = MAXGDTSIZ; 5.149 + gdt_free[1] = GNULL_SEL; 5.150 + 5.151 + old_gdt = gdt; 5.152 + gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len); 5.153 + for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) { 5.154 + pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO); 5.155 + if (pg == NULL) { 5.156 + panic("gdt_init: no pages"); 5.157 + } 5.158 + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), 5.159 + VM_PROT_READ | VM_PROT_WRITE); 5.160 + } 5.161 + memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0])); 5.162 + ci->ci_gdt = gdt; 5.163 + setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1, 5.164 + SDT_MEMRWA, SEL_KPL, 1, 1); 5.165 + 5.166 + gdt_init_cpu(ci); 5.167 +} 5.168 + 5.169 +/* 5.170 + * Allocate shadow GDT for a slave CPU. 
5.171 + */ 5.172 +void 5.173 +gdt_alloc_cpu(struct cpu_info *ci) 5.174 +{ 5.175 + int max_len = MAXGDTSIZ * sizeof(gdt[0]); 5.176 + int min_len = MINGDTSIZ * sizeof(gdt[0]); 5.177 + struct vm_page *pg; 5.178 + vaddr_t va; 5.179 + 5.180 + ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len); 5.181 + for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len; 5.182 + va += PAGE_SIZE) { 5.183 + while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) 5.184 + == NULL) { 5.185 + uvm_wait("gdt_alloc_cpu"); 5.186 + } 5.187 + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), 5.188 + VM_PROT_READ | VM_PROT_WRITE); 5.189 + } 5.190 + memset(ci->ci_gdt, 0, min_len); 5.191 + memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0])); 5.192 + setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1, 5.193 + SDT_MEMRWA, SEL_KPL, 1, 1); 5.194 +} 5.195 + 5.196 + 5.197 +/* 5.198 + * Load appropriate gdt descriptor; we better be running on *ci 5.199 + * (for the most part, this is how a CPU knows who it is). 5.200 + */ 5.201 +void 5.202 +gdt_init_cpu(struct cpu_info *ci) 5.203 +{ 5.204 +#ifndef XEN 5.205 + struct region_descriptor region; 5.206 + size_t max_len; 5.207 + 5.208 + max_len = MAXGDTSIZ * sizeof(gdt[0]); 5.209 + setregion(®ion, ci->ci_gdt, max_len - 1); 5.210 + lgdt(®ion); 5.211 +#else 5.212 + size_t len = gdt_size[0] * sizeof(gdt[0]); 5.213 + unsigned long frames[len >> PAGE_SHIFT]; 5.214 + vaddr_t va; 5.215 + pt_entry_t *ptp; 5.216 + pt_entry_t *maptp; 5.217 + int f; 5.218 + 5.219 + for (va = (vaddr_t)ci->ci_gdt, f = 0; 5.220 + va < (vaddr_t)ci->ci_gdt + len; 5.221 + va += PAGE_SIZE, f++) { 5.222 + KASSERT(va >= VM_MIN_KERNEL_ADDRESS); 5.223 + ptp = kvtopte(va); 5.224 + frames[f] = *ptp >> PAGE_SHIFT; 5.225 + maptp = (pt_entry_t *)vtomach((vaddr_t)ptp); 5.226 + PTE_CLEARBITS(ptp, maptp, PG_RW); 5.227 + } 5.228 + PTE_UPDATES_FLUSH(); 5.229 + /* printk("loading gdt %x, %d entries, %d pages", */ 5.230 + /* frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */ 5.231 + if (HYPERVISOR_set_gdt(frames, gdt_size[0])) 5.232 + panic("HYPERVISOR_set_gdt failed!\n"); 5.233 + lgdt_finish(); 5.234 +#endif 5.235 +} 5.236 + 5.237 +#ifdef MULTIPROCESSOR 5.238 + 5.239 +void 5.240 +gdt_reload_cpu(struct cpu_info *ci) 5.241 +{ 5.242 + struct region_descriptor region; 5.243 + size_t max_len; 5.244 + 5.245 + max_len = MAXGDTSIZ * sizeof(gdt[0]); 5.246 + setregion(®ion, ci->ci_gdt, max_len - 1); 5.247 + lgdt(®ion); 5.248 +} 5.249 +#endif 5.250 + 5.251 + 5.252 +/* 5.253 + * Grow the GDT. 
5.254 + */ 5.255 +void 5.256 +gdt_grow(int which) 5.257 +{ 5.258 + size_t old_len, new_len, max_len; 5.259 + CPU_INFO_ITERATOR cii; 5.260 + struct cpu_info *ci; 5.261 + struct vm_page *pg; 5.262 + vaddr_t va; 5.263 + 5.264 + old_len = gdt_size[which] * sizeof(gdt[0]); 5.265 + gdt_size[which] <<= 1; 5.266 + new_len = old_len << 1; 5.267 + 5.268 + if (which != 0) { 5.269 + max_len = MAXGDTSIZ * sizeof(gdt[0]); 5.270 + if (old_len == 0) { 5.271 + gdt_size[which] = MINGDTSIZ; 5.272 + new_len = gdt_size[which] * sizeof(gdt[0]); 5.273 + } 5.274 + for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len; 5.275 + va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len; 5.276 + va += PAGE_SIZE) { 5.277 + while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) == 5.278 + NULL) { 5.279 + uvm_wait("gdt_grow"); 5.280 + } 5.281 + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), 5.282 + VM_PROT_READ | VM_PROT_WRITE); 5.283 + } 5.284 + return; 5.285 + } 5.286 + 5.287 + for (CPU_INFO_FOREACH(cii, ci)) { 5.288 + for (va = (vaddr_t)(ci->ci_gdt) + old_len; 5.289 + va < (vaddr_t)(ci->ci_gdt) + new_len; 5.290 + va += PAGE_SIZE) { 5.291 + while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) == 5.292 + NULL) { 5.293 + uvm_wait("gdt_grow"); 5.294 + } 5.295 + pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), 5.296 + VM_PROT_READ | VM_PROT_WRITE); 5.297 + } 5.298 + } 5.299 +} 5.300 + 5.301 +/* 5.302 + * Allocate a GDT slot as follows: 5.303 + * 1) If there are entries on the free list, use those. 5.304 + * 2) If there are fewer than gdt_size entries in use, there are free slots 5.305 + * near the end that we can sweep through. 5.306 + * 3) As a last resort, we increase the size of the GDT, and sweep through 5.307 + * the new slots. 5.308 + */ 5.309 +int 5.310 +gdt_get_slot() 5.311 +{ 5.312 + return gdt_get_slot1(0); 5.313 +} 5.314 + 5.315 +int 5.316 +gdt_get_slot1(int which) 5.317 +{ 5.318 + size_t offset; 5.319 + int slot; 5.320 + 5.321 + gdt_lock(); 5.322 + 5.323 + if (gdt_free[which] != GNULL_SEL) { 5.324 + slot = gdt_free[which]; 5.325 + gdt_free[which] = gdt[slot].gd.gd_selector; 5.326 + } else { 5.327 + offset = which * MAXGDTSIZ * sizeof(gdt[0]); 5.328 + if (gdt_next[which] != gdt_count[which] + offset) 5.329 + panic("gdt_get_slot botch 1"); 5.330 + if (gdt_next[which] - offset >= gdt_size[which]) { 5.331 + if (gdt_size[which] >= MAXGDTSIZ) 5.332 + panic("gdt_get_slot botch 2"); 5.333 + gdt_grow(which); 5.334 + } 5.335 + slot = gdt_next[which]++; 5.336 + } 5.337 + 5.338 + gdt_count[which]++; 5.339 + gdt_unlock(); 5.340 + return (slot); 5.341 +} 5.342 + 5.343 +/* 5.344 + * Deallocate a GDT slot, putting it on the free list. 5.345 + */ 5.346 +void 5.347 +gdt_put_slot(int slot) 5.348 +{ 5.349 + gdt_put_slot1(slot, 0); 5.350 +} 5.351 + 5.352 +void 5.353 +gdt_put_slot1(int slot, int which) 5.354 +{ 5.355 + 5.356 + gdt_lock(); 5.357 + gdt_count[which]--; 5.358 + 5.359 + gdt[slot].gd.gd_type = SDT_SYSNULL; 5.360 + gdt[slot].gd.gd_selector = gdt_free[which]; 5.361 + gdt_free[which] = slot; 5.362 + 5.363 + gdt_unlock(); 5.364 +} 5.365 + 5.366 +int 5.367 +tss_alloc(struct pcb *pcb) 5.368 +{ 5.369 + int slot; 5.370 + 5.371 + slot = gdt_get_slot(); 5.372 + setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1, 5.373 + SDT_SYS386TSS, SEL_KPL, 0, 0); 5.374 + return GSEL(slot, SEL_KPL); 5.375 +} 5.376 + 5.377 +void 5.378 +tss_free(int sel) 5.379 +{ 5.380 + 5.381 + gdt_put_slot(IDXSEL(sel)); 5.382 +} 5.383 + 5.384 +/* 5.385 + * Caller must have pmap locked for both of these functions. 
5.386 + */ 5.387 +void 5.388 +ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len) 5.389 +{ 5.390 + int slot; 5.391 + 5.392 + slot = gdt_get_slot1(1); 5.393 +#ifndef XEN 5.394 + setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0); 5.395 +#else 5.396 + cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt; 5.397 + cpu_info_primary.ci_gdt[slot].ld.ld_entries = 5.398 + len / sizeof(union descriptor); 5.399 +#endif 5.400 + pmap->pm_ldt_sel = GSEL(slot, SEL_KPL); 5.401 +} 5.402 + 5.403 +void 5.404 +ldt_free(struct pmap *pmap) 5.405 +{ 5.406 + int slot; 5.407 + 5.408 + slot = IDXSEL(pmap->pm_ldt_sel); 5.409 + 5.410 + gdt_put_slot1(slot, 1); 5.411 +}
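Note on the gdt.c hunk above: gdt_get_slot1() and gdt_put_slot1() recycle descriptors by threading a free list through the unused GDT entries themselves, with gd_selector holding the index of the next free slot and gdt_next[] sweeping into fresh slots (gdt_grow() runs when the sweep reaches the current size). A minimal userland model of that allocation scheme, using a plain integer array in place of the descriptor table:

    #include <stdio.h>

    #define NSLOTS	8
    #define NIL	(-1)

    static int slot_next[NSLOTS];	/* stand-in for gdt[slot].gd.gd_selector */
    static int free_head = NIL;		/* stand-in for gdt_free[] */
    static int next_unused = 0;		/* stand-in for gdt_next[] */

    /* Allocate a slot: reuse a freed one first, otherwise sweep forward. */
    static int
    slot_get(void)
    {
    	int slot;

    	if (free_head != NIL) {
    		slot = free_head;
    		free_head = slot_next[slot];
    	} else {
    		slot = next_unused++;	/* the real code grows the GDT here */
    	}
    	return slot;
    }

    /* Free a slot by pushing it onto the list threaded through the slots. */
    static void
    slot_put(int slot)
    {
    	slot_next[slot] = free_head;
    	free_head = slot;
    }

    int
    main(void)
    {
    	int a = slot_get(), b = slot_get();

    	slot_put(a);
    	printf("%d %d %d\n", a, b, slot_get());	/* third call reuses a */
    	return 0;
    }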
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c Mon Sep 06 19:04:16 2004 +0000 6.3 @@ -0,0 +1,230 @@ 6.4 +/* $NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $ */ 6.5 + 6.6 +/* 6.7 + * 6.8 + * Copyright (c) 2004 Christian Limpach. 6.9 + * All rights reserved. 6.10 + * 6.11 + * Redistribution and use in source and binary forms, with or without 6.12 + * modification, are permitted provided that the following conditions 6.13 + * are met: 6.14 + * 1. Redistributions of source code must retain the above copyright 6.15 + * notice, this list of conditions and the following disclaimer. 6.16 + * 2. Redistributions in binary form must reproduce the above copyright 6.17 + * notice, this list of conditions and the following disclaimer in the 6.18 + * documentation and/or other materials provided with the distribution. 6.19 + * 3. All advertising materials mentioning features or use of this software 6.20 + * must display the following acknowledgement: 6.21 + * This product includes software developed by Christian Limpach. 6.22 + * 4. The name of the author may not be used to endorse or promote products 6.23 + * derived from this software without specific prior written permission. 6.24 + * 6.25 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 6.26 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 6.27 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 6.28 + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 6.29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 6.30 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 6.31 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 6.32 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 6.33 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 6.34 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 6.35 + */ 6.36 + 6.37 +/****************************************************************************** 6.38 + * hypervisor.c 6.39 + * 6.40 + * Communication to/from hypervisor. 6.41 + * 6.42 + * Copyright (c) 2002-2004, K A Fraser 6.43 + * 6.44 + * Permission is hereby granted, free of charge, to any person obtaining a copy 6.45 + * of this software and associated documentation files (the "Software"), to 6.46 + * deal in the Software without restriction, including without limitation the 6.47 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 6.48 + * sell copies of the Software, and to permit persons to whom the Software is 6.49 + * furnished to do so, subject to the following conditions: 6.50 + * 6.51 + * The above copyright notice and this permission notice shall be included in 6.52 + * all copies or substantial portions of the Software. 6.53 + * 6.54 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 6.55 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 6.56 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 6.57 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 6.58 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 6.59 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 6.60 + * DEALINGS IN THE SOFTWARE. 
6.61 + */ 6.62 + 6.63 + 6.64 +#include <sys/cdefs.h> 6.65 +__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $"); 6.66 + 6.67 +#include <sys/cdefs.h> 6.68 +#include <sys/param.h> 6.69 +#include <sys/systm.h> 6.70 + 6.71 +#include <machine/xen.h> 6.72 +#include <machine/hypervisor.h> 6.73 +#include <machine/evtchn.h> 6.74 + 6.75 +/* 6.76 + * Force a proper event-channel callback from Xen after clearing the 6.77 + * callback mask. We do this in a very simple manner, by making a call 6.78 + * down into Xen. The pending flag will be checked by Xen on return. 6.79 + */ 6.80 +void 6.81 +hypervisor_force_callback(void) 6.82 +{ 6.83 + 6.84 + (void)HYPERVISOR_xen_version(0); 6.85 +} 6.86 + 6.87 +int stipending(void); 6.88 +int 6.89 +stipending() 6.90 +{ 6.91 + uint32_t l1; 6.92 + unsigned long l2; 6.93 + unsigned int l1i, l2i, port; 6.94 + int irq; 6.95 + shared_info_t *s = HYPERVISOR_shared_info; 6.96 + struct cpu_info *ci; 6.97 + int ret; 6.98 + 6.99 + ret = 0; 6.100 + ci = curcpu(); 6.101 + 6.102 +#if 0 6.103 + if (HYPERVISOR_shared_info->events) 6.104 + printf("stipending events %08lx mask %08lx ilevel %d\n", 6.105 + HYPERVISOR_shared_info->events, 6.106 + HYPERVISOR_shared_info->events_mask, ci->ci_ilevel); 6.107 +#endif 6.108 + 6.109 + /* 6.110 + * we're only called after STIC, so we know that we'll have to 6.111 + * STI at the end 6.112 + */ 6.113 + cli(); 6.114 + while (s->vcpu_data[0].evtchn_upcall_pending) { 6.115 + s->vcpu_data[0].evtchn_upcall_pending = 0; 6.116 + /* NB. No need for a barrier here -- XCHG is a barrier 6.117 + * on x86. */ 6.118 + l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0); 6.119 + while ((l1i = ffs(l1)) != 0) { 6.120 + l1i--; 6.121 + l1 &= ~(1 << l1i); 6.122 + 6.123 + l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i]; 6.124 + while ((l2i = ffs(l2)) != 0) { 6.125 + l2i--; 6.126 + l2 &= ~(1 << l2i); 6.127 + 6.128 + port = (l1i << 5) + l2i; 6.129 + if ((irq = evtchn_to_irq[port]) != -1) { 6.130 + hypervisor_acknowledge_irq(irq); 6.131 + ci->ci_ipending |= (1 << irq); 6.132 + if (ret == 0 && ci->ci_ilevel < 6.133 + ci->ci_isources[irq]->is_handlers 6.134 + ->ih_level) 6.135 + ret = 1; 6.136 + } 6.137 +#if 0 /* XXXcl dev/evtchn */ 6.138 + else 6.139 + evtchn_device_upcall(port); 6.140 +#endif 6.141 + } 6.142 + } 6.143 + } 6.144 + sti(); 6.145 + 6.146 +#if 0 6.147 + if (ci->ci_ipending & 0x1) 6.148 + printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n", 6.149 + HYPERVISOR_shared_info->events, 6.150 + HYPERVISOR_shared_info->events_mask, ci->ci_ilevel, 6.151 + ci->ci_ipending); 6.152 +#endif 6.153 + 6.154 + return (ret); 6.155 +} 6.156 + 6.157 +void do_hypervisor_callback(struct trapframe *regs) 6.158 +{ 6.159 + uint32_t l1; 6.160 + unsigned long l2; 6.161 + unsigned int l1i, l2i, port; 6.162 + int irq; 6.163 + shared_info_t *s = HYPERVISOR_shared_info; 6.164 + struct cpu_info *ci; 6.165 + int level; 6.166 + 6.167 + ci = curcpu(); 6.168 + level = ci->ci_ilevel; 6.169 + 6.170 + while (s->vcpu_data[0].evtchn_upcall_pending) { 6.171 + s->vcpu_data[0].evtchn_upcall_pending = 0; 6.172 + /* NB. No need for a barrier here -- XCHG is a barrier 6.173 + * on x86. 
*/ 6.174 + l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0); 6.175 + while ((l1i = ffs(l1)) != 0) { 6.176 + l1i--; 6.177 + l1 &= ~(1 << l1i); 6.178 + 6.179 + l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i]; 6.180 + while ((l2i = ffs(l2)) != 0) { 6.181 + l2i--; 6.182 + l2 &= ~(1 << l2i); 6.183 + 6.184 + port = (l1i << 5) + l2i; 6.185 + if ((irq = evtchn_to_irq[port]) != -1) 6.186 + do_event(irq, regs); 6.187 +#if 0 /* XXXcl dev/evtchn */ 6.188 + else 6.189 + evtchn_device_upcall(port); 6.190 +#endif 6.191 + } 6.192 + } 6.193 + } 6.194 + 6.195 +#ifdef DIAGNOSTIC 6.196 + if (level != ci->ci_ilevel) 6.197 + printf("hypervisor done %08x level %d/%d ipending %08x\n", 6.198 + HYPERVISOR_shared_info->evtchn_pending_sel, level, 6.199 + ci->ci_ilevel, ci->ci_ipending); 6.200 +#endif 6.201 +} 6.202 + 6.203 +void hypervisor_unmask_event(unsigned int ev) 6.204 +{ 6.205 + shared_info_t *s = HYPERVISOR_shared_info; 6.206 + 6.207 + x86_atomic_clear_bit(&s->evtchn_mask[0], ev); 6.208 + /* 6.209 + * The following is basically the equivalent of 6.210 + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the 6.211 + * interrupt edge' if the channel is masked. 6.212 + */ 6.213 + if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) && 6.214 + !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) { 6.215 + s->vcpu_data[0].evtchn_upcall_pending = 1; 6.216 + if (!s->vcpu_data[0].evtchn_upcall_mask) 6.217 + hypervisor_force_callback(); 6.218 + } 6.219 +} 6.220 + 6.221 +void hypervisor_mask_event(unsigned int ev) 6.222 +{ 6.223 + shared_info_t *s = HYPERVISOR_shared_info; 6.224 + 6.225 + x86_atomic_set_bit(&s->evtchn_mask[0], ev); 6.226 +} 6.227 + 6.228 +void hypervisor_clear_event(unsigned int ev) 6.229 +{ 6.230 + shared_info_t *s = HYPERVISOR_shared_info; 6.231 + 6.232 + x86_atomic_clear_bit(&s->evtchn_pending[0], ev); 6.233 +}
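(Editor's note on the file above.) The stipending()/do_hypervisor_callback() loops in hypervisor_machdep.c walk Xen's two-level event-channel bitmap: evtchn_pending_sel selects 32-bit pending words, each word is filtered through evtchn_mask, and the port number is rebuilt as (word index << 5) + bit index. Below is a minimal, hedged user-space C sketch of that scan only; struct fake_shared_info, NR_WORDS and scan_pending are illustrative stand-ins and do not appear in this patch.

/*
 * Illustrative sketch of the two-level event-channel scan used above.
 * Layout is simplified; only the selector -> pending-word -> port
 * arithmetic mirrors the kernel code.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>        /* ffs() */
#include <stdint.h>

#define NR_WORDS 32          /* 32 words x 32 bits = 1024 ports (illustrative) */

struct fake_shared_info {
    uint32_t evtchn_pending_sel;        /* one bit per pending word below */
    uint32_t evtchn_pending[NR_WORDS];
    uint32_t evtchn_mask[NR_WORDS];
};

/* Scan like stipending(): outer loop over the selector word, inner loop
 * over unmasked pending bits; port = (word index << 5) + bit index. */
static void
scan_pending(struct fake_shared_info *s)
{
    uint32_t l1 = s->evtchn_pending_sel;
    s->evtchn_pending_sel = 0;          /* the real code uses an atomic xchg */
    int l1i, l2i;

    while ((l1i = ffs(l1)) != 0) {
        l1i--;
        l1 &= ~(1U << l1i);

        uint32_t l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
        while ((l2i = ffs(l2)) != 0) {
            l2i--;
            l2 &= ~(1U << l2i);
            printf("pending event channel port %d\n", (l1i << 5) + l2i);
        }
    }
}

int
main(void)
{
    struct fake_shared_info s;
    memset(&s, 0, sizeof(s));

    s.evtchn_pending[0] |= 1U << 3;     /* port 3 pending ... */
    s.evtchn_mask[0]    |= 1U << 3;     /* ... but masked */
    s.evtchn_pending[2] |= 1U << 6;     /* port 2*32 + 6 = 70 pending */
    s.evtchn_pending_sel = (1U << 0) | (1U << 2);

    scan_pending(&s);                   /* prints only port 70 */
    return 0;
}

Clearing each bit as soon as ffs() reports it keeps both loops proportional to the number of set bits rather than to the word width, which is why the kernel routines above use the same shape twice.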
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S Mon Sep 06 19:04:16 2004 +0000 7.3 @@ -0,0 +1,2000 @@ 7.4 +/* $NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $ */ 7.5 +/* NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp */ 7.6 + 7.7 +/*- 7.8 + * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc. 7.9 + * All rights reserved. 7.10 + * 7.11 + * This code is derived from software contributed to The NetBSD Foundation 7.12 + * by Charles M. Hannum. 7.13 + * 7.14 + * Redistribution and use in source and binary forms, with or without 7.15 + * modification, are permitted provided that the following conditions 7.16 + * are met: 7.17 + * 1. Redistributions of source code must retain the above copyright 7.18 + * notice, this list of conditions and the following disclaimer. 7.19 + * 2. Redistributions in binary form must reproduce the above copyright 7.20 + * notice, this list of conditions and the following disclaimer in the 7.21 + * documentation and/or other materials provided with the distribution. 7.22 + * 3. All advertising materials mentioning features or use of this software 7.23 + * must display the following acknowledgement: 7.24 + * This product includes software developed by the NetBSD 7.25 + * Foundation, Inc. and its contributors. 7.26 + * 4. Neither the name of The NetBSD Foundation nor the names of its 7.27 + * contributors may be used to endorse or promote products derived 7.28 + * from this software without specific prior written permission. 7.29 + * 7.30 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 7.31 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 7.32 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 7.33 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 7.34 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 7.35 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 7.36 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 7.37 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 7.38 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 7.39 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 7.40 + * POSSIBILITY OF SUCH DAMAGE. 7.41 + */ 7.42 + 7.43 +/*- 7.44 + * Copyright (c) 1990 The Regents of the University of California. 7.45 + * All rights reserved. 7.46 + * 7.47 + * This code is derived from software contributed to Berkeley by 7.48 + * William Jolitz. 7.49 + * 7.50 + * Redistribution and use in source and binary forms, with or without 7.51 + * modification, are permitted provided that the following conditions 7.52 + * are met: 7.53 + * 1. Redistributions of source code must retain the above copyright 7.54 + * notice, this list of conditions and the following disclaimer. 7.55 + * 2. Redistributions in binary form must reproduce the above copyright 7.56 + * notice, this list of conditions and the following disclaimer in the 7.57 + * documentation and/or other materials provided with the distribution. 7.58 + * 3. Neither the name of the University nor the names of its contributors 7.59 + * may be used to endorse or promote products derived from this software 7.60 + * without specific prior written permission. 
7.61 + * 7.62 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 7.63 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 7.64 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 7.65 + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 7.66 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 7.67 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 7.68 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 7.69 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 7.70 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 7.71 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 7.72 + * SUCH DAMAGE. 7.73 + * 7.74 + * @(#)locore.s 7.3 (Berkeley) 5/13/91 7.75 + */ 7.76 + 7.77 +#include "opt_compat_netbsd.h" 7.78 +#include "opt_compat_oldboot.h" 7.79 +#include "opt_cputype.h" 7.80 +#include "opt_ddb.h" 7.81 +#include "opt_ipkdb.h" 7.82 +#include "opt_lockdebug.h" 7.83 +#include "opt_multiprocessor.h" 7.84 +#include "opt_realmem.h" 7.85 +#include "opt_user_ldt.h" 7.86 +#include "opt_vm86.h" 7.87 +#include "opt_xen.h" 7.88 + 7.89 +#include "npx.h" 7.90 +#include "assym.h" 7.91 +#include "apm.h" 7.92 +#include "lapic.h" 7.93 +#include "ioapic.h" 7.94 +#include "ksyms.h" 7.95 + 7.96 +#include <sys/errno.h> 7.97 +#include <sys/syscall.h> 7.98 + 7.99 +#include <machine/cputypes.h> 7.100 +#include <machine/param.h> 7.101 +#include <machine/pte.h> 7.102 +#include <machine/segments.h> 7.103 +#include <machine/specialreg.h> 7.104 +#include <machine/trap.h> 7.105 +#include <machine/bootinfo.h> 7.106 + 7.107 +#if NLAPIC > 0 7.108 +#include <machine/i82489reg.h> 7.109 +#endif 7.110 + 7.111 +/* LINTSTUB: include <sys/types.h> */ 7.112 +/* LINTSTUB: include <machine/cpu.h> */ 7.113 +/* LINTSTUB: include <sys/systm.h> */ 7.114 + 7.115 +#include <machine/asm.h> 7.116 + 7.117 +#if defined(MULTIPROCESSOR) 7.118 + 7.119 +#define SET_CURLWP(lwp,cpu) \ 7.120 + movl CPUVAR(SELF),cpu ; \ 7.121 + movl lwp,CPUVAR(CURLWP) ; \ 7.122 + movl cpu,L_CPU(lwp) 7.123 + 7.124 +#else 7.125 + 7.126 +#define SET_CURLWP(lwp,tcpu) movl lwp,CPUVAR(CURLWP) 7.127 +#define GET_CURLWP(reg) movl CPUVAR(CURLWP),reg 7.128 + 7.129 +#endif 7.130 + 7.131 +#define GET_CURPCB(reg) movl CPUVAR(CURPCB),reg 7.132 +#define SET_CURPCB(reg) movl reg,CPUVAR(CURPCB) 7.133 + 7.134 +#define CLEAR_RESCHED(reg) movl reg,CPUVAR(RESCHED) 7.135 + 7.136 +/* XXX temporary kluge; these should not be here */ 7.137 +/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */ 7.138 +#include <dev/isa/isareg.h> 7.139 + 7.140 + 7.141 +/* Disallow old names for REALBASEMEM */ 7.142 +#ifdef BIOSBASEMEM 7.143 +#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect 7.144 +#endif 7.145 + 7.146 +/* Disallow old names for REALEXTMEM */ 7.147 +#ifdef EXTMEM_SIZE 7.148 +#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect 7.149 +#endif 7.150 +#ifdef BIOSEXTMEM 7.151 +#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect 7.152 +#endif 7.153 + 7.154 +#include <machine/frameasm.h> 7.155 + 7.156 + 7.157 +#ifdef MULTIPROCESSOR 7.158 +#include <machine/i82489reg.h> 7.159 +#endif 7.160 + 7.161 +/* 7.162 + * PTmap is recursive pagemap at top of virtual address space. 
7.163 + * Within PTmap, the page directory can be found (third indirection). 7.164 + * 7.165 + * XXX 4 == sizeof pde 7.166 + */ 7.167 + .set _C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT) 7.168 + .set _C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE) 7.169 + .set _C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4) 7.170 + 7.171 +/* 7.172 + * APTmap, APTD is the alternate recursive pagemap. 7.173 + * It's used when modifying another process's page tables. 7.174 + * 7.175 + * XXX 4 == sizeof pde 7.176 + */ 7.177 + .set _C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT) 7.178 + .set _C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE) 7.179 + .set _C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4) 7.180 + 7.181 + 7.182 +/* 7.183 + * Xen guest identifier and loader selection 7.184 + */ 7.185 +.section __xen_guest 7.186 + .asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic" 7.187 + 7.188 + 7.189 +/* 7.190 + * Initialization 7.191 + */ 7.192 + .data 7.193 + 7.194 + .globl _C_LABEL(cpu) 7.195 + .globl _C_LABEL(esym),_C_LABEL(boothowto) 7.196 + .globl _C_LABEL(bootinfo),_C_LABEL(atdevbase) 7.197 +#ifdef COMPAT_OLDBOOT 7.198 + .globl _C_LABEL(bootdev) 7.199 +#endif 7.200 + .globl _C_LABEL(proc0paddr),_C_LABEL(PTDpaddr) 7.201 + .globl _C_LABEL(biosbasemem),_C_LABEL(biosextmem) 7.202 + .globl _C_LABEL(gdt) 7.203 +#ifdef I586_CPU 7.204 + .globl _C_LABEL(idt) 7.205 +#endif 7.206 + .globl _C_LABEL(lapic_tpr) 7.207 + 7.208 +#if NLAPIC > 0 7.209 +#ifdef __ELF__ 7.210 + .align PAGE_SIZE 7.211 +#else 7.212 + .align 12 7.213 +#endif 7.214 + .globl _C_LABEL(local_apic), _C_LABEL(lapic_id) 7.215 +_C_LABEL(local_apic): 7.216 + .space LAPIC_ID 7.217 +_C_LABEL(lapic_id): 7.218 + .long 0x00000000 7.219 + .space LAPIC_TPRI-(LAPIC_ID+4) 7.220 +_C_LABEL(lapic_tpr): 7.221 + .space LAPIC_PPRI-LAPIC_TPRI 7.222 +_C_LABEL(lapic_ppr): 7.223 + .space LAPIC_ISR-LAPIC_PPRI 7.224 +_C_LABEL(lapic_isr): 7.225 + .space PAGE_SIZE-LAPIC_ISR 7.226 +#else 7.227 +_C_LABEL(lapic_tpr): 7.228 + .long 0 7.229 +#endif 7.230 + 7.231 + 7.232 +_C_LABEL(cpu): .long 0 # are we 386, 386sx, or 486, 7.233 + # or Pentium, or.. 7.234 +_C_LABEL(esym): .long 0 # ptr to end of syms 7.235 +_C_LABEL(atdevbase): .long 0 # location of start of iomem in virtual 7.236 +_C_LABEL(proc0paddr): .long 0 7.237 +_C_LABEL(PTDpaddr): .long 0 # paddr of PTD, for libkvm 7.238 +#ifndef REALBASEMEM 7.239 +_C_LABEL(biosbasemem): .long 0 # base memory reported by BIOS 7.240 +#else 7.241 +_C_LABEL(biosbasemem): .long REALBASEMEM 7.242 +#endif 7.243 +#ifndef REALEXTMEM 7.244 +_C_LABEL(biosextmem): .long 0 # extended memory reported by BIOS 7.245 +#else 7.246 +_C_LABEL(biosextmem): .long REALEXTMEM 7.247 +#endif 7.248 + 7.249 +#include <machine/xen.h> 7.250 +#define __HYPERVISOR_yield 8 7.251 + 7.252 + .space 512 7.253 +tmpstk: 7.254 + .long tmpstk, __KERNEL_DS 7.255 + 7.256 + 7.257 +#define _RELOC(x) ((x)) 7.258 +#define RELOC(x) _RELOC(_C_LABEL(x)) 7.259 + 7.260 +/* XXX assym.h */ 7.261 +#define MOD_START 48 7.262 +#define MOD_LEN 56 7.263 +/* XXX assym.h */ 7.264 + 7.265 + .text 7.266 + .globl _C_LABEL(kernel_text) 7.267 + .set _C_LABEL(kernel_text),KERNTEXTOFF 7.268 + 7.269 + .globl start 7.270 +start: 7.271 + cld 7.272 + 7.273 + lss tmpstk,%esp # bootstrap stack end location 7.274 + 7.275 + movl %esi,%ebx # save start_info pointer 7.276 + 7.277 +#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE) 7.278 + /* Save the symbol locations. 
*/ 7.279 + movl MOD_START(%ebx),%esi 7.280 + addl MOD_LEN(%ebx),%esi 7.281 + movl %esi,RELOC(esym) 7.282 +#endif 7.283 + 7.284 + /* Clear BSS first so that there are no surprises... */ 7.285 + xorl %eax,%eax 7.286 + movl $RELOC(__bss_start),%edi 7.287 + movl $RELOC(_end),%ecx 7.288 + subl %edi,%ecx 7.289 + rep stosb 7.290 + 7.291 + movl %ebx,RELOC(avail_start) 7.292 + 7.293 + /* Copy the necessary stuff from start_info structure. */ 7.294 + /* We need to copy shared_info early, so that sti/cli work */ 7.295 + movl %ebx,%esi 7.296 + movl $RELOC(start_info_union),%edi 7.297 + movl $128,%ecx 7.298 + rep movsl 7.299 + 7.300 + /* (howto, [bootdev], bootinfo, basemem, extmem). */ 7.301 + xorl %eax,%eax 7.302 + movl %eax,RELOC(boothowto) 7.303 +#ifdef COMPAT_OLDBOOT 7.304 + movl %eax,RELOC(bootdev) 7.305 +#endif 7.306 + movl $0x20000,%eax 7.307 + movl %eax,RELOC(boothowto) 7.308 + 7.309 + /* First, reset the PSL. */ 7.310 + pushl $PSL_MBO 7.311 + popfl 7.312 + 7.313 + /* Clear segment registers; always null in proc0. */ 7.314 + xorl %eax,%eax 7.315 + movw %ax,%fs 7.316 + movw %ax,%gs 7.317 + decl %eax 7.318 + movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL 7.319 + 7.320 + xorl %eax,%eax 7.321 + cpuid 7.322 + movl %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL 7.323 + 7.324 +/* 7.325 + * Virtual address space of kernel: 7.326 + * 7.327 + * text | data | bss | [syms] | page dir | proc0 kstack 7.328 + * 0 1 2 3 7.329 + */ 7.330 +#define PROC0PDIR ((0) * PAGE_SIZE) 7.331 +#define PROC0STACK ((1) * PAGE_SIZE) 7.332 +#define SYSMAP ((1+UPAGES) * PAGE_SIZE) 7.333 +#define TABLESIZE ((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */ 7.334 + 7.335 + /* Find end of kernel image. */ 7.336 + movl RELOC(avail_start),%edi 7.337 + /* Calculate where to start the bootstrap tables. */ 7.338 + movl %edi,%esi 7.339 + 7.340 + /* 7.341 + * Calculate the size of the kernel page table directory, and 7.342 + * how many entries it will have. 7.343 + */ 7.344 + movl RELOC(nkpde),%ecx # get nkpde 7.345 + cmpl $NKPTP_MIN,%ecx # larger than min? 7.346 + jge 1f 7.347 + movl $NKPTP_MIN,%ecx # set at min 7.348 + jmp 2f 7.349 +1: cmpl $NKPTP_MAX,%ecx # larger than max? 7.350 + jle 2f 7.351 + movl $NKPTP_MAX,%ecx 7.352 +2: 7.353 + 7.354 + /* Clear memory for bootstrap tables. */ 7.355 + shll $PGSHIFT,%ecx 7.356 + addl $TABLESIZE,%ecx 7.357 + addl %esi,%ecx # end of tables 7.358 + movl %ecx,RELOC(gdt) 7.359 + addl $PAGE_SIZE,%ecx 7.360 + movl %ecx,RELOC(avail_start) 7.361 + subl %edi,%ecx # size of tables 7.362 + shrl $2,%ecx 7.363 + xorl %eax,%eax 7.364 + cld 7.365 + rep 7.366 + stosl 7.367 + 7.368 +/* 7.369 + * fillkpt 7.370 + * eax = pte (page frame | control | status) 7.371 + * ebx = page table address 7.372 + * ecx = number of pages to map 7.373 + */ 7.374 +#define fillkpt \ 7.375 +1: movl %eax,(%ebx) ; \ 7.376 + addl $PAGE_SIZE,%eax ; /* increment physical address */ \ 7.377 + addl $4,%ebx ; /* next pte */ \ 7.378 + loop 1b ; 7.379 + 7.380 +/* 7.381 + * Build initial page tables. 7.382 + */ 7.383 + /* Calculate end of text segment, rounded to a page. */ 7.384 + leal (RELOC(etext)+PGOFSET),%edx 7.385 + andl $~PGOFSET,%edx 7.386 + 7.387 + /* Skip over the first 1MB. */ 7.388 + movl $KERNTEXTOFF,%eax 7.389 + movl %eax,%ecx 7.390 + subl $KERNBASE_LOCORE,%ecx 7.391 + shrl $PGSHIFT,%ecx 7.392 + leal (SYSMAP)(%esi,%ecx,4),%ebx 7.393 + 7.394 + /* Map the kernel text read-only. 
*/ 7.395 + movl %edx,%ecx 7.396 + subl %eax,%ecx 7.397 + shrl $PGSHIFT,%ecx 7.398 + orl $(PG_V|PG_KR),%eax 7.399 + fillkpt 7.400 + 7.401 + /* Map the data, BSS, and bootstrap tables read-write. */ 7.402 + movl RELOC(avail_start),%ecx 7.403 + # end of tables 7.404 + subl %edx,%ecx # subtract end of text 7.405 + shrl $PGSHIFT,%ecx 7.406 + leal (PG_V|PG_KW)(%edx),%eax 7.407 + fillkpt 7.408 + 7.409 + movl $0xffffffff,(%ebx) 7.410 + addl $4,%ebx 7.411 + 7.412 +/* 7.413 + * Construct a page table directory. 7.414 + */ 7.415 + /* Map kernel PDEs. */ 7.416 + movl RELOC(nkpde),%ecx # for this many pde s, 7.417 + leal (PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx # kernel pde offset 7.418 + leal (SYSMAP+PG_V|PG_KW)(%esi),%eax # pte for KPT in proc 0, 7.419 + fillkpt 7.420 + 7.421 + /* Install a PDE recursively mapping page directory as a page table! */ 7.422 + leal (PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax # pte for ptd 7.423 + movl %eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi) # recursive PD slot 7.424 + 7.425 + /* Save phys. addr of PTD, for libkvm. */ 7.426 + movl %esi,RELOC(PTDpaddr) 7.427 + 7.428 + call xpmap_init 7.429 + 7.430 + /* cr0 is 0x8005003b */ 7.431 + 7.432 + /* Relocate atdevbase. */ 7.433 + movl _C_LABEL(avail_start),%edx 7.434 + movl %edx,_C_LABEL(HYPERVISOR_shared_info) 7.435 + addl $PAGE_SIZE,%edx # shared_inf 7.436 + movl %edx,_C_LABEL(atdevbase) 7.437 + 7.438 + /* Set up bootstrap stack. */ 7.439 + leal (PROC0STACK)(%esi),%eax 7.440 + movl %eax,_C_LABEL(proc0paddr) 7.441 + leal (USPACE-FRAMESIZE)(%eax),%esp 7.442 + subl $KERNBASE_LOCORE,%esi 7.443 + movl %esi,PCB_CR3(%eax) # pcb->pcb_cr3 7.444 + xorl %ebp,%ebp # mark end of frames 7.445 + 7.446 + movl _C_LABEL(atdevbase),%eax 7.447 + pushl %eax 7.448 + call _C_LABEL(init386) # wire 386 chip for unix operation 7.449 + addl $4,%esp 7.450 + 7.451 +#ifdef SAFARI_FIFO_HACK 7.452 + movb $5,%al 7.453 + movw $0x37b,%dx 7.454 + outb %al,%dx 7.455 + movw $0x37f,%dx 7.456 + inb %dx,%al 7.457 + movb %al,%cl 7.458 + 7.459 + orb $1,%cl 7.460 + 7.461 + movb $5,%al 7.462 + movw $0x37b,%dx 7.463 + outb %al,%dx 7.464 + movw $0x37f,%dx 7.465 + movb %cl,%al 7.466 + outb %al,%dx 7.467 +#endif /* SAFARI_FIFO_HACK */ 7.468 + 7.469 + call _C_LABEL(main) 7.470 + 7.471 +/* 7.472 + * void proc_trampoline(void); 7.473 + * This is a trampoline function pushed onto the stack of a newly created 7.474 + * process in order to do some additional setup. The trampoline is entered by 7.475 + * cpu_switch()ing to the process, so we abuse the callee-saved registers used 7.476 + * by cpu_switch() to store the information about the stub to call. 7.477 + * NOTE: This function does not have a normal calling sequence! 7.478 + */ 7.479 +/* LINTSTUB: Func: void proc_trampoline(void) */ 7.480 +NENTRY(proc_trampoline) 7.481 +#ifdef MULTIPROCESSOR 7.482 + call _C_LABEL(proc_trampoline_mp) 7.483 +#endif 7.484 + movl $IPL_NONE,CPUVAR(ILEVEL) 7.485 + pushl %ebx 7.486 + call *%esi 7.487 + addl $4,%esp 7.488 + DO_DEFERRED_SWITCH(%eax) 7.489 + INTRFASTEXIT 7.490 + /* NOTREACHED */ 7.491 + 7.492 +/*****************************************************************************/ 7.493 +#ifdef COMPAT_16 7.494 +/* 7.495 + * Signal trampoline; copied to top of user stack. 7.496 + */ 7.497 +/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */ 7.498 +NENTRY(sigcode) 7.499 + /* 7.500 + * Handler has returned here as if we called it. The sigcontext 7.501 + * is on the stack after the 3 args "we" pushed. 
7.502 + */ 7.503 + leal 12(%esp),%eax # get pointer to sigcontext 7.504 + movl %eax,4(%esp) # put it in the argument slot 7.505 + # fake return address already there 7.506 + movl $SYS_compat_16___sigreturn14,%eax 7.507 + int $0x80 # enter kernel with args on stack 7.508 + movl $SYS_exit,%eax 7.509 + int $0x80 # exit if sigreturn fails 7.510 + .globl _C_LABEL(esigcode) 7.511 +_C_LABEL(esigcode): 7.512 +#endif 7.513 + 7.514 +/*****************************************************************************/ 7.515 + 7.516 +/* 7.517 + * The following primitives are used to fill and copy regions of memory. 7.518 + */ 7.519 + 7.520 +/* 7.521 + * XXX No section 9 man page for fillw. 7.522 + * fillw seems to be very sparsely used (only in pccons it seems.) 7.523 + * One wonders if it couldn't be done without. 7.524 + * -- Perry Metzger, May 7, 2001 7.525 + */ 7.526 +/* 7.527 + * void fillw(short pattern, void *addr, size_t len); 7.528 + * Write len copies of pattern at addr. 7.529 + */ 7.530 +/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */ 7.531 +ENTRY(fillw) 7.532 + pushl %edi 7.533 + movl 8(%esp),%eax 7.534 + movl 12(%esp),%edi 7.535 + movw %ax,%cx 7.536 + rorl $16,%eax 7.537 + movw %cx,%ax 7.538 + cld 7.539 + movl 16(%esp),%ecx 7.540 + shrl %ecx # do longwords 7.541 + rep 7.542 + stosl 7.543 + movl 16(%esp),%ecx 7.544 + andl $1,%ecx # do remainder 7.545 + rep 7.546 + stosw 7.547 + popl %edi 7.548 + ret 7.549 + 7.550 +/* 7.551 + * int kcopy(const void *from, void *to, size_t len); 7.552 + * Copy len bytes, abort on fault. 7.553 + */ 7.554 +/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */ 7.555 +ENTRY(kcopy) 7.556 + pushl %esi 7.557 + pushl %edi 7.558 + GET_CURPCB(%eax) # load curpcb into eax and set on-fault 7.559 + pushl PCB_ONFAULT(%eax) 7.560 + movl $_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax) 7.561 + 7.562 + movl 16(%esp),%esi 7.563 + movl 20(%esp),%edi 7.564 + movl 24(%esp),%ecx 7.565 + movl %edi,%eax 7.566 + subl %esi,%eax 7.567 + cmpl %ecx,%eax # overlapping? 7.568 + jb 1f 7.569 + cld # nope, copy forward 7.570 + shrl $2,%ecx # copy by 32-bit words 7.571 + rep 7.572 + movsl 7.573 + movl 24(%esp),%ecx 7.574 + andl $3,%ecx # any bytes left? 7.575 + rep 7.576 + movsb 7.577 + 7.578 + GET_CURPCB(%edx) # XXX save curpcb? 7.579 + popl PCB_ONFAULT(%edx) 7.580 + popl %edi 7.581 + popl %esi 7.582 + xorl %eax,%eax 7.583 + ret 7.584 + 7.585 + ALIGN_TEXT 7.586 +1: addl %ecx,%edi # copy backward 7.587 + addl %ecx,%esi 7.588 + std 7.589 + andl $3,%ecx # any fractional bytes? 7.590 + decl %edi 7.591 + decl %esi 7.592 + rep 7.593 + movsb 7.594 + movl 24(%esp),%ecx # copy remainder by 32-bit words 7.595 + shrl $2,%ecx 7.596 + subl $3,%esi 7.597 + subl $3,%edi 7.598 + rep 7.599 + movsl 7.600 + cld 7.601 + 7.602 + GET_CURPCB(%edx) 7.603 + popl PCB_ONFAULT(%edx) 7.604 + popl %edi 7.605 + popl %esi 7.606 + xorl %eax,%eax 7.607 + ret 7.608 + 7.609 +/*****************************************************************************/ 7.610 + 7.611 +/* 7.612 + * The following primitives are used to copy data in and out of the user's 7.613 + * address space. 7.614 + */ 7.615 + 7.616 +/* 7.617 + * Default to the lowest-common-denominator. We will improve it 7.618 + * later. 
7.619 + */ 7.620 +#if defined(I386_CPU) 7.621 +#define DEFAULT_COPYOUT _C_LABEL(i386_copyout) 7.622 +#define DEFAULT_COPYIN _C_LABEL(i386_copyin) 7.623 +#elif defined(I486_CPU) 7.624 +#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) 7.625 +#define DEFAULT_COPYIN _C_LABEL(i386_copyin) 7.626 +#elif defined(I586_CPU) 7.627 +#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) /* XXX */ 7.628 +#define DEFAULT_COPYIN _C_LABEL(i386_copyin) /* XXX */ 7.629 +#elif defined(I686_CPU) 7.630 +#define DEFAULT_COPYOUT _C_LABEL(i486_copyout) /* XXX */ 7.631 +#define DEFAULT_COPYIN _C_LABEL(i386_copyin) /* XXX */ 7.632 +#endif 7.633 + 7.634 + .data 7.635 + 7.636 + .globl _C_LABEL(copyout_func) 7.637 +_C_LABEL(copyout_func): 7.638 + .long DEFAULT_COPYOUT 7.639 + 7.640 + .globl _C_LABEL(copyin_func) 7.641 +_C_LABEL(copyin_func): 7.642 + .long DEFAULT_COPYIN 7.643 + 7.644 + .text 7.645 + 7.646 +/* 7.647 + * int copyout(const void *from, void *to, size_t len); 7.648 + * Copy len bytes into the user's address space. 7.649 + * see copyout(9) 7.650 + */ 7.651 +/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */ 7.652 +ENTRY(copyout) 7.653 + DO_DEFERRED_SWITCH(%eax) 7.654 + jmp *_C_LABEL(copyout_func) 7.655 + 7.656 +#if defined(I386_CPU) 7.657 +/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */ 7.658 +ENTRY(i386_copyout) 7.659 + pushl %esi 7.660 + pushl %edi 7.661 + pushl $0 7.662 + 7.663 + movl 16(%esp),%esi 7.664 + movl 20(%esp),%edi 7.665 + movl 24(%esp),%eax 7.666 + 7.667 + /* 7.668 + * We check that the end of the destination buffer is not past the end 7.669 + * of the user's address space. If it's not, then we only need to 7.670 + * check that each page is writable. The 486 will do this for us; the 7.671 + * 386 will not. (We assume that pages in user space that are not 7.672 + * writable by the user are not writable by the kernel either.) 7.673 + */ 7.674 + movl %edi,%edx 7.675 + addl %eax,%edx 7.676 + jc _C_LABEL(copy_efault) 7.677 + cmpl $VM_MAXUSER_ADDRESS,%edx 7.678 + ja _C_LABEL(copy_efault) 7.679 + 7.680 + testl %eax,%eax # anything to do? 7.681 + jz 3f 7.682 + 7.683 + /* 7.684 + * We have to check each PTE for (write) permission, since the CPU 7.685 + * doesn't do it for us. 7.686 + */ 7.687 + 7.688 + /* Compute number of pages. */ 7.689 + movl %edi,%ecx 7.690 + andl $PGOFSET,%ecx 7.691 + addl %eax,%ecx 7.692 + decl %ecx 7.693 + shrl $PGSHIFT,%ecx 7.694 + 7.695 + /* Compute PTE offset for start address. */ 7.696 + shrl $PGSHIFT,%edi 7.697 + 7.698 + GET_CURPCB(%edx) 7.699 + movl $2f,PCB_ONFAULT(%edx) 7.700 + 7.701 +1: /* Check PTE for each page. */ 7.702 + testb $PG_RW,_C_LABEL(PTmap)(,%edi,4) 7.703 + jz 2f 7.704 + 7.705 +4: incl %edi 7.706 + decl %ecx 7.707 + jns 1b 7.708 + 7.709 + movl 20(%esp),%edi 7.710 + movl 24(%esp),%eax 7.711 + jmp 3f 7.712 + 7.713 +2: /* Simulate a trap. 
*/ 7.714 + pushl %ecx 7.715 + movl %edi,%eax 7.716 + shll $PGSHIFT,%eax 7.717 + pushl %eax 7.718 + call _C_LABEL(trapwrite) # trapwrite(addr) 7.719 + addl $4,%esp # pop argument 7.720 + popl %ecx 7.721 + testl %eax,%eax # if not ok, return EFAULT 7.722 + jz 4b 7.723 + jmp _C_LABEL(copy_efault) 7.724 + 7.725 +3: GET_CURPCB(%edx) 7.726 + movl $_C_LABEL(copy_fault),PCB_ONFAULT(%edx) 7.727 + 7.728 + /* bcopy(%esi, %edi, %eax); */ 7.729 + cld 7.730 + movl %eax,%ecx 7.731 + shrl $2,%ecx 7.732 + rep 7.733 + movsl 7.734 + movl %eax,%ecx 7.735 + andl $3,%ecx 7.736 + rep 7.737 + movsb 7.738 + 7.739 + popl PCB_ONFAULT(%edx) 7.740 + popl %edi 7.741 + popl %esi 7.742 + xorl %eax,%eax 7.743 + ret 7.744 +#endif /* I386_CPU */ 7.745 + 7.746 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.747 +/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */ 7.748 +ENTRY(i486_copyout) 7.749 + pushl %esi 7.750 + pushl %edi 7.751 + pushl $0 7.752 + 7.753 + movl 16(%esp),%esi 7.754 + movl 20(%esp),%edi 7.755 + movl 24(%esp),%eax 7.756 + 7.757 + /* 7.758 + * We check that the end of the destination buffer is not past the end 7.759 + * of the user's address space. 7.760 + */ 7.761 + movl %edi,%edx 7.762 + addl %eax,%edx 7.763 + jc _C_LABEL(copy_efault) 7.764 + cmpl $VM_MAXUSER_ADDRESS,%edx 7.765 + ja _C_LABEL(copy_efault) 7.766 + 7.767 + GET_CURPCB(%edx) 7.768 + movl $_C_LABEL(copy_fault),PCB_ONFAULT(%edx) 7.769 + 7.770 + /* bcopy(%esi, %edi, %eax); */ 7.771 + cld 7.772 + movl %eax,%ecx 7.773 + shrl $2,%ecx 7.774 + rep 7.775 + movsl 7.776 + movl %eax,%ecx 7.777 + andl $3,%ecx 7.778 + rep 7.779 + movsb 7.780 + 7.781 + popl PCB_ONFAULT(%edx) 7.782 + popl %edi 7.783 + popl %esi 7.784 + xorl %eax,%eax 7.785 + ret 7.786 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.787 + 7.788 +/* 7.789 + * int copyin(const void *from, void *to, size_t len); 7.790 + * Copy len bytes from the user's address space. 7.791 + * see copyin(9) 7.792 + */ 7.793 +/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */ 7.794 +ENTRY(copyin) 7.795 + DO_DEFERRED_SWITCH(%eax) 7.796 + jmp *_C_LABEL(copyin_func) 7.797 + 7.798 +#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \ 7.799 + defined(I686_CPU) 7.800 +/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */ 7.801 +ENTRY(i386_copyin) 7.802 + pushl %esi 7.803 + pushl %edi 7.804 + GET_CURPCB(%eax) 7.805 + pushl $0 7.806 + movl $_C_LABEL(copy_fault),PCB_ONFAULT(%eax) 7.807 + 7.808 + movl 16(%esp),%esi 7.809 + movl 20(%esp),%edi 7.810 + movl 24(%esp),%eax 7.811 + 7.812 + /* 7.813 + * We check that the end of the destination buffer is not past the end 7.814 + * of the user's address space. If it's not, then we only need to 7.815 + * check that each page is readable, and the CPU will do that for us. 
7.816 + */ 7.817 + movl %esi,%edx 7.818 + addl %eax,%edx 7.819 + jc _C_LABEL(copy_efault) 7.820 + cmpl $VM_MAXUSER_ADDRESS,%edx 7.821 + ja _C_LABEL(copy_efault) 7.822 + 7.823 + /* bcopy(%esi, %edi, %eax); */ 7.824 + cld 7.825 + movl %eax,%ecx 7.826 + shrl $2,%ecx 7.827 + rep 7.828 + movsl 7.829 + movl %eax,%ecx 7.830 + andl $3,%ecx 7.831 + rep 7.832 + movsb 7.833 + 7.834 + GET_CURPCB(%edx) 7.835 + popl PCB_ONFAULT(%edx) 7.836 + popl %edi 7.837 + popl %esi 7.838 + xorl %eax,%eax 7.839 + ret 7.840 +#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */ 7.841 + 7.842 +/* LINTSTUB: Ignore */ 7.843 +NENTRY(copy_efault) 7.844 + movl $EFAULT,%eax 7.845 + 7.846 +/* 7.847 + * kcopy_fault is used by kcopy and copy_fault is used by copyin/out. 7.848 + * 7.849 + * they're distinguished for lazy pmap switching. see trap(). 7.850 + */ 7.851 +/* LINTSTUB: Ignore */ 7.852 +NENTRY(kcopy_fault) 7.853 + GET_CURPCB(%edx) 7.854 + popl PCB_ONFAULT(%edx) 7.855 + popl %edi 7.856 + popl %esi 7.857 + ret 7.858 + 7.859 +/* LINTSTUB: Ignore */ 7.860 +NENTRY(copy_fault) 7.861 + GET_CURPCB(%edx) 7.862 + popl PCB_ONFAULT(%edx) 7.863 + popl %edi 7.864 + popl %esi 7.865 + ret 7.866 + 7.867 +/* 7.868 + * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied); 7.869 + * Copy a NUL-terminated string, at most maxlen characters long, into the 7.870 + * user's address space. Return the number of characters copied (including the 7.871 + * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else 7.872 + * return 0 or EFAULT. 7.873 + * see copyoutstr(9) 7.874 + */ 7.875 +/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */ 7.876 +ENTRY(copyoutstr) 7.877 + pushl %esi 7.878 + pushl %edi 7.879 + 7.880 + DO_DEFERRED_SWITCH(%eax) 7.881 + 7.882 + movl 12(%esp),%esi # esi = from 7.883 + movl 16(%esp),%edi # edi = to 7.884 + movl 20(%esp),%edx # edx = maxlen 7.885 + 7.886 +#if defined(I386_CPU) 7.887 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.888 + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) 7.889 + jne 5f 7.890 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.891 + 7.892 + /* Compute number of bytes in first page. */ 7.893 + movl %edi,%eax 7.894 + andl $PGOFSET,%eax 7.895 + movl $PAGE_SIZE,%ecx 7.896 + subl %eax,%ecx # ecx = PAGE_SIZE - (src % PAGE_SIZE) 7.897 + 7.898 + GET_CURPCB(%eax) 7.899 + movl $6f,PCB_ONFAULT(%eax) 7.900 + 7.901 +1: /* 7.902 + * Once per page, check that we are still within the bounds of user 7.903 + * space, and check for a write fault. 7.904 + */ 7.905 + cmpl $VM_MAXUSER_ADDRESS,%edi 7.906 + jae _C_LABEL(copystr_efault) 7.907 + 7.908 + /* Compute PTE offset. */ 7.909 + movl %edi,%eax 7.910 + shrl $PGSHIFT,%eax # calculate pte address 7.911 + 7.912 + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) 7.913 + jnz 2f 7.914 + 7.915 +6: /* Simulate a trap. */ 7.916 + pushl %edx 7.917 + pushl %edi 7.918 + call _C_LABEL(trapwrite) # trapwrite(addr) 7.919 + addl $4,%esp # clear argument from stack 7.920 + popl %edx 7.921 + testl %eax,%eax 7.922 + jnz _C_LABEL(copystr_efault) 7.923 + 7.924 +2: /* Copy up to end of this page. */ 7.925 + subl %ecx,%edx # predecrement total count 7.926 + jnc 3f 7.927 + addl %edx,%ecx # ecx += (edx - ecx) = edx 7.928 + xorl %edx,%edx 7.929 + 7.930 +3: decl %ecx 7.931 + js 4f 7.932 + lodsb 7.933 + stosb 7.934 + testb %al,%al 7.935 + jnz 3b 7.936 + 7.937 + /* Success -- 0 byte reached. 
*/ 7.938 + addl %ecx,%edx # add back residual for this page 7.939 + xorl %eax,%eax 7.940 + jmp copystr_return 7.941 + 7.942 +4: /* Go to next page, if any. */ 7.943 + movl $PAGE_SIZE,%ecx 7.944 + testl %edx,%edx 7.945 + jnz 1b 7.946 + 7.947 + /* edx is zero -- return ENAMETOOLONG. */ 7.948 + movl $ENAMETOOLONG,%eax 7.949 + jmp copystr_return 7.950 +#endif /* I386_CPU */ 7.951 + 7.952 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.953 +5: GET_CURPCB(%eax) 7.954 + movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%eax) 7.955 + /* 7.956 + * Get min(%edx, VM_MAXUSER_ADDRESS-%edi). 7.957 + */ 7.958 + movl $VM_MAXUSER_ADDRESS,%eax 7.959 + subl %edi,%eax 7.960 + cmpl %edx,%eax 7.961 + jae 1f 7.962 + movl %eax,%edx 7.963 + movl %eax,20(%esp) 7.964 + 7.965 +1: incl %edx 7.966 + cld 7.967 + 7.968 +1: decl %edx 7.969 + jz 2f 7.970 + lodsb 7.971 + stosb 7.972 + testb %al,%al 7.973 + jnz 1b 7.974 + 7.975 + /* Success -- 0 byte reached. */ 7.976 + decl %edx 7.977 + xorl %eax,%eax 7.978 + jmp copystr_return 7.979 + 7.980 +2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */ 7.981 + cmpl $VM_MAXUSER_ADDRESS,%edi 7.982 + jae _C_LABEL(copystr_efault) 7.983 + movl $ENAMETOOLONG,%eax 7.984 + jmp copystr_return 7.985 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.986 + 7.987 +/* 7.988 + * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied); 7.989 + * Copy a NUL-terminated string, at most maxlen characters long, from the 7.990 + * user's address space. Return the number of characters copied (including the 7.991 + * NUL) in *lencopied. If the string is too long, return ENAMETOOLONG; else 7.992 + * return 0 or EFAULT. 7.993 + * see copyinstr(9) 7.994 + */ 7.995 +/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */ 7.996 +ENTRY(copyinstr) 7.997 + pushl %esi 7.998 + pushl %edi 7.999 + 7.1000 + DO_DEFERRED_SWITCH(%eax) 7.1001 + 7.1002 + GET_CURPCB(%ecx) 7.1003 + movl $_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx) 7.1004 + 7.1005 + movl 12(%esp),%esi # %esi = from 7.1006 + movl 16(%esp),%edi # %edi = to 7.1007 + movl 20(%esp),%edx # %edx = maxlen 7.1008 + 7.1009 + /* 7.1010 + * Get min(%edx, VM_MAXUSER_ADDRESS-%esi). 7.1011 + */ 7.1012 + movl $VM_MAXUSER_ADDRESS,%eax 7.1013 + subl %esi,%eax 7.1014 + cmpl %edx,%eax 7.1015 + jae 1f 7.1016 + movl %eax,%edx 7.1017 + movl %eax,20(%esp) 7.1018 + 7.1019 +1: incl %edx 7.1020 + cld 7.1021 + 7.1022 +1: decl %edx 7.1023 + jz 2f 7.1024 + lodsb 7.1025 + stosb 7.1026 + testb %al,%al 7.1027 + jnz 1b 7.1028 + 7.1029 + /* Success -- 0 byte reached. */ 7.1030 + decl %edx 7.1031 + xorl %eax,%eax 7.1032 + jmp copystr_return 7.1033 + 7.1034 +2: /* edx is zero -- return EFAULT or ENAMETOOLONG. */ 7.1035 + cmpl $VM_MAXUSER_ADDRESS,%esi 7.1036 + jae _C_LABEL(copystr_efault) 7.1037 + movl $ENAMETOOLONG,%eax 7.1038 + jmp copystr_return 7.1039 + 7.1040 +/* LINTSTUB: Ignore */ 7.1041 +NENTRY(copystr_efault) 7.1042 + movl $EFAULT,%eax 7.1043 + 7.1044 +/* LINTSTUB: Ignore */ 7.1045 +NENTRY(copystr_fault) 7.1046 +copystr_return: 7.1047 + /* Set *lencopied and return %eax. */ 7.1048 + GET_CURPCB(%ecx) 7.1049 + movl $0,PCB_ONFAULT(%ecx) 7.1050 + movl 20(%esp),%ecx 7.1051 + subl %edx,%ecx 7.1052 + movl 24(%esp),%edx 7.1053 + testl %edx,%edx 7.1054 + jz 8f 7.1055 + movl %ecx,(%edx) 7.1056 + 7.1057 +8: popl %edi 7.1058 + popl %esi 7.1059 + ret 7.1060 + 7.1061 +/* 7.1062 + * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied); 7.1063 + * Copy a NUL-terminated string, at most maxlen characters long. 
Return the 7.1064 + * number of characters copied (including the NUL) in *lencopied. If the 7.1065 + * string is too long, return ENAMETOOLONG; else return 0. 7.1066 + * see copystr(9) 7.1067 + */ 7.1068 +/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */ 7.1069 +ENTRY(copystr) 7.1070 + pushl %esi 7.1071 + pushl %edi 7.1072 + 7.1073 + movl 12(%esp),%esi # esi = from 7.1074 + movl 16(%esp),%edi # edi = to 7.1075 + movl 20(%esp),%edx # edx = maxlen 7.1076 + incl %edx 7.1077 + cld 7.1078 + 7.1079 +1: decl %edx 7.1080 + jz 4f 7.1081 + lodsb 7.1082 + stosb 7.1083 + testb %al,%al 7.1084 + jnz 1b 7.1085 + 7.1086 + /* Success -- 0 byte reached. */ 7.1087 + decl %edx 7.1088 + xorl %eax,%eax 7.1089 + jmp 6f 7.1090 + 7.1091 +4: /* edx is zero -- return ENAMETOOLONG. */ 7.1092 + movl $ENAMETOOLONG,%eax 7.1093 + 7.1094 +6: /* Set *lencopied and return %eax. */ 7.1095 + movl 20(%esp),%ecx 7.1096 + subl %edx,%ecx 7.1097 + movl 24(%esp),%edx 7.1098 + testl %edx,%edx 7.1099 + jz 7f 7.1100 + movl %ecx,(%edx) 7.1101 + 7.1102 +7: popl %edi 7.1103 + popl %esi 7.1104 + ret 7.1105 + 7.1106 +/* 7.1107 + * long fuword(const void *uaddr); 7.1108 + * Fetch an int from the user's address space. 7.1109 + * see fuword(9) 7.1110 + */ 7.1111 +/* LINTSTUB: Func: long fuword(const void *base) */ 7.1112 +ENTRY(fuword) 7.1113 + DO_DEFERRED_SWITCH(%eax) 7.1114 + movl 4(%esp),%edx 7.1115 + cmpl $VM_MAXUSER_ADDRESS-4,%edx 7.1116 + ja _C_LABEL(fusuaddrfault) 7.1117 + GET_CURPCB(%ecx) 7.1118 + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) 7.1119 + movl (%edx),%eax 7.1120 + movl $0,PCB_ONFAULT(%ecx) 7.1121 + ret 7.1122 + 7.1123 +/* 7.1124 + * int fusword(const void *uaddr); 7.1125 + * Fetch a short from the user's address space. 7.1126 + * see fusword(9) 7.1127 + */ 7.1128 +/* LINTSTUB: Func: int fusword(const void *base) */ 7.1129 +ENTRY(fusword) 7.1130 + DO_DEFERRED_SWITCH(%eax) 7.1131 + movl 4(%esp),%edx 7.1132 + cmpl $VM_MAXUSER_ADDRESS-2,%edx 7.1133 + ja _C_LABEL(fusuaddrfault) 7.1134 + GET_CURPCB(%ecx) 7.1135 + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) 7.1136 + movzwl (%edx),%eax 7.1137 + movl $0,PCB_ONFAULT(%ecx) 7.1138 + ret 7.1139 + 7.1140 +/* 7.1141 + * int fuswintr(const void *uaddr); 7.1142 + * Fetch a short from the user's address space. Can be called during an 7.1143 + * interrupt. 7.1144 + * see fuswintr(9) 7.1145 + */ 7.1146 +/* LINTSTUB: Func: int fuswintr(const void *base) */ 7.1147 +ENTRY(fuswintr) 7.1148 + cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE) 7.1149 + jnz _C_LABEL(fusuaddrfault) 7.1150 + movl 4(%esp),%edx 7.1151 + cmpl $VM_MAXUSER_ADDRESS-2,%edx 7.1152 + ja _C_LABEL(fusuaddrfault) 7.1153 + movl CPUVAR(CURLWP),%ecx 7.1154 + movl L_ADDR(%ecx),%ecx 7.1155 + movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx) 7.1156 + movzwl (%edx),%eax 7.1157 + movl $0,PCB_ONFAULT(%ecx) 7.1158 + ret 7.1159 + 7.1160 +/* 7.1161 + * int fubyte(const void *uaddr); 7.1162 + * Fetch a byte from the user's address space. 7.1163 + * see fubyte(9) 7.1164 + */ 7.1165 +/* LINTSTUB: Func: int fubyte(const void *base) */ 7.1166 +ENTRY(fubyte) 7.1167 + DO_DEFERRED_SWITCH(%eax) 7.1168 + movl 4(%esp),%edx 7.1169 + cmpl $VM_MAXUSER_ADDRESS-1,%edx 7.1170 + ja _C_LABEL(fusuaddrfault) 7.1171 + GET_CURPCB(%ecx) 7.1172 + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) 7.1173 + movzbl (%edx),%eax 7.1174 + movl $0,PCB_ONFAULT(%ecx) 7.1175 + ret 7.1176 + 7.1177 +/* 7.1178 + * Handle faults from [fs]u*(). Clean up and return -1. 
7.1179 + */ 7.1180 +/* LINTSTUB: Ignore */ 7.1181 +NENTRY(fusufault) 7.1182 + movl $0,PCB_ONFAULT(%ecx) 7.1183 + movl $-1,%eax 7.1184 + ret 7.1185 + 7.1186 +/* 7.1187 + * Handle faults from [fs]u*(). Clean up and return -1. This differs from 7.1188 + * fusufault() in that trap() will recognize it and return immediately rather 7.1189 + * than trying to page fault. 7.1190 + */ 7.1191 +/* LINTSTUB: Ignore */ 7.1192 +NENTRY(fusubail) 7.1193 + movl $0,PCB_ONFAULT(%ecx) 7.1194 + movl $-1,%eax 7.1195 + ret 7.1196 + 7.1197 +/* 7.1198 + * Handle earlier faults from [fs]u*(), due to our of range addresses. 7.1199 + */ 7.1200 +/* LINTSTUB: Ignore */ 7.1201 +NENTRY(fusuaddrfault) 7.1202 + movl $-1,%eax 7.1203 + ret 7.1204 + 7.1205 +/* 7.1206 + * int suword(void *uaddr, long x); 7.1207 + * Store an int in the user's address space. 7.1208 + * see suword(9) 7.1209 + */ 7.1210 +/* LINTSTUB: Func: int suword(void *base, long c) */ 7.1211 +ENTRY(suword) 7.1212 + DO_DEFERRED_SWITCH(%eax) 7.1213 + movl 4(%esp),%edx 7.1214 + cmpl $VM_MAXUSER_ADDRESS-4,%edx 7.1215 + ja _C_LABEL(fusuaddrfault) 7.1216 + 7.1217 +#if defined(I386_CPU) 7.1218 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.1219 + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) 7.1220 + jne 2f 7.1221 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.1222 + 7.1223 + GET_CURPCB(%eax) 7.1224 + movl $3f,PCB_ONFAULT(%eax) 7.1225 + 7.1226 + movl %edx,%eax 7.1227 + shrl $PGSHIFT,%eax # calculate pte address 7.1228 + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) 7.1229 + jnz 1f 7.1230 + 7.1231 +3: /* Simulate a trap. */ 7.1232 + pushl %edx 7.1233 + pushl %edx 7.1234 + call _C_LABEL(trapwrite) # trapwrite(addr) 7.1235 + addl $4,%esp # clear parameter from the stack 7.1236 + popl %edx 7.1237 + GET_CURPCB(%ecx) 7.1238 + testl %eax,%eax 7.1239 + jnz _C_LABEL(fusufault) 7.1240 + 7.1241 +1: /* XXX also need to check the following 3 bytes for validity! */ 7.1242 +#endif 7.1243 + 7.1244 +2: GET_CURPCB(%ecx) 7.1245 + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) 7.1246 + 7.1247 + movl 8(%esp),%eax 7.1248 + movl %eax,(%edx) 7.1249 + xorl %eax,%eax 7.1250 + movl %eax,PCB_ONFAULT(%ecx) 7.1251 + ret 7.1252 + 7.1253 +/* 7.1254 + * int susword(void *uaddr, short x); 7.1255 + * Store a short in the user's address space. 7.1256 + * see susword(9) 7.1257 + */ 7.1258 +/* LINTSTUB: Func: int susword(void *base, short c) */ 7.1259 +ENTRY(susword) 7.1260 + DO_DEFERRED_SWITCH(%eax) 7.1261 + movl 4(%esp),%edx 7.1262 + cmpl $VM_MAXUSER_ADDRESS-2,%edx 7.1263 + ja _C_LABEL(fusuaddrfault) 7.1264 + 7.1265 +#if defined(I386_CPU) 7.1266 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.1267 + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) 7.1268 + jne 2f 7.1269 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.1270 + 7.1271 + GET_CURPCB(%eax) 7.1272 + movl $3f,PCB_ONFAULT(%eax) 7.1273 + 7.1274 + movl %edx,%eax 7.1275 + shrl $PGSHIFT,%eax # calculate pte address 7.1276 + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) 7.1277 + jnz 1f 7.1278 + 7.1279 +3: /* Simulate a trap. */ 7.1280 + pushl %edx 7.1281 + pushl %edx 7.1282 + call _C_LABEL(trapwrite) # trapwrite(addr) 7.1283 + addl $4,%esp # clear parameter from the stack 7.1284 + popl %edx 7.1285 + GET_CURPCB(%ecx) 7.1286 + testl %eax,%eax 7.1287 + jnz _C_LABEL(fusufault) 7.1288 + 7.1289 +1: /* XXX also need to check the following byte for validity! 
*/ 7.1290 +#endif 7.1291 + 7.1292 +2: GET_CURPCB(%ecx) 7.1293 + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) 7.1294 + 7.1295 + movl 8(%esp),%eax 7.1296 + movw %ax,(%edx) 7.1297 + xorl %eax,%eax 7.1298 + movl %eax,PCB_ONFAULT(%ecx) 7.1299 + ret 7.1300 + 7.1301 +/* 7.1302 + * int suswintr(void *uaddr, short x); 7.1303 + * Store a short in the user's address space. Can be called during an 7.1304 + * interrupt. 7.1305 + * see suswintr(9) 7.1306 + */ 7.1307 +/* LINTSTUB: Func: int suswintr(void *base, short c) */ 7.1308 +ENTRY(suswintr) 7.1309 + cmpl $TLBSTATE_VALID, CPUVAR(TLBSTATE) 7.1310 + jnz _C_LABEL(fusuaddrfault) 7.1311 + movl 4(%esp),%edx 7.1312 + cmpl $VM_MAXUSER_ADDRESS-2,%edx 7.1313 + ja _C_LABEL(fusuaddrfault) 7.1314 + movl CPUVAR(CURLWP),%ecx 7.1315 + movl L_ADDR(%ecx),%ecx 7.1316 + movl $_C_LABEL(fusubail),PCB_ONFAULT(%ecx) 7.1317 + 7.1318 +#if defined(I386_CPU) 7.1319 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.1320 + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) 7.1321 + jne 2f 7.1322 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.1323 + 7.1324 + movl %edx,%eax 7.1325 + shrl $PGSHIFT,%eax # calculate pte address 7.1326 + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) 7.1327 + jnz 1f 7.1328 + 7.1329 + /* Simulate a trap. */ 7.1330 + jmp _C_LABEL(fusubail) 7.1331 + 7.1332 +1: /* XXX also need to check the following byte for validity! */ 7.1333 +#endif 7.1334 + 7.1335 +2: movl 8(%esp),%eax 7.1336 + movw %ax,(%edx) 7.1337 + xorl %eax,%eax 7.1338 + movl %eax,PCB_ONFAULT(%ecx) 7.1339 + ret 7.1340 + 7.1341 +/* 7.1342 + * int subyte(void *uaddr, char x); 7.1343 + * Store a byte in the user's address space. 7.1344 + * see subyte(9) 7.1345 + */ 7.1346 +/* LINTSTUB: Func: int subyte(void *base, int c) */ 7.1347 +ENTRY(subyte) 7.1348 + DO_DEFERRED_SWITCH(%eax) 7.1349 + movl 4(%esp),%edx 7.1350 + cmpl $VM_MAXUSER_ADDRESS-1,%edx 7.1351 + ja _C_LABEL(fusuaddrfault) 7.1352 + 7.1353 +#if defined(I386_CPU) 7.1354 +#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) 7.1355 + cmpl $CPUCLASS_386,_C_LABEL(cpu_class) 7.1356 + jne 2f 7.1357 +#endif /* I486_CPU || I586_CPU || I686_CPU */ 7.1358 + 7.1359 + GET_CURPCB(%eax) 7.1360 + movl $3f,PCB_ONFAULT(%eax) 7.1361 + 7.1362 + movl %edx,%eax 7.1363 + shrl $PGSHIFT,%eax # calculate pte address 7.1364 + testb $PG_RW,_C_LABEL(PTmap)(,%eax,4) 7.1365 + jnz 1f 7.1366 + 7.1367 +3: /* Simulate a trap. */ 7.1368 + pushl %edx 7.1369 + pushl %edx 7.1370 + call _C_LABEL(trapwrite) # trapwrite(addr) 7.1371 + addl $4,%esp # clear parameter from the stack 7.1372 + popl %edx 7.1373 + GET_CURPCB(%ecx) 7.1374 + testl %eax,%eax 7.1375 + jnz _C_LABEL(fusufault) 7.1376 + 7.1377 +1: 7.1378 +#endif 7.1379 + 7.1380 +2: GET_CURPCB(%ecx) 7.1381 + movl $_C_LABEL(fusufault),PCB_ONFAULT(%ecx) 7.1382 + 7.1383 + movb 8(%esp),%al 7.1384 + movb %al,(%edx) 7.1385 + xorl %eax,%eax 7.1386 + movl %eax,PCB_ONFAULT(%ecx) 7.1387 + ret 7.1388 + 7.1389 +/*****************************************************************************/ 7.1390 + 7.1391 +/* 7.1392 + * The following is i386-specific nonsense. 7.1393 + */ 7.1394 + 7.1395 +/* 7.1396 + * void lgdt_finish(void); 7.1397 + * Finish load a new GDT pointer (do any necessary cleanup). 7.1398 + * XXX It's somewhat questionable whether reloading all the segment registers 7.1399 + * is necessary, since the actual descriptor data is not changed except by 7.1400 + * process creation and exit, both of which clean up via task switches. OTOH, 7.1401 + * this only happens at run time when the GDT is resized. 
7.1402 + */ 7.1403 +/* LINTSTUB: Func: void lgdt_finish(void) */ 7.1404 +NENTRY(lgdt_finish) 7.1405 + movl $GSEL(GDATA_SEL, SEL_KPL),%eax 7.1406 + movw %ax,%ds 7.1407 + movw %ax,%es 7.1408 + movw %ax,%gs 7.1409 + movw %ax,%ss 7.1410 + movl $GSEL(GCPU_SEL, SEL_KPL),%eax 7.1411 + movw %ax,%fs 7.1412 + /* Reload code selector by doing intersegment return. */ 7.1413 + popl %eax 7.1414 + pushl $GSEL(GCODE_SEL, SEL_KPL) 7.1415 + pushl %eax 7.1416 + lret 7.1417 + 7.1418 +/*****************************************************************************/ 7.1419 + 7.1420 +/* 7.1421 + * These functions are primarily used by DDB. 7.1422 + */ 7.1423 + 7.1424 +/* LINTSTUB: Func: int setjmp (label_t *l) */ 7.1425 +ENTRY(setjmp) 7.1426 + movl 4(%esp),%eax 7.1427 + movl %ebx,(%eax) # save ebx 7.1428 + movl %esp,4(%eax) # save esp 7.1429 + movl %ebp,8(%eax) # save ebp 7.1430 + movl %esi,12(%eax) # save esi 7.1431 + movl %edi,16(%eax) # save edi 7.1432 + movl (%esp),%edx # get rta 7.1433 + movl %edx,20(%eax) # save eip 7.1434 + xorl %eax,%eax # return (0); 7.1435 + ret 7.1436 + 7.1437 +/* LINTSTUB: Func: void longjmp (label_t *l) */ 7.1438 +ENTRY(longjmp) 7.1439 + movl 4(%esp),%eax 7.1440 + movl (%eax),%ebx # restore ebx 7.1441 + movl 4(%eax),%esp # restore esp 7.1442 + movl 8(%eax),%ebp # restore ebp 7.1443 + movl 12(%eax),%esi # restore esi 7.1444 + movl 16(%eax),%edi # restore edi 7.1445 + movl 20(%eax),%edx # get rta 7.1446 + movl %edx,(%esp) # put in return frame 7.1447 + xorl %eax,%eax # return (1); 7.1448 + incl %eax 7.1449 + ret 7.1450 + 7.1451 +/*****************************************************************************/ 7.1452 + 7.1453 + .globl _C_LABEL(sched_whichqs),_C_LABEL(sched_qs) 7.1454 + .globl _C_LABEL(uvmexp),_C_LABEL(panic) 7.1455 + 7.1456 +#ifdef DIAGNOSTIC 7.1457 +NENTRY(switch_error) 7.1458 + pushl $1f 7.1459 +3: call _C_LABEL(panic) 7.1460 + /* NOTREACHED */ 7.1461 +1: .asciz "cpu_switch" 7.1462 +#endif /* DIAGNOSTIC */ 7.1463 + 7.1464 +/* 7.1465 + * void cpu_switch(struct lwp *) 7.1466 + * Find a runnable process and switch to it. Wait if necessary. If the new 7.1467 + * process is the same as the old one, we short-circuit the context save and 7.1468 + * restore. 7.1469 + * 7.1470 + * Note that the stack frame layout is known to "struct switchframe" 7.1471 + * in <machine/frame.h> and to the code in cpu_fork() which initializes 7.1472 + * it for a new lwp. 7.1473 + */ 7.1474 +ENTRY(cpu_switch) 7.1475 + pushl %ebx 7.1476 + pushl %esi 7.1477 + pushl %edi 7.1478 + 7.1479 +#ifdef DEBUG 7.1480 + cmpl $IPL_SCHED,CPUVAR(ILEVEL) 7.1481 + jae 1f 7.1482 + pushl $2f 7.1483 + call _C_LABEL(panic) 7.1484 + /* NOTREACHED */ 7.1485 +2: .asciz "not splsched() in cpu_switch!" 7.1486 +1: 7.1487 +#endif /* DEBUG */ 7.1488 + 7.1489 + movl 16(%esp),%esi # current 7.1490 + 7.1491 + /* 7.1492 + * Clear curlwp so that we don't accumulate system time while idle. 7.1493 + * This also insures that schedcpu() will move the old lwp to 7.1494 + * the correct queue if it happens to get called from the spllower() 7.1495 + * below and changes the priority. (See corresponding comment in 7.1496 + * userret()). 7.1497 + */ 7.1498 + movl $0,CPUVAR(CURLWP) 7.1499 + /* 7.1500 + * First phase: find new lwp. 7.1501 + * 7.1502 + * Registers: 7.1503 + * %eax - queue head, scratch, then zero 7.1504 + * %ebx - queue number 7.1505 + * %ecx - cached value of whichqs 7.1506 + * %edx - next lwp in queue 7.1507 + * %esi - old lwp 7.1508 + * %edi - new lwp 7.1509 + */ 7.1510 + 7.1511 + /* Look for new lwp. 
*/ 7.1512 + CLI(%ecx) # splhigh doesn't do a cli 7.1513 + movl _C_LABEL(sched_whichqs),%ecx 7.1514 + bsfl %ecx,%ebx # find a full q 7.1515 + jnz switch_dequeue 7.1516 + 7.1517 + /* 7.1518 + * idling: save old context. 7.1519 + * 7.1520 + * Registers: 7.1521 + * %eax, %ecx - scratch 7.1522 + * %esi - old lwp, then old pcb 7.1523 + * %edi - idle pcb 7.1524 + */ 7.1525 + 7.1526 + pushl %esi 7.1527 + call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc) 7.1528 + addl $4,%esp 7.1529 + 7.1530 + movl L_ADDR(%esi),%esi 7.1531 + 7.1532 + /* Save stack pointers. */ 7.1533 + movl %esp,PCB_ESP(%esi) 7.1534 + movl %ebp,PCB_EBP(%esi) 7.1535 + 7.1536 + /* Find idle PCB for this CPU */ 7.1537 +#ifndef MULTIPROCESSOR 7.1538 + movl $_C_LABEL(lwp0),%ebx 7.1539 + movl L_ADDR(%ebx),%edi 7.1540 + movl L_MD_TSS_SEL(%ebx),%edx 7.1541 +#else 7.1542 + movl CPUVAR(IDLE_PCB),%edi 7.1543 + movl CPUVAR(IDLE_TSS_SEL),%edx 7.1544 +#endif 7.1545 + movl $0,CPUVAR(CURLWP) /* In case we fault... */ 7.1546 + 7.1547 + /* Restore the idle context (avoid interrupts) */ 7.1548 + CLI(%ecx) 7.1549 + 7.1550 + /* Restore stack pointers. */ 7.1551 + movl PCB_ESP(%edi),%esp 7.1552 + movl PCB_EBP(%edi),%ebp 7.1553 + 7.1554 + pushl %edi 7.1555 + call _C_LABEL(i386_switch_context) 7.1556 + addl $4,%esp 7.1557 + 7.1558 + /* Record new pcb. */ 7.1559 + SET_CURPCB(%edi) 7.1560 + 7.1561 + xorl %esi,%esi 7.1562 + STI(%eax) 7.1563 +idle_unlock: 7.1564 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) 7.1565 + call _C_LABEL(sched_unlock_idle) 7.1566 +#endif 7.1567 + /* Interrupts are okay again. */ 7.1568 + pushl $IPL_NONE # spl0() 7.1569 + call _C_LABEL(Xspllower) # process pending interrupts 7.1570 + addl $4,%esp 7.1571 + jmp idle_start 7.1572 +idle_zero: 7.1573 + STIC(%eax) 7.1574 + jz 4f 7.1575 + call _C_LABEL(stipending) 7.1576 + testl %eax,%eax 7.1577 + jz 4f 7.1578 + pushl $IPL_NONE 7.1579 + call _C_LABEL(Xspllower) 7.1580 + addl $4,%esp 7.1581 +4: 7.1582 + call _C_LABEL(uvm_pageidlezero) 7.1583 + CLI(%eax) 7.1584 + cmpl $0,_C_LABEL(sched_whichqs) 7.1585 + jnz idle_exit 7.1586 +idle_loop: 7.1587 + /* Try to zero some pages. */ 7.1588 + movl _C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx 7.1589 + testl %ecx,%ecx 7.1590 + jnz idle_zero 7.1591 + STIC(%eax) 7.1592 + jz 4f 7.1593 + call _C_LABEL(stipending) 7.1594 + testl %eax,%eax 7.1595 + jz 4f 7.1596 + pushl $IPL_NONE 7.1597 + call _C_LABEL(Xspllower) 7.1598 + addl $4,%esp 7.1599 + jmp idle_start 7.1600 +4: 7.1601 + movl $__HYPERVISOR_yield,%eax 7.1602 + TRAP_INSTR 7.1603 +NENTRY(mpidle) 7.1604 +idle_start: 7.1605 + CLI(%eax) 7.1606 + cmpl $0,_C_LABEL(sched_whichqs) 7.1607 + jz idle_loop 7.1608 +idle_exit: 7.1609 + movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh 7.1610 + STI(%eax) 7.1611 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) 7.1612 + call _C_LABEL(sched_lock_idle) 7.1613 +#endif 7.1614 + movl _C_LABEL(sched_whichqs),%ecx 7.1615 + bsfl %ecx,%ebx 7.1616 + jz idle_unlock 7.1617 + 7.1618 +#ifdef XENDEBUG_LOW 7.1619 + pushl %ecx 7.1620 + call _C_LABEL(xen_dbg1) 7.1621 + xorl %ecx,%ecx 7.1622 + movl %ecx,_C_LABEL(xen_once) 7.1623 + popl %ecx 7.1624 +#endif 7.1625 +switch_dequeue: 7.1626 + /* 7.1627 + * we're running at splhigh(), but it's otherwise okay to take 7.1628 + * interrupts here. 7.1629 + */ 7.1630 + STI(%edi) 7.1631 + leal _C_LABEL(sched_qs)(,%ebx,8),%eax # select q 7.1632 + 7.1633 + movl L_FORW(%eax),%edi # unlink from front of process q 7.1634 +#ifdef DIAGNOSTIC 7.1635 + cmpl %edi,%eax # linked to self (i.e. nothing queued)? 
7.1636 + je _C_LABEL(switch_error) # not possible 7.1637 +#endif /* DIAGNOSTIC */ 7.1638 + movl L_FORW(%edi),%edx 7.1639 + movl %edx,L_FORW(%eax) 7.1640 + movl %eax,L_BACK(%edx) 7.1641 + 7.1642 + cmpl %edx,%eax # q empty? 7.1643 + jne 3f 7.1644 + 7.1645 + btrl %ebx,%ecx # yes, clear to indicate empty 7.1646 + movl %ecx,_C_LABEL(sched_whichqs) # update q status 7.1647 + 7.1648 +3: /* We just did it. */ 7.1649 + xorl %eax,%eax 7.1650 + CLEAR_RESCHED(%eax) 7.1651 + 7.1652 +switch_resume: 7.1653 +#ifdef DIAGNOSTIC 7.1654 + cmpl %eax,L_WCHAN(%edi) # Waiting for something? 7.1655 + jne _C_LABEL(switch_error) # Yes; shouldn't be queued. 7.1656 + cmpb $LSRUN,L_STAT(%edi) # In run state? 7.1657 + jne _C_LABEL(switch_error) # No; shouldn't be queued. 7.1658 +#endif /* DIAGNOSTIC */ 7.1659 + 7.1660 + /* Isolate lwp. XXX Is this necessary? */ 7.1661 + movl %eax,L_BACK(%edi) 7.1662 + 7.1663 + /* Record new lwp. */ 7.1664 + movb $LSONPROC,L_STAT(%edi) # l->l_stat = LSONPROC 7.1665 + SET_CURLWP(%edi,%ecx) 7.1666 + 7.1667 + /* Skip context switch if same lwp. */ 7.1668 + xorl %ebx,%ebx 7.1669 + cmpl %edi,%esi 7.1670 + je switch_return 7.1671 + 7.1672 + /* If old lwp exited, don't bother. */ 7.1673 + testl %esi,%esi 7.1674 + jz switch_exited 7.1675 + 7.1676 + /* 7.1677 + * Second phase: save old context. 7.1678 + * 7.1679 + * Registers: 7.1680 + * %eax, %ecx - scratch 7.1681 + * %esi - old lwp, then old pcb 7.1682 + * %edi - new lwp 7.1683 + */ 7.1684 + 7.1685 + pushl %esi 7.1686 + call _C_LABEL(pmap_deactivate2) # pmap_deactivate(oldproc) 7.1687 + addl $4,%esp 7.1688 + 7.1689 + movl L_ADDR(%esi),%esi 7.1690 + 7.1691 + /* Save stack pointers. */ 7.1692 + movl %esp,PCB_ESP(%esi) 7.1693 + movl %ebp,PCB_EBP(%esi) 7.1694 + 7.1695 +switch_exited: 7.1696 + /* 7.1697 + * Third phase: restore saved context. 7.1698 + * 7.1699 + * Registers: 7.1700 + * %eax, %ebx, %ecx, %edx - scratch 7.1701 + * %esi - new pcb 7.1702 + * %edi - new lwp 7.1703 + */ 7.1704 + 7.1705 + /* No interrupts while loading new state. */ 7.1706 + CLI(%eax) 7.1707 + movl L_ADDR(%edi),%esi 7.1708 + 7.1709 + /* Restore stack pointers. */ 7.1710 + movl PCB_ESP(%esi),%esp 7.1711 + movl PCB_EBP(%esi),%ebp 7.1712 + 7.1713 +#if 0 7.1714 + /* Don't bother with the rest if switching to a system process. */ 7.1715 + testl $P_SYSTEM,L_FLAG(%edi); XXX NJWLWP lwp's don't have P_SYSTEM! 7.1716 + jnz switch_restored ; XXX skip stack_switch+pmap_activate 7.1717 +#endif 7.1718 + 7.1719 + pushl %edi 7.1720 + call _C_LABEL(pmap_activate) # pmap_activate(p) 7.1721 + addl $4,%esp 7.1722 + 7.1723 + pushl %esi 7.1724 + call _C_LABEL(i386_switch_context) 7.1725 + addl $4,%esp 7.1726 + 7.1727 + /* Record new pcb. */ 7.1728 + SET_CURPCB(%esi) 7.1729 + 7.1730 + /* Interrupts are okay again. 
*/ 7.1731 + STI(%edi) 7.1732 + 7.1733 +/* 7.1734 + * Check for restartable atomic sequences (RAS) 7.1735 + */ 7.1736 + movl CPUVAR(CURLWP),%edi 7.1737 + movl L_PROC(%edi),%esi 7.1738 + cmpl $0,P_RASLIST(%esi) 7.1739 + jne 2f 7.1740 +1: 7.1741 + movl $1,%ebx 7.1742 + 7.1743 +switch_return: 7.1744 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) 7.1745 + call _C_LABEL(sched_unlock_idle) 7.1746 +#endif 7.1747 + pushl $IPL_NONE # spl0() 7.1748 + call _C_LABEL(Xspllower) # process pending interrupts 7.1749 + addl $4,%esp 7.1750 + movl $IPL_HIGH,CPUVAR(ILEVEL) # splhigh() 7.1751 + 7.1752 + movl %ebx,%eax 7.1753 + 7.1754 + popl %edi 7.1755 + popl %esi 7.1756 + popl %ebx 7.1757 + ret 7.1758 + 7.1759 +2: # check RAS list 7.1760 + movl L_MD_REGS(%edi),%ebx 7.1761 + movl TF_EIP(%ebx),%eax 7.1762 + pushl %eax 7.1763 + pushl %esi 7.1764 + call _C_LABEL(ras_lookup) 7.1765 + addl $8,%esp 7.1766 + cmpl $-1,%eax 7.1767 + je 1b 7.1768 + movl %eax,TF_EIP(%ebx) 7.1769 + jmp 1b 7.1770 + 7.1771 +/* 7.1772 + * void cpu_switchto(struct lwp *current, struct lwp *next) 7.1773 + * Switch to the specified next LWP. 7.1774 + */ 7.1775 +ENTRY(cpu_switchto) 7.1776 + pushl %ebx 7.1777 + pushl %esi 7.1778 + pushl %edi 7.1779 + 7.1780 +#ifdef DEBUG 7.1781 + cmpl $IPL_SCHED,CPUVAR(ILEVEL) 7.1782 + jae 1f 7.1783 + pushl $2f 7.1784 + call _C_LABEL(panic) 7.1785 + /* NOTREACHED */ 7.1786 +2: .asciz "not splsched() in cpu_switchto!" 7.1787 +1: 7.1788 +#endif /* DEBUG */ 7.1789 + 7.1790 + movl 16(%esp),%esi # current 7.1791 + movl 20(%esp),%edi # next 7.1792 + 7.1793 + /* 7.1794 + * Clear curlwp so that we don't accumulate system time while idle. 7.1795 + * This also insures that schedcpu() will move the old process to 7.1796 + * the correct queue if it happens to get called from the spllower() 7.1797 + * below and changes the priority. (See corresponding comment in 7.1798 + * usrret()). 7.1799 + * 7.1800 + * XXX Is this necessary? We know we won't go idle. 7.1801 + */ 7.1802 + movl $0,CPUVAR(CURLWP) 7.1803 + 7.1804 + /* 7.1805 + * We're running at splhigh(), but it's otherwise okay to take 7.1806 + * interrupts here. 7.1807 + */ 7.1808 + STI(%eax) 7.1809 + 7.1810 + /* Jump into the middle of cpu_switch */ 7.1811 + xorl %eax,%eax 7.1812 + jmp switch_resume 7.1813 + 7.1814 +/* 7.1815 + * void cpu_exit(struct lwp *l) 7.1816 + * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's 7.1817 + * if multiprocessor) and deallocate the address space and kernel stack for p. 7.1818 + * Then jump into cpu_switch(), as if we were in the idle proc all along. 7.1819 + */ 7.1820 +#ifndef MULTIPROCESSOR 7.1821 + .globl _C_LABEL(lwp0) 7.1822 +#endif 7.1823 + .globl _C_LABEL(uvmspace_free),_C_LABEL(kernel_map) 7.1824 + .globl _C_LABEL(uvm_km_free),_C_LABEL(tss_free) 7.1825 +/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */ 7.1826 +ENTRY(cpu_exit) 7.1827 + movl 4(%esp),%edi # old process 7.1828 +#ifndef MULTIPROCESSOR 7.1829 + movl $_C_LABEL(lwp0),%ebx 7.1830 + movl L_ADDR(%ebx),%esi 7.1831 + movl L_MD_TSS_SEL(%ebx),%edx 7.1832 +#else 7.1833 + movl CPUVAR(IDLE_PCB),%esi 7.1834 + movl CPUVAR(IDLE_TSS_SEL),%edx 7.1835 +#endif 7.1836 + /* In case we fault... */ 7.1837 + movl $0,CPUVAR(CURLWP) 7.1838 + 7.1839 + /* Restore the idle context. */ 7.1840 + CLI(%eax) 7.1841 + 7.1842 + /* Restore stack pointers. */ 7.1843 + movl PCB_ESP(%esi),%esp 7.1844 + movl PCB_EBP(%esi),%ebp 7.1845 + 7.1846 + pushl %esi 7.1847 + call _C_LABEL(i386_switch_context) 7.1848 + addl $4,%esp 7.1849 + 7.1850 + /* Record new pcb. 
*/ 7.1851 + SET_CURPCB(%esi) 7.1852 + 7.1853 + /* Interrupts are okay again. */ 7.1854 + STI(%eax) 7.1855 + 7.1856 + /* 7.1857 + * Schedule the dead LWP's stack to be freed. 7.1858 + */ 7.1859 + pushl %edi 7.1860 + call _C_LABEL(lwp_exit2) 7.1861 + addl $4,%esp 7.1862 + 7.1863 + /* Jump into cpu_switch() with the right state. */ 7.1864 + xorl %esi,%esi 7.1865 + movl %esi,CPUVAR(CURLWP) 7.1866 + jmp idle_start 7.1867 + 7.1868 +/* 7.1869 + * void savectx(struct pcb *pcb); 7.1870 + * Update pcb, saving current processor state. 7.1871 + */ 7.1872 +/* LINTSTUB: Func: void savectx(struct pcb *pcb) */ 7.1873 +ENTRY(savectx) 7.1874 + movl 4(%esp),%edx # edx = p->p_addr 7.1875 + 7.1876 + /* Save stack pointers. */ 7.1877 + movl %esp,PCB_ESP(%edx) 7.1878 + movl %ebp,PCB_EBP(%edx) 7.1879 + 7.1880 + ret 7.1881 + 7.1882 +/* 7.1883 + * Old call gate entry for syscall 7.1884 + */ 7.1885 +/* LINTSTUB: Var: char Xosyscall[1]; */ 7.1886 +IDTVEC(osyscall) 7.1887 + /* Set eflags in trap frame. */ 7.1888 + pushfl 7.1889 + popl 8(%esp) 7.1890 + pushl $7 # size of instruction for restart 7.1891 + jmp syscall1 7.1892 + 7.1893 +/* 7.1894 + * Trap gate entry for syscall 7.1895 + */ 7.1896 +/* LINTSTUB: Var: char Xsyscall[1]; */ 7.1897 +IDTVEC(syscall) 7.1898 + pushl $2 # size of instruction for restart 7.1899 +syscall1: 7.1900 + pushl $T_ASTFLT # trap # for doing ASTs 7.1901 + INTRENTRY 7.1902 + 7.1903 +#ifdef DIAGNOSTIC 7.1904 + cmpl $0, CPUVAR(WANT_PMAPLOAD) 7.1905 + jz 1f 7.1906 + pushl $6f 7.1907 + call _C_LABEL(printf) 7.1908 + addl $4, %esp 7.1909 +1: 7.1910 + movl CPUVAR(ILEVEL),%ebx 7.1911 + testl %ebx,%ebx 7.1912 + jz 1f 7.1913 + pushl $5f 7.1914 + call _C_LABEL(printf) 7.1915 + addl $4,%esp 7.1916 +#ifdef DDB 7.1917 + int $3 7.1918 +#endif 7.1919 +1: 7.1920 +#endif /* DIAGNOSTIC */ 7.1921 + movl CPUVAR(CURLWP),%edx 7.1922 + movl %esp,L_MD_REGS(%edx) # save pointer to frame 7.1923 + movl L_PROC(%edx),%edx 7.1924 + pushl %esp 7.1925 + call *P_MD_SYSCALL(%edx) # get pointer to syscall() function 7.1926 + addl $4,%esp 7.1927 +syscall_checkast: 7.1928 + /* Check for ASTs on exit to user mode. */ 7.1929 + CLI(%eax) 7.1930 + CHECK_ASTPENDING(%eax) 7.1931 + je 1f 7.1932 + /* Always returning to user mode here. */ 7.1933 + CLEAR_ASTPENDING(%eax) 7.1934 + STI(%eax) 7.1935 + /* Pushed T_ASTFLT into tf_trapno on entry. */ 7.1936 + pushl %esp 7.1937 + call _C_LABEL(trap) 7.1938 + addl $4,%esp 7.1939 + jmp syscall_checkast 7.1940 +1: STI(%eax) 7.1941 + CHECK_DEFERRED_SWITCH(%eax) 7.1942 + jnz 9f 7.1943 +#ifndef DIAGNOSTIC 7.1944 + INTRFASTEXIT 7.1945 +#else /* DIAGNOSTIC */ 7.1946 + cmpl $IPL_NONE,CPUVAR(ILEVEL) 7.1947 + jne 3f 7.1948 + INTRFASTEXIT 7.1949 +3: pushl $4f 7.1950 + call _C_LABEL(printf) 7.1951 + addl $4,%esp 7.1952 +#ifdef DDB 7.1953 + int $3 7.1954 +#endif /* DDB */ 7.1955 + movl $IPL_NONE,CPUVAR(ILEVEL) 7.1956 + jmp 2b 7.1957 +4: .asciz "WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n" 7.1958 +5: .asciz "WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n" 7.1959 +6: .asciz "WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n" 7.1960 +#endif /* DIAGNOSTIC */ 7.1961 +9: call _C_LABEL(pmap_load) 7.1962 + jmp syscall_checkast /* re-check ASTs */ 7.1963 + 7.1964 +#if NNPX > 0 7.1965 +/* 7.1966 + * Special interrupt handlers. Someday intr0-intr15 will be used to count 7.1967 + * interrupts. We'll still need a special exception 16 handler. The busy 7.1968 + * latch stuff in probintr() can be moved to npxprobe(). 
7.1969 + */ 7.1970 + 7.1971 +/* LINTSTUB: Func: void probeintr(void) */ 7.1972 +NENTRY(probeintr) 7.1973 + ss 7.1974 + incl _C_LABEL(npx_intrs_while_probing) 7.1975 + pushl %eax 7.1976 + movb $0x20,%al # EOI (asm in strings loses cpp features) 7.1977 + outb %al,$0xa0 # IO_ICU2 7.1978 + outb %al,$0x20 # IO_ICU1 7.1979 + movb $0,%al 7.1980 + outb %al,$0xf0 # clear BUSY# latch 7.1981 + popl %eax 7.1982 + iret 7.1983 + 7.1984 +/* LINTSTUB: Func: void probetrap(void) */ 7.1985 +NENTRY(probetrap) 7.1986 + ss 7.1987 + incl _C_LABEL(npx_traps_while_probing) 7.1988 + fnclex 7.1989 + iret 7.1990 + 7.1991 +/* LINTSTUB: Func: int npx586bug1(int a, int b) */ 7.1992 +NENTRY(npx586bug1) 7.1993 + fildl 4(%esp) # x 7.1994 + fildl 8(%esp) # y 7.1995 + fld %st(1) 7.1996 + fdiv %st(1),%st # x/y 7.1997 + fmulp %st,%st(1) # (x/y)*y 7.1998 + fsubrp %st,%st(1) # x-(x/y)*y 7.1999 + pushl $0 7.2000 + fistpl (%esp) 7.2001 + popl %eax 7.2002 + ret 7.2003 +#endif /* NNPX > 0 */
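The npx586bug1 routine above is the classic Pentium FDIV-bug probe: it computes x - (x/y)*y in the FPU, which is exactly zero on a correct chip and non-zero on an affected one (callers traditionally pass 4195835 and 3145727). A minimal stand-alone C sketch of the same check, offered only as an illustration and not part of this changeset (the fdiv_bug_present helper is hypothetical):

	#include <stdio.h>

	/*
	 * User-space equivalent of npx586bug1(): on a flawed Pentium
	 * FDIV unit, x - (x/y)*y comes out non-zero for these operands;
	 * volatile keeps the compiler from folding the expression at
	 * build time so the FPU actually performs the division.
	 */
	static int
	fdiv_bug_present(void)
	{
		volatile double x = 4195835.0, y = 3145727.0;

		return (x - (x / y) * y) != 0.0;
	}

	int
	main(void)
	{
		printf("FDIV bug %s\n",
		    fdiv_bug_present() ? "present" : "absent");
		return 0;
	}

In the kernel the equivalent test is done once at FPU attach time and the result is recorded (here in i386_fpu_fdivbug) rather than re-checked on every use.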
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c Mon Sep 06 19:04:16 2004 +0000 8.3 @@ -0,0 +1,2561 @@ 8.4 +/* $NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $ */ 8.5 +/* NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp */ 8.6 + 8.7 +/*- 8.8 + * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc. 8.9 + * All rights reserved. 8.10 + * 8.11 + * This code is derived from software contributed to The NetBSD Foundation 8.12 + * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace 8.13 + * Simulation Facility, NASA Ames Research Center. 8.14 + * 8.15 + * Redistribution and use in source and binary forms, with or without 8.16 + * modification, are permitted provided that the following conditions 8.17 + * are met: 8.18 + * 1. Redistributions of source code must retain the above copyright 8.19 + * notice, this list of conditions and the following disclaimer. 8.20 + * 2. Redistributions in binary form must reproduce the above copyright 8.21 + * notice, this list of conditions and the following disclaimer in the 8.22 + * documentation and/or other materials provided with the distribution. 8.23 + * 3. All advertising materials mentioning features or use of this software 8.24 + * must display the following acknowledgement: 8.25 + * This product includes software developed by the NetBSD 8.26 + * Foundation, Inc. and its contributors. 8.27 + * 4. Neither the name of The NetBSD Foundation nor the names of its 8.28 + * contributors may be used to endorse or promote products derived 8.29 + * from this software without specific prior written permission. 8.30 + * 8.31 + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 8.32 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 8.33 + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 8.34 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 8.35 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 8.36 + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 8.37 + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 8.38 + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 8.39 + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 8.40 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 8.41 + * POSSIBILITY OF SUCH DAMAGE. 8.42 + */ 8.43 + 8.44 +/*- 8.45 + * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 8.46 + * All rights reserved. 8.47 + * 8.48 + * This code is derived from software contributed to Berkeley by 8.49 + * William Jolitz. 8.50 + * 8.51 + * Redistribution and use in source and binary forms, with or without 8.52 + * modification, are permitted provided that the following conditions 8.53 + * are met: 8.54 + * 1. Redistributions of source code must retain the above copyright 8.55 + * notice, this list of conditions and the following disclaimer. 8.56 + * 2. Redistributions in binary form must reproduce the above copyright 8.57 + * notice, this list of conditions and the following disclaimer in the 8.58 + * documentation and/or other materials provided with the distribution. 8.59 + * 3. Neither the name of the University nor the names of its contributors 8.60 + * may be used to endorse or promote products derived from this software 8.61 + * without specific prior written permission. 
8.62 + * 8.63 + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 8.64 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 8.65 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 8.66 + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 8.67 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 8.68 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 8.69 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 8.70 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 8.71 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 8.72 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 8.73 + * SUCH DAMAGE. 8.74 + * 8.75 + * @(#)machdep.c 7.4 (Berkeley) 6/3/91 8.76 + */ 8.77 + 8.78 +#include <sys/cdefs.h> 8.79 +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $"); 8.80 + 8.81 +#include "opt_beep.h" 8.82 +#include "opt_compat_ibcs2.h" 8.83 +#include "opt_compat_mach.h" /* need to get the right segment def */ 8.84 +#include "opt_compat_netbsd.h" 8.85 +#include "opt_compat_svr4.h" 8.86 +#include "opt_cpureset_delay.h" 8.87 +#include "opt_cputype.h" 8.88 +#include "opt_ddb.h" 8.89 +#include "opt_ipkdb.h" 8.90 +#include "opt_kgdb.h" 8.91 +#include "opt_mtrr.h" 8.92 +#include "opt_multiprocessor.h" 8.93 +#include "opt_realmem.h" 8.94 +#include "opt_user_ldt.h" 8.95 +#include "opt_vm86.h" 8.96 +#include "opt_xen.h" 8.97 + 8.98 +#include <sys/param.h> 8.99 +#include <sys/systm.h> 8.100 +#include <sys/signal.h> 8.101 +#include <sys/signalvar.h> 8.102 +#include <sys/kernel.h> 8.103 +#include <sys/proc.h> 8.104 +#include <sys/user.h> 8.105 +#include <sys/exec.h> 8.106 +#include <sys/buf.h> 8.107 +#include <sys/reboot.h> 8.108 +#include <sys/conf.h> 8.109 +#include <sys/file.h> 8.110 +#include <sys/malloc.h> 8.111 +#include <sys/mbuf.h> 8.112 +#include <sys/msgbuf.h> 8.113 +#include <sys/mount.h> 8.114 +#include <sys/vnode.h> 8.115 +#include <sys/extent.h> 8.116 +#include <sys/syscallargs.h> 8.117 +#include <sys/core.h> 8.118 +#include <sys/kcore.h> 8.119 +#include <sys/ucontext.h> 8.120 +#include <machine/kcore.h> 8.121 +#include <sys/ras.h> 8.122 +#include <sys/sa.h> 8.123 +#include <sys/savar.h> 8.124 +#include <sys/ksyms.h> 8.125 + 8.126 +#ifdef IPKDB 8.127 +#include <ipkdb/ipkdb.h> 8.128 +#endif 8.129 + 8.130 +#ifdef KGDB 8.131 +#include <sys/kgdb.h> 8.132 +#endif 8.133 + 8.134 +#include <dev/cons.h> 8.135 + 8.136 +#include <uvm/uvm_extern.h> 8.137 +#include <uvm/uvm_page.h> 8.138 + 8.139 +#include <sys/sysctl.h> 8.140 + 8.141 +#include <machine/cpu.h> 8.142 +#include <machine/cpufunc.h> 8.143 +#include <machine/cpuvar.h> 8.144 +#include <machine/gdt.h> 8.145 +#include <machine/pio.h> 8.146 +#include <machine/psl.h> 8.147 +#include <machine/reg.h> 8.148 +#include <machine/specialreg.h> 8.149 +#include <machine/bootinfo.h> 8.150 +#include <machine/mtrr.h> 8.151 +#include <machine/evtchn.h> 8.152 + 8.153 +#include <dev/isa/isareg.h> 8.154 +#include <machine/isa_machdep.h> 8.155 +#include <dev/ic/i8042reg.h> 8.156 + 8.157 +#ifdef DDB 8.158 +#include <machine/db_machdep.h> 8.159 +#include <ddb/db_extern.h> 8.160 +#endif 8.161 + 8.162 +#ifdef VM86 8.163 +#include <machine/vm86.h> 8.164 +#endif 8.165 + 8.166 +#include "acpi.h" 8.167 +#include "apm.h" 8.168 +#include "bioscall.h" 8.169 + 8.170 +#if NBIOSCALL > 0 8.171 +#include <machine/bioscall.h> 
8.172 +#endif 8.173 + 8.174 +#if NACPI > 0 8.175 +#include <dev/acpi/acpivar.h> 8.176 +#define ACPI_MACHDEP_PRIVATE 8.177 +#include <machine/acpi_machdep.h> 8.178 +#endif 8.179 + 8.180 +#if NAPM > 0 8.181 +#include <machine/apmvar.h> 8.182 +#endif 8.183 + 8.184 +#include "isa.h" 8.185 +#include "isadma.h" 8.186 +#include "npx.h" 8.187 +#include "ksyms.h" 8.188 + 8.189 +#include "mca.h" 8.190 +#if NMCA > 0 8.191 +#include <machine/mca_machdep.h> /* for mca_busprobe() */ 8.192 +#endif 8.193 + 8.194 +#ifdef MULTIPROCESSOR /* XXX */ 8.195 +#include <machine/mpbiosvar.h> /* XXX */ 8.196 +#endif /* XXX */ 8.197 + 8.198 +#include <machine/xen.h> 8.199 +#include <machine/hypervisor.h> 8.200 + 8.201 +#if defined(DDB) || defined(KGDB) 8.202 +#include <ddb/db_interface.h> 8.203 +#include <ddb/db_output.h> 8.204 + 8.205 +void ddb_trap_hook(int); 8.206 +#endif 8.207 + 8.208 +/* #define XENDEBUG */ 8.209 +/* #define XENDEBUG_LOW */ 8.210 + 8.211 +#ifdef XENDEBUG 8.212 +extern void printk(char *, ...); 8.213 +#define XENPRINTF(x) printf x 8.214 +#define XENPRINTK(x) printk x 8.215 +#else 8.216 +#define XENPRINTF(x) 8.217 +#define XENPRINTK(x) 8.218 +#endif 8.219 +#define PRINTK(x) printf x 8.220 + 8.221 +#ifdef XENDEBUG_LOW 8.222 +void xen_dbglow_init(void); 8.223 +#endif 8.224 + 8.225 +#ifndef BEEP_ONHALT_COUNT 8.226 +#define BEEP_ONHALT_COUNT 3 8.227 +#endif 8.228 +#ifndef BEEP_ONHALT_PITCH 8.229 +#define BEEP_ONHALT_PITCH 1500 8.230 +#endif 8.231 +#ifndef BEEP_ONHALT_PERIOD 8.232 +#define BEEP_ONHALT_PERIOD 250 8.233 +#endif 8.234 + 8.235 +/* the following is used externally (sysctl_hw) */ 8.236 +char machine[] = "i386"; /* CPU "architecture" */ 8.237 +char machine_arch[] = "i386"; /* machine == machine_arch */ 8.238 + 8.239 +char bootinfo[BOOTINFO_MAXSIZE]; 8.240 + 8.241 +struct bi_devmatch *i386_alldisks = NULL; 8.242 +int i386_ndisks = 0; 8.243 + 8.244 +#ifdef CPURESET_DELAY 8.245 +int cpureset_delay = CPURESET_DELAY; 8.246 +#else 8.247 +int cpureset_delay = 2000; /* default to 2s */ 8.248 +#endif 8.249 + 8.250 +#ifdef MTRR 8.251 +struct mtrr_funcs *mtrr_funcs; 8.252 +#endif 8.253 + 8.254 +#ifdef COMPAT_NOMID 8.255 +static int exec_nomid(struct proc *, struct exec_package *); 8.256 +#endif 8.257 + 8.258 +int physmem; 8.259 +int dumpmem_low; 8.260 +int dumpmem_high; 8.261 +unsigned int cpu_feature; 8.262 +int cpu_class; 8.263 +int i386_fpu_present; 8.264 +int i386_fpu_exception; 8.265 +int i386_fpu_fdivbug; 8.266 + 8.267 +int i386_use_fxsave; 8.268 +int i386_has_sse; 8.269 +int i386_has_sse2; 8.270 + 8.271 +int tmx86_has_longrun; 8.272 + 8.273 +vaddr_t msgbuf_vaddr; 8.274 +paddr_t msgbuf_paddr; 8.275 + 8.276 +vaddr_t idt_vaddr; 8.277 +paddr_t idt_paddr; 8.278 + 8.279 +#ifdef I586_CPU 8.280 +vaddr_t pentium_idt_vaddr; 8.281 +#endif 8.282 + 8.283 +struct vm_map *exec_map = NULL; 8.284 +struct vm_map *mb_map = NULL; 8.285 +struct vm_map *phys_map = NULL; 8.286 + 8.287 +extern paddr_t avail_start, avail_end; 8.288 +extern paddr_t pmap_pa_start, pmap_pa_end; 8.289 + 8.290 +#ifdef ISA_CLOCK 8.291 +void (*delay_func)(int) = i8254_delay; 8.292 +void (*microtime_func)(struct timeval *) = i8254_microtime; 8.293 +void (*initclock_func)(void) = i8254_initclocks; 8.294 +#else 8.295 +void (*delay_func)(int) = xen_delay; 8.296 +void (*microtime_func)(struct timeval *) = xen_microtime; 8.297 +void (*initclock_func)(void) = xen_initclocks; 8.298 +#endif 8.299 + 8.300 +void hypervisor_callback(void); 8.301 +void failsafe_callback(void); 8.302 + 8.303 +/* 8.304 + * Size of memory segments, before any memory is stolen. 
8.305 + */ 8.306 +phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; 8.307 +int mem_cluster_cnt; 8.308 + 8.309 +int cpu_dump(void); 8.310 +int cpu_dumpsize(void); 8.311 +u_long cpu_dump_mempagecnt(void); 8.312 +void dumpsys(void); 8.313 +void init386(paddr_t); 8.314 +void initgdt(void); 8.315 + 8.316 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM) 8.317 +void add_mem_cluster(u_int64_t, u_int64_t, u_int32_t); 8.318 +#endif /* !defnied(REALBASEMEM) && !defined(REALEXTMEM) */ 8.319 + 8.320 +extern int time_adjusted; 8.321 + 8.322 +/* 8.323 + * Machine-dependent startup code 8.324 + */ 8.325 +void 8.326 +cpu_startup() 8.327 +{ 8.328 + int x; 8.329 + vaddr_t minaddr, maxaddr; 8.330 + char pbuf[9]; 8.331 + 8.332 + /* 8.333 + * Initialize error message buffer (et end of core). 8.334 + */ 8.335 + msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE)); 8.336 + if (msgbuf_vaddr == 0) 8.337 + panic("failed to valloc msgbuf_vaddr"); 8.338 + 8.339 + /* msgbuf_paddr was init'd in pmap */ 8.340 + for (x = 0; x < btoc(MSGBUFSIZE); x++) 8.341 + pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE, 8.342 + msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE); 8.343 + pmap_update(pmap_kernel()); 8.344 + 8.345 + initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE)); 8.346 + 8.347 + printf("%s", version); 8.348 + 8.349 +#ifdef TRAPLOG 8.350 + /* 8.351 + * Enable recording of branch from/to in MSR's 8.352 + */ 8.353 + wrmsr(MSR_DEBUGCTLMSR, 0x1); 8.354 +#endif 8.355 + 8.356 + format_bytes(pbuf, sizeof(pbuf), ptoa(physmem)); 8.357 + printf("total memory = %s\n", pbuf); 8.358 + 8.359 + minaddr = 0; 8.360 + 8.361 + /* 8.362 + * Allocate a submap for exec arguments. This map effectively 8.363 + * limits the number of processes exec'ing at any time. 8.364 + */ 8.365 + exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 8.366 + 16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL); 8.367 + 8.368 + /* 8.369 + * Allocate a submap for physio 8.370 + */ 8.371 + phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 8.372 + VM_PHYS_SIZE, 0, FALSE, NULL); 8.373 + 8.374 + /* 8.375 + * Finally, allocate mbuf cluster submap. 8.376 + */ 8.377 + mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 8.378 + nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL); 8.379 + 8.380 + format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free)); 8.381 + printf("avail memory = %s\n", pbuf); 8.382 + 8.383 + /* Safe for i/o port / memory space allocation to use malloc now. */ 8.384 + x86_bus_space_mallocok(); 8.385 +} 8.386 + 8.387 +/* 8.388 + * Set up proc0's TSS and LDT. 
8.389 + */ 8.390 +void 8.391 +i386_proc0_tss_ldt_init() 8.392 +{ 8.393 + struct pcb *pcb; 8.394 + int x; 8.395 + 8.396 + gdt_init(); 8.397 + 8.398 + cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb; 8.399 + 8.400 + pcb->pcb_tss.tss_ioopt = 8.401 + ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16 8.402 + | SEL_KPL; /* i/o pl */ 8.403 + 8.404 + for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++) 8.405 + pcb->pcb_iomap[x] = 0xffffffff; 8.406 + 8.407 + pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL); 8.408 + pcb->pcb_cr0 = rcr0(); 8.409 + pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); 8.410 + pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16; 8.411 + lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1; 8.412 + lwp0.l_md.md_tss_sel = tss_alloc(pcb); 8.413 + 8.414 +#ifndef XEN 8.415 + ltr(lwp0.l_md.md_tss_sel); 8.416 + lldt(pcb->pcb_ldt_sel); 8.417 +#else 8.418 + HYPERVISOR_fpu_taskswitch(); 8.419 + XENPRINTF(("lwp tss sp %p ss %04x/%04x\n", 8.420 + (void *)pcb->pcb_tss.tss_esp0, 8.421 + pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0))); 8.422 + HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0); 8.423 +#endif 8.424 +} 8.425 + 8.426 +/* 8.427 + * Set up TSS and LDT for a new PCB. 8.428 + */ 8.429 + 8.430 +void 8.431 +i386_init_pcb_tss_ldt(struct cpu_info *ci) 8.432 +{ 8.433 + int x; 8.434 + struct pcb *pcb = ci->ci_idle_pcb; 8.435 + 8.436 + pcb->pcb_tss.tss_ioopt = 8.437 + ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16 8.438 + | SEL_KPL; /* i/o pl */ 8.439 + for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++) 8.440 + pcb->pcb_iomap[x] = 0xffffffff; 8.441 + 8.442 + pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL); 8.443 + pcb->pcb_cr0 = rcr0(); 8.444 + 8.445 + ci->ci_idle_tss_sel = tss_alloc(pcb); 8.446 +} 8.447 + 8.448 +/* 8.449 + * Switch context: 8.450 + * - honor CR0_TS in saved CR0 and request DNA exception on FPU use 8.451 + * - switch stack pointer for user->kernel transition 8.452 + */ 8.453 +void 8.454 +i386_switch_context(struct pcb *new) 8.455 +{ 8.456 + dom0_op_t op; 8.457 + struct cpu_info *ci; 8.458 + 8.459 + ci = curcpu(); 8.460 + if (ci->ci_fpused) { 8.461 + HYPERVISOR_fpu_taskswitch(); 8.462 + ci->ci_fpused = 0; 8.463 + } 8.464 + 8.465 + HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0); 8.466 + 8.467 + if (xen_start_info.flags & SIF_PRIVILEGED) { 8.468 + op.cmd = DOM0_IOPL; 8.469 + op.u.iopl.domain = DOMID_SELF; 8.470 + op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */ 8.471 + HYPERVISOR_dom0_op(&op); 8.472 + } 8.473 +} 8.474 + 8.475 +/* 8.476 + * sysctl helper routine for machdep.tm* nodes. 
8.477 + */ 8.478 +static int 8.479 +sysctl_machdep_tm_longrun(SYSCTLFN_ARGS) 8.480 +{ 8.481 + struct sysctlnode node; 8.482 + int io, error; 8.483 + 8.484 + if (!tmx86_has_longrun) 8.485 + return (EOPNOTSUPP); 8.486 + 8.487 + node = *rnode; 8.488 + node.sysctl_data = &io; 8.489 + 8.490 + switch (rnode->sysctl_num) { 8.491 + case CPU_TMLR_MODE: 8.492 + io = (int)(crusoe_longrun = tmx86_get_longrun_mode()); 8.493 + break; 8.494 + case CPU_TMLR_FREQUENCY: 8.495 + tmx86_get_longrun_status_all(); 8.496 + io = crusoe_frequency; 8.497 + break; 8.498 + case CPU_TMLR_VOLTAGE: 8.499 + tmx86_get_longrun_status_all(); 8.500 + io = crusoe_voltage; 8.501 + break; 8.502 + case CPU_TMLR_PERCENTAGE: 8.503 + tmx86_get_longrun_status_all(); 8.504 + io = crusoe_percentage; 8.505 + break; 8.506 + default: 8.507 + return (EOPNOTSUPP); 8.508 + } 8.509 + 8.510 + error = sysctl_lookup(SYSCTLFN_CALL(&node)); 8.511 + if (error || newp == NULL) 8.512 + return (error); 8.513 + 8.514 + if (rnode->sysctl_num == CPU_TMLR_MODE) { 8.515 + if (tmx86_set_longrun_mode(io)) 8.516 + crusoe_longrun = (u_int)io; 8.517 + else 8.518 + return (EINVAL); 8.519 + } 8.520 + 8.521 + return (0); 8.522 +} 8.523 + 8.524 +/* 8.525 + * sysctl helper routine for machdep.booted_kernel 8.526 + */ 8.527 +static int 8.528 +sysctl_machdep_booted_kernel(SYSCTLFN_ARGS) 8.529 +{ 8.530 + struct btinfo_bootpath *bibp; 8.531 + struct sysctlnode node; 8.532 + 8.533 + bibp = lookup_bootinfo(BTINFO_BOOTPATH); 8.534 + if(!bibp) 8.535 + return(ENOENT); /* ??? */ 8.536 + 8.537 + node = *rnode; 8.538 + node.sysctl_data = bibp->bootpath; 8.539 + node.sysctl_size = sizeof(bibp->bootpath); 8.540 + return (sysctl_lookup(SYSCTLFN_CALL(&node))); 8.541 +} 8.542 + 8.543 +/* 8.544 + * sysctl helper routine for machdep.diskinfo 8.545 + */ 8.546 +static int 8.547 +sysctl_machdep_diskinfo(SYSCTLFN_ARGS) 8.548 +{ 8.549 + struct sysctlnode node; 8.550 + 8.551 + node = *rnode; 8.552 + node.sysctl_data = i386_alldisks; 8.553 + node.sysctl_size = sizeof(struct disklist) + 8.554 + (i386_ndisks - 1) * sizeof(struct nativedisk_info); 8.555 + return (sysctl_lookup(SYSCTLFN_CALL(&node))); 8.556 +} 8.557 + 8.558 +/* 8.559 + * machine dependent system variables. 
8.560 + */ 8.561 +SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup") 8.562 +{ 8.563 + 8.564 + sysctl_createv(clog, 0, NULL, NULL, 8.565 + CTLFLAG_PERMANENT, 8.566 + CTLTYPE_NODE, "machdep", NULL, 8.567 + NULL, 0, NULL, 0, 8.568 + CTL_MACHDEP, CTL_EOL); 8.569 + 8.570 + sysctl_createv(clog, 0, NULL, NULL, 8.571 + CTLFLAG_PERMANENT, 8.572 + CTLTYPE_STRUCT, "console_device", NULL, 8.573 + sysctl_consdev, 0, NULL, sizeof(dev_t), 8.574 + CTL_MACHDEP, CPU_CONSDEV, CTL_EOL); 8.575 + sysctl_createv(clog, 0, NULL, NULL, 8.576 + CTLFLAG_PERMANENT, 8.577 + CTLTYPE_INT, "biosbasemem", NULL, 8.578 + NULL, 0, &biosbasemem, 0, 8.579 + CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL); 8.580 + sysctl_createv(clog, 0, NULL, NULL, 8.581 + CTLFLAG_PERMANENT, 8.582 + CTLTYPE_INT, "biosextmem", NULL, 8.583 + NULL, 0, &biosextmem, 0, 8.584 + CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL); 8.585 + sysctl_createv(clog, 0, NULL, NULL, 8.586 + CTLFLAG_PERMANENT, 8.587 + CTLTYPE_INT, "nkpde", NULL, 8.588 + NULL, 0, &nkpde, 0, 8.589 + CTL_MACHDEP, CPU_NKPDE, CTL_EOL); 8.590 + sysctl_createv(clog, 0, NULL, NULL, 8.591 + CTLFLAG_PERMANENT, 8.592 + CTLTYPE_STRING, "booted_kernel", NULL, 8.593 + sysctl_machdep_booted_kernel, 0, NULL, 0, 8.594 + CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL); 8.595 + sysctl_createv(clog, 0, NULL, NULL, 8.596 + CTLFLAG_PERMANENT, 8.597 + CTLTYPE_STRUCT, "diskinfo", NULL, 8.598 + sysctl_machdep_diskinfo, 0, NULL, 0, 8.599 + CTL_MACHDEP, CPU_DISKINFO, CTL_EOL); 8.600 + sysctl_createv(clog, 0, NULL, NULL, 8.601 + CTLFLAG_PERMANENT, 8.602 + CTLTYPE_INT, "fpu_present", NULL, 8.603 + NULL, 0, &i386_fpu_present, 0, 8.604 + CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL); 8.605 + sysctl_createv(clog, 0, NULL, NULL, 8.606 + CTLFLAG_PERMANENT, 8.607 + CTLTYPE_INT, "osfxsr", NULL, 8.608 + NULL, 0, &i386_use_fxsave, 0, 8.609 + CTL_MACHDEP, CPU_OSFXSR, CTL_EOL); 8.610 + sysctl_createv(clog, 0, NULL, NULL, 8.611 + CTLFLAG_PERMANENT, 8.612 + CTLTYPE_INT, "sse", NULL, 8.613 + NULL, 0, &i386_has_sse, 0, 8.614 + CTL_MACHDEP, CPU_SSE, CTL_EOL); 8.615 + sysctl_createv(clog, 0, NULL, NULL, 8.616 + CTLFLAG_PERMANENT, 8.617 + CTLTYPE_INT, "sse2", NULL, 8.618 + NULL, 0, &i386_has_sse2, 0, 8.619 + CTL_MACHDEP, CPU_SSE2, CTL_EOL); 8.620 + sysctl_createv(clog, 0, NULL, NULL, 8.621 + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 8.622 + CTLTYPE_INT, "tm_longrun_mode", NULL, 8.623 + sysctl_machdep_tm_longrun, 0, NULL, 0, 8.624 + CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL); 8.625 + sysctl_createv(clog, 0, NULL, NULL, 8.626 + CTLFLAG_PERMANENT, 8.627 + CTLTYPE_INT, "tm_longrun_frequency", NULL, 8.628 + sysctl_machdep_tm_longrun, 0, NULL, 0, 8.629 + CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL); 8.630 + sysctl_createv(clog, 0, NULL, NULL, 8.631 + CTLFLAG_PERMANENT, 8.632 + CTLTYPE_INT, "tm_longrun_voltage", NULL, 8.633 + sysctl_machdep_tm_longrun, 0, NULL, 0, 8.634 + CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL); 8.635 + sysctl_createv(clog, 0, NULL, NULL, 8.636 + CTLFLAG_PERMANENT, 8.637 + CTLTYPE_INT, "tm_longrun_percentage", NULL, 8.638 + sysctl_machdep_tm_longrun, 0, NULL, 0, 8.639 + CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL); 8.640 +} 8.641 + 8.642 +void * 8.643 +getframe(struct lwp *l, int sig, int *onstack) 8.644 +{ 8.645 + struct proc *p = l->l_proc; 8.646 + struct sigctx *ctx = &p->p_sigctx; 8.647 + struct trapframe *tf = l->l_md.md_regs; 8.648 + 8.649 + /* Do we need to jump onto the signal stack? 
*/ 8.650 + *onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 8.651 + && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0; 8.652 + if (*onstack) 8.653 + return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size; 8.654 +#ifdef VM86 8.655 + if (tf->tf_eflags & PSL_VM) 8.656 + return (void *)(tf->tf_esp + (tf->tf_ss << 4)); 8.657 + else 8.658 +#endif 8.659 + return (void *)tf->tf_esp; 8.660 +} 8.661 + 8.662 +/* 8.663 + * Build context to run handler in. We invoke the handler 8.664 + * directly, only returning via the trampoline. Note the 8.665 + * trampoline version numbers are coordinated with machine- 8.666 + * dependent code in libc. 8.667 + */ 8.668 +void 8.669 +buildcontext(struct lwp *l, int sel, void *catcher, void *fp) 8.670 +{ 8.671 + struct trapframe *tf = l->l_md.md_regs; 8.672 + 8.673 + tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); 8.674 + tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); 8.675 + tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); 8.676 + tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); 8.677 + tf->tf_eip = (int)catcher; 8.678 + tf->tf_cs = GSEL(sel, SEL_UPL); 8.679 + tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC); 8.680 + tf->tf_esp = (int)fp; 8.681 + tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); 8.682 +} 8.683 + 8.684 +static void 8.685 +sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) 8.686 +{ 8.687 + struct lwp *l = curlwp; 8.688 + struct proc *p = l->l_proc; 8.689 + struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map); 8.690 + int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ? 8.691 + GUCODEBIG_SEL : GUCODE_SEL; 8.692 + struct sigacts *ps = p->p_sigacts; 8.693 + int onstack; 8.694 + int sig = ksi->ksi_signo; 8.695 + struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame; 8.696 + sig_t catcher = SIGACTION(p, sig).sa_handler; 8.697 + struct trapframe *tf = l->l_md.md_regs; 8.698 + 8.699 + fp--; 8.700 + 8.701 + /* Build stack frame for signal trampoline. */ 8.702 + switch (ps->sa_sigdesc[sig].sd_vers) { 8.703 + case 0: /* handled by sendsig_sigcontext */ 8.704 + case 1: /* handled by sendsig_sigcontext */ 8.705 + default: /* unknown version */ 8.706 + printf("nsendsig: bad version %d\n", 8.707 + ps->sa_sigdesc[sig].sd_vers); 8.708 + sigexit(l, SIGILL); 8.709 + case 2: 8.710 + break; 8.711 + } 8.712 + 8.713 + frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp; 8.714 + frame.sf_signum = sig; 8.715 + frame.sf_sip = &fp->sf_si; 8.716 + frame.sf_ucp = &fp->sf_uc; 8.717 + frame.sf_si._info = ksi->ksi_info; 8.718 + frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM; 8.719 + frame.sf_uc.uc_sigmask = *mask; 8.720 + frame.sf_uc.uc_link = NULL; 8.721 + frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK) 8.722 + ? _UC_SETSTACK : _UC_CLRSTACK; 8.723 + memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack)); 8.724 + cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); 8.725 + 8.726 + if (tf->tf_eflags & PSL_VM) 8.727 + (*p->p_emul->e_syscall_intern)(p); 8.728 + 8.729 + if (copyout(&frame, fp, sizeof(frame)) != 0) { 8.730 + /* 8.731 + * Process has trashed its stack; give it an illegal 8.732 + * instruction to halt it in its tracks. 8.733 + */ 8.734 + sigexit(l, SIGILL); 8.735 + /* NOTREACHED */ 8.736 + } 8.737 + 8.738 + buildcontext(l, sel, catcher, fp); 8.739 + 8.740 + /* Remember that we're now on the signal stack. 
*/ 8.741 + if (onstack) 8.742 + p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK; 8.743 +} 8.744 + 8.745 +void 8.746 +sendsig(const ksiginfo_t *ksi, const sigset_t *mask) 8.747 +{ 8.748 +#ifdef COMPAT_16 8.749 + if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2) 8.750 + sendsig_sigcontext(ksi, mask); 8.751 + else 8.752 +#endif 8.753 + sendsig_siginfo(ksi, mask); 8.754 +} 8.755 + 8.756 +void 8.757 +cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas, 8.758 + void *ap, void *sp, sa_upcall_t upcall) 8.759 +{ 8.760 + struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 8.761 + struct saframe *sf, frame; 8.762 + struct trapframe *tf; 8.763 + 8.764 + tf = l->l_md.md_regs; 8.765 + 8.766 + /* Finally, copy out the rest of the frame. */ 8.767 + frame.sa_type = type; 8.768 + frame.sa_sas = sas; 8.769 + frame.sa_events = nevents; 8.770 + frame.sa_interrupted = ninterrupted; 8.771 + frame.sa_arg = ap; 8.772 + frame.sa_ra = 0; 8.773 + 8.774 + sf = (struct saframe *)sp - 1; 8.775 + if (copyout(&frame, sf, sizeof(frame)) != 0) { 8.776 + /* Copying onto the stack didn't work. Die. */ 8.777 + sigexit(l, SIGILL); 8.778 + /* NOTREACHED */ 8.779 + } 8.780 + 8.781 + tf->tf_eip = (int) upcall; 8.782 + tf->tf_esp = (int) sf; 8.783 + tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */ 8.784 + tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); 8.785 + tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); 8.786 + tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); 8.787 + tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); 8.788 + tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ? 8.789 + GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL); 8.790 + tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); 8.791 + tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC); 8.792 +} 8.793 + 8.794 +int waittime = -1; 8.795 +struct pcb dumppcb; 8.796 + 8.797 +void 8.798 +cpu_reboot(int howto, char *bootstr) 8.799 +{ 8.800 + 8.801 + if (cold) { 8.802 + howto |= RB_HALT; 8.803 + goto haltsys; 8.804 + } 8.805 + 8.806 + boothowto = howto; 8.807 + if ((howto & RB_NOSYNC) == 0 && waittime < 0) { 8.808 + waittime = 0; 8.809 + vfs_shutdown(); 8.810 + /* 8.811 + * If we've been adjusting the clock, the todr 8.812 + * will be out of synch; adjust it now. 8.813 + */ 8.814 + if (time_adjusted != 0) 8.815 + resettodr(); 8.816 + } 8.817 + 8.818 + /* Disable interrupts. */ 8.819 + splhigh(); 8.820 + 8.821 + /* Do a dump if requested. */ 8.822 + if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP) 8.823 + dumpsys(); 8.824 + 8.825 +haltsys: 8.826 + doshutdownhooks(); 8.827 + 8.828 +#ifdef MULTIPROCESSOR 8.829 + x86_broadcast_ipi(X86_IPI_HALT); 8.830 +#endif 8.831 + 8.832 + if ((howto & RB_POWERDOWN) == RB_POWERDOWN) { 8.833 +#if NACPI > 0 8.834 + if (acpi_softc != NULL) { 8.835 + delay(500000); 8.836 + acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5); 8.837 + printf("WARNING: ACPI powerdown failed!\n"); 8.838 + } 8.839 +#endif 8.840 +#if NAPM > 0 && !defined(APM_NO_POWEROFF) 8.841 + /* turn off, if we can. But try to turn disk off and 8.842 + * wait a bit first--some disk drives are slow to clean up 8.843 + * and users have reported disk corruption. 8.844 + */ 8.845 + delay(500000); 8.846 + apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF); 8.847 + delay(500000); 8.848 + apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF); 8.849 + printf("WARNING: APM powerdown failed!\n"); 8.850 + /* 8.851 + * RB_POWERDOWN implies RB_HALT... fall into it... 
8.852 + */ 8.853 +#endif 8.854 + HYPERVISOR_shutdown(); 8.855 + } 8.856 + 8.857 + if (howto & RB_HALT) { 8.858 + printf("\n"); 8.859 + printf("The operating system has halted.\n"); 8.860 + printf("Please press any key to reboot.\n\n"); 8.861 + 8.862 +#ifdef BEEP_ONHALT 8.863 + { 8.864 + int c; 8.865 + for (c = BEEP_ONHALT_COUNT; c > 0; c--) { 8.866 + sysbeep(BEEP_ONHALT_PITCH, 8.867 + BEEP_ONHALT_PERIOD * hz / 1000); 8.868 + delay(BEEP_ONHALT_PERIOD * 1000); 8.869 + sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000); 8.870 + delay(BEEP_ONHALT_PERIOD * 1000); 8.871 + } 8.872 + } 8.873 +#endif 8.874 + 8.875 + cnpollc(1); /* for proper keyboard command handling */ 8.876 + if (cngetc() == 0) { 8.877 + /* no console attached, so just hlt */ 8.878 + for(;;) { 8.879 + __asm __volatile("hlt"); 8.880 + } 8.881 + } 8.882 + cnpollc(0); 8.883 + } 8.884 + 8.885 + printf("rebooting...\n"); 8.886 + if (cpureset_delay > 0) 8.887 + delay(cpureset_delay * 1000); 8.888 + cpu_reset(); 8.889 + for(;;) ; 8.890 + /*NOTREACHED*/ 8.891 +} 8.892 + 8.893 +/* 8.894 + * These variables are needed by /sbin/savecore 8.895 + */ 8.896 +u_int32_t dumpmag = 0x8fca0101; /* magic number */ 8.897 +int dumpsize = 0; /* pages */ 8.898 +long dumplo = 0; /* blocks */ 8.899 + 8.900 +/* 8.901 + * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers. 8.902 + */ 8.903 +int 8.904 +cpu_dumpsize() 8.905 +{ 8.906 + int size; 8.907 + 8.908 + size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + 8.909 + ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); 8.910 + if (roundup(size, dbtob(1)) != dbtob(1)) 8.911 + return (-1); 8.912 + 8.913 + return (1); 8.914 +} 8.915 + 8.916 +/* 8.917 + * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped. 8.918 + */ 8.919 +u_long 8.920 +cpu_dump_mempagecnt() 8.921 +{ 8.922 + u_long i, n; 8.923 + 8.924 + n = 0; 8.925 + for (i = 0; i < mem_cluster_cnt; i++) 8.926 + n += atop(mem_clusters[i].size); 8.927 + return (n); 8.928 +} 8.929 + 8.930 +/* 8.931 + * cpu_dump: dump the machine-dependent kernel core dump headers. 8.932 + */ 8.933 +int 8.934 +cpu_dump() 8.935 +{ 8.936 + int (*dump)(dev_t, daddr_t, caddr_t, size_t); 8.937 + char buf[dbtob(1)]; 8.938 + kcore_seg_t *segp; 8.939 + cpu_kcore_hdr_t *cpuhdrp; 8.940 + phys_ram_seg_t *memsegp; 8.941 + const struct bdevsw *bdev; 8.942 + int i; 8.943 + 8.944 + bdev = bdevsw_lookup(dumpdev); 8.945 + if (bdev == NULL) 8.946 + return (ENXIO); 8.947 + dump = bdev->d_dump; 8.948 + 8.949 + memset(buf, 0, sizeof buf); 8.950 + segp = (kcore_seg_t *)buf; 8.951 + cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))]; 8.952 + memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) + 8.953 + ALIGN(sizeof(*cpuhdrp))]; 8.954 + 8.955 + /* 8.956 + * Generate a segment header. 8.957 + */ 8.958 + CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU); 8.959 + segp->c_size = dbtob(1) - ALIGN(sizeof(*segp)); 8.960 + 8.961 + /* 8.962 + * Add the machine-dependent header info. 8.963 + */ 8.964 + cpuhdrp->ptdpaddr = PTDpaddr; 8.965 + cpuhdrp->nmemsegs = mem_cluster_cnt; 8.966 + 8.967 + /* 8.968 + * Fill in the memory segment descriptors. 8.969 + */ 8.970 + for (i = 0; i < mem_cluster_cnt; i++) { 8.971 + memsegp[i].start = mem_clusters[i].start; 8.972 + memsegp[i].size = mem_clusters[i].size; 8.973 + } 8.974 + 8.975 + return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1))); 8.976 +} 8.977 + 8.978 +/* 8.979 + * This is called by main to set dumplo and dumpsize. 
8.980 + * Dumps always skip the first PAGE_SIZE of disk space 8.981 + * in case there might be a disk label stored there. 8.982 + * If there is extra space, put dump at the end to 8.983 + * reduce the chance that swapping trashes it. 8.984 + */ 8.985 +void 8.986 +cpu_dumpconf() 8.987 +{ 8.988 + const struct bdevsw *bdev; 8.989 + int nblks, dumpblks; /* size of dump area */ 8.990 + 8.991 + if (dumpdev == NODEV) 8.992 + goto bad; 8.993 + bdev = bdevsw_lookup(dumpdev); 8.994 + if (bdev == NULL) 8.995 + panic("dumpconf: bad dumpdev=0x%x", dumpdev); 8.996 + if (bdev->d_psize == NULL) 8.997 + goto bad; 8.998 + nblks = (*bdev->d_psize)(dumpdev); 8.999 + if (nblks <= ctod(1)) 8.1000 + goto bad; 8.1001 + 8.1002 + dumpblks = cpu_dumpsize(); 8.1003 + if (dumpblks < 0) 8.1004 + goto bad; 8.1005 + dumpblks += ctod(cpu_dump_mempagecnt()); 8.1006 + 8.1007 + /* If dump won't fit (incl. room for possible label), punt. */ 8.1008 + if (dumpblks > (nblks - ctod(1))) 8.1009 + goto bad; 8.1010 + 8.1011 + /* Put dump at end of partition */ 8.1012 + dumplo = nblks - dumpblks; 8.1013 + 8.1014 + /* dumpsize is in page units, and doesn't include headers. */ 8.1015 + dumpsize = cpu_dump_mempagecnt(); 8.1016 + return; 8.1017 + 8.1018 + bad: 8.1019 + dumpsize = 0; 8.1020 +} 8.1021 + 8.1022 +/* 8.1023 + * Doadump comes here after turning off memory management and 8.1024 + * getting on the dump stack, either when called above, or by 8.1025 + * the auto-restart code. 8.1026 + */ 8.1027 +#define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ 8.1028 +static vaddr_t dumpspace; 8.1029 + 8.1030 +vaddr_t 8.1031 +reserve_dumppages(vaddr_t p) 8.1032 +{ 8.1033 + 8.1034 + dumpspace = p; 8.1035 + return (p + BYTES_PER_DUMP); 8.1036 +} 8.1037 + 8.1038 +void 8.1039 +dumpsys() 8.1040 +{ 8.1041 + u_long totalbytesleft, bytes, i, n, memseg; 8.1042 + u_long maddr; 8.1043 + int psize; 8.1044 + daddr_t blkno; 8.1045 + const struct bdevsw *bdev; 8.1046 + int (*dump)(dev_t, daddr_t, caddr_t, size_t); 8.1047 + int error; 8.1048 + 8.1049 + /* Save registers. */ 8.1050 + savectx(&dumppcb); 8.1051 + 8.1052 + if (dumpdev == NODEV) 8.1053 + return; 8.1054 + 8.1055 + bdev = bdevsw_lookup(dumpdev); 8.1056 + if (bdev == NULL || bdev->d_psize == NULL) 8.1057 + return; 8.1058 + 8.1059 + /* 8.1060 + * For dumps during autoconfiguration, 8.1061 + * if dump device has already configured... 8.1062 + */ 8.1063 + if (dumpsize == 0) 8.1064 + cpu_dumpconf(); 8.1065 + if (dumplo <= 0 || dumpsize == 0) { 8.1066 + printf("\ndump to dev %u,%u not possible\n", major(dumpdev), 8.1067 + minor(dumpdev)); 8.1068 + return; 8.1069 + } 8.1070 + printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev), 8.1071 + minor(dumpdev), dumplo); 8.1072 + 8.1073 + psize = (*bdev->d_psize)(dumpdev); 8.1074 + printf("dump "); 8.1075 + if (psize == -1) { 8.1076 + printf("area unavailable\n"); 8.1077 + return; 8.1078 + } 8.1079 + 8.1080 +#if 0 /* XXX this doesn't work. grr. 
*/ 8.1081 + /* toss any characters present prior to dump */ 8.1082 + while (sget() != NULL); /*syscons and pccons differ */ 8.1083 +#endif 8.1084 + 8.1085 + if ((error = cpu_dump()) != 0) 8.1086 + goto err; 8.1087 + 8.1088 + totalbytesleft = ptoa(cpu_dump_mempagecnt()); 8.1089 + blkno = dumplo + cpu_dumpsize(); 8.1090 + dump = bdev->d_dump; 8.1091 + error = 0; 8.1092 + 8.1093 + for (memseg = 0; memseg < mem_cluster_cnt; memseg++) { 8.1094 + maddr = mem_clusters[memseg].start; 8.1095 + bytes = mem_clusters[memseg].size; 8.1096 + 8.1097 + for (i = 0; i < bytes; i += n, totalbytesleft -= n) { 8.1098 + /* Print out how many MBs we have left to go. */ 8.1099 + if ((totalbytesleft % (1024*1024)) == 0) 8.1100 + printf("%ld ", totalbytesleft / (1024 * 1024)); 8.1101 + 8.1102 + /* Limit size for next transfer. */ 8.1103 + n = bytes - i; 8.1104 + if (n > BYTES_PER_DUMP) 8.1105 + n = BYTES_PER_DUMP; 8.1106 + 8.1107 + (void) pmap_map(dumpspace, maddr, maddr + n, 8.1108 + VM_PROT_READ); 8.1109 + 8.1110 + error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n); 8.1111 + if (error) 8.1112 + goto err; 8.1113 + maddr += n; 8.1114 + blkno += btodb(n); /* XXX? */ 8.1115 + 8.1116 +#if 0 /* XXX this doesn't work. grr. */ 8.1117 + /* operator aborting dump? */ 8.1118 + if (sget() != NULL) { 8.1119 + error = EINTR; 8.1120 + break; 8.1121 + } 8.1122 +#endif 8.1123 + } 8.1124 + } 8.1125 + 8.1126 + err: 8.1127 + switch (error) { 8.1128 + 8.1129 + case ENXIO: 8.1130 + printf("device bad\n"); 8.1131 + break; 8.1132 + 8.1133 + case EFAULT: 8.1134 + printf("device not ready\n"); 8.1135 + break; 8.1136 + 8.1137 + case EINVAL: 8.1138 + printf("area improper\n"); 8.1139 + break; 8.1140 + 8.1141 + case EIO: 8.1142 + printf("i/o error\n"); 8.1143 + break; 8.1144 + 8.1145 + case EINTR: 8.1146 + printf("aborted from console\n"); 8.1147 + break; 8.1148 + 8.1149 + case 0: 8.1150 + printf("succeeded\n"); 8.1151 + break; 8.1152 + 8.1153 + default: 8.1154 + printf("error %d\n", error); 8.1155 + break; 8.1156 + } 8.1157 + printf("\n\n"); 8.1158 + delay(5000000); /* 5 seconds */ 8.1159 +} 8.1160 + 8.1161 +/* 8.1162 + * Clear registers on exec 8.1163 + */ 8.1164 +void 8.1165 +setregs(struct lwp *l, struct exec_package *pack, u_long stack) 8.1166 +{ 8.1167 + struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 8.1168 + struct pcb *pcb = &l->l_addr->u_pcb; 8.1169 + struct trapframe *tf; 8.1170 + 8.1171 +#if NNPX > 0 8.1172 + /* If we were using the FPU, forget about it. */ 8.1173 + if (l->l_addr->u_pcb.pcb_fpcpu != NULL) 8.1174 + npxsave_lwp(l, 0); 8.1175 +#endif 8.1176 + 8.1177 +#ifdef USER_LDT 8.1178 + pmap_ldt_cleanup(l); 8.1179 +#endif 8.1180 + 8.1181 + l->l_md.md_flags &= ~MDL_USEDFPU; 8.1182 + if (i386_use_fxsave) { 8.1183 + pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__; 8.1184 + pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__; 8.1185 + } else 8.1186 + pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__; 8.1187 + 8.1188 + tf = l->l_md.md_regs; 8.1189 + tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL); 8.1190 + tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL); 8.1191 + tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL); 8.1192 + tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL); 8.1193 + tf->tf_edi = 0; 8.1194 + tf->tf_esi = 0; 8.1195 + tf->tf_ebp = 0; 8.1196 + tf->tf_ebx = (int)l->l_proc->p_psstr; 8.1197 + tf->tf_edx = 0; 8.1198 + tf->tf_ecx = 0; 8.1199 + tf->tf_eax = 0; 8.1200 + tf->tf_eip = pack->ep_entry; 8.1201 + tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ? 
8.1202 + LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL); 8.1203 + tf->tf_eflags = PSL_USERSET; 8.1204 + tf->tf_esp = stack; 8.1205 + tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); 8.1206 +} 8.1207 + 8.1208 +/* 8.1209 + * Initialize segments and descriptor tables 8.1210 + */ 8.1211 + 8.1212 +union descriptor *gdt, *ldt; 8.1213 +struct gate_descriptor *idt; 8.1214 +char idt_allocmap[NIDT]; 8.1215 +struct simplelock idt_lock = SIMPLELOCK_INITIALIZER; 8.1216 +#ifdef I586_CPU 8.1217 +union descriptor *pentium_idt; 8.1218 +#endif 8.1219 +extern struct user *proc0paddr; 8.1220 + 8.1221 +void 8.1222 +setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl, 8.1223 + int sel) 8.1224 +{ 8.1225 + 8.1226 + gd->gd_looffset = (int)func; 8.1227 + gd->gd_selector = sel; 8.1228 + gd->gd_stkcpy = args; 8.1229 + gd->gd_xx = 0; 8.1230 + gd->gd_type = type; 8.1231 + gd->gd_dpl = dpl; 8.1232 + gd->gd_p = 1; 8.1233 + gd->gd_hioffset = (int)func >> 16; 8.1234 +} 8.1235 + 8.1236 +void 8.1237 +unsetgate(struct gate_descriptor *gd) 8.1238 +{ 8.1239 + gd->gd_p = 0; 8.1240 + gd->gd_hioffset = 0; 8.1241 + gd->gd_looffset = 0; 8.1242 + gd->gd_selector = 0; 8.1243 + gd->gd_xx = 0; 8.1244 + gd->gd_stkcpy = 0; 8.1245 + gd->gd_type = 0; 8.1246 + gd->gd_dpl = 0; 8.1247 +} 8.1248 + 8.1249 + 8.1250 +void 8.1251 +setregion(struct region_descriptor *rd, void *base, size_t limit) 8.1252 +{ 8.1253 + 8.1254 + rd->rd_limit = (int)limit; 8.1255 + rd->rd_base = (int)base; 8.1256 +} 8.1257 + 8.1258 +void 8.1259 +setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type, 8.1260 + int dpl, int def32, int gran) 8.1261 +{ 8.1262 + 8.1263 + sd->sd_lolimit = (int)limit; 8.1264 + sd->sd_lobase = (int)base; 8.1265 + sd->sd_type = type; 8.1266 + sd->sd_dpl = dpl; 8.1267 + sd->sd_p = 1; 8.1268 + sd->sd_hilimit = (int)limit >> 16; 8.1269 + sd->sd_xx = 0; 8.1270 + sd->sd_def32 = def32; 8.1271 + sd->sd_gran = gran; 8.1272 + sd->sd_hibase = (int)base >> 24; 8.1273 +} 8.1274 + 8.1275 +#define IDTVEC(name) __CONCAT(X, name) 8.1276 +typedef void (vector)(void); 8.1277 +extern vector IDTVEC(syscall); 8.1278 +extern vector IDTVEC(osyscall); 8.1279 +extern vector *IDTVEC(exceptions)[]; 8.1280 +#ifdef COMPAT_SVR4 8.1281 +extern vector IDTVEC(svr4_fasttrap); 8.1282 +#endif /* COMPAT_SVR4 */ 8.1283 +#ifdef COMPAT_MACH 8.1284 +extern vector IDTVEC(mach_trap); 8.1285 +#endif 8.1286 +#define MAX_XEN_IDT 128 8.1287 +trap_info_t xen_idt[MAX_XEN_IDT]; 8.1288 +int xen_idt_idx; 8.1289 + 8.1290 +#define KBTOB(x) ((size_t)(x) * 1024UL) 8.1291 + 8.1292 +void cpu_init_idt() 8.1293 +{ 8.1294 + struct region_descriptor region; 8.1295 + 8.1296 + panic("cpu_init_idt"); 8.1297 +#ifdef I586_CPU 8.1298 + setregion(®ion, pentium_idt, NIDT * sizeof(idt[0]) - 1); 8.1299 +#else 8.1300 + setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); 8.1301 +#endif 8.1302 + lidt(®ion); 8.1303 +} 8.1304 + 8.1305 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM) 8.1306 +void 8.1307 +add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type) 8.1308 +{ 8.1309 + extern struct extent *iomem_ex; 8.1310 + int i; 8.1311 + 8.1312 + if (seg_end > 0x100000000ULL) { 8.1313 + printf("WARNING: skipping large " 8.1314 + "memory map entry: " 8.1315 + "0x%qx/0x%qx/0x%x\n", 8.1316 + seg_start, 8.1317 + (seg_end - seg_start), 8.1318 + type); 8.1319 + return; 8.1320 + } 8.1321 + 8.1322 + /* 8.1323 + * XXX Chop the last page off the size so that 8.1324 + * XXX it can fit in avail_end. 
8.1325 + */ 8.1326 + if (seg_end == 0x100000000ULL) 8.1327 + seg_end -= PAGE_SIZE; 8.1328 + 8.1329 + if (seg_end <= seg_start) 8.1330 + return; 8.1331 + 8.1332 + for (i = 0; i < mem_cluster_cnt; i++) { 8.1333 + if ((mem_clusters[i].start == round_page(seg_start)) 8.1334 + && (mem_clusters[i].size 8.1335 + == trunc_page(seg_end) - mem_clusters[i].start)) { 8.1336 +#ifdef DEBUG_MEMLOAD 8.1337 + printf("WARNING: skipping duplicate segment entry\n"); 8.1338 +#endif 8.1339 + return; 8.1340 + } 8.1341 + } 8.1342 + 8.1343 + /* 8.1344 + * Allocate the physical addresses used by RAM 8.1345 + * from the iomem extent map. This is done before 8.1346 + * the addresses are page rounded just to make 8.1347 + * sure we get them all. 8.1348 + */ 8.1349 + if (extent_alloc_region(iomem_ex, seg_start, 8.1350 + seg_end - seg_start, EX_NOWAIT)) { 8.1351 + /* XXX What should we do? */ 8.1352 + printf("WARNING: CAN'T ALLOCATE " 8.1353 + "MEMORY SEGMENT " 8.1354 + "(0x%qx/0x%qx/0x%x) FROM " 8.1355 + "IOMEM EXTENT MAP!\n", 8.1356 + seg_start, seg_end - seg_start, type); 8.1357 + return; 8.1358 + } 8.1359 + 8.1360 + /* 8.1361 + * If it's not free memory, skip it. 8.1362 + */ 8.1363 + if (type != BIM_Memory) 8.1364 + return; 8.1365 + 8.1366 + /* XXX XXX XXX */ 8.1367 + if (mem_cluster_cnt >= VM_PHYSSEG_MAX) 8.1368 + panic("init386: too many memory segments"); 8.1369 + 8.1370 + seg_start = round_page(seg_start); 8.1371 + seg_end = trunc_page(seg_end); 8.1372 + 8.1373 + if (seg_start == seg_end) 8.1374 + return; 8.1375 + 8.1376 + mem_clusters[mem_cluster_cnt].start = seg_start; 8.1377 + mem_clusters[mem_cluster_cnt].size = 8.1378 + seg_end - seg_start; 8.1379 + 8.1380 + if (avail_end < seg_end) 8.1381 + avail_end = seg_end; 8.1382 + physmem += atop(mem_clusters[mem_cluster_cnt].size); 8.1383 + mem_cluster_cnt++; 8.1384 +} 8.1385 +#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */ 8.1386 + 8.1387 +void 8.1388 +initgdt() 8.1389 +{ 8.1390 +#if !defined(XEN) 8.1391 + struct region_descriptor region; 8.1392 +#else 8.1393 + paddr_t frames[16]; 8.1394 +#endif 8.1395 + 8.1396 +#if !defined(XEN) 8.1397 + gdt = tgdt; 8.1398 + memset(gdt, 0, NGDT*sizeof(*gdt)); 8.1399 +#endif 8.1400 + /* make gdt gates and memory segments */ 8.1401 + setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1); 8.1402 + setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1); 8.1403 + setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1, 8.1404 + SDT_MEMERA, SEL_UPL, 1, 1); 8.1405 + setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1, 8.1406 + SDT_MEMERA, SEL_UPL, 1, 1); 8.1407 + setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1, 8.1408 + SDT_MEMRWA, SEL_UPL, 1, 1); 8.1409 +#ifdef COMPAT_MACH 8.1410 + setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1, 8.1411 + SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 8.1412 +#endif 8.1413 +#if NBIOSCALL > 0 8.1414 + /* bios trampoline GDT entries */ 8.1415 + setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0, 8.1416 + 0); 8.1417 + setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0, 8.1418 + 0); 8.1419 +#endif 8.1420 + setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary, 8.1421 + sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1); 8.1422 + 8.1423 +#if !defined(XEN) 8.1424 + setregion(®ion, gdt, NGDT * sizeof(gdt[0]) - 1); 8.1425 + lgdt(®ion); 8.1426 +#else 8.1427 + frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT; 8.1428 + /* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */ 
8.1429 + pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE, 8.1430 + VM_PROT_READ); 8.1431 + XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT, 8.1432 + LAST_RESERVED_GDT_ENTRY + 1)); 8.1433 + if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1)) 8.1434 + panic("HYPERVISOR_set_gdt failed!\n"); 8.1435 + lgdt_finish(); 8.1436 +#endif 8.1437 +} 8.1438 + 8.1439 +void 8.1440 +init386(paddr_t first_avail) 8.1441 +{ 8.1442 +#if !defined(XEN) 8.1443 + union descriptor *tgdt; 8.1444 +#endif 8.1445 + extern void consinit(void); 8.1446 +#if !defined(XEN) 8.1447 + extern struct extent *iomem_ex; 8.1448 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM) 8.1449 + struct btinfo_memmap *bim; 8.1450 +#endif 8.1451 + struct region_descriptor region; 8.1452 +#endif 8.1453 + int x; 8.1454 +#if !defined(XEN) 8.1455 + int first16q; 8.1456 + u_int64_t seg_start, seg_end; 8.1457 + u_int64_t seg_start1, seg_end1; 8.1458 +#endif 8.1459 + paddr_t realmode_reserved_start; 8.1460 + psize_t realmode_reserved_size; 8.1461 + int needs_earlier_install_pte0; 8.1462 +#if NBIOSCALL > 0 8.1463 + extern int biostramp_image_size; 8.1464 + extern u_char biostramp_image[]; 8.1465 +#endif 8.1466 + 8.1467 + XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info)); 8.1468 +#ifdef XENDEBUG_LOW 8.1469 + xen_dbglow_init(); 8.1470 +#endif 8.1471 + 8.1472 + cpu_probe_features(&cpu_info_primary); 8.1473 + cpu_feature = cpu_info_primary.ci_feature_flags; 8.1474 + 8.1475 + /* not on Xen... */ 8.1476 + cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR); 8.1477 + 8.1478 + lwp0.l_addr = proc0paddr; 8.1479 + cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb; 8.1480 + 8.1481 + XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n", 8.1482 + proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail)); 8.1483 + XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr, 8.1484 + (void *)atdevbase)); 8.1485 + 8.1486 + x86_bus_space_init(); 8.1487 + consinit(); /* XXX SHOULD NOT BE DONE HERE */ 8.1488 + /* 8.1489 + * Initailize PAGE_SIZE-dependent variables. 8.1490 + */ 8.1491 + uvm_setpagesize(); 8.1492 + 8.1493 + /* 8.1494 + * Saving SSE registers won't work if the save area isn't 8.1495 + * 16-byte aligned. 8.1496 + */ 8.1497 + if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf) 8.1498 + panic("init386: pcb_savefpu not 16-byte aligned"); 8.1499 + 8.1500 + /* 8.1501 + * Start with 2 color bins -- this is just a guess to get us 8.1502 + * started. We'll recolor when we determine the largest cache 8.1503 + * sizes on the system. 8.1504 + */ 8.1505 + uvmexp.ncolors = 2; 8.1506 + 8.1507 +#if !defined(XEN) 8.1508 + /* 8.1509 + * BIOS leaves data in physical page 0 8.1510 + * Even if it didn't, our VM system doesn't like using zero as a 8.1511 + * physical page number. 8.1512 + * We may also need pages in low memory (one each) for secondary CPU 8.1513 + * startup, for BIOS calls, and for ACPI, plus a page table page to map 8.1514 + * them into the first few pages of the kernel's pmap. 8.1515 + */ 8.1516 + avail_start = PAGE_SIZE; 8.1517 +#else 8.1518 + /* Make sure the end of the space used by the kernel is rounded. 
*/ 8.1519 + first_avail = round_page(first_avail); 8.1520 + avail_start = first_avail - KERNBASE; 8.1521 + avail_end = ptoa(xen_start_info.nr_pages) + 8.1522 + (KERNTEXTOFF - KERNBASE_LOCORE); 8.1523 + pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE); 8.1524 + pmap_pa_end = avail_end; 8.1525 + mem_clusters[0].start = avail_start; 8.1526 + mem_clusters[0].size = avail_end - avail_start; 8.1527 + mem_cluster_cnt++; 8.1528 + physmem += atop(mem_clusters[0].size); 8.1529 +#endif 8.1530 + 8.1531 + /* 8.1532 + * reserve memory for real-mode call 8.1533 + */ 8.1534 + needs_earlier_install_pte0 = 0; 8.1535 + realmode_reserved_start = 0; 8.1536 + realmode_reserved_size = 0; 8.1537 +#if NBIOSCALL > 0 8.1538 + /* save us a page for trampoline code */ 8.1539 + realmode_reserved_size += PAGE_SIZE; 8.1540 + needs_earlier_install_pte0 = 1; 8.1541 +#endif 8.1542 +#ifdef MULTIPROCESSOR /* XXX */ 8.1543 +#if !defined(XEN) 8.1544 + KASSERT(avail_start == PAGE_SIZE); /* XXX */ 8.1545 +#endif 8.1546 + if (realmode_reserved_size < MP_TRAMPOLINE) /* XXX */ 8.1547 + realmode_reserved_size = MP_TRAMPOLINE; /* XXX */ 8.1548 + needs_earlier_install_pte0 = 1; /* XXX */ 8.1549 +#endif /* XXX */ 8.1550 +#if NACPI > 0 8.1551 + /* trampoline code for wake handler */ 8.1552 + realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1); 8.1553 + needs_earlier_install_pte0 = 1; 8.1554 +#endif 8.1555 + if (needs_earlier_install_pte0) { 8.1556 + /* page table for directory entry 0 */ 8.1557 + realmode_reserved_size += PAGE_SIZE; 8.1558 + } 8.1559 + if (realmode_reserved_size>0) { 8.1560 + realmode_reserved_start = avail_start; 8.1561 + avail_start += realmode_reserved_size; 8.1562 + } 8.1563 + 8.1564 +#ifdef DEBUG_MEMLOAD 8.1565 + printf("mem_cluster_count: %d\n", mem_cluster_cnt); 8.1566 +#endif 8.1567 + 8.1568 + /* 8.1569 + * Call pmap initialization to make new kernel address space. 8.1570 + * We must do this before loading pages into the VM system. 8.1571 + */ 8.1572 + pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE); 8.1573 + 8.1574 +#if !defined(XEN) 8.1575 +#if !defined(REALBASEMEM) && !defined(REALEXTMEM) 8.1576 + /* 8.1577 + * Check to see if we have a memory map from the BIOS (passed 8.1578 + * to us by the boot program. 8.1579 + */ 8.1580 + bim = lookup_bootinfo(BTINFO_MEMMAP); 8.1581 + if (bim != NULL && bim->num > 0) { 8.1582 +#ifdef DEBUG_MEMLOAD 8.1583 + printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num); 8.1584 +#endif 8.1585 + for (x = 0; x < bim->num; x++) { 8.1586 +#ifdef DEBUG_MEMLOAD 8.1587 + printf(" addr 0x%qx size 0x%qx type 0x%x\n", 8.1588 + bim->entry[x].addr, 8.1589 + bim->entry[x].size, 8.1590 + bim->entry[x].type); 8.1591 +#endif 8.1592 + 8.1593 + /* 8.1594 + * If the segment is not memory, skip it. 8.1595 + */ 8.1596 + switch (bim->entry[x].type) { 8.1597 + case BIM_Memory: 8.1598 + case BIM_ACPI: 8.1599 + case BIM_NVS: 8.1600 + break; 8.1601 + default: 8.1602 + continue; 8.1603 + } 8.1604 + 8.1605 + /* 8.1606 + * Sanity check the entry. 8.1607 + * XXX Need to handle uint64_t in extent code 8.1608 + * XXX and 64-bit physical addresses in i386 8.1609 + * XXX port. 8.1610 + */ 8.1611 + seg_start = bim->entry[x].addr; 8.1612 + seg_end = bim->entry[x].addr + bim->entry[x].size; 8.1613 + 8.1614 + /* 8.1615 + * Avoid Compatibility Holes. 
8.1616 + * XXX Holes within memory space that allow access 8.1617 + * XXX to be directed to the PC-compatible frame buffer 8.1618 + * XXX (0xa0000-0xbffff),to adapter ROM space 8.1619 + * XXX (0xc0000-0xdffff), and to system BIOS space 8.1620 + * XXX (0xe0000-0xfffff). 8.1621 + * XXX Some laptop(for example,Toshiba Satellite2550X) 8.1622 + * XXX report this area and occurred problems, 8.1623 + * XXX so we avoid this area. 8.1624 + */ 8.1625 + if (seg_start < 0x100000 && seg_end > 0xa0000) { 8.1626 + printf("WARNING: memory map entry overlaps " 8.1627 + "with ``Compatibility Holes'': " 8.1628 + "0x%qx/0x%qx/0x%x\n", seg_start, 8.1629 + seg_end - seg_start, bim->entry[x].type); 8.1630 + add_mem_cluster(seg_start, 0xa0000, 8.1631 + bim->entry[x].type); 8.1632 + add_mem_cluster(0x100000, seg_end, 8.1633 + bim->entry[x].type); 8.1634 + } else 8.1635 + add_mem_cluster(seg_start, seg_end, 8.1636 + bim->entry[x].type); 8.1637 + } 8.1638 + } 8.1639 +#endif /* ! REALBASEMEM && ! REALEXTMEM */ 8.1640 + /* 8.1641 + * If the loop above didn't find any valid segment, fall back to 8.1642 + * former code. 8.1643 + */ 8.1644 + if (mem_cluster_cnt == 0) { 8.1645 + /* 8.1646 + * Allocate the physical addresses used by RAM from the iomem 8.1647 + * extent map. This is done before the addresses are 8.1648 + * page rounded just to make sure we get them all. 8.1649 + */ 8.1650 + if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), 8.1651 + EX_NOWAIT)) { 8.1652 + /* XXX What should we do? */ 8.1653 + printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM " 8.1654 + "IOMEM EXTENT MAP!\n"); 8.1655 + } 8.1656 + mem_clusters[0].start = 0; 8.1657 + mem_clusters[0].size = trunc_page(KBTOB(biosbasemem)); 8.1658 + physmem += atop(mem_clusters[0].size); 8.1659 + if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem), 8.1660 + EX_NOWAIT)) { 8.1661 + /* XXX What should we do? */ 8.1662 + printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM " 8.1663 + "IOMEM EXTENT MAP!\n"); 8.1664 + } 8.1665 +#if NISADMA > 0 8.1666 + /* 8.1667 + * Some motherboards/BIOSes remap the 384K of RAM that would 8.1668 + * normally be covered by the ISA hole to the end of memory 8.1669 + * so that it can be used. However, on a 16M system, this 8.1670 + * would cause bounce buffers to be allocated and used. 8.1671 + * This is not desirable behaviour, as more than 384K of 8.1672 + * bounce buffers might be allocated. As a work-around, 8.1673 + * we round memory down to the nearest 1M boundary if 8.1674 + * we're using any isadma devices and the remapped memory 8.1675 + * is what puts us over 16M. 8.1676 + */ 8.1677 + if (biosextmem > (15*1024) && biosextmem < (16*1024)) { 8.1678 + char pbuf[9]; 8.1679 + 8.1680 + format_bytes(pbuf, sizeof(pbuf), 8.1681 + biosextmem - (15*1024)); 8.1682 + printf("Warning: ignoring %s of remapped memory\n", 8.1683 + pbuf); 8.1684 + biosextmem = (15*1024); 8.1685 + } 8.1686 +#endif 8.1687 + mem_clusters[1].start = IOM_END; 8.1688 + mem_clusters[1].size = trunc_page(KBTOB(biosextmem)); 8.1689 + physmem += atop(mem_clusters[1].size); 8.1690 + 8.1691 + mem_cluster_cnt = 2; 8.1692 + 8.1693 + avail_end = IOM_END + trunc_page(KBTOB(biosextmem)); 8.1694 + } 8.1695 + /* 8.1696 + * If we have 16M of RAM or less, just put it all on 8.1697 + * the default free list. Otherwise, put the first 8.1698 + * 16M of RAM on a lower priority free list (so that 8.1699 + * all of the ISA DMA'able memory won't be eaten up 8.1700 + * first-off). 
8.1701 + */ 8.1702 + if (avail_end <= (16 * 1024 * 1024)) 8.1703 + first16q = VM_FREELIST_DEFAULT; 8.1704 + else 8.1705 + first16q = VM_FREELIST_FIRST16; 8.1706 + 8.1707 + /* Make sure the end of the space used by the kernel is rounded. */ 8.1708 + first_avail = round_page(first_avail); 8.1709 +#endif 8.1710 + 8.1711 + XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n", 8.1712 + (void *)avail_start, (int)atop(avail_start), 8.1713 + (void *)avail_end, (int)atop(avail_end))); 8.1714 + uvm_page_physload(atop(avail_start), atop(avail_end), 8.1715 + atop(avail_start), atop(avail_end), 8.1716 + VM_FREELIST_DEFAULT); 8.1717 + 8.1718 +#if !defined(XEN) 8.1719 + 8.1720 + /* 8.1721 + * Now, load the memory clusters (which have already been 8.1722 + * rounded and truncated) into the VM system. 8.1723 + * 8.1724 + * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL 8.1725 + * IS LOADED AT IOM_END (1M). 8.1726 + */ 8.1727 + for (x = 0; x < mem_cluster_cnt; x++) { 8.1728 + seg_start = mem_clusters[x].start; 8.1729 + seg_end = mem_clusters[x].start + mem_clusters[x].size; 8.1730 + seg_start1 = 0; 8.1731 + seg_end1 = 0; 8.1732 + 8.1733 + /* 8.1734 + * Skip memory before our available starting point. 8.1735 + */ 8.1736 + if (seg_end <= avail_start) 8.1737 + continue; 8.1738 + 8.1739 + if (avail_start >= seg_start && avail_start < seg_end) { 8.1740 + if (seg_start != 0) 8.1741 + panic("init386: memory doesn't start at 0"); 8.1742 + seg_start = avail_start; 8.1743 + if (seg_start == seg_end) 8.1744 + continue; 8.1745 + } 8.1746 + 8.1747 + /* 8.1748 + * If this segment contains the kernel, split it 8.1749 + * in two, around the kernel. 8.1750 + */ 8.1751 + if (seg_start <= IOM_END && first_avail <= seg_end) { 8.1752 + seg_start1 = first_avail; 8.1753 + seg_end1 = seg_end; 8.1754 + seg_end = IOM_END; 8.1755 + } 8.1756 + 8.1757 + /* First hunk */ 8.1758 + if (seg_start != seg_end) { 8.1759 + if (seg_start < (16 * 1024 * 1024) && 8.1760 + first16q != VM_FREELIST_DEFAULT) { 8.1761 + u_int64_t tmp; 8.1762 + 8.1763 + if (seg_end > (16 * 1024 * 1024)) 8.1764 + tmp = (16 * 1024 * 1024); 8.1765 + else 8.1766 + tmp = seg_end; 8.1767 + 8.1768 + if (tmp != seg_start) { 8.1769 +#ifdef DEBUG_MEMLOAD 8.1770 + printf("loading 0x%qx-0x%qx " 8.1771 + "(0x%lx-0x%lx)\n", 8.1772 + seg_start, tmp, 8.1773 + atop(seg_start), atop(tmp)); 8.1774 +#endif 8.1775 + uvm_page_physload(atop(seg_start), 8.1776 + atop(tmp), atop(seg_start), 8.1777 + atop(tmp), first16q); 8.1778 + } 8.1779 + seg_start = tmp; 8.1780 + } 8.1781 + 8.1782 + if (seg_start != seg_end) { 8.1783 +#ifdef DEBUG_MEMLOAD 8.1784 + printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n", 8.1785 + seg_start, seg_end, 8.1786 + atop(seg_start), atop(seg_end)); 8.1787 +#endif 8.1788 + uvm_page_physload(atop(seg_start), 8.1789 + atop(seg_end), atop(seg_start), 8.1790 + atop(seg_end), VM_FREELIST_DEFAULT); 8.1791 + } 8.1792 + } 8.1793 + 8.1794 + /* Second hunk */ 8.1795 + if (seg_start1 != seg_end1) { 8.1796 + if (seg_start1 < (16 * 1024 * 1024) && 8.1797 + first16q != VM_FREELIST_DEFAULT) { 8.1798 + u_int64_t tmp; 8.1799 + 8.1800 + if (seg_end1 > (16 * 1024 * 1024)) 8.1801 + tmp = (16 * 1024 * 1024); 8.1802 + else 8.1803 + tmp = seg_end1; 8.1804 + 8.1805 + if (tmp != seg_start1) { 8.1806 +#ifdef DEBUG_MEMLOAD 8.1807 + printf("loading 0x%qx-0x%qx " 8.1808 + "(0x%lx-0x%lx)\n", 8.1809 + seg_start1, tmp, 8.1810 + atop(seg_start1), atop(tmp)); 8.1811 +#endif 8.1812 + uvm_page_physload(atop(seg_start1), 8.1813 + atop(tmp), atop(seg_start1), 8.1814 + atop(tmp), first16q); 8.1815 + 
} 8.1816 + seg_start1 = tmp; 8.1817 + } 8.1818 + 8.1819 + if (seg_start1 != seg_end1) { 8.1820 +#ifdef DEBUG_MEMLOAD 8.1821 + printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n", 8.1822 + seg_start1, seg_end1, 8.1823 + atop(seg_start1), atop(seg_end1)); 8.1824 +#endif 8.1825 + uvm_page_physload(atop(seg_start1), 8.1826 + atop(seg_end1), atop(seg_start1), 8.1827 + atop(seg_end1), VM_FREELIST_DEFAULT); 8.1828 + } 8.1829 + } 8.1830 + } 8.1831 +#endif 8.1832 + 8.1833 + /* 8.1834 + * Steal memory for the message buffer (at end of core). 8.1835 + */ 8.1836 + { 8.1837 + struct vm_physseg *vps; 8.1838 + psize_t sz = round_page(MSGBUFSIZE); 8.1839 + psize_t reqsz = sz; 8.1840 + 8.1841 + for (x = 0; x < vm_nphysseg; x++) { 8.1842 + vps = &vm_physmem[x]; 8.1843 + if (ptoa(vps->avail_end) == avail_end) 8.1844 + goto found; 8.1845 + } 8.1846 + panic("init386: can't find end of memory"); 8.1847 + 8.1848 + found: 8.1849 + /* Shrink so it'll fit in the last segment. */ 8.1850 + if ((vps->avail_end - vps->avail_start) < atop(sz)) 8.1851 + sz = ptoa(vps->avail_end - vps->avail_start); 8.1852 + 8.1853 + vps->avail_end -= atop(sz); 8.1854 + vps->end -= atop(sz); 8.1855 + msgbuf_paddr = ptoa(vps->avail_end); 8.1856 + 8.1857 + /* Remove the last segment if it now has no pages. */ 8.1858 + if (vps->start == vps->end) { 8.1859 + for (vm_nphysseg--; x < vm_nphysseg; x++) 8.1860 + vm_physmem[x] = vm_physmem[x + 1]; 8.1861 + } 8.1862 + 8.1863 + /* Now find where the new avail_end is. */ 8.1864 + for (avail_end = 0, x = 0; x < vm_nphysseg; x++) 8.1865 + if (vm_physmem[x].avail_end > avail_end) 8.1866 + avail_end = vm_physmem[x].avail_end; 8.1867 + avail_end = ptoa(avail_end); 8.1868 + 8.1869 + /* Warn if the message buffer had to be shrunk. */ 8.1870 + if (sz != reqsz) 8.1871 + printf("WARNING: %ld bytes not available for msgbuf " 8.1872 + "in last cluster (%ld used)\n", reqsz, sz); 8.1873 + } 8.1874 + 8.1875 + /* 8.1876 + * install PT page for the first 4M if needed. 8.1877 + */ 8.1878 + if (needs_earlier_install_pte0) { 8.1879 + paddr_t paddr; 8.1880 +#ifdef DIAGNOSTIC 8.1881 + if (realmode_reserved_size < PAGE_SIZE) { 8.1882 + panic("cannot steal memory for first 4M PT page."); 8.1883 + } 8.1884 +#endif 8.1885 + paddr=realmode_reserved_start+realmode_reserved_size-PAGE_SIZE; 8.1886 + pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr, 8.1887 + VM_PROT_READ|VM_PROT_WRITE, 8.1888 + PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE); 8.1889 + pmap_update(pmap_kernel()); 8.1890 + /* make sure it is clean before using */ 8.1891 + memset(vtopte(0), 0, PAGE_SIZE); 8.1892 + realmode_reserved_size -= PAGE_SIZE; 8.1893 + } 8.1894 + 8.1895 +#if NBIOSCALL > 0 8.1896 + /* 8.1897 + * this should be caught at kernel build time, but put it here 8.1898 + * in case someone tries to fake it out... 8.1899 + */ 8.1900 +#ifdef DIAGNOSTIC 8.1901 + if (realmode_reserved_start > BIOSTRAMP_BASE || 8.1902 + (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+ 8.1903 + PAGE_SIZE)) { 8.1904 + panic("cannot steal memory for PT page of bioscall."); 8.1905 + } 8.1906 + if (biostramp_image_size > PAGE_SIZE) 8.1907 + panic("biostramp_image_size too big: %x vs. 
%x", 8.1908 + biostramp_image_size, PAGE_SIZE); 8.1909 +#endif 8.1910 + pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, /* virtual */ 8.1911 + (paddr_t)BIOSTRAMP_BASE, /* physical */ 8.1912 + VM_PROT_ALL); /* protection */ 8.1913 + pmap_update(pmap_kernel()); 8.1914 + memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size); 8.1915 +#ifdef DEBUG_BIOSCALL 8.1916 + printf("biostramp installed @ %x\n", BIOSTRAMP_BASE); 8.1917 +#endif 8.1918 + realmode_reserved_size -= PAGE_SIZE; 8.1919 + realmode_reserved_start += PAGE_SIZE; 8.1920 +#endif 8.1921 + 8.1922 +#if NACPI > 0 8.1923 + /* 8.1924 + * Steal memory for the acpi wake code 8.1925 + */ 8.1926 + { 8.1927 + paddr_t paddr, p; 8.1928 + psize_t sz; 8.1929 + int npg; 8.1930 + 8.1931 + paddr = realmode_reserved_start; 8.1932 + npg = acpi_md_get_npages_of_wakecode(); 8.1933 + sz = ptoa(npg); 8.1934 +#ifdef DIAGNOSTIC 8.1935 + if (realmode_reserved_size < sz) { 8.1936 + panic("cannot steal memory for ACPI wake code."); 8.1937 + } 8.1938 +#endif 8.1939 + 8.1940 + /* identical mapping */ 8.1941 + p = paddr; 8.1942 + for (x=0; x<npg; x++) { 8.1943 + printf("kenter: 0x%08X\n", (unsigned)p); 8.1944 + pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL); 8.1945 + p += PAGE_SIZE; 8.1946 + } 8.1947 + pmap_update(pmap_kernel()); 8.1948 + 8.1949 + acpi_md_install_wakecode(paddr); 8.1950 + 8.1951 + realmode_reserved_size -= sz; 8.1952 + realmode_reserved_start += sz; 8.1953 + } 8.1954 +#endif 8.1955 + 8.1956 + pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr, 8.1957 + VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE); 8.1958 + pmap_update(pmap_kernel()); 8.1959 + memset((void *)idt_vaddr, 0, PAGE_SIZE); 8.1960 + 8.1961 +#if !defined(XEN) 8.1962 + idt = (struct gate_descriptor *)idt_vaddr; 8.1963 +#ifdef I586_CPU 8.1964 + pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr, 8.1965 + VM_PROT_READ, PMAP_WIRED|VM_PROT_READ); 8.1966 + pentium_idt = (union descriptor *)pentium_idt_vaddr; 8.1967 +#endif 8.1968 +#endif 8.1969 + pmap_update(pmap_kernel()); 8.1970 + 8.1971 + initgdt(); 8.1972 + 8.1973 + HYPERVISOR_set_callbacks( 8.1974 + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback, 8.1975 + GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); 8.1976 + 8.1977 +#if !defined(XEN) 8.1978 + tgdt = gdt; 8.1979 + gdt = (union descriptor *) 8.1980 + ((char *)idt + NIDT * sizeof (struct gate_descriptor)); 8.1981 + ldt = gdt + NGDT; 8.1982 + 8.1983 + memcpy(gdt, tgdt, NGDT*sizeof(*gdt)); 8.1984 + 8.1985 + setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1, 8.1986 + SDT_SYSLDT, SEL_KPL, 0, 0); 8.1987 +#else 8.1988 + ldt = (union descriptor *)idt_vaddr; 8.1989 +#endif 8.1990 + 8.1991 + /* make ldt gates and memory segments */ 8.1992 + setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1, 8.1993 + SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 8.1994 + 8.1995 + ldt[LUCODE_SEL] = gdt[GUCODE_SEL]; 8.1996 + ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL]; 8.1997 + ldt[LUDATA_SEL] = gdt[GUDATA_SEL]; 8.1998 + ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 8.1999 + 8.2000 +#if !defined(XEN) 8.2001 + /* exceptions */ 8.2002 + for (x = 0; x < 32; x++) { 8.2003 + setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT, 8.2004 + (x == 3 || x == 4) ? 
SEL_UPL : SEL_KPL, 8.2005 + GSEL(GCODE_SEL, SEL_KPL)); 8.2006 + idt_allocmap[x] = 1; 8.2007 + } 8.2008 + 8.2009 + /* new-style interrupt gate for syscalls */ 8.2010 + setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL, 8.2011 + GSEL(GCODE_SEL, SEL_KPL)); 8.2012 + idt_allocmap[128] = 1; 8.2013 +#ifdef COMPAT_SVR4 8.2014 + setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT, 8.2015 + SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 8.2016 + idt_allocmap[0xd2] = 1; 8.2017 +#endif /* COMPAT_SVR4 */ 8.2018 +#endif 8.2019 + 8.2020 + memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT); 8.2021 + xen_idt_idx = 0; 8.2022 + for (x = 0; x < 32; x++) { 8.2023 + KASSERT(xen_idt_idx < MAX_XEN_IDT); 8.2024 + xen_idt[xen_idt_idx].vector = x; 8.2025 + xen_idt[xen_idt_idx].flags = 8.2026 + (x == 3 || x == 4) ? SEL_UPL : SEL_XEN; 8.2027 + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); 8.2028 + xen_idt[xen_idt_idx].address = 8.2029 + (uint32_t)IDTVEC(exceptions)[x]; 8.2030 + xen_idt_idx++; 8.2031 + } 8.2032 + KASSERT(xen_idt_idx < MAX_XEN_IDT); 8.2033 + xen_idt[xen_idt_idx].vector = 128; 8.2034 + xen_idt[xen_idt_idx].flags = SEL_UPL; 8.2035 + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); 8.2036 + xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall); 8.2037 + xen_idt_idx++; 8.2038 +#ifdef COMPAT_SVR4 8.2039 + KASSERT(xen_idt_idx < MAX_XEN_IDT); 8.2040 + xen_idt[xen_idt_idx].vector = 0xd2; 8.2041 + xen_idt[xen_idt_idx].flags = SEL_UPL; 8.2042 + xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL); 8.2043 + xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap); 8.2044 + xen_idt_idx++; 8.2045 +#endif /* COMPAT_SVR4 */ 8.2046 + 8.2047 +#if !defined(XEN) 8.2048 + setregion(®ion, gdt, NGDT * sizeof(gdt[0]) - 1); 8.2049 + lgdt(®ion); 8.2050 +#else 8.2051 + lldt(GSEL(GLDT_SEL, SEL_KPL)); 8.2052 +#endif 8.2053 + 8.2054 +#if !defined(XEN) 8.2055 + cpu_init_idt(); 8.2056 +#else 8.2057 + db_trap_callback = ddb_trap_hook; 8.2058 + 8.2059 + XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt)); 8.2060 + if (HYPERVISOR_set_trap_table(xen_idt)) 8.2061 + panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt); 8.2062 +#endif 8.2063 + 8.2064 +#if NKSYMS || defined(DDB) || defined(LKM) 8.2065 + { 8.2066 + extern int end; 8.2067 + extern int *esym; 8.2068 + struct btinfo_symtab *symtab; 8.2069 + 8.2070 +#ifdef DDB 8.2071 + db_machine_init(); 8.2072 +#endif 8.2073 + 8.2074 + symtab = lookup_bootinfo(BTINFO_SYMTAB); 8.2075 + 8.2076 + if (symtab) { 8.2077 + symtab->ssym += KERNBASE; 8.2078 + symtab->esym += KERNBASE; 8.2079 + ksyms_init(symtab->nsym, (int *)symtab->ssym, 8.2080 + (int *)symtab->esym); 8.2081 + } 8.2082 + else 8.2083 + ksyms_init(*(int *)&end, ((int *)&end) + 1, esym); 8.2084 + } 8.2085 +#endif 8.2086 +#ifdef DDB 8.2087 + if (boothowto & RB_KDB) 8.2088 + Debugger(); 8.2089 +#endif 8.2090 +#ifdef IPKDB 8.2091 + ipkdb_init(); 8.2092 + if (boothowto & RB_KDB) 8.2093 + ipkdb_connect(0); 8.2094 +#endif 8.2095 +#ifdef KGDB 8.2096 + kgdb_port_init(); 8.2097 + if (boothowto & RB_KDB) { 8.2098 + kgdb_debug_init = 1; 8.2099 + kgdb_connect(1); 8.2100 + } 8.2101 +#endif 8.2102 + 8.2103 +#if NMCA > 0 8.2104 + /* check for MCA bus, needed to be done before ISA stuff - if 8.2105 + * MCA is detected, ISA needs to use level triggered interrupts 8.2106 + * by default */ 8.2107 + mca_busprobe(); 8.2108 +#endif 8.2109 + 8.2110 +#if defined(XEN) 8.2111 + events_default_setup(); 8.2112 +#else 8.2113 + intr_default_setup(); 8.2114 +#endif 8.2115 + 8.2116 + /* Initialize software interrupts. 
*/ 8.2117 + softintr_init(); 8.2118 + 8.2119 + splraise(IPL_IPI); 8.2120 + enable_intr(); 8.2121 + 8.2122 + if (physmem < btoc(2 * 1024 * 1024)) { 8.2123 + printf("warning: too little memory available; " 8.2124 + "have %lu bytes, want %lu bytes\n" 8.2125 + "running in degraded mode\n" 8.2126 + "press a key to confirm\n\n", 8.2127 + ptoa(physmem), 2*1024*1024UL); 8.2128 + cngetc(); 8.2129 + } 8.2130 + 8.2131 +#ifdef __HAVE_CPU_MAXPROC 8.2132 + /* Make sure maxproc is sane */ 8.2133 + if (maxproc > cpu_maxproc()) 8.2134 + maxproc = cpu_maxproc(); 8.2135 +#endif 8.2136 +} 8.2137 + 8.2138 +#ifdef COMPAT_NOMID 8.2139 +static int 8.2140 +exec_nomid(struct proc *p, struct exec_package *epp) 8.2141 +{ 8.2142 + int error; 8.2143 + u_long midmag, magic; 8.2144 + u_short mid; 8.2145 + struct exec *execp = epp->ep_hdr; 8.2146 + 8.2147 + /* check on validity of epp->ep_hdr performed by exec_out_makecmds */ 8.2148 + 8.2149 + midmag = ntohl(execp->a_midmag); 8.2150 + mid = (midmag >> 16) & 0xffff; 8.2151 + magic = midmag & 0xffff; 8.2152 + 8.2153 + if (magic == 0) { 8.2154 + magic = (execp->a_midmag & 0xffff); 8.2155 + mid = MID_ZERO; 8.2156 + } 8.2157 + 8.2158 + midmag = mid << 16 | magic; 8.2159 + 8.2160 + switch (midmag) { 8.2161 + case (MID_ZERO << 16) | ZMAGIC: 8.2162 + /* 8.2163 + * 386BSD's ZMAGIC format: 8.2164 + */ 8.2165 + error = exec_aout_prep_oldzmagic(p, epp); 8.2166 + break; 8.2167 + 8.2168 + case (MID_ZERO << 16) | QMAGIC: 8.2169 + /* 8.2170 + * BSDI's QMAGIC format: 8.2171 + * same as new ZMAGIC format, but with different magic number 8.2172 + */ 8.2173 + error = exec_aout_prep_zmagic(p, epp); 8.2174 + break; 8.2175 + 8.2176 + case (MID_ZERO << 16) | NMAGIC: 8.2177 + /* 8.2178 + * BSDI's NMAGIC format: 8.2179 + * same as NMAGIC format, but with different magic number 8.2180 + * and with text starting at 0. 8.2181 + */ 8.2182 + error = exec_aout_prep_oldnmagic(p, epp); 8.2183 + break; 8.2184 + 8.2185 + case (MID_ZERO << 16) | OMAGIC: 8.2186 + /* 8.2187 + * BSDI's OMAGIC format: 8.2188 + * same as OMAGIC format, but with different magic number 8.2189 + * and with text starting at 0. 8.2190 + */ 8.2191 + error = exec_aout_prep_oldomagic(p, epp); 8.2192 + break; 8.2193 + 8.2194 + default: 8.2195 + error = ENOEXEC; 8.2196 + } 8.2197 + 8.2198 + return error; 8.2199 +} 8.2200 +#endif 8.2201 + 8.2202 +/* 8.2203 + * cpu_exec_aout_makecmds(): 8.2204 + * CPU-dependent a.out format hook for execve(). 8.2205 + * 8.2206 + * Determine of the given exec package refers to something which we 8.2207 + * understand and, if so, set up the vmcmds for it. 8.2208 + * 8.2209 + * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries 8.2210 + * if COMPAT_NOMID is given as a kernel option. 8.2211 + */ 8.2212 +int 8.2213 +cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp) 8.2214 +{ 8.2215 + int error = ENOEXEC; 8.2216 + 8.2217 +#ifdef COMPAT_NOMID 8.2218 + if ((error = exec_nomid(p, epp)) == 0) 8.2219 + return error; 8.2220 +#endif /* ! 
COMPAT_NOMID */ 8.2221 + 8.2222 + return error; 8.2223 +} 8.2224 + 8.2225 +void * 8.2226 +lookup_bootinfo(int type) 8.2227 +{ 8.2228 + struct btinfo_common *help; 8.2229 + int n = *(int*)bootinfo; 8.2230 + help = (struct btinfo_common *)(bootinfo + sizeof(int)); 8.2231 + while(n--) { 8.2232 + if(help->type == type) 8.2233 + return(help); 8.2234 + help = (struct btinfo_common *)((char*)help + help->len); 8.2235 + } 8.2236 + return(0); 8.2237 +} 8.2238 + 8.2239 +#include <dev/ic/mc146818reg.h> /* for NVRAM POST */ 8.2240 +#include <i386/isa/nvram.h> /* for NVRAM POST */ 8.2241 + 8.2242 +void 8.2243 +cpu_reset() 8.2244 +{ 8.2245 + 8.2246 + disable_intr(); 8.2247 + 8.2248 +#if 0 8.2249 + /* 8.2250 + * Ensure the NVRAM reset byte contains something vaguely sane. 8.2251 + */ 8.2252 + 8.2253 + outb(IO_RTC, NVRAM_RESET); 8.2254 + outb(IO_RTC+1, NVRAM_RESET_RST); 8.2255 + 8.2256 + /* 8.2257 + * The keyboard controller has 4 random output pins, one of which is 8.2258 + * connected to the RESET pin on the CPU in many PCs. We tell the 8.2259 + * keyboard controller to pulse this line a couple of times. 8.2260 + */ 8.2261 + outb(IO_KBD + KBCMDP, KBC_PULSE0); 8.2262 + delay(100000); 8.2263 + outb(IO_KBD + KBCMDP, KBC_PULSE0); 8.2264 + delay(100000); 8.2265 +#endif 8.2266 + 8.2267 + HYPERVISOR_reboot(); 8.2268 + 8.2269 + for (;;); 8.2270 +} 8.2271 + 8.2272 +void 8.2273 +cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags) 8.2274 +{ 8.2275 + const struct trapframe *tf = l->l_md.md_regs; 8.2276 + __greg_t *gr = mcp->__gregs; 8.2277 + __greg_t ras_eip; 8.2278 + 8.2279 + /* Save register context. */ 8.2280 +#ifdef VM86 8.2281 + if (tf->tf_eflags & PSL_VM) { 8.2282 + gr[_REG_GS] = tf->tf_vm86_gs; 8.2283 + gr[_REG_FS] = tf->tf_vm86_fs; 8.2284 + gr[_REG_ES] = tf->tf_vm86_es; 8.2285 + gr[_REG_DS] = tf->tf_vm86_ds; 8.2286 + gr[_REG_EFL] = get_vflags(l); 8.2287 + } else 8.2288 +#endif 8.2289 + { 8.2290 + gr[_REG_GS] = tf->tf_gs; 8.2291 + gr[_REG_FS] = tf->tf_fs; 8.2292 + gr[_REG_ES] = tf->tf_es; 8.2293 + gr[_REG_DS] = tf->tf_ds; 8.2294 + gr[_REG_EFL] = tf->tf_eflags; 8.2295 + } 8.2296 + gr[_REG_EDI] = tf->tf_edi; 8.2297 + gr[_REG_ESI] = tf->tf_esi; 8.2298 + gr[_REG_EBP] = tf->tf_ebp; 8.2299 + gr[_REG_EBX] = tf->tf_ebx; 8.2300 + gr[_REG_EDX] = tf->tf_edx; 8.2301 + gr[_REG_ECX] = tf->tf_ecx; 8.2302 + gr[_REG_EAX] = tf->tf_eax; 8.2303 + gr[_REG_EIP] = tf->tf_eip; 8.2304 + gr[_REG_CS] = tf->tf_cs; 8.2305 + gr[_REG_ESP] = tf->tf_esp; 8.2306 + gr[_REG_UESP] = tf->tf_esp; 8.2307 + gr[_REG_SS] = tf->tf_ss; 8.2308 + gr[_REG_TRAPNO] = tf->tf_trapno; 8.2309 + gr[_REG_ERR] = tf->tf_err; 8.2310 + 8.2311 + if ((ras_eip = (__greg_t)ras_lookup(l->l_proc, 8.2312 + (caddr_t) gr[_REG_EIP])) != -1) 8.2313 + gr[_REG_EIP] = ras_eip; 8.2314 + 8.2315 + *flags |= _UC_CPU; 8.2316 + 8.2317 + /* Save floating point register context, if any. */ 8.2318 + if ((l->l_md.md_flags & MDL_USEDFPU) != 0) { 8.2319 +#if NNPX > 0 8.2320 + /* 8.2321 + * If this process is the current FP owner, dump its 8.2322 + * context to the PCB first. 8.2323 + * XXX npxsave() also clears the FPU state; depending on the 8.2324 + * XXX application this might be a penalty. 
8.2325 + */ 8.2326 + if (l->l_addr->u_pcb.pcb_fpcpu) { 8.2327 + npxsave_lwp(l, 1); 8.2328 + } 8.2329 +#endif 8.2330 + if (i386_use_fxsave) { 8.2331 + memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, 8.2332 + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm, 8.2333 + sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm)); 8.2334 + *flags |= _UC_FXSAVE; 8.2335 + } else { 8.2336 + memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, 8.2337 + &l->l_addr->u_pcb.pcb_savefpu.sv_87, 8.2338 + sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state)); 8.2339 + } 8.2340 +#if 0 8.2341 + /* Apparently nothing ever touches this. */ 8.2342 + ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc; 8.2343 +#endif 8.2344 + *flags |= _UC_FPU; 8.2345 + } 8.2346 +} 8.2347 + 8.2348 +int 8.2349 +cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags) 8.2350 +{ 8.2351 + struct trapframe *tf = l->l_md.md_regs; 8.2352 + __greg_t *gr = mcp->__gregs; 8.2353 + 8.2354 + /* Restore register context, if any. */ 8.2355 + if ((flags & _UC_CPU) != 0) { 8.2356 +#ifdef VM86 8.2357 + if (gr[_REG_EFL] & PSL_VM) { 8.2358 + tf->tf_vm86_gs = gr[_REG_GS]; 8.2359 + tf->tf_vm86_fs = gr[_REG_FS]; 8.2360 + tf->tf_vm86_es = gr[_REG_ES]; 8.2361 + tf->tf_vm86_ds = gr[_REG_DS]; 8.2362 + set_vflags(l, gr[_REG_EFL]); 8.2363 + if (flags & _UC_VM) { 8.2364 + void syscall_vm86(struct trapframe *); 8.2365 + l->l_proc->p_md.md_syscall = syscall_vm86; 8.2366 + } 8.2367 + } else 8.2368 +#endif 8.2369 + { 8.2370 + /* 8.2371 + * Check for security violations. If we're returning 8.2372 + * to protected mode, the CPU will validate the segment 8.2373 + * registers automatically and generate a trap on 8.2374 + * violations. We handle the trap, rather than doing 8.2375 + * all of the checking here. 8.2376 + */ 8.2377 + if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) || 8.2378 + !USERMODE(gr[_REG_CS], gr[_REG_EFL])) { 8.2379 + printf("cpu_setmcontext error: uc EFL: 0x%08x" 8.2380 + " tf EFL: 0x%08x uc CS: 0x%x\n", 8.2381 + gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]); 8.2382 + return (EINVAL); 8.2383 + } 8.2384 + tf->tf_gs = gr[_REG_GS]; 8.2385 + tf->tf_fs = gr[_REG_FS]; 8.2386 + tf->tf_es = gr[_REG_ES]; 8.2387 + tf->tf_ds = gr[_REG_DS]; 8.2388 + /* Only change the user-alterable part of eflags */ 8.2389 + tf->tf_eflags &= ~PSL_USER; 8.2390 + tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER); 8.2391 + } 8.2392 + tf->tf_edi = gr[_REG_EDI]; 8.2393 + tf->tf_esi = gr[_REG_ESI]; 8.2394 + tf->tf_ebp = gr[_REG_EBP]; 8.2395 + tf->tf_ebx = gr[_REG_EBX]; 8.2396 + tf->tf_edx = gr[_REG_EDX]; 8.2397 + tf->tf_ecx = gr[_REG_ECX]; 8.2398 + tf->tf_eax = gr[_REG_EAX]; 8.2399 + tf->tf_eip = gr[_REG_EIP]; 8.2400 + tf->tf_cs = gr[_REG_CS]; 8.2401 + tf->tf_esp = gr[_REG_UESP]; 8.2402 + tf->tf_ss = gr[_REG_SS]; 8.2403 + } 8.2404 + 8.2405 + /* Restore floating point register context, if any. */ 8.2406 + if ((flags & _UC_FPU) != 0) { 8.2407 +#if NNPX > 0 8.2408 + /* 8.2409 + * If we were using the FPU, forget that we were. 
8.2410 + */ 8.2411 + if (l->l_addr->u_pcb.pcb_fpcpu != NULL) 8.2412 + npxsave_lwp(l, 0); 8.2413 +#endif 8.2414 + if (flags & _UC_FXSAVE) { 8.2415 + if (i386_use_fxsave) { 8.2416 + memcpy( 8.2417 + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm, 8.2418 + &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, 8.2419 + sizeof (&l->l_addr->u_pcb.pcb_savefpu.sv_xmm)); 8.2420 + } else { 8.2421 + /* This is a weird corner case */ 8.2422 + process_xmm_to_s87((struct savexmm *) 8.2423 + &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm, 8.2424 + &l->l_addr->u_pcb.pcb_savefpu.sv_87); 8.2425 + } 8.2426 + } else { 8.2427 + if (i386_use_fxsave) { 8.2428 + process_s87_to_xmm((struct save87 *) 8.2429 + &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, 8.2430 + &l->l_addr->u_pcb.pcb_savefpu.sv_xmm); 8.2431 + } else { 8.2432 + memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87, 8.2433 + &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state, 8.2434 + sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87)); 8.2435 + } 8.2436 + } 8.2437 + /* If not set already. */ 8.2438 + l->l_md.md_flags |= MDL_USEDFPU; 8.2439 +#if 0 8.2440 + /* Apparently unused. */ 8.2441 + l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts; 8.2442 +#endif 8.2443 + } 8.2444 + if (flags & _UC_SETSTACK) 8.2445 + l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK; 8.2446 + if (flags & _UC_CLRSTACK) 8.2447 + l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK; 8.2448 + return (0); 8.2449 +} 8.2450 + 8.2451 +void 8.2452 +cpu_initclocks() 8.2453 +{ 8.2454 + (*initclock_func)(); 8.2455 +} 8.2456 + 8.2457 +#ifdef MULTIPROCESSOR 8.2458 +void 8.2459 +need_resched(struct cpu_info *ci) 8.2460 +{ 8.2461 + 8.2462 + if (ci->ci_want_resched) 8.2463 + return; 8.2464 + 8.2465 + ci->ci_want_resched = 1; 8.2466 + if ((ci)->ci_curlwp != NULL) 8.2467 + aston((ci)->ci_curlwp->l_proc); 8.2468 + else if (ci != curcpu()) 8.2469 + x86_send_ipi(ci, 0); 8.2470 +} 8.2471 +#endif 8.2472 + 8.2473 +/* 8.2474 + * Allocate an IDT vector slot within the given range. 8.2475 + * XXX needs locking to avoid MP allocation races. 8.2476 + */ 8.2477 + 8.2478 +int 8.2479 +idt_vec_alloc(int low, int high) 8.2480 +{ 8.2481 + int vec; 8.2482 + 8.2483 + simple_lock(&idt_lock); 8.2484 + for (vec = low; vec <= high; vec++) { 8.2485 + if (idt_allocmap[vec] == 0) { 8.2486 + idt_allocmap[vec] = 1; 8.2487 + simple_unlock(&idt_lock); 8.2488 + return vec; 8.2489 + } 8.2490 + } 8.2491 + simple_unlock(&idt_lock); 8.2492 + return 0; 8.2493 +} 8.2494 + 8.2495 +void 8.2496 +idt_vec_set(int vec, void (*function)(void)) 8.2497 +{ 8.2498 + /* 8.2499 + * Vector should be allocated, so no locking needed. 8.2500 + */ 8.2501 + KASSERT(idt_allocmap[vec] == 1); 8.2502 + setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL, 8.2503 + GSEL(GCODE_SEL, SEL_KPL)); 8.2504 +} 8.2505 + 8.2506 +void 8.2507 +idt_vec_free(int vec) 8.2508 +{ 8.2509 + simple_lock(&idt_lock); 8.2510 + unsetgate(&idt[vec]); 8.2511 + idt_allocmap[vec] = 0; 8.2512 + simple_unlock(&idt_lock); 8.2513 +} 8.2514 + 8.2515 +/* 8.2516 + * Number of processes is limited by number of available GDT slots. 8.2517 + */ 8.2518 +int 8.2519 +cpu_maxproc(void) 8.2520 +{ 8.2521 +#ifdef USER_LDT 8.2522 + return ((MAXGDTSIZ - NGDT) / 2); 8.2523 +#else 8.2524 + return (MAXGDTSIZ - NGDT); 8.2525 +#endif 8.2526 +} 8.2527 + 8.2528 +#if defined(DDB) || defined(KGDB) 8.2529 + 8.2530 +/* 8.2531 + * Callback to output a backtrace when entering ddb. 
8.2532 + */ 8.2533 +void 8.2534 +ddb_trap_hook(int where) 8.2535 +{ 8.2536 + static int once = 0; 8.2537 + db_addr_t db_dot; 8.2538 + 8.2539 + if (once != 0 || where != 1) 8.2540 + return; 8.2541 + once = 1; 8.2542 + 8.2543 + if (curlwp != NULL) { 8.2544 + db_printf("Stopped"); 8.2545 + if (curproc == NULL) 8.2546 + db_printf("; curlwp = %p," 8.2547 + " curproc is NULL at\t", curlwp); 8.2548 + else 8.2549 + db_printf(" in pid %d.%d (%s) at\t", 8.2550 + curproc->p_pid, curlwp->l_lid, 8.2551 + curproc->p_comm); 8.2552 + } else 8.2553 + db_printf("Stopped at\t"); 8.2554 + db_dot = PC_REGS(DDB_REGS); 8.2555 + db_print_loc_and_inst(db_dot); 8.2556 + 8.2557 + db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535, 8.2558 + "", db_printf); 8.2559 +#ifdef DEBUG 8.2560 + db_show_regs((db_expr_t) db_dot, FALSE, 65535, ""); 8.2561 +#endif 8.2562 +} 8.2563 + 8.2564 +#endif /* DDB || KGDB */
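The init386() hunk above splits any BIOS memory-map segment that overlaps the PC "Compatibility Holes" (0xa0000-0xfffff) into two clusters, one below the VGA window and one starting at 1M, before the clusters are loaded into UVM. What follows is a minimal user-space sketch of that split, assuming a toy add_mem_cluster() that only records and prints the cluster; the kernel version also clips against the iomem extent map and rounds to page boundaries.

#include <stdio.h>
#include <stdint.h>

struct mem_cluster {
	uint64_t start;
	uint64_t size;
};

static struct mem_cluster mem_clusters[16];
static int mem_cluster_cnt;

/*
 * Toy stand-in: just record and print a cluster.  Segments that are
 * emptied by the split are dropped here instead of being range-checked
 * by the caller.
 */
static void
add_mem_cluster(uint64_t seg_start, uint64_t seg_end, int type)
{
	if (seg_end <= seg_start)
		return;
	mem_clusters[mem_cluster_cnt].start = seg_start;
	mem_clusters[mem_cluster_cnt].size = seg_end - seg_start;
	mem_cluster_cnt++;
	printf("cluster 0x%llx-0x%llx (type %d)\n",
	    (unsigned long long)seg_start, (unsigned long long)seg_end, type);
}

/*
 * Same decision as in init386(): a segment that touches 0xa0000-0xfffff
 * is recorded as two pieces, skipping the frame buffer/ROM/BIOS hole.
 */
static void
load_segment(uint64_t seg_start, uint64_t seg_end, int type)
{
	if (seg_start < 0x100000 && seg_end > 0xa0000) {
		add_mem_cluster(seg_start, 0xa0000, type);
		add_mem_cluster(0x100000, seg_end, type);
	} else
		add_mem_cluster(seg_start, seg_end, type);
}

int
main(void)
{
	load_segment(0x0, 0x9f000, 1);		/* base memory, below the hole */
	load_segment(0x9f000, 0x2000000, 1);	/* a segment spanning the hole */
	return 0;
}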
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c Mon Sep 06 19:04:16 2004 +0000 9.3 @@ -0,0 +1,4522 @@ 9.4 +/* $NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $ */ 9.5 +/* NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp */ 9.6 + 9.7 +/* 9.8 + * 9.9 + * Copyright (c) 1997 Charles D. Cranor and Washington University. 9.10 + * All rights reserved. 9.11 + * 9.12 + * Redistribution and use in source and binary forms, with or without 9.13 + * modification, are permitted provided that the following conditions 9.14 + * are met: 9.15 + * 1. Redistributions of source code must retain the above copyright 9.16 + * notice, this list of conditions and the following disclaimer. 9.17 + * 2. Redistributions in binary form must reproduce the above copyright 9.18 + * notice, this list of conditions and the following disclaimer in the 9.19 + * documentation and/or other materials provided with the distribution. 9.20 + * 3. All advertising materials mentioning features or use of this software 9.21 + * must display the following acknowledgement: 9.22 + * This product includes software developed by Charles D. Cranor and 9.23 + * Washington University. 9.24 + * 4. The name of the author may not be used to endorse or promote products 9.25 + * derived from this software without specific prior written permission. 9.26 + * 9.27 + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 9.28 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 9.29 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 9.30 + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 9.31 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 9.32 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 9.33 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 9.34 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 9.35 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 9.36 + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 9.37 + */ 9.38 + 9.39 +/* 9.40 + * pmap.c: i386 pmap module rewrite 9.41 + * Chuck Cranor <chuck@ccrc.wustl.edu> 9.42 + * 11-Aug-97 9.43 + * 9.44 + * history of this pmap module: in addition to my own input, i used 9.45 + * the following references for this rewrite of the i386 pmap: 9.46 + * 9.47 + * [1] the NetBSD i386 pmap. this pmap appears to be based on the 9.48 + * BSD hp300 pmap done by Mike Hibler at University of Utah. 9.49 + * it was then ported to the i386 by William Jolitz of UUNET 9.50 + * Technologies, Inc. Then Charles M. Hannum of the NetBSD 9.51 + * project fixed some bugs and provided some speed ups. 9.52 + * 9.53 + * [2] the FreeBSD i386 pmap. this pmap seems to be the 9.54 + * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 9.55 + * and David Greenman. 9.56 + * 9.57 + * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 9.58 + * between several processors. the VAX version was done by 9.59 + * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 9.60 + * version was done by Lance Berc, Mike Kupfer, Bob Baron, 9.61 + * David Golub, and Richard Draves. the alpha version was 9.62 + * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 9.63 + * (NetBSD/alpha). 
9.64 + */ 9.65 + 9.66 +#include <sys/cdefs.h> 9.67 +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $"); 9.68 + 9.69 +#include "opt_cputype.h" 9.70 +#include "opt_user_ldt.h" 9.71 +#include "opt_largepages.h" 9.72 +#include "opt_lockdebug.h" 9.73 +#include "opt_multiprocessor.h" 9.74 +#include "opt_kstack_dr0.h" 9.75 +#include "opt_xen.h" 9.76 + 9.77 +#include <sys/param.h> 9.78 +#include <sys/systm.h> 9.79 +#include <sys/proc.h> 9.80 +#include <sys/malloc.h> 9.81 +#include <sys/pool.h> 9.82 +#include <sys/user.h> 9.83 +#include <sys/kernel.h> 9.84 + 9.85 +#include <uvm/uvm.h> 9.86 + 9.87 +#include <machine/atomic.h> 9.88 +#include <machine/cpu.h> 9.89 +#include <machine/specialreg.h> 9.90 +#include <machine/gdt.h> 9.91 + 9.92 +#include <dev/isa/isareg.h> 9.93 +#include <machine/isa_machdep.h> 9.94 + 9.95 +#include <machine/xen.h> 9.96 +#include <machine/hypervisor.h> 9.97 +#include <machine/xenpmap.h> 9.98 + 9.99 +void xpmap_find_pte(paddr_t); 9.100 + 9.101 +/* #define XENDEBUG */ 9.102 + 9.103 +#ifdef XENDEBUG 9.104 +#define XENPRINTF(x) printf x 9.105 +#define XENPRINTK(x) printf x 9.106 +#else 9.107 +#define XENPRINTF(x) 9.108 +#define XENPRINTK(x) 9.109 +#endif 9.110 +#define PRINTF(x) printf x 9.111 +#define PRINTK(x) printf x 9.112 + 9.113 + 9.114 +/* 9.115 + * general info: 9.116 + * 9.117 + * - for an explanation of how the i386 MMU hardware works see 9.118 + * the comments in <machine/pte.h>. 9.119 + * 9.120 + * - for an explanation of the general memory structure used by 9.121 + * this pmap (including the recursive mapping), see the comments 9.122 + * in <machine/pmap.h>. 9.123 + * 9.124 + * this file contains the code for the "pmap module." the module's 9.125 + * job is to manage the hardware's virtual to physical address mappings. 9.126 + * note that there are two levels of mapping in the VM system: 9.127 + * 9.128 + * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 9.129 + * to map ranges of virtual address space to objects/files. for 9.130 + * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 9.131 + * to the file /bin/ls starting at offset zero." note that 9.132 + * the upper layer mapping is not concerned with how individual 9.133 + * vm_pages are mapped. 9.134 + * 9.135 + * [2] the lower layer of the VM system (the pmap) maintains the mappings 9.136 + * from virtual addresses. it is concerned with which vm_page is 9.137 + * mapped where. for example, when you run /bin/ls and start 9.138 + * at page 0x1000 the fault routine may lookup the correct page 9.139 + * of the /bin/ls file and then ask the pmap layer to establish 9.140 + * a mapping for it. 9.141 + * 9.142 + * note that information in the lower layer of the VM system can be 9.143 + * thrown away since it can easily be reconstructed from the info 9.144 + * in the upper layer. 9.145 + * 9.146 + * data structures we use include: 9.147 + * 9.148 + * - struct pmap: describes the address space of one thread 9.149 + * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 9.150 + * - struct pv_head: there is one pv_head per managed page of 9.151 + * physical memory. the pv_head points to a list of pv_entry 9.152 + * structures which describe all the <PMAP,VA> pairs that this 9.153 + * page is mapped in. this is critical for page based operations 9.154 + * such as pmap_page_protect() [change protection on _all_ mappings 9.155 + * of a page] 9.156 + * - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's. 
9.157 + * if we run out of pv_entry's we allocate a new pv_page and free 9.158 + * its pv_entrys. 9.159 + * - pmap_remove_record: a list of virtual addresses whose mappings 9.160 + * have been changed. used for TLB flushing. 9.161 + */ 9.162 + 9.163 +/* 9.164 + * memory allocation 9.165 + * 9.166 + * - there are three data structures that we must dynamically allocate: 9.167 + * 9.168 + * [A] new process' page directory page (PDP) 9.169 + * - plan 1: done at pmap_create() we use 9.170 + * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 9.171 + * allocation. 9.172 + * 9.173 + * if we are low in free physical memory then we sleep in 9.174 + * uvm_km_alloc -- in this case this is ok since we are creating 9.175 + * a new pmap and should not be holding any locks. 9.176 + * 9.177 + * if the kernel is totally out of virtual space 9.178 + * (i.e. uvm_km_alloc returns NULL), then we panic. 9.179 + * 9.180 + * XXX: the fork code currently has no way to return an "out of 9.181 + * memory, try again" error code since uvm_fork [fka vm_fork] 9.182 + * is a void function. 9.183 + * 9.184 + * [B] new page tables pages (PTP) 9.185 + * - call uvm_pagealloc() 9.186 + * => success: zero page, add to pm_pdir 9.187 + * => failure: we are out of free vm_pages, let pmap_enter() 9.188 + * tell UVM about it. 9.189 + * 9.190 + * note: for kernel PTPs, we start with NKPTP of them. as we map 9.191 + * kernel memory (at uvm_map time) we check to see if we've grown 9.192 + * the kernel pmap. if so, we call the optional function 9.193 + * pmap_growkernel() to grow the kernel PTPs in advance. 9.194 + * 9.195 + * [C] pv_entry structures 9.196 + * - plan 1: try to allocate one off the free list 9.197 + * => success: done! 9.198 + * => failure: no more free pv_entrys on the list 9.199 + * - plan 2: try to allocate a new pv_page to add a chunk of 9.200 + * pv_entrys to the free list 9.201 + * [a] obtain a free, unmapped, VA in kmem_map. either 9.202 + * we have one saved from a previous call, or we allocate 9.203 + * one now using a "vm_map_lock_try" in uvm_map 9.204 + * => success: we have an unmapped VA, continue to [b] 9.205 + * => failure: unable to lock kmem_map or out of VA in it. 9.206 + * move on to plan 3. 9.207 + * [b] allocate a page in kmem_object for the VA 9.208 + * => success: map it in, free the pv_entry's, DONE! 9.209 + * => failure: kmem_object locked, no free vm_pages, etc. 9.210 + * save VA for later call to [a], go to plan 3. 9.211 + * If we fail, we simply let pmap_enter() tell UVM about it. 9.212 + */ 9.213 + 9.214 +/* 9.215 + * locking 9.216 + * 9.217 + * we have the following locks that we must contend with: 9.218 + * 9.219 + * "normal" locks: 9.220 + * 9.221 + * - pmap_main_lock 9.222 + * this lock is used to prevent deadlock and/or provide mutex 9.223 + * access to the pmap system. most operations lock the pmap 9.224 + * structure first, then they lock the pv_lists (if needed). 9.225 + * however, some operations such as pmap_page_protect lock 9.226 + * the pv_lists and then lock pmaps. in order to prevent a 9.227 + * cycle, we require a mutex lock when locking the pv_lists 9.228 + * first. thus, the "pmap = >pv_list" lockers must gain a 9.229 + * read-lock on pmap_main_lock before locking the pmap. and 9.230 + * the "pv_list => pmap" lockers must gain a write-lock on 9.231 + * pmap_main_lock before locking. since only one thread 9.232 + * can write-lock a lock at a time, this provides mutex. 
9.233 + * 9.234 + * "simple" locks: 9.235 + * 9.236 + * - pmap lock (per pmap, part of uvm_object) 9.237 + * this lock protects the fields in the pmap structure including 9.238 + * the non-kernel PDEs in the PDP, and the PTEs. it also locks 9.239 + * in the alternate PTE space (since that is determined by the 9.240 + * entry in the PDP). 9.241 + * 9.242 + * - pvh_lock (per pv_head) 9.243 + * this lock protects the pv_entry list which is chained off the 9.244 + * pv_head structure for a specific managed PA. it is locked 9.245 + * when traversing the list (e.g. adding/removing mappings, 9.246 + * syncing R/M bits, etc.) 9.247 + * 9.248 + * - pvalloc_lock 9.249 + * this lock protects the data structures which are used to manage 9.250 + * the free list of pv_entry structures. 9.251 + * 9.252 + * - pmaps_lock 9.253 + * this lock protects the list of active pmaps (headed by "pmaps"). 9.254 + * we lock it when adding or removing pmaps from this list. 9.255 + * 9.256 + */ 9.257 + 9.258 +/* 9.259 + * locking data structures 9.260 + */ 9.261 + 9.262 +static struct simplelock pvalloc_lock; 9.263 +static struct simplelock pmaps_lock; 9.264 + 9.265 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) 9.266 +static struct lock pmap_main_lock; 9.267 + 9.268 +#define PMAP_MAP_TO_HEAD_LOCK() \ 9.269 + (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL) 9.270 +#define PMAP_MAP_TO_HEAD_UNLOCK() \ 9.271 + (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL) 9.272 + 9.273 +#define PMAP_HEAD_TO_MAP_LOCK() \ 9.274 + (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL) 9.275 +#define PMAP_HEAD_TO_MAP_UNLOCK() \ 9.276 + spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0) 9.277 + 9.278 +#else 9.279 + 9.280 +#define PMAP_MAP_TO_HEAD_LOCK() /* null */ 9.281 +#define PMAP_MAP_TO_HEAD_UNLOCK() /* null */ 9.282 + 9.283 +#define PMAP_HEAD_TO_MAP_LOCK() /* null */ 9.284 +#define PMAP_HEAD_TO_MAP_UNLOCK() /* null */ 9.285 + 9.286 +#endif 9.287 + 9.288 +#define COUNT(x) /* nothing */ 9.289 + 9.290 +/* 9.291 + * TLB Shootdown: 9.292 + * 9.293 + * When a mapping is changed in a pmap, the TLB entry corresponding to 9.294 + * the virtual address must be invalidated on all processors. In order 9.295 + * to accomplish this on systems with multiple processors, messages are 9.296 + * sent from the processor which performs the mapping change to all 9.297 + * processors on which the pmap is active. For other processors, the 9.298 + * ASN generation numbers for that processor is invalidated, so that 9.299 + * the next time the pmap is activated on that processor, a new ASN 9.300 + * will be allocated (which implicitly invalidates all TLB entries). 9.301 + * 9.302 + * Shootdown job queue entries are allocated using a simple special- 9.303 + * purpose allocator for speed. 
9.304 + */ 9.305 +struct pmap_tlb_shootdown_job { 9.306 + TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list; 9.307 + vaddr_t pj_va; /* virtual address */ 9.308 + pmap_t pj_pmap; /* the pmap which maps the address */ 9.309 + pt_entry_t pj_pte; /* the PTE bits */ 9.310 + struct pmap_tlb_shootdown_job *pj_nextfree; 9.311 +}; 9.312 + 9.313 +#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32 9.314 +union pmap_tlb_shootdown_job_al { 9.315 + struct pmap_tlb_shootdown_job pja_job; 9.316 + char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN]; 9.317 +}; 9.318 + 9.319 +struct pmap_tlb_shootdown_q { 9.320 + TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head; 9.321 + int pq_pte; /* aggregate PTE bits */ 9.322 + int pq_count; /* number of pending requests */ 9.323 + __cpu_simple_lock_t pq_slock; /* spin lock on queue */ 9.324 + int pq_flushg; /* pending flush global */ 9.325 + int pq_flushu; /* pending flush user */ 9.326 +} pmap_tlb_shootdown_q[X86_MAXPROCS]; 9.327 + 9.328 +#define PMAP_TLB_MAXJOBS 16 9.329 + 9.330 +void pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *); 9.331 +struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get 9.332 + (struct pmap_tlb_shootdown_q *); 9.333 +void pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *, 9.334 + struct pmap_tlb_shootdown_job *); 9.335 + 9.336 +__cpu_simple_lock_t pmap_tlb_shootdown_job_lock; 9.337 +union pmap_tlb_shootdown_job_al *pj_page, *pj_free; 9.338 + 9.339 +/* 9.340 + * global data structures 9.341 + */ 9.342 + 9.343 +struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 9.344 + 9.345 +/* 9.346 + * nkpde is the number of kernel PTPs allocated for the kernel at 9.347 + * boot time (NKPTP is a compile time override). this number can 9.348 + * grow dynamically as needed (but once allocated, we never free 9.349 + * kernel PTPs). 9.350 + */ 9.351 + 9.352 +int nkpde = NKPTP; 9.353 +#ifdef NKPDE 9.354 +#error "obsolete NKPDE: use NKPTP" 9.355 +#endif 9.356 + 9.357 +/* 9.358 + * pmap_pg_g: if our processor supports PG_G in the PTE then we 9.359 + * set pmap_pg_g to PG_G (otherwise it is zero). 9.360 + */ 9.361 + 9.362 +int pmap_pg_g = 0; 9.363 + 9.364 +#ifdef LARGEPAGES 9.365 +/* 9.366 + * pmap_largepages: if our processor supports PG_PS and we are 9.367 + * using it, this is set to TRUE. 9.368 + */ 9.369 + 9.370 +int pmap_largepages; 9.371 +#endif 9.372 + 9.373 +/* 9.374 + * i386 physical memory comes in a big contig chunk with a small 9.375 + * hole toward the front of it... the following two paddr_t's 9.376 + * (shared with machdep.c) describe the physical address space 9.377 + * of this machine. 9.378 + */ 9.379 +paddr_t avail_start; /* PA of first available physical page */ 9.380 +paddr_t avail_end; /* PA of last available physical page */ 9.381 + 9.382 +paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 9.383 +paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 9.384 + 9.385 + /* MA of last physical page of the machine */ 9.386 +paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */ 9.387 + 9.388 +/* 9.389 + * other data structures 9.390 + */ 9.391 + 9.392 +static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 9.393 +static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */ 9.394 + 9.395 +/* 9.396 + * the following two vaddr_t's are used during system startup 9.397 + * to keep track of how much of the kernel's VM space we have used. 9.398 + * once the system is started, the management of the remaining kernel 9.399 + * VM space is turned over to the kernel_map vm_map. 
9.400 + */ 9.401 + 9.402 +static vaddr_t virtual_avail; /* VA of first free KVA */ 9.403 +static vaddr_t virtual_end; /* VA of last free KVA */ 9.404 + 9.405 + 9.406 +/* 9.407 + * pv_page management structures: locked by pvalloc_lock 9.408 + */ 9.409 + 9.410 +TAILQ_HEAD(pv_pagelist, pv_page); 9.411 +static struct pv_pagelist pv_freepages; /* list of pv_pages with free entrys */ 9.412 +static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */ 9.413 +static int pv_nfpvents; /* # of free pv entries */ 9.414 +static struct pv_page *pv_initpage; /* bootstrap page from kernel_map */ 9.415 +static vaddr_t pv_cachedva; /* cached VA for later use */ 9.416 + 9.417 +#define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */ 9.418 +#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2)) 9.419 + /* high water mark */ 9.420 + 9.421 +static __inline int 9.422 +pv_compare(struct pv_entry *a, struct pv_entry *b) 9.423 +{ 9.424 + if (a->pv_pmap < b->pv_pmap) 9.425 + return (-1); 9.426 + else if (a->pv_pmap > b->pv_pmap) 9.427 + return (1); 9.428 + else if (a->pv_va < b->pv_va) 9.429 + return (-1); 9.430 + else if (a->pv_va > b->pv_va) 9.431 + return (1); 9.432 + else 9.433 + return (0); 9.434 +} 9.435 + 9.436 +SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare); 9.437 +SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare); 9.438 + 9.439 +/* 9.440 + * linked list of all non-kernel pmaps 9.441 + */ 9.442 + 9.443 +static struct pmap_head pmaps; 9.444 + 9.445 +/* 9.446 + * pool that pmap structures are allocated from 9.447 + */ 9.448 + 9.449 +struct pool pmap_pmap_pool; 9.450 + 9.451 +/* 9.452 + * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 9.453 + * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing 9.454 + * due to false sharing. 9.455 + */ 9.456 + 9.457 +#ifdef MULTIPROCESSOR 9.458 +#define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 9.459 +#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 9.460 +#else 9.461 +#define PTESLEW(pte, id) (pte) 9.462 +#define VASLEW(va,id) (va) 9.463 +#endif 9.464 + 9.465 +/* 9.466 + * special VAs and the PTEs that map them 9.467 + */ 9.468 +static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte; 9.469 +static caddr_t csrcp, cdstp, zerop, ptpp; 9.470 + 9.471 +/* 9.472 + * pool and cache that PDPs are allocated from 9.473 + */ 9.474 + 9.475 +struct pool pmap_pdp_pool; 9.476 +struct pool_cache pmap_pdp_cache; 9.477 +u_int pmap_pdp_cache_generation; 9.478 + 9.479 +int pmap_pdp_ctor(void *, void *, int); 9.480 +void pmap_pdp_dtor(void *, void *); 9.481 + 9.482 +caddr_t vmmap; /* XXX: used by mem.c... 
it should really uvm_map_reserve it */ 9.483 + 9.484 +extern vaddr_t msgbuf_vaddr; 9.485 +extern paddr_t msgbuf_paddr; 9.486 + 9.487 +extern vaddr_t idt_vaddr; /* we allocate IDT early */ 9.488 +extern paddr_t idt_paddr; 9.489 + 9.490 +#if defined(I586_CPU) 9.491 +/* stuff to fix the pentium f00f bug */ 9.492 +extern vaddr_t pentium_idt_vaddr; 9.493 +#endif 9.494 + 9.495 + 9.496 +/* 9.497 + * local prototypes 9.498 + */ 9.499 + 9.500 +static struct pv_entry *pmap_add_pvpage(struct pv_page *, boolean_t); 9.501 +static struct vm_page *pmap_alloc_ptp(struct pmap *, int); 9.502 +static struct pv_entry *pmap_alloc_pv(struct pmap *, int); /* see codes below */ 9.503 +#define ALLOCPV_NEED 0 /* need PV now */ 9.504 +#define ALLOCPV_TRY 1 /* just try to allocate, don't steal */ 9.505 +#define ALLOCPV_NONEED 2 /* don't need PV, just growing cache */ 9.506 +static struct pv_entry *pmap_alloc_pvpage(struct pmap *, int); 9.507 +static void pmap_enter_pv(struct pv_head *, 9.508 + struct pv_entry *, struct pmap *, 9.509 + vaddr_t, struct vm_page *); 9.510 +static void pmap_free_pv(struct pmap *, struct pv_entry *); 9.511 +static void pmap_free_pvs(struct pmap *, struct pv_entry *); 9.512 +static void pmap_free_pv_doit(struct pv_entry *); 9.513 +static void pmap_free_pvpage(void); 9.514 +static struct vm_page *pmap_get_ptp(struct pmap *, int); 9.515 +static boolean_t pmap_is_curpmap(struct pmap *); 9.516 +static boolean_t pmap_is_active(struct pmap *, int); 9.517 +static pt_entry_t *pmap_map_ptes(struct pmap *); 9.518 +static struct pv_entry *pmap_remove_pv(struct pv_head *, struct pmap *, 9.519 + vaddr_t); 9.520 +static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 9.521 +static boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, 9.522 + pt_entry_t *, vaddr_t, int32_t *, int); 9.523 +static void pmap_remove_ptes(struct pmap *, struct vm_page *, 9.524 + vaddr_t, vaddr_t, vaddr_t, int32_t *, 9.525 + int); 9.526 +#define PMAP_REMOVE_ALL 0 /* remove all mappings */ 9.527 +#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 9.528 + 9.529 +static vaddr_t pmap_tmpmap_pa(paddr_t); 9.530 +static pt_entry_t *pmap_tmpmap_pvepte(struct pv_entry *); 9.531 +static void pmap_tmpunmap_pa(void); 9.532 +static void pmap_tmpunmap_pvepte(struct pv_entry *); 9.533 +static void pmap_unmap_ptes(struct pmap *); 9.534 + 9.535 +static boolean_t pmap_reactivate(struct pmap *); 9.536 + 9.537 +#ifdef DEBUG 9.538 +u_int curapdp; 9.539 +#endif 9.540 + 9.541 +/* 9.542 + * p m a p i n l i n e h e l p e r f u n c t i o n s 9.543 + */ 9.544 + 9.545 +/* 9.546 + * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 9.547 + * of course the kernel is always loaded 9.548 + */ 9.549 + 9.550 +__inline static boolean_t 9.551 +pmap_is_curpmap(pmap) 9.552 + struct pmap *pmap; 9.553 +{ 9.554 + 9.555 + return((pmap == pmap_kernel()) || 9.556 + (pmap == curcpu()->ci_pmap)); 9.557 +} 9.558 + 9.559 +/* 9.560 + * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 
9.561 + */ 9.562 + 9.563 +__inline static boolean_t 9.564 +pmap_is_active(pmap, cpu_id) 9.565 + struct pmap *pmap; 9.566 + int cpu_id; 9.567 +{ 9.568 + 9.569 + return (pmap == pmap_kernel() || 9.570 + (pmap->pm_cpus & (1U << cpu_id)) != 0); 9.571 +} 9.572 + 9.573 +/* 9.574 + * pmap_tmpmap_pa: map a page in for tmp usage 9.575 + */ 9.576 + 9.577 +__inline static vaddr_t 9.578 +pmap_tmpmap_pa(pa) 9.579 + paddr_t pa; 9.580 +{ 9.581 +#ifdef MULTIPROCESSOR 9.582 + int id = cpu_number(); 9.583 +#endif 9.584 + pt_entry_t *ptpte = PTESLEW(ptp_pte, id); 9.585 + pt_entry_t *maptp; 9.586 + caddr_t ptpva = VASLEW(ptpp, id); 9.587 +#if defined(DIAGNOSTIC) 9.588 + if (*ptpte) 9.589 + panic("pmap_tmpmap_pa: ptp_pte in use?"); 9.590 +#endif 9.591 + maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte); 9.592 + PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */ 9.593 + return((vaddr_t)ptpva); 9.594 +} 9.595 + 9.596 +/* 9.597 + * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa) 9.598 + */ 9.599 + 9.600 +__inline static void 9.601 +pmap_tmpunmap_pa() 9.602 +{ 9.603 +#ifdef MULTIPROCESSOR 9.604 + int id = cpu_number(); 9.605 +#endif 9.606 + pt_entry_t *ptpte = PTESLEW(ptp_pte, id); 9.607 + pt_entry_t *maptp; 9.608 + caddr_t ptpva = VASLEW(ptpp, id); 9.609 +#if defined(DIAGNOSTIC) 9.610 + if (!pmap_valid_entry(*ptp_pte)) 9.611 + panic("pmap_tmpunmap_pa: our pte invalid?"); 9.612 +#endif 9.613 + maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte); 9.614 + PTE_CLEAR(ptpte, maptp); /* zap! */ 9.615 + pmap_update_pg((vaddr_t)ptpva); 9.616 +#ifdef MULTIPROCESSOR 9.617 + /* 9.618 + * No need for tlb shootdown here, since ptp_pte is per-CPU. 9.619 + */ 9.620 +#endif 9.621 +} 9.622 + 9.623 +/* 9.624 + * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry 9.625 + * 9.626 + * => do NOT use this on kernel mappings [why? because pv_ptp may be NULL] 9.627 + */ 9.628 + 9.629 +__inline static pt_entry_t * 9.630 +pmap_tmpmap_pvepte(pve) 9.631 + struct pv_entry *pve; 9.632 +{ 9.633 +#ifdef DIAGNOSTIC 9.634 + if (pve->pv_pmap == pmap_kernel()) 9.635 + panic("pmap_tmpmap_pvepte: attempt to map kernel"); 9.636 +#endif 9.637 + 9.638 + /* is it current pmap? use direct mapping... */ 9.639 + if (pmap_is_curpmap(pve->pv_pmap)) 9.640 + return(vtopte(pve->pv_va)); 9.641 + 9.642 + return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp))) 9.643 + + ptei((unsigned)pve->pv_va)); 9.644 +} 9.645 + 9.646 +/* 9.647 + * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte 9.648 + */ 9.649 + 9.650 +__inline static void 9.651 +pmap_tmpunmap_pvepte(pve) 9.652 + struct pv_entry *pve; 9.653 +{ 9.654 + /* was it current pmap? if so, return */ 9.655 + if (pmap_is_curpmap(pve->pv_pmap)) 9.656 + return; 9.657 + 9.658 + pmap_tmpunmap_pa(); 9.659 +} 9.660 + 9.661 +__inline static void 9.662 +pmap_apte_flush(struct pmap *pmap) 9.663 +{ 9.664 +#if defined(MULTIPROCESSOR) 9.665 + struct pmap_tlb_shootdown_q *pq; 9.666 + struct cpu_info *ci, *self = curcpu(); 9.667 + CPU_INFO_ITERATOR cii; 9.668 + int s; 9.669 +#endif 9.670 + 9.671 + tlbflush(); /* flush TLB on current processor */ 9.672 +#if defined(MULTIPROCESSOR) 9.673 + /* 9.674 + * Flush the APTE mapping from all other CPUs that 9.675 + * are using the pmap we are using (who's APTE space 9.676 + * is the one we've just modified). 9.677 + * 9.678 + * XXXthorpej -- find a way to defer the IPI. 
9.679 + */ 9.680 + for (CPU_INFO_FOREACH(cii, ci)) { 9.681 + if (ci == self) 9.682 + continue; 9.683 + if (pmap_is_active(pmap, ci->ci_cpuid)) { 9.684 + pq = &pmap_tlb_shootdown_q[ci->ci_cpuid]; 9.685 + s = splipi(); 9.686 + __cpu_simple_lock(&pq->pq_slock); 9.687 + pq->pq_flushu++; 9.688 + __cpu_simple_unlock(&pq->pq_slock); 9.689 + splx(s); 9.690 + x86_send_ipi(ci, X86_IPI_TLB); 9.691 + } 9.692 + } 9.693 +#endif 9.694 +} 9.695 + 9.696 +/* 9.697 + * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 9.698 + * 9.699 + * => we lock enough pmaps to keep things locked in 9.700 + * => must be undone with pmap_unmap_ptes before returning 9.701 + */ 9.702 + 9.703 +__inline static pt_entry_t * 9.704 +pmap_map_ptes(pmap) 9.705 + struct pmap *pmap; 9.706 +{ 9.707 + pd_entry_t opde; 9.708 + pd_entry_t *mapdp; 9.709 + struct pmap *ourpmap; 9.710 + struct cpu_info *ci; 9.711 + 9.712 + /* the kernel's pmap is always accessible */ 9.713 + if (pmap == pmap_kernel()) { 9.714 + return(PTE_BASE); 9.715 + } 9.716 + 9.717 + ci = curcpu(); 9.718 + if (ci->ci_want_pmapload && 9.719 + vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap) 9.720 + pmap_load(); 9.721 + 9.722 + /* if curpmap then we are always mapped */ 9.723 + if (pmap_is_curpmap(pmap)) { 9.724 + simple_lock(&pmap->pm_obj.vmobjlock); 9.725 + return(PTE_BASE); 9.726 + } 9.727 + 9.728 + ourpmap = ci->ci_pmap; 9.729 + 9.730 + /* need to lock both curpmap and pmap: use ordered locking */ 9.731 + if ((unsigned) pmap < (unsigned) ourpmap) { 9.732 + simple_lock(&pmap->pm_obj.vmobjlock); 9.733 + simple_lock(&ourpmap->pm_obj.vmobjlock); 9.734 + } else { 9.735 + simple_lock(&ourpmap->pm_obj.vmobjlock); 9.736 + simple_lock(&pmap->pm_obj.vmobjlock); 9.737 + } 9.738 + 9.739 + /* need to load a new alternate pt space into curpmap? 
*/ 9.740 + COUNT(apdp_pde_map); 9.741 + opde = PDE_GET(APDP_PDE); 9.742 + if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) { 9.743 + XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n", 9.744 + pmap, 9.745 + (void *)vtophys((vaddr_t)APDP_PDE), 9.746 + (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)), 9.747 + (void *)pmap->pm_pdirpa, 9.748 + (void *)xpmap_ptom(pmap->pm_pdirpa))); 9.749 + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); 9.750 + PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V); 9.751 +#ifdef DEBUG 9.752 + curapdp = pmap->pm_pdirpa; 9.753 +#endif 9.754 + if (pmap_valid_entry(opde)) 9.755 + pmap_apte_flush(ourpmap); 9.756 + XENPRINTF(("APDP_PDE set done\n")); 9.757 + } 9.758 + return(APTE_BASE); 9.759 +} 9.760 + 9.761 +/* 9.762 + * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 9.763 + */ 9.764 + 9.765 +__inline static void 9.766 +pmap_unmap_ptes(pmap) 9.767 + struct pmap *pmap; 9.768 +{ 9.769 +#if defined(MULTIPROCESSOR) 9.770 + pd_entry_t *mapdp; 9.771 +#endif 9.772 + 9.773 + if (pmap == pmap_kernel()) { 9.774 + return; 9.775 + } 9.776 + if (pmap_is_curpmap(pmap)) { 9.777 + simple_unlock(&pmap->pm_obj.vmobjlock); 9.778 + } else { 9.779 + struct pmap *ourpmap = curcpu()->ci_pmap; 9.780 + 9.781 +#if defined(MULTIPROCESSOR) 9.782 + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); 9.783 + PDE_CLEAR(APDP_PDE, mapdp); 9.784 + pmap_apte_flush(ourpmap); 9.785 +#endif 9.786 +#ifdef DEBUG 9.787 + curapdp = 0; 9.788 +#endif 9.789 + XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n", 9.790 + (void *)vtophys((vaddr_t)APDP_PDE), 9.791 + (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)), 9.792 + (void *)pmap->pm_pdirpa, 9.793 + (void *)xpmap_ptom(pmap->pm_pdirpa))); 9.794 + COUNT(apdp_pde_unmap); 9.795 + simple_unlock(&pmap->pm_obj.vmobjlock); 9.796 + simple_unlock(&ourpmap->pm_obj.vmobjlock); 9.797 + } 9.798 +} 9.799 + 9.800 +__inline static void 9.801 +pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 9.802 +{ 9.803 + if (curproc == NULL || curproc->p_vmspace == NULL || 9.804 + pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 9.805 + return; 9.806 + 9.807 + if ((opte ^ npte) & PG_X) 9.808 + pmap_update_pg(va); 9.809 + 9.810 + /* 9.811 + * Executability was removed on the last executable change. 9.812 + * Reset the code segment to something conservative and 9.813 + * let the trap handler deal with setting the right limit. 9.814 + * We can't do that because of locking constraints on the vm map. 
9.815 + */ 9.816 + 9.817 + if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 9.818 + struct trapframe *tf = curlwp->l_md.md_regs; 9.819 + struct pcb *pcb = &curlwp->l_addr->u_pcb; 9.820 + 9.821 + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 9.822 + pm->pm_hiexec = I386_MAX_EXE_ADDR; 9.823 + } 9.824 +} 9.825 + 9.826 +__inline static pt_entry_t 9.827 +pte_mtop(pt_entry_t pte) 9.828 +{ 9.829 + pt_entry_t ppte; 9.830 + 9.831 + KDASSERT(pmap_valid_entry(pte)); 9.832 + ppte = xpmap_mtop(pte); 9.833 + if ((ppte & PG_FRAME) == XPMAP_OFFSET) { 9.834 + XENPRINTF(("pte_mtop: null page %08x -> %08x\n", 9.835 + ppte, pte)); 9.836 + ppte = pte; 9.837 + } 9.838 + 9.839 + return ppte; 9.840 +} 9.841 + 9.842 +__inline static pt_entry_t 9.843 +pte_get_ma(pt_entry_t *pte) 9.844 +{ 9.845 + 9.846 + return *pte; 9.847 +} 9.848 + 9.849 +__inline static pt_entry_t 9.850 +pte_get(pt_entry_t *pte) 9.851 +{ 9.852 + 9.853 + if (pmap_valid_entry(*pte)) 9.854 + return pte_mtop(*pte); 9.855 + return *pte; 9.856 +} 9.857 + 9.858 +__inline static pt_entry_t 9.859 +pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte) 9.860 +{ 9.861 + pt_entry_t opte; 9.862 + 9.863 + XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n", 9.864 + pte, mapte, npte)); 9.865 + opte = PTE_GET_MA(pte); 9.866 + if (opte > pmap_mem_end) { 9.867 + /* must remove opte unchecked */ 9.868 + if (npte > pmap_mem_end) 9.869 + /* must set npte unchecked */ 9.870 + xpq_queue_unchecked_pte_update(mapte, npte); 9.871 + else { 9.872 + /* must set npte checked */ 9.873 + xpq_queue_unchecked_pte_update(mapte, 0); 9.874 + xpq_queue_pte_update(mapte, npte); 9.875 + } 9.876 + } else { 9.877 + /* must remove opte checked */ 9.878 + if (npte > pmap_mem_end) { 9.879 + /* must set npte unchecked */ 9.880 + xpq_queue_pte_update(mapte, 0); 9.881 + xpq_queue_unchecked_pte_update(mapte, npte); 9.882 + } else 9.883 + /* must set npte checked */ 9.884 + xpq_queue_pte_update(mapte, npte); 9.885 + } 9.886 + xpq_flush_queue(); 9.887 + 9.888 + return opte; 9.889 +} 9.890 + 9.891 +__inline static pt_entry_t 9.892 +pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte) 9.893 +{ 9.894 + pt_entry_t opte; 9.895 + 9.896 + opte = pte_atomic_update_ma(pte, mapte, npte); 9.897 + 9.898 + return pte_mtop(opte); 9.899 +} 9.900 + 9.901 +/* 9.902 + * Fixup the code segment to cover all potential executable mappings. 9.903 + * returns 0 if no changes to the code segment were made. 9.904 + */ 9.905 + 9.906 +int 9.907 +pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 9.908 +{ 9.909 + struct vm_map_entry *ent; 9.910 + struct pmap *pm = vm_map_pmap(map); 9.911 + vaddr_t va = 0; 9.912 + 9.913 + vm_map_lock_read(map); 9.914 + for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 9.915 + 9.916 + /* 9.917 + * This entry has greater va than the entries before. 9.918 + * We need to make it point to the last page, not past it. 
9.919 + */ 9.920 + 9.921 + if (ent->protection & VM_PROT_EXECUTE) 9.922 + va = trunc_page(ent->end) - PAGE_SIZE; 9.923 + } 9.924 + vm_map_unlock_read(map); 9.925 + if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 9.926 + return (0); 9.927 + 9.928 + pm->pm_hiexec = va; 9.929 + if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 9.930 + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 9.931 + } else { 9.932 + pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 9.933 + return (0); 9.934 + } 9.935 + return (1); 9.936 +} 9.937 + 9.938 +/* 9.939 + * p m a p k e n t e r f u n c t i o n s 9.940 + * 9.941 + * functions to quickly enter/remove pages from the kernel address 9.942 + * space. pmap_kremove is exported to MI kernel. we make use of 9.943 + * the recursive PTE mappings. 9.944 + */ 9.945 + 9.946 +/* 9.947 + * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 9.948 + * 9.949 + * => no need to lock anything, assume va is already allocated 9.950 + * => should be faster than normal pmap enter function 9.951 + */ 9.952 + 9.953 +void 9.954 +pmap_kenter_pa(va, pa, prot) 9.955 + vaddr_t va; 9.956 + paddr_t pa; 9.957 + vm_prot_t prot; 9.958 +{ 9.959 + pt_entry_t *pte, opte, npte; 9.960 + pt_entry_t *maptp; 9.961 + 9.962 + if (va < VM_MIN_KERNEL_ADDRESS) 9.963 + pte = vtopte(va); 9.964 + else 9.965 + pte = kvtopte(va); 9.966 + 9.967 + npte = ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | 9.968 + PG_V | pmap_pg_g; 9.969 + 9.970 + if (pa >= pmap_pa_start && pa < pmap_pa_end) { 9.971 + npte |= xpmap_ptom(pa); 9.972 + } else { 9.973 + XENPRINTF(("pmap_kenter: va %08lx outside pa range %08lx\n", 9.974 + va, pa)); 9.975 + npte |= pa; 9.976 + } 9.977 + 9.978 + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); 9.979 + opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */ 9.980 + XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va, 9.981 + (void *)pa, pte, opte, npte)); 9.982 +#ifdef LARGEPAGES 9.983 + /* XXX For now... */ 9.984 + if (opte & PG_PS) 9.985 + panic("pmap_kenter_pa: PG_PS"); 9.986 +#endif 9.987 + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 9.988 +#if defined(MULTIPROCESSOR) 9.989 + int32_t cpumask = 0; 9.990 + 9.991 + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); 9.992 + pmap_tlb_shootnow(cpumask); 9.993 +#else 9.994 + /* Don't bother deferring in the single CPU case. */ 9.995 + pmap_update_pg(va); 9.996 +#endif 9.997 + } 9.998 +} 9.999 + 9.1000 +/* 9.1001 + * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking 9.1002 + * 9.1003 + * => no need to lock anything, assume va is already allocated 9.1004 + * => should be faster than normal pmap enter function 9.1005 + */ 9.1006 + 9.1007 +void pmap_kenter_ma __P((vaddr_t, paddr_t, vm_prot_t)); 9.1008 + 9.1009 +void 9.1010 +pmap_kenter_ma(va, ma, prot) 9.1011 + vaddr_t va; 9.1012 + paddr_t ma; 9.1013 + vm_prot_t prot; 9.1014 +{ 9.1015 + pt_entry_t *pte, opte, npte; 9.1016 + pt_entry_t *maptp; 9.1017 + 9.1018 + KASSERT (va >= VM_MIN_KERNEL_ADDRESS); 9.1019 + pte = kvtopte(va); 9.1020 + 9.1021 + npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | 9.1022 + PG_V | pmap_pg_g; 9.1023 + 9.1024 + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); 9.1025 + opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */ 9.1026 + XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va, 9.1027 + (void *)ma, pte, opte)); 9.1028 +#ifdef LARGEPAGES 9.1029 + /* XXX For now... 
*/ 9.1030 + if (opte & PG_PS) 9.1031 + panic("pmap_kenter_ma: PG_PS"); 9.1032 +#endif 9.1033 + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 9.1034 +#if defined(MULTIPROCESSOR) 9.1035 + int32_t cpumask = 0; 9.1036 + 9.1037 + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); 9.1038 + pmap_tlb_shootnow(cpumask); 9.1039 +#else 9.1040 + /* Don't bother deferring in the single CPU case. */ 9.1041 + pmap_update_pg(va); 9.1042 +#endif 9.1043 + } 9.1044 +} 9.1045 + 9.1046 +/* 9.1047 + * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 9.1048 + * 9.1049 + * => no need to lock anything 9.1050 + * => caller must dispose of any vm_page mapped in the va range 9.1051 + * => note: not an inline function 9.1052 + * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 9.1053 + * => we assume kernel only unmaps valid addresses and thus don't bother 9.1054 + * checking the valid bit before doing TLB flushing 9.1055 + */ 9.1056 + 9.1057 +void 9.1058 +pmap_kremove(va, len) 9.1059 + vaddr_t va; 9.1060 + vsize_t len; 9.1061 +{ 9.1062 + pt_entry_t *pte, opte; 9.1063 + pt_entry_t *maptp; 9.1064 + int32_t cpumask = 0; 9.1065 + 9.1066 + XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len)); 9.1067 + len >>= PAGE_SHIFT; 9.1068 + for ( /* null */ ; len ; len--, va += PAGE_SIZE) { 9.1069 + if (va < VM_MIN_KERNEL_ADDRESS) 9.1070 + pte = vtopte(va); 9.1071 + else 9.1072 + pte = kvtopte(va); 9.1073 + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); 9.1074 + opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */ 9.1075 + XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte)); 9.1076 +#ifdef LARGEPAGES 9.1077 + /* XXX For now... */ 9.1078 + if (opte & PG_PS) 9.1079 + panic("pmap_kremove: PG_PS"); 9.1080 +#endif 9.1081 +#ifdef DIAGNOSTIC 9.1082 + if (opte & PG_PVLIST) 9.1083 + panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 9.1084 + va); 9.1085 +#endif 9.1086 + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) 9.1087 + pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask); 9.1088 + } 9.1089 + pmap_tlb_shootnow(cpumask); 9.1090 +} 9.1091 + 9.1092 +/* 9.1093 + * p m a p i n i t f u n c t i o n s 9.1094 + * 9.1095 + * pmap_bootstrap and pmap_init are called during system startup 9.1096 + * to init the pmap module. pmap_bootstrap() does a low level 9.1097 + * init just to get things rolling. pmap_init() finishes the job. 9.1098 + */ 9.1099 + 9.1100 +/* 9.1101 + * pmap_bootstrap: get the system in a state where it can run with VM 9.1102 + * properly enabled (called before main()). the VM system is 9.1103 + * fully init'd later... 9.1104 + * 9.1105 + * => on i386, locore.s has already enabled the MMU by allocating 9.1106 + * a PDP for the kernel, and nkpde PTP's for the kernel. 9.1107 + * => kva_start is the first free virtual address in kernel space 9.1108 + */ 9.1109 + 9.1110 +void 9.1111 +pmap_bootstrap(kva_start) 9.1112 + vaddr_t kva_start; 9.1113 +{ 9.1114 + struct pmap *kpm; 9.1115 + vaddr_t kva; 9.1116 + pt_entry_t *pte; 9.1117 + pt_entry_t *maptp; 9.1118 + int i; 9.1119 + 9.1120 + /* 9.1121 + * set up our local static global vars that keep track of the 9.1122 + * usage of KVM before kernel_map is set up 9.1123 + */ 9.1124 + 9.1125 + virtual_avail = kva_start; /* first free KVA */ 9.1126 + virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 9.1127 + 9.1128 + /* 9.1129 + * find out where physical memory ends on the real hardware. 
9.1130 + */ 9.1131 + 9.1132 + if (xen_start_info.flags & SIF_PRIVILEGED) 9.1133 + pmap_mem_end = find_pmap_mem_end(kva_start); 9.1134 + 9.1135 + /* 9.1136 + * set up protection_codes: we need to be able to convert from 9.1137 + * a MI protection code (some combo of VM_PROT...) to something 9.1138 + * we can jam into a i386 PTE. 9.1139 + */ 9.1140 + 9.1141 + protection_codes[VM_PROT_NONE] = 0; /* --- */ 9.1142 + protection_codes[VM_PROT_EXECUTE] = PG_X; /* --x */ 9.1143 + protection_codes[VM_PROT_READ] = PG_RO; /* -r- */ 9.1144 + protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */ 9.1145 + protection_codes[VM_PROT_WRITE] = PG_RW; /* w-- */ 9.1146 + protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */ 9.1147 + protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW; /* wr- */ 9.1148 + protection_codes[VM_PROT_ALL] = PG_RW|PG_X; /* wrx */ 9.1149 + 9.1150 + /* 9.1151 + * now we init the kernel's pmap 9.1152 + * 9.1153 + * the kernel pmap's pm_obj is not used for much. however, in 9.1154 + * user pmaps the pm_obj contains the list of active PTPs. 9.1155 + * the pm_obj currently does not have a pager. it might be possible 9.1156 + * to add a pager that would allow a process to read-only mmap its 9.1157 + * own page tables (fast user level vtophys?). this may or may not 9.1158 + * be useful. 9.1159 + */ 9.1160 + 9.1161 + kpm = pmap_kernel(); 9.1162 + simple_lock_init(&kpm->pm_obj.vmobjlock); 9.1163 + kpm->pm_obj.pgops = NULL; 9.1164 + TAILQ_INIT(&kpm->pm_obj.memq); 9.1165 + kpm->pm_obj.uo_npages = 0; 9.1166 + kpm->pm_obj.uo_refs = 1; 9.1167 + memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 9.1168 + kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE); 9.1169 + XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n", 9.1170 + (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr)); 9.1171 + kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3; 9.1172 + kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 9.1173 + x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 9.1174 + 9.1175 + /* 9.1176 + * the above is just a rough estimate and not critical to the proper 9.1177 + * operation of the system. 9.1178 + */ 9.1179 + 9.1180 + /* 9.1181 + * Begin to enable global TLB entries if they are supported. 9.1182 + * The G bit has no effect until the CR4_PGE bit is set in CR4, 9.1183 + * which happens in cpu_init(), which is run on each cpu 9.1184 + * (and happens later) 9.1185 + */ 9.1186 + 9.1187 + if (cpu_feature & CPUID_PGE) { 9.1188 + pmap_pg_g = PG_G; /* enable software */ 9.1189 + 9.1190 + /* add PG_G attribute to already mapped kernel pages */ 9.1191 + for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; 9.1192 + kva += PAGE_SIZE) 9.1193 + if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) { 9.1194 +#if !defined(XEN) 9.1195 + PTE_BASE[x86_btop(kva)] |= PG_G; 9.1196 +#else 9.1197 + maptp = (pt_entry_t *)vtomach( 9.1198 + (vaddr_t)&PTE_BASE[x86_btop(kva)]); 9.1199 + PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp, 9.1200 + PG_G); 9.1201 + } 9.1202 + PTE_UPDATES_FLUSH(); 9.1203 +#endif 9.1204 + } 9.1205 + 9.1206 +#ifdef LARGEPAGES 9.1207 + /* 9.1208 + * enable large pages if they are supported. 
9.1209 + */ 9.1210 + 9.1211 + if (cpu_feature & CPUID_PSE) { 9.1212 + paddr_t pa; 9.1213 + vaddr_t kva_end; 9.1214 + pd_entry_t *pde; 9.1215 + pd_entry_t *mapdp; 9.1216 + extern char _etext; 9.1217 + 9.1218 + lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 9.1219 + pmap_largepages = 1; /* enable software */ 9.1220 + 9.1221 + /* 9.1222 + * the TLB must be flushed after enabling large pages 9.1223 + * on Pentium CPUs, according to section 3.6.2.2 of 9.1224 + * "Intel Architecture Software Developer's Manual, 9.1225 + * Volume 3: System Programming". 9.1226 + */ 9.1227 + tlbflush(); 9.1228 + 9.1229 + /* 9.1230 + * now, remap the kernel text using large pages. we 9.1231 + * assume that the linker has properly aligned the 9.1232 + * .data segment to a 4MB boundary. 9.1233 + */ 9.1234 + kva_end = roundup((vaddr_t)&_etext, NBPD); 9.1235 + for (pa = 0, kva = KERNBASE; kva < kva_end; 9.1236 + kva += NBPD, pa += NBPD) { 9.1237 + pde = &kpm->pm_pdir[pdei(kva)]; 9.1238 + mapdp = (pt_entry_t *)vtomach((vaddr_t)pde); 9.1239 + PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS | 9.1240 + PG_KR | PG_V); /* zap! */ 9.1241 + tlbflush(); 9.1242 + } 9.1243 + } 9.1244 +#endif /* LARGEPAGES */ 9.1245 + 9.1246 + /* 9.1247 + * now we allocate the "special" VAs which are used for tmp mappings 9.1248 + * by the pmap (and other modules). we allocate the VAs by advancing 9.1249 + * virtual_avail (note that there are no pages mapped at these VAs). 9.1250 + * we find the PTE that maps the allocated VA via the linear PTE 9.1251 + * mapping. 9.1252 + */ 9.1253 + 9.1254 + pte = PTE_BASE + x86_btop(virtual_avail); 9.1255 + 9.1256 +#ifdef MULTIPROCESSOR 9.1257 + /* 9.1258 + * Waste some VA space to avoid false sharing of cache lines 9.1259 + * for page table pages: Give each possible CPU a cache line 9.1260 + * of PTE's (8) to play with, though we only need 4. We could 9.1261 + * recycle some of this waste by putting the idle stacks here 9.1262 + * as well; we could waste less space if we knew the largest 9.1263 + * CPU ID beforehand. 9.1264 + */ 9.1265 + csrcp = (caddr_t) virtual_avail; csrc_pte = pte; 9.1266 + 9.1267 + cdstp = (caddr_t) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 9.1268 + 9.1269 + zerop = (caddr_t) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 9.1270 + 9.1271 + ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 9.1272 + 9.1273 + virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL; 9.1274 + pte += X86_MAXPROCS * NPTECL; 9.1275 +#else 9.1276 + csrcp = (caddr_t) virtual_avail; csrc_pte = pte; /* allocate */ 9.1277 + virtual_avail += PAGE_SIZE; pte++; /* advance */ 9.1278 + 9.1279 + cdstp = (caddr_t) virtual_avail; cdst_pte = pte; 9.1280 + virtual_avail += PAGE_SIZE; pte++; 9.1281 + 9.1282 + zerop = (caddr_t) virtual_avail; zero_pte = pte; 9.1283 + virtual_avail += PAGE_SIZE; pte++; 9.1284 + 9.1285 + ptpp = (caddr_t) virtual_avail; ptp_pte = pte; 9.1286 + virtual_avail += PAGE_SIZE; pte++; 9.1287 +#endif 9.1288 + 9.1289 + XENPRINTK(("pmap_bootstrap csrcp %p cdstp %p zerop %p ptpp %p\n", 9.1290 + csrc_pte, cdst_pte, zero_pte, ptp_pte)); 9.1291 + /* 9.1292 + * Nothing after this point actually needs pte; 9.1293 + */ 9.1294 + pte = (void *)0xdeadbeef; 9.1295 + 9.1296 + /* XXX: vmmap used by mem.c... 
should be uvm_map_reserve */ 9.1297 + vmmap = (char *)virtual_avail; /* don't need pte */ 9.1298 + virtual_avail += PAGE_SIZE; 9.1299 + 9.1300 + msgbuf_vaddr = virtual_avail; /* don't need pte */ 9.1301 + virtual_avail += round_page(MSGBUFSIZE); 9.1302 + 9.1303 + idt_vaddr = virtual_avail; /* don't need pte */ 9.1304 + virtual_avail += PAGE_SIZE; 9.1305 + idt_paddr = avail_start; /* steal a page */ 9.1306 + avail_start += PAGE_SIZE; 9.1307 + 9.1308 +#if defined(I586_CPU) 9.1309 + /* pentium f00f bug stuff */ 9.1310 + pentium_idt_vaddr = virtual_avail; /* don't need pte */ 9.1311 + virtual_avail += PAGE_SIZE; 9.1312 +#endif 9.1313 + 9.1314 + /* 9.1315 + * now we reserve some VM for mapping pages when doing a crash dump 9.1316 + */ 9.1317 + 9.1318 + virtual_avail = reserve_dumppages(virtual_avail); 9.1319 + 9.1320 + /* 9.1321 + * init the static-global locks and global lists. 9.1322 + */ 9.1323 + 9.1324 +#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG) 9.1325 + spinlockinit(&pmap_main_lock, "pmaplk", 0); 9.1326 +#endif 9.1327 + simple_lock_init(&pvalloc_lock); 9.1328 + simple_lock_init(&pmaps_lock); 9.1329 + LIST_INIT(&pmaps); 9.1330 + TAILQ_INIT(&pv_freepages); 9.1331 + TAILQ_INIT(&pv_unusedpgs); 9.1332 + 9.1333 + /* 9.1334 + * initialize the pmap pool. 9.1335 + */ 9.1336 + 9.1337 + pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl", 9.1338 + &pool_allocator_nointr); 9.1339 + 9.1340 + /* 9.1341 + * Initialize the TLB shootdown queues. 9.1342 + */ 9.1343 + 9.1344 + __cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock); 9.1345 + 9.1346 + for (i = 0; i < X86_MAXPROCS; i++) { 9.1347 + TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head); 9.1348 + __cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock); 9.1349 + } 9.1350 + 9.1351 + /* 9.1352 + * initialize the PDE pool and cache. 9.1353 + */ 9.1354 + pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl", 9.1355 + &pool_allocator_nointr); 9.1356 + pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool, 9.1357 + pmap_pdp_ctor, pmap_pdp_dtor, NULL); 9.1358 + 9.1359 + /* 9.1360 + * ensure the TLB is sync'd with reality by flushing it... 9.1361 + */ 9.1362 + 9.1363 + tlbflush(); 9.1364 +} 9.1365 + 9.1366 +/* 9.1367 + * pmap_init: called from uvm_init, our job is to get the pmap 9.1368 + * system ready to manage mappings... this mainly means initing 9.1369 + * the pv_entry stuff. 9.1370 + */ 9.1371 + 9.1372 +void 9.1373 +pmap_init() 9.1374 +{ 9.1375 + int i; 9.1376 + 9.1377 + /* 9.1378 + * now we need to free enough pv_entry structures to allow us to get 9.1379 + * the kmem_map/kmem_object allocated and inited (done after this 9.1380 + * function is finished). to do this we allocate one bootstrap page out 9.1381 + * of kernel_map and use it to provide an initial pool of pv_entry 9.1382 + * structures. we never free this page. 
9.1383 + */ 9.1384 + 9.1385 + pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE); 9.1386 + if (pv_initpage == NULL) 9.1387 + panic("pmap_init: pv_initpage"); 9.1388 + pv_cachedva = 0; /* a VA we have allocated but not used yet */ 9.1389 + pv_nfpvents = 0; 9.1390 + (void) pmap_add_pvpage(pv_initpage, FALSE); 9.1391 + 9.1392 + pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE); 9.1393 + if (pj_page == NULL) 9.1394 + panic("pmap_init: pj_page"); 9.1395 + 9.1396 + for (i = 0; 9.1397 + i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1); 9.1398 + i++) 9.1399 + pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job; 9.1400 + pj_page[i].pja_job.pj_nextfree = NULL; 9.1401 + pj_free = &pj_page[0]; 9.1402 + 9.1403 + /* 9.1404 + * done: pmap module is up (and ready for business) 9.1405 + */ 9.1406 + 9.1407 + pmap_initialized = TRUE; 9.1408 +} 9.1409 + 9.1410 +/* 9.1411 + * p v _ e n t r y f u n c t i o n s 9.1412 + */ 9.1413 + 9.1414 +/* 9.1415 + * pv_entry allocation functions: 9.1416 + * the main pv_entry allocation functions are: 9.1417 + * pmap_alloc_pv: allocate a pv_entry structure 9.1418 + * pmap_free_pv: free one pv_entry 9.1419 + * pmap_free_pvs: free a list of pv_entrys 9.1420 + * 9.1421 + * the rest are helper functions 9.1422 + */ 9.1423 + 9.1424 +/* 9.1425 + * pmap_alloc_pv: inline function to allocate a pv_entry structure 9.1426 + * => we lock pvalloc_lock 9.1427 + * => if we fail, we call out to pmap_alloc_pvpage 9.1428 + * => 3 modes: 9.1429 + * ALLOCPV_NEED = we really need a pv_entry, even if we have to steal it 9.1430 + * ALLOCPV_TRY = we want a pv_entry, but not enough to steal 9.1431 + * ALLOCPV_NONEED = we are trying to grow our free list, don't really need 9.1432 + * one now 9.1433 + * 9.1434 + * "try" is for optional functions like pmap_copy(). 9.1435 + */ 9.1436 + 9.1437 +__inline static struct pv_entry * 9.1438 +pmap_alloc_pv(pmap, mode) 9.1439 + struct pmap *pmap; 9.1440 + int mode; 9.1441 +{ 9.1442 + struct pv_page *pvpage; 9.1443 + struct pv_entry *pv; 9.1444 + 9.1445 + simple_lock(&pvalloc_lock); 9.1446 + 9.1447 + pvpage = TAILQ_FIRST(&pv_freepages); 9.1448 + if (pvpage != NULL) { 9.1449 + pvpage->pvinfo.pvpi_nfree--; 9.1450 + if (pvpage->pvinfo.pvpi_nfree == 0) { 9.1451 + /* nothing left in this one? */ 9.1452 + TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list); 9.1453 + } 9.1454 + pv = pvpage->pvinfo.pvpi_pvfree; 9.1455 + KASSERT(pv); 9.1456 + pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node); 9.1457 + pv_nfpvents--; /* took one from pool */ 9.1458 + } else { 9.1459 + pv = NULL; /* need more of them */ 9.1460 + } 9.1461 + 9.1462 + /* 9.1463 + * if below low water mark or we didn't get a pv_entry we try and 9.1464 + * create more pv_entrys ... 9.1465 + */ 9.1466 + 9.1467 + if (pv_nfpvents < PVE_LOWAT || pv == NULL) { 9.1468 + if (pv == NULL) 9.1469 + pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ? 9.1470 + mode : ALLOCPV_NEED); 9.1471 + else 9.1472 + (void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED); 9.1473 + } 9.1474 + simple_unlock(&pvalloc_lock); 9.1475 + return(pv); 9.1476 +} 9.1477 + 9.1478 +/* 9.1479 + * pmap_alloc_pvpage: maybe allocate a new pvpage 9.1480 + * 9.1481 + * if need_entry is false: try and allocate a new pv_page 9.1482 + * if need_entry is true: try and allocate a new pv_page and return a 9.1483 + * new pv_entry from it. if we are unable to allocate a pv_page 9.1484 + * we make a last ditch effort to steal a pv_page from some other 9.1485 + * mapping. if that fails, we panic... 
9.1486 + * 9.1487 + * => we assume that the caller holds pvalloc_lock 9.1488 + */ 9.1489 + 9.1490 +static struct pv_entry * 9.1491 +pmap_alloc_pvpage(pmap, mode) 9.1492 + struct pmap *pmap; 9.1493 + int mode; 9.1494 +{ 9.1495 + struct vm_page *pg; 9.1496 + struct pv_page *pvpage; 9.1497 + struct pv_entry *pv; 9.1498 + int s; 9.1499 + 9.1500 + /* 9.1501 + * if we need_entry and we've got unused pv_pages, allocate from there 9.1502 + */ 9.1503 + 9.1504 + pvpage = TAILQ_FIRST(&pv_unusedpgs); 9.1505 + if (mode != ALLOCPV_NONEED && pvpage != NULL) { 9.1506 + 9.1507 + /* move it to pv_freepages list */ 9.1508 + TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list); 9.1509 + TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list); 9.1510 + 9.1511 + /* allocate a pv_entry */ 9.1512 + pvpage->pvinfo.pvpi_nfree--; /* can't go to zero */ 9.1513 + pv = pvpage->pvinfo.pvpi_pvfree; 9.1514 + KASSERT(pv); 9.1515 + pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node); 9.1516 + pv_nfpvents--; /* took one from pool */ 9.1517 + return(pv); 9.1518 + } 9.1519 + 9.1520 + /* 9.1521 + * see if we've got a cached unmapped VA that we can map a page in. 9.1522 + * if not, try to allocate one. 9.1523 + */ 9.1524 + 9.1525 + if (pv_cachedva == 0) { 9.1526 + s = splvm(); /* must protect kmem_map with splvm! */ 9.1527 + pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE, 9.1528 + UVM_KMF_TRYLOCK|UVM_KMF_VALLOC); 9.1529 + splx(s); 9.1530 + if (pv_cachedva == 0) { 9.1531 + return (NULL); 9.1532 + } 9.1533 + } 9.1534 + 9.1535 + pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL, 9.1536 + UVM_PGA_USERESERVE); 9.1537 + if (pg == NULL) 9.1538 + return (NULL); 9.1539 + pg->flags &= ~PG_BUSY; /* never busy */ 9.1540 + 9.1541 + /* 9.1542 + * add a mapping for our new pv_page and free its entrys (save one!) 9.1543 + * 9.1544 + * NOTE: If we are allocating a PV page for the kernel pmap, the 9.1545 + * pmap is already locked! (...but entering the mapping is safe...) 9.1546 + */ 9.1547 + 9.1548 + pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg), 9.1549 + VM_PROT_READ | VM_PROT_WRITE); 9.1550 + pmap_update(pmap_kernel()); 9.1551 + pvpage = (struct pv_page *) pv_cachedva; 9.1552 + pv_cachedva = 0; 9.1553 + return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED)); 9.1554 +} 9.1555 + 9.1556 +/* 9.1557 + * pmap_add_pvpage: add a pv_page's pv_entrys to the free list 9.1558 + * 9.1559 + * => caller must hold pvalloc_lock 9.1560 + * => if need_entry is true, we allocate and return one pv_entry 9.1561 + */ 9.1562 + 9.1563 +static struct pv_entry * 9.1564 +pmap_add_pvpage(pvp, need_entry) 9.1565 + struct pv_page *pvp; 9.1566 + boolean_t need_entry; 9.1567 +{ 9.1568 + int tofree, lcv; 9.1569 + 9.1570 + /* do we need to return one? */ 9.1571 + tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE; 9.1572 + 9.1573 + pvp->pvinfo.pvpi_pvfree = NULL; 9.1574 + pvp->pvinfo.pvpi_nfree = tofree; 9.1575 + for (lcv = 0 ; lcv < tofree ; lcv++) { 9.1576 + SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) = 9.1577 + pvp->pvinfo.pvpi_pvfree; 9.1578 + pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv]; 9.1579 + } 9.1580 + if (need_entry) 9.1581 + TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list); 9.1582 + else 9.1583 + TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list); 9.1584 + pv_nfpvents += tofree; 9.1585 + return((need_entry) ? &pvp->pvents[lcv] : NULL); 9.1586 +} 9.1587 + 9.1588 +/* 9.1589 + * pmap_free_pv_doit: actually free a pv_entry 9.1590 + * 9.1591 + * => do not call this directly! instead use either 9.1592 + * 1. 
pmap_free_pv ==> free a single pv_entry 9.1593 + * 2. pmap_free_pvs => free a list of pv_entrys 9.1594 + * => we must be holding pvalloc_lock 9.1595 + */ 9.1596 + 9.1597 +__inline static void 9.1598 +pmap_free_pv_doit(pv) 9.1599 + struct pv_entry *pv; 9.1600 +{ 9.1601 + struct pv_page *pvp; 9.1602 + 9.1603 + pvp = (struct pv_page *) x86_trunc_page(pv); 9.1604 + pv_nfpvents++; 9.1605 + pvp->pvinfo.pvpi_nfree++; 9.1606 + 9.1607 + /* nfree == 1 => fully allocated page just became partly allocated */ 9.1608 + if (pvp->pvinfo.pvpi_nfree == 1) { 9.1609 + TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list); 9.1610 + } 9.1611 + 9.1612 + /* free it */ 9.1613 + SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree; 9.1614 + pvp->pvinfo.pvpi_pvfree = pv; 9.1615 + 9.1616 + /* 9.1617 + * are all pv_page's pv_entry's free? move it to unused queue. 9.1618 + */ 9.1619 + 9.1620 + if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) { 9.1621 + TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list); 9.1622 + TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list); 9.1623 + } 9.1624 +} 9.1625 + 9.1626 +/* 9.1627 + * pmap_free_pv: free a single pv_entry 9.1628 + * 9.1629 + * => we gain the pvalloc_lock 9.1630 + */ 9.1631 + 9.1632 +__inline static void 9.1633 +pmap_free_pv(pmap, pv) 9.1634 + struct pmap *pmap; 9.1635 + struct pv_entry *pv; 9.1636 +{ 9.1637 + simple_lock(&pvalloc_lock); 9.1638 + pmap_free_pv_doit(pv); 9.1639 + 9.1640 + /* 9.1641 + * Can't free the PV page if the PV entries were associated with 9.1642 + * the kernel pmap; the pmap is already locked. 9.1643 + */ 9.1644 + if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL && 9.1645 + pmap != pmap_kernel()) 9.1646 + pmap_free_pvpage(); 9.1647 + 9.1648 + simple_unlock(&pvalloc_lock); 9.1649 +} 9.1650 + 9.1651 +/* 9.1652 + * pmap_free_pvs: free a list of pv_entrys 9.1653 + * 9.1654 + * => we gain the pvalloc_lock 9.1655 + */ 9.1656 + 9.1657 +__inline static void 9.1658 +pmap_free_pvs(pmap, pvs) 9.1659 + struct pmap *pmap; 9.1660 + struct pv_entry *pvs; 9.1661 +{ 9.1662 + struct pv_entry *nextpv; 9.1663 + 9.1664 + simple_lock(&pvalloc_lock); 9.1665 + 9.1666 + for ( /* null */ ; pvs != NULL ; pvs = nextpv) { 9.1667 + nextpv = SPLAY_RIGHT(pvs, pv_node); 9.1668 + pmap_free_pv_doit(pvs); 9.1669 + } 9.1670 + 9.1671 + /* 9.1672 + * Can't free the PV page if the PV entries were associated with 9.1673 + * the kernel pmap; the pmap is already locked. 9.1674 + */ 9.1675 + if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL && 9.1676 + pmap != pmap_kernel()) 9.1677 + pmap_free_pvpage(); 9.1678 + 9.1679 + simple_unlock(&pvalloc_lock); 9.1680 +} 9.1681 + 9.1682 + 9.1683 +/* 9.1684 + * pmap_free_pvpage: try and free an unused pv_page structure 9.1685 + * 9.1686 + * => assume caller is holding the pvalloc_lock and that 9.1687 + * there is a page on the pv_unusedpgs list 9.1688 + * => if we can't get a lock on the kmem_map we try again later 9.1689 + */ 9.1690 + 9.1691 +static void 9.1692 +pmap_free_pvpage() 9.1693 +{ 9.1694 + int s; 9.1695 + struct vm_map *map; 9.1696 + struct vm_map_entry *dead_entries; 9.1697 + struct pv_page *pvp; 9.1698 + 9.1699 + s = splvm(); /* protect kmem_map */ 9.1700 + 9.1701 + pvp = TAILQ_FIRST(&pv_unusedpgs); 9.1702 + 9.1703 + /* 9.1704 + * note: watch out for pv_initpage which is allocated out of 9.1705 + * kernel_map rather than kmem_map. 
9.1706 + */ 9.1707 + 9.1708 + if (pvp == pv_initpage) 9.1709 + map = kernel_map; 9.1710 + else 9.1711 + map = kmem_map; 9.1712 + if (vm_map_lock_try(map)) { 9.1713 + 9.1714 + /* remove pvp from pv_unusedpgs */ 9.1715 + TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list); 9.1716 + 9.1717 + /* unmap the page */ 9.1718 + dead_entries = NULL; 9.1719 + uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE, 9.1720 + &dead_entries); 9.1721 + vm_map_unlock(map); 9.1722 + 9.1723 + if (dead_entries != NULL) 9.1724 + uvm_unmap_detach(dead_entries, 0); 9.1725 + 9.1726 + pv_nfpvents -= PVE_PER_PVPAGE; /* update free count */ 9.1727 + } 9.1728 + if (pvp == pv_initpage) 9.1729 + /* no more initpage, we've freed it */ 9.1730 + pv_initpage = NULL; 9.1731 + 9.1732 + splx(s); 9.1733 +} 9.1734 + 9.1735 +/* 9.1736 + * pmap_lock_pvhs: Lock pvh1 and optional pvh2 9.1737 + * Observe locking order when locking both pvhs 9.1738 + */ 9.1739 + 9.1740 +__inline static void 9.1741 +pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2) 9.1742 +{ 9.1743 + 9.1744 + if (pvh2 == NULL) { 9.1745 + simple_lock(&pvh1->pvh_lock); 9.1746 + return; 9.1747 + } 9.1748 + 9.1749 + if (pvh1 < pvh2) { 9.1750 + simple_lock(&pvh1->pvh_lock); 9.1751 + simple_lock(&pvh2->pvh_lock); 9.1752 + } else { 9.1753 + simple_lock(&pvh2->pvh_lock); 9.1754 + simple_lock(&pvh1->pvh_lock); 9.1755 + } 9.1756 +} 9.1757 + 9.1758 + 9.1759 +/* 9.1760 + * main pv_entry manipulation functions: 9.1761 + * pmap_enter_pv: enter a mapping onto a pv_head list 9.1762 + * pmap_remove_pv: remove a mappiing from a pv_head list 9.1763 + * 9.1764 + * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 9.1765 + * the pvh before calling 9.1766 + */ 9.1767 + 9.1768 +/* 9.1769 + * pmap_enter_pv: enter a mapping onto a pv_head lst 9.1770 + * 9.1771 + * => caller should hold the proper lock on pmap_main_lock 9.1772 + * => caller should have pmap locked 9.1773 + * => caller should have the pv_head locked 9.1774 + * => caller should adjust ptp's wire_count before calling 9.1775 + */ 9.1776 + 9.1777 +__inline static void 9.1778 +pmap_enter_pv(pvh, pve, pmap, va, ptp) 9.1779 + struct pv_head *pvh; 9.1780 + struct pv_entry *pve; /* preallocated pve for us to use */ 9.1781 + struct pmap *pmap; 9.1782 + vaddr_t va; 9.1783 + struct vm_page *ptp; /* PTP in pmap that maps this VA */ 9.1784 +{ 9.1785 + pve->pv_pmap = pmap; 9.1786 + pve->pv_va = va; 9.1787 + pve->pv_ptp = ptp; /* NULL for kernel pmap */ 9.1788 + SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */ 9.1789 +} 9.1790 + 9.1791 +/* 9.1792 + * pmap_remove_pv: try to remove a mapping from a pv_list 9.1793 + * 9.1794 + * => caller should hold proper lock on pmap_main_lock 9.1795 + * => pmap should be locked 9.1796 + * => caller should hold lock on pv_head [so that attrs can be adjusted] 9.1797 + * => caller should adjust ptp's wire_count and free PTP if needed 9.1798 + * => we return the removed pve 9.1799 + */ 9.1800 + 9.1801 +__inline static struct pv_entry * 9.1802 +pmap_remove_pv(pvh, pmap, va) 9.1803 + struct pv_head *pvh; 9.1804 + struct pmap *pmap; 9.1805 + vaddr_t va; 9.1806 +{ 9.1807 + struct pv_entry tmp, *pve; 9.1808 + 9.1809 + tmp.pv_pmap = pmap; 9.1810 + tmp.pv_va = va; 9.1811 + pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp); 9.1812 + if (pve == NULL) 9.1813 + return (NULL); 9.1814 + SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); 9.1815 + return(pve); /* return removed pve */ 9.1816 +} 9.1817 + 9.1818 +/* 9.1819 + * p t p f u n c t i o n s 9.1820 + */ 9.1821 + 9.1822 +/* 
9.1823 + * pmap_alloc_ptp: allocate a PTP for a PMAP 9.1824 + * 9.1825 + * => pmap should already be locked by caller 9.1826 + * => we use the ptp's wire_count to count the number of active mappings 9.1827 + * in the PTP (we start it at one to prevent any chance this PTP 9.1828 + * will ever leak onto the active/inactive queues) 9.1829 + */ 9.1830 + 9.1831 +__inline static struct vm_page * 9.1832 +pmap_alloc_ptp(pmap, pde_index) 9.1833 + struct pmap *pmap; 9.1834 + int pde_index; 9.1835 +{ 9.1836 + struct vm_page *ptp; 9.1837 + pd_entry_t *mapdp; 9.1838 + 9.1839 + ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL, 9.1840 + UVM_PGA_USERESERVE|UVM_PGA_ZERO); 9.1841 + if (ptp == NULL) 9.1842 + return(NULL); 9.1843 + 9.1844 + /* got one! */ 9.1845 + ptp->flags &= ~PG_BUSY; /* never busy */ 9.1846 + ptp->wire_count = 1; /* no mappings yet */ 9.1847 + mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]); 9.1848 + PDE_SET(&pmap->pm_pdir[pde_index], mapdp, 9.1849 + (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V)); 9.1850 + pmap->pm_stats.resident_count++; /* count PTP as resident */ 9.1851 + pmap->pm_ptphint = ptp; 9.1852 + return(ptp); 9.1853 +} 9.1854 + 9.1855 +/* 9.1856 + * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 9.1857 + * 9.1858 + * => pmap should NOT be pmap_kernel() 9.1859 + * => pmap should be locked 9.1860 + */ 9.1861 + 9.1862 +static struct vm_page * 9.1863 +pmap_get_ptp(pmap, pde_index) 9.1864 + struct pmap *pmap; 9.1865 + int pde_index; 9.1866 +{ 9.1867 + struct vm_page *ptp; 9.1868 + 9.1869 + if (pmap_valid_entry(pmap->pm_pdir[pde_index])) { 9.1870 + 9.1871 + /* valid... check hint (saves us a PA->PG lookup) */ 9.1872 + if (pmap->pm_ptphint && 9.1873 + (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) == 9.1874 + VM_PAGE_TO_PHYS(pmap->pm_ptphint)) 9.1875 + return(pmap->pm_ptphint); 9.1876 + 9.1877 + ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index)); 9.1878 +#ifdef DIAGNOSTIC 9.1879 + if (ptp == NULL) 9.1880 + panic("pmap_get_ptp: unmanaged user PTP"); 9.1881 +#endif 9.1882 + pmap->pm_ptphint = ptp; 9.1883 + return(ptp); 9.1884 + } 9.1885 + 9.1886 + /* allocate a new PTP (updates ptphint) */ 9.1887 + return(pmap_alloc_ptp(pmap, pde_index)); 9.1888 +} 9.1889 + 9.1890 +/* 9.1891 + * p m a p l i f e c y c l e f u n c t i o n s 9.1892 + */ 9.1893 + 9.1894 +/* 9.1895 + * pmap_pdp_ctor: constructor for the PDP cache. 9.1896 + */ 9.1897 + 9.1898 +int 9.1899 +pmap_pdp_ctor(void *arg, void *object, int flags) 9.1900 +{ 9.1901 + pd_entry_t *pdir = object; 9.1902 + paddr_t pdirpa; 9.1903 + 9.1904 + /* 9.1905 + * NOTE: The `pmap_lock' is held when the PDP is allocated. 9.1906 + * WE MUST NOT BLOCK! 9.1907 + */ 9.1908 + 9.1909 + /* fetch the physical address of the page directory. 
*/ 9.1910 + (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 9.1911 + 9.1912 + XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa)); 9.1913 + 9.1914 + /* zero init area */ 9.1915 + memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t)); 9.1916 + 9.1917 + /* put in recursive PDE to map the PTEs */ 9.1918 + pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */); 9.1919 + 9.1920 + /* put in kernel VM PDEs */ 9.1921 + memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN], 9.1922 + nkpde * sizeof(pd_entry_t)); 9.1923 + 9.1924 + /* zero the rest */ 9.1925 + memset(&pdir[PDSLOT_KERN + nkpde], 0, 9.1926 + PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t))); 9.1927 + 9.1928 + pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ, 9.1929 + VM_PROT_READ); 9.1930 + pmap_update(pmap_kernel()); 9.1931 + 9.1932 + /* pin page type */ 9.1933 + xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE); 9.1934 + xpq_flush_queue(); 9.1935 + 9.1936 + return (0); 9.1937 +} 9.1938 + 9.1939 +void 9.1940 +pmap_pdp_dtor(void *arg, void *object) 9.1941 +{ 9.1942 + pd_entry_t *pdir = object; 9.1943 + paddr_t pdirpa; 9.1944 + 9.1945 + /* fetch the physical address of the page directory. */ 9.1946 + pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME; 9.1947 + 9.1948 + XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa)); 9.1949 + 9.1950 + /* unpin page type */ 9.1951 + xpq_queue_unpin_table(xpmap_ptom(pdirpa)); 9.1952 + xpq_flush_queue(); 9.1953 +} 9.1954 + 9.1955 +/* 9.1956 + * pmap_create: create a pmap 9.1957 + * 9.1958 + * => note: old pmap interface took a "size" args which allowed for 9.1959 + * the creation of "software only" pmaps (not in bsd). 9.1960 + */ 9.1961 + 9.1962 +struct pmap * 9.1963 +pmap_create() 9.1964 +{ 9.1965 + struct pmap *pmap; 9.1966 + u_int gen; 9.1967 + 9.1968 + XENPRINTF(("pmap_create\n")); 9.1969 + pmap = pool_get(&pmap_pmap_pool, PR_WAITOK); 9.1970 + 9.1971 + /* init uvm_object */ 9.1972 + simple_lock_init(&pmap->pm_obj.vmobjlock); 9.1973 + pmap->pm_obj.pgops = NULL; /* currently not a mappable object */ 9.1974 + TAILQ_INIT(&pmap->pm_obj.memq); 9.1975 + pmap->pm_obj.uo_npages = 0; 9.1976 + pmap->pm_obj.uo_refs = 1; 9.1977 + pmap->pm_stats.wired_count = 0; 9.1978 + pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 9.1979 + pmap->pm_ptphint = NULL; 9.1980 + pmap->pm_hiexec = 0; 9.1981 + pmap->pm_flags = 0; 9.1982 + pmap->pm_cpus = 0; 9.1983 + 9.1984 + /* init the LDT */ 9.1985 + pmap->pm_ldt = NULL; 9.1986 + pmap->pm_ldt_len = 0; 9.1987 + pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL); 9.1988 + 9.1989 + /* allocate PDP */ 9.1990 + 9.1991 + /* 9.1992 + * we need to lock pmaps_lock to prevent nkpde from changing on 9.1993 + * us. note that there is no need to splvm to protect us from 9.1994 + * malloc since malloc allocates out of a submap and we should 9.1995 + * have already allocated kernel PTPs to cover the range... 9.1996 + * 9.1997 + * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor 9.1998 + * must we call pmap_growkernel() while holding it! 
9.1999 + */ 9.2000 + 9.2001 + try_again: 9.2002 + gen = pmap_pdp_cache_generation; 9.2003 + pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 9.2004 + 9.2005 + simple_lock(&pmaps_lock); 9.2006 + 9.2007 + if (gen != pmap_pdp_cache_generation) { 9.2008 + simple_unlock(&pmaps_lock); 9.2009 + pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 9.2010 + goto try_again; 9.2011 + } 9.2012 + 9.2013 + pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME; 9.2014 + XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap, 9.2015 + (void *)pmap->pm_pdirpa, 9.2016 + (void *)xpmap_ptom(pmap->pm_pdirpa), 9.2017 + (void *)pmap->pm_pdir[PDSLOT_PTE])); 9.2018 + 9.2019 + LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 9.2020 + 9.2021 + simple_unlock(&pmaps_lock); 9.2022 + 9.2023 + return (pmap); 9.2024 +} 9.2025 + 9.2026 +/* 9.2027 + * pmap_destroy: drop reference count on pmap. free pmap if 9.2028 + * reference count goes to zero. 9.2029 + */ 9.2030 + 9.2031 +void 9.2032 +pmap_destroy(pmap) 9.2033 + struct pmap *pmap; 9.2034 +{ 9.2035 + int refs; 9.2036 +#ifdef DIAGNOSTIC 9.2037 + struct cpu_info *ci; 9.2038 + CPU_INFO_ITERATOR cii; 9.2039 +#endif /* DIAGNOSTIC */ 9.2040 + 9.2041 + /* 9.2042 + * drop reference count 9.2043 + */ 9.2044 + 9.2045 + simple_lock(&pmap->pm_obj.vmobjlock); 9.2046 + refs = --pmap->pm_obj.uo_refs; 9.2047 + simple_unlock(&pmap->pm_obj.vmobjlock); 9.2048 + if (refs > 0) { 9.2049 + return; 9.2050 + } 9.2051 + 9.2052 +#ifdef DIAGNOSTIC 9.2053 + for (CPU_INFO_FOREACH(cii, ci)) 9.2054 + if (ci->ci_pmap == pmap) 9.2055 + panic("destroying pmap being used"); 9.2056 +#endif /* DIAGNOSTIC */ 9.2057 + 9.2058 + /* 9.2059 + * reference count is zero, free pmap resources and then free pmap. 9.2060 + */ 9.2061 + 9.2062 + XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap, 9.2063 + (void *)pmap->pm_pdirpa, 9.2064 + (void *)xpmap_ptom(pmap->pm_pdirpa))); 9.2065 + 9.2066 + /* 9.2067 + * remove it from global list of pmaps 9.2068 + */ 9.2069 + 9.2070 + simple_lock(&pmaps_lock); 9.2071 + LIST_REMOVE(pmap, pm_list); 9.2072 + simple_unlock(&pmaps_lock); 9.2073 + 9.2074 + /* 9.2075 + * destroyed pmap shouldn't have remaining PTPs 9.2076 + */ 9.2077 + 9.2078 + KASSERT(pmap->pm_obj.uo_npages == 0); 9.2079 + KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq)); 9.2080 + 9.2081 + /* 9.2082 + * MULTIPROCESSOR -- no need to flush out of other processors' 9.2083 + * APTE space because we do that in pmap_unmap_ptes(). 9.2084 + */ 9.2085 + pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 9.2086 + 9.2087 +#ifdef USER_LDT 9.2088 + if (pmap->pm_flags & PMF_USER_LDT) { 9.2089 + /* 9.2090 + * no need to switch the LDT; this address space is gone, 9.2091 + * nothing is using it. 9.2092 + * 9.2093 + * No need to lock the pmap for ldt_free (or anything else), 9.2094 + * we're the last one to use it. 9.2095 + */ 9.2096 + ldt_free(pmap); 9.2097 + uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 9.2098 + pmap->pm_ldt_len * sizeof(union descriptor)); 9.2099 + } 9.2100 +#endif 9.2101 + 9.2102 + pool_put(&pmap_pmap_pool, pmap); 9.2103 +} 9.2104 + 9.2105 +/* 9.2106 + * Add a reference to the specified pmap. 9.2107 + */ 9.2108 + 9.2109 +void 9.2110 +pmap_reference(pmap) 9.2111 + struct pmap *pmap; 9.2112 +{ 9.2113 + simple_lock(&pmap->pm_obj.vmobjlock); 9.2114 + pmap->pm_obj.uo_refs++; 9.2115 + simple_unlock(&pmap->pm_obj.vmobjlock); 9.2116 +} 9.2117 + 9.2118 +#if defined(PMAP_FORK) 9.2119 +/* 9.2120 + * pmap_fork: perform any necessary data structure manipulation when 9.2121 + * a VM space is forked. 
9.2122 + */ 9.2123 + 9.2124 +void 9.2125 +pmap_fork(pmap1, pmap2) 9.2126 + struct pmap *pmap1, *pmap2; 9.2127 +{ 9.2128 + simple_lock(&pmap1->pm_obj.vmobjlock); 9.2129 + simple_lock(&pmap2->pm_obj.vmobjlock); 9.2130 + 9.2131 +#ifdef USER_LDT 9.2132 + /* Copy the LDT, if necessary. */ 9.2133 + if (pmap1->pm_flags & PMF_USER_LDT) { 9.2134 + union descriptor *new_ldt; 9.2135 + size_t len; 9.2136 + 9.2137 + len = pmap1->pm_ldt_len * sizeof(union descriptor); 9.2138 + new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len); 9.2139 + memcpy(new_ldt, pmap1->pm_ldt, len); 9.2140 + pmap2->pm_ldt = new_ldt; 9.2141 + pmap2->pm_ldt_len = pmap1->pm_ldt_len; 9.2142 + pmap2->pm_flags |= PMF_USER_LDT; 9.2143 + ldt_alloc(pmap2, new_ldt, len); 9.2144 + } 9.2145 +#endif /* USER_LDT */ 9.2146 + 9.2147 + simple_unlock(&pmap2->pm_obj.vmobjlock); 9.2148 + simple_unlock(&pmap1->pm_obj.vmobjlock); 9.2149 +} 9.2150 +#endif /* PMAP_FORK */ 9.2151 + 9.2152 +#ifdef USER_LDT 9.2153 +/* 9.2154 + * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 9.2155 + * restore the default. 9.2156 + */ 9.2157 + 9.2158 +void 9.2159 +pmap_ldt_cleanup(l) 9.2160 + struct lwp *l; 9.2161 +{ 9.2162 + struct pcb *pcb = &l->l_addr->u_pcb; 9.2163 + pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 9.2164 + union descriptor *old_ldt = NULL; 9.2165 + size_t len = 0; 9.2166 + 9.2167 + simple_lock(&pmap->pm_obj.vmobjlock); 9.2168 + 9.2169 + if (pmap->pm_flags & PMF_USER_LDT) { 9.2170 + ldt_free(pmap); 9.2171 + pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL); 9.2172 + pcb->pcb_ldt_sel = pmap->pm_ldt_sel; 9.2173 + if (pcb == curpcb) 9.2174 + lldt(pcb->pcb_ldt_sel); 9.2175 + old_ldt = pmap->pm_ldt; 9.2176 + len = pmap->pm_ldt_len * sizeof(union descriptor); 9.2177 + pmap->pm_ldt = NULL; 9.2178 + pmap->pm_ldt_len = 0; 9.2179 + pmap->pm_flags &= ~PMF_USER_LDT; 9.2180 + } 9.2181 + 9.2182 + simple_unlock(&pmap->pm_obj.vmobjlock); 9.2183 + 9.2184 + if (old_ldt != NULL) 9.2185 + uvm_km_free(kernel_map, (vaddr_t)old_ldt, len); 9.2186 +} 9.2187 +#endif /* USER_LDT */ 9.2188 + 9.2189 +/* 9.2190 + * pmap_activate: activate a process' pmap 9.2191 + * 9.2192 + * => called from cpu_switch() 9.2193 + * => if lwp is the curlwp, then set ci_want_pmapload so that 9.2194 + * actual MMU context switch will be done by pmap_load() later 9.2195 + */ 9.2196 + 9.2197 +void 9.2198 +pmap_activate(l) 9.2199 + struct lwp *l; 9.2200 +{ 9.2201 + struct cpu_info *ci = curcpu(); 9.2202 + struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 9.2203 + 9.2204 + if (l == ci->ci_curlwp) { 9.2205 + struct pcb *pcb; 9.2206 + 9.2207 + KASSERT(ci->ci_want_pmapload == 0); 9.2208 + KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 9.2209 +#ifdef KSTACK_CHECK_DR0 9.2210 + /* 9.2211 + * setup breakpoint on the top of stack 9.2212 + */ 9.2213 + if (l == &lwp0) 9.2214 + dr0(0, 0, 0, 0); 9.2215 + else 9.2216 + dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 9.2217 +#endif 9.2218 + 9.2219 + /* 9.2220 + * no need to switch to kernel vmspace because 9.2221 + * it's a subset of any vmspace. 9.2222 + */ 9.2223 + 9.2224 + if (pmap == pmap_kernel()) { 9.2225 + ci->ci_want_pmapload = 0; 9.2226 + return; 9.2227 + } 9.2228 + 9.2229 + pcb = &l->l_addr->u_pcb; 9.2230 + pcb->pcb_ldt_sel = pmap->pm_ldt_sel; 9.2231 + 9.2232 + ci->ci_want_pmapload = 1; 9.2233 + } 9.2234 +} 9.2235 + 9.2236 +/* 9.2237 + * pmap_reactivate: try to regain reference to the pmap. 
9.2238 + */ 9.2239 + 9.2240 +static boolean_t 9.2241 +pmap_reactivate(struct pmap *pmap) 9.2242 +{ 9.2243 + struct cpu_info *ci = curcpu(); 9.2244 + u_int32_t cpumask = 1U << ci->ci_cpuid; 9.2245 + int s; 9.2246 + boolean_t result; 9.2247 + u_int32_t oldcpus; 9.2248 + 9.2249 + /* 9.2250 + * if we still have a lazy reference to this pmap, 9.2251 + * we can assume that there was no tlb shootdown 9.2252 + * for this pmap in the meantime. 9.2253 + */ 9.2254 + 9.2255 + s = splipi(); /* protect from tlb shootdown ipis. */ 9.2256 + oldcpus = pmap->pm_cpus; 9.2257 + x86_atomic_setbits_l(&pmap->pm_cpus, cpumask); 9.2258 + if (oldcpus & cpumask) { 9.2259 + KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY); 9.2260 + /* got it */ 9.2261 + result = TRUE; 9.2262 + } else { 9.2263 + KASSERT(ci->ci_tlbstate == TLBSTATE_STALE); 9.2264 + result = FALSE; 9.2265 + } 9.2266 + ci->ci_tlbstate = TLBSTATE_VALID; 9.2267 + splx(s); 9.2268 + 9.2269 + return result; 9.2270 +} 9.2271 + 9.2272 +/* 9.2273 + * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 9.2274 + */ 9.2275 + 9.2276 +void 9.2277 +pmap_load() 9.2278 +{ 9.2279 + struct cpu_info *ci = curcpu(); 9.2280 + u_int32_t cpumask = 1U << ci->ci_cpuid; 9.2281 + struct pmap *pmap; 9.2282 + struct pmap *oldpmap; 9.2283 + struct lwp *l; 9.2284 + struct pcb *pcb; 9.2285 + pd_entry_t *mapdp; 9.2286 + int s; 9.2287 + 9.2288 + KASSERT(ci->ci_want_pmapload); 9.2289 + 9.2290 + l = ci->ci_curlwp; 9.2291 + KASSERT(l != NULL); 9.2292 + pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 9.2293 + KASSERT(pmap != pmap_kernel()); 9.2294 + oldpmap = ci->ci_pmap; 9.2295 + 9.2296 + pcb = ci->ci_curpcb; 9.2297 + KASSERT(pcb == &l->l_addr->u_pcb); 9.2298 + /* loaded by pmap_activate */ 9.2299 + KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel); 9.2300 + 9.2301 + if (pmap == oldpmap) { 9.2302 + if (!pmap_reactivate(pmap)) { 9.2303 + 9.2304 + /* 9.2305 + * pmap has been changed during deactivated. 9.2306 + * our tlb may be stale. 9.2307 + */ 9.2308 + 9.2309 + tlbflush(); 9.2310 + } 9.2311 + 9.2312 + ci->ci_want_pmapload = 0; 9.2313 + return; 9.2314 + } 9.2315 + 9.2316 + /* 9.2317 + * actually switch pmap. 9.2318 + */ 9.2319 + 9.2320 + x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask); 9.2321 + 9.2322 + KASSERT((pmap->pm_cpus & cpumask) == 0); 9.2323 + 9.2324 + KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE); 9.2325 + pmap_reference(pmap); 9.2326 + KERNEL_UNLOCK(); 9.2327 + 9.2328 + /* 9.2329 + * mark the pmap in use by this processor. 9.2330 + */ 9.2331 + 9.2332 + s = splipi(); 9.2333 + x86_atomic_setbits_l(&pmap->pm_cpus, cpumask); 9.2334 + ci->ci_pmap = pmap; 9.2335 + ci->ci_tlbstate = TLBSTATE_VALID; 9.2336 + splx(s); 9.2337 + 9.2338 + /* 9.2339 + * clear apdp slot before loading %cr3 since Xen only allows 9.2340 + * linear pagetable mappings in the current pagetable. 9.2341 + */ 9.2342 + KDASSERT(curapdp == 0); 9.2343 + mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE); 9.2344 + PDE_CLEAR(APDP_PDE, mapdp); 9.2345 + 9.2346 + /* 9.2347 + * update tss and load corresponding registers. 
9.2348 + */ 9.2349 + 9.2350 + lldt(pcb->pcb_ldt_sel); 9.2351 + pcb->pcb_cr3 = pmap->pm_pdirpa; 9.2352 + lcr3(pcb->pcb_cr3); 9.2353 + 9.2354 + ci->ci_want_pmapload = 0; 9.2355 + 9.2356 + KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE); 9.2357 + pmap_destroy(oldpmap); 9.2358 + KERNEL_UNLOCK(); 9.2359 +} 9.2360 + 9.2361 +/* 9.2362 + * pmap_deactivate: deactivate a process' pmap 9.2363 + */ 9.2364 + 9.2365 +void 9.2366 +pmap_deactivate(l) 9.2367 + struct lwp *l; 9.2368 +{ 9.2369 + 9.2370 + if (l == curlwp) 9.2371 + pmap_deactivate2(l); 9.2372 +} 9.2373 + 9.2374 +/* 9.2375 + * pmap_deactivate2: context switch version of pmap_deactivate. 9.2376 + * always treat l as curlwp. 9.2377 + */ 9.2378 + 9.2379 +void 9.2380 +pmap_deactivate2(l) 9.2381 + struct lwp *l; 9.2382 +{ 9.2383 + struct pmap *pmap; 9.2384 + struct cpu_info *ci = curcpu(); 9.2385 + 9.2386 + if (ci->ci_want_pmapload) { 9.2387 + KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 9.2388 + != pmap_kernel()); 9.2389 + KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 9.2390 + != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 9.2391 + 9.2392 + /* 9.2393 + * userspace has not been touched. 9.2394 + * nothing to do here. 9.2395 + */ 9.2396 + 9.2397 + ci->ci_want_pmapload = 0; 9.2398 + return; 9.2399 + } 9.2400 + 9.2401 + pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 9.2402 + 9.2403 + if (pmap == pmap_kernel()) { 9.2404 + return; 9.2405 + } 9.2406 + 9.2407 + KASSERT(ci->ci_pmap == pmap); 9.2408 + 9.2409 + KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 9.2410 + ci->ci_tlbstate = TLBSTATE_LAZY; 9.2411 + XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n", 9.2412 + l, (void *)l->l_addr->u_pcb.pcb_ebp, 9.2413 + (void *)l->l_addr->u_pcb.pcb_esp)); 9.2414 +} 9.2415 + 9.2416 +/* 9.2417 + * end of lifecycle functions 9.2418 + */ 9.2419 + 9.2420 +/* 9.2421 + * some misc. functions 9.2422 + */ 9.2423 + 9.2424 +/* 9.2425 + * pmap_extract: extract a PA for the given VA 9.2426 + */ 9.2427 + 9.2428 +boolean_t 9.2429 +pmap_extract(pmap, va, pap) 9.2430 + struct pmap *pmap; 9.2431 + vaddr_t va; 9.2432 + paddr_t *pap; 9.2433 +{ 9.2434 + pt_entry_t *ptes, pte; 9.2435 + pd_entry_t pde; 9.2436 + 9.2437 + if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) { 9.2438 +#ifdef LARGEPAGES 9.2439 + if (pde & PG_PS) { 9.2440 + if (pap != NULL) 9.2441 + *pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME); 9.2442 + return (TRUE); 9.2443 + } 9.2444 +#endif 9.2445 + 9.2446 + ptes = pmap_map_ptes(pmap); 9.2447 + pte = PTE_GET(&ptes[x86_btop(va)]); 9.2448 + pmap_unmap_ptes(pmap); 9.2449 + 9.2450 + if (__predict_true((pte & PG_V) != 0)) { 9.2451 + if (pap != NULL) 9.2452 + *pap = (pte & PG_FRAME) | (va & ~PG_FRAME); 9.2453 + return (TRUE); 9.2454 + } 9.2455 + } 9.2456 + return (FALSE); 9.2457 +} 9.2458 + 9.2459 + 9.2460 +/* 9.2461 + * vtophys: virtual address to physical address. For use by 9.2462 + * machine-dependent code only. 9.2463 + */ 9.2464 + 9.2465 +paddr_t 9.2466 +vtophys(va) 9.2467 + vaddr_t va; 9.2468 +{ 9.2469 + paddr_t pa; 9.2470 + 9.2471 + if (pmap_extract(pmap_kernel(), va, &pa) == TRUE) 9.2472 + return (pa); 9.2473 + return (0); 9.2474 +} 9.2475 + 9.2476 + 9.2477 +/* 9.2478 + * pmap_virtual_space: used during bootup [pmap_steal_memory] to 9.2479 + * determine the bounds of the kernel virtual addess space. 
9.2480 + */ 9.2481 + 9.2482 +void 9.2483 +pmap_virtual_space(startp, endp) 9.2484 + vaddr_t *startp; 9.2485 + vaddr_t *endp; 9.2486 +{ 9.2487 + *startp = virtual_avail; 9.2488 + *endp = virtual_end; 9.2489 +} 9.2490 + 9.2491 +/* 9.2492 + * pmap_map: map a range of PAs into kvm 9.2493 + * 9.2494 + * => used during crash dump 9.2495 + * => XXX: pmap_map() should be phased out? 9.2496 + */ 9.2497 + 9.2498 +vaddr_t 9.2499 +pmap_map(va, spa, epa, prot) 9.2500 + vaddr_t va; 9.2501 + paddr_t spa, epa; 9.2502 + vm_prot_t prot; 9.2503 +{ 9.2504 + while (spa < epa) { 9.2505 + pmap_enter(pmap_kernel(), va, spa, prot, 0); 9.2506 + va += PAGE_SIZE; 9.2507 + spa += PAGE_SIZE; 9.2508 + } 9.2509 + pmap_update(pmap_kernel()); 9.2510 + return va; 9.2511 +} 9.2512 + 9.2513 +/* 9.2514 + * pmap_zero_page: zero a page 9.2515 + */ 9.2516 + 9.2517 +void 9.2518 +pmap_zero_page(pa) 9.2519 + paddr_t pa; 9.2520 +{ 9.2521 +#ifdef MULTIPROCESSOR 9.2522 + int id = cpu_number(); 9.2523 +#endif 9.2524 + pt_entry_t *zpte = PTESLEW(zero_pte, id); 9.2525 + pt_entry_t *maptp; 9.2526 + caddr_t zerova = VASLEW(zerop, id); 9.2527 + 9.2528 +#ifdef DIAGNOSTIC 9.2529 + if (PTE_GET(zpte)) 9.2530 + panic("pmap_zero_page: lock botch"); 9.2531 +#endif 9.2532 + 9.2533 + maptp = (pt_entry_t *)vtomach((vaddr_t)zpte); 9.2534 + PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW); /* map in */ 9.2535 + pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 9.2536 + 9.2537 + memset(zerova, 0, PAGE_SIZE); /* zero */ 9.2538 + PTE_CLEAR(zpte, maptp); /* zap! */ 9.2539 +} 9.2540 + 9.2541 +/* 9.2542 + * pmap_pagezeroidle: the same, for the idle loop page zero'er. 9.2543 + * Returns TRUE if the page was zero'd, FALSE if we aborted for 9.2544 + * some reason. 9.2545 + */ 9.2546 + 9.2547 +boolean_t 9.2548 +pmap_pageidlezero(pa) 9.2549 + paddr_t pa; 9.2550 +{ 9.2551 +#ifdef MULTIPROCESSOR 9.2552 + int id = cpu_number(); 9.2553 +#endif 9.2554 + pt_entry_t *zpte = PTESLEW(zero_pte, id); 9.2555 + pt_entry_t *maptp; 9.2556 + caddr_t zerova = VASLEW(zerop, id); 9.2557 + boolean_t rv = TRUE; 9.2558 + int i, *ptr; 9.2559 + 9.2560 +#ifdef DIAGNOSTIC 9.2561 + if (PTE_GET(zpte)) 9.2562 + panic("pmap_zero_page_uncached: lock botch"); 9.2563 +#endif 9.2564 + maptp = (pt_entry_t *)vtomach((vaddr_t)zpte); 9.2565 + PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW); /* map in */ 9.2566 + pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 9.2567 + for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) { 9.2568 + if (sched_whichqs != 0) { 9.2569 + 9.2570 + /* 9.2571 + * A process has become ready. Abort now, 9.2572 + * so we don't keep it waiting while we 9.2573 + * do slow memory access to finish this 9.2574 + * page. 9.2575 + */ 9.2576 + 9.2577 + rv = FALSE; 9.2578 + break; 9.2579 + } 9.2580 + *ptr++ = 0; 9.2581 + } 9.2582 + 9.2583 + PTE_CLEAR(zpte, maptp); /* zap! 
*/ 9.2584 + return (rv); 9.2585 +} 9.2586 + 9.2587 +/* 9.2588 + * pmap_copy_page: copy a page 9.2589 + */ 9.2590 + 9.2591 +void 9.2592 +pmap_copy_page(srcpa, dstpa) 9.2593 + paddr_t srcpa, dstpa; 9.2594 +{ 9.2595 +#ifdef MULTIPROCESSOR 9.2596 + int id = cpu_number(); 9.2597 +#endif 9.2598 + pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte; 9.2599 + pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte; 9.2600 + caddr_t csrcva = VASLEW(csrcp, id); 9.2601 + caddr_t cdstva = VASLEW(cdstp, id); 9.2602 + 9.2603 +#ifdef DIAGNOSTIC 9.2604 + if (PTE_GET(spte) || PTE_GET(dpte)) 9.2605 + panic("pmap_copy_page: lock botch"); 9.2606 +#endif 9.2607 + 9.2608 + maspte = (pt_entry_t *)vtomach((vaddr_t)spte); 9.2609 + madpte = (pt_entry_t *)vtomach((vaddr_t)dpte); 9.2610 + PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW); 9.2611 + PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW); 9.2612 + pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 9.2613 + memcpy(cdstva, csrcva, PAGE_SIZE); 9.2614 + PTE_CLEAR(spte, maspte); /* zap! */ 9.2615 + PTE_CLEAR(dpte, madpte); /* zap! */ 9.2616 +} 9.2617 + 9.2618 +/* 9.2619 + * p m a p r e m o v e f u n c t i o n s 9.2620 + * 9.2621 + * functions that remove mappings 9.2622 + */ 9.2623 + 9.2624 +/* 9.2625 + * pmap_remove_ptes: remove PTEs from a PTP 9.2626 + * 9.2627 + * => must have proper locking on pmap_master_lock 9.2628 + * => caller must hold pmap's lock 9.2629 + * => PTP must be mapped into KVA 9.2630 + * => PTP should be null if pmap == pmap_kernel() 9.2631 + */ 9.2632 + 9.2633 +static void 9.2634 +pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags) 9.2635 + struct pmap *pmap; 9.2636 + struct vm_page *ptp; 9.2637 + vaddr_t ptpva; 9.2638 + vaddr_t startva, endva; 9.2639 + int32_t *cpumaskp; 9.2640 + int flags; 9.2641 +{ 9.2642 + struct pv_entry *pv_tofree = NULL; /* list of pv_entrys to free */ 9.2643 + struct pv_entry *pve; 9.2644 + pt_entry_t *pte = (pt_entry_t *) ptpva; 9.2645 + pt_entry_t opte; 9.2646 + pt_entry_t *maptp; 9.2647 + 9.2648 + /* 9.2649 + * note that ptpva points to the PTE that maps startva. this may 9.2650 + * or may not be the first PTE in the PTP. 9.2651 + * 9.2652 + * we loop through the PTP while there are still PTEs to look at 9.2653 + * and the wire_count is greater than 1 (because we use the wire_count 9.2654 + * to keep track of the number of real PTEs in the PTP). 9.2655 + */ 9.2656 + 9.2657 + for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 9.2658 + ; pte++, startva += PAGE_SIZE) { 9.2659 + struct vm_page *pg; 9.2660 + struct vm_page_md *mdpg; 9.2661 + 9.2662 + if (!pmap_valid_entry(*pte)) 9.2663 + continue; /* VA not mapped */ 9.2664 + if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 9.2665 + continue; 9.2666 + } 9.2667 + 9.2668 + /* atomically save the old PTE and zap! it */ 9.2669 + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); 9.2670 + opte = pte_atomic_update(pte, maptp, 0); 9.2671 + pmap_exec_account(pmap, startva, opte, 0); 9.2672 + 9.2673 + if (opte & PG_W) 9.2674 + pmap->pm_stats.wired_count--; 9.2675 + pmap->pm_stats.resident_count--; 9.2676 + 9.2677 + if (opte & PG_U) 9.2678 + pmap_tlb_shootdown(pmap, startva, opte, cpumaskp); 9.2679 + 9.2680 + if (ptp) { 9.2681 + ptp->wire_count--; /* dropping a PTE */ 9.2682 + /* Make sure that the PDE is flushed */ 9.2683 + if ((ptp->wire_count <= 1) && !(opte & PG_U)) 9.2684 + pmap_tlb_shootdown(pmap, startva, opte, 9.2685 + cpumaskp); 9.2686 + } 9.2687 + 9.2688 + /* 9.2689 + * if we are not on a pv_head list we are done. 
9.2690 + */ 9.2691 + 9.2692 + if ((opte & PG_PVLIST) == 0) { 9.2693 +#if defined(DIAGNOSTIC) && !defined(DOM0OPS) 9.2694 + if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL) 9.2695 + panic("pmap_remove_ptes: managed page without " 9.2696 + "PG_PVLIST for 0x%lx", startva); 9.2697 +#endif 9.2698 + continue; 9.2699 + } 9.2700 + 9.2701 + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 9.2702 +#ifdef DIAGNOSTIC 9.2703 + if (pg == NULL) 9.2704 + panic("pmap_remove_ptes: unmanaged page marked " 9.2705 + "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 9.2706 + startva, (u_long)(opte & PG_FRAME)); 9.2707 +#endif 9.2708 + mdpg = &pg->mdpage; 9.2709 + 9.2710 + /* sync R/M bits */ 9.2711 + simple_lock(&mdpg->mp_pvhead.pvh_lock); 9.2712 + mdpg->mp_attrs |= (opte & (PG_U|PG_M)); 9.2713 + pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva); 9.2714 + simple_unlock(&mdpg->mp_pvhead.pvh_lock); 9.2715 + 9.2716 + if (pve) { 9.2717 + SPLAY_RIGHT(pve, pv_node) = pv_tofree; 9.2718 + pv_tofree = pve; 9.2719 + } 9.2720 + 9.2721 + /* end of "for" loop: time for next pte */ 9.2722 + } 9.2723 + if (pv_tofree) 9.2724 + pmap_free_pvs(pmap, pv_tofree); 9.2725 +} 9.2726 + 9.2727 + 9.2728 +/* 9.2729 + * pmap_remove_pte: remove a single PTE from a PTP 9.2730 + * 9.2731 + * => must have proper locking on pmap_master_lock 9.2732 + * => caller must hold pmap's lock 9.2733 + * => PTP must be mapped into KVA 9.2734 + * => PTP should be null if pmap == pmap_kernel() 9.2735 + * => returns true if we removed a mapping 9.2736 + */ 9.2737 + 9.2738 +static boolean_t 9.2739 +pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags) 9.2740 + struct pmap *pmap; 9.2741 + struct vm_page *ptp; 9.2742 + pt_entry_t *pte; 9.2743 + vaddr_t va; 9.2744 + int32_t *cpumaskp; 9.2745 + int flags; 9.2746 +{ 9.2747 + pt_entry_t opte; 9.2748 + pt_entry_t *maptp; 9.2749 + struct pv_entry *pve; 9.2750 + struct vm_page *pg; 9.2751 + struct vm_page_md *mdpg; 9.2752 + 9.2753 + if (!pmap_valid_entry(*pte)) 9.2754 + return(FALSE); /* VA not mapped */ 9.2755 + if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 9.2756 + return(FALSE); 9.2757 + } 9.2758 + 9.2759 + /* atomically save the old PTE and zap! it */ 9.2760 + maptp = (pt_entry_t *)vtomach((vaddr_t)pte); 9.2761 + opte = pte_atomic_update(pte, maptp, 0); 9.2762 + 9.2763 + XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte)); 9.2764 + pmap_exec_account(pmap, va, opte, 0); 9.2765 + 9.2766 + if (opte & PG_W) 9.2767 + pmap->pm_stats.wired_count--; 9.2768 + pmap->pm_stats.resident_count--; 9.2769 + 9.2770 + if (opte & PG_U) 9.2771 + pmap_tlb_shootdown(pmap, va, opte, cpumaskp); 9.2772 + 9.2773 + if (ptp) { 9.2774 + ptp->wire_count--; /* dropping a PTE */ 9.2775 + /* Make sure that the PDE is flushed */ 9.2776 + if ((ptp->wire_count <= 1) && !(opte & PG_U)) 9.2777 + pmap_tlb_shootdown(pmap, va, opte, cpumaskp); 9.2778 + 9.2779 + } 9.2780 + /* 9.2781 + * if we are not on a pv_head list we are done. 
9.2782 + */ 9.2783 + 9.2784 + if ((opte & PG_PVLIST) == 0) { 9.2785 +#if defined(DIAGNOSTIC) && !defined(DOM0OPS) 9.2786 + if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL) 9.2787 + panic("pmap_remove_pte: managed page without " 9.2788 + "PG_PVLIST for 0x%lx", va); 9.2789 +#endif 9.2790 + return(TRUE); 9.2791 + } 9.2792 + 9.2793 + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 9.2794 +#ifdef DIAGNOSTIC 9.2795 + if (pg == NULL) 9.2796 + panic("pmap_remove_pte: unmanaged page marked " 9.2797 + "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, 9.2798 + (u_long)(opte & PG_FRAME)); 9.2799 +#endif 9.2800 + mdpg = &pg->mdpage; 9.2801 + 9.2802 + /* sync R/M bits */ 9.2803 + simple_lock(&mdpg->mp_pvhead.pvh_lock); 9.2804 + mdpg->mp_attrs |= (opte & (PG_U|PG_M)); 9.2805 + pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va); 9.2806 + simple_unlock(&mdpg->mp_pvhead.pvh_lock); 9.2807 + 9.2808 + if (pve) 9.2809 + pmap_free_pv(pmap, pve); 9.2810 + return(TRUE); 9.2811 +} 9.2812 + 9.2813 +/* 9.2814 + * pmap_remove: top level mapping removal function 9.2815 + * 9.2816 + * => caller should not be holding any pmap locks 9.2817 + */ 9.2818 + 9.2819 +void 9.2820 +pmap_remove(pmap, sva, eva) 9.2821 + struct pmap *pmap; 9.2822 + vaddr_t sva, eva; 9.2823 +{ 9.2824 + pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 9.2825 +} 9.2826 + 9.2827 +/* 9.2828 + * pmap_do_remove: mapping removal guts 9.2829 + * 9.2830 + * => caller should not be holding any pmap locks 9.2831 + */ 9.2832 + 9.2833 +static void 9.2834 +pmap_do_remove(pmap, sva, eva, flags) 9.2835 + struct pmap *pmap; 9.2836 + vaddr_t sva, eva; 9.2837 + int flags; 9.2838 +{ 9.2839 + pt_entry_t *ptes, opte; 9.2840 + pt_entry_t *maptp; 9.2841 + boolean_t result; 9.2842 + paddr_t ptppa; 9.2843 + vaddr_t blkendva; 9.2844 + struct vm_page *ptp; 9.2845 + int32_t cpumask = 0; 9.2846 + TAILQ_HEAD(, vm_page) empty_ptps; 9.2847 + struct cpu_info *ci; 9.2848 + struct pmap *curpmap; 9.2849 + 9.2850 + /* 9.2851 + * we lock in the pmap => pv_head direction 9.2852 + */ 9.2853 + 9.2854 + TAILQ_INIT(&empty_ptps); 9.2855 + 9.2856 + PMAP_MAP_TO_HEAD_LOCK(); 9.2857 + 9.2858 + ptes = pmap_map_ptes(pmap); /* locks pmap */ 9.2859 + 9.2860 + ci = curcpu(); 9.2861 + curpmap = ci->ci_pmap; 9.2862 + 9.2863 + /* 9.2864 + * removing one page? take shortcut function. 9.2865 + */ 9.2866 + 9.2867 + if (sva + PAGE_SIZE == eva) { 9.2868 + if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) { 9.2869 + 9.2870 + /* PA of the PTP */ 9.2871 + ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME; 9.2872 + 9.2873 + /* get PTP if non-kernel mapping */ 9.2874 + if (pmap == pmap_kernel()) { 9.2875 + /* we never free kernel PTPs */ 9.2876 + ptp = NULL; 9.2877 + } else { 9.2878 + if (pmap->pm_ptphint && 9.2879 + VM_PAGE_TO_PHYS(pmap->pm_ptphint) == 9.2880 + ptppa) { 9.2881 + ptp = pmap->pm_ptphint; 9.2882 + } else { 9.2883 + ptp = PHYS_TO_VM_PAGE(ptppa); 9.2884 +#ifdef DIAGNOSTIC 9.2885 + if (ptp == NULL) 9.2886 + panic("pmap_remove: unmanaged " 9.2887 + "PTP detected"); 9.2888 +#endif 9.2889 + } 9.2890 + } 9.2891 + 9.2892 + /* do it! */ 9.2893 + result = pmap_remove_pte(pmap, ptp, 9.2894 + &ptes[x86_btop(sva)], sva, &cpumask, flags); 9.2895 + 9.2896 + /* 9.2897 + * if mapping removed and the PTP is no longer 9.2898 + * being used, free it! 9.2899 + */ 9.2900 + 9.2901 + if (result && ptp && ptp->wire_count <= 1) { 9.2902 + /* zap! 
*/ 9.2903 + maptp = (pt_entry_t *)vtomach( 9.2904 + (vaddr_t)&pmap->pm_pdir[pdei(sva)]); 9.2905 + PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)], 9.2906 + maptp, opte); 9.2907 +#if defined(MULTIPROCESSOR) 9.2908 + /* 9.2909 + * XXXthorpej Redundant shootdown can happen 9.2910 + * here if we're using APTE space. 9.2911 + */ 9.2912 +#endif 9.2913 + pmap_tlb_shootdown(curpmap, 9.2914 + ((vaddr_t)ptes) + ptp->offset, opte, 9.2915 + &cpumask); 9.2916 +#if defined(MULTIPROCESSOR) 9.2917 + /* 9.2918 + * Always shoot down the pmap's self-mapping 9.2919 + * of the PTP. 9.2920 + * XXXthorpej Redundant shootdown can happen 9.2921 + * here if pmap == curpmap (not APTE space). 9.2922 + */ 9.2923 + pmap_tlb_shootdown(pmap, 9.2924 + ((vaddr_t)PTE_BASE) + ptp->offset, opte, 9.2925 + &cpumask); 9.2926 +#endif 9.2927 + pmap->pm_stats.resident_count--; 9.2928 + if (pmap->pm_ptphint == ptp) 9.2929 + pmap->pm_ptphint = 9.2930 + TAILQ_FIRST(&pmap->pm_obj.memq); 9.2931 + ptp->wire_count = 0; 9.2932 + ptp->flags |= PG_ZERO; 9.2933 + uvm_pagerealloc(ptp, NULL, 0); 9.2934 + TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); 9.2935 + } 9.2936 + } 9.2937 + pmap_tlb_shootnow(cpumask); 9.2938 + pmap_unmap_ptes(pmap); /* unlock pmap */ 9.2939 + PMAP_MAP_TO_HEAD_UNLOCK(); 9.2940 + /* Now we can free unused ptps */ 9.2941 + TAILQ_FOREACH(ptp, &empty_ptps, listq) 9.2942 + uvm_pagefree(ptp); 9.2943 + return; 9.2944 + } 9.2945 + 9.2946 + cpumask = 0; 9.2947 + 9.2948 + for (/* null */ ; sva < eva ; sva = blkendva) { 9.2949 + 9.2950 + /* determine range of block */ 9.2951 + blkendva = x86_round_pdr(sva+1); 9.2952 + if (blkendva > eva) 9.2953 + blkendva = eva; 9.2954 + 9.2955 + /* 9.2956 + * XXXCDC: our PTE mappings should never be removed 9.2957 + * with pmap_remove! if we allow this (and why would 9.2958 + * we?) then we end up freeing the pmap's page 9.2959 + * directory page (PDP) before we are finished using 9.2960 + * it when we hit in in the recursive mapping. this 9.2961 + * is BAD. 9.2962 + * 9.2963 + * long term solution is to move the PTEs out of user 9.2964 + * address space. and into kernel address space (up 9.2965 + * with APTE). then we can set VM_MAXUSER_ADDRESS to 9.2966 + * be VM_MAX_ADDRESS. 9.2967 + */ 9.2968 + 9.2969 + if (pdei(sva) == PDSLOT_PTE) 9.2970 + /* XXXCDC: ugly hack to avoid freeing PDP here */ 9.2971 + continue; 9.2972 + 9.2973 + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) 9.2974 + /* valid block? */ 9.2975 + continue; 9.2976 + 9.2977 + /* PA of the PTP */ 9.2978 + ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME); 9.2979 + 9.2980 + /* get PTP if non-kernel mapping */ 9.2981 + if (pmap == pmap_kernel()) { 9.2982 + /* we never free kernel PTPs */ 9.2983 + ptp = NULL; 9.2984 + } else { 9.2985 + if (pmap->pm_ptphint && 9.2986 + VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) { 9.2987 + ptp = pmap->pm_ptphint; 9.2988 + } else { 9.2989 + ptp = PHYS_TO_VM_PAGE(ptppa); 9.2990 +#ifdef DIAGNOSTIC 9.2991 + if (ptp == NULL) 9.2992 + panic("pmap_remove: unmanaged PTP " 9.2993 + "detected"); 9.2994 +#endif 9.2995 + } 9.2996 + } 9.2997 + pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)], 9.2998 + sva, blkendva, &cpumask, flags); 9.2999 + 9.3000 + /* if PTP is no longer being used, free it! */ 9.3001 + if (ptp && ptp->wire_count <= 1) { 9.3002 + /* zap! 
*/ 9.3003 + maptp = (pt_entry_t *)vtomach( 9.3004 + (vaddr_t)&pmap->pm_pdir[pdei(sva)]); 9.3005 + PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)], 9.3006 + maptp, opte); 9.3007 +#if defined(MULTIPROCESSOR) 9.3008 + /* 9.3009 + * XXXthorpej Redundant shootdown can happen here 9.3010 + * if we're using APTE space. 9.3011 + */ 9.3012 +#endif 9.3013 + pmap_tlb_shootdown(curpmap, 9.3014 + ((vaddr_t)ptes) + ptp->offset, opte, &cpumask); 9.3015 +#if defined(MULTIPROCESSOR) 9.3016 + /* 9.3017 + * Always shoot down the pmap's self-mapping 9.3018 + * of the PTP. 9.3019 + * XXXthorpej Redundant shootdown can happen here 9.3020 + * if pmap == curpmap (not APTE space). 9.3021 + */ 9.3022 + pmap_tlb_shootdown(pmap, 9.3023 + ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask); 9.3024 +#endif 9.3025 + pmap->pm_stats.resident_count--; 9.3026 + if (pmap->pm_ptphint == ptp) /* update hint? */ 9.3027 + pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first; 9.3028 + ptp->wire_count = 0; 9.3029 + ptp->flags |= PG_ZERO; 9.3030 + /* Postpone free to shootdown */ 9.3031 + uvm_pagerealloc(ptp, NULL, 0); 9.3032 + TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); 9.3033 + } 9.3034 + } 9.3035 + 9.3036 + pmap_tlb_shootnow(cpumask); 9.3037 + pmap_unmap_ptes(pmap); 9.3038 + PMAP_MAP_TO_HEAD_UNLOCK(); 9.3039 + /* Now we can free unused ptps */ 9.3040 + TAILQ_FOREACH(ptp, &empty_ptps, listq) 9.3041 + uvm_pagefree(ptp); 9.3042 +} 9.3043 + 9.3044 +/* 9.3045 + * pmap_page_remove: remove a managed vm_page from all pmaps that map it 9.3046 + * 9.3047 + * => we set pv_head => pmap locking 9.3048 + * => R/M bits are sync'd back to attrs 9.3049 + */ 9.3050 + 9.3051 +void 9.3052 +pmap_page_remove(pg) 9.3053 + struct vm_page *pg; 9.3054 +{ 9.3055 + struct pv_head *pvh; 9.3056 + struct pv_entry *pve, *npve, *killlist = NULL; 9.3057 + pt_entry_t *ptes, opte; 9.3058 + pt_entry_t *maptp; 9.3059 + int32_t cpumask = 0; 9.3060 + TAILQ_HEAD(, vm_page) empty_ptps; 9.3061 + struct vm_page *ptp; 9.3062 + struct cpu_info *ci; 9.3063 + struct pmap *curpmap; 9.3064 + 9.3065 +#ifdef DIAGNOSTIC 9.3066 + int bank, off; 9.3067 + 9.3068 + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 9.3069 + if (bank == -1) 9.3070 + panic("pmap_page_remove: unmanaged page?"); 9.3071 +#endif 9.3072 + 9.3073 + pvh = &pg->mdpage.mp_pvhead; 9.3074 + if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { 9.3075 + return; 9.3076 + } 9.3077 + 9.3078 + TAILQ_INIT(&empty_ptps); 9.3079 + 9.3080 + /* set pv_head => pmap locking */ 9.3081 + PMAP_HEAD_TO_MAP_LOCK(); 9.3082 + 9.3083 + ci = curcpu(); 9.3084 + curpmap = ci->ci_pmap; 9.3085 + 9.3086 + /* XXX: needed if we hold head->map lock? */ 9.3087 + simple_lock(&pvh->pvh_lock); 9.3088 + 9.3089 + for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) { 9.3090 + npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve); 9.3091 + ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ 9.3092 + 9.3093 +#ifdef DIAGNOSTIC 9.3094 + if (pve->pv_ptp && 9.3095 + (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) & 9.3096 + PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { 9.3097 + printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n", 9.3098 + pg, pve->pv_va, pve->pv_ptp); 9.3099 + printf("pmap_page_remove: PTP's phys addr: " 9.3100 + "actual=%lx, recorded=%lx\n", 9.3101 + (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) 9.3102 + & PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp)); 9.3103 + panic("pmap_page_remove: mapped managed page has " 9.3104 + "invalid pv_ptp field"); 9.3105 + } 9.3106 +#endif 9.3107 + 9.3108 + /* atomically save the old PTE and zap! 
it */ 9.3109 + maptp = (pt_entry_t *)vtomach( 9.3110 + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); 9.3111 + opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)], 9.3112 + maptp, 0); 9.3113 + 9.3114 + if (opte & PG_W) 9.3115 + pve->pv_pmap->pm_stats.wired_count--; 9.3116 + pve->pv_pmap->pm_stats.resident_count--; 9.3117 + 9.3118 + /* Shootdown only if referenced */ 9.3119 + if (opte & PG_U) 9.3120 + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, 9.3121 + &cpumask); 9.3122 + 9.3123 + /* sync R/M bits */ 9.3124 + pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M)); 9.3125 + 9.3126 + /* update the PTP reference count. free if last reference. */ 9.3127 + if (pve->pv_ptp) { 9.3128 + pve->pv_ptp->wire_count--; 9.3129 + if (pve->pv_ptp->wire_count <= 1) { 9.3130 + /* 9.3131 + * Do we have to shootdown the page just to 9.3132 + * get the pte out of the TLB ? 9.3133 + */ 9.3134 + if(!(opte & PG_U)) 9.3135 + pmap_tlb_shootdown(pve->pv_pmap, 9.3136 + pve->pv_va, opte, &cpumask); 9.3137 + 9.3138 + /* zap! */ 9.3139 + maptp = (pt_entry_t *)vtomach((vaddr_t) 9.3140 + &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]); 9.3141 + PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir 9.3142 + [pdei(pve->pv_va)], maptp, opte); 9.3143 + pmap_tlb_shootdown(curpmap, 9.3144 + ((vaddr_t)ptes) + pve->pv_ptp->offset, 9.3145 + opte, &cpumask); 9.3146 +#if defined(MULTIPROCESSOR) 9.3147 + /* 9.3148 + * Always shoot down the other pmap's 9.3149 + * self-mapping of the PTP. 9.3150 + */ 9.3151 + pmap_tlb_shootdown(pve->pv_pmap, 9.3152 + ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset, 9.3153 + opte, &cpumask); 9.3154 +#endif 9.3155 + pve->pv_pmap->pm_stats.resident_count--; 9.3156 + /* update hint? */ 9.3157 + if (pve->pv_pmap->pm_ptphint == pve->pv_ptp) 9.3158 + pve->pv_pmap->pm_ptphint = 9.3159 + pve->pv_pmap->pm_obj.memq.tqh_first; 9.3160 + pve->pv_ptp->wire_count = 0; 9.3161 + pve->pv_ptp->flags |= PG_ZERO; 9.3162 + /* Free only after the shootdown */ 9.3163 + uvm_pagerealloc(pve->pv_ptp, NULL, 0); 9.3164 + TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, 9.3165 + listq); 9.3166 + } 9.3167 + } 9.3168 + pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ 9.3169 + SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */ 9.3170 + SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */ 9.3171 + killlist = pve; 9.3172 + } 9.3173 + pmap_free_pvs(NULL, killlist); 9.3174 + simple_unlock(&pvh->pvh_lock); 9.3175 + PMAP_HEAD_TO_MAP_UNLOCK(); 9.3176 + pmap_tlb_shootnow(cpumask); 9.3177 + 9.3178 + /* Now we can free unused ptps */ 9.3179 + TAILQ_FOREACH(ptp, &empty_ptps, listq) 9.3180 + uvm_pagefree(ptp); 9.3181 +} 9.3182 + 9.3183 +/* 9.3184 + * p m a p a t t r i b u t e f u n c t i o n s 9.3185 + * functions that test/change managed page's attributes 9.3186 + * since a page can be mapped multiple times we must check each PTE that 9.3187 + * maps it by going down the pv lists. 
9.3188 + */ 9.3189 + 9.3190 +/* 9.3191 + * pmap_test_attrs: test a page's attributes 9.3192 + * 9.3193 + * => we set pv_head => pmap locking 9.3194 + */ 9.3195 + 9.3196 +boolean_t 9.3197 +pmap_test_attrs(pg, testbits) 9.3198 + struct vm_page *pg; 9.3199 + int testbits; 9.3200 +{ 9.3201 + struct vm_page_md *mdpg; 9.3202 + int *myattrs; 9.3203 + struct pv_head *pvh; 9.3204 + struct pv_entry *pve; 9.3205 + volatile pt_entry_t *ptes; 9.3206 + pt_entry_t pte; 9.3207 + 9.3208 +#if DIAGNOSTIC 9.3209 + int bank, off; 9.3210 + 9.3211 + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 9.3212 + if (bank == -1) 9.3213 + panic("pmap_test_attrs: unmanaged page?"); 9.3214 +#endif 9.3215 + mdpg = &pg->mdpage; 9.3216 + 9.3217 + /* 9.3218 + * before locking: see if attributes are already set and if so, 9.3219 + * return! 9.3220 + */ 9.3221 + 9.3222 + myattrs = &mdpg->mp_attrs; 9.3223 + if (*myattrs & testbits) 9.3224 + return(TRUE); 9.3225 + 9.3226 + /* test to see if there is a list before bothering to lock */ 9.3227 + pvh = &mdpg->mp_pvhead; 9.3228 + if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { 9.3229 + return(FALSE); 9.3230 + } 9.3231 + 9.3232 + /* nope, gonna have to do it the hard way */ 9.3233 + PMAP_HEAD_TO_MAP_LOCK(); 9.3234 + /* XXX: needed if we hold head->map lock? */ 9.3235 + simple_lock(&pvh->pvh_lock); 9.3236 + 9.3237 + for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); 9.3238 + pve != NULL && (*myattrs & testbits) == 0; 9.3239 + pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) { 9.3240 + ptes = pmap_map_ptes(pve->pv_pmap); 9.3241 + pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */ 9.3242 + pmap_unmap_ptes(pve->pv_pmap); 9.3243 + *myattrs |= pte; 9.3244 + } 9.3245 + 9.3246 + /* 9.3247 + * note that we will exit the for loop with a non-null pve if 9.3248 + * we have found the bits we are testing for. 9.3249 + */ 9.3250 + 9.3251 + simple_unlock(&pvh->pvh_lock); 9.3252 + PMAP_HEAD_TO_MAP_UNLOCK(); 9.3253 + return((*myattrs & testbits) != 0); 9.3254 +} 9.3255 + 9.3256 +/* 9.3257 + * pmap_clear_attrs: clear the specified attribute for a page. 9.3258 + * 9.3259 + * => we set pv_head => pmap locking 9.3260 + * => we return TRUE if we cleared one of the bits we were asked to 9.3261 + */ 9.3262 + 9.3263 +boolean_t 9.3264 +pmap_clear_attrs(pg, clearbits) 9.3265 + struct vm_page *pg; 9.3266 + int clearbits; 9.3267 +{ 9.3268 + struct vm_page_md *mdpg; 9.3269 + u_int32_t result; 9.3270 + struct pv_head *pvh; 9.3271 + struct pv_entry *pve; 9.3272 + pt_entry_t *ptes, opte; 9.3273 + pt_entry_t *maptp; 9.3274 + int *myattrs; 9.3275 + int32_t cpumask = 0; 9.3276 + 9.3277 +#ifdef DIAGNOSTIC 9.3278 + int bank, off; 9.3279 + 9.3280 + bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 9.3281 + if (bank == -1) 9.3282 + panic("pmap_change_attrs: unmanaged page?"); 9.3283 +#endif 9.3284 + mdpg = &pg->mdpage; 9.3285 + 9.3286 + PMAP_HEAD_TO_MAP_LOCK(); 9.3287 + pvh = &mdpg->mp_pvhead; 9.3288 + /* XXX: needed if we hold head->map lock? 
*/ 9.3289 + simple_lock(&pvh->pvh_lock); 9.3290 + 9.3291 + myattrs = &mdpg->mp_attrs; 9.3292 + result = *myattrs & clearbits; 9.3293 + *myattrs &= ~clearbits; 9.3294 + 9.3295 + SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) { 9.3296 +#ifdef DIAGNOSTIC 9.3297 + if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])) 9.3298 + panic("pmap_change_attrs: mapping without PTP " 9.3299 + "detected"); 9.3300 +#endif 9.3301 + 9.3302 + ptes = pmap_map_ptes(pve->pv_pmap); /* locks pmap */ 9.3303 + opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); 9.3304 + if (opte & clearbits) { 9.3305 + /* We need to do something */ 9.3306 + if (clearbits == PG_RW) { 9.3307 + result |= PG_RW; 9.3308 + 9.3309 + /* 9.3310 + * On write protect we might not need to flush 9.3311 + * the TLB 9.3312 + */ 9.3313 + 9.3314 + /* First zap the RW bit! */ 9.3315 + maptp = (pt_entry_t *)vtomach( 9.3316 + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); 9.3317 + PTE_ATOMIC_CLEARBITS( 9.3318 + &ptes[x86_btop(pve->pv_va)], 9.3319 + maptp, PG_RW); 9.3320 + opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); 9.3321 + 9.3322 + /* 9.3323 + * Then test if it is not cached as RW the TLB 9.3324 + */ 9.3325 + if (!(opte & PG_M)) 9.3326 + goto no_tlb_shootdown; 9.3327 + } 9.3328 + 9.3329 + /* 9.3330 + * Since we need a shootdown me might as well 9.3331 + * always clear PG_U AND PG_M. 9.3332 + */ 9.3333 + 9.3334 + /* zap! */ 9.3335 + maptp = (pt_entry_t *)vtomach( 9.3336 + (vaddr_t)&ptes[x86_btop(pve->pv_va)]); 9.3337 + PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp, 9.3338 + (opte & ~(PG_U | PG_M)), opte); 9.3339 + 9.3340 + result |= (opte & clearbits); 9.3341 + *myattrs |= (opte & ~(clearbits)); 9.3342 + 9.3343 + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, 9.3344 + &cpumask); 9.3345 + } 9.3346 +no_tlb_shootdown: 9.3347 + pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */ 9.3348 + } 9.3349 + 9.3350 + simple_unlock(&pvh->pvh_lock); 9.3351 + PMAP_HEAD_TO_MAP_UNLOCK(); 9.3352 + 9.3353 + pmap_tlb_shootnow(cpumask); 9.3354 + return(result != 0); 9.3355 +} 9.3356 + 9.3357 + 9.3358 +/* 9.3359 + * p m a p p r o t e c t i o n f u n c t i o n s 9.3360 + */ 9.3361 + 9.3362 +/* 9.3363 + * pmap_page_protect: change the protection of all recorded mappings 9.3364 + * of a managed page 9.3365 + * 9.3366 + * => NOTE: this is an inline function in pmap.h 9.3367 + */ 9.3368 + 9.3369 +/* see pmap.h */ 9.3370 + 9.3371 +/* 9.3372 + * pmap_protect: set the protection in of the pages in a pmap 9.3373 + * 9.3374 + * => NOTE: this is an inline function in pmap.h 9.3375 + */ 9.3376 + 9.3377 +/* see pmap.h */ 9.3378 + 9.3379 +/* 9.3380 + * pmap_write_protect: write-protect pages in a pmap 9.3381 + */ 9.3382 + 9.3383 +void 9.3384 +pmap_write_protect(pmap, sva, eva, prot) 9.3385 + struct pmap *pmap; 9.3386 + vaddr_t sva, eva; 9.3387 + vm_prot_t prot; 9.3388 +{ 9.3389 + pt_entry_t *ptes, *epte; 9.3390 + pt_entry_t *maptp; 9.3391 +#ifndef XEN 9.3392 + volatile 9.3393 +#endif 9.3394 + pt_entry_t *spte; 9.3395 + vaddr_t blockend; 9.3396 + int32_t cpumask = 0; 9.3397 + 9.3398 + ptes = pmap_map_ptes(pmap); /* locks pmap */ 9.3399 + 9.3400 + /* should be ok, but just in case ... */ 9.3401 + sva &= PG_FRAME; 9.3402 + eva &= PG_FRAME; 9.3403 + 9.3404 + for (/* null */ ; sva < eva ; sva = blockend) { 9.3405 + 9.3406 + blockend = (sva & PD_MASK) + NBPD; 9.3407 + if (blockend > eva) 9.3408 + blockend = eva; 9.3409 + 9.3410 + /* 9.3411 + * XXXCDC: our PTE mappings should never be write-protected! 9.3412 + * 9.3413 + * long term solution is to move the PTEs out of user 9.3414 + * address space. 
and into kernel address space (up 9.3415 + * with APTE). then we can set VM_MAXUSER_ADDRESS to 9.3416 + * be VM_MAX_ADDRESS. 9.3417 + */ 9.3418 + 9.3419 + /* XXXCDC: ugly hack to avoid freeing PDP here */ 9.3420 + if (pdei(sva) == PDSLOT_PTE) 9.3421 + continue; 9.3422 + 9.3423 + /* empty block? */ 9.3424 + if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) 9.3425 + continue; 9.3426 + 9.3427 +#ifdef DIAGNOSTIC 9.3428 + if (sva >= VM_MAXUSER_ADDRESS && 9.3429 + sva < VM_MAX_ADDRESS) 9.3430 + panic("pmap_write_protect: PTE space"); 9.3431 +#endif 9.3432 + 9.3433 + spte = &ptes[x86_btop(sva)]; 9.3434 + epte = &ptes[x86_btop(blockend)]; 9.3435 + 9.3436 + for (/*null */; spte < epte ; spte++) { 9.3437 + if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) { 9.3438 + maptp = (pt_entry_t *)vtomach((vaddr_t)spte); 9.3439 + PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW); 9.3440 + if (PTE_GET(spte) & PG_M) 9.3441 + pmap_tlb_shootdown(pmap, 9.3442 + x86_ptob(spte - ptes), 9.3443 + PTE_GET(spte), &cpumask); 9.3444 + } 9.3445 + } 9.3446 + } 9.3447 + 9.3448 + /* 9.3449 + * if we kept a removal record and removed some pages update the TLB 9.3450 + */ 9.3451 + 9.3452 + pmap_tlb_shootnow(cpumask); 9.3453 + pmap_unmap_ptes(pmap); /* unlocks pmap */ 9.3454 +} 9.3455 + 9.3456 +/* 9.3457 + * end of protection functions 9.3458 + */ 9.3459 + 9.3460 +/* 9.3461 + * pmap_unwire: clear the wired bit in the PTE 9.3462 + * 9.3463 + * => mapping should already be in map 9.3464 + */ 9.3465 + 9.3466 +void 9.3467 +pmap_unwire(pmap, va) 9.3468 + struct pmap *pmap; 9.3469 + vaddr_t va; 9.3470 +{ 9.3471 + pt_entry_t *ptes; 9.3472 + pt_entry_t *maptp; 9.3473 + 9.3474 + if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) { 9.3475 + ptes = pmap_map_ptes(pmap); /* locks pmap */ 9.3476 + 9.3477 +#ifdef DIAGNOSTIC 9.3478 + if (!pmap_valid_entry(ptes[x86_btop(va)])) 9.3479 + panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 9.3480 +#endif 9.3481 + if ((ptes[x86_btop(va)] & PG_W) != 0) { 9.3482 + maptp = (pt_entry_t *)vtomach( 9.3483 + (vaddr_t)&ptes[x86_btop(va)]); 9.3484 + PTE_ATOMIC_CLEARBITS(&ptes[x86_btop(va)], maptp, PG_W); 9.3485 + pmap->pm_stats.wired_count--; 9.3486 + } 9.3487 +#ifdef DIAGNOSTIC 9.3488 + else { 9.3489 + printf("pmap_unwire: wiring for pmap %p va 0x%lx " 9.3490 + "didn't change!\n", pmap, va); 9.3491 + } 9.3492 +#endif 9.3493 + pmap_unmap_ptes(pmap); /* unlocks map */ 9.3494 + } 9.3495 +#ifdef DIAGNOSTIC 9.3496 + else { 9.3497 + panic("pmap_unwire: invalid PDE"); 9.3498 + } 9.3499 +#endif 9.3500 +} 9.3501 + 9.3502 +/* 9.3503 + * pmap_collect: free resources held by a pmap 9.3504 + * 9.3505 + * => optional function. 9.3506 + * => called when a process is swapped out to free memory. 9.3507 + */ 9.3508 + 9.3509 +void 9.3510 +pmap_collect(pmap) 9.3511 + struct pmap *pmap; 9.3512 +{ 9.3513 + /* 9.3514 + * free all of the pt pages by removing the physical mappings 9.3515 + * for its entire address space. 9.3516 + */ 9.3517 + 9.3518 + pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS, 9.3519 + PMAP_REMOVE_SKIPWIRED); 9.3520 +} 9.3521 + 9.3522 +/* 9.3523 + * pmap_copy: copy mappings from one pmap to another 9.3524 + * 9.3525 + * => optional function 9.3526 + * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 9.3527 + */ 9.3528 + 9.3529 +/* 9.3530 + * defined as macro in pmap.h 9.3531 + */ 9.3532 + 9.3533 +/* 9.3534 + * pmap_enter: enter a mapping into a pmap 9.3535 + * 9.3536 + * => must be done "now" ... 
no lazy-evaluation 9.3537 + * => we set pmap => pv_head locking 9.3538 + */ 9.3539 + 9.3540 +int 9.3541 +pmap_enter(pmap, va, pa, prot, flags) 9.3542 + struct pmap *pmap; 9.3543 + vaddr_t va; 9.3544 + paddr_t pa; 9.3545 + vm_prot_t prot; 9.3546 + int flags; 9.3547 +{ 9.3548 + pt_entry_t *ptes, opte, npte; 9.3549 + struct vm_page *ptp, *pg; 9.3550 + struct vm_page_md *mdpg; 9.3551 + struct pv_head *old_pvh, *new_pvh; 9.3552 + struct pv_entry *pve = NULL; /* XXX gcc */ 9.3553 + int error; 9.3554 + boolean_t wired = (flags & PMAP_WIRED) != 0; 9.3555 + pt_entry_t *maptp; 9.3556 + 9.3557 + XENPRINTK(("pmap_enter(%p, %p, %p, %08x, %08x)\n", 9.3558 + pmap, (void *)va, (void *)pa, prot, flags)); 9.3559 + 9.3560 +#ifdef DIAGNOSTIC 9.3561 + /* sanity check: totally out of range? */ 9.3562 + if (va >= VM_MAX_KERNEL_ADDRESS) 9.3563 + panic("pmap_enter: too big"); 9.3564 + 9.3565 + if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 9.3566 + panic("pmap_enter: trying to map over PDP/APDP!"); 9.3567 + 9.3568 + /* sanity check: kernel PTPs should already have been pre-allocated */ 9.3569 + if (va >= VM_MIN_KERNEL_ADDRESS && 9.3570 + !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) 9.3571 + panic("pmap_enter: missing kernel PTP!"); 9.3572 +#endif 9.3573 + 9.3574 + npte = protection_codes[prot] | PG_V; 9.3575 + 9.3576 + if (pa >= pmap_pa_start && pa < pmap_pa_end) 9.3577 + npte |= xpmap_ptom(pa); 9.3578 + else { 9.3579 + XENPRINTF(("pmap_enter: va %08lx outside pa range %08lx\n", 9.3580 + va, pa)); 9.3581 + npte |= pa; 9.3582 + } 9.3583 + 9.3584 + /* XENPRINTK(("npte %p\n", npte)); */ 9.3585 + 9.3586 + if (wired) 9.3587 + npte |= PG_W; 9.3588 + 9.3589 + if (va < VM_MAXUSER_ADDRESS) 9.3590 + npte |= PG_u; 9.3591 + else if (va < VM_MAX_ADDRESS) 9.3592 + npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 9.3593 + if (pmap == pmap_kernel()) 9.3594 + npte |= pmap_pg_g; 9.3595 + 9.3596 + /* get lock */ 9.3597 + PMAP_MAP_TO_HEAD_LOCK(); 9.3598 + 9.3599 + ptes = pmap_map_ptes(pmap); /* locks pmap */ 9.3600 + if (pmap == pmap_kernel()) { 9.3601 + ptp = NULL; 9.3602 + } else { 9.3603 + ptp = pmap_get_ptp(pmap, pdei(va)); 9.3604 + if (ptp == NULL) { 9.3605 + if (flags & PMAP_CANFAIL) { 9.3606 + error = ENOMEM; 9.3607 + goto out; 9.3608 + } 9.3609 + panic("pmap_enter: get ptp failed"); 9.3610 + } 9.3611 + } 9.3612 + 9.3613 + /* 9.3614 + * Get first view on old PTE 9.3615 + * on SMP the PTE might gain PG_U and PG_M flags 9.3616 + * before we zap it later 9.3617 + */ 9.3618 + opte = pte_get(&ptes[x86_btop(va)]); /* old PTE */ 9.3619 + XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", 9.3620 + (void *)npte, (void *)opte, ptes, x86_btop(va))); 9.3621 + 9.3622 + /* 9.3623 + * is there currently a valid mapping at our VA and does it 9.3624 + * map to the same PA as the one we want to map ? 9.3625 + */ 9.3626 + 9.3627 + if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { 9.3628 + 9.3629 + /* 9.3630 + * first, calculate pm_stats updates. resident count will not 9.3631 + * change since we are replacing/changing a valid mapping. 9.3632 + * wired count might change... 9.3633 + */ 9.3634 + pmap->pm_stats.wired_count += 9.3635 + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); 9.3636 + 9.3637 + npte |= (opte & PG_PVLIST); 9.3638 + 9.3639 + XENPRINTK(("pmap update opte == pa")); 9.3640 + /* zap! 
*/ 9.3641 + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); 9.3642 + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte); 9.3643 + 9.3644 + /* 9.3645 + * Any change in the protection level that the CPU 9.3646 + * should know about ? 9.3647 + */ 9.3648 + if ((npte & PG_RW) 9.3649 + || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) { 9.3650 + XENPRINTK(("pmap update opte == pa, prot change")); 9.3651 + /* 9.3652 + * No need to flush the TLB. 9.3653 + * Just add old PG_M, ... flags in new entry. 9.3654 + */ 9.3655 + PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp, 9.3656 + opte & (PG_M | PG_U)); 9.3657 + goto out_ok; 9.3658 + } 9.3659 + 9.3660 + /* 9.3661 + * Might be cached in the TLB as being writable 9.3662 + * if this is on the PVLIST, sync R/M bit 9.3663 + */ 9.3664 + if (opte & PG_PVLIST) { 9.3665 + pg = PHYS_TO_VM_PAGE(pa); 9.3666 +#ifdef DIAGNOSTIC 9.3667 + if (pg == NULL) 9.3668 + panic("pmap_enter: same pa PG_PVLIST " 9.3669 + "mapping with unmanaged page " 9.3670 + "pa = 0x%lx (0x%lx)", pa, 9.3671 + atop(pa)); 9.3672 +#endif 9.3673 + mdpg = &pg->mdpage; 9.3674 + old_pvh = &mdpg->mp_pvhead; 9.3675 + simple_lock(&old_pvh->pvh_lock); 9.3676 + mdpg->mp_attrs |= opte; 9.3677 + simple_unlock(&old_pvh->pvh_lock); 9.3678 + } 9.3679 + goto shootdown_now; 9.3680 + } 9.3681 + 9.3682 + pg = PHYS_TO_VM_PAGE(pa); 9.3683 + XENPRINTK(("pg %p from %p, init %d\n", pg, (void *)pa, 9.3684 + pmap_initialized)); 9.3685 + if (pmap_initialized && pg != NULL) { 9.3686 + /* This is a managed page */ 9.3687 + npte |= PG_PVLIST; 9.3688 + mdpg = &pg->mdpage; 9.3689 + new_pvh = &mdpg->mp_pvhead; 9.3690 + if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) { 9.3691 + /* We can not steal a pve - allocate one */ 9.3692 + pve = pmap_alloc_pv(pmap, ALLOCPV_NEED); 9.3693 + if (pve == NULL) { 9.3694 + if (!(flags & PMAP_CANFAIL)) 9.3695 + panic("pmap_enter: " 9.3696 + "no pv entries available"); 9.3697 + error = ENOMEM; 9.3698 + goto out; 9.3699 + } 9.3700 + } 9.3701 + } else { 9.3702 + new_pvh = NULL; 9.3703 + } 9.3704 + 9.3705 + /* 9.3706 + * is there currently a valid mapping at our VA? 9.3707 + */ 9.3708 + 9.3709 + if (pmap_valid_entry(opte)) { 9.3710 + 9.3711 + /* 9.3712 + * changing PAs: we must remove the old one first 9.3713 + */ 9.3714 + 9.3715 + /* 9.3716 + * first, calculate pm_stats updates. resident count will not 9.3717 + * change since we are replacing/changing a valid mapping. 9.3718 + * wired count might change... 9.3719 + */ 9.3720 + pmap->pm_stats.wired_count += 9.3721 + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); 9.3722 + 9.3723 + if (opte & PG_PVLIST) { 9.3724 + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 9.3725 +#ifdef DIAGNOSTIC 9.3726 + if (pg == NULL) 9.3727 + panic("pmap_enter: PG_PVLIST mapping with " 9.3728 + "unmanaged page " 9.3729 + "pa = 0x%lx (0x%lx)", pa, atop(pa)); 9.3730 +#endif 9.3731 + mdpg = &pg->mdpage; 9.3732 + old_pvh = &mdpg->mp_pvhead; 9.3733 + 9.3734 + /* new_pvh is NULL if page will not be managed */ 9.3735 + pmap_lock_pvhs(old_pvh, new_pvh); 9.3736 + 9.3737 + XENPRINTK(("pmap change pa")); 9.3738 + /* zap! 
*/ 9.3739 + maptp = (pt_entry_t *)vtomach( 9.3740 + (vaddr_t)&ptes[x86_btop(va)]); 9.3741 + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, 9.3742 + npte); 9.3743 + 9.3744 + pve = pmap_remove_pv(old_pvh, pmap, va); 9.3745 + KASSERT(pve != 0); 9.3746 + mdpg->mp_attrs |= opte; 9.3747 + 9.3748 + if (new_pvh) { 9.3749 + pmap_enter_pv(new_pvh, pve, pmap, va, ptp); 9.3750 + simple_unlock(&new_pvh->pvh_lock); 9.3751 + } else 9.3752 + pmap_free_pv(pmap, pve); 9.3753 + simple_unlock(&old_pvh->pvh_lock); 9.3754 + 9.3755 + goto shootdown_test; 9.3756 + } 9.3757 + } else { /* opte not valid */ 9.3758 + pmap->pm_stats.resident_count++; 9.3759 + if (wired) 9.3760 + pmap->pm_stats.wired_count++; 9.3761 + if (ptp) 9.3762 + ptp->wire_count++; 9.3763 + } 9.3764 + 9.3765 + if (new_pvh) { 9.3766 + simple_lock(&new_pvh->pvh_lock); 9.3767 + pmap_enter_pv(new_pvh, pve, pmap, va, ptp); 9.3768 + simple_unlock(&new_pvh->pvh_lock); 9.3769 + } 9.3770 + 9.3771 + XENPRINTK(("pmap initial setup\n")); 9.3772 + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); 9.3773 + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], 9.3774 + maptp, npte); /* zap! */ 9.3775 + 9.3776 +shootdown_test: 9.3777 + /* Update page attributes if needed */ 9.3778 + if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 9.3779 +#if defined(MULTIPROCESSOR) 9.3780 + int32_t cpumask = 0; 9.3781 +#endif 9.3782 +shootdown_now: 9.3783 +#if defined(MULTIPROCESSOR) 9.3784 + pmap_tlb_shootdown(pmap, va, opte, &cpumask); 9.3785 + pmap_tlb_shootnow(cpumask); 9.3786 +#else 9.3787 + /* Don't bother deferring in the single CPU case. */ 9.3788 + if (pmap_is_curpmap(pmap)) 9.3789 + pmap_update_pg(va); 9.3790 +#endif 9.3791 + } 9.3792 + 9.3793 +out_ok: 9.3794 + error = 0; 9.3795 + 9.3796 +out: 9.3797 + pmap_unmap_ptes(pmap); 9.3798 + PMAP_MAP_TO_HEAD_UNLOCK(); 9.3799 + 9.3800 + XENPRINTK(("pmap_enter: %d\n", error)); 9.3801 + return error; 9.3802 +} 9.3803 + 9.3804 +/* 9.3805 + * pmap_enter_ma: enter a mapping into a pmap 9.3806 + * 9.3807 + * => must be done "now" ... no lazy-evaluation 9.3808 + * => we set pmap => pv_head locking 9.3809 + */ 9.3810 + 9.3811 +int 9.3812 +pmap_enter_ma(pmap, va, pa, prot, flags) 9.3813 + struct pmap *pmap; 9.3814 + vaddr_t va; 9.3815 + paddr_t pa; 9.3816 + vm_prot_t prot; 9.3817 + int flags; 9.3818 +{ 9.3819 + pt_entry_t *ptes, opte, npte; 9.3820 + pt_entry_t *maptp; 9.3821 + struct vm_page *ptp, *pg; 9.3822 + struct vm_page_md *mdpg; 9.3823 + struct pv_head *old_pvh; 9.3824 + struct pv_entry *pve = NULL; /* XXX gcc */ 9.3825 + int error; 9.3826 + boolean_t wired = (flags & PMAP_WIRED) != 0; 9.3827 + 9.3828 + XENPRINTK(("pmap_enter_ma(%p, %p, %p, %08x, %08x)\n", 9.3829 + pmap, (void *)va, (void *)pa, prot, flags)); 9.3830 + 9.3831 +#ifdef DIAGNOSTIC 9.3832 + /* sanity check: totally out of range? 
*/ 9.3833 + if (va >= VM_MAX_KERNEL_ADDRESS) 9.3834 + panic("pmap_enter: too big"); 9.3835 + 9.3836 + if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 9.3837 + panic("pmap_enter: trying to map over PDP/APDP!"); 9.3838 + 9.3839 + /* sanity check: kernel PTPs should already have been pre-allocated */ 9.3840 + if (va >= VM_MIN_KERNEL_ADDRESS && 9.3841 + !pmap_valid_entry(pmap->pm_pdir[pdei(va)])) 9.3842 + panic("pmap_enter: missing kernel PTP!"); 9.3843 +#endif 9.3844 + 9.3845 + npte = pa | protection_codes[prot] | PG_V; 9.3846 + /* XENPRINTK(("npte %p\n", npte)); */ 9.3847 + 9.3848 + if (wired) 9.3849 + npte |= PG_W; 9.3850 + 9.3851 + if (va < VM_MAXUSER_ADDRESS) 9.3852 + npte |= PG_u; 9.3853 + else if (va < VM_MAX_ADDRESS) 9.3854 + npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 9.3855 + if (pmap == pmap_kernel()) 9.3856 + npte |= pmap_pg_g; 9.3857 + 9.3858 + /* get lock */ 9.3859 + PMAP_MAP_TO_HEAD_LOCK(); 9.3860 + 9.3861 + ptes = pmap_map_ptes(pmap); /* locks pmap */ 9.3862 + if (pmap == pmap_kernel()) { 9.3863 + ptp = NULL; 9.3864 + } else { 9.3865 + ptp = pmap_get_ptp(pmap, pdei(va)); 9.3866 + if (ptp == NULL) { 9.3867 + if (flags & PMAP_CANFAIL) { 9.3868 + error = ENOMEM; 9.3869 + goto out; 9.3870 + } 9.3871 + panic("pmap_enter: get ptp failed"); 9.3872 + } 9.3873 + } 9.3874 + 9.3875 + /* 9.3876 + * Get first view on old PTE 9.3877 + * on SMP the PTE might gain PG_U and PG_M flags 9.3878 + * before we zap it later 9.3879 + */ 9.3880 + opte = pte_get_ma(&ptes[x86_btop(va)]); /* old PTE */ 9.3881 + XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", 9.3882 + (void *)npte, (void *)opte, ptes, x86_btop(va))); 9.3883 + XENPRINTF(("pmap_enter_ma pa %08lx va %08lx opte %08x npte %08x " 9.3884 + "wired %d count %ld\n", pa, va, opte, npte, wired, 9.3885 + pmap->pm_stats.wired_count)); 9.3886 + 9.3887 + /* 9.3888 + * is there currently a valid mapping at our VA and does it 9.3889 + * map to the same MA as the one we want to map ? 9.3890 + */ 9.3891 + 9.3892 + if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { 9.3893 + 9.3894 + /* 9.3895 + * first, calculate pm_stats updates. resident count will not 9.3896 + * change since we are replacing/changing a valid mapping. 9.3897 + * wired count might change... 9.3898 + */ 9.3899 + pmap->pm_stats.wired_count += 9.3900 + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); 9.3901 + 9.3902 + XENPRINTK(("pmap update opte == pa")); 9.3903 + /* zap! */ 9.3904 + maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]); 9.3905 + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte); 9.3906 + 9.3907 + /* 9.3908 + * Any change in the protection level that the CPU 9.3909 + * should know about ? 9.3910 + */ 9.3911 + if ((npte & PG_RW) 9.3912 + || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) { 9.3913 + XENPRINTK(("pmap update opte == pa, prot change")); 9.3914 + /* 9.3915 + * No need to flush the TLB. 9.3916 + * Just add old PG_M, ... flags in new entry. 9.3917 + */ 9.3918 + PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp, 9.3919 + opte & (PG_M | PG_U)); 9.3920 + goto out_ok; 9.3921 + } 9.3922 + 9.3923 + /* 9.3924 + * Might be cached in the TLB as being writable 9.3925 + * if this is on the PVLIST, sync R/M bit 9.3926 + */ 9.3927 + KDASSERT((opte & PG_PVLIST) == 0); 9.3928 + goto shootdown_now; 9.3929 + } 9.3930 + 9.3931 + /* 9.3932 + * no managed mapping for pages mapped through pmap_enter_ma. 9.3933 + */ 9.3934 + 9.3935 + /* 9.3936 + * is there currently a valid mapping at our VA? 
9.3937 + */ 9.3938 + 9.3939 + if (pmap_valid_entry(opte)) { 9.3940 + 9.3941 + /* 9.3942 + * changing PAs: we must remove the old one first 9.3943 + */ 9.3944 + 9.3945 + /* 9.3946 + * first, calculate pm_stats updates. resident count will not 9.3947 + * change since we are replacing/changing a valid mapping. 9.3948 + * wired count might change... 9.3949 + */ 9.3950 + pmap->pm_stats.wired_count += 9.3951 + ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); 9.3952 + 9.3953 + if (opte & PG_PVLIST) { 9.3954 + opte = xpmap_mtop(opte); 9.3955 + KDASSERT((opte & PG_FRAME) != 9.3956 + (KERNTEXTOFF - KERNBASE_LOCORE)); 9.3957 + 9.3958 + pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 9.3959 +#ifdef DIAGNOSTIC 9.3960 + if (pg == NULL) 9.3961 + panic("pmap_enter: PG_PVLIST mapping with " 9.3962 + "unmanaged page " 9.3963 + "pa = 0x%lx (0x%lx)", pa, atop(pa)); 9.3964 +#endif 9.3965 + mdpg = &pg->mdpage; 9.3966 + old_pvh = &mdpg->mp_pvhead; 9.3967 + 9.3968 + /* NULL new_pvh since page will not be managed */ 9.3969 + pmap_lock_pvhs(old_pvh, NULL); 9.3970 + 9.3971 + XENPRINTK(("pmap change pa")); 9.3972 + /* zap! */ 9.3973 + maptp = (pt_entry_t *)vtomach( 9.3974 + (vaddr_t)&ptes[x86_btop(va)]); 9.3975 + opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, 9.3976 + npte);