ia64/xen-unstable

annotate xen/arch/x86/mm/shadow/common.c @ 18479:fa2adc7fb996

x86, shadow: Fix some SHADOW_PRINTK() callers.
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Sep 11 15:17:31 2008 +0100 (2008-09-11)
parents 74621a2add54
children 12f3edfab6ef
kaf24@11310 1 /******************************************************************************
kaf24@11310 2 * arch/x86/mm/shadow/common.c
kaf24@11310 3 *
kaf24@11310 4 * Shadow code that does not need to be multiply compiled.
kaf24@11310 5 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
kaf24@11310 6 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
kaf24@11310 7 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
kaf24@11310 8 *
kaf24@11310 9 * This program is free software; you can redistribute it and/or modify
kaf24@11310 10 * it under the terms of the GNU General Public License as published by
kaf24@11310 11 * the Free Software Foundation; either version 2 of the License, or
kaf24@11310 12 * (at your option) any later version.
kaf24@11310 13 *
kaf24@11310 14 * This program is distributed in the hope that it will be useful,
kaf24@11310 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
kaf24@11310 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
kaf24@11310 17 * GNU General Public License for more details.
kaf24@11310 18 *
kaf24@11310 19 * You should have received a copy of the GNU General Public License
kaf24@11310 20 * along with this program; if not, write to the Free Software
kaf24@11310 21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
kaf24@11310 22 */
kaf24@11310 23
kaf24@11310 24 #include <xen/config.h>
kaf24@11310 25 #include <xen/types.h>
kaf24@11310 26 #include <xen/mm.h>
kaf24@11310 27 #include <xen/trace.h>
kaf24@11310 28 #include <xen/sched.h>
kaf24@11310 29 #include <xen/perfc.h>
kaf24@11310 30 #include <xen/irq.h>
kaf24@11310 31 #include <xen/domain_page.h>
kaf24@11310 32 #include <xen/guest_access.h>
kaf24@11310 33 #include <xen/keyhandler.h>
kaf24@11310 34 #include <asm/event.h>
kaf24@11310 35 #include <asm/page.h>
kaf24@11310 36 #include <asm/current.h>
kaf24@11310 37 #include <asm/flushtlb.h>
kaf24@11310 38 #include <asm/shadow.h>
keir@17385 39 #include <xen/numa.h>
kaf24@11310 40 #include "private.h"
kaf24@11310 41
keir@18454 42 DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
Tim@13141 43
Tim@13141 44 /* Set up the shadow-specific parts of a domain struct at start of day.
Tim@13141 45 * Called for every domain from arch_domain_create() */
Tim@13141 46 void shadow_domain_init(struct domain *d)
Tim@13141 47 {
Tim@13141 48 int i;
Tim@13141 49 shadow_lock_init(d);
Tim@13141 50 for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
Tim@13909 51 INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
Tim@13909 52 INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
Tim@13909 53 INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
Tim@15311 54
Tim@15311 55 /* Use shadow pagetables for log-dirty support */
Tim@15311 56 paging_log_dirty_init(d, shadow_enable_log_dirty,
Tim@15311 57 shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
keir@17903 58
keir@17903 59 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 60 d->arch.paging.shadow.oos_active = 0;
keir@17903 61 #endif
Tim@13141 62 }
Tim@13141 63
Tim@13909 64 /* Set up the shadow-specific parts of a vcpu struct. Note: The most important
Tim@13909 65 * job is to initialize the update_paging_modes() function pointer, which is
Tim@13909 66 * used to initialize the rest of the resources. Therefore, it really does not
Tim@13909 67 * matter which mode v->arch.paging.mode initially points to, as long as it
Tim@13909 68 * can be compiled.
Tim@13909 69 */
Tim@13909 70 void shadow_vcpu_init(struct vcpu *v)
Tim@13909 71 {
keir@17903 72 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17984 73 int i, j;
keir@17903 74
keir@17903 75 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
keir@17905 76 {
keir@17903 77 v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
keir@17905 78 v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
keir@17984 79 for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ )
keir@17984 80 v->arch.paging.shadow.oos_fixup[i].smfn[j] = _mfn(INVALID_MFN);
keir@17905 81 }
keir@17903 82 #endif
keir@17903 83
keir@17620 84 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
Tim@13909 85 }
Tim@13141 86
kaf24@11310 87 #if SHADOW_AUDIT
kaf24@11310 88 int shadow_audit_enable = 0;
kaf24@11310 89
kaf24@11310 90 static void shadow_audit_key(unsigned char key)
kaf24@11310 91 {
kaf24@11310 92 shadow_audit_enable = !shadow_audit_enable;
kaf24@11310 93 printk("%s shadow_audit_enable=%d\n",
kaf24@11310 94 __func__, shadow_audit_enable);
kaf24@11310 95 }
kaf24@11310 96
kaf24@11310 97 static int __init shadow_audit_key_init(void)
kaf24@11310 98 {
kaf24@11310 99 register_keyhandler(
kaf24@11310 100 'O', shadow_audit_key, "toggle shadow audits");
kaf24@11310 101 return 0;
kaf24@11310 102 }
kaf24@11310 103 __initcall(shadow_audit_key_init);
kaf24@11310 104 #endif /* SHADOW_AUDIT */
kaf24@11310 105
kaf24@11310 106 int _shadow_mode_refcounts(struct domain *d)
kaf24@11310 107 {
kaf24@11310 108 return shadow_mode_refcounts(d);
kaf24@11310 109 }
kaf24@11310 110
kaf24@11310 111
kaf24@11310 112 /**************************************************************************/
kaf24@11310 113 /* x86 emulator support for the shadow code
kaf24@11310 114 */
kaf24@11310 115
keir@16661 116 struct segment_register *hvm_get_seg_reg(
kfraser@12752 117 enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt)
kfraser@12752 118 {
kfraser@12752 119 struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg];
kfraser@12752 120 if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) )
kfraser@12752 121 hvm_get_segment_register(current, seg, seg_reg);
kfraser@12752 122 return seg_reg;
kfraser@12752 123 }
kfraser@12752 124
kfraser@12696 125 static int hvm_translate_linear_addr(
kfraser@12696 126 enum x86_segment seg,
kfraser@12696 127 unsigned long offset,
kfraser@12696 128 unsigned int bytes,
kaf24@12769 129 enum hvm_access_type access_type,
kfraser@12752 130 struct sh_emulate_ctxt *sh_ctxt,
kfraser@12696 131 unsigned long *paddr)
kfraser@12696 132 {
kaf24@12769 133 struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt);
kfraser@15919 134 int okay;
kfraser@12696 135
kfraser@15919 136 okay = hvm_virtual_to_linear_addr(
kfraser@15919 137 seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr);
kfraser@12696 138
kfraser@15919 139 if ( !okay )
kfraser@12696 140 {
kfraser@15919 141 hvm_inject_exception(TRAP_gp_fault, 0, 0);
kfraser@15919 142 return X86EMUL_EXCEPTION;
kfraser@12696 143 }
kfraser@12696 144
kfraser@15919 145 return 0;
kfraser@12696 146 }
kfraser@12696 147
kaf24@11310 148 static int
kaf24@12769 149 hvm_read(enum x86_segment seg,
kaf24@12769 150 unsigned long offset,
keir@17931 151 void *p_data,
kaf24@12769 152 unsigned int bytes,
kaf24@12769 153 enum hvm_access_type access_type,
kaf24@12769 154 struct sh_emulate_ctxt *sh_ctxt)
kaf24@11310 155 {
kfraser@12696 156 unsigned long addr;
keir@16662 157 int rc;
kfraser@12752 158
kaf24@12769 159 rc = hvm_translate_linear_addr(
kaf24@12769 160 seg, offset, bytes, access_type, sh_ctxt, &addr);
kfraser@12696 161 if ( rc )
kfraser@12696 162 return rc;
kfraser@12675 163
Tim@16313 164 if ( access_type == hvm_access_insn_fetch )
keir@17931 165 rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
Tim@16313 166 else
keir@17931 167 rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
Tim@16313 168
keir@16662 169 switch ( rc )
keir@16662 170 {
keir@16662 171 case HVMCOPY_okay:
kfraser@14063 172 return X86EMUL_OKAY;
keir@16662 173 case HVMCOPY_bad_gva_to_gfn:
keir@16662 174 return X86EMUL_EXCEPTION;
keir@16662 175 default:
keir@16662 176 break;
keir@16662 177 }
keir@16662 178
keir@16662 179 return X86EMUL_UNHANDLEABLE;
kaf24@11310 180 }
kaf24@11310 181
kaf24@12895 182 static int
kaf24@12895 183 hvm_emulate_read(enum x86_segment seg,
kaf24@12895 184 unsigned long offset,
keir@17931 185 void *p_data,
kaf24@12895 186 unsigned int bytes,
kaf24@12895 187 struct x86_emulate_ctxt *ctxt)
kaf24@12895 188 {
keir@16989 189 if ( !is_x86_user_segment(seg) )
keir@16989 190 return X86EMUL_UNHANDLEABLE;
keir@17931 191 return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
kaf24@12895 192 container_of(ctxt, struct sh_emulate_ctxt, ctxt));
kaf24@12895 193 }
kaf24@12895 194
kaf24@12895 195 static int
kaf24@12895 196 hvm_emulate_insn_fetch(enum x86_segment seg,
kaf24@12895 197 unsigned long offset,
keir@17931 198 void *p_data,
kaf24@12895 199 unsigned int bytes,
kaf24@12895 200 struct x86_emulate_ctxt *ctxt)
kaf24@12895 201 {
kaf24@12895 202 struct sh_emulate_ctxt *sh_ctxt =
kaf24@12895 203 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
Tim@15254 204 unsigned int insn_off = offset - sh_ctxt->insn_buf_eip;
kaf24@12895 205
keir@16989 206 ASSERT(seg == x86_seg_cs);
keir@16989 207
kaf24@12895 208 /* Fall back if requested bytes are not in the prefetch cache. */
kaf24@12895 209 if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
keir@17931 210 return hvm_read(seg, offset, p_data, bytes,
kaf24@12895 211 hvm_access_insn_fetch, sh_ctxt);
kaf24@12895 212
kaf24@12895 213 /* Hit the cache. Simple memcpy. */
keir@17931 214 memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
kfraser@14063 215 return X86EMUL_OKAY;
kaf24@12895 216 }
kaf24@12895 217
kaf24@12895 218 static int
kaf24@12895 219 hvm_emulate_write(enum x86_segment seg,
kaf24@12895 220 unsigned long offset,
keir@17931 221 void *p_data,
kaf24@12895 222 unsigned int bytes,
kaf24@12895 223 struct x86_emulate_ctxt *ctxt)
kaf24@12895 224 {
kaf24@12895 225 struct sh_emulate_ctxt *sh_ctxt =
kaf24@12895 226 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
kaf24@12895 227 struct vcpu *v = current;
kaf24@12895 228 unsigned long addr;
kaf24@12895 229 int rc;
kaf24@12895 230
keir@16989 231 if ( !is_x86_user_segment(seg) )
keir@16989 232 return X86EMUL_UNHANDLEABLE;
keir@16989 233
kfraser@14082 234 /* How many emulations could we save if we unshadowed on stack writes? */
kfraser@14082 235 if ( seg == x86_seg_ss )
kfraser@14595 236 perfc_incr(shadow_fault_emulate_stack);
kfraser@14082 237
kaf24@12895 238 rc = hvm_translate_linear_addr(
kaf24@12895 239 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
kaf24@12895 240 if ( rc )
kaf24@12895 241 return rc;
kaf24@12895 242
Tim@13909 243 return v->arch.paging.mode->shadow.x86_emulate_write(
keir@17931 244 v, addr, p_data, bytes, sh_ctxt);
kaf24@12895 245 }
kaf24@12895 246
kaf24@12895 247 static int
kaf24@12895 248 hvm_emulate_cmpxchg(enum x86_segment seg,
kaf24@12895 249 unsigned long offset,
keir@17503 250 void *p_old,
keir@17503 251 void *p_new,
kaf24@12895 252 unsigned int bytes,
kaf24@12895 253 struct x86_emulate_ctxt *ctxt)
kaf24@12895 254 {
kaf24@12895 255 struct sh_emulate_ctxt *sh_ctxt =
kaf24@12895 256 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
kaf24@12895 257 struct vcpu *v = current;
keir@17503 258 unsigned long addr, old[2], new[2];
kaf24@12895 259 int rc;
kaf24@12895 260
keir@16989 261 if ( !is_x86_user_segment(seg) )
keir@16989 262 return X86EMUL_UNHANDLEABLE;
keir@16989 263
kaf24@12895 264 rc = hvm_translate_linear_addr(
kaf24@12895 265 seg, offset, bytes, hvm_access_write, sh_ctxt, &addr);
kaf24@12895 266 if ( rc )
kaf24@12895 267 return rc;
kaf24@12895 268
keir@17503 269 old[0] = new[0] = 0;
keir@17503 270 memcpy(old, p_old, bytes);
keir@17503 271 memcpy(new, p_new, bytes);
keir@17503 272
keir@17503 273 if ( bytes <= sizeof(long) )
keir@17503 274 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
keir@17503 275 v, addr, old[0], new[0], bytes, sh_ctxt);
keir@17503 276
keir@17503 277 #ifdef __i386__
keir@17503 278 if ( bytes == 8 )
keir@17503 279 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
keir@17503 280 v, addr, old[0], old[1], new[0], new[1], sh_ctxt);
keir@17503 281 #endif
keir@17503 282
keir@17503 283 return X86EMUL_UNHANDLEABLE;
kaf24@12895 284 }
kaf24@12895 285
kaf24@12895 286 static struct x86_emulate_ops hvm_shadow_emulator_ops = {
kaf24@12895 287 .read = hvm_emulate_read,
kaf24@12895 288 .insn_fetch = hvm_emulate_insn_fetch,
kaf24@12895 289 .write = hvm_emulate_write,
kaf24@12895 290 .cmpxchg = hvm_emulate_cmpxchg,
kaf24@12895 291 };
kaf24@12895 292
kaf24@12895 293 static int
kaf24@12895 294 pv_emulate_read(enum x86_segment seg,
kaf24@12895 295 unsigned long offset,
keir@17931 296 void *p_data,
kaf24@12895 297 unsigned int bytes,
kaf24@12895 298 struct x86_emulate_ctxt *ctxt)
kaf24@12895 299 {
kaf24@12895 300 unsigned int rc;
kaf24@12895 301
keir@16989 302 if ( !is_x86_user_segment(seg) )
keir@16989 303 return X86EMUL_UNHANDLEABLE;
keir@16989 304
keir@17931 305 if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
kaf24@12895 306 {
kaf24@12895 307 propagate_page_fault(offset + bytes - rc, 0); /* read fault */
kfraser@14063 308 return X86EMUL_EXCEPTION;
kaf24@12895 309 }
kaf24@12895 310
kfraser@14063 311 return X86EMUL_OKAY;
kaf24@12895 312 }
kaf24@12895 313
kaf24@12895 314 static int
kaf24@12895 315 pv_emulate_write(enum x86_segment seg,
kaf24@12895 316 unsigned long offset,
keir@17931 317 void *p_data,
kaf24@12895 318 unsigned int bytes,
kaf24@12895 319 struct x86_emulate_ctxt *ctxt)
kaf24@12895 320 {
kaf24@12895 321 struct sh_emulate_ctxt *sh_ctxt =
kaf24@12895 322 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
kaf24@12895 323 struct vcpu *v = current;
keir@16989 324 if ( !is_x86_user_segment(seg) )
keir@16989 325 return X86EMUL_UNHANDLEABLE;
Tim@13909 326 return v->arch.paging.mode->shadow.x86_emulate_write(
keir@17931 327 v, offset, p_data, bytes, sh_ctxt);
kaf24@12895 328 }
kaf24@12895 329
kaf24@12895 330 static int
kaf24@12895 331 pv_emulate_cmpxchg(enum x86_segment seg,
kaf24@12895 332 unsigned long offset,
keir@17503 333 void *p_old,
keir@17503 334 void *p_new,
kaf24@12895 335 unsigned int bytes,
kaf24@12895 336 struct x86_emulate_ctxt *ctxt)
kaf24@12895 337 {
kaf24@12895 338 struct sh_emulate_ctxt *sh_ctxt =
kaf24@12895 339 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
keir@17503 340 unsigned long old[2], new[2];
kaf24@12895 341 struct vcpu *v = current;
keir@17503 342
keir@16989 343 if ( !is_x86_user_segment(seg) )
keir@16989 344 return X86EMUL_UNHANDLEABLE;
keir@17503 345
keir@17503 346 old[0] = new[0] = 0;
keir@17503 347 memcpy(old, p_old, bytes);
keir@17503 348 memcpy(new, p_new, bytes);
keir@17503 349
keir@17503 350 if ( bytes <= sizeof(long) )
keir@17503 351 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
keir@17503 352 v, offset, old[0], new[0], bytes, sh_ctxt);
keir@17503 353
keir@17503 354 #ifdef __i386__
keir@17503 355 if ( bytes == 8 )
keir@17503 356 return v->arch.paging.mode->shadow.x86_emulate_cmpxchg8b(
keir@17503 357 v, offset, old[0], old[1], new[0], new[1], sh_ctxt);
keir@17503 358 #endif
keir@17503 359
keir@17503 360 return X86EMUL_UNHANDLEABLE;
kaf24@12895 361 }
kaf24@12895 362
kaf24@12895 363 static struct x86_emulate_ops pv_shadow_emulator_ops = {
kaf24@12895 364 .read = pv_emulate_read,
kaf24@12895 365 .insn_fetch = pv_emulate_read,
kaf24@12895 366 .write = pv_emulate_write,
kaf24@12895 367 .cmpxchg = pv_emulate_cmpxchg,
kaf24@12895 368 };
kaf24@12895 369
kaf24@12895 370 struct x86_emulate_ops *shadow_init_emulation(
kaf24@12895 371 struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs)
kaf24@12769 372 {
kfraser@13443 373 struct segment_register *creg, *sreg;
kaf24@12769 374 struct vcpu *v = current;
kaf24@12769 375 unsigned long addr;
kaf24@12769 376
kaf24@12769 377 sh_ctxt->ctxt.regs = regs;
keir@17098 378 sh_ctxt->ctxt.force_writeback = 0;
kaf24@12769 379
kaf24@12895 380 if ( !is_hvm_vcpu(v) )
kaf24@12895 381 {
kfraser@13443 382 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = BITS_PER_LONG;
kaf24@12895 383 return &pv_shadow_emulator_ops;
kaf24@12895 384 }
kaf24@12895 385
kaf24@12769 386 /* Segment cache initialisation. Primed with CS. */
kaf24@12769 387 sh_ctxt->valid_seg_regs = 0;
kaf24@12769 388 creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt);
kaf24@12769 389
kaf24@12769 390 /* Work out the emulation mode. */
kfraser@13444 391 if ( hvm_long_mode_enabled(v) && creg->attr.fields.l )
kfraser@13443 392 {
kfraser@13444 393 sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64;
kfraser@13443 394 }
kaf24@12769 395 else
kfraser@13443 396 {
kfraser@13443 397 sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt);
kfraser@13443 398 sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16;
kfraser@13443 399 sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16;
kfraser@13443 400 }
kaf24@12769 401
kaf24@12769 402 /* Attempt to prefetch whole instruction. */
Tim@15254 403 sh_ctxt->insn_buf_eip = regs->eip;
kaf24@12769 404 sh_ctxt->insn_buf_bytes =
kaf24@12769 405 (!hvm_translate_linear_addr(
kaf24@12769 406 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
kaf24@12769 407 hvm_access_insn_fetch, sh_ctxt, &addr) &&
keir@16662 408 !hvm_fetch_from_guest_virt_nofault(
keir@17343 409 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
kaf24@12769 410 ? sizeof(sh_ctxt->insn_buf) : 0;
kaf24@12895 411
kaf24@12895 412 return &hvm_shadow_emulator_ops;
kaf24@11310 413 }
kaf24@11310 414
Tim@15254 415 /* Update an initialized emulation context to prepare for the next
Tim@15254 416 * instruction */
Tim@15254 417 void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt,
Tim@15254 418 struct cpu_user_regs *regs)
Tim@15254 419 {
Tim@15254 420 struct vcpu *v = current;
Tim@15254 421 unsigned long addr, diff;
Tim@15254 422
Tim@15254 423 /* We don't refetch the segment bases, because we don't emulate
Tim@15254 424 * writes to segment registers */
Tim@15254 425
Tim@15254 426 if ( is_hvm_vcpu(v) )
Tim@15254 427 {
Tim@15254 428 diff = regs->eip - sh_ctxt->insn_buf_eip;
Tim@15254 429 if ( diff > sh_ctxt->insn_buf_bytes )
Tim@15254 430 {
Tim@15254 431 /* Prefetch more bytes. */
Tim@15254 432 sh_ctxt->insn_buf_bytes =
Tim@15254 433 (!hvm_translate_linear_addr(
Tim@15254 434 x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf),
Tim@15254 435 hvm_access_insn_fetch, sh_ctxt, &addr) &&
keir@16662 436 !hvm_fetch_from_guest_virt_nofault(
keir@17343 437 sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0))
Tim@15254 438 ? sizeof(sh_ctxt->insn_buf) : 0;
Tim@15254 439 sh_ctxt->insn_buf_eip = regs->eip;
Tim@15254 440 }
Tim@15254 441 }
Tim@15254 442 }
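/* Editor's illustrative sketch (not built, and not part of this file): the
 * shape of the caller of shadow_init_emulation()/shadow_continue_emulation().
 * The real caller is sh_page_fault() in multi.c; the helper name and local
 * variables below are invented for illustration only. */
#if 0
static int sh_emulate_one_insn_sketch(struct cpu_user_regs *regs)
{
    struct sh_emulate_ctxt emul_ctxt;
    struct x86_emulate_ops *emul_ops;
    int r;

    emul_ops = shadow_init_emulation(&emul_ctxt, regs);
    r = x86_emulate(&emul_ctxt.ctxt, emul_ops);   /* drives the ops above */
    if ( r == X86EMUL_OKAY )
        /* Refresh the prefetched insn buffer before emulating further. */
        shadow_continue_emulation(&emul_ctxt, regs);
    return r;
}
#endif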
keir@17903 443
keir@17903 444
keir@17903 445 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 446 /**************************************************************************/
keir@17903 447 /* Out-of-sync shadows. */
keir@17903 448
keir@17903 449 /* From time to time, we let a shadowed pagetable page go out of sync
keir@17903 450 * with its shadow: the guest is allowed to write directly to the page,
keir@17903 451 * and those writes are not synchronously reflected in the shadow.
keir@17903 452 * This lets us avoid many emulations if the guest is writing a lot to a
keir@17903 453 * pagetable, but it relaxes a pretty important invariant in the shadow
keir@17903 454 * pagetable design. Therefore, some rules:
keir@17903 455 *
keir@17903 456 * 1. Only L1 pagetables may go out of sync: any page that is shadowed
keir@17903 457 * at a higher level must be synchronously updated. This makes
keir@17903 458 * using linear shadow pagetables much less dangerous.
keir@17903 459 * That means that: (a) unsyncing code needs to check for higher-level
keir@17903 460 * shadows, and (b) promotion code needs to resync.
keir@17903 461 *
keir@17903 462 * 2. All shadow operations on a guest page require the page to be brought
keir@17903 463 * back into sync before proceeding. This must be done under the
keir@17903 464 * shadow lock so that the page is guaranteed to remain synced until
keir@17903 465 * the operation completes.
keir@17903 466 *
keir@17903 467 * Exceptions to this rule: the pagefault and invlpg handlers may
keir@17903 468 * update only one entry on an out-of-sync page without resyncing it.
keir@17903 469 *
keir@17903 470 * 3. Operations on shadows that do not start from a guest page need to
keir@17903 471 * be aware that they may be handling an out-of-sync shadow.
keir@17903 472 *
keir@17903 473 * 4. Operations that do not normally take the shadow lock (fast-path
keir@17903 474 * #PF handler, INVLPG) must fall back to a locking, syncing version
keir@17903 475 * if they see an out-of-sync table.
keir@17903 476 *
keir@17903 477 * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
keir@17903 478 * must explicitly resync all relevant pages or update their
keir@17903 479 * shadows.
keir@17903 480 *
keir@17903 481 * Currently out-of-sync pages are listed in a simple open-addressed
keir@17903 482 * hash table with a second chance (must resist temptation to radically
keir@17903 483 * over-engineer hash tables...) The virtual address of the access
keir@17903 484 * which caused us to unsync the page is also kept in the hash table, as
keir@17903 485 * a hint for finding the writable mappings later.
keir@17903 486 *
keir@17903 487 * We keep a hash per vcpu, because we want as much as possible to do
keir@17903 488 * the re-sync on the same vcpu we did the unsync on, so the VA hint
keir@17903 489 * will be valid.
keir@17903 490 */
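/* Editor's illustrative sketch (not built, not part of this file): the
 * lookup pattern the hash-table code below follows.  A gmfn lives either in
 * its home slot (mfn modulo SHADOW_OOS_PAGES) or, after a collision, in the
 * next slot along -- the "second chance".  The helper name oos_hash_find()
 * is invented; the real code open-codes this pattern in oos_hash_remove(),
 * oos_snapshot_lookup() and sh_resync(). */
#if 0
static int oos_hash_find(struct vcpu *v, mfn_t gmfn)
{
    mfn_t *oos = v->arch.paging.shadow.oos;
    int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

    if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
        idx = (idx + 1) % SHADOW_OOS_PAGES;        /* second-chance slot */

    return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;
}
#endif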
keir@17903 491
keir@17903 492
keir@17903 493 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
keir@17903 494 static void sh_oos_audit(struct domain *d)
keir@17903 495 {
keir@17903 496 int idx, expected_idx, expected_idx_alt;
keir@17903 497 struct page_info *pg;
keir@17903 498 struct vcpu *v;
keir@17903 499
keir@17903 500 for_each_vcpu(d, v)
keir@17903 501 {
keir@17903 502 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
keir@17903 503 {
keir@17903 504 mfn_t *oos = v->arch.paging.shadow.oos;
keir@17903 505 if ( !mfn_valid(oos[idx]) )
keir@17903 506 continue;
keir@17903 507
keir@17903 508 expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
keir@17903 509 expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
keir@17903 510 if ( idx != expected_idx && idx != expected_idx_alt )
keir@17903 511 {
keir@17903 512 printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
keir@17903 513 __func__, idx, mfn_x(oos[idx]),
keir@17903 514 expected_idx, expected_idx_alt);
keir@17903 515 BUG();
keir@17903 516 }
keir@17903 517 pg = mfn_to_page(oos[idx]);
keir@17903 518 if ( !(pg->count_info & PGC_page_table) )
keir@17903 519 {
keir@17903 520 printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
keir@17903 521 __func__, idx, mfn_x(oos[idx]), pg->count_info);
keir@17903 522 BUG();
keir@17903 523 }
keir@17903 524 if ( !(pg->shadow_flags & SHF_out_of_sync) )
keir@17903 525 {
keir@17903 526 printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
keir@17903 527 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
keir@17903 528 BUG();
keir@17903 529 }
keir@17903 530 if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
keir@17903 531 {
keir@17903 532 printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
keir@17903 533 __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
keir@17903 534 BUG();
keir@17903 535 }
keir@17903 536 }
keir@17903 537 }
keir@17903 538 }
keir@17903 539 #endif
keir@17903 540
keir@17903 541 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
keir@17903 542 void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
keir@17903 543 {
keir@17903 544 int idx;
keir@17903 545 struct vcpu *v;
keir@17903 546 mfn_t *oos;
keir@17903 547
keir@17903 548 ASSERT(mfn_is_out_of_sync(gmfn));
keir@17903 549
keir@17903 550 for_each_vcpu(d, v)
keir@17903 551 {
keir@17903 552 oos = v->arch.paging.shadow.oos;
keir@17903 553 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
keir@17903 554 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
keir@17903 555 idx = (idx + 1) % SHADOW_OOS_PAGES;
keir@17903 556
keir@17903 557 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
keir@17903 558 return;
keir@17903 559 }
keir@17903 560
keir@17903 561 SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
keir@17903 562 BUG();
keir@17903 563 }
keir@17903 564 #endif
keir@17903 565
keir@17903 566 /* Update the shadow, but keep the page out of sync. */
keir@17905 567 static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
keir@17903 568 {
keir@17903 569 struct page_info *pg = mfn_to_page(gmfn);
keir@17903 570
keir@17903 571 ASSERT(mfn_valid(gmfn));
keir@17903 572 ASSERT(page_is_out_of_sync(pg));
keir@17903 573
keir@17903 574 /* Call out to the appropriate per-mode resyncing function */
keir@17903 575 if ( pg->shadow_flags & SHF_L1_32 )
keir@17905 576 SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
keir@17903 577 else if ( pg->shadow_flags & SHF_L1_PAE )
keir@17905 578 SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
keir@17903 579 #if CONFIG_PAGING_LEVELS >= 4
keir@17903 580 else if ( pg->shadow_flags & SHF_L1_64 )
keir@17905 581 SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
keir@17903 582 #endif
keir@17903 583 }
keir@17903 584
keir@17984 585
keir@17984 586 /*
keir@17984 587 * Fixup arrays: We limit the maximum number of writable mappings to
keir@17984 588 * SHADOW_OOS_FIXUPS and store enough information to remove them
keir@17984 589 * quickly on resync.
keir@17984 590 */
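/* Editor's note (illustrative, not part of this file): the layout of struct
 * oos_fixup, as inferred from its use below -- the real definition lives in
 * the arch header.  Each entry is a small ring of (shadow l1 mfn, entry
 * offset) pairs recording where writable mappings of the OOS page were left,
 * so they can be shot down quickly at resync time:
 *
 *     struct oos_fixup {
 *         int next;                             // next ring slot to (re)use
 *         mfn_t smfn[SHADOW_OOS_FIXUPS];        // shadow l1 with the mapping
 *         unsigned long off[SHADOW_OOS_FIXUPS]; // offset of the sl1e in it
 *     };
 */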
keir@17984 591
keir@17984 592 static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn,
keir@17984 593 struct oos_fixup *fixup)
keir@17984 594 {
keir@17984 595 int i;
keir@17984 596 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
keir@17984 597 {
keir@17984 598 if ( mfn_x(fixup->smfn[i]) != INVALID_MFN )
keir@17984 599 {
keir@17984 600 sh_remove_write_access_from_sl1p(v, gmfn,
keir@17984 601 fixup->smfn[i],
keir@17984 602 fixup->off[i]);
keir@17984 603 fixup->smfn[i] = _mfn(INVALID_MFN);
keir@17984 604 }
keir@17984 605 }
keir@17984 606
keir@17984 607 /* Always flush the TLBs. See comment on oos_fixup_add(). */
keir@17984 608 return 1;
keir@17984 609 }
keir@17904 610
keir@17904 611 void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
keir@17984 612 mfn_t smfn, unsigned long off)
keir@17904 613 {
keir@17984 614 int idx, next;
keir@17984 615 mfn_t *oos;
keir@17984 616 struct oos_fixup *oos_fixup;
keir@17984 617 struct domain *d = v->domain;
keir@17984 618
keir@17984 619 perfc_incr(shadow_oos_fixup_add);
keir@17984 620
keir@17984 621 for_each_vcpu(d, v)
keir@17904 622 {
keir@17984 623 oos = v->arch.paging.shadow.oos;
keir@17984 624 oos_fixup = v->arch.paging.shadow.oos_fixup;
keir@17984 625 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
keir@17984 626 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
keir@17984 627 idx = (idx + 1) % SHADOW_OOS_PAGES;
keir@17984 628 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
keir@17904 629 {
keir@17984 630 next = oos_fixup[idx].next;
keir@17984 631
keir@17984 632 if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
keir@17984 633 {
keir@18454 634 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
keir@18454 635
keir@17984 636 /* Reuse this slot and remove current writable mapping. */
keir@17984 637 sh_remove_write_access_from_sl1p(v, gmfn,
keir@17984 638 oos_fixup[idx].smfn[next],
keir@17984 639 oos_fixup[idx].off[next]);
keir@17984 640 perfc_incr(shadow_oos_fixup_evict);
keir@17984 641 /* We should flush the TLBs now, because we removed a
keir@17984 642 writable mapping, but since the shadow is already
keir@17984 643 OOS we have no problem if another vcpu writes to
keir@17984 644 this page table. We just have to be very careful to
keir@17984 645 *always* flush the TLBs on resync. */
keir@17984 646 }
keir@17984 647
keir@17984 648 oos_fixup[idx].smfn[next] = smfn;
keir@17984 649 oos_fixup[idx].off[next] = off;
keir@17984 650 oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
keir@18454 651
keir@18454 652 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
keir@17904 653 return;
keir@17904 654 }
keir@17904 655 }
keir@17904 656
keir@17984 657 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
keir@17984 658 BUG();
keir@17904 659 }
keir@17904 660
keir@17984 661 static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn,
keir@17984 662 struct oos_fixup *fixup)
keir@17904 663 {
keir@17904 664 int ftlb = 0;
keir@17904 665
keir@17984 666 ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup);
keir@17984 667
keir@17984 668 switch ( sh_remove_write_access(v, gmfn, 0, 0) )
keir@17904 669 {
keir@17904 670 default:
keir@17904 671 case 0:
keir@17904 672 break;
keir@17904 673
keir@17904 674 case 1:
keir@17904 675 ftlb |= 1;
keir@17904 676 break;
keir@17904 677
keir@17904 678 case -1:
keir@17904 679 /* An unfindable writeable typecount has appeared, probably via a
keir@17904 680 * grant table entry: can't shoot the mapping, so try to unshadow
keir@17904 681 * the page. If that doesn't work either, the guest is granting
keir@17904 682 * out its own pagetables and must be killed after all.
keir@17904 683 * This will flush the tlb, so we can return with no worries. */
keir@17904 684 sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
keir@17904 685 return 1;
keir@17904 686 }
keir@17904 687
keir@17904 688 if ( ftlb )
keir@17904 689 flush_tlb_mask(v->domain->domain_dirty_cpumask);
keir@17904 690
keir@17904 691 return 0;
keir@17904 692 }
keir@17904 693
keir@17904 694
keir@18454 695 static inline void trace_resync(int event, mfn_t gmfn)
keir@18454 696 {
keir@18454 697 if ( tb_init_done )
keir@18454 698 {
keir@18454 699 /* Convert gmfn to gfn */
keir@18454 700 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
keir@18454 701 __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
keir@18454 702 }
keir@18454 703 }
keir@18454 704
keir@17903 705 /* Pull all the entries on an out-of-sync page back into sync. */
keir@17984 706 static void _sh_resync(struct vcpu *v, mfn_t gmfn,
keir@17984 707 struct oos_fixup *fixup, mfn_t snp)
keir@17903 708 {
keir@17903 709 struct page_info *pg = mfn_to_page(gmfn);
keir@17903 710
keir@17903 711 ASSERT(shadow_locked_by_me(v->domain));
keir@17903 712 ASSERT(mfn_is_out_of_sync(gmfn));
keir@17903 713 /* Guest page must be shadowed *only* as L1 when out of sync. */
keir@17903 714 ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
keir@17903 715 & ~SHF_L1_ANY));
keir@17903 716 ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
keir@17903 717
keir@18479 718 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
keir@18479 719 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
keir@17903 720
keir@17904 721 /* Need to pull write access so the page *stays* in sync. */
keir@17984 722 if ( oos_remove_write_access(v, gmfn, fixup) )
keir@17903 723 {
keir@17904 724 /* Page has been unshadowed. */
keir@17903 725 return;
keir@17903 726 }
keir@17903 727
keir@17903 728 /* No more writable mappings of this page, please */
keir@17903 729 pg->shadow_flags &= ~SHF_oos_may_write;
keir@17903 730
keir@17903 731 /* Update the shadows with current guest entries. */
keir@17905 732 _sh_resync_l1(v, gmfn, snp);
keir@17903 733
keir@17903 734 /* Now we know all the entries are synced, and will stay that way */
keir@17903 735 pg->shadow_flags &= ~SHF_out_of_sync;
keir@17903 736 perfc_incr(shadow_resync);
keir@18454 737 trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
keir@17903 738 }
keir@17903 739
keir@17903 740
keir@17903 741 /* Add an MFN to the list of out-of-sync guest pagetables */
keir@17984 742 static void oos_hash_add(struct vcpu *v, mfn_t gmfn)
keir@17903 743 {
keir@17984 744 int i, idx, oidx, swap = 0;
keir@17905 745 void *gptr, *gsnpptr;
keir@17903 746 mfn_t *oos = v->arch.paging.shadow.oos;
keir@17905 747 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
keir@17984 748 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
keir@17984 749 struct oos_fixup fixup = { .next = 0 };
keir@17984 750
keir@17984 751 for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
keir@17984 752 fixup.smfn[i] = _mfn(INVALID_MFN);
keir@17903 753
keir@17903 754 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
keir@17905 755 oidx = idx;
keir@17905 756
keir@17903 757 if ( mfn_valid(oos[idx])
keir@17903 758 && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
keir@17903 759 {
keir@17903 760 /* Punt the current occupant into the next slot */
keir@17903 761 SWAP(oos[idx], gmfn);
keir@17984 762 SWAP(oos_fixup[idx], fixup);
keir@17905 763 swap = 1;
keir@17903 764 idx = (idx + 1) % SHADOW_OOS_PAGES;
keir@17903 765 }
keir@17903 766 if ( mfn_valid(oos[idx]) )
keir@17903 767 {
keir@17903 768 /* Crush the current occupant. */
keir@17984 769 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
keir@17903 770 perfc_incr(shadow_unsync_evict);
keir@17903 771 }
keir@17903 772 oos[idx] = gmfn;
keir@17984 773 oos_fixup[idx] = fixup;
keir@17905 774
keir@17905 775 if ( swap )
keir@17905 776 SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
keir@17905 777
keir@17905 778 gptr = sh_map_domain_page(oos[oidx]);
keir@17905 779 gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
keir@17905 780 memcpy(gsnpptr, gptr, PAGE_SIZE);
keir@17905 781 sh_unmap_domain_page(gptr);
keir@17905 782 sh_unmap_domain_page(gsnpptr);
keir@17903 783 }
keir@17903 784
keir@17903 785 /* Remove an MFN from the list of out-of-sync guest pagetables */
keir@17903 786 static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
keir@17903 787 {
keir@17903 788 int idx;
keir@17903 789 mfn_t *oos;
keir@17903 790 struct domain *d = v->domain;
keir@17903 791
keir@17903 792 SHADOW_PRINTK("D%dV%d gmfn %lx\n",
keir@17903 793 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
keir@17903 794
keir@17903 795 for_each_vcpu(d, v)
keir@17903 796 {
keir@17903 797 oos = v->arch.paging.shadow.oos;
keir@17903 798 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
keir@17903 799 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
keir@17903 800 idx = (idx + 1) % SHADOW_OOS_PAGES;
keir@17903 801 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
keir@17903 802 {
keir@17903 803 oos[idx] = _mfn(INVALID_MFN);
keir@17903 804 return;
keir@17903 805 }
keir@17903 806 }
keir@17903 807
keir@17903 808 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
keir@17903 809 BUG();
keir@17903 810 }
keir@17903 811
keir@17905 812 mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
keir@17905 813 {
keir@17905 814 int idx;
keir@17905 815 mfn_t *oos;
keir@17905 816 mfn_t *oos_snapshot;
keir@17905 817 struct domain *d = v->domain;
keir@17905 818
keir@17905 819 for_each_vcpu(d, v)
keir@17905 820 {
keir@17905 821 oos = v->arch.paging.shadow.oos;
keir@17905 822 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
keir@17905 823 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
keir@17905 824 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
keir@17905 825 idx = (idx + 1) % SHADOW_OOS_PAGES;
keir@17905 826 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
keir@17905 827 {
keir@17905 828 return oos_snapshot[idx];
keir@17905 829 }
keir@17905 830 }
keir@17905 831
keir@17905 832 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
keir@17905 833 BUG();
keir@17905 834 return _mfn(INVALID_MFN);
keir@17905 835 }
keir@17905 836
keir@17903 837 /* Pull a single guest page back into sync */
keir@17903 838 void sh_resync(struct vcpu *v, mfn_t gmfn)
keir@17903 839 {
keir@17903 840 int idx;
keir@17903 841 mfn_t *oos;
keir@17905 842 mfn_t *oos_snapshot;
keir@17984 843 struct oos_fixup *oos_fixup;
keir@17903 844 struct domain *d = v->domain;
keir@17903 845
keir@17903 846 for_each_vcpu(d, v)
keir@17903 847 {
keir@17903 848 oos = v->arch.paging.shadow.oos;
keir@17984 849 oos_fixup = v->arch.paging.shadow.oos_fixup;
keir@17905 850 oos_snapshot = v->arch.paging.shadow.oos_snapshot;
keir@17903 851 idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
keir@17903 852 if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
keir@17903 853 idx = (idx + 1) % SHADOW_OOS_PAGES;
keir@17903 854
keir@17903 855 if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
keir@17903 856 {
keir@17984 857 _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]);
keir@17903 858 oos[idx] = _mfn(INVALID_MFN);
keir@17903 859 return;
keir@17903 860 }
keir@17903 861 }
keir@17903 862
keir@17903 863 SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
keir@17903 864 BUG();
keir@17903 865 }
keir@17903 866
keir@17903 867 /* Figure out whether it's definitely safe not to sync this l1 table,
keir@17903 868 * by making a call out to the mode in which that shadow was made. */
keir@17903 869 static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
keir@17903 870 {
keir@17903 871 struct page_info *pg = mfn_to_page(gl1mfn);
keir@17903 872 if ( pg->shadow_flags & SHF_L1_32 )
keir@17903 873 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
keir@17903 874 else if ( pg->shadow_flags & SHF_L1_PAE )
keir@17903 875 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
keir@17903 876 #if CONFIG_PAGING_LEVELS >= 4
keir@17903 877 else if ( pg->shadow_flags & SHF_L1_64 )
keir@17903 878 return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
keir@17903 879 #endif
keir@17903 880 SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
keir@17903 881 mfn_x(gl1mfn));
keir@17903 882 BUG();
keir@17903 883 return 0; /* BUG() is no longer __attribute__((noreturn)). */
keir@17903 884 }
keir@17903 885
keir@17903 886
keir@17903 887 /* Pull all out-of-sync pages back into sync. Pages brought out of sync
keir@17903 888 * on other vcpus are allowed to remain out of sync, but their contents
keir@17903 889 * will be made safe (TLB flush semantics); pages unsynced by this vcpu
keir@17903 890 * are brought back into sync and write-protected. If skip != 0, we try
keir@17903 891 * to avoid resyncing at all if we think we can get away with it. */
keir@17903 892 void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
keir@17903 893 {
keir@17903 894 int idx;
keir@17903 895 struct vcpu *other;
keir@17903 896 mfn_t *oos = v->arch.paging.shadow.oos;
keir@17905 897 mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
keir@17984 898 struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup;
keir@17903 899
keir@17903 900 SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
keir@17903 901
keir@17903 902 ASSERT(do_locking || shadow_locked_by_me(v->domain));
keir@17903 903
keir@17903 904 if ( !this )
keir@17903 905 goto resync_others;
keir@17903 906
keir@17903 907 if ( do_locking )
keir@17903 908 shadow_lock(v->domain);
keir@17903 909
keir@17903 910 /* First: resync all of this vcpu's oos pages */
keir@17903 911 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
keir@17903 912 if ( mfn_valid(oos[idx]) )
keir@17903 913 {
keir@17903 914 /* Write-protect and sync contents */
keir@17984 915 _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
keir@17903 916 oos[idx] = _mfn(INVALID_MFN);
keir@17903 917 }
keir@17903 918
keir@17903 919 if ( do_locking )
keir@17903 920 shadow_unlock(v->domain);
keir@17903 921
keir@17903 922 resync_others:
keir@17903 923 if ( !others )
keir@17903 924 return;
keir@17903 925
keir@17903 926 /* Second: make all *other* vcpus' oos pages safe. */
keir@17903 927 for_each_vcpu(v->domain, other)
keir@17903 928 {
keir@17903 929 if ( v == other )
keir@17903 930 continue;
keir@17903 931
keir@17903 932 if ( do_locking )
keir@17903 933 shadow_lock(v->domain);
keir@17903 934
keir@17903 935 oos = other->arch.paging.shadow.oos;
keir@17984 936 oos_fixup = other->arch.paging.shadow.oos_fixup;
keir@17905 937 oos_snapshot = other->arch.paging.shadow.oos_snapshot;
keir@17984 938
keir@17903 939 for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
keir@17903 940 {
keir@17903 941 if ( !mfn_valid(oos[idx]) )
keir@17903 942 continue;
keir@17903 943
keir@17903 944 if ( skip )
keir@17903 945 {
keir@17903 946 /* Update the shadows and leave the page OOS. */
keir@17903 947 if ( sh_skip_sync(v, oos[idx]) )
keir@17903 948 continue;
keir@18454 949 trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
keir@17905 950 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
keir@17903 951 }
keir@17903 952 else
keir@17903 953 {
keir@17903 954 /* Write-protect and sync contents */
keir@17984 955 _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]);
keir@17903 956 oos[idx] = _mfn(INVALID_MFN);
keir@17903 957 }
keir@17903 958 }
keir@17903 959
keir@17903 960 if ( do_locking )
keir@17903 961 shadow_unlock(v->domain);
keir@17903 962 }
keir@17903 963 }
keir@17903 964
keir@18454 965 /* Allow a shadowed page to go out of sync. Unsyncs are traced in
keir@18454 966 * multi.c:sh_page_fault() */
keir@17984 967 int sh_unsync(struct vcpu *v, mfn_t gmfn)
keir@17903 968 {
keir@17903 969 struct page_info *pg;
keir@17903 970
keir@17903 971 ASSERT(shadow_locked_by_me(v->domain));
keir@17903 972
keir@18479 973 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
keir@18479 974 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
keir@17903 975
keir@17903 976 pg = mfn_to_page(gmfn);
keir@17903 977
keir@17903 978 /* Guest page must be shadowed *only* as L1 and *only* once when out
keir@17903 979 * of sync. Also, get out now if it's already out of sync.
keir@17903 980 * Also, we can't safely unsync if some vcpus have paging disabled. */
keir@17903 981 if ( pg->shadow_flags &
keir@17903 982 ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
keir@17903 983 || sh_page_has_multiple_shadows(pg)
keir@17903 984 || !is_hvm_domain(v->domain)
keir@17903 985 || !v->domain->arch.paging.shadow.oos_active )
keir@17903 986 return 0;
keir@17903 987
keir@17903 988 pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
keir@17984 989 oos_hash_add(v, gmfn);
keir@17903 990 perfc_incr(shadow_unsync);
keir@18454 991 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
keir@17903 992 return 1;
keir@17903 993 }
keir@17903 994
keir@17903 995 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
keir@17903 996
Tim@15254 997
kaf24@11310 998 /**************************************************************************/
kaf24@11310 999 /* Code for "promoting" a guest page to the point where the shadow code is
kaf24@11310 1000 * willing to let it be treated as a guest page table. This generally
kaf24@11310 1001 * involves making sure there are no writable mappings available to the guest
kaf24@11310 1002 * for this page.
kaf24@11310 1003 */
Tim@12561 1004 void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type)
kaf24@11310 1005 {
kaf24@11310 1006 struct page_info *page = mfn_to_page(gmfn);
kaf24@11310 1007
Tim@12603 1008 ASSERT(mfn_valid(gmfn));
kaf24@11310 1009
keir@17903 1010 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 1011 /* Is the page already shadowed and out of sync? */
keir@17903 1012 if ( page_is_out_of_sync(page) )
keir@17903 1013 sh_resync(v, gmfn);
keir@17903 1014 #endif
keir@17903 1015
kaf24@11310 1016 /* We should never try to promote a gmfn that has writeable mappings */
Tim@15505 1017 ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
Tim@15505 1018 || (page->u.inuse.type_info & PGT_count_mask) == 0
Tim@15505 1019 || v->domain->is_shutting_down);
kaf24@11310 1020
kfraser@11554 1021 /* Is the page already shadowed? */
kaf24@11310 1022 if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
kaf24@11310 1023 page->shadow_flags = 0;
kaf24@11310 1024
Tim@12561 1025 ASSERT(!test_bit(type, &page->shadow_flags));
Tim@12561 1026 set_bit(type, &page->shadow_flags);
keir@18454 1027 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
kaf24@11310 1028 }
kaf24@11310 1029
kaf24@11310 1030 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
kaf24@11310 1031 {
kaf24@11310 1032 struct page_info *page = mfn_to_page(gmfn);
kaf24@11310 1033
Tim@14175 1034 ASSERT(test_bit(_PGC_page_table, &page->count_info));
Tim@12561 1035 ASSERT(test_bit(type, &page->shadow_flags));
Tim@12561 1036
Tim@12561 1037 clear_bit(type, &page->shadow_flags);
kaf24@11310 1038
kaf24@11310 1039 if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
keir@17903 1040 {
keir@17903 1041 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 1042 /* Was the page out of sync? */
keir@17903 1043 if ( page_is_out_of_sync(page) )
keir@17904 1044 {
keir@17903 1045 oos_hash_remove(v, gmfn);
keir@17904 1046 }
keir@17903 1047 #endif
kaf24@11310 1048 clear_bit(_PGC_page_table, &page->count_info);
keir@17903 1049 }
keir@18454 1050
keir@18454 1051 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
kaf24@11310 1052 }
kaf24@11310 1053
kaf24@11310 1054 /**************************************************************************/
kaf24@11310 1055 /* Validate a pagetable change from the guest and update the shadows.
kaf24@11310 1056 * Returns a bitmask of SHADOW_SET_* flags. */
kaf24@11310 1057
tim@11666 1058 int
Tim@13141 1059 sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size)
kaf24@11310 1060 {
kaf24@11310 1061 int result = 0;
kaf24@11310 1062 struct page_info *page = mfn_to_page(gmfn);
kaf24@11310 1063
Tim@15310 1064 paging_mark_dirty(v->domain, mfn_x(gmfn));
kaf24@11310 1065
kaf24@11310 1066 // Determine which types of shadows are affected, and update each.
kaf24@11310 1067 //
kaf24@11310 1068 // Always validate L1s before L2s to prevent another cpu with a linear
kaf24@11310 1069 // mapping of this gmfn from seeing a walk that results from
kaf24@11310 1070 // using the new L2 value and the old L1 value. (It is OK for such a
kaf24@11310 1071 // guest to see a walk that uses the old L2 value with the new L1 value,
kaf24@11310 1072 // as hardware could behave this way if one level of the pagewalk occurs
kaf24@11310 1073 // before the store, and the next level of the pagewalk occurs after the
kaf24@11310 1074 // store.
kaf24@11310 1075 //
kaf24@11310 1076 // Ditto for L2s before L3s, etc.
kaf24@11310 1077 //
kaf24@11310 1078
kaf24@11310 1079 if ( !(page->count_info & PGC_page_table) )
kaf24@11310 1080 return 0; /* Not shadowed at all */
kaf24@11310 1081
kaf24@11310 1082 if ( page->shadow_flags & SHF_L1_32 )
keir@17620 1083 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2)
kaf24@11310 1084 (v, gmfn, entry, size);
kaf24@11310 1085 if ( page->shadow_flags & SHF_L2_32 )
keir@17620 1086 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2)
kaf24@11310 1087 (v, gmfn, entry, size);
keir@17618 1088
kaf24@11310 1089 if ( page->shadow_flags & SHF_L1_PAE )
keir@17620 1090 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3)
kaf24@11310 1091 (v, gmfn, entry, size);
kaf24@11310 1092 if ( page->shadow_flags & SHF_L2_PAE )
keir@17620 1093 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3)
kaf24@11310 1094 (v, gmfn, entry, size);
kaf24@11310 1095 if ( page->shadow_flags & SHF_L2H_PAE )
keir@17620 1096 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3)
kaf24@11310 1097 (v, gmfn, entry, size);
kaf24@11310 1098
kaf24@11310 1099 #if CONFIG_PAGING_LEVELS >= 4
kaf24@11310 1100 if ( page->shadow_flags & SHF_L1_64 )
keir@17620 1101 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4)
kaf24@11310 1102 (v, gmfn, entry, size);
kaf24@11310 1103 if ( page->shadow_flags & SHF_L2_64 )
keir@17620 1104 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4)
kaf24@11310 1105 (v, gmfn, entry, size);
ack@14013 1106 if ( page->shadow_flags & SHF_L2H_64 )
keir@17620 1107 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4)
ack@14013 1108 (v, gmfn, entry, size);
kaf24@11310 1109 if ( page->shadow_flags & SHF_L3_64 )
keir@17620 1110 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4)
kaf24@11310 1111 (v, gmfn, entry, size);
kaf24@11310 1112 if ( page->shadow_flags & SHF_L4_64 )
keir@17620 1113 result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4)
kaf24@11310 1114 (v, gmfn, entry, size);
keir@17618 1115 #else /* 32-bit hypervisor does not support 64-bit guests */
kaf24@11310 1116 ASSERT((page->shadow_flags
ack@14013 1117 & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
kaf24@11310 1118 #endif
keir@18454 1119 this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED));
kaf24@11310 1120
kaf24@11310 1121 return result;
kaf24@11310 1122 }
kaf24@11310 1123
kaf24@11310 1124
kaf24@11310 1125 void
Tim@13141 1126 sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
Tim@13141 1127 void *entry, u32 size)
tim@11666 1128 /* This is the entry point for emulated writes to pagetables in HVM guests and
tim@11666 1129 * PV translated guests.
tim@11666 1130 */
kaf24@11310 1131 {
kaf24@11310 1132 struct domain *d = v->domain;
kaf24@11310 1133 int rc;
kaf24@11310 1134
Tim@13137 1135 ASSERT(shadow_locked_by_me(v->domain));
Tim@13141 1136 rc = sh_validate_guest_entry(v, gmfn, entry, size);
kaf24@11310 1137 if ( rc & SHADOW_SET_FLUSH )
tdeegan@11327 1138 /* Need to flush TLBs to pick up shadow PT changes */
tdeegan@11327 1139 flush_tlb_mask(d->domain_dirty_cpumask);
kaf24@11310 1140 if ( rc & SHADOW_SET_ERROR )
kaf24@11310 1141 {
kaf24@11310 1142 /* This page is probably not a pagetable any more: tear it out of the
Tim@11866 1143 * shadows, along with any tables that reference it.
Tim@11866 1144 * Since the validate call above will have made a "safe" (i.e. zero)
Tim@11866 1145 * shadow entry, we can let the domain live even if we can't fully
Tim@11866 1146 * unshadow the page. */
Tim@11866 1147 sh_remove_shadows(v, gmfn, 0, 0);
kaf24@11310 1148 }
kaf24@11310 1149 }
kaf24@11310 1150
Tim@13141 1151 int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
Tim@13141 1152 intpte_t new, mfn_t gmfn)
Tim@13141 1153 /* Write a new value into the guest pagetable, and update the shadows
Tim@13141 1154 * appropriately. Returns 0 if we page-faulted, 1 for success. */
Tim@13141 1155 {
Tim@13141 1156 int failed;
Tim@13141 1157 shadow_lock(v->domain);
Tim@13141 1158 failed = __copy_to_user(p, &new, sizeof(new));
Tim@13141 1159 if ( failed != sizeof(new) )
Tim@13141 1160 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
Tim@13141 1161 shadow_unlock(v->domain);
Tim@13141 1162 return (failed == 0);
Tim@13141 1163 }
Tim@13141 1164
Tim@13141 1165 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
Tim@13141 1166 intpte_t *old, intpte_t new, mfn_t gmfn)
Tim@13141 1167 /* Cmpxchg a new value into the guest pagetable, and update the shadows
Tim@13141 1168 * appropriately. Returns 0 if we page-faulted, 1 if not.
Tim@13141 1169 * N.B. caller should check the value of "old" to see if the
Tim@13141 1170 * cmpxchg itself was successful. */
Tim@13141 1171 {
Tim@13141 1172 int failed;
Tim@13141 1173 intpte_t t = *old;
Tim@13141 1174 shadow_lock(v->domain);
Tim@13141 1175 failed = cmpxchg_user(p, t, new);
Tim@13141 1176 if ( t == *old )
Tim@13141 1177 sh_validate_guest_entry(v, gmfn, p, sizeof(new));
Tim@13141 1178 *old = t;
Tim@13141 1179 shadow_unlock(v->domain);
Tim@13141 1180 return (failed == 0);
Tim@13141 1181 }
Tim@13141 1182
kaf24@11310 1183
kaf24@11310 1184 /**************************************************************************/
kaf24@11310 1185 /* Memory management for shadow pages. */
kaf24@11310 1186
kaf24@11310 1187 /* Allocating shadow pages
kaf24@11310 1188 * -----------------------
kaf24@11310 1189 *
Tim@11867 1190 * Most shadow pages are allocated singly, but there is one case where
Tim@11867 1191 * we need to allocate multiple pages together: shadowing 32-bit guest
Tim@11867 1192 * tables with PAE or 64-bit shadows. A 32-bit guest l1 table covers 4MB
keir@17620 1193 * of virtual address space, and needs to be shadowed by two PAE/64-bit
Tim@11867 1194 * l1 tables (covering 2MB of virtual address space each). Similarly, a
Tim@11867 1195 * 32-bit guest l2 table (4GB va) needs to be shadowed by four
Tim@11867 1196 * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are
Tim@11867 1197 * contiguous and aligned; functions for handling offsets into them are
Tim@11867 1198 * defined in shadow.c (shadow_l1_index() etc.)
kaf24@11310 1199 *
kaf24@11310 1200 * This table shows the allocation behaviour of the different modes:
kaf24@11310 1201 *
keir@17620 1202 * Xen paging      pae   pae   64b   64b   64b
keir@17620 1203 * Guest paging    32b   pae   32b   pae   64b
keir@17620 1204 * PV or HVM       HVM    *    HVM   HVM    *
keir@17620 1205 * Shadow paging   pae   pae   pae   pae   64b
kaf24@11310 1206 *
keir@17620 1207 * sl1 size         8k    4k    8k    4k    4k
keir@17620 1208 * sl2 size        16k    4k   16k    4k    4k
keir@17620 1209 * sl3 size          -     -     -     -    4k
keir@17620 1210 * sl4 size          -     -     -     -    4k
kaf24@11310 1211 *
kaf24@11310 1212 * We allocate memory from xen in four-page units and break them down
kaf24@11310 1213 * with a simple buddy allocator. Can't use the xen allocator to handle
kaf24@11310 1214 * this as it only works for contiguous zones, and a domain's shadow
kaf24@11310 1215 * pool is made of fragments.
kaf24@11310 1216 *
kaf24@11310 1217 * In HVM guests, the p2m table is built out of shadow pages, and we provide
kaf24@11310 1218 * a function for the p2m management to steal pages, in max-order chunks, from
kaf24@11310 1219 * the free pool. We don't provide for giving them back, yet.
kaf24@11310 1220 */
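/* Editor's worked example for the multi-page shadows described above (the
 * figures are standard x86 paging arithmetic): a 32-bit guest l1 holds 1024
 * PTEs and so covers 1024 * 4KB = 4MB of VA, while a PAE/64-bit shadow l1
 * holds 512 entries and covers 2MB, so the shadow needs 4MB / 2MB = 2
 * contiguous pages (order 1).  Likewise a 32-bit guest l2 covers 4GB and a
 * PAE/64-bit l2 covers 1GB, giving a 4-page (order 2) shadow.  These match
 * the non-zero entries for SH_type_l1_32_shadow and SH_type_l2_32_shadow in
 * shadow_order() below. */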
kaf24@11310 1221
kaf24@11310 1222 /* Figure out the least acceptable quantity of shadow memory.
kaf24@11310 1223 * The minimum memory requirement for always being able to free up a
kaf24@11310 1224 * chunk of memory is very small -- only three max-order chunks per
kaf24@11310 1225 * vcpu to hold the top level shadows and pages with Xen mappings in them.
kaf24@11310 1226 *
kaf24@11310 1227 * But for a guest to be guaranteed to successfully execute a single
kaf24@11310 1228 * instruction, we must be able to map a large number (about thirty) of VAs
kaf24@11310 1229 * at the same time, which means that to guarantee progress, we must
kaf24@11310 1230 * allow for more than ninety allocated pages per vcpu. We round that
kaf24@11310 1231 * up to 128 pages, or half a megabyte per vcpu. */
keir@16091 1232 static unsigned int shadow_min_acceptable_pages(struct domain *d)
kaf24@11310 1233 {
kaf24@11310 1234 u32 vcpu_count = 0;
kaf24@11310 1235 struct vcpu *v;
kaf24@11310 1236
kaf24@11310 1237 for_each_vcpu(d, v)
kaf24@11310 1238 vcpu_count++;
kaf24@11310 1239
kaf24@11310 1240 return (vcpu_count * 128);
Tim@12561 1241 }
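/* Editor's worked example: with the 128-page-per-vcpu floor above, a 4-vcpu
 * domain gets a minimum shadow pool of 4 * 128 = 512 pages, i.e. 2MB. */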
kaf24@11310 1242
kaf24@11310 1243 /* Figure out the order of allocation needed for a given shadow type */
kaf24@11310 1244 static inline u32
Tim@12561 1245 shadow_order(unsigned int shadow_type)
kaf24@11310 1246 {
ack@14013 1247 static const u32 type_to_order[SH_type_unused] = {
Tim@12561 1248 0, /* SH_type_none */
Tim@12561 1249 1, /* SH_type_l1_32_shadow */
Tim@12561 1250 1, /* SH_type_fl1_32_shadow */
Tim@12561 1251 2, /* SH_type_l2_32_shadow */
Tim@12561 1252 0, /* SH_type_l1_pae_shadow */
Tim@12561 1253 0, /* SH_type_fl1_pae_shadow */
Tim@12561 1254 0, /* SH_type_l2_pae_shadow */
Tim@12561 1255 0, /* SH_type_l2h_pae_shadow */
Tim@12561 1256 0, /* SH_type_l1_64_shadow */
Tim@12561 1257 0, /* SH_type_fl1_64_shadow */
Tim@12561 1258 0, /* SH_type_l2_64_shadow */
ack@14013 1259 0, /* SH_type_l2h_64_shadow */
Tim@12561 1260 0, /* SH_type_l3_64_shadow */
Tim@12561 1261 0, /* SH_type_l4_64_shadow */
Tim@12561 1262 2, /* SH_type_p2m_table */
keir@17905 1263 0, /* SH_type_monitor_table */
keir@17905 1264 0 /* SH_type_oos_snapshot */
kaf24@11310 1265 };
ack@14013 1266 ASSERT(shadow_type < SH_type_unused);
Tim@12561 1267 return type_to_order[shadow_type];
kaf24@11310 1268 }
kaf24@11310 1269
Tim@16130 1270 static inline unsigned int
Tim@16130 1271 shadow_max_order(struct domain *d)
kaf24@11310 1272 {
Tim@16130 1273 return is_hvm_domain(d) ? SHADOW_MAX_ORDER : 0;
Tim@16130 1274 }
Tim@16130 1275
Tim@16130 1276 /* Do we have a total of count pages of the requested order free? */
Tim@16130 1277 static inline int space_is_available(
Tim@16130 1278 struct domain *d,
Tim@16130 1279 unsigned int order,
Tim@16130 1280 unsigned int count)
Tim@16130 1281 {
Tim@16130 1282 for ( ; order <= shadow_max_order(d); ++order )
Tim@16130 1283 {
Tim@16130 1284 unsigned int n = count;
Tim@16130 1285 const struct list_head *p;
Tim@16130 1286
Tim@16130 1287 list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
Tim@16130 1288 if ( --n == 0 )
Tim@16130 1289 return 1;
Tim@16130 1290 count = (count + 1) >> 1;
Tim@16130 1291 }
Tim@16130 1292
kaf24@11310 1293 return 0;
kaf24@11310 1294 }
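/* Editor's worked example for space_is_available() above: a request for four
 * order-0 pages succeeds if any single freelist can cover it on its own --
 * four free order-0 pages, or two order-1 pages, or one order-2 (or larger)
 * page -- because count is halved, rounding up, at each step up the order
 * ladder. */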
kaf24@11310 1295
kaf24@11310 1296 /* Dispatcher function: call the per-mode function that will unhook the
kaf24@11310 1297 * non-Xen mappings in this top-level shadow mfn */
keir@16091 1298 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
kaf24@11310 1299 {
Tim@12561 1300 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
Tim@12561 1301 switch ( sp->type )
kaf24@11310 1302 {
Tim@12561 1303 case SH_type_l2_32_shadow:
keir@17620 1304 SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
kaf24@11310 1305 break;
Tim@12561 1306 case SH_type_l2_pae_shadow:
Tim@12561 1307 case SH_type_l2h_pae_shadow:
keir@17620 1308 SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v,smfn);
kaf24@11310 1309 break;
kaf24@11310 1310 #if CONFIG_PAGING_LEVELS >= 4
Tim@12561 1311 case SH_type_l4_64_shadow:
keir@17620 1312 SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v,smfn);
kaf24@11310 1313 break;
kaf24@11310 1314 #endif
kaf24@11310 1315 default:
keir@16090 1316 SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
kaf24@11310 1317 BUG();
kaf24@11310 1318 }
kaf24@11310 1319 }
kaf24@11310 1320
keir@18454 1321 static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
keir@18454 1322 {
keir@18454 1323 if ( tb_init_done )
keir@18454 1324 {
keir@18454 1325 /* Convert smfn to gfn */
keir@18454 1326 unsigned long gfn;
keir@18454 1327 ASSERT(mfn_valid(smfn));
keir@18454 1328 gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer));
keir@18454 1329 __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
keir@18454 1330 sizeof(gfn), (unsigned char*)&gfn);
keir@18454 1331 }
keir@18454 1332 }
kaf24@11310 1333
Tim@16130 1334 /* Make sure there are at least count order-sized pages
Tim@16130 1335 * available in the shadow page pool. */
Tim@16130 1336 static void _shadow_prealloc(
Tim@16130 1337 struct domain *d,
Tim@16130 1338 unsigned int order,
Tim@16130 1339 unsigned int count)
kaf24@11310 1340 {
kaf24@11310 1341 /* Need a vcpu for calling unpins; for now, since we don't have
kaf24@11310 1342 * per-vcpu shadows, any will do */
Tim@11925 1343 struct vcpu *v, *v2;
kaf24@11310 1344 struct list_head *l, *t;
Tim@12561 1345 struct shadow_page_info *sp;
kaf24@11310 1346 mfn_t smfn;
Tim@12564 1347 int i;
kaf24@11310 1348
Tim@16130 1349 ASSERT(order <= shadow_max_order(d));
Tim@16130 1350 if ( space_is_available(d, order, count) ) return;
kaf24@11310 1351
Tim@11925 1352 v = current;
Tim@11925 1353 if ( v->domain != d )
Tim@11925 1354 v = d->vcpu[0];
Tim@14335 1355 ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */
Tim@11925 1356
Tim@12564 1357 /* Stage one: walk the list of pinned pages, unpinning them */
kfraser@14595 1358 perfc_incr(shadow_prealloc_1);
Tim@13909 1359 list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
kaf24@11310 1360 {
Tim@12561 1361 sp = list_entry(l, struct shadow_page_info, list);
Tim@12561 1362 smfn = shadow_page_to_mfn(sp);
kaf24@11310 1363
Tim@11867 1364 /* Unpin this top-level shadow */
keir@18454 1365 trace_shadow_prealloc_unpin(d, smfn);
Tim@11867 1366 sh_unpin(v, smfn);
kaf24@11310 1367
Tim@16130 1368 /* See if that freed up enough space */
Tim@16130 1369 if ( space_is_available(d, order, count) ) return;
kaf24@11310 1370 }
kaf24@11310 1371
kaf24@11310 1372 /* Stage two: all shadow pages are in use in hierarchies that are
kaf24@11310 1373 * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen
kaf24@11310 1374 * mappings. */
kfraser@14595 1375 perfc_incr(shadow_prealloc_2);
Tim@12564 1376
Tim@12564 1377 for_each_vcpu(d, v2)
Tim@12564 1378 for ( i = 0 ; i < 4 ; i++ )
Tim@11925 1379 {
Tim@12564 1380 if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
Tim@12564 1381 {
keir@18454 1382 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
Tim@12564 1383 shadow_unhook_mappings(v,
Tim@12564 1384 pagetable_get_mfn(v2->arch.shadow_table[i]));
Tim@12564 1385
Tim@16130 1386 /* See if that freed up enough space */
Tim@16130 1387 if ( space_is_available(d, order, count) )
Tim@12564 1388 {
keir@17527 1389 flush_tlb_mask(d->domain_dirty_cpumask);
Tim@12564 1390 return;
Tim@12564 1391 }
Tim@12564 1392 }
Tim@11925 1393 }
kaf24@11310 1394
kaf24@11310 1395 /* Nothing more we can do: all remaining shadows are of pages that
kaf24@11310 1396 * hold Xen mappings for some vcpu. This should never happen. */
Tim@16130 1397 SHADOW_ERROR("Can't pre-allocate %u order-%u shadow pages!\n"
keir@16090 1398 " shadow pages total = %u, free = %u, p2m=%u\n",
Tim@16130 1399 count, order,
keir@16090 1400 d->arch.paging.shadow.total_pages,
keir@16090 1401 d->arch.paging.shadow.free_pages,
keir@16090 1402 d->arch.paging.shadow.p2m_pages);
kaf24@11310 1403 BUG();
kaf24@11310 1404 }
kaf24@11310 1405
Tim@16130 1406 /* Make sure there are at least count pages of the order implied by
Tim@16130 1407 * type available in the shadow page pool.
Tim@16130 1408 * This must be called before any calls to shadow_alloc(). Since this
Tim@16130 1409 * will free existing shadows to make room, it must be called early enough
Tim@16130 1410 * to avoid freeing shadows that the caller is currently working on. */
Tim@16130 1411 void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
Tim@16130 1412 {
Tim@16130 1413 ASSERT(type != SH_type_p2m_table);
Tim@16130 1414 return _shadow_prealloc(d, shadow_order(type), count);
Tim@16130 1415 }
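/* Illustrative call sequence (a sketch only, assuming a domain d, a guest
 * pagetable mfn gmfn and an mfn_t smfn in scope; the real callers are the
 * shadow-creation paths, e.g. in multi.c): prealloc must happen with the
 * shadow lock held and before shadow_alloc(), so that any shadows torn
 * down to make room cannot be ones the caller is about to use. */
#if 0
    shadow_lock(d);
    shadow_prealloc(d, SH_type_l1_pae_shadow, 1);
    smfn = shadow_alloc(d, SH_type_l1_pae_shadow, mfn_x(gmfn));
    /* ... install the new shadow, take refs, etc. ... */
    shadow_unlock(d);
#endif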
Tim@16130 1416
Tim@12564 1417 /* Deliberately free all the memory we can: this will tear down all of
Tim@12564 1418 * this domain's shadows */
Tim@13141 1419 static void shadow_blow_tables(struct domain *d)
Tim@12343 1420 {
Tim@12343 1421 struct list_head *l, *t;
Tim@12561 1422 struct shadow_page_info *sp;
Tim@12564 1423 struct vcpu *v = d->vcpu[0];
Tim@12343 1424 mfn_t smfn;
Tim@12564 1425 int i;
Tim@14335 1426
Tim@14335 1427 ASSERT(v != NULL);
Tim@14335 1428
Tim@12564 1429 /* Pass one: unpin all pinned pages */
Tim@13909 1430 list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
Tim@12564 1431 {
Tim@12564 1432 sp = list_entry(l, struct shadow_page_info, list);
Tim@12564 1433 smfn = shadow_page_to_mfn(sp);
Tim@12564 1434 sh_unpin(v, smfn);
Tim@12564 1435 }
Tim@12564 1436
Tim@12564 1437 /* Second pass: unhook entries of in-use shadows */
Tim@12564 1438 for_each_vcpu(d, v)
Tim@12564 1439 for ( i = 0 ; i < 4 ; i++ )
Tim@12564 1440 if ( !pagetable_is_null(v->arch.shadow_table[i]) )
Tim@12564 1441 shadow_unhook_mappings(v,
Tim@12564 1442 pagetable_get_mfn(v->arch.shadow_table[i]));
Tim@12564 1443
Tim@12564 1444 /* Make sure everyone sees the unshadowings */
Tim@12564 1445 flush_tlb_mask(d->domain_dirty_cpumask);
Tim@12564 1446 }
Tim@12564 1447
keir@16186 1448 void shadow_blow_tables_per_domain(struct domain *d)
keir@16186 1449 {
keir@16186 1450 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
keir@16186 1451 shadow_lock(d);
keir@16186 1452 shadow_blow_tables(d);
keir@16186 1453 shadow_unlock(d);
keir@16186 1454 }
keir@16186 1455 }
Tim@12564 1456
Tim@12564 1457 #ifndef NDEBUG
Tim@12564 1458 /* Blow all shadows of all shadowed domains: this can be used to cause the
Tim@12564 1459 * guest's pagetables to be re-shadowed if we suspect that the shadows
Tim@12564 1460 * have somehow got out of sync */
Tim@12564 1461 static void shadow_blow_all_tables(unsigned char c)
Tim@12564 1462 {
Tim@12564 1463 struct domain *d;
Tim@12564 1464 printk("'%c' pressed -> blowing all shadow tables\n", c);
kfraser@14058 1465 rcu_read_lock(&domlist_read_lock);
Tim@12343 1466 for_each_domain(d)
kfraser@14058 1467 {
Tim@12564 1468 if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL )
Tim@12343 1469 {
Tim@12343 1470 shadow_lock(d);
Tim@12564 1471 shadow_blow_tables(d);
Tim@12343 1472 shadow_unlock(d);
Tim@12343 1473 }
kfraser@14058 1474 }
kfraser@14058 1475 rcu_read_unlock(&domlist_read_lock);
Tim@12343 1476 }
Tim@12343 1477
Tim@12343 1478 /* Register this function in the Xen console keypress table */
Tim@12343 1479 static __init int shadow_blow_tables_keyhandler_init(void)
Tim@12343 1480 {
Tim@12564 1481 register_keyhandler('S', shadow_blow_all_tables,"reset shadow pagetables");
Tim@12343 1482 return 0;
Tim@12343 1483 }
Tim@12343 1484 __initcall(shadow_blow_tables_keyhandler_init);
Tim@12343 1485 #endif /* !NDEBUG */
kaf24@11310 1486
kaf24@11310 1487 /* Allocate another shadow's worth of (contiguous, aligned) pages,
kaf24@11310 1488 * and fill in the type and backpointer fields of their page_infos.
kaf24@11310 1489 * Never fails to allocate. */
kaf24@11310 1490 mfn_t shadow_alloc(struct domain *d,
kaf24@11310 1491 u32 shadow_type,
kaf24@11310 1492 unsigned long backpointer)
kaf24@11310 1493 {
Tim@12561 1494 struct shadow_page_info *sp = NULL;
kaf24@11310 1495 unsigned int order = shadow_order(shadow_type);
kaf24@11310 1496 cpumask_t mask;
kaf24@11310 1497 void *p;
kaf24@11310 1498 int i;
kaf24@11310 1499
Tim@13137 1500 ASSERT(shadow_locked_by_me(d));
Tim@16130 1501 if (shadow_type == SH_type_p2m_table && order > shadow_max_order(d))
Tim@16130 1502 order = shadow_max_order(d);
Tim@16130 1503 ASSERT(order <= shadow_max_order(d));
Tim@12561 1504 ASSERT(shadow_type != SH_type_none);
kfraser@14595 1505 perfc_incr(shadow_alloc);
kaf24@11310 1506
kaf24@11310 1507 /* Find smallest order which can satisfy the request. */
kaf24@11310 1508 for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
Tim@13909 1509 if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
kfraser@14021 1510 goto found;
kaf24@11310 1511
kaf24@11310 1512 /* If we get here, we failed to allocate. This should never happen.
kaf24@11310 1513 * It means that we didn't call shadow_prealloc() correctly before
kaf24@11310 1514 * we allocated. We can't recover by calling prealloc here, because
kaf24@11310 1515 * we might free up higher-level pages that the caller is working on. */
keir@16090 1516 SHADOW_ERROR("Can't allocate %i shadow pages!\n", 1 << order);
kaf24@11310 1517 BUG();
kfraser@14021 1518
kfraser@14021 1519 found:
kfraser@14021 1520 sp = list_entry(d->arch.paging.shadow.freelists[i].next,
kfraser@14021 1521 struct shadow_page_info, list);
kfraser@14021 1522 list_del(&sp->list);
kfraser@14021 1523
kfraser@14021 1524 /* We may have to halve the chunk a number of times. */
kfraser@14021 1525 while ( i != order )
kfraser@14021 1526 {
kfraser@14021 1527 i--;
kfraser@14021 1528 sp->order = i;
kfraser@14021 1529 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
kfraser@14021 1530 sp += 1 << i;
kfraser@14021 1531 }
kfraser@14021 1532 d->arch.paging.shadow.free_pages -= 1 << order;
kfraser@14021 1533
kfraser@14021 1534 /* Init page info fields and clear the pages */
kfraser@14021 1535 for ( i = 0; i < 1<<order ; i++ )
kfraser@14021 1536 {
kfraser@14021 1537 /* Before we overwrite the old contents of this page,
kfraser@14021 1538 * we need to be sure that no TLB holds a pointer to it. */
kfraser@14021 1539 mask = d->domain_dirty_cpumask;
kfraser@14021 1540 tlbflush_filter(mask, sp[i].tlbflush_timestamp);
kfraser@14021 1541 if ( unlikely(!cpus_empty(mask)) )
kfraser@14021 1542 {
kfraser@14595 1543 perfc_incr(shadow_alloc_tlbflush);
kfraser@14021 1544 flush_tlb_mask(mask);
kfraser@14021 1545 }
kfraser@14021 1546 /* Now safe to clear the page for reuse */
kfraser@14021 1547 p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
kfraser@14021 1548 ASSERT(p != NULL);
kfraser@14021 1549 clear_page(p);
kfraser@14021 1550 sh_unmap_domain_page(p);
kfraser@14021 1551 INIT_LIST_HEAD(&sp[i].list);
kfraser@14021 1552 sp[i].type = shadow_type;
kfraser@14021 1553 sp[i].pinned = 0;
kfraser@14021 1554 sp[i].count = 0;
kfraser@14021 1555 sp[i].backpointer = backpointer;
kfraser@14021 1556 sp[i].next_shadow = NULL;
kfraser@14021 1557 perfc_incr(shadow_alloc_count);
kfraser@14021 1558 }
kfraser@14021 1559 return shadow_page_to_mfn(sp);
kaf24@11310 1560 }
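/* Halving example (illustrative): if order 1 is requested and the first
 * non-empty freelist is order 3 (an 8-page chunk starting at sp), the
 * loop above returns sp..sp+3 to freelist[2], then sp+4..sp+5 to
 * freelist[1], and hands out sp+6..sp+7 as the order-1 allocation. */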
kaf24@11310 1561
kaf24@11310 1562
kaf24@11310 1563 /* Return some shadow pages to the pool. */
kaf24@11310 1564 void shadow_free(struct domain *d, mfn_t smfn)
kaf24@11310 1565 {
Tim@12561 1566 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
kaf24@11310 1567 u32 shadow_type;
kaf24@11310 1568 unsigned long order;
kaf24@11310 1569 unsigned long mask;
kaf24@11310 1570 int i;
kaf24@11310 1571
Tim@13137 1572 ASSERT(shadow_locked_by_me(d));
kfraser@14595 1573 perfc_incr(shadow_free);
kaf24@11310 1574
Tim@12561 1575 shadow_type = sp->type;
Tim@12561 1576 ASSERT(shadow_type != SH_type_none);
Tim@12561 1577 ASSERT(shadow_type != SH_type_p2m_table);
kaf24@11310 1578 order = shadow_order(shadow_type);
kaf24@11310 1579
Tim@13909 1580 d->arch.paging.shadow.free_pages += 1 << order;
kaf24@11310 1581
kaf24@11310 1582 for ( i = 0; i < 1<<order; i++ )
kaf24@11310 1583 {
keir@17080 1584 #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION)
Tim@11868 1585 struct vcpu *v;
Tim@11868 1586 for_each_vcpu(d, v)
Tim@11868 1587 {
keir@17080 1588 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
Tim@11868 1589 /* No longer safe to look for a writeable mapping in this shadow */
Tim@13909 1590 if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(smfn) + i )
Tim@13909 1591 v->arch.paging.shadow.last_writeable_pte_smfn = 0;
keir@17080 1592 #endif
keir@17080 1593 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
keir@17080 1594 v->arch.paging.last_write_emul_ok = 0;
keir@17080 1595 #endif
Tim@11868 1596 }
Tim@11868 1597 #endif
kaf24@11310 1598 /* Strip out the type: this is now a free shadow page */
Tim@12561 1599 sp[i].type = 0;
kaf24@11310 1600 /* Remember the TLB timestamp so we will know whether to flush
kaf24@11310 1601 * TLBs when we reuse the page. Because the destructors leave the
kaf24@11310 1602 * contents of the pages in place, we can delay TLB flushes until
kaf24@11310 1603 * just before the allocator hands the page out again. */
Tim@12561 1604 sp[i].tlbflush_timestamp = tlbflush_current_time();
kaf24@11310 1605 perfc_decr(shadow_alloc_count);
kaf24@11310 1606 }
kaf24@11310 1607
kaf24@11310 1608 /* Merge chunks as far as possible. */
Tim@16130 1609 for ( ; order < shadow_max_order(d); ++order )
kaf24@11310 1610 {
kaf24@11310 1611 mask = 1 << order;
Tim@12561 1612 if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
kaf24@11310 1613 /* Merge with predecessor block? */
Tim@12561 1614 if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
kaf24@11310 1615 break;
Tim@12561 1616 list_del(&(sp-mask)->list);
Tim@12561 1617 sp -= mask;
kaf24@11310 1618 } else {
kaf24@11310 1619 /* Merge with successor block? */
Tim@12561 1620 if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
kaf24@11310 1621 break;
Tim@12561 1622 list_del(&(sp+mask)->list);
kaf24@11310 1623 }
kaf24@11310 1624 }
kaf24@11310 1625
Tim@12561 1626 sp->order = order;
Tim@13909 1627 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
kaf24@11310 1628 }
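/* Merge example (illustrative): freeing an order-0 page whose mfn has
 * bit 0 set means its buddy is the page just below it (sp - 1); if that
 * buddy is free (type == 0) and also order 0, the pair coalesces into an
 * order-1 chunk starting at sp - 1, and the loop then tries the order-1
 * buddy, and so on up to shadow_max_order(d). */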
kaf24@11310 1629
kaf24@11310 1630 /* Divert some memory from the pool to be used by the p2m mapping.
kaf24@11310 1631 * This action is irreversible: the p2m mapping only ever grows.
tim@11666 1632 * That's OK because the p2m table only exists for translated domains,
kaf24@11310 1633 * and those domains can't ever turn off shadow mode.
kaf24@11310 1634 * Also, we only ever allocate a max-order chunk, so as to preserve
kaf24@11310 1635 * the invariant that shadow_prealloc() always works.
kaf24@11310 1636 * Returns 0 iff it can't get a chunk (the caller should then
Tim@13141 1637 * free up some pages in domheap and call sh_set_allocation);
kaf24@11310 1638 * returns non-zero on success.
kaf24@11310 1639 */
kaf24@11310 1640 static int
Tim@13909 1641 sh_alloc_p2m_pages(struct domain *d)
kaf24@11310 1642 {
kaf24@11310 1643 struct page_info *pg;
kaf24@11310 1644 u32 i;
Tim@16130 1645 unsigned int order = shadow_max_order(d);
Tim@16130 1646
Tim@13137 1647 ASSERT(shadow_locked_by_me(d));
kaf24@11310 1648
Tim@13909 1649 if ( d->arch.paging.shadow.total_pages
Tim@16130 1650 < (shadow_min_acceptable_pages(d) + (1 << order)) )
kaf24@11310 1651 return 0; /* Not enough shadow memory: need to increase it first */
kaf24@11310 1652
Tim@12561 1653 pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
Tim@16130 1654 d->arch.paging.shadow.p2m_pages += (1 << order);
Tim@16130 1655 d->arch.paging.shadow.total_pages -= (1 << order);
Tim@16130 1656 for (i = 0; i < (1U << order); i++)
kaf24@11310 1657 {
tim@11666 1658 /* Unlike shadow pages, mark p2m pages as owned by the domain.
tim@11666 1659 * Marking the domain as the owner would normally allow the guest to
tim@11666 1660 * create mappings of these pages, but these p2m pages will never be
tim@11666 1661 * in the domain's guest-physical address space, and so that is not
tim@11666 1662 * believed to be a concern.
tim@11666 1663 */
kaf24@11310 1664 page_set_owner(&pg[i], d);
Tim@13994 1665 pg[i].count_info = 1;
Tim@13909 1666 list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
kaf24@11310 1667 }
kaf24@11310 1668 return 1;
kaf24@11310 1669 }
kaf24@11310 1670
kaf24@11310 1671 /* Returns NULL if no memory is available... */
keir@16091 1672 static struct page_info *
kaf24@11310 1673 shadow_alloc_p2m_page(struct domain *d)
kaf24@11310 1674 {
kaf24@11310 1675 struct list_head *entry;
Tim@12564 1676 struct page_info *pg;
kaf24@11310 1677 mfn_t mfn;
kaf24@11310 1678 void *p;
Tim@13909 1679
Tim@13909 1680 shadow_lock(d);
Tim@13909 1681
Tim@13909 1682 if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
Tim@13909 1683 !sh_alloc_p2m_pages(d) )
Tim@13909 1684 {
Tim@13909 1685 shadow_unlock(d);
Tim@13909 1686 return NULL;
Tim@13909 1687 }
Tim@13909 1688 entry = d->arch.paging.shadow.p2m_freelist.next;
kaf24@11310 1689 list_del(entry);
Tim@13909 1690
Tim@13909 1691 shadow_unlock(d);
Tim@13909 1692
Tim@12564 1693 pg = list_entry(entry, struct page_info, list);
Tim@12564 1694 mfn = page_to_mfn(pg);
kaf24@11310 1695 p = sh_map_domain_page(mfn);
kaf24@11310 1696 clear_page(p);
kaf24@11310 1697 sh_unmap_domain_page(p);
kaf24@11310 1698
Tim@13909 1699 return pg;
Tim@13909 1700 }
Tim@13909 1701
keir@16091 1702 static void
Tim@13909 1703 shadow_free_p2m_page(struct domain *d, struct page_info *pg)
Tim@13909 1704 {
Tim@13909 1705 ASSERT(page_get_owner(pg) == d);
Tim@13909 1706 /* Should have just the one ref we gave it in alloc_p2m_page() */
Tim@13909 1707 if ( (pg->count_info & PGC_count_mask) != 1 )
Tim@13909 1708 {
Tim@13909 1709 SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
Tim@13909 1710 pg->count_info, pg->u.inuse.type_info);
Tim@13909 1711 }
Tim@14310 1712 pg->count_info = 0;
Tim@13909 1713 /* Free should not decrement domain's total allocation, since
Tim@13909 1714 * these pages were allocated without an owner. */
Tim@13909 1715 page_set_owner(pg, NULL);
keir@18071 1716 #if defined(__x86_64__)
keir@18071 1717 spin_lock_init(&pg->lock);
keir@18071 1718 #endif
Tim@13909 1719 free_domheap_pages(pg, 0);
Tim@13909 1720 d->arch.paging.shadow.p2m_pages--;
Tim@13909 1721 perfc_decr(shadow_alloc_count);
kaf24@11310 1722 }
kaf24@11310 1723
kaf24@11310 1724 #if CONFIG_PAGING_LEVELS == 3
kaf24@11310 1725 static void p2m_install_entry_in_monitors(struct domain *d,
kaf24@11310 1726 l3_pgentry_t *l3e)
kaf24@11310 1727 /* Special case, only used for external-mode domains on PAE hosts:
kaf24@11310 1728 * update the mapping of the p2m table. Once again, this is trivial in
kaf24@11310 1729 * other paging modes (one top-level entry points to the top-level p2m,
kaf24@11310 1730 * no maintenance needed), but PAE makes life difficult by needing a
kaf24@11310 1731 * copy of the eight l3es of the p2m table in eight l2h slots in the
kaf24@11310 1732 * monitor table. This function makes fresh copies when a p2m l3e
kaf24@11310 1733 * changes. */
kaf24@11310 1734 {
kaf24@11310 1735 l2_pgentry_t *ml2e;
kaf24@11310 1736 struct vcpu *v;
kaf24@11310 1737 unsigned int index;
kaf24@11310 1738
kaf24@11310 1739 index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
kaf24@11310 1740 ASSERT(index < MACHPHYS_MBYTES>>1);
kaf24@11310 1741
kaf24@11310 1742 for_each_vcpu(d, v)
kaf24@11310 1743 {
kaf24@11310 1744 if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
kaf24@11310 1745 continue;
kaf24@11310 1746 ASSERT(shadow_mode_external(v->domain));
kaf24@11310 1747
kaf24@11310 1748 SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
kaf24@11310 1749 d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
kaf24@11310 1750
kaf24@11310 1751 if ( v == current ) /* OK to use linear map of monitor_table */
kaf24@11310 1752 ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
kaf24@11310 1753 else
kaf24@11310 1754 {
kaf24@11310 1755 l3_pgentry_t *ml3e;
kaf24@11310 1756 ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
kaf24@11310 1757 ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
kaf24@11310 1758 ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
kaf24@11310 1759 ml2e += l2_table_offset(RO_MPT_VIRT_START);
kaf24@11310 1760 sh_unmap_domain_page(ml3e);
kaf24@11310 1761 }
kaf24@11310 1762 ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
kaf24@11310 1763 if ( v != current )
kaf24@11310 1764 sh_unmap_domain_page(ml2e);
kaf24@11310 1765 }
kaf24@11310 1766 }
kaf24@11310 1767 #endif
kaf24@11310 1768
kaf24@11310 1769 /* Set the pool of shadow pages to the required number of pages.
kaf24@11310 1770 * Input will be rounded up to at least shadow_min_acceptable_pages(),
kaf24@11310 1771 * plus space for the p2m table.
kaf24@11310 1772 * Returns 0 for success, non-zero for failure. */
Tim@13141 1773 static unsigned int sh_set_allocation(struct domain *d,
Tim@13141 1774 unsigned int pages,
Tim@13141 1775 int *preempted)
kaf24@11310 1776 {
Tim@12561 1777 struct shadow_page_info *sp;
kaf24@11310 1778 unsigned int lower_bound;
Tim@16130 1779 unsigned int j, order = shadow_max_order(d);
kaf24@11310 1780
Tim@13137 1781 ASSERT(shadow_locked_by_me(d));
kaf24@11310 1782
kaf24@11310 1783 /* Don't allocate less than the minimum acceptable, plus one page per
kaf24@11310 1784 * megabyte of RAM (for the p2m table) */
kaf24@11310 1785 lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256);
kaf24@11310 1786 if ( pages > 0 && pages < lower_bound )
kaf24@11310 1787 pages = lower_bound;
kaf24@11310 1788 /* Round up to largest block size */
kaf24@11310 1789 pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1);
kaf24@11310 1790
kaf24@11310 1791 SHADOW_PRINTK("current %i target %i\n",
Tim@13909 1792 d->arch.paging.shadow.total_pages, pages);
Tim@13909 1793
Tim@13909 1794 while ( d->arch.paging.shadow.total_pages != pages )
kaf24@11310 1795 {
Tim@13909 1796 if ( d->arch.paging.shadow.total_pages < pages )
kaf24@11310 1797 {
kaf24@11310 1798 /* Need to allocate more memory from domheap */
Tim@12561 1799 sp = (struct shadow_page_info *)
keir@17385 1800 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
Tim@12561 1801 if ( sp == NULL )
kaf24@11310 1802 {
kaf24@11310 1803 SHADOW_PRINTK("failed to allocate shadow pages.\n");
kaf24@11310 1804 return -ENOMEM;
kaf24@11310 1805 }
Tim@16130 1806 d->arch.paging.shadow.free_pages += 1 << order;
Tim@16130 1807 d->arch.paging.shadow.total_pages += 1 << order;
Tim@16130 1808 for ( j = 0; j < 1U << order; j++ )
kaf24@11310 1809 {
Tim@12561 1810 sp[j].type = 0;
Tim@12561 1811 sp[j].pinned = 0;
Tim@12561 1812 sp[j].count = 0;
Tim@12561 1813 sp[j].mbz = 0;
Tim@12561 1814 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
kaf24@11310 1815 }
Tim@16130 1816 sp->order = order;
Tim@16130 1817 list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
kaf24@11310 1818 }
Tim@13909 1819 else if ( d->arch.paging.shadow.total_pages > pages )
kaf24@11310 1820 {
kaf24@11310 1821 /* Need to return memory to domheap */
Tim@16130 1822 _shadow_prealloc(d, order, 1);
Tim@16130 1823 ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
Tim@16130 1824 sp = list_entry(d->arch.paging.shadow.freelists[order].next,
Tim@12561 1825 struct shadow_page_info, list);
Tim@12561 1826 list_del(&sp->list);
keir@17913 1827 #if defined(__x86_64__)
keir@17913 1828 /*
keir@17913 1829 * Re-instate lock field which we overwrite with shadow_page_info.
keir@17913 1830 * This was safe, since the lock is only used on guest pages.
keir@17913 1831 */
keir@17913 1832 for ( j = 0; j < 1U << order; j++ )
keir@17913 1833 spin_lock_init(&((struct page_info *)sp)[j].lock);
keir@17913 1834 #endif
Tim@16130 1835 d->arch.paging.shadow.free_pages -= 1 << order;
Tim@16130 1836 d->arch.paging.shadow.total_pages -= 1 << order;
Tim@16130 1837 free_domheap_pages((struct page_info *)sp, order);
kaf24@11310 1838 }
kaf24@11310 1839
kaf24@11310 1840 /* Check to see if we need to yield and try again */
kaf24@11310 1841 if ( preempted && hypercall_preempt_check() )
kaf24@11310 1842 {
kaf24@11310 1843 *preempted = 1;
kaf24@11310 1844 return 0;
kaf24@11310 1845 }
kaf24@11310 1846 }
kaf24@11310 1847
kaf24@11310 1848 return 0;
kaf24@11310 1849 }
kaf24@11310 1850
Tim@13141 1851 /* Return the size of the shadow pool, rounded up to the nearest MB */
Tim@13141 1852 static unsigned int shadow_get_allocation(struct domain *d)
kaf24@11310 1853 {
Tim@13909 1854 unsigned int pg = d->arch.paging.shadow.total_pages;
Tim@13141 1855 return ((pg >> (20 - PAGE_SHIFT))
Tim@13141 1856 + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
kaf24@11310 1857 }
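/* e.g. with 4kB pages (PAGE_SHIFT == 12) the shift is 20 - 12 == 8, so a
 * pool of 300 pages reports (300 >> 8) + 1 == 2MB, while an exact
 * multiple such as 512 pages reports exactly 2MB. */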
kaf24@11310 1858
kaf24@11310 1859 /**************************************************************************/
Tim@12562 1860 /* Hash table for storing the guest->shadow mappings.
Tim@12562 1861 * The table itself is an array of pointers to shadows; the shadows are then
Tim@12562 1862 * threaded on a singly-linked list of shadows with the same hash value */
Tim@12562 1863
Tim@12562 1864 #define SHADOW_HASH_BUCKETS 251
Tim@12562 1865 /* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
kaf24@11310 1866
kaf24@11310 1867 /* Hash function that takes a gfn or mfn, plus another byte of type info */
kaf24@11310 1868 typedef u32 key_t;
Tim@12562 1869 static inline key_t sh_hash(unsigned long n, unsigned int t)
kaf24@11310 1870 {
kaf24@11310 1871 unsigned char *p = (unsigned char *)&n;
kaf24@11310 1872 key_t k = t;
kaf24@11310 1873 int i;
kaf24@11310 1874 for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
Tim@12562 1875 return k % SHADOW_HASH_BUCKETS;
kaf24@11310 1876 }
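/* (The per-byte step is equivalent to k = p[i] + k * 65599, since
 * (k<<6) + (k<<16) - k == k * 65599 -- the multiplier used by the classic
 * sdbm string hash -- with the result folded down modulo
 * SHADOW_HASH_BUCKETS at the end.) */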
kaf24@11310 1877
kaf24@11310 1878 #if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)
kaf24@11310 1879
kaf24@11310 1880 /* Before we get to the mechanism, define a pair of audit functions
kaf24@11310 1881 * that sanity-check the contents of the hash table. */
kaf24@11310 1882 static void sh_hash_audit_bucket(struct domain *d, int bucket)
kaf24@11310 1883 /* Audit one bucket of the hash table */
kaf24@11310 1884 {
Tim@12562 1885 struct shadow_page_info *sp, *x;
kaf24@11310 1886
kaf24@11310 1887 if ( !(SHADOW_AUDIT_ENABLE) )
kaf24@11310 1888 return;
kaf24@11310 1889
Tim@13909 1890 sp = d->arch.paging.shadow.hash_table[bucket];
Tim@12562 1891 while ( sp )
kaf24@11310 1892 {
kaf24@11310 1893 /* Not a shadow? */
Tim@12561 1894 BUG_ON( sp->mbz != 0 );
Tim@12562 1895 /* Bogus type? */
Tim@12562 1896 BUG_ON( sp->type == 0 );
Tim@12562 1897 BUG_ON( sp->type > SH_type_max_shadow );
Tim@12562 1898 /* Wrong bucket? */
Tim@12562 1899 BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket );
Tim@12562 1900 /* Duplicate entry? */
Tim@12562 1901 for ( x = sp->next_shadow; x; x = x->next_shadow )
Tim@12562 1902 BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
Tim@12562 1903 /* Follow the backpointer to the guest pagetable */
Tim@12562 1904 if ( sp->type != SH_type_fl1_32_shadow
Tim@12562 1905 && sp->type != SH_type_fl1_pae_shadow
Tim@12562 1906 && sp->type != SH_type_fl1_64_shadow )
kaf24@11310 1907 {
Tim@12562 1908 struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
kaf24@11310 1909 /* Bad shadow flags on guest page? */
Tim@12562 1910 BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
Tim@12069 1911 /* Bad type count on guest page? */
keir@17903 1912 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 1913 if ( sp->type == SH_type_l1_32_shadow
keir@17903 1914 || sp->type == SH_type_l1_pae_shadow
keir@17903 1915 || sp->type == SH_type_l1_64_shadow )
keir@17903 1916 {
keir@17903 1917 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
keir@17903 1918 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
keir@17903 1919 {
keir@17903 1920 if ( !page_is_out_of_sync(gpg) )
keir@17903 1921 {
keir@17903 1922 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
keir@17903 1923 " and not OOS but has typecount %#lx\n",
keir@17903 1924 sp->backpointer,
keir@17903 1925 mfn_x(shadow_page_to_mfn(sp)),
keir@17903 1926 gpg->u.inuse.type_info);
keir@17903 1927 BUG();
keir@17903 1928 }
keir@17903 1929 }
keir@17903 1930 }
keir@17903 1931 else /* Not an l1 */
keir@17903 1932 #endif
Tim@12069 1933 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
Tim@12069 1934 && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
Tim@12069 1935 {
Tim@13909 1936 SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
Tim@12069 1937 " but has typecount %#lx\n",
Tim@12562 1938 sp->backpointer, mfn_x(shadow_page_to_mfn(sp)),
Tim@12562 1939 gpg->u.inuse.type_info);
Tim@12069 1940 BUG();
Tim@12069 1941 }
kaf24@11310 1942 }
kaf24@11310 1943 /* That entry was OK; on we go */
Tim@12562 1944 sp = sp->next_shadow;
kaf24@11310 1945 }
kaf24@11310 1946 }
kaf24@11310 1947
kaf24@11310 1948 #else
Tim@12562 1949 #define sh_hash_audit_bucket(_d, _b) do {} while(0)
kaf24@11310 1950 #endif /* Hashtable bucket audit */
kaf24@11310 1951
kaf24@11310 1952
kaf24@11310 1953 #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL
kaf24@11310 1954
kaf24@11310 1955 static void sh_hash_audit(struct domain *d)
kaf24@11310 1956 /* Full audit: audit every bucket in the table */
kaf24@11310 1957 {
kaf24@11310 1958 int i;
kaf24@11310 1959
kaf24@11310 1960 if ( !(SHADOW_AUDIT_ENABLE) )
kaf24@11310 1961 return;
kaf24@11310 1962
kaf24@11310 1963 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
kaf24@11310 1964 {
kaf24@11310 1965 sh_hash_audit_bucket(d, i);
kaf24@11310 1966 }
kaf24@11310 1967 }
kaf24@11310 1968
kaf24@11310 1969 #else
Tim@12562 1970 #define sh_hash_audit(_d) do {} while(0)
kaf24@11310 1971 #endif /* Hashtable bucket audit */
kaf24@11310 1972
kaf24@11310 1973 /* Allocate and initialise the table itself.
kaf24@11310 1974 * Returns 0 for success, 1 for error. */
kaf24@11310 1975 static int shadow_hash_alloc(struct domain *d)
kaf24@11310 1976 {
Tim@12562 1977 struct shadow_page_info **table;
kaf24@11310 1978
Tim@13137 1979 ASSERT(shadow_locked_by_me(d));
Tim@13909 1980 ASSERT(!d->arch.paging.shadow.hash_table);
kaf24@11310 1981
Tim@12562 1982 table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
kaf24@11310 1983 if ( !table ) return 1;
kaf24@11310 1984 memset(table, 0,
Tim@12562 1985 SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
Tim@13909 1986 d->arch.paging.shadow.hash_table = table;
kaf24@11310 1987 return 0;
kaf24@11310 1988 }
kaf24@11310 1989
kaf24@11310 1990 /* Tear down the hash table and return all memory to Xen.
kaf24@11310 1991 * This function does not care whether the table is populated. */
kaf24@11310 1992 static void shadow_hash_teardown(struct domain *d)
kaf24@11310 1993 {
Tim@13137 1994 ASSERT(shadow_locked_by_me(d));
Tim@13909 1995 ASSERT(d->arch.paging.shadow.hash_table);
Tim@13909 1996
Tim@13909 1997 xfree(d->arch.paging.shadow.hash_table);
Tim@13909 1998 d->arch.paging.shadow.hash_table = NULL;
kaf24@11310 1999 }
kaf24@11310 2000
kaf24@11310 2001
Tim@12562 2002 mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
kaf24@11310 2003 /* Find an entry in the hash table. Returns the MFN of the shadow,
kaf24@11310 2004 * or INVALID_MFN if it doesn't exist */
kaf24@11310 2005 {
kaf24@11310 2006 struct domain *d = v->domain;
Tim@12562 2007 struct shadow_page_info *sp, *prev;
kaf24@11310 2008 key_t key;
kaf24@11310 2009
Tim@13137 2010 ASSERT(shadow_locked_by_me(d));
Tim@13909 2011 ASSERT(d->arch.paging.shadow.hash_table);
kaf24@11310 2012 ASSERT(t);
kaf24@11310 2013
kaf24@11310 2014 sh_hash_audit(d);
kaf24@11310 2015
kfraser@14595 2016 perfc_incr(shadow_hash_lookups);
kaf24@11310 2017 key = sh_hash(n, t);
Tim@12562 2018 sh_hash_audit_bucket(d, key);
Tim@12562 2019
Tim@13909 2020 sp = d->arch.paging.shadow.hash_table[key];
Tim@12562 2021 prev = NULL;
Tim@12562 2022 while(sp)
kaf24@11310 2023 {
Tim@12562 2024 if ( sp->backpointer == n && sp->type == t )
kaf24@11310 2025 {
Tim@12562 2026 /* Pull-to-front if 'sp' isn't already the head item */
Tim@13909 2027 if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
kaf24@11310 2028 {
Tim@13909 2029 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
kaf24@11310 2030 /* Can't reorder: someone is walking the hash chains */
Tim@12562 2031 return shadow_page_to_mfn(sp);
kaf24@11310 2032 else
kaf24@11310 2033 {
Tim@12562 2034 ASSERT(prev);
Tim@12562 2035 /* Delete sp from the list */
Tim@12562 2036 prev->next_shadow = sp->next_shadow;
Tim@12562 2037 /* Re-insert it at the head of the list */
Tim@13909 2038 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
Tim@13909 2039 d->arch.paging.shadow.hash_table[key] = sp;
kaf24@11310 2040 }
kaf24@11310 2041 }
kaf24@11310 2042 else
kaf24@11310 2043 {
kfraser@14595 2044 perfc_incr(shadow_hash_lookup_head);
kaf24@11310 2045 }
Tim@12562 2046 return shadow_page_to_mfn(sp);
kaf24@11310 2047 }
Tim@12562 2048 prev = sp;
Tim@12562 2049 sp = sp->next_shadow;
kaf24@11310 2050 }
kaf24@11310 2051
kfraser@14595 2052 perfc_incr(shadow_hash_lookup_miss);
kaf24@11310 2053 return _mfn(INVALID_MFN);
kaf24@11310 2054 }
kaf24@11310 2055
Tim@12562 2056 void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
Tim@12562 2057 mfn_t smfn)
kaf24@11310 2058 /* Put a mapping (n,t)->smfn into the hash table */
kaf24@11310 2059 {
kaf24@11310 2060 struct domain *d = v->domain;
Tim@12562 2061 struct shadow_page_info *sp;
kaf24@11310 2062 key_t key;
kaf24@11310 2063
Tim@13137 2064 ASSERT(shadow_locked_by_me(d));
Tim@13909 2065 ASSERT(d->arch.paging.shadow.hash_table);
kaf24@11310 2066 ASSERT(t);
kaf24@11310 2067
kaf24@11310 2068 sh_hash_audit(d);
kaf24@11310 2069
kfraser@14595 2070 perfc_incr(shadow_hash_inserts);
kaf24@11310 2071 key = sh_hash(n, t);
Tim@12562 2072 sh_hash_audit_bucket(d, key);
kaf24@11310 2073
Tim@12562 2074 /* Insert this shadow at the top of the bucket */
Tim@12562 2075 sp = mfn_to_shadow_page(smfn);
Tim@13909 2076 sp->next_shadow = d->arch.paging.shadow.hash_table[key];
Tim@13909 2077 d->arch.paging.shadow.hash_table[key] = sp;
Tim@12562 2078
Tim@12562 2079 sh_hash_audit_bucket(d, key);
kaf24@11310 2080 }
kaf24@11310 2081
Tim@12562 2082 void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
Tim@12562 2083 mfn_t smfn)
kaf24@11310 2084 /* Excise the mapping (n,t)->smfn from the hash table */
kaf24@11310 2085 {
kaf24@11310 2086 struct domain *d = v->domain;
Tim@12562 2087 struct shadow_page_info *sp, *x;
kaf24@11310 2088 key_t key;
kaf24@11310 2089
Tim@13137 2090 ASSERT(shadow_locked_by_me(d));
Tim@13909 2091 ASSERT(d->arch.paging.shadow.hash_table);
kaf24@11310 2092 ASSERT(t);
kaf24@11310 2093
kaf24@11310 2094 sh_hash_audit(d);
kaf24@11310 2095
kfraser@14595 2096 perfc_incr(shadow_hash_deletes);
kaf24@11310 2097 key = sh_hash(n, t);
Tim@12562 2098 sh_hash_audit_bucket(d, key);
Tim@12562 2099
Tim@12562 2100 sp = mfn_to_shadow_page(smfn);
Tim@13909 2101 if ( d->arch.paging.shadow.hash_table[key] == sp )
Tim@12562 2102 /* Easy case: we're deleting the head item. */
Tim@13909 2103 d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
kaf24@11310 2104 else
kaf24@11310 2105 {
Tim@12562 2106 /* Need to search for the one we want */
Tim@13909 2107 x = d->arch.paging.shadow.hash_table[key];
Tim@12562 2108 while ( 1 )
kaf24@11310 2109 {
kaf24@11310 2110 ASSERT(x); /* We can't have hit the end, since our target is
kaf24@11310 2111 * still in the chain somewhere... */
Tim@12562 2112 if ( x->next_shadow == sp )
kaf24@11310 2113 {
Tim@12562 2114 x->next_shadow = sp->next_shadow;
kaf24@11310 2115 break;
kaf24@11310 2116 }
Tim@12562 2117 x = x->next_shadow;
kaf24@11310 2118 }
kaf24@11310 2119 }
Tim@12562 2120 sp->next_shadow = NULL;
Tim@12562 2121
Tim@12562 2122 sh_hash_audit_bucket(d, key);
kaf24@11310 2123 }
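/* Illustrative lookup-then-insert pairing (a sketch only, assuming v, a
 * guest pagetable mfn gmfn and a shadow_type in scope, with the shadow
 * lock held; the per-mode code wraps this pattern in its own
 * get/set_shadow_status helpers): */
#if 0
    mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type);
    if ( !mfn_valid(smfn) )
    {
        /* No shadow yet: make one (after a suitable shadow_prealloc()
         * in real code) and enter it in the hash table */
        smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn));
        shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn);
    }
#endif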
kaf24@11310 2124
kaf24@11310 2125 typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
kaf24@11310 2126
kaf24@11310 2127 static void hash_foreach(struct vcpu *v,
kaf24@11310 2128 unsigned int callback_mask,
kaf24@11310 2129 hash_callback_t callbacks[],
kaf24@11310 2130 mfn_t callback_mfn)
kaf24@11310 2131 /* Walk the hash table looking at the types of the entries and
kaf24@11310 2132 * calling the appropriate callback function for each entry.
kaf24@11310 2133 * The mask determines which shadow types we call back for, and the array
kaf24@11310 2134 * of callbacks tells us which function to call.
kaf24@11310 2135 * Any callback may return non-zero to let us skip the rest of the scan.
kaf24@11310 2136 *
kaf24@11310 2137 * WARNING: Callbacks MUST NOT add or remove hash entries unless they
kaf24@11310 2138 * then return non-zero to terminate the scan. */
kaf24@11310 2139 {
kaf24@11310 2140 int i, done = 0;
kaf24@11310 2141 struct domain *d = v->domain;
Tim@12562 2142 struct shadow_page_info *x;
kaf24@11310 2143
kaf24@11310 2144 /* Say we're here, to stop hash-lookups reordering the chains */
Tim@13137 2145 ASSERT(shadow_locked_by_me(d));
Tim@13909 2146 ASSERT(d->arch.paging.shadow.hash_walking == 0);
Tim@13909 2147 d->arch.paging.shadow.hash_walking = 1;
kaf24@11310 2148
kaf24@11310 2149 for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ )
kaf24@11310 2150 {
kaf24@11310 2151 /* WARNING: This is not safe against changes to the hash table.
kaf24@11310 2152 * The callback *must* return non-zero if it has inserted or
kaf24@11310 2153 * deleted anything from the hash (lookups are OK, though). */
Tim@13909 2154 for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
kaf24@11310 2155 {
Tim@12562 2156 if ( callback_mask & (1 << x->type) )
kaf24@11310 2157 {
Tim@12562 2158 ASSERT(x->type <= 15);
Tim@12562 2159 ASSERT(callbacks[x->type] != NULL);
Tim@12562 2160 done = callbacks[x->type](v, shadow_page_to_mfn(x),
Tim@12562 2161 callback_mfn);
Tim@12562 2162 if ( done ) break;
kaf24@11310 2163 }
kaf24@11310 2164 }
kaf24@11310 2165 if ( done ) break;
kaf24@11310 2166 }
Tim@13909 2167 d->arch.paging.shadow.hash_walking = 0;
kaf24@11310 2168 }
kaf24@11310 2169
kaf24@11310 2170
kaf24@11310 2171 /**************************************************************************/
kaf24@11310 2172 /* Destroy a shadow page: simple dispatcher to call the per-type destructor
kaf24@11310 2173 * which will decrement refcounts appropriately and return memory to the
kaf24@11310 2174 * free pool. */
kaf24@11310 2175
kaf24@11310 2176 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
kaf24@11310 2177 {
Tim@12561 2178 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
Tim@12561 2179 unsigned int t = sp->type;
kaf24@11310 2180
kaf24@11310 2181
kaf24@11310 2182 SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
kaf24@11310 2183
kaf24@11310 2184 /* Double-check, if we can, that the shadowed page belongs to this
kaf24@11310 2185 * domain, (by following the back-pointer). */
Tim@12561 2186 ASSERT(t == SH_type_fl1_32_shadow ||
Tim@12561 2187 t == SH_type_fl1_pae_shadow ||
Tim@12561 2188 t == SH_type_fl1_64_shadow ||
Tim@12561 2189 t == SH_type_monitor_table ||
kfraser@14974 2190 (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
Tim@12561 2191 (page_get_owner(mfn_to_page(_mfn(sp->backpointer)))
kaf24@11310 2192 == v->domain));
kaf24@11310 2193
kaf24@11310 2194 /* Dispatch on the shadow type: the SH_type_* values are small
kaf24@11310 2195 * consecutive numbers, so this switch compiles nicely */
Tim@12561 2196 switch ( t )
kaf24@11310 2197 {
Tim@12561 2198 case SH_type_l1_32_shadow:
Tim@12561 2199 case SH_type_fl1_32_shadow:
keir@17620 2200 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn);
kaf24@11310 2201 break;
Tim@12561 2202 case SH_type_l2_32_shadow:
keir@17620 2203 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn);
kaf24@11310 2204 break;
keir@17618 2205
Tim@12561 2206 case SH_type_l1_pae_shadow:
Tim@12561 2207 case SH_type_fl1_pae_shadow:
keir@17620 2208 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn);
kaf24@11310 2209 break;
Tim@12561 2210 case SH_type_l2_pae_shadow:
Tim@12561 2211 case SH_type_l2h_pae_shadow:
keir@17620 2212 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn);
kaf24@11310 2213 break;
kaf24@11310 2214
kaf24@11310 2215 #if CONFIG_PAGING_LEVELS >= 4
Tim@12561 2216 case SH_type_l1_64_shadow:
Tim@12561 2217 case SH_type_fl1_64_shadow:
keir@17620 2218 SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn);
kaf24@11310 2219 break;
ack@14013 2220 case SH_type_l2h_64_shadow:
kfraser@14974 2221 ASSERT(is_pv_32on64_vcpu(v));
Tim@14175 2222 /* Fall through... */
Tim@12561 2223 case SH_type_l2_64_shadow:
keir@17620 2224 SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn);
kaf24@11310 2225 break;
Tim@12561 2226 case SH_type_l3_64_shadow:
keir@17620 2227 SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn);
kaf24@11310 2228 break;
Tim@12561 2229 case SH_type_l4_64_shadow:
keir@17620 2230 SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn);
kaf24@11310 2231 break;
kaf24@11310 2232 #endif
kaf24@11310 2233 default:
keir@16090 2234 SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n",
keir@16090 2235 (unsigned long)t);
kaf24@11310 2236 BUG();
kaf24@11310 2237 }
kaf24@11310 2238 }
kaf24@11310 2239
keir@18454 2240 static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
keir@18454 2241 {
keir@18454 2242 if ( tb_init_done )
keir@18454 2243 {
keir@18454 2244 /* Convert gmfn to gfn */
keir@18454 2245 unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
keir@18454 2246 __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
keir@18454 2247 }
keir@18454 2248 }
keir@18454 2249
kaf24@11310 2250 /**************************************************************************/
kaf24@11310 2251 /* Remove all writeable mappings of a guest frame from the shadow tables
kaf24@11310 2252 * Returns non-zero if we need to flush TLBs.
kaf24@11310 2253 * level and fault_addr describe how we found this to be a pagetable;
keir@17903 2254 * level==0 means we have some other reason for revoking write access.
keir@17903 2255 * If level==0 we are allowed to fail, returning -1. */
kaf24@11310 2256
Tim@13141 2257 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
Tim@13141 2258 unsigned int level,
Tim@13141 2259 unsigned long fault_addr)
kaf24@11310 2260 {
kaf24@11310 2261 /* Dispatch table for getting per-type functions */
ack@14013 2262 static hash_callback_t callbacks[SH_type_unused] = {
kaf24@11310 2263 NULL, /* none */
keir@17620 2264 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */
keir@17620 2265 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */
kaf24@11310 2266 NULL, /* l2_32 */
keir@17620 2267 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */
keir@17620 2268 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */
kaf24@11310 2269 NULL, /* l2_pae */
kaf24@11310 2270 NULL, /* l2h_pae */
kaf24@11310 2271 #if CONFIG_PAGING_LEVELS >= 4
keir@17620 2272 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */
keir@17620 2273 SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */
kaf24@11310 2274 #else
kaf24@11310 2275 NULL, /* l1_64 */
kaf24@11310 2276 NULL, /* fl1_64 */
kaf24@11310 2277 #endif
kaf24@11310 2278 NULL, /* l2_64 */
ack@14013 2279 NULL, /* l2h_64 */
kaf24@11310 2280 NULL, /* l3_64 */
kaf24@11310 2281 NULL, /* l4_64 */
kaf24@11310 2282 NULL, /* p2m */
kaf24@11310 2283 NULL /* unused */
kaf24@11310 2284 };
kaf24@11310 2285
kaf24@11310 2286 static unsigned int callback_mask =
Tim@12561 2287 1 << SH_type_l1_32_shadow
Tim@12561 2288 | 1 << SH_type_fl1_32_shadow
Tim@12561 2289 | 1 << SH_type_l1_pae_shadow
Tim@12561 2290 | 1 << SH_type_fl1_pae_shadow
Tim@12561 2291 | 1 << SH_type_l1_64_shadow
Tim@12561 2292 | 1 << SH_type_fl1_64_shadow
kaf24@11310 2293 ;
kaf24@11310 2294 struct page_info *pg = mfn_to_page(gmfn);
kaf24@11310 2295
Tim@13137 2296 ASSERT(shadow_locked_by_me(v->domain));
kaf24@11310 2297
kaf24@11310 2298 /* Only remove writable mappings if we are doing shadow refcounts.
kaf24@11310 2299 * In guest refcounting, we trust Xen to already be restricting
kaf24@11310 2300 * all the writes to the guest page tables, so we do not need to
kaf24@11310 2301 * do more. */
kaf24@11310 2302 if ( !shadow_mode_refcounts(v->domain) )
kaf24@11310 2303 return 0;
kaf24@11310 2304
kaf24@11310 2305 /* Early exit if it's already a pagetable, or otherwise not writeable */
keir@17903 2306 if ( (sh_mfn_is_a_page_table(gmfn)
keir@17903 2307 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 2308 /* Unless they've been allowed to go out of sync with their shadows */
keir@17903 2309 && !mfn_oos_may_write(gmfn)
keir@17903 2310 #endif
keir@17903 2311 )
kaf24@11310 2312 || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
kaf24@11310 2313 return 0;
kaf24@11310 2314
keir@18454 2315 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
keir@18454 2316
kfraser@14595 2317 perfc_incr(shadow_writeable);
kaf24@11310 2318
kaf24@11310 2319 /* If this isn't a "normal" writeable page, the domain is trying to
kaf24@11310 2320 * put pagetables in special memory of some kind. We can't allow that. */
kaf24@11310 2321 if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
kaf24@11310 2322 {
kaf24@11310 2323 SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %"
kaf24@11310 2324 PRtype_info "\n",
kaf24@11310 2325 mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
kaf24@11310 2326 domain_crash(v->domain);
kaf24@11310 2327 }
kaf24@11310 2328
kaf24@11310 2329 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
keir@17903 2330 if ( v == current )
kaf24@11310 2331 {
kaf24@11310 2332 unsigned long gfn;
kaf24@11310 2333 /* Heuristic: there is likely to be only one writeable mapping,
kaf24@11310 2334 * and that mapping is likely to be in the current pagetable,
Tim@11868 2335 * in the guest's linear map (on non-HIGHPTE linux and windows)*/
kaf24@11310 2336
keir@18454 2337 #define GUESS(_a, _h) do { \
Tim@13909 2338 if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
keir@18454 2339 perfc_incr(shadow_writeable_h_ ## _h); \
keir@18454 2340 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \
keir@18454 2341 { \
keir@18454 2342 TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \
keir@18454 2343 return 1; \
keir@18454 2344 } \
kaf24@11310 2345 } while (0)
kaf24@11310 2346
keir@17903 2347 if ( level == 0 && fault_addr )
keir@17903 2348 GUESS(fault_addr, 6);
kaf24@11310 2349
Tim@13909 2350 if ( v->arch.paging.mode->guest_levels == 2 )
kaf24@11310 2351 {
kaf24@11310 2352 if ( level == 1 )
kaf24@11310 2353 /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
kaf24@11310 2354 GUESS(0xC0000000UL + (fault_addr >> 10), 1);
steven@11323 2355
steven@11323 2356 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
Tim@13909 2357 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
steven@11323 2358 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
steven@11323 2359
kaf24@11310 2360 }
Tim@13909 2361 else if ( v->arch.paging.mode->guest_levels == 3 )
kaf24@11310 2362 {
kaf24@11310 2363 /* 32bit PAE w2k3: linear map at 0xC0000000 */
kaf24@11310 2364 switch ( level )
kaf24@11310 2365 {
kaf24@11310 2366 case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
kaf24@11310 2367 case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
kaf24@11310 2368 }
steven@11323 2369
steven@11323 2370 /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */
Tim@13909 2371 if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 )
steven@11323 2372 GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4);
kaf24@11310 2373 }
kaf24@11310 2374 #if CONFIG_PAGING_LEVELS >= 4
Tim@13909 2375 else if ( v->arch.paging.mode->guest_levels == 4 )
kaf24@11310 2376 {
Tim@15057 2377 /* 64bit w2k3: linear map at 0xfffff68000000000 */
kaf24@11310 2378 switch ( level )
kaf24@11310 2379 {
Tim@15057 2380 case 1: GUESS(0xfffff68000000000UL
Tim@15057 2381 + ((fault_addr & VADDR_MASK) >> 9), 3); break;
Tim@15057 2382 case 2: GUESS(0xfffff6fb40000000UL
Tim@15057 2383 + ((fault_addr & VADDR_MASK) >> 18), 3); break;
Tim@15057 2384 case 3: GUESS(0xfffff6fb7da00000UL
Tim@15057 2385 + ((fault_addr & VADDR_MASK) >> 27), 3); break;
kaf24@11310 2386 }
steven@11323 2387
Tim@12442 2388 /* 64bit Linux direct map at 0xffff810000000000; older kernels
Tim@12442 2389 * had it at 0x0000010000000000UL */
Tim@13909 2390 gfn = mfn_to_gfn(v->domain, gmfn);
steven@11323 2391 GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4);
Tim@12442 2392 GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4);
keir@17787 2393 /*
keir@17787 2394 * 64bit Solaris kernel page map at
keir@17787 2395 * kpm_vbase (0xfffffe0000000000UL)
keir@17787 2396 */
keir@17787 2397 GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4);
kaf24@11310 2398 }
kaf24@11310 2399 #endif /* CONFIG_PAGING_LEVELS >= 4 */
kaf24@11310 2400
kaf24@11310 2401 #undef GUESS
kaf24@11310 2402 }
Tim@11868 2403
Tim@11868 2404 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
Tim@11868 2405 return 1;
Tim@11868 2406
Tim@11868 2407 /* Second heuristic: on HIGHPTE linux, there are two particular PTEs
Tim@11868 2408 * (entries in the fixmap) where linux maps its pagetables. Since
Tim@11868 2409 * we expect to hit them most of the time, we start the search for
Tim@11868 2410 * the writeable mapping by looking at the same MFN where the last
Tim@11868 2411 * brute-force search succeeded. */
Tim@11868 2412
Tim@13909 2413 if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 )
Tim@11868 2414 {
Tim@11868 2415 unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
Tim@13909 2416 mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
Tim@12561 2417 int shtype = mfn_to_shadow_page(last_smfn)->type;
Tim@11868 2418
Tim@11868 2419 if ( callbacks[shtype] )
Tim@11868 2420 callbacks[shtype](v, last_smfn, gmfn);
Tim@11868 2421
Tim@11868 2422 if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count )
kfraser@14595 2423 perfc_incr(shadow_writeable_h_5);
Tim@11868 2424 }
Tim@11868 2425
Tim@11868 2426 if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )
Tim@11868 2427 return 1;
Tim@11868 2428
Tim@11868 2429 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
kaf24@11310 2430
kaf24@11310 2431 /* Brute-force search of all the shadows, by walking the hash */
keir@18454 2432 trace_shadow_wrmap_bf(gmfn);
keir@17904 2433 if ( level == 0 )
keir@17904 2434 perfc_incr(shadow_writeable_bf_1);
keir@17904 2435 else
keir@17904 2436 perfc_incr(shadow_writeable_bf);
kaf24@11310 2437 hash_foreach(v, callback_mask, callbacks, gmfn);
kaf24@11310 2438
Tim@15505 2439 /* If that didn't catch the mapping, then there's some non-pagetable
Tim@15505 2440 * mapping -- ioreq page, grant mapping, &c. */
kaf24@11310 2441 if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
kaf24@11310 2442 {
keir@17903 2443 if ( level == 0 )
keir@17903 2444 return -1;
keir@17903 2445
Tim@15505 2446 SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
Tim@15505 2447 "%lu special-use mappings of it\n", mfn_x(gmfn),
kaf24@11310 2448 (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
kaf24@11310 2449 domain_crash(v->domain);
kaf24@11310 2450 }
kaf24@11310 2451
kaf24@11310 2452 /* We killed at least one writeable mapping, so must flush TLBs. */
kaf24@11310 2453 return 1;
kaf24@11310 2454 }
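/* A note on the GUESS arithmetic above: for the 2-level guest case,
 * fault_addr >> 10 is the byte offset of fault_addr's 4-byte PTE within
 * the guest's linear pagetable map ((va >> 12) slots of 4 bytes each);
 * the PAE and 64-bit cases use >> 9, >> 18 and >> 27 because their
 * entries are 8 bytes at each of the l1, l2 and l3 levels. */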
kaf24@11310 2455
keir@17904 2456 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17904 2457 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
keir@17904 2458 mfn_t smfn, unsigned long off)
keir@17904 2459 {
keir@17904 2460 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
keir@17904 2461
keir@17904 2462 ASSERT(mfn_valid(smfn));
keir@17904 2463 ASSERT(mfn_valid(gmfn));
keir@17904 2464
keir@17904 2465 if ( sp->type == SH_type_l1_32_shadow )
keir@17904 2466 {
keir@17904 2467 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
keir@17904 2468 (v, gmfn, smfn, off);
keir@17904 2469 }
keir@17904 2470 #if CONFIG_PAGING_LEVELS >= 3
keir@17904 2471 else if ( sp->type == SH_type_l1_pae_shadow )
keir@17904 2472 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
keir@17904 2473 (v, gmfn, smfn, off);
keir@17904 2474 #if CONFIG_PAGING_LEVELS >= 4
keir@17904 2475 else if ( sp->type == SH_type_l1_64_shadow )
keir@17904 2476 return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
keir@17904 2477 (v, gmfn, smfn, off);
keir@17904 2478 #endif
keir@17904 2479 #endif
keir@17904 2480
keir@17904 2481 return 0;
keir@17904 2482 }
keir@17904 2483 #endif
kaf24@11310 2484
kaf24@11310 2485 /**************************************************************************/
kaf24@11310 2486 /* Remove all mappings of a guest frame from the shadow tables.
kaf24@11310 2487 * Returns non-zero if we need to flush TLBs. */
kaf24@11310 2488
Tim@13141 2489 int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
kaf24@11310 2490 {
kaf24@11310 2491 struct page_info *page = mfn_to_page(gmfn);
Tim@13141 2492 int expected_count, do_locking;
kaf24@11310 2493
kaf24@11310 2494 /* Dispatch table for getting per-type functions */
ack@14013 2495 static hash_callback_t callbacks[SH_type_unused] = {
kaf24@11310 2496 NULL, /* none */
keir@17620 2497 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */
keir@17620 2498 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */
kaf24@11310 2499 NULL, /* l2_32 */
keir@17620 2500 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */
keir@17620 2501 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */
kaf24@11310 2502 NULL, /* l2_pae */
kaf24@11310 2503 NULL, /* l2h_pae */
kaf24@11310 2504 #if CONFIG_PAGING_LEVELS >= 4
keir@17620 2505 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */
keir@17620 2506 SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */
kaf24@11310 2507 #else
kaf24@11310 2508 NULL, /* l1_64 */
kaf24@11310 2509 NULL, /* fl1_64 */
kaf24@11310 2510 #endif
kaf24@11310 2511 NULL, /* l2_64 */
ack@14013 2512 NULL, /* l2h_64 */
kaf24@11310 2513 NULL, /* l3_64 */
kaf24@11310 2514 NULL, /* l4_64 */
kaf24@11310 2515 NULL, /* p2m */
kaf24@11310 2516 NULL /* unused */
kaf24@11310 2517 };
kaf24@11310 2518
kaf24@11310 2519 static unsigned int callback_mask =
Tim@12561 2520 1 << SH_type_l1_32_shadow
Tim@12561 2521 | 1 << SH_type_fl1_32_shadow
Tim@12561 2522 | 1 << SH_type_l1_pae_shadow
Tim@12561 2523 | 1 << SH_type_fl1_pae_shadow
Tim@12561 2524 | 1 << SH_type_l1_64_shadow
Tim@12561 2525 | 1 << SH_type_fl1_64_shadow
kaf24@11310 2526 ;
kaf24@11310 2527
kfraser@14595 2528 perfc_incr(shadow_mappings);
kaf24@11310 2529 if ( (page->count_info & PGC_count_mask) == 0 )
kaf24@11310 2530 return 0;
kaf24@11310 2531
Tim@13141 2532 /* Although this is an externally visible function, we do not know
Tim@13141 2533 * whether the shadow lock will be held when it is called (since it
Tim@13141 2534 * can be called via put_page_type when we clear a shadow l1e).
Tim@13141 2535 * If the lock isn't held, take it for the duration of the call. */
Tim@13141 2536 do_locking = !shadow_locked_by_me(v->domain);
Tim@13141 2537 if ( do_locking ) shadow_lock(v->domain);
kaf24@11310 2538
kaf24@11310 2539 /* XXX TODO:
kaf24@11310 2540 * Heuristics for finding the (probably) single mapping of this gmfn */
kaf24@11310 2541
kaf24@11310 2542 /* Brute-force search of all the shadows, by walking the hash */
kfraser@14595 2543 perfc_incr(shadow_mappings_bf);
kaf24@11310 2544 hash_foreach(v, callback_mask, callbacks, gmfn);
kaf24@11310 2545
kaf24@11310 2546 /* If that didn't catch the mapping, something is very wrong */
kaf24@11310 2547 expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
kaf24@11310 2548 if ( (page->count_info & PGC_count_mask) != expected_count )
kaf24@11310 2549 {
Tim@13500 2550 /* Don't complain if we're in HVM and there are some extra mappings:
Tim@13500 2551 * The qemu helper process has an untyped mapping of this dom's RAM
Tim@13500 2552 * and the HVM restore program takes another. */
kaf24@11310 2553 if ( !(shadow_mode_external(v->domain)
Tim@13500 2554 && (page->count_info & PGC_count_mask) <= 3
kaf24@11310 2555 && (page->u.inuse.type_info & PGT_count_mask) == 0) )
kaf24@11310 2556 {
kaf24@11310 2557 SHADOW_ERROR("can't find all mappings of mfn %lx: "
kaf24@11310 2558 "c=%08x t=%08lx\n", mfn_x(gmfn),
kaf24@11310 2559 page->count_info, page->u.inuse.type_info);
kaf24@11310 2560 }
kaf24@11310 2561 }
kaf24@11310 2562
Tim@13141 2563 if ( do_locking ) shadow_unlock(v->domain);
Tim@13141 2564
kaf24@11310 2565 /* We killed at least one mapping, so must flush TLBs. */
kaf24@11310 2566 return 1;
kaf24@11310 2567 }
kaf24@11310 2568
kaf24@11310 2569
kaf24@11310 2570 /**************************************************************************/
kaf24@11310 2571 /* Remove all shadows of a guest frame from the shadow tables */
kaf24@11310 2572
kaf24@11310 2573 static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
kaf24@11310 2574 /* Follow this shadow's up-pointer, if it has one, and remove the reference
kaf24@11310 2575 * found there. Returns 1 if that was the only reference to this shadow */
kaf24@11310 2576 {
Tim@12561 2577 struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
kaf24@11310 2578 mfn_t pmfn;
kaf24@11310 2579 void *vaddr;
kaf24@11310 2580 int rc;
kaf24@11310 2581
Tim@12561 2582 ASSERT(sp->type > 0);
Tim@12561 2583 ASSERT(sp->type < SH_type_max_shadow);
Tim@12561 2584 ASSERT(sp->type != SH_type_l2_32_shadow);
Tim@12561 2585 ASSERT(sp->type != SH_type_l2_pae_shadow);
Tim@12561 2586 ASSERT(sp->type != SH_type_l2h_pae_shadow);
Tim@12561 2587 ASSERT(sp->type != SH_type_l4_64_shadow);
kaf24@11310 2588
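    /* sp->up records the parent entry that references this shadow: the
     * parent shadow's MFN in the upper bits and the byte offset of the
     * entry within that page in the low PAGE_SHIFT bits. */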
Tim@12561 2589 if ( sp->up == 0 ) return 0;
Tim@12561 2590 pmfn = _mfn(sp->up >> PAGE_SHIFT);
Tim@12603 2591 ASSERT(mfn_valid(pmfn));
kaf24@11310 2592 vaddr = sh_map_domain_page(pmfn);
kaf24@11310 2593 ASSERT(vaddr);
Tim@12561 2594 vaddr += sp->up & (PAGE_SIZE-1);
kaf24@11310 2595 ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
kaf24@11310 2596
kaf24@11310 2597 /* Is this the only reference to this shadow? */
Tim@12561 2598 rc = (sp->count == 1) ? 1 : 0;
kaf24@11310 2599
kaf24@11310 2600 /* Blank the offending entry */
Tim@12561 2601 switch ( sp->type )
kaf24@11310 2602 {
Tim@12561 2603 case SH_type_l1_32_shadow:
Tim@12561 2604 case SH_type_l2_32_shadow:
keir@17620 2605 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn);
kaf24@11310 2606 break;
Tim@12561 2607 case SH_type_l1_pae_shadow:
Tim@12561 2608 case SH_type_l2_pae_shadow:
Tim@12561 2609 case SH_type_l2h_pae_shadow:
keir@17620 2610 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn);
kaf24@11310 2611 break;
kaf24@11310 2612 #if CONFIG_PAGING_LEVELS >= 4
Tim@12561 2613 case SH_type_l1_64_shadow:
Tim@12561 2614 case SH_type_l2_64_shadow:
ack@14013 2615 case SH_type_l2h_64_shadow:
Tim@12561 2616 case SH_type_l3_64_shadow:
Tim@12561 2617 case SH_type_l4_64_shadow:
keir@17620 2618 SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn);
kaf24@11310 2619 break;
kaf24@11310 2620 #endif
kaf24@11310 2621 default: BUG(); /* Some weird unknown shadow type */
kaf24@11310 2622 }
kaf24@11310 2623
kaf24@11310 2624 sh_unmap_domain_page(vaddr);
kaf24@11310 2625 if ( rc )
kfraser@14595 2626 perfc_incr(shadow_up_pointer);
kaf24@11310 2627 else
kfraser@14595 2628 perfc_incr(shadow_unshadow_bf);
kaf24@11310 2629
kaf24@11310 2630 return rc;
kaf24@11310 2631 }
kaf24@11310 2632
Tim@11866 2633 void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all)
kaf24@11310 2634 /* Remove the shadows of this guest page.
Tim@11866 2635 * If fast != 0, just try the quick heuristic, which will remove
Tim@11866 2636 * at most one reference to each shadow of the page. Otherwise, walk
Tim@11866 2637 * all the shadow tables looking for refs to shadows of this gmfn.
Tim@11866 2638 * If all != 0, kill the domain if we can't find all the shadows.
Tim@11866 2639 * (all != 0 implies fast == 0)
Tim@11866 2640 */
kaf24@11310 2641 {
Tim@13141 2642 struct page_info *pg = mfn_to_page(gmfn);
kaf24@11310 2643 mfn_t smfn;
Tim@13141 2644 int do_locking;
kaf24@11310 2645 unsigned char t;
Tim@11866 2646
kaf24@11310 2647 /* Dispatch table for getting per-type functions: each level must
kaf24@11310 2648 * be called with the function to remove a lower-level shadow. */
ack@14013 2649 static hash_callback_t callbacks[SH_type_unused] = {
kaf24@11310 2650 NULL, /* none */
kaf24@11310 2651 NULL, /* l1_32 */
kaf24@11310 2652 NULL, /* fl1_32 */
keir@17620 2653 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */
kaf24@11310 2654 NULL, /* l1_pae */
kaf24@11310 2655 NULL, /* fl1_pae */
keir@17620 2656 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */
keir@17620 2657 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */
kaf24@11310 2658 NULL, /* l1_64 */
kaf24@11310 2659 NULL, /* fl1_64 */
kaf24@11310 2660 #if CONFIG_PAGING_LEVELS >= 4
keir@17620 2661 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */
keir@17620 2662 SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */
keir@17620 2663 SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */
keir@17620 2664 SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */
kaf24@11310 2665 #else
kaf24@11310 2666 NULL, /* l2_64 */
ack@14013 2667 NULL, /* l2h_64 */
kaf24@11310 2668 NULL, /* l3_64 */
kaf24@11310 2669 NULL, /* l4_64 */
kaf24@11310 2670 #endif
kaf24@11310 2671 NULL, /* p2m */
kaf24@11310 2672 NULL /* unused */
kaf24@11310 2673 };
kaf24@11310 2674
kaf24@11310 2675 /* Another lookup table, for choosing which mask to use */
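    /* masks[t] is the set of higher-level shadow types whose entries may
     * reference a shadow of type t, i.e. the shadows that hash_foreach()
     * must walk to excise a type-t shadow from its parents. */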
ack@14013 2676 static unsigned int masks[SH_type_unused] = {
kaf24@11310 2677 0, /* none */
Tim@12561 2678 1 << SH_type_l2_32_shadow, /* l1_32 */
kaf24@11310 2679 0, /* fl1_32 */
kaf24@11310 2680 0, /* l2_32 */
Tim@12561 2681 ((1 << SH_type_l2h_pae_shadow)
Tim@12561 2682 | (1 << SH_type_l2_pae_shadow)), /* l1_pae */
kaf24@11310 2683 0, /* fl1_pae */
Tim@11867 2684 0, /* l2_pae */
Tim@11867 2685 0, /* l2h_pae */
ack@14013 2686 ((1 << SH_type_l2h_64_shadow)
ack@14013 2687 | (1 << SH_type_l2_64_shadow)), /* l1_64 */
kaf24@11310 2688 0, /* fl1_64 */
Tim@12561 2689 1 << SH_type_l3_64_shadow, /* l2_64 */
ack@14013 2690 1 << SH_type_l3_64_shadow, /* l2h_64 */
Tim@12561 2691 1 << SH_type_l4_64_shadow, /* l3_64 */
kaf24@11310 2692 0, /* l4_64 */
kaf24@11310 2693 0, /* p2m */
kaf24@11310 2694 0 /* unused */
kaf24@11310 2695 };
kaf24@11310 2696
Tim@11866 2697 ASSERT(!(all && fast));
kaf24@11310 2698
Tim@13141 2699 /* Although this is an externally visible function, we do not know
Tim@13141 2700 * whether the shadow lock will be held when it is called (since it
Tim@13141 2701 * can be called via put_page_type when we clear a shadow l1e).
Tim@13141 2702 * If the lock isn't held, take it for the duration of the call. */
Tim@13141 2703 do_locking = !shadow_locked_by_me(v->domain);
Tim@13141 2704 if ( do_locking ) shadow_lock(v->domain);
Tim@13141 2705
kaf24@11310 2706 SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
kaf24@11310 2707 v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
kaf24@11310 2708
Tim@14020 2709 /* Bail out now if the page is not shadowed */
Tim@14020 2710 if ( (pg->count_info & PGC_page_table) == 0 )
Tim@14020 2711 {
Tim@14020 2712 if ( do_locking ) shadow_unlock(v->domain);
Tim@14020 2713 return;
Tim@14020 2714 }
Tim@14020 2715
kaf24@11310 2716 /* Search for this shadow in all appropriate shadows */
kfraser@14595 2717 perfc_incr(shadow_unshadow);
kaf24@11310 2718
kaf24@11310 2719 /* Lower-level shadows need to be excised from upper-level shadows.
kaf24@11310 2720 * This call to hash_foreach() looks dangerous but is in fact OK: each
kaf24@11310 2721 * call will remove at most one shadow, and terminate immediately when
kaf24@11310 2722 * it does remove it, so we never walk the hash after doing a deletion. */
gdunlap@15932 2723 #define DO_UNSHADOW(_type) do { \
gdunlap@15932 2724 t = (_type); \
gdunlap@15932 2725 if( !(pg->count_info & PGC_page_table) \
gdunlap@15932 2726 || !(pg->shadow_flags & (1 << t)) ) \
gdunlap@15932 2727 break; \
gdunlap@15932 2728 smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \
gdunlap@15932 2729 if ( unlikely(!mfn_valid(smfn)) ) \
gdunlap@15932 2730 { \
gdunlap@15932 2731 SHADOW_ERROR(": gmfn %#lx has flags 0x%"PRIx32 \
gdunlap@15932 2732 " but no type-0x%"PRIx32" shadow\n", \
gdunlap@15932 2733 mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
gdunlap@15932 2734 break; \
gdunlap@15932 2735 } \
gdunlap@15932 2736 if ( sh_type_is_pinnable(v, t) ) \
gdunlap@15932 2737 sh_unpin(v, smfn); \
gdunlap@15932 2738 else \
gdunlap@15932 2739 sh_remove_shadow_via_pointer(v, smfn); \
gdunlap@15932 2740 if( !fast \
gdunlap@15932 2741 && (pg->count_info & PGC_page_table) \
gdunlap@15932 2742 && (pg->shadow_flags & (1 << t)) ) \
gdunlap@15932 2743 hash_foreach(v, masks[t], callbacks, smfn); \
kaf24@11310 2744 } while (0)
kaf24@11310 2745
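    /* Handle each shadow type this page may have, top level first: pinnable
     * shadows are unpinned, others are removed via their up-pointers, and
     * (unless 'fast' was requested) any remaining references are excised
     * from parent shadows by the hash walk. */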
gdunlap@15932 2746 DO_UNSHADOW(SH_type_l2_32_shadow);
gdunlap@15932 2747 DO_UNSHADOW(SH_type_l1_32_shadow);
gdunlap@15932 2748 DO_UNSHADOW(SH_type_l2h_pae_shadow);
gdunlap@15932 2749 DO_UNSHADOW(SH_type_l2_pae_shadow);
gdunlap@15932 2750 DO_UNSHADOW(SH_type_l1_pae_shadow);
kaf24@11310 2751 #if CONFIG_PAGING_LEVELS >= 4
gdunlap@15932 2752 DO_UNSHADOW(SH_type_l4_64_shadow);
gdunlap@15932 2753 DO_UNSHADOW(SH_type_l3_64_shadow);
gdunlap@15932 2754 DO_UNSHADOW(SH_type_l2h_64_shadow);
gdunlap@15932 2755 DO_UNSHADOW(SH_type_l2_64_shadow);
gdunlap@15932 2756 DO_UNSHADOW(SH_type_l1_64_shadow);
kaf24@11310 2757 #endif
kaf24@11310 2758
kaf24@11310 2759 #undef DO_UNSHADOW
kaf24@11310 2760
kaf24@11310 2761 /* If that didn't catch the shadows, something is wrong */
keir@17423 2762 if ( !fast && all && (pg->count_info & PGC_page_table) )
kaf24@11310 2763 {
Tim@11866 2764 SHADOW_ERROR("can't find all shadows of mfn %05lx "
keir@17851 2765 "(shadow_flags=%08x)\n",
kaf24@11310 2766 mfn_x(gmfn), pg->shadow_flags);
keir@17423 2767 domain_crash(v->domain);
kaf24@11310 2768 }
Tim@11925 2769
Tim@11925 2770 /* Need to flush TLBs now, so that linear maps are safe next time we
Tim@11925 2771 * take a fault. */
Tim@11925 2772 flush_tlb_mask(v->domain->domain_dirty_cpumask);
Tim@13141 2773
Tim@13141 2774 if ( do_locking ) shadow_unlock(v->domain);
kaf24@11310 2775 }
kaf24@11310 2776
Tim@13141 2777 static void
Tim@13141 2778 sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
kaf24@11310 2779 /* Even harsher: this is an HVM page that we think is no longer a pagetable.
kaf24@11310 2780 * Unshadow it, and recursively unshadow pages that reference it. */
kaf24@11310 2781 {
Tim@13141 2782 sh_remove_shadows(v, gmfn, 0, 1);
kaf24@11310 2783 /* XXX TODO:
kaf24@11310 2784 * Rework this hashtable walker to return a linked-list of all
kaf24@11310 2785 * the shadows it modified, then do breadth-first recursion
kaf24@11310 2786 * to find the way up to higher-level tables and unshadow them too.
kaf24@11310 2787 *
kaf24@11310 2788 * The current code (just tearing down each page's shadows as we
kaf24@11310 2789 * detect that it is not a pagetable) is correct, but very slow.
kaf24@11310 2790 * It means extra emulated writes and slows down removal of mappings. */
kaf24@11310 2791 }
kaf24@11310 2792
kaf24@11310 2793 /**************************************************************************/
kaf24@11310 2794
Tim@13141 2795 static void sh_update_paging_modes(struct vcpu *v)
kaf24@11310 2796 {
kaf24@11310 2797 struct domain *d = v->domain;
Tim@13909 2798 struct paging_mode *old_mode = v->arch.paging.mode;
kaf24@11310 2799
Tim@13137 2800 ASSERT(shadow_locked_by_me(d));
kaf24@11310 2801
Tim@15255 2802 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
Tim@15255 2803 /* Make sure this vcpu has a virtual TLB array allocated */
Tim@15255 2804 if ( unlikely(!v->arch.paging.vtlb) )
Tim@15255 2805 {
Tim@15255 2806 v->arch.paging.vtlb = xmalloc_array(struct shadow_vtlb, VTLB_ENTRIES);
Tim@15255 2807 if ( unlikely(!v->arch.paging.vtlb) )
Tim@15255 2808 {
Tim@15255 2809 SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n",
Tim@15255 2810 d->domain_id, v->vcpu_id);
Tim@15255 2811 domain_crash(v->domain);
Tim@15255 2812 return;
Tim@15255 2813 }
Tim@15255 2814 memset(v->arch.paging.vtlb, 0,
Tim@15255 2815 VTLB_ENTRIES * sizeof (struct shadow_vtlb));
Tim@15255 2816 spin_lock_init(&v->arch.paging.vtlb_lock);
Tim@15255 2817 }
Tim@15255 2818 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
Tim@15255 2819
keir@17904 2820 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
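    /* Allocate this vcpu's out-of-sync snapshot pages on first use. */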
keir@17905 2821 if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
keir@17905 2822 {
keir@17905 2823 int i;
keir@17905 2824 for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
keir@17905 2825 {
keir@17905 2826 shadow_prealloc(d, SH_type_oos_snapshot, 1);
keir@17905 2827 v->arch.paging.shadow.oos_snapshot[i] =
keir@17905 2828 shadow_alloc(d, SH_type_oos_snapshot, 0);
keir@17905 2829 }
keir@17905 2830 }
keir@17904 2831 #endif /* OOS */
keir@17904 2832
kaf24@11310 2833 // Valid transitions handled by this function:
kaf24@11310 2834 // - For PV guests:
kaf24@11310 2835 // - after a shadow mode has been changed
kaf24@11310 2836 // - For HVM guests:
kaf24@11310 2837 // - after a shadow mode has been changed
kaf24@11310 2838 // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
kaf24@11310 2839 //
kaf24@11310 2840
kaf24@11310 2841 // First, tear down any old shadow tables held by this vcpu.
kaf24@11310 2842 //
Tim@13909 2843 if ( v->arch.paging.mode )
Tim@13909 2844 v->arch.paging.mode->shadow.detach_old_tables(v);
kaf24@11310 2845
kfraser@12210 2846 if ( !is_hvm_domain(d) )
kaf24@11310 2847 {
kaf24@11310 2848 ///
kaf24@11310 2849 /// PV guest
kaf24@11310 2850 ///
kaf24@11310 2851 #if CONFIG_PAGING_LEVELS == 4
keir@17620 2852 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
keir@17620 2853 #else /* CONFIG_PAGING_LEVELS == 3 */
keir@17620 2854 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
kaf24@11310 2855 #endif
kaf24@11310 2856 }
kaf24@11310 2857 else
kaf24@11310 2858 {
kaf24@11310 2859 ///
kaf24@11310 2860 /// HVM guest
kaf24@11310 2861 ///
kaf24@11310 2862 ASSERT(shadow_mode_translate(d));
kaf24@11310 2863 ASSERT(shadow_mode_external(d));
kaf24@11310 2864
keir@17903 2865 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 2866 /* Need to resync all our pages now, because if a page goes out
keir@17903 2867 * of sync with paging enabled and is resynced with paging
keir@17903 2868 * disabled, the resync will go wrong. */
keir@17903 2869 shadow_resync_all(v, 0);
keir@17903 2870 #endif /* OOS */
keir@17903 2871
Tim@15812 2872 if ( !hvm_paging_enabled(v) )
kaf24@11310 2873 {
Tim@15812 2874 /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
Tim@15812 2875 * pagetable for it, mapping 4 GB one-to-one using a single l2
Tim@15812 2876 * page of 1024 superpage mappings */
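            /* (That pagetable is built by shadow_enable() and recorded in
             * d->arch.paging.shadow.unpaged_pagetable.) */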
Tim@15812 2877 v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable;
keir@17620 2878 v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
kaf24@11310 2879 }
kaf24@11310 2880 else
kaf24@11310 2881 {
kaf24@11310 2882 #ifdef __x86_64__
kaf24@11310 2883 if ( hvm_long_mode_enabled(v) )
kaf24@11310 2884 {
kaf24@11310 2885 // long mode guest...
Tim@13909 2886 v->arch.paging.mode =
keir@17620 2887 &SHADOW_INTERNAL_NAME(sh_paging_mode, 4);
kaf24@11310 2888 }
kaf24@11310 2889 else
kaf24@11310 2890 #endif
steven@11578 2891 if ( hvm_pae_enabled(v) )
kaf24@11310 2892 {
kaf24@11310 2893 // 32-bit PAE mode guest...
Tim@13909 2894 v->arch.paging.mode =
keir@17620 2895 &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
kaf24@11310 2896 }
kaf24@11310 2897 else
kaf24@11310 2898 {
kaf24@11310 2899 // 32-bit 2 level guest...
Tim@13909 2900 v->arch.paging.mode =
keir@17620 2901 &SHADOW_INTERNAL_NAME(sh_paging_mode, 2);
kaf24@11310 2902 }
kaf24@11310 2903 }
kaf24@11310 2904
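        /* If this vcpu has no monitor table yet, build one and point the
         * host CR3 at it. */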
Tim@12813 2905 if ( pagetable_is_null(v->arch.monitor_table) )
kaf24@11310 2906 {
Tim@13909 2907 mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v);
kaf24@11310 2908 v->arch.monitor_table = pagetable_from_mfn(mmfn);
steven@13059 2909 make_cr3(v, mfn_x(mmfn));
steven@13059 2910 hvm_update_host_cr3(v);
steven@13059 2911 }
kaf24@11310 2912
Tim@13909 2913 if ( v->arch.paging.mode != old_mode )
kaf24@11310 2914 {
keir@17620 2915 SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u "
tim@11666 2916 "(was g=%u s=%u)\n",
tim@11666 2917 d->domain_id, v->vcpu_id,
kfraser@15727 2918 is_hvm_domain(d) ? hvm_paging_enabled(v) : 1,
Tim@13909 2919 v->arch.paging.mode->guest_levels,
Tim@13909 2920 v->arch.paging.mode->shadow.shadow_levels,
tim@11666 2921 old_mode ? old_mode->guest_levels : 0,
Tim@13909 2922 old_mode ? old_mode->shadow.shadow_levels : 0);
kaf24@11310 2923 if ( old_mode &&
Tim@13909 2924 (v->arch.paging.mode->shadow.shadow_levels !=
Tim@13909 2925 old_mode->shadow.shadow_levels) )
kaf24@11310 2926 {
kaf24@11310 2927 /* Need to make a new monitor table for the new mode */
kaf24@11310 2928 mfn_t new_mfn, old_mfn;
kaf24@11310 2929
Tim@13547 2930 if ( v != current && vcpu_runnable(v) )
kaf24@11310 2931 {
kaf24@11310 2932 SHADOW_ERROR("Some third party (d=%u v=%u) is changing "
Tim@13547 2933 "this HVM vcpu's (d=%u v=%u) paging mode "
Tim@13547 2934 "while it is running.\n",
Tim@13547 2935 current->domain->domain_id, current->vcpu_id,
Tim@13547 2936 v->domain->domain_id, v->vcpu_id);
Tim@13547 2937 /* It's not safe to do that because we can't change
keir@17589 2938 * the host CR3 for a running domain */
kaf24@11310 2939 domain_crash(v->domain);
kaf24@11310 2940 return;
kaf24@11310 2941 }
kaf24@11310 2942
kaf24@11310 2943 old_mfn = pagetable_get_mfn(v->arch.monitor_table);
kaf24@11310 2944 v->arch.monitor_table = pagetable_null();
Tim@13909 2945 new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v);
kaf24@11310 2946 v->arch.monitor_table = pagetable_from_mfn(new_mfn);
Tim@13909 2947 SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
kaf24@11310 2948 mfn_x(new_mfn));
kaf24@11310 2949
kaf24@11310 2950 /* Don't be running on the old monitor table when we
kaf24@11310 2951 * pull it down! Switch CR3, and warn the HVM code that
kaf24@11310 2952 * its host cr3 has changed. */
kaf24@11310 2953 make_cr3(v, mfn_x(new_mfn));
Tim@13547 2954 if ( v == current )
Tim@13547 2955 write_ptbase(v);
kaf24@11310 2956 hvm_update_host_cr3(v);
Tim@13909 2957 old_mode->shadow.destroy_monitor_table(v, old_mfn);
kaf24@11310 2958 }
kaf24@11310 2959 }
kaf24@11310 2960
kaf24@11310 2961 // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
kaf24@11310 2962 // These are HARD: think about the case where two CPUs have
kaf24@11310 2963 // different values for CR4.PSE and CR4.PGE at the same time.
kaf24@11310 2964 // This *does* happen, at least for CR4.PGE...
kaf24@11310 2965 }
kaf24@11310 2966
keir@17903 2967 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
keir@17903 2968 /* We need to check that all the vcpus have paging enabled to
keir@17903 2969 * unsync PTs. */
keir@17903 2970 if ( is_hvm_domain(d) )
keir@17903 2971 {
keir@17903 2972 int pe = 1;
keir@17903 2973 struct vcpu *vptr;
keir@17903 2974
keir@17903 2975 for_each_vcpu(d, vptr)
keir@17903 2976 {
keir@17903 2977 if ( !hvm_paging_enabled(vptr) )
keir@17903 2978 {
keir@17903 2979 pe = 0;
keir@17903 2980 break;
keir@17903 2981 }
keir@17903 2982 }
keir@17903 2983
keir@17903 2984 d->arch.paging.shadow.oos_active = pe;
keir@17903 2985 }
keir@17903 2986 #endif /* OOS */
keir@17903 2987
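    /* Finally, bring this vcpu's CR3 up to date under the new paging mode. */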
Tim@13909 2988 v->arch.paging.mode->update_cr3(v, 0);
Tim@13141 2989 }
Tim@13141 2990
Tim@13141 2991 void shadow_update_paging_modes(struct vcpu *v)
Tim@13141 2992 {
Tim@13141 2993 shadow_lock(v->domain);
Tim@13141 2994 sh_update_paging_modes(v);
Tim@13141 2995 shadow_unlock(v->domain);
kaf24@11310 2996 }
kaf24@11310 2997
kaf24@11310 2998 /**************************************************************************/
kaf24@11310 2999 /* Turning on and off shadow features */
kaf24@11310 3000
kaf24@11310 3001 static void sh_new_mode(struct domain *d, u32 new_mode)
kaf24@11310 3002 /* Inform all the vcpus that the shadow mode has been changed */
kaf24@11310 3003 {
kaf24@11310 3004 struct vcpu *v;
kaf24@11310 3005
Tim@13137 3006 ASSERT(shadow_locked_by_me(d));
kaf24@11310 3007 ASSERT(d != current->domain);
Tim@13909 3008 d->arch.paging.mode = new_mode;
kaf24@11310 3009 for_each_vcpu(d, v)
kaf24@11310 3010 sh_update_paging_modes(v);
kaf24@11310 3011 }
kaf24@11310 3012
kfraser@12213 3013 int shadow_enable(struct domain *d, u32 mode)
kaf24@11310 3014 /* Turn on "permanent" shadow features: external, translate, refcount.
kaf24@11310 3015 * Can only be called once on a domain, and these features cannot be
kaf24@11310 3016 * disabled.
kaf24@11310 3017 * Returns 0 for success, -errno for failure. */
kaf24@11310 3018 {
kaf24@11310 3019 unsigned int old_pages;
Tim@15812 3020 struct page_info *pg = NULL;
Tim@15812 3021 uint32_t *e;
Tim@15812 3022 int i, rv = 0;
kaf24@11310 3023
Tim@13909 3024 mode |= PG_SH_enable;
kaf24@11310 3025
kaf24@11310 3026 domain_pause(d);
kaf24@11310 3027
kaf24@11310 3028 /* Sanity check the arguments */
kaf24@11310 3029 if ( (d == current->domain) ||
kaf24@11310 3030 shadow_mode_enabled(d) ||
Tim@13909 3031 ((mode & PG_translate) && !(mode & PG_refcounts)) ||
Tim@13909 3032 ((mode & PG_external) && !(mode & PG_translate)) )
kaf24@11310 3033 {
kaf24@11310 3034 rv = -EINVAL;
Tim@13909 3035 goto out_unlocked;
kaf24@11310 3036 }
kaf24@11310 3037
kaf24@11310 3038 /* Init the shadow memory allocation if the user hasn't done so */
Tim@13909 3039 old_pages = d->arch.paging.shadow.total_pages;
kaf24@11310 3040 if ( old_pages == 0 )
Tim@13909 3041 {
Tim@13909 3042 unsigned int r;
Tim@13909 3043 shadow_lock(d);
Tim@13909 3044 r = sh_set_allocation(d, 256, NULL); /* Use at least 1MB */
Tim@13909 3045 if ( r != 0 )
kaf24@11310 3046 {
Tim@13141 3047 sh_set_allocation(d, 0, NULL);
kaf24@11310 3048 rv = -ENOMEM;
Tim@16076 3049 goto out_locked;
Tim@13909 3050 }
Tim@16076 3051 shadow_unlock(d);
Tim@13909 3052 }
Tim@13909 3053
Tim@13909 3054 /* Init the P2M table. Must be done before we take the shadow lock
Tim@13909 3055 * to avoid possible deadlock. */
Tim@13909 3056 if ( mode & PG_translate )
Tim@13909 3057 {
Tim@13909 3058 rv = p2m_alloc_table(d, shadow_alloc_p2m_page, shadow_free_p2m_page);
Tim@13909 3059 if (rv != 0)
Tim@13909 3060 goto out_unlocked;
Tim@13909 3061 }
Tim@13909 3062
Tim@15812 3063 /* HVM domains need an extra pagetable for vcpus that think they
Tim@15812 3064 * have paging disabled */
Tim@15812 3065 if ( is_hvm_domain(d) )
Tim@15812 3066 {
Tim@15812 3067 /* Get a single page from the shadow pool. Take it via the
Tim@15812 3068 * P2M interface to make freeing it simpler afterwards. */
Tim@15812 3069 pg = shadow_alloc_p2m_page(d);
Tim@15812 3070 if ( pg == NULL )
Tim@15812 3071 {
Tim@15812 3072 rv = -ENOMEM;
Tim@15812 3073 goto out_unlocked;
Tim@15812 3074 }
Tim@15812 3075 /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB
Tim@15812 3076 * of virtual address space onto the same physical address range */
Tim@15812 3077 e = sh_map_domain_page(page_to_mfn(pg));
Tim@15812 3078 for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ )
Tim@15812 3079 e[i] = ((0x400000U * i)
Tim@15812 3080 | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER
Tim@15812 3081 | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
Tim@15812 3082 sh_unmap_domain_page(e);
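        /* Mark it as a validated l2 pagetable with a single type ref. */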
Tim@15812 3083 pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
Tim@15812 3084 }
Tim@15310 3085
Tim@13909 3086 shadow_lock(d);
Tim@13909 3087
Tim@13909 3088 /* Sanity check again with the lock held */
Tim@13909 3089 if ( shadow_mode_enabled(d) )
Tim@13909 3090 {
Tim@13909 3091 rv = -EINVAL;
Tim@13909 3092 goto out_locked;
Tim@13909 3093 }
kaf24@11310 3094
kaf24@11310 3095 /* Init the hash table */
kaf24@11310 3096 if ( shadow_hash_alloc(d) != 0 )
kaf24@11310 3097 {
kaf24@11310 3098 rv = -ENOMEM;
Tim@13909 3099 goto out_locked;
kaf24@11310 3100 }
kaf24@11310 3101
Tim@12564 3102 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL)
Tim@12564 3103 /* We assume we're dealing with an older 64-bit Linux guest until we
Tim@12564 3104 * see the guest use more than one l4 per vcpu. */
Tim@13909 3105 d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL;
Tim@12564 3106 #endif
Tim@12564 3107
Tim@15812 3108 /* Record the 1-to-1 pagetable we just made */
Tim@15812 3109 if ( is_hvm_domain(d) )
Tim@15812 3110 d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg);
Tim@15812 3111
kaf24@11310 3112 /* Update the bits */
kaf24@11310 3113 sh_new_mode(d, mode);
Tim@13909 3114
Tim@13909 3115 out_locked:
kaf24@11310 3116 shadow_unlock(d);
Tim@13909 3117 out_unlocked:
Tim@13909 3118 if ( rv != 0 && !pagetable_is_null(d->arch.phys_table) )
Tim@13909 3119 p2m_teardown(d);
Tim@15812 3120 if ( rv != 0 && pg != NULL )
Tim@15812 3121 shadow_free_p2m_page(d, pg);
kaf24@11310 3122 domain_unpause(d);
tim@11666 3123 return rv;
kaf24@11310 3124 }
kaf24@11310 3125
kaf24@11310 3126 void shadow_teardown(struct domain *d)
kaf24@11310 3127 /* Destroy the shadow pagetables of this domain and free its shadow memory.
kaf24@11310 3128 * Should only be called for dying domains. */
kaf24@11310 3129 {
kaf24@11310 3130 struct vcpu *v;
kaf24@11310 3131 mfn_t mfn;
Tim@13909 3132 struct list_head *entry, *n;
Tim@13909 3133 struct page_info *pg;
kaf24@11310 3134