From: --list Date: Thu, 11 Sep 2008 11:36:53 +0000 (+0100) Subject: Initial commit. X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=34457e4497c6ca61dbe9607ab779a22012530add;p=xenalyze.git Initial commit. --- 34457e4497c6ca61dbe9607ab779a22012530add diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4be0477 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +CC = gcc + +CFLAGS += -g +CFLAGS += -fno-strict-aliasing +CFLAGS += -std=gnu99 +CFLAGS += -Wall -Wstrict-prototypes +CFLAGS += -Wno-unused-value +CFLAGS += -Wdeclaration-after-statement + +CFLAGS += -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE +CFLAGS += -mno-tls-direct-seg-refs +CFLAGS += -Werror + +BIN = analyze dump-raw + +HDRS = trace.h analyze.h + +all: $(BIN) + +.PHONY: clean +clean: + $(RM) *.a *.so *.o *.rpm $(BIN) $(LIBBIN) + +%: %.c $(HDRS) Makefile + $(CC) $(CFLAGS) -o $@ $< diff --git a/analyze.c b/analyze.c new file mode 100644 index 0000000..0ccd852 --- /dev/null +++ b/analyze.c @@ -0,0 +1,7935 @@ +#define _XOPEN_SOURCE 600 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" +#include "analyze.h" +#include +#include +#include +#include + +typedef unsigned long long tsc_t; + +#define DEFAULT_CPU_HZ 2400000000LL +#define ADDR_SPACE_BITS 48 +#define DEFAULT_SAMPLE_SIZE 10240 +#define DEFAULT_INTERVAL_LENGTH 1000 + +/* -- Global variables -- */ +struct { + int fd; + struct symbol_struct * symbols; + char * symbol_file; + char * trace_file; + int output_defined; + loff_t file_size; + struct { + loff_t update_offset; + int pipe[2]; + FILE* out; + int pid; + } progress; +} G = { + .fd=-1, + .symbols = NULL, + .symbol_file = NULL, + .trace_file = NULL, + .output_defined = 0, + .file_size = 0, + .progress = { .update_offset = 0 }, +}; + +struct { + unsigned + scatterplot_interrupt_eip:1, + scatterplot_cpi:1, + scatterplot_unpin_promote:1, + scatterplot_cr3_switch:1, + scatterplot_wake_to_halt:1, + scatterplot_io:1, + 
histogram_interrupt_eip:1, + interval_mode:1, + dump_cooked:1, + dump_all:1, + dump_raw_process:1, + dump_raw_reads:1, + dump_no_processing:1, + dump_ipi_latency:1, + dump_trace_volume_on_lost_record:1, + with_cr3_enumeration:1, + with_pio_enumeration:1, + show_default_domain_summary:1, + progress:1, + svm_mode:1, + summary:1, + summary_info; + long long cpu_hz; + int scatterplot_interrupt_vector; + int scatterplot_io_port; + int histogram_interrupt_vector; + unsigned long long histogram_interrupt_increment; + int default_guest_paging_levels; + int sample_size; + struct { + tsc_t cycles; + /* Used if interval is specified in seconds to delay calculating + * time_interval until all arguments have been processed (specifically, + * cpu_hz). */ + unsigned msec; + enum { + INTERVAL_CR3_SCHEDULE_TIME, + INTERVAL_CR3_SCHEDULE_ORDERED, + INTERVAL_CR3_SHORT_SUMMARY, + INTERVAL_DOMAIN_SHORT_SUMMARY, + INTERVAL_DOMAIN_GUEST_INTERRUPT, + INTERVAL_DOMAIN_GRANT_MAPS + } output; + /* Options for specific interval output types */ + union { + struct { + unsigned long long *values; + int count; + } cr3; + struct { + int did; + } domain; + }; + } interval; +} opt = { + .scatterplot_interrupt_eip=0, + .scatterplot_cpi=0, + .scatterplot_unpin_promote=0, + .scatterplot_cr3_switch=0, + .scatterplot_wake_to_halt=0, + .histogram_interrupt_eip=0, + .dump_cooked = 0, + .dump_all = 0, + .dump_raw_process = 0, + .dump_raw_reads = 0, + .dump_no_processing = 0, + .dump_ipi_latency = 0, + .dump_trace_volume_on_lost_record = 0, + .with_cr3_enumeration = 0, + .with_pio_enumeration = 1, + .show_default_domain_summary = 0, + .progress = 0, + .svm_mode = 0, + .summary = 0, + .cpu_hz = DEFAULT_CPU_HZ, + .default_guest_paging_levels = 2, + .sample_size = DEFAULT_SAMPLE_SIZE, + .interval = { .msec = DEFAULT_INTERVAL_LENGTH }, +}; + +/* -- on-disk trace buffer definitions -- */ +struct trace_record { + unsigned event:28, + extra_words:3, + cycle_flag:1; + union { + struct { + uint32_t tsc_lo, tsc_hi; + 
uint32_t data[7]; + } tsc; + struct { + uint32_t data[7]; + } notsc; + } u; +}; + +FILE *warn = NULL; + +/* -- General info about a current record -- */ +struct time_struct { + unsigned long long time; + unsigned long s, ns; +}; + +#define DUMP_HEADER_MAX 256 + +struct record_info { + int cpu; + tsc_t tsc; + union { + unsigned event; + struct { + unsigned minor:12, + sub:4, + main:12, + unused:4; + } evt; + }; + int extra_words; + int size; + uint32_t *d; + char dump_header[DUMP_HEADER_MAX]; + struct time_struct t; + struct trace_record rec; +}; + +/* -- Summary data -- */ +struct cycle_framework { + tsc_t first_tsc, last_tsc, total_cycles; +}; + +struct event_cycle_summary { + int count, cycles_count, interval_count; + long long cycles, interval_cycles; + long long *cycles_sample; +}; + +struct cycle_summary { + int count, interval_count; + unsigned long long cycles, interval_cycles; + long long *sample; +}; + +struct weighted_cpi_summary { + int count, interval_count; + unsigned long long instructions, interval_instructions; + unsigned long long cycles, interval_cycles; + float *cpi; + unsigned long long *cpi_weight; +}; + +/* -- Symbol list information -- */ +#define SYMBOL_ENTRIES_PER_STRUCT 1023 +#define SYMBOL_NAME_SIZE 124 +struct symbol_struct { + int count; + struct { + unsigned long long addr; + char name[SYMBOL_NAME_SIZE]; + } symbols[SYMBOL_ENTRIES_PER_STRUCT]; + struct symbol_struct *next; +}; + + +void parse_symbol_file(char *fn) { + unsigned long long last_addr = 0; + FILE * symbol_file; + struct symbol_struct ** p=&G.symbols; + + if((symbol_file=fopen(fn, "rb"))==NULL) { + fprintf(stderr, "Could not open symbol file %s\n", fn); + perror("open"); + exit(1); + } + while(!feof(symbol_file)) { + /* Allocate a new struct if we need it */ + if(!*p) { + *p = malloc(sizeof(**p)); + if(!*p) { + fprintf(stderr, "Malloc failed!\n"); + exit(1); + } + (*p)->count=0; + (*p)->next=NULL; + } + + /* FIXME -- use SYMBOL_NAME_SIZE */ + fscanf(symbol_file, "%llx 
%128s", + &(*p)->symbols[(*p)->count].addr, + (*p)->symbols[(*p)->count].name); + + if( ((*p)->symbols[(*p)->count].addr > 0) + && ((*p)->symbols[(*p)->count].addr < last_addr) ) { + fprintf(stderr, "Symbol file not properly ordered: %llx %s < %llx!\n", + (*p)->symbols[(*p)->count].addr, + (*p)->symbols[(*p)->count].name, + last_addr); + exit(1); + } else + last_addr = (*p)->symbols[(*p)->count].addr; + + + + (*p)->count++; + + /* If this struct is full, point to the next. It will be allocated + if needed. */ + if((*p)->count == SYMBOL_ENTRIES_PER_STRUCT) { + p=&((*p)->next); + } + } +} + +/* WARNING not thread safe */ +char * find_symbol(unsigned long long addr) { + struct symbol_struct * p=G.symbols; + int i; + char * lastname="ZERO"; + unsigned long long offset=addr; + static char name[128]; + + if(!p) { + name[0]=0; + return name; + } + + while(1) { + if(!p) + goto finish; + for(i=0; icount; i++) { + if(p->symbols[i].addr > addr) + goto finish; + else { + lastname=p->symbols[i].name; + offset=addr - p->symbols[i].addr; + } + } + p=p->next; + } + finish: + snprintf(name, 128, "(%s +%llx)", + lastname, offset); + return name; +} + +/* -- Eip list data -- */ +enum { + EIP_LIST_TYPE_NONE=0, + EIP_LIST_TYPE_MAX +}; + +struct eip_list_struct { + struct eip_list_struct *next; + unsigned long long eip; + struct event_cycle_summary summary; + int type; + void * extra; +}; + +struct { + void (*update)(struct eip_list_struct *, void *); + void (*new)(struct eip_list_struct *, void *); + void (*dump)(struct eip_list_struct *); +} eip_list_type[EIP_LIST_TYPE_MAX] = { + [EIP_LIST_TYPE_NONE] = { + .update=NULL, + .new=NULL, + .dump=NULL }, +}; + + +/* --- HVM class of events --- */ + +/* + * -- Algorithms -- + * + * Interrupt Wake-to-halt detection + * + * Purpose: To correlate device interrupts to vcpu runtime. + * + * Diagram: + * ... + * blocked -> runnable <- set to waking + * ... 
+ * runnable -> running + * inj_virq A <- Note "waking" interrupt + * vmenter <- Start tsc of "wake-to-halt" interval. + Turn off 'waking'. + * ... + * inj_virq B <- Note alternate interrupt + * vmenter <- Start tsc of "interrupt-to-halt" interval + * ... + * vmexit <- End tsc of "x-to-halt" interval + * running -> blocked <- Process + * + * The "waking" interrupts we want to sub-classify into + * "wake-only" (when interrupt was the only interrupt from wake to halt) and + * "wake-all" (whether this was the only interrupt or not). + */ + +/* VMX data */ +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 + +#define EXIT_REASON_PENDING_INTERRUPT 7 + +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MACHINE_CHECK 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 + +#define HVM_VMX_EXIT_REASON_MAX (EXIT_REASON_TPR_BELOW_THRESHOLD+1) + +char * hvm_vmx_exit_reason_name[HVM_VMX_EXIT_REASON_MAX] = { + [0] = "NONE", + [EXIT_REASON_EXCEPTION_NMI]="EXCEPTION_NMI", + [EXIT_REASON_EXTERNAL_INTERRUPT]="EXTERNAL_INTERRUPT", + [EXIT_REASON_PENDING_INTERRUPT]="PENDING_INTERRUPT", + [EXIT_REASON_TASK_SWITCH]="TASK_SWITCH", + [EXIT_REASON_CPUID]="CPUID", + [EXIT_REASON_HLT]="HLT", + 
[EXIT_REASON_INVLPG]="INVLPG", + [EXIT_REASON_RDPMC]="RDPMC", + [EXIT_REASON_RDTSC]="RDTSC", + [EXIT_REASON_VMCALL]="VMCALL", + [EXIT_REASON_VMCLEAR]="VMCLEAR", + [EXIT_REASON_VMLAUNCH]="VMLAUNCH", + [EXIT_REASON_VMPTRLD]="VMPTRLD", + [EXIT_REASON_VMPTRST]="VMPTRST", + [EXIT_REASON_VMREAD]="VMREAD", + [EXIT_REASON_VMRESUME]="VMRESUME", + [EXIT_REASON_VMWRITE]="VMWRITE", + [EXIT_REASON_VMOFF]="VMOFF", + [EXIT_REASON_VMON]="VMON", + [EXIT_REASON_CR_ACCESS]="CR_ACCESS", + [EXIT_REASON_DR_ACCESS]="DR_ACCESS", + [EXIT_REASON_IO_INSTRUCTION]="IO_INSTRUCTION", + [EXIT_REASON_MSR_READ]="MSR_READ", + [EXIT_REASON_MSR_WRITE]="MSR_WRITE", + [EXIT_REASON_MWAIT_INSTRUCTION]="MWAIT_INSTRUCTION", + [EXIT_REASON_MONITOR_INSTRUCTION]="MONITOR_INSTRUCTION", + [EXIT_REASON_PAUSE_INSTRUCTION]="PAUSE_INSTRUCTION", + [EXIT_REASON_MACHINE_CHECK]="MACHINE_CHECK", + [EXIT_REASON_TPR_BELOW_THRESHOLD]="TPR_BELOW_THRESHOLD", +}; + +/* SVM data */ +enum VMEXIT_EXITCODE +{ + /* control register read exitcodes */ + VMEXIT_CR0_READ = 0, + VMEXIT_CR1_READ = 1, + VMEXIT_CR2_READ = 2, + VMEXIT_CR3_READ = 3, + VMEXIT_CR4_READ = 4, + VMEXIT_CR5_READ = 5, + VMEXIT_CR6_READ = 6, + VMEXIT_CR7_READ = 7, + VMEXIT_CR8_READ = 8, + VMEXIT_CR9_READ = 9, + VMEXIT_CR10_READ = 10, + VMEXIT_CR11_READ = 11, + VMEXIT_CR12_READ = 12, + VMEXIT_CR13_READ = 13, + VMEXIT_CR14_READ = 14, + VMEXIT_CR15_READ = 15, + + /* control register write exitcodes */ + VMEXIT_CR0_WRITE = 16, + VMEXIT_CR1_WRITE = 17, + VMEXIT_CR2_WRITE = 18, + VMEXIT_CR3_WRITE = 19, + VMEXIT_CR4_WRITE = 20, + VMEXIT_CR5_WRITE = 21, + VMEXIT_CR6_WRITE = 22, + VMEXIT_CR7_WRITE = 23, + VMEXIT_CR8_WRITE = 24, + VMEXIT_CR9_WRITE = 25, + VMEXIT_CR10_WRITE = 26, + VMEXIT_CR11_WRITE = 27, + VMEXIT_CR12_WRITE = 28, + VMEXIT_CR13_WRITE = 29, + VMEXIT_CR14_WRITE = 30, + VMEXIT_CR15_WRITE = 31, + + /* debug register read exitcodes */ + VMEXIT_DR0_READ = 32, + VMEXIT_DR1_READ = 33, + VMEXIT_DR2_READ = 34, + VMEXIT_DR3_READ = 35, + VMEXIT_DR4_READ = 36, + 
VMEXIT_DR5_READ = 37, + VMEXIT_DR6_READ = 38, + VMEXIT_DR7_READ = 39, + VMEXIT_DR8_READ = 40, + VMEXIT_DR9_READ = 41, + VMEXIT_DR10_READ = 42, + VMEXIT_DR11_READ = 43, + VMEXIT_DR12_READ = 44, + VMEXIT_DR13_READ = 45, + VMEXIT_DR14_READ = 46, + VMEXIT_DR15_READ = 47, + + /* debug register write exitcodes */ + VMEXIT_DR0_WRITE = 48, + VMEXIT_DR1_WRITE = 49, + VMEXIT_DR2_WRITE = 50, + VMEXIT_DR3_WRITE = 51, + VMEXIT_DR4_WRITE = 52, + VMEXIT_DR5_WRITE = 53, + VMEXIT_DR6_WRITE = 54, + VMEXIT_DR7_WRITE = 55, + VMEXIT_DR8_WRITE = 56, + VMEXIT_DR9_WRITE = 57, + VMEXIT_DR10_WRITE = 58, + VMEXIT_DR11_WRITE = 59, + VMEXIT_DR12_WRITE = 60, + VMEXIT_DR13_WRITE = 61, + VMEXIT_DR14_WRITE = 62, + VMEXIT_DR15_WRITE = 63, + + /* processor exception exitcodes (VMEXIT_EXCP[0-31]) */ + VMEXIT_EXCEPTION_DE = 64, /* divide-by-zero-error */ + VMEXIT_EXCEPTION_DB = 65, /* debug */ + VMEXIT_EXCEPTION_NMI = 66, /* non-maskable-interrupt */ + VMEXIT_EXCEPTION_BP = 67, /* breakpoint */ + VMEXIT_EXCEPTION_OF = 68, /* overflow */ + VMEXIT_EXCEPTION_BR = 69, /* bound-range */ + VMEXIT_EXCEPTION_UD = 70, /* invalid-opcode*/ + VMEXIT_EXCEPTION_NM = 71, /* device-not-available */ + VMEXIT_EXCEPTION_DF = 72, /* double-fault */ + VMEXIT_EXCEPTION_09 = 73, /* unsupported (reserved) */ + VMEXIT_EXCEPTION_TS = 74, /* invalid-tss */ + VMEXIT_EXCEPTION_NP = 75, /* segment-not-present */ + VMEXIT_EXCEPTION_SS = 76, /* stack */ + VMEXIT_EXCEPTION_GP = 77, /* general-protection */ + VMEXIT_EXCEPTION_PF = 78, /* page-fault */ + VMEXIT_EXCEPTION_15 = 79, /* reserved */ + VMEXIT_EXCEPTION_MF = 80, /* x87 floating-point exception-pending */ + VMEXIT_EXCEPTION_AC = 81, /* alignment-check */ + VMEXIT_EXCEPTION_MC = 82, /* machine-check */ + VMEXIT_EXCEPTION_XF = 83, /* simd floating-point */ + + /* exceptions 20-31 (exitcodes 84-95) are reserved */ + + /* ...and the rest of the #VMEXITs */ + VMEXIT_INTR = 96, + VMEXIT_NMI = 97, + VMEXIT_SMI = 98, + VMEXIT_INIT = 99, + VMEXIT_VINTR = 100, + VMEXIT_CR0_SEL_WRITE = 
101, + VMEXIT_IDTR_READ = 102, + VMEXIT_GDTR_READ = 103, + VMEXIT_LDTR_READ = 104, + VMEXIT_TR_READ = 105, + VMEXIT_IDTR_WRITE = 106, + VMEXIT_GDTR_WRITE = 107, + VMEXIT_LDTR_WRITE = 108, + VMEXIT_TR_WRITE = 109, + VMEXIT_RDTSC = 110, + VMEXIT_RDPMC = 111, + VMEXIT_PUSHF = 112, + VMEXIT_POPF = 113, + VMEXIT_CPUID = 114, + VMEXIT_RSM = 115, + VMEXIT_IRET = 116, + VMEXIT_SWINT = 117, + VMEXIT_INVD = 118, + VMEXIT_PAUSE = 119, + VMEXIT_HLT = 120, + VMEXIT_INVLPG = 121, + VMEXIT_INVLPGA = 122, + VMEXIT_IOIO = 123, + VMEXIT_MSR = 124, + VMEXIT_TASK_SWITCH = 125, + VMEXIT_FERR_FREEZE = 126, + VMEXIT_SHUTDOWN = 127, + VMEXIT_VMRUN = 128, + VMEXIT_VMMCALL = 129, + VMEXIT_VMLOAD = 130, + VMEXIT_VMSAVE = 131, + VMEXIT_STGI = 132, + VMEXIT_CLGI = 133, + VMEXIT_SKINIT = 134, + VMEXIT_RDTSCP = 135, + VMEXIT_ICEBP = 136, + VMEXIT_WBINVD = 137, + VMEXIT_MONITOR = 138, + VMEXIT_MWAIT = 139, + VMEXIT_MWAIT_CONDITIONAL= 140, + VMEXIT_NPF = 1024, /* nested paging fault */ + VMEXIT_INVALID = -1 +}; + +#define HVM_SVM_EXIT_REASON_MAX 1025 +char * hvm_svm_exit_reason_name[HVM_SVM_EXIT_REASON_MAX] = { + /* 0-15 */ + "VMEXIT_CR0_READ", + "VMEXIT_CR1_READ", + "VMEXIT_CR2_READ", + "VMEXIT_CR3_READ", + "VMEXIT_CR4_READ", + "VMEXIT_CR5_READ", + "VMEXIT_CR6_READ", + "VMEXIT_CR7_READ", + "VMEXIT_CR8_READ", + "VMEXIT_CR9_READ", + "VMEXIT_CR10_READ", + "VMEXIT_CR11_READ", + "VMEXIT_CR12_READ", + "VMEXIT_CR13_READ", + "VMEXIT_CR14_READ", + "VMEXIT_CR15_READ", + /* 16-31 */ + "VMEXIT_CR0_WRITE", + "VMEXIT_CR1_WRITE", + "VMEXIT_CR2_WRITE", + "VMEXIT_CR3_WRITE", + "VMEXIT_CR4_WRITE", + "VMEXIT_CR5_WRITE", + "VMEXIT_CR6_WRITE", + "VMEXIT_CR7_WRITE", + "VMEXIT_CR8_WRITE", + "VMEXIT_CR9_WRITE", + "VMEXIT_CR10_WRITE", + "VMEXIT_CR11_WRITE", + "VMEXIT_CR12_WRITE", + "VMEXIT_CR13_WRITE", + "VMEXIT_CR14_WRITE", + "VMEXIT_CR15_WRITE", + /* 32-47 */ + "VMEXIT_DR0_READ", + "VMEXIT_DR1_READ", + "VMEXIT_DR2_READ", + "VMEXIT_DR3_READ", + "VMEXIT_DR4_READ", + "VMEXIT_DR5_READ", + "VMEXIT_DR6_READ", + 
"VMEXIT_DR7_READ", + "VMEXIT_DR8_READ", + "VMEXIT_DR9_READ", + "VMEXIT_DR10_READ", + "VMEXIT_DR11_READ", + "VMEXIT_DR12_READ", + "VMEXIT_DR13_READ", + "VMEXIT_DR14_READ", + "VMEXIT_DR15_READ", + /* 48-63 */ + "VMEXIT_DR0_WRITE", + "VMEXIT_DR1_WRITE", + "VMEXIT_DR2_WRITE", + "VMEXIT_DR3_WRITE", + "VMEXIT_DR4_WRITE", + "VMEXIT_DR5_WRITE", + "VMEXIT_DR6_WRITE", + "VMEXIT_DR7_WRITE", + "VMEXIT_DR8_WRITE", + "VMEXIT_DR9_WRITE", + "VMEXIT_DR10_WRITE", + "VMEXIT_DR11_WRITE", + "VMEXIT_DR12_WRITE", + "VMEXIT_DR13_WRITE", + "VMEXIT_DR14_WRITE", + "VMEXIT_DR15_WRITE", + /* 64-83 */ + "VMEXIT_EXCEPTION_DE", + "VMEXIT_EXCEPTION_DB", + "VMEXIT_EXCEPTION_NMI", + "VMEXIT_EXCEPTION_BP", + "VMEXIT_EXCEPTION_OF", + "VMEXIT_EXCEPTION_BR", + "VMEXIT_EXCEPTION_UD", + "VMEXIT_EXCEPTION_NM", + "VMEXIT_EXCEPTION_DF", + "VMEXIT_EXCEPTION_09", + "VMEXIT_EXCEPTION_TS", + "VMEXIT_EXCEPTION_NP", + "VMEXIT_EXCEPTION_SS", + "VMEXIT_EXCEPTION_GP", + "VMEXIT_EXCEPTION_PF", + "VMEXIT_EXCEPTION_15", + "VMEXIT_EXCEPTION_MF", + "VMEXIT_EXCEPTION_AC", + "VMEXIT_EXCEPTION_MC", + "VMEXIT_EXCEPTION_XF", + /* 84-95 */ + "VMEXIT_EXCEPTION_20", + "VMEXIT_EXCEPTION_21", + "VMEXIT_EXCEPTION_22", + "VMEXIT_EXCEPTION_23", + "VMEXIT_EXCEPTION_24", + "VMEXIT_EXCEPTION_25", + "VMEXIT_EXCEPTION_26", + "VMEXIT_EXCEPTION_27", + "VMEXIT_EXCEPTION_28", + "VMEXIT_EXCEPTION_29", + "VMEXIT_EXCEPTION_30", + "VMEXIT_EXCEPTION_31", + /* 96-99 */ + "VMEXIT_INTR", + "VMEXIT_NMI", + "VMEXIT_SMI", + "VMEXIT_INIT", + /* 100-109 */ + "VMEXIT_VINTR", + "VMEXIT_CR0_SEL_WRITE", + "VMEXIT_IDTR_READ", + "VMEXIT_GDTR_READ", + "VMEXIT_LDTR_READ", + "VMEXIT_TR_READ", + "VMEXIT_IDTR_WRITE", + "VMEXIT_GDTR_WRITE", + "VMEXIT_LDTR_WRITE", + "VMEXIT_TR_WRITE", + /* 110-119 */ + "VMEXIT_RDTSC", + "VMEXIT_RDPMC", + "VMEXIT_PUSHF", + "VMEXIT_POPF", + "VMEXIT_CPUID", + "VMEXIT_RSM", + "VMEXIT_IRET", + "VMEXIT_SWINT", + "VMEXIT_INVD", + "VMEXIT_PAUSE", + /* 120-129 */ + "VMEXIT_HLT", + "VMEXIT_INVLPG", + "VMEXIT_INVLPGA", + "VMEXIT_IOIO", + 
"VMEXIT_MSR", + "VMEXIT_TASK_SWITCH", + "VMEXIT_FERR_FREEZE", + "VMEXIT_SHUTDOWN", + "VMEXIT_VMRUN", + "VMEXIT_VMMCALL", + /* 130-139 */ + "VMEXIT_VMLOAD", + "VMEXIT_VMSAVE", + "VMEXIT_STGI", + "VMEXIT_CLGI", + "VMEXIT_SKINIT", + "VMEXIT_RDTSCP", + "VMEXIT_ICEBP", + "VMEXIT_WBINVD", + "VMEXIT_MONITOR", + "VMEXIT_MWAIT", + /* 140 */ + "VMEXIT_MWAIT_CONDITIONAL", + [VMEXIT_NPF] = "VMEXIT_NPF", /* nested paging fault */ +}; + + +#if ( HVM_VMX_EXIT_REASON_MAX > HVM_SVM_EXIT_REASON_MAX ) +# define HVM_EXIT_REASON_MAX HVM_VMX_EXIT_REASON_MAX +# error - Strange! +#else +# define HVM_EXIT_REASON_MAX HVM_SVM_EXIT_REASON_MAX +#endif + +/* General hvm information */ +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe +#define INVALIDATE_TLB_VECTOR 0xfd +#define EVENT_CHECK_VECTOR 0xfc +#define CALL_FUNCTION_VECTOR 0xfb +#define THERMAL_APIC_VECTOR 0xfa +#define LOCAL_TIMER_VECTOR 0xf9 + +#define EXTERNAL_INTERRUPT_MAX 256 + +/* Stringify numbers */ +char * hvm_extint_vector_name[EXTERNAL_INTERRUPT_MAX] = { + [SPURIOUS_APIC_VECTOR] = "SPURIOS_APIC", + [ERROR_APIC_VECTOR] = "ERROR_APIC", + [INVALIDATE_TLB_VECTOR]= "INVALIDATE_TLB", + [EVENT_CHECK_VECTOR]= "EVENT_CHECK", + [CALL_FUNCTION_VECTOR]= "CALL_FUNCTION", + [THERMAL_APIC_VECTOR]= "THERMAL_APIC", + [LOCAL_TIMER_VECTOR] = "LOCAL_TIMER", +}; + +#define HVM_TRAP_MAX 20 + +char * hvm_trap_name[HVM_TRAP_MAX] = { + [0] = "Divide", + [1] = "RESERVED", + [2] = "NMI", + [3] = "Breakpoint", + [4] = "Overflow", + [5] = "BOUND", + [6] = "Invalid Op", + [7] = "Coprocessor not present", + [8] = "Double Fault", + [9] = "Coprocessor segment overrun", + [10] = "TSS", + [11] = "Segment not present", + [12] = "Stack-segment fault", + [13] = "GP", + [14] = "Page fault", + [15] = "RESERVED", + [16] = "FPU", + [17] = "Alignment check", + [18] = "Machine check", + [19] = "SIMD", +}; + + +enum { + HVM_EVENT_HANDLER_PF_XEN = 1, + HVM_EVENT_HANDLER_PF_INJECT, + HVM_EVENT_HANDLER_INJ_EXC, + HVM_EVENT_HANDLER_INJ_VIRQ, + 
HVM_EVENT_HANDLER_REINJ_VIRQ, + HVM_EVENT_HANDLER_IO_READ, + HVM_EVENT_HANDLER_IO_WRITE, + HVM_EVENT_HANDLER_CR_READ, /* 8 */ + HVM_EVENT_HANDLER_CR_WRITE, + HVM_EVENT_HANDLER_DR_READ, + HVM_EVENT_HANDLER_DR_WRITE, + HVM_EVENT_HANDLER_MSR_READ, + HVM_EVENT_HANDLER_MSR_WRITE, + HVM_EVENT_HANDLER_CPUID, + HVM_EVENT_HANDLER_INTR, + HVM_EVENT_HANDLER_NMI, /* 16 */ + HVM_EVENT_HANDLER_SMI, + HVM_EVENT_HANDLER_VMCALL, + HVM_EVENT_HANDLER_HLT, + HVM_EVENT_HANDLER_INVLPG, + HVM_EVENT_HANDLER_MCE, + HVM_EVENT_HANDLER_IO_ASSIST, + HVM_EVENT_HANDLER_MMIO_ASSIST, + HVM_EVENT_HANDLER_CLTS, + HVM_EVENT_HANDLER_LMSW, + HVM_EVENT_HANDLER_MAX +}; +char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = { + NULL, + "pf_xen", + "pf_inject", + "inj_exc", + "inj_virq", + "reinj_virq", + "io_read", + "io_write", + "cr_read", /* 8 */ + "cr_write", + "dr_read", + "dr_write", + "msr_read", + "msr_write", + "cpuid", + "intr", + "nmi", /* 16 */ + "smi", + "vmcall", + "hlt", + "invlpg", + "mce", + "io_assist", + "mmio_assist", + "clts", + "lmsw", +}; + +enum { + HVM_VOL_VMENTRY, + HVM_VOL_VMEXIT, + HVM_VOL_HANDLER, + HVM_VOL_MAX +}; + +enum { + GUEST_INTERRUPT_CASE_NONE, + /* This interrupt woke, no other interrupts until halt */ + GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE, + /* This interrupt woke, maybe another interrupt before halt */ + GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY, + /* Time from interrupt (running) to halt */ + GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT, + GUEST_INTERRUPT_CASE_MAX, +}; + +char *guest_interrupt_case_name[] = { + [GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE]="wake to halt alone", + /* This interrupt woke, maybe another interrupt before halt */ + [GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY] ="wake to halt any ", + /* Time from interrupt (running) to halt */ + [GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT] ="intr to halt ", +}; + +char *hvm_vol_name[HVM_VOL_MAX] = { + [HVM_VOL_VMENTRY]="vmentry", + [HVM_VOL_VMEXIT] ="vmexit", + [HVM_VOL_HANDLER]="handler", +}; + +enum { + 
HYPERCALL_set_trap_table = 0, + HYPERCALL_mmu_update, + HYPERCALL_set_gdt, + HYPERCALL_stack_switch, + HYPERCALL_set_callbacks, + HYPERCALL_fpu_taskswitch, + HYPERCALL_sched_op_compat, + HYPERCALL_platform_op, + HYPERCALL_set_debugreg, + HYPERCALL_get_debugreg, + HYPERCALL_update_descriptor, + HYPERCALL_memory_op=12, + HYPERCALL_multicall, + HYPERCALL_update_va_mapping, + HYPERCALL_set_timer_op, + HYPERCALL_event_channel_op_compat, + HYPERCALL_xen_version, + HYPERCALL_console_io, + HYPERCALL_physdev_op_compat, + HYPERCALL_grant_table_op, + HYPERCALL_vm_assist, + HYPERCALL_update_va_mapping_otherdomain, + HYPERCALL_iret, + HYPERCALL_vcpu_op, + HYPERCALL_set_segment_base, + HYPERCALL_mmuext_op, + HYPERCALL_acm_op, + HYPERCALL_nmi_op, + HYPERCALL_sched_op, + HYPERCALL_callback_op, + HYPERCALL_xenoprof_op, + HYPERCALL_event_channel_op, + HYPERCALL_physdev_op, + HYPERCALL_hvm_op, + HYPERCALL_sysctl, + HYPERCALL_domctl, + HYPERCALL_kexec_op, + HYPERCALL_MAX +}; + +char *hypercall_name[HYPERCALL_MAX] = { + [HYPERCALL_set_trap_table]="set_trap_table", + [HYPERCALL_mmu_update]="mmu_update", + [HYPERCALL_set_gdt]="set_gdt", + [HYPERCALL_stack_switch]="stack_switch", + [HYPERCALL_set_callbacks]="set_callbacks", + [HYPERCALL_fpu_taskswitch]="fpu_taskswitch", + [HYPERCALL_sched_op_compat]="sched_op(compat)", + [HYPERCALL_platform_op]="platform_op", + [HYPERCALL_set_debugreg]="set_debugreg", + [HYPERCALL_get_debugreg]="get_debugreg", + [HYPERCALL_update_descriptor]="update_descriptor", + [HYPERCALL_memory_op]="memory_op", + [HYPERCALL_multicall]="multicall", + [HYPERCALL_update_va_mapping]="update_va_mapping", + [HYPERCALL_set_timer_op]="set_timer_op", + [HYPERCALL_event_channel_op_compat]="evtchn_op(compat)", + [HYPERCALL_xen_version]="xen_version", + [HYPERCALL_console_io]="console_io", + [HYPERCALL_physdev_op_compat]="physdev_op(compat)", + [HYPERCALL_grant_table_op]="grant_table_op", + [HYPERCALL_vm_assist]="vm_assist", + 
[HYPERCALL_update_va_mapping_otherdomain]="update_va_mapping_otherdomain", + [HYPERCALL_iret]="iret", + [HYPERCALL_vcpu_op]="vcpu_op", + [HYPERCALL_set_segment_base]="set_segment_base", + [HYPERCALL_mmuext_op]="mmuext_op", + [HYPERCALL_acm_op]="acm_op", + [HYPERCALL_nmi_op]="nmi_op", + [HYPERCALL_sched_op]="sched_op", + [HYPERCALL_callback_op]="callback_op", + [HYPERCALL_xenoprof_op]="xenoprof_op", + [HYPERCALL_event_channel_op]="evtchn_op", + [HYPERCALL_physdev_op]="physdev_op", + [HYPERCALL_hvm_op]="hvm_op", + [HYPERCALL_sysctl]="sysctl", + [HYPERCALL_domctl]="domctl", + [HYPERCALL_kexec_op]="kexec_op" +}; + +enum { + PF_XEN_EMUL_LVL_0, + PF_XEN_EMUL_LVL_1, + PF_XEN_EMUL_LVL_2, + PF_XEN_EMUL_LVL_3, + PF_XEN_EMUL_LVL_4, + PF_XEN_EMUL_EARLY_UNSHADOW, + PF_XEN_EMUL_SET_CHANGED, + PF_XEN_EMUL_SET_UNCHANGED, + PF_XEN_EMUL_SET_FLUSH, + PF_XEN_EMUL_SET_ERROR, + PF_XEN_EMUL_PROMOTE, + PF_XEN_EMUL_DEMOTE, + PF_XEN_EMUL_PREALLOC_UNPIN, + PF_XEN_EMUL_PREALLOC_UNHOOK, + PF_XEN_EMUL_MAX, +}; + +char * pf_xen_emul_name[PF_XEN_EMUL_MAX] = { + [PF_XEN_EMUL_LVL_0]="non-linmap", + [PF_XEN_EMUL_LVL_1]="linmap l1", + [PF_XEN_EMUL_LVL_2]="linmap l2", + [PF_XEN_EMUL_LVL_3]="linmap l3", + [PF_XEN_EMUL_LVL_4]="linmap l4", + [PF_XEN_EMUL_EARLY_UNSHADOW]="early unshadow", + [PF_XEN_EMUL_SET_UNCHANGED]="set unchanged", + [PF_XEN_EMUL_SET_CHANGED]="set changed", + [PF_XEN_EMUL_SET_FLUSH]="set changed", + [PF_XEN_EMUL_SET_ERROR]="set changed", + [PF_XEN_EMUL_PROMOTE]="promote", + [PF_XEN_EMUL_DEMOTE]="demote", + [PF_XEN_EMUL_PREALLOC_UNPIN]="unpin", + [PF_XEN_EMUL_PREALLOC_UNHOOK]="unhook", +}; + +/* Rio only */ +enum { + PF_XEN_NON_EMUL_VA_USER, + PF_XEN_NON_EMUL_VA_KERNEL, + PF_XEN_NON_EMUL_EIP_USER, + PF_XEN_NON_EMUL_EIP_KERNEL, + PF_XEN_NON_EMUL_MAX, +}; + +char * pf_xen_non_emul_name[PF_XEN_NON_EMUL_MAX] = { + [PF_XEN_NON_EMUL_VA_USER]="va user", + [PF_XEN_NON_EMUL_VA_KERNEL]="va kernel", + [PF_XEN_NON_EMUL_EIP_USER]="eip user", + [PF_XEN_NON_EMUL_EIP_KERNEL]="eip kernel", +}; + +enum { 
+ PF_XEN_FIXUP_PREALLOC_UNPIN, + PF_XEN_FIXUP_PREALLOC_UNHOOK, + PF_XEN_FIXUP_UNSYNC, + PF_XEN_FIXUP_OOS_ADD, + PF_XEN_FIXUP_OOS_EVICT, + PF_XEN_FIXUP_PROMOTE, + PF_XEN_FIXUP_UPDATE_ONLY, + PF_XEN_FIXUP_WRMAP, + PF_XEN_FIXUP_BRUTE_FORCE, + PF_XEN_FIXUP_MAX, +}; + +char * pf_xen_fixup_name[PF_XEN_FIXUP_MAX] = { + [PF_XEN_FIXUP_PREALLOC_UNPIN] = "unpin", + [PF_XEN_FIXUP_PREALLOC_UNHOOK] = "unhook", + [PF_XEN_FIXUP_UNSYNC] = "unsync", + [PF_XEN_FIXUP_OOS_ADD] = "oos-add", + [PF_XEN_FIXUP_OOS_EVICT] = "oos-evict", + [PF_XEN_FIXUP_PROMOTE] = "promote", + [PF_XEN_FIXUP_UPDATE_ONLY] = "update", + [PF_XEN_FIXUP_WRMAP] = "wrmap", + [PF_XEN_FIXUP_BRUTE_FORCE] = "wrmap-bf", +}; + +enum { + PF_XEN_NOT_SHADOW = 1, + PF_XEN_FAST_PROPAGATE, + PF_XEN_FAST_MMIO, + PF_XEN_FALSE_FAST_PATH, + PF_XEN_MMIO, + PF_XEN_FIXUP, + PF_XEN_DOMF_DYING, + PF_XEN_EMULATE, + PF_XEN_EMULATE_UNSHADOW_USER, + PF_XEN_EMULATE_UNSHADOW_EVTINJ, + PF_XEN_EMULATE_UNSHADOW_UNHANDLED, + PF_XEN_EMULATE_UNSHADOW_HEURISTIC, + PF_XEN_LAST_FAULT=PF_XEN_EMULATE_UNSHADOW_HEURISTIC, + PF_XEN_NON_EMULATE, + PF_XEN_NO_HANDLER, + PF_XEN_MAX, +}; + +#define SHADOW_PREALLOC_UNPIN 13 +#define SHADOW_RESYNC_FULL 24 +#define SHADOW_RESYNC_ONLY 25 + +char * pf_xen_name[PF_XEN_MAX] = { + [PF_XEN_NOT_SHADOW]="propagate", + [PF_XEN_FAST_PROPAGATE]="fast propagate", + [PF_XEN_FAST_MMIO]="fast mmio", + [PF_XEN_FALSE_FAST_PATH]="false fast path", + [PF_XEN_MMIO]="mmio", + [PF_XEN_FIXUP]="fixup", + [PF_XEN_DOMF_DYING]="dom dying", + [PF_XEN_EMULATE]="emulate", + [PF_XEN_EMULATE_UNSHADOW_USER]="unshadow:user-mode", + [PF_XEN_EMULATE_UNSHADOW_EVTINJ]="unshadow:evt inj", + [PF_XEN_EMULATE_UNSHADOW_UNHANDLED]="unshadow:unhandled instr", + [PF_XEN_NON_EMULATE]="fixup|mmio", + [PF_XEN_NO_HANDLER]="(no handler)", +}; + +#define CORR_VA_INVALID (0ULL-1) + +struct pf_xen_extra { + unsigned long long va; + union { + unsigned flags; + struct { + unsigned flag_set_ad:1, + flag_set_a:1, + flag_shadow_l1_get_ref:1, + flag_shadow_l1_put_ref:1, + 
flag_l2_propagate:1, + flag_set_changed:1, + flag_set_flush:1, + flag_set_error:1, + flag_demote:1, + flag_promote:1, + flag_wrmap:1, + flag_wrmap_guess_found:1, + flag_wrmap_brute_force:1, + flag_early_unshadow:1, + flag_emulation_2nd_pt_written:1, + flag_emulation_last_failed:1, + flag_emulate_full_pt:1, + flag_prealloc_unhook:1, + flag_unsync:1, + flag_oos_fixup_add:1, + flag_oos_fixup_evict:1; + }; + }; /* Miami + ; fixup & emulate */ + unsigned long error_code; /* Rio only */ + + /* Calculated */ + int pf_case; /* Rio */ + + /* MMIO only */ + unsigned long long gpa; + unsigned long data; + + /* Emulate only */ + unsigned long long gl1e; /* Miami + */ + unsigned long long wval; /* Miami */ + unsigned long long corresponding_va; + unsigned int pt_index[5], pt_is_lo; + int pt_level; + + /* Other */ + unsigned long long gfn; + + /* Flags */ + unsigned corr_valid:1, + corr_is_kernel:1, + va_is_kernel:1, + mmio_data_valid:1, + mmio_is_write:1; +}; + +struct pcpu_info; + +#define GUEST_INTERRUPT_MAX 350 +#define CR_MAX 9 +#define RESYNCS_MAX 17 +#define PF_XEN_FIXUP_UNSYNC_RESYNC_MAX 2 + +struct hvm_data { + /* Summary information */ + int init; + int vmexit_valid; + int summary_info; + struct vcpu_data *v; /* up-pointer */ + + /* SVM / VMX compatibility. 
FIXME - should be global */ + char ** exit_reason_name; + int exit_reason_max; + void (* exit_reason_summary_handler[HVM_EXIT_REASON_MAX])(struct hvm_data *); + + /* Information about particular exit reasons */ + struct { + struct event_cycle_summary exit_reason[HVM_EXIT_REASON_MAX]; + int extint[EXTERNAL_INTERRUPT_MAX+1]; + int *extint_histogram; + struct event_cycle_summary trap[HVM_TRAP_MAX]; + struct event_cycle_summary pf_xen[PF_XEN_MAX]; + struct event_cycle_summary pf_xen_emul[PF_XEN_EMUL_MAX]; + struct event_cycle_summary pf_xen_emul_early_unshadow[5]; + struct event_cycle_summary pf_xen_non_emul[PF_XEN_NON_EMUL_MAX]; + struct event_cycle_summary pf_xen_fixup[PF_XEN_FIXUP_MAX]; + struct event_cycle_summary pf_xen_fixup_unsync_resync[PF_XEN_FIXUP_UNSYNC_RESYNC_MAX+1]; + struct event_cycle_summary cr_write[CR_MAX]; + struct event_cycle_summary cr3_write_resyncs[RESYNCS_MAX+1]; + struct hvm_gi_struct { + int count; + struct cycle_summary runtime[GUEST_INTERRUPT_CASE_MAX]; + /* OK, not summary info, but still... */ + int is_wake; + tsc_t start_tsc; + } guest_interrupt[GUEST_INTERRUPT_MAX + 1]; + /* IPI Latency */ + struct event_cycle_summary ipi_latency; + int ipi_count[256]; + struct io_address *mmio, *pio; + } summary; + + /* In-flight accumulation information */ + union { + struct { + unsigned port:16,size:16; + unsigned long val; + } io; + struct pf_xen_extra pf_xen; + struct { + unsigned cr; + unsigned long long val; + int repromote; + } cr_write; + struct { + unsigned addr; + unsigned long long val; + } msr; + struct { + unsigned long event; + uint32_t d[4]; + } generic; + } inflight; + int resyncs; + void (*post_process)(struct hvm_data *); + tsc_t exit_tsc, arc_cycles, entry_tsc; + unsigned long long rip; + unsigned exit_reason, event_handler; + char dump_header[256]; + int short_summary_done:1, prealloc_unpin:1; + + /* Immediate processing */ + void *d; + + /* Wake-to-halt detection. See comment above. 
*/ + struct { + unsigned waking:1; + /* Wake vector: keep track of time from vmentry until: + next halt, or next interrupt */ + int vector, interrupts, interrupts_wanting_tsc; + } w2h; +}; + +enum { + HVM_SHORT_SUMMARY_EMULATE, + HVM_SHORT_SUMMARY_UNSYNC, + HVM_SHORT_SUMMARY_FIXUP, + HVM_SHORT_SUMMARY_MMIO, + HVM_SHORT_SUMMARY_PROPAGATE, + HVM_SHORT_SUMMARY_CR3, + HVM_SHORT_SUMMARY_VMCALL, + HVM_SHORT_SUMMARY_INTERRUPT, + HVM_SHORT_SUMMARY_HLT, + HVM_SHORT_SUMMARY_OTHER, + HVM_SHORT_SUMMARY_MAX, +}; + +char *hvm_short_summary_name[HVM_SHORT_SUMMARY_MAX] = { + [HVM_SHORT_SUMMARY_EMULATE] ="emulate", + [HVM_SHORT_SUMMARY_UNSYNC] ="unsync", + [HVM_SHORT_SUMMARY_FIXUP] ="fixup", + [HVM_SHORT_SUMMARY_MMIO] ="mmio", + [HVM_SHORT_SUMMARY_PROPAGATE]="propagate", + [HVM_SHORT_SUMMARY_CR3] ="cr3", + [HVM_SHORT_SUMMARY_VMCALL] ="vmcall", + [HVM_SHORT_SUMMARY_INTERRUPT]="intr", + [HVM_SHORT_SUMMARY_HLT] ="hlt", + [HVM_SHORT_SUMMARY_OTHER] ="other", +}; + +struct hvm_short_summary_struct { + struct cycle_summary s[HVM_SHORT_SUMMARY_MAX]; +}; + +void init_hvm_data(struct hvm_data *h, struct vcpu_data *v) { + int i; + + if(h->init) + return; + + h->v = v; + + h->init = 1; + + if(opt.svm_mode) { + h->exit_reason_max = HVM_SVM_EXIT_REASON_MAX; + h->exit_reason_name = hvm_svm_exit_reason_name; + } else { + h->exit_reason_max = HVM_VMX_EXIT_REASON_MAX; + h->exit_reason_name = hvm_vmx_exit_reason_name; + } + + if(opt.histogram_interrupt_eip) { + int count = ((1ULL<summary.extint_histogram = malloc(size); + if(h->summary.extint_histogram) + bzero(h->summary.extint_histogram, size); + else { + fprintf(stderr, "FATAL: Could not allocate %d bytes for interrupt histogram!\n", + size); + exit(1); + } + + } + for(i=0; isummary.guest_interrupt[i].count=0; +} + +/* PV data */ +enum { + PV_HYPERCALL=1, + PV_TRAP=3, + PV_PAGE_FAULT, + PV_FORCED_INVALID_OP, + PV_EMULATE_PRIVOP, + PV_EMULATE_4GB, + PV_MATH_STATE_RESTORE, + PV_PAGING_FIXUP, + PV_GDT_LDT_MAPPING_FAULT, + PV_PTWR_EMULATION, + 
PV_PTWR_EMULATION_PAE, + PV_MAX +}; + +char *pv_name[PV_MAX] = { + [PV_HYPERCALL]="hypercall", + [PV_TRAP]="trap", + [PV_PAGE_FAULT]="page_fault", + [PV_FORCED_INVALID_OP]="forced_invalid_op", + [PV_EMULATE_PRIVOP]="emulate privop", + [PV_EMULATE_4GB]="emulate 4g", + [PV_MATH_STATE_RESTORE]="math state restore", + [PV_PAGING_FIXUP]="paging fixup", + [PV_GDT_LDT_MAPPING_FAULT]="gdt/ldt mapping fault", + [PV_PTWR_EMULATION]="writable pt emulation" +}; + +#define PV_HYPERCALL_MAX 56 +#define PV_TRAP_MAX 20 + +struct pv_data { + unsigned summary_info:1; + int count[PV_MAX]; + int hypercall_count[PV_HYPERCALL_MAX]; + int trap_count[PV_TRAP_MAX]; +}; + +/* Sched data */ + +enum { + SCHED_DOM_ADD=1, + SCHED_DOM_REM, + SCHED_SLEEP, + SCHED_WAKE, + SCHED_YIELD, + SCHED_BLOCK, + SCHED_SHUTDOWN, + SCHED_CTL, + SCHED_ADJDOM, + SCHED_SWITCH, + SCHED_S_TIMER_FN, + SCHED_T_TIMER_FN, + SCHED_DOM_TIMER_FN, + SCHED_SWITCH_INFPREV, + SCHED_SWITCH_INFNEXT, + SCHED_SHUTDOWN_CODE, + SCHED_MAX +}; + +enum { + RUNSTATE_RUNNING=0, + RUNSTATE_RUNNABLE, + RUNSTATE_BLOCKED, + RUNSTATE_OFFLINE, + RUNSTATE_LOST, + RUNSTATE_QUEUED, + RUNSTATE_INIT, + RUNSTATE_MAX +}; + +char * runstate_name[RUNSTATE_MAX]={ + [RUNSTATE_RUNNING]= "running", + [RUNSTATE_RUNNABLE]="runnable", + [RUNSTATE_BLOCKED]= "blocked", /* to be blocked */ + [RUNSTATE_OFFLINE]= "offline", + [RUNSTATE_QUEUED]= "queued", + [RUNSTATE_INIT]= "init", + [RUNSTATE_LOST]= "lost", +}; + +/* Memory data */ +enum { + MEM_PAGE_GRANT_MAP = 1, + MEM_PAGE_GRANT_UNMAP, + MEM_PAGE_GRANT_TRANSFER, + MEM_MAX +}; + +char *mem_name[MEM_MAX] = { + [MEM_PAGE_GRANT_MAP] = "grant-map", + [MEM_PAGE_GRANT_UNMAP] = "grant-unmap", + [MEM_PAGE_GRANT_TRANSFER] = "grant-transfer", +}; + +/* Per-unit information. 
*/ + +struct cr3_value_struct { + struct cr3_value_struct * next; + struct cr3_value_struct * gnext; + unsigned long long gmfn; + int cr3_id; + unsigned long long first_time, last_time, run_time; + struct cycle_summary total_time, guest_time, hv_time; + int switch_count, flush_count; + + struct hvm_short_summary_struct hvm; + + struct { + int now; + int count; + } prealloc_unpin; + + struct { + unsigned callback:1; + unsigned flush_count, switch_count; + unsigned fixup_user, emulate_corr_user; + } destroy; +}; + +#define MAX_CPUS 32 +typedef uint32_t cpu_mask_t; + +#define IDLE_DOMAIN 32767 +#define DEFAULT_DOMAIN 32768 + +struct vlapic_struct { + int outstanding_ipis; + tsc_t first_ipi_tsc; + int ipi_virq_injected; +}; + +struct vcpu_data { + int vid; + struct domain_data *d; /* up-pointer */ + unsigned activated:1; + + int guest_paging_levels; + + /* Schedule info */ + struct { + int state; + tsc_t tsc; + /* TSC skew detection/correction */ + struct last_oldstate_struct { + int wrong, actual, pid; + tsc_t tsc; + } last_oldstate; + /* Performance counters */ + unsigned long long p1_start, p2_start; + } runstate; + struct pcpu_info *p; + tsc_t pcpu_tsc; + + /* Hardware tracking */ + struct { + long long val; + tsc_t start_time; + struct cr3_value_struct *data; + } cr3; + + /* IPI latency tracking */ + struct vlapic_struct vlapic; + + /* Summary info */ + struct cycle_framework f; + struct cycle_summary runstates[RUNSTATE_MAX]; + struct weighted_cpi_summary cpi; + struct cycle_summary cpu_affinity; + enum { + VCPU_DATA_NONE=0, + VCPU_DATA_HVM, + VCPU_DATA_PV + } data_type; + union { + struct hvm_data hvm; + struct pv_data pv; + }; +}; + +enum { + DOMAIN_RUNSTATE_BLOCKED=0, + DOMAIN_RUNSTATE_PARTIAL_RUN, + DOMAIN_RUNSTATE_FULL_RUN, + DOMAIN_RUNSTATE_PARTIAL_CONTENTION, + DOMAIN_RUNSTATE_CONCURRENCY_HAZARD, + DOMAIN_RUNSTATE_FULL_CONTENTION, + DOMAIN_RUNSTATE_LOST, + DOMAIN_RUNSTATE_MAX +}; + +char * domain_runstate_name[] = { + [DOMAIN_RUNSTATE_BLOCKED]="blocked", + 
[DOMAIN_RUNSTATE_PARTIAL_RUN]="partial run", + [DOMAIN_RUNSTATE_FULL_RUN]="full run", + [DOMAIN_RUNSTATE_PARTIAL_CONTENTION]="partial contention", + [DOMAIN_RUNSTATE_CONCURRENCY_HAZARD]="concurrency_hazard", + [DOMAIN_RUNSTATE_FULL_CONTENTION]="full_contention", + [DOMAIN_RUNSTATE_LOST]="lost", +}; + +struct domain_data { + struct domain_data *next; + int did; + struct vcpu_data *vcpu[MAX_CPUS]; + + int max_vid; + + int runstate; + tsc_t runstate_tsc; + struct cycle_summary total_time; + struct cycle_summary runstates[DOMAIN_RUNSTATE_MAX]; + struct cr3_value_struct *cr3_value_head; + struct eip_list_struct *emulate_eip_list; + + int guest_interrupt[GUEST_INTERRUPT_MAX+1]; + struct hvm_short_summary_struct hvm_short; + struct { + int done[MEM_MAX]; + int done_interval[MEM_MAX]; + + int done_for[MEM_MAX]; + int done_for_interval[MEM_MAX]; + } memops; +}; + +struct domain_data * domain_list=NULL; + +struct domain_data default_domain; + +enum { + TOPLEVEL_GEN=0, + TOPLEVEL_SCHED, + TOPLEVEL_DOM0OP, + TOPLEVEL_HVM, + TOPLEVEL_MEM, + TOPLEVEL_PV, + TOPLEVEL_SHADOW, + TOPLEVEL_MAX=12 +}; + +char * toplevel_name[TOPLEVEL_MAX] = { + [TOPLEVEL_GEN]="gen", + [TOPLEVEL_SCHED]="sched", + [TOPLEVEL_DOM0OP]="dom0op", + [TOPLEVEL_HVM]="hvm", + [TOPLEVEL_MEM]="mem", + [TOPLEVEL_PV]="pv", + [TOPLEVEL_SHADOW]="shadow", +}; + +struct trace_volume { + unsigned long long toplevel[TOPLEVEL_MAX]; + unsigned long long sched_verbose; + unsigned long long hvm[HVM_VOL_MAX]; +} volume; + +#define UPDATE_VOLUME(_p,_x,_s) \ + do { \ + (_p)->volume.total._x += _s; \ + (_p)->volume.last_buffer._x += _s; \ + } while(0) + +void volume_clear(struct trace_volume *vol) +{ + bzero(vol, sizeof(*vol)); +} + +void volume_summary(struct trace_volume *vol) +{ + int j, k; + for(j=0; jtoplevel[j]) { + printf(" %-6s: %10lld\n", + toplevel_name[j], vol->toplevel[j]); + switch(j) { + case TOPLEVEL_SCHED: + if(vol->sched_verbose) + printf(" +-verbose: %10lld\n", + vol->sched_verbose); + break; + case TOPLEVEL_HVM: 
+ for(k=0; khvm[k]) + printf(" +-%-7s: %10lld\n", + hvm_vol_name[k], vol->hvm[k]); + } + + break; + } + } +} + +struct pcpu_info { + /* Information about this pcpu */ + unsigned active:1, summary:1; + int pid; + + /* Information related to scanning thru the file */ + tsc_t first_tsc, last_tsc, order_tsc; + loff_t file_offset; + loff_t next_cpu_change_offset; + struct record_info ri; + int last_cpu_change_pid; + + /* Information related to tsc skew detection / correction */ + struct { + tsc_t offset; + cpu_mask_t downstream; /* To detect cycles in dependencies */ + } tsc_skew; + + /* Information related to domain tracking */ + struct vcpu_data * current; + struct { + unsigned active:1, + domain_valid:1; + unsigned did:16,vid:16; + tsc_t tsc; + } lost_record; + + /* Record volume */ + struct { + tsc_t buffer_first_tsc, + buffer_dom0_runstate_tsc, + buffer_dom0_runstate_cycles[RUNSTATE_MAX]; + int buffer_dom0_runstate; + unsigned buffer_size; + struct trace_volume total, last_buffer; + } volume; +}; + +void __fill_in_record_info(struct pcpu_info *p); + +#define INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX 10 + +struct { + int max_active_pcpu; + loff_t last_epoch_offset; + int early_eof; + int lost_cpus; + tsc_t now; + struct cycle_framework f; + tsc_t buffer_trace_virq_tsc; + struct pcpu_info pcpu[MAX_CPUS]; + + struct { + int id; + /* Invariant: head null => tail null; head !null => tail valid */ + struct cr3_value_struct *head, **tail; + } cr3; + + struct { + tsc_t start_tsc; + /* Information about specific interval output types */ + union { + struct { + struct cr3_value_struct ** array; + } cr3; + struct { + struct domain_data *d; + int guest_vector[INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX]; + } domain; + }; + } interval; +} P = { 0 }; + +char * pcpu_string(int pcpu); +void process_generic(struct record_info *ri); +void dump_generic(FILE *f, struct record_info *ri); +ssize_t __read_record(int fd, struct trace_record *rec, loff_t offset); + +void cpumask_init(cpu_mask_t *c) { + 
*c = 0UL; +} + +void cpumask_clear(cpu_mask_t *c, int cpu) { + *c &= ~(1UL << cpu); +} + +void cpumask_set(cpu_mask_t *c, int cpu) { + *c |= (1UL << cpu); +} + +int cpumask_isset(const cpu_mask_t *c, int cpu) { + if(*c & (1UL<time = ((c) * 1000) / (opt.cpu_hz / 1000000 ); + t->s = t->time / 1000000000; + t->ns = t->time % 1000000000; +} + +void abs_cycles_to_time(unsigned long long ac, struct time_struct *t) { + if(ac > P.f.first_tsc) { + t->time = ((ac - P.f.first_tsc) * 1000) / (opt.cpu_hz / 1000000 ); + t->s = t->time / 1000000000; + t->ns = t->time % 1000000000; + } else { + t->time = t->s = t->ns = 0; + } +} + +tsc_t abs_cycles_to_global(unsigned long long ac) { + if(ac > P.f.first_tsc) + return ac - P.f.first_tsc; + else + return 0; +} + +void scatterplot_vs_time(tsc_t atsc, long long y) { + struct time_struct t; + + abs_cycles_to_time(atsc, &t); + + printf("%lu.%09lu %lld\n", t.s, t.ns, y); +} + +/* -- Summary Code -- */ + +/* With compliments to "Numerical Recipes in C", which provided the algorithm + * and basic template for this function. */ +long long percentile(long long * A, int N, int ple) { + int I, J, L, R, K; + + long long X, W; + + /* Find K, the element # we want */ + K=N*ple/100; + + /* Set the left and right boundaries of the current search space */ + L=0; R=N-1; + + while(L < R) { + /* X: The value to order everything higher / lower than */ + X=A[K]; + + /* Starting at the left and the right... */ + I=L; + J=R; + + do { + /* Find the first element on the left that is out-of-order w/ X */ + while(A[I]> 1; + /* X: The value to order everything higher / lower than */ + X=A[K]; + + /* Starting at the left and the right... */ + I=L; I_weight = L_weight; + J=R; J_weight = R_weight; + + do { + /* Find the first element on the left that is out-of-order w/ X */ + while(A[I]> 1; + /* X: The value to order everything higher / lower than */ + X=A[K]; + + /* Starting at the left and the right... 
*/ + I=L; I_weight = L_weight; + J=R; J_weight = R_weight; + + do { + /* Find the first element on the left that is out-of-order w/ X */ + while(A[I]cycles, f->total_cycles); +} + +static inline double summary_percent_global(struct event_cycle_summary *s) { + return __summary_percent(s, &P.f); +} + +static inline void update_summary(struct event_cycle_summary *s, long long c) { +/* We don't know ahead of time how many samples there are, and working + * with dynamic stuff is a pain, and unnecessary. This algorithm will + * generate a sample set that approximates an even sample. We can + * then take the percentiles on this, and get an approximate value. */ + if(c) { + if(opt.sample_size) { + int lap = (s->cycles_count/opt.sample_size)+1, + index =s->cycles_count % opt.sample_size; + if((index - (lap/3))%lap == 0) { + if(!s->cycles_sample) { + s->cycles_sample = malloc(sizeof(*s->cycles_sample) * opt.sample_size); + if(!s->cycles_sample) { + fprintf(stderr, "%s: malloc failed!\n", __func__); + exit(1); + } + } + s->cycles_sample[index]=c; + } + } + s->cycles_count++; + s->cycles += c; + + s->interval_count++; + s->interval_cycles += c; + } + s->count++; +} + +static inline void clear_interval_summary(struct event_cycle_summary *s) { + s->interval_count = 0; + s->interval_cycles = 0; +} + +static inline void update_cycles(struct cycle_summary *s, long long c) { +/* We don't know ahead of time how many samples there are, and working + * with dynamic stuff is a pain, and unnecessary. This algorithm will + * generate a sample set that approximates an even sample. We can + * then take the percentiles on this, and get an approximate value. 
*/ + int lap, index; + + assert(c!=0); + + if ( opt.sample_size ) { + lap = (s->count/opt.sample_size)+1; + index =s->count % opt.sample_size; + + if((index - (lap/3))%lap == 0) { + if(!s->sample) { + s->sample = malloc(sizeof(*s->sample) * opt.sample_size); + if(!s->sample) { + fprintf(stderr, "%s: malloc failed!\n", __func__); + exit(1); + } + } + s->sample[index] = c; + } + } + + if(c > 0) { + s->cycles += c; + s->interval_cycles += c; + } else { + s->cycles += -c; + s->interval_cycles += -c; + } + s->count++; + s->interval_count++; +} + +static inline void clear_interval_cycles(struct cycle_summary *s) { + s->interval_cycles = 0; + s->interval_count = 0; +} + +static inline void update_cpi(struct weighted_cpi_summary *s, + unsigned long long i, + unsigned long long c) { +/* We don't know ahead of time how many samples there are, and working + * with dynamic stuff is a pain, and unnecessary. This algorithm will + * generate a sample set that approximates an even sample. We can + * then take the percentiles on this, and get an approximate value. 
*/ + int lap, index; + + if ( opt.sample_size ) { + lap = (s->count/opt.sample_size)+1; + index =s->count % opt.sample_size; + + if((index - (lap/3))%lap == 0) { + if(!s->cpi) { + assert(!s->cpi_weight); + + s->cpi = malloc(sizeof(*s->cpi) * opt.sample_size); + s->cpi_weight = malloc(sizeof(*s->cpi_weight) * opt.sample_size); + if(!s->cpi || !s->cpi_weight) { + fprintf(stderr, "%s: malloc failed!\n", __func__); + exit(1); + } + } + assert(s->cpi_weight); + + s->cpi[index] = (float) c / i; + s->cpi_weight[index]=c; + } + } + + s->instructions += i; + s->cycles += c; + s->count++; + + s->interval_instructions += i; + s->interval_cycles += c; + s->interval_count++; +} + +static inline void clear_interval_cpi(struct weighted_cpi_summary *s) { + s->interval_cycles = 0; + s->interval_count = 0; + s->interval_instructions = 0; +} + +static inline void print_cpu_affinity(struct cycle_summary *s, char *p) { + if(s->count) { + long long avg; + + avg = s->cycles / s->count; + + if ( opt.sample_size ) { + long long p5, p50, p95; + int data_size = s->count; + if(data_size > opt.sample_size) + data_size = opt.sample_size; + + p50 = percentile(s->sample, data_size, 50); + p5 = percentile(s->sample, data_size, 5); + p95 = percentile(s->sample, data_size, 95); + + printf("%s: %7d %6lld {%6lld|%6lld|%6lld}\n", + p, s->count, avg, p5, p50, p95); + } else { + printf("%s: %7d %6lld\n", + p, s->count, avg); + } + } +} + +static inline void print_cpi_summary(struct weighted_cpi_summary *s) { + if(s->count) { + float avg; + + avg = (float)s->cycles / s->instructions; + + if ( opt.sample_size ) { + float p5, p50, p95; + int data_size = s->count; + + if(data_size > opt.sample_size) + data_size = opt.sample_size; + + p50 = weighted_percentile(s->cpi, s->cpi_weight, data_size, 50); + p5 = weighted_percentile(s->cpi, s->cpi_weight, data_size, 5); + p95 = weighted_percentile(s->cpi, s->cpi_weight, data_size, 95); + + printf(" CPI summary: %2.2f {%2.2f|%2.2f|%2.2f}\n", + avg, p5, p50, p95); + } 
else { + printf(" CPI summary: %2.2f\n", avg); + } + } +} + +static inline void print_cycle_percent_summary(struct cycle_summary *s, + tsc_t total, char *p) { + if(s->count) { + long long avg; + double percent, seconds; + + avg = s->cycles / s->count; + + seconds = ((double)s->cycles) / opt.cpu_hz; + + percent = ((double)(s->cycles * 100)) / total; + + if ( opt.sample_size ) { + long long p5, p50, p95; + int data_size = s->count; + + if(data_size > opt.sample_size) + data_size = opt.sample_size; + + p50 = self_weighted_percentile(s->sample, data_size, 50); + p5 = self_weighted_percentile(s->sample, data_size, 5); + p95 = self_weighted_percentile(s->sample, data_size, 95); + + printf("%s: %7d %5.2lfs %5.2lf%% %6lld {%6lld|%6lld|%6lld}\n", + p, s->count, + seconds, + percent, + avg, p5, p50, p95); + } else { + printf("%s: %7d %5.2lfs %5.2lf%% %6lld\n", + p, s->count, + seconds, + percent, + avg); + } + } +} + +static inline void print_cycle_summary(struct cycle_summary *s, char *p) { + if(s->count) { + long long avg; + + avg = s->cycles / s->count; + + if ( opt.sample_size ) { + long long p5, p50, p95; + int data_size = s->count; + + if(data_size > opt.sample_size) + data_size = opt.sample_size; + + p50 = self_weighted_percentile(s->sample, data_size, 50); + p5 = self_weighted_percentile(s->sample, data_size, 5); + p95 = self_weighted_percentile(s->sample, data_size, 95); + + printf("%s: %7d %5.2lfs %6lld {%6lld|%6lld|%6lld}\n", + p, s->count, ((double)s->cycles)/opt.cpu_hz, + avg, p5, p50, p95); + } else { + printf("%s: %7d %5.2lfs %6lld\n", + p, s->count, ((double)s->cycles)/opt.cpu_hz, avg); + } + } +} + +#define PRINT_SUMMARY(_s, _p...) 
\ + do { \ + if((_s).count) { \ + if ( opt.sample_size ) { \ + unsigned long long p5, p50, p95; \ + int data_size=(_s).cycles_count; \ + if(data_size > opt.sample_size) \ + data_size=opt.sample_size; \ + p50=percentile((_s).cycles_sample, data_size, 50); \ + p5=percentile((_s).cycles_sample, data_size, 5); \ + p95=percentile((_s).cycles_sample, data_size, 95); \ + printf(_p); \ + printf(" %7d %5.2lfs %5.2lf%% %5lld cyc {%5lld|%5lld|%5lld}\n", \ + (_s).count, \ + ((double)(_s).cycles)/opt.cpu_hz, \ + summary_percent_global(&(_s)), \ + (_s).cycles_count ? (_s).cycles / (_s).cycles_count:0, \ + p5, p50, p95); \ + } else { \ + printf(_p); \ + printf(" %7d %5.2lfs %5.2lf%% %5lld cyc\n", \ + (_s).count, \ + ((double)(_s).cycles)/opt.cpu_hz, \ + summary_percent_global(&(_s)), \ + (_s).cycles_count ? (_s).cycles / (_s).cycles_count:0); \ + } \ + } \ + } while(0) + +void __interval_cycle_percent_output(struct cycle_summary *s, tsc_t cycles) { + printf(" %.02lf", + __cycles_percent(s->interval_cycles, cycles)); + clear_interval_cycles(s); +} + +void interval_cycle_percent_output(struct cycle_summary *s) { + __interval_cycle_percent_output(s, opt.interval.cycles); +} + +void interval_time_output(void) { + struct time_struct t; + abs_cycles_to_time(P.interval.start_tsc, &t); + + printf("%lu.%09lu", t.s, t.ns); +} + +void interval_cr3_schedule_time_header(void) { + if( opt.interval.cr3.count ) { + int i; + + printf("time"); + for(i=0; ignext) { + printf(" %llx", cr3->gmfn); + } + printf("\n"); + } +} + +void interval_cr3_value_check(struct cr3_value_struct *cr3) { + if( opt.interval.cr3.count ) { + int i; + + for(i=0; igmfn == opt.interval.cr3.values[i]) { + if(P.interval.cr3.array[i]) { + fprintf(stderr, "Fatal: duplicate cr3 value %llx!\n", + cr3->gmfn); + exit(1); + } + fprintf(stderr, "%s: found gmfn %llx\n", + __func__, cr3->gmfn); + P.interval.cr3.array[i] = cr3; + } + } + } +} + +void interval_cr3_schedule_time_output(void) { + struct cr3_value_struct *cr3; + int i; + + 
interval_time_output(); + + if(opt.interval.cr3.count) { + for(i=0; itotal_time.interval_cycles, + opt.interval.cycles)); + clear_interval_cycles(&cr3->total_time); + } else { + printf(" 0.0"); + } + } + } else { + struct cr3_value_struct *cr3; + for(cr3 = P.cr3.head; cr3; cr3 = cr3->gnext) { + printf(" %.02lf", + __cycles_percent(cr3->total_time.interval_cycles, + opt.interval.cycles)); + clear_interval_cycles(&cr3->total_time); + } + } + printf("\n"); +} + +void interval_cr3_schedule_ordered_output(void) { + struct cr3_value_struct *p; + int i; + + struct cr3_value_struct **qsort_array; + int N=0; + + int cr3_time_compare(const void *_a, const void *_b) { + struct cr3_value_struct *a=*(typeof(&a))_a; + struct cr3_value_struct *b=*(typeof(&a))_b; + + if(a->total_time.interval_cycles < b->total_time.interval_cycles) + return 1; + else if(b->total_time.interval_cycles == a->total_time.interval_cycles) { + if(a->total_time.interval_count < b->total_time.interval_count) + return 1; + else if(a->total_time.interval_count == b->total_time.interval_count) + return 0; + else + return -1; + } else + return -1; + } + + for(p=P.cr3.head; p; p=p->gnext) + N++; + + if(!N) + return; + + qsort_array = malloc(N * sizeof(struct eip_list_struct *)); + + for(i=0, p=P.cr3.head; p; p=p->gnext, i++) + qsort_array[i]=p; + + qsort(qsort_array, N, sizeof(struct eip_list_struct *), + cr3_time_compare); + + interval_time_output(); + + for(i=0; itotal_time.interval_cycles > 0) { + printf(" %8llx: %.02lf %c\n", + p->gmfn, + __cycles_percent(p->total_time.interval_cycles, + opt.interval.cycles), + (p->first_time > P.interval.start_tsc)?'*':' '); + } + clear_interval_cycles(&p->total_time); + } + + free(qsort_array); +} + +void interval_cr3_short_summary_header(void) { + int i; + + printf("time guest"); + for(i=0; ihvm.s; + + printf(" %.02lf", + __cycles_percent(p->total_time.interval_cycles, + opt.interval.cycles)); + + for(i=0; itotal_time.interval_cycles); + + 
clear_interval_cycles(&p->total_time); + + printf("\n"); + } +} + +void interval_domain_short_summary_header(void) { + int i; + + printf("time running"); + for(i=0; itotal_time.interval_cycles; + + interval_time_output(); + + interval_cycle_percent_output(&d->total_time); + + for(i=0; ihvm_short.s + i, + interval_cycles); + + printf("\n"); + } +} + +void interval_domain_guest_interrupt(struct hvm_data *h, int vector) { + struct domain_data *d = h->v->d; + int i; + + /* Check to see if this vector is in the "print list" */ + for(i=0; iguest_interrupt[vector]++; +} + +void interval_domain_guest_interrupt_tail(void) { + int i; + + printf("time running"); + for(i=0; iguest_interrupt[v]); + + d->guest_interrupt[v]=0; + } + + printf("\n"); + } + +} + +void interval_domain_grant_maps_output(void) { + + if(P.interval.domain.d) { + struct domain_data *d; + + d=P.interval.domain.d; + + interval_time_output(); + + printf(" %d", d->memops.done_for_interval[MEM_PAGE_GRANT_MAP]); + + d->memops.done_for_interval[MEM_PAGE_GRANT_MAP] = 0; + + printf("\n"); + } +} + +/* General interval gateways */ + +void interval_callback(void) { + switch(opt.interval.output) { + case INTERVAL_CR3_SCHEDULE_TIME: + interval_cr3_schedule_time_output(); + break; + case INTERVAL_CR3_SCHEDULE_ORDERED: + interval_cr3_schedule_ordered_output(); + break; + case INTERVAL_CR3_SHORT_SUMMARY: + interval_cr3_short_summary_output(); + break; + case INTERVAL_DOMAIN_SHORT_SUMMARY: + interval_domain_short_summary_output(); + break; + case INTERVAL_DOMAIN_GUEST_INTERRUPT: + interval_domain_guest_interrupt_output(); + break; + case INTERVAL_DOMAIN_GRANT_MAPS: + interval_domain_grant_maps_output(); + break; + } +} + +void interval_header(void) { + switch(opt.interval.output) { + case INTERVAL_CR3_SCHEDULE_TIME: + interval_cr3_schedule_time_header(); + break; + case INTERVAL_CR3_SHORT_SUMMARY: + interval_cr3_short_summary_header(); + break; + case INTERVAL_DOMAIN_SHORT_SUMMARY: + 
interval_domain_short_summary_header(); + break; + default: + break; + } +} + +void interval_tail(void) { + switch(opt.interval.output) { + case INTERVAL_CR3_SCHEDULE_TIME: + interval_cr3_schedule_time_tail(); + break; + case INTERVAL_DOMAIN_GUEST_INTERRUPT: + interval_domain_guest_interrupt_tail(); + break; + default: + break; + } +} + +/* -- Eip list data -- */ + +void update_eip(struct eip_list_struct **head, unsigned long long eip, + unsigned long long cycles, int type, void * extra) { + struct eip_list_struct *p, **last=head; + + for(p=*head; p; last = (&p->next), p=p->next) + if(p->eip >= eip) + break; + + if(!p || p->eip != eip) { + p=malloc(sizeof(*p)); + if(!p) { + perror("malloc failed"); + exit(1); + } + + bzero(p, sizeof(*p)); + + p->eip=eip; + p->type = type; + + if(eip_list_type[type].new) { + eip_list_type[type].new(p, extra); + } + p->next = *last; + *last=p; + } else if(p->type != type) { + fprintf(stderr, "WARNING, mixed types! %d %d\n", p->type, type); + } else if(eip_list_type[type].update) { + eip_list_type[type].update(p, extra); + } + + update_summary(&p->summary, cycles); +} + +void dump_eip(struct eip_list_struct *head) { + struct eip_list_struct *p; + int i; + + struct eip_list_struct **qsort_array; + int N=0; + + int eip_compare(const void *_a, const void *_b) { + struct eip_list_struct *a=*(typeof(&a))_a; + struct eip_list_struct *b=*(typeof(&a))_b; + + if(a->summary.cycles < b->summary.cycles) + return 1; + else if(b->summary.cycles == a->summary.cycles) { + if(a->summary.count < b->summary.count) + return 1; + else if(a->summary.count == b->summary.count) + return 0; + else + return -1; + } else + return -1; + } + + for(p=head; p; p=p->next) + N++; + + if(!N) + return; + + qsort_array = malloc(N * sizeof(struct eip_list_struct *)); + + for(i=0, p=head; p; p=p->next, i++) + qsort_array[i]=p; + + qsort(qsort_array, N, sizeof(struct eip_list_struct *), + eip_compare); + + /* WARNING: don't use N after this point unless you copy this 
variable */ +#if 0 + if(opt.summary_eip_limit && opt.summary_eip_limit < N) + N=opt.summary_eip_limit; +#endif + + for(i=0; isummary, " %8llx%-45s: ", + p->eip, + find_symbol(p->eip)); + if(eip_list_type[p->type].dump) { + eip_list_type[p->type].dump(p); + } + } + + free(qsort_array); +} + +/* -- HVM code -- */ +struct hvm_pf_xen_record { + //unsigned vcpu:16, domain:16; + union { + struct { + unsigned long long va; + unsigned long error_code; + } x64; + struct { + unsigned long va; + unsigned long error_code; + } x32; + }; +}; + +void hvm_update_short_summary(struct hvm_data *h, int element) { + struct vcpu_data *v = h->v; + + if(v->cr3.data) + update_cycles(&v->cr3.data->hvm.s[element], h->arc_cycles); + + update_cycles(&v->d->hvm_short.s[element], h->arc_cycles); + + h->short_summary_done=1; +} + +void hvm_short_summary(struct hvm_short_summary_struct *hss, + tsc_t total, char *prefix) { + char desc[80]; + int i; + + for(i=0; is + i, total, desc); + } +} + +void hvm_set_summary_handler(struct hvm_data *h, void (*s)(struct hvm_data *h)) { + /* Set summary handler */ + if(h->exit_reason < h->exit_reason_max) + { + if(h->exit_reason_summary_handler[h->exit_reason]) + { + if(h->exit_reason_summary_handler[h->exit_reason]!= s) + fprintf(warn, "Strange, unexpected summary handler for exit_reason %s (%d!)\n", + h->exit_reason_name[h->exit_reason], h->exit_reason); + } + else + { + h->exit_reason_summary_handler[h->exit_reason] = s; + fprintf(warn, "Setting exit_reason %s (%d) summary handler\n", + h->exit_reason_name[h->exit_reason], h->exit_reason); + } + } +} + +#define SIGN_EXTENDED_BITS (~((1ULL<<48)-1)) +#define HIGH_BIT(_v) ((_v) & (1ULL<<47)) +static inline int is_valid_addr64(unsigned long long va) +{ + if(HIGH_BIT(va)) + return ((va & SIGN_EXTENDED_BITS) == SIGN_EXTENDED_BITS); + else + return ((va & SIGN_EXTENDED_BITS) == 0); +} + +void hvm_pf_xen_summary(struct hvm_data *h) { + int i,j, k; + + for(i=0; isummary.pf_xen[i], + " %-25s ", pf_xen_name[i]); + 
switch(i){ + case PF_XEN_NON_EMULATE: + for(j=0; jsummary.pf_xen_non_emul[j], + " *%-13s ", pf_xen_non_emul_name[j]); + break; + case PF_XEN_EMULATE: + for(j=0; jsummary.pf_xen_emul[j], + " *%-13s ", pf_xen_emul_name[j]); + if(j == PF_XEN_EMUL_EARLY_UNSHADOW) { + int k; + for(k=0; k<5; k++) { + PRINT_SUMMARY(h->summary.pf_xen_emul_early_unshadow[k], + " +[%d] ", k); + } + } + } + break; + case PF_XEN_FIXUP: + for(j=0; jsummary.pf_xen_fixup[j], + " *%-13s ", pf_xen_fixup_name[j]); + if(j == PF_XEN_FIXUP_UNSYNC ) { + for(k=0; ksummary.pf_xen_fixup_unsync_resync[k], + " +[%3d] ", k); + } + PRINT_SUMMARY(h->summary.pf_xen_fixup_unsync_resync[k], + " +[max] "); + } + } + break; + } + } +} + +void pf_preprocess(struct pf_xen_extra *e, int guest_paging_levels) +{ + switch(guest_paging_levels) { + /* Select a subfield of _bits bits starting at bit _shift from _x */ +#define _SUBFIELD(_bits, _shift, _x) \ + (((_x)>>(_shift)) & ((1ULL<<(_bits))-1)) + case 4: + /* Verify sign-extension */ + if((HIGH_BIT(e->va) + &&((e->va & SIGN_EXTENDED_BITS) != SIGN_EXTENDED_BITS)) + || (!HIGH_BIT(e->va) + && ((e->va & SIGN_EXTENDED_BITS) != 0))) { + fprintf(warn, "Strange, va %llx not properly sign extended for 4-level pagetables\n", + e->va); + } + e->pt_index[4]=_SUBFIELD(9,39,e->va); + e->pt_index[3]=_SUBFIELD(9,30,e->va); + e->pt_index[2]=_SUBFIELD(9,21,e->va); + e->pt_index[1]=_SUBFIELD(9,12,e->va); + /* These are only useful for the linear-pagetable code */ + e->pt_index[0]=_SUBFIELD(9,3,e->va); + if(e->va & 0x4) + e->pt_is_lo=0; + break; + case 3: + e->pt_index[3]=_SUBFIELD(2,30,e->va); + e->pt_index[2]=_SUBFIELD(9,21,e->va); + e->pt_index[1]=_SUBFIELD(9,12,e->va); + /* These are only useful for the linear-pagetable code */ + e->pt_index[0]=_SUBFIELD(9,3,e->va); + if(e->va & 0x4) + e->pt_is_lo=0; + break; + case 2: + e->pt_index[2]=_SUBFIELD(10,22,e->va); + e->pt_index[1]=_SUBFIELD(10,12,e->va); + /* This is only useful for the linear pagetable code */ + 
e->pt_index[0]=_SUBFIELD(10,2,e->va); + break; + case 0: + break; + default: + fprintf(warn, "Don't know how to handle %d-level pagetables\n", + guest_paging_levels); + } + + e->corresponding_va = CORR_VA_INVALID; + e->pt_level = 0; + + /* Detect accesses to Windows linear pagetables */ + switch(guest_paging_levels) + { + case 2: + if(e->pt_index[2] == 768) { + if(e->pt_index[1] == 768) + { + e->pt_level = 2; + e->corresponding_va=((1UL<<22)-1) + | e->pt_index[0]<<22; + } + else + { + e->pt_level = 1; + e->corresponding_va = ((1UL<<12)-1) + | e->pt_index[1]<<22 + | e->pt_index[0]<<12; + } + } + break; + case 3: + if(e->pt_index[3]==3 && (e->pt_index[2]>>2==0)) + { + if(e->pt_index[2]==3 && e->pt_index[1]>>2==0) + { + if(e->pt_index[1] == 3 && e->pt_index[0]>>2==0) + { + e->pt_level = 3; + e->corresponding_va=((1UL<<30)-1) + | e->pt_index[0]<<30; + } + else + { + e->pt_level = 2; + e->corresponding_va=((1UL<<21)-1) + | e->pt_index[1]<<30 + | e->pt_index[2]<<21; + } + } + else + { + e->pt_level = 1; + e->corresponding_va = ((1UL<<12)-1) + | e->pt_index[0]<<12 + | e->pt_index[1]<<21 + | e->pt_index[2]<<30; + } + } + break; + case 4: + if(e->pt_index[4] == 0x1ed) + { + if(e->pt_index[3] == 0x1ed) + { + if(e->pt_index[2] == 0x1ed) + { + if(e->pt_index[1] == 0x1ed) + { + e->pt_level = 4; + e->corresponding_va = ((1ULL<<39)-1) + | (unsigned long long)e->pt_index[0]<<39; + } + else + { + e->pt_level = 3; + e->corresponding_va = ((1ULL<<30)-1) + | (unsigned long long)e->pt_index[0]<<30 + | (unsigned long long)e->pt_index[1]<<39; + } + } + else + { + e->pt_level = 2; + e->corresponding_va = ((1ULL<<21)-1) + | (unsigned long long)e->pt_index[0]<<21 + | (unsigned long long)e->pt_index[1]<<30 + | (unsigned long long)e->pt_index[2]<<39; + } + } + else + { + e->pt_level = 1; + e->corresponding_va = ((1ULL<<12)-1) + | (unsigned long long)e->pt_index[0]<<12 + | (unsigned long long)e->pt_index[1]<<21 + | (unsigned long long)e->pt_index[2]<<30 + | (unsigned long 
long)e->pt_index[3]<<39; + } + + if(HIGH_BIT(e->corresponding_va)) + e->corresponding_va |= SIGN_EXTENDED_BITS; + } + break; + default: + break; + } +} + +void hvm_pf_xen_preprocess(unsigned event, struct hvm_data *h) { + struct pf_xen_extra *e = &h->inflight.pf_xen; + struct hvm_pf_xen_record *r = (typeof(r))h->d; + + if(event == TRC_HVM_PF_XEN64) + { + if(!is_valid_addr64(r->x64.va)) + fprintf(warn, "%s: invalid va %llx", + __func__, r->x64.va); + e->va = r->x64.va; + e->error_code = r->x64.error_code; + } + else + { + e->va = r->x32.va; + e->error_code = r->x32.error_code; + } + + if(e->mmio_data_valid) + e->pf_case = PF_XEN_MMIO; + else + { + pf_preprocess(e, h->v->guest_paging_levels); + + /* On rio traces, we try to infer emulation by looking for accesses + in the linear pagetable */ + if(e->pt_level > 0) + e->pf_case = PF_XEN_EMULATE; + else + e->pf_case = PF_XEN_NON_EMULATE; + } +} + +static inline int is_kernel(int paging_levels, unsigned long long va) { + switch(paging_levels) { + case 2: + case 3: + if(va & 0x80000000) + return 1; + else + return 0; + break; + case 4: + if(HIGH_BIT(va)) + return 1; + else return 0; + default: + return 0; + } + +} + +void hvm_pf_xen_postprocess(struct hvm_data *h) { + struct pf_xen_extra *e = &h->inflight.pf_xen; + + if(opt.summary_info) { + update_summary(&h->summary.pf_xen[e->pf_case], + h->arc_cycles); + switch(e->pf_case) + { + case PF_XEN_EMULATE: + update_eip(&h->v->d->emulate_eip_list, + h->rip, + h->arc_cycles, + 0, NULL); + break; + case PF_XEN_NON_EMULATE: + if(is_kernel(h->v->guest_paging_levels, h->rip)) + update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_EIP_KERNEL], + h->arc_cycles); + else + update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_EIP_USER], + h->arc_cycles); + if(is_kernel(h->v->guest_paging_levels, e->va)) + update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_VA_KERNEL], + h->arc_cycles); + + else + update_summary(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_VA_USER], + 
h->arc_cycles); + } + + /* Set summary handler */ + hvm_set_summary_handler(h, hvm_pf_xen_summary); + } + + if(opt.dump_cooked) + { + switch(e->pf_case) + { + case PF_XEN_EMULATE: + printf(" %s pf_xen:emulate va %llx ec %lx eip %llx%s lvl %d corr %llx\n", + h->dump_header, e->va, e->error_code, + h->rip, find_symbol(h->rip), + e->pt_level, e->corresponding_va); + break; + case PF_XEN_MMIO: + printf(" %s pf_xen:mmio va %llx ec %lx eip %llx%s data %lx\n", + h->dump_header, e->va, e->error_code, + h->rip, find_symbol(h->rip), + e->data); + break; + default: + printf(" %s pf_xen va %llx ec %lx eip %llx%s\n", + h->dump_header, e->va, e->error_code, + h->rip, find_symbol(h->rip)); + break; + } + } +} + +void hvm_pf_xen_process(struct record_info *ri, struct hvm_data *h) { + struct pf_xen_extra *e = &h->inflight.pf_xen; + + if(ri->event == TRC_HVM_PF_XEN64 + && h->v->guest_paging_levels != 4) + fprintf(warn, "Strange, PF_XEN64 but guest_paging_levels %d!\n", + h->v->guest_paging_levels); + else if(ri->event == TRC_HVM_PF_XEN + && h->v->guest_paging_levels == 4) + fprintf(warn, "Strange, PF_XEN but guest_paging_levels %d!\n", + h->v->guest_paging_levels); + + hvm_pf_xen_preprocess(ri->event, h); + + if(opt.dump_all) + { + if(e->pf_case == PF_XEN_EMULATE) + printf("]%s pf_xen:emulate va %llx ec %lx level %d corr %llx e->pt_index[%d %d %d %d %d]\n", + ri->dump_header, e->va, e->error_code, + e->pt_level, e->corresponding_va, + e->pt_index[0], e->pt_index[1], e->pt_index[2], + e->pt_index[3], + e->pt_index[4]); + else + printf("]%s pf_xen va %llx ec %lx e->pt_index[%d %d %d %d %d]\n", + ri->dump_header, e->va, e->error_code, + e->pt_index[0], e->pt_index[1], e->pt_index[2], + e->pt_index[3], + e->pt_index[4]); + } + + h->post_process = hvm_pf_xen_postprocess; +} + +char * hvm_vlapic_icr_dest_shorthand_name[4] = { + "dest_field", "self", "all-inc", "all-exc" +}; + +void clear_vlapic(struct vlapic_struct *vla) { + vla->outstanding_ipis = 0; + vla->first_ipi_tsc = 0; + 
vla->ipi_virq_injected = 0; +} + +void hvm_vlapic_icr_handler(struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + union { + unsigned long val; + struct { + unsigned vec:8, + delivery_mode:3, + dest_mode:1, + delivery_status:1, + _res1:1, + level:1, + trigger:1, + _res2:2, + dest_shorthand:2; + }; + } icr = { .val = e->data }; + + if(e->mmio_is_write) { + if(opt.dump_all || opt.dump_cooked) { + printf(" vlapic icr vec %d %s\n", + icr.vec, + hvm_vlapic_icr_dest_shorthand_name[icr.dest_shorthand]); + } + + if(icr.dest_shorthand == 3 && icr.vec == 0xd1) + { + struct vcpu_data *ov, *v = h->v; + struct vlapic_struct *vla; + struct domain_data *d = v->d; + int i; + + for(i=0; ivcpu[i]; + if(!ov || ov == v) + continue; + + if(ov->runstate.state == RUNSTATE_LOST) { + if(opt.dump_all) + fprintf(warn, "%s: v%d in state RUNSTATE_LOST, not counting ipi\n", + __func__, ov->vid); + continue; + } + + vla = &ov->vlapic; + + if(!vla->first_ipi_tsc) + { + struct record_info *ri; + ri = &v->p->ri; + vla->first_ipi_tsc = ri->tsc; + } + + vla->outstanding_ipis++; + + if((opt.dump_all || opt.dump_cooked) && + (ov->runstate.state != RUNSTATE_RUNNING)) + printf(" v%d in state %s (outstanding ipis %d)\n", + ov->vid, runstate_name[ov->runstate.state], + vla->outstanding_ipis); + } + } + } else { + /* Read */ + if(opt.dump_all || opt.dump_cooked) { + printf(" vlapic icr status %s\n", + icr.delivery_status?"pending":"idle"); + } + } + +} + +void hvm_vlapic_eoi_handler(struct hvm_data *h) { + if(opt.dump_all || opt.dump_cooked) + printf(" vlapic eoi\n"); +} + +void hvm_vlapic_handler(struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + + switch(e->gpa) { + case 0xfee00300: + hvm_vlapic_icr_handler(h); + break; + case 0xfee000b0: + hvm_vlapic_eoi_handler(h); + break; + } + +} + + +#define HVM_MMIO_ASSIST_WRITE 0x200 +void hvm_mmio_assist_process(struct record_info *ri, struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + + union { + 
struct { + unsigned long gpa; + unsigned long data; + } x32; + struct { + unsigned long long gpa; + unsigned long data; + } x64; + } *r = (typeof(r))h->d; + + union { + unsigned event; + struct { + unsigned minor:8, + x64:1, + write:2; + }; + } mevt = { .event = ri->event }; + + if(mevt.x64) { + e->gpa = r->x64.gpa; + e->data = r->x64.data; + if(ri->extra_words*(sizeof(unsigned long))==sizeof(r->x64)) + e->mmio_data_valid=1; + } else { + e->gpa = r->x32.gpa; + e->data = r->x32.data; + if(ri->extra_words*(sizeof(unsigned long))==sizeof(r->x32)) + e->mmio_data_valid=1; + } + + e->mmio_is_write = mevt.write; + + if(opt.dump_all) + { + if(e->mmio_data_valid) + printf("]%s mmio_assist %c gpa %llx data %lx\n", h->dump_header, + mevt.write?'w':'r', e->gpa, e->data); + else + printf("]%s mmio_assist %c gpa %llx (no data)\n", h->dump_header, + mevt.write?'w':'r', e->gpa); + } + + if((e->gpa & 0xfffff000) == 0xfee00000) + hvm_vlapic_handler(h); +} + +void hvm_inj_virq_process(struct record_info *ri, struct hvm_data *h) { + struct { + int vector, fake; + } *r = (typeof(r))h->d; + + if(opt.dump_cooked | opt.dump_all) { + printf(" %3lu.%09lu %s inj_virq vec %u %s\n", + ri->t.s, ri->t.ns, pcpu_string(ri->cpu), + r->vector, r->fake?"fake":"real"); + } + + if(opt.summary_info) + { + int vector = r->vector; + + if(vector >= GUEST_INTERRUPT_MAX) + vector = GUEST_INTERRUPT_MAX; + h->summary.guest_interrupt[vector].count++; + + if(opt.interval.output == INTERVAL_DOMAIN_GUEST_INTERRUPT) + interval_domain_guest_interrupt(h, vector); + } + + /* If we're waking, make this the wake vector */ + if(r->vector < GUEST_INTERRUPT_MAX ) { + int vector = r->vector; + if ( h->w2h.waking && h->w2h.vector == 0 ) { + if(h->summary.guest_interrupt[vector].start_tsc) { + fprintf(warn, "Strange, d%dv%d waking && wake_vector 0 but vec %d start_tsc %lld!\n", + h->v->d->did, h->v->vid, + vector, + h->summary.guest_interrupt[vector].start_tsc); + exit(1); + } + if(h->w2h.interrupts) + fprintf(warn, "Strange, 
waking && wake_vector 0 but interrupts_this_wait_to_halt %d!\n", + h->w2h.interrupts); + + if(opt.dump_all) + printf(" [w2h] d%dv%d Setting wake_vector %d\n", + h->v->d->did, h->v->vid, vector); + + h->w2h.vector = vector; + h->summary.guest_interrupt[vector].is_wake = 1; + } + + if( h->summary.guest_interrupt[vector].start_tsc == 0 ) { + /* Note that we want start_tsc set at the next vmentry */ + h->summary.guest_interrupt[vector].start_tsc = 1; + h->w2h.interrupts_wanting_tsc++; + h->w2h.interrupts++; + + if(opt.dump_all) + printf(" [w2h] d%dv%d Starting vec %d\n", + h->v->d->did, h->v->vid, vector); + } + } + + + /* Vista bug */ + if(r->vector == 0xd1 && h->v->vlapic.outstanding_ipis) + h->v->vlapic.ipi_virq_injected = 1; +} + +/* I/O Handling */ +struct io_address { + struct io_address *next; + unsigned long pa; + struct event_cycle_summary summary[2]; +}; + +void update_io_address(struct io_address ** list, unsigned long pa, int dir, + tsc_t arc_cycles) { + struct io_address *p, *q=NULL; + + /* Keep list in order */ + for(p=*list; p && (p->pa != pa) && (p->pa < pa); q=p, p=p->next); + + /* If we didn't find it, make a new element. 
*/ + if(!p || (p->pa != pa)) { + if((p=malloc(sizeof(*p)))==NULL) { + fprintf(stderr, "malloc() failed.\n"); + exit(1); + } + + bzero(p, sizeof(*p)); + + p->pa=pa; + + /* If we stopped in the middle or at the end, add it in */ + if(q) { + p->next=q->next; + q->next=p; + } else { + /* Otherwise, we stopped after the first element; put it at the beginning */ + p->next = *list; + *list = p; + } + } + update_summary(&p->summary[dir], arc_cycles); +} + +void hvm_io_address_summary(struct io_address *list, char * s) { + if(!list) + return; + + printf("%s\n", s); + + for(; list; list=list->next) { + PRINT_SUMMARY(list->summary[0], "%8lx:[r] ", list->pa); + PRINT_SUMMARY(list->summary[1], "%8lx:[w] ", list->pa); + } +} + +void hvm_io_write_postprocess(struct hvm_data *h) +{ + if(opt.dump_cooked) + { + printf(" %s io_write port %x size %u val %lx\n", + h->dump_header, h->inflight.io.port, + h->inflight.io.size, + h->inflight.io.val); + } + if(opt.with_pio_enumeration) + update_io_address(&h->summary.pio, h->inflight.io.port, 1, h->arc_cycles); +} + +void hvm_io_write_process(struct hvm_data *h) +{ + struct { + unsigned long port; + int size; + unsigned long val; + } *r = (typeof(r))h->d; + + unsigned long long mask = ((1ULL<<(r->size<<3))-1); + + h->inflight.io.port = r->port; + h->inflight.io.size = r->size; + h->inflight.io.val = r->val & mask; + + if(opt.dump_all) + { + printf(" %s io_write port %lx size %d val %lx\n", + h->dump_header, + r->port, r->size, r->val); + } + + h->post_process = hvm_io_write_postprocess; +} + +void hvm_io_read_postprocess(struct hvm_data *h) +{ + if(opt.dump_cooked) + { + printf(" %s io_read port %x size %u val %lx\n", + h->dump_header, h->inflight.io.port, + h->inflight.io.size, + h->inflight.io.val); + } + if(opt.with_pio_enumeration) + update_io_address(&h->summary.pio, h->inflight.io.port, 0, h->arc_cycles); + if(opt.scatterplot_io && h->inflight.io.port == opt.scatterplot_io_port) + scatterplot_vs_time(h->exit_tsc, P.now - h->exit_tsc); 
+} + +void hvm_io_assist_process(struct hvm_data *h) +{ + struct { + unsigned long val; + } *r = (typeof(r))h->d; + + int size = h->inflight.io.size; + unsigned long long mask = ((1ULL<<(size<<3))-1); + + h->inflight.io.val = r->val & mask; + + if(opt.dump_all) + { + printf(" %s io_assist val %lx\n", + h->dump_header, + r->val); + } + + + if(!h->post_process) + fprintf(warn, "Strange, io_assist but post_process not set!\n"); +} + +void hvm_io_read_process(struct hvm_data *h) +{ + struct { + unsigned long port; + int size; + } *r = (typeof(r))h->d; + + h->inflight.io.port = r->port; + h->inflight.io.size = r->size; + h->inflight.io.val = 0; + + if(opt.dump_all) + { + printf(" %s io_read port %lx size %d\n", + h->dump_header, + r->port, r->size); + } + + h->post_process = hvm_io_read_postprocess; +} + +/* cr_write */ +/* CR3 list */ +void cr3_switch(unsigned long long val, struct hvm_data *h) { + struct vcpu_data *v = h->v; + /* Really only need absolute tsc here. Later change to global time. 
*/ + unsigned long long now = P.now; + unsigned long long gmfn = val >> 12; + + if(opt.with_cr3_enumeration) { + if(v->cr3.data) { + struct cr3_value_struct *cur = v->cr3.data; + unsigned long long cycles = now - v->cr3.start_time; + + if(opt.summary_info) + update_cycles(&cur->total_time, cycles); + + cur->last_time = now; + } + + if(gmfn) { + struct cr3_value_struct *p, **last=&v->d->cr3_value_head; + + /* Always add to tail, so that we get consistent interval + ouptut as the number of cr3s grow */ + for(p=*last; p; last = (&p->next), p=p->next) + if(p->gmfn == gmfn) + break; + + if(!p) { + if((p=malloc(sizeof(*p)))==NULL) { + fprintf(stderr, "malloc() failed.\n"); + exit(1); + } + + bzero(p, sizeof(*p)); + + p->gmfn = gmfn; + p->cr3_id = P.cr3.id; + p->first_time = now; + + p->next=*last; + *last=p; + + p->gnext = NULL; + if(P.cr3.head) + *P.cr3.tail = p; + else + P.cr3.head = p; + P.cr3.tail = &p->gnext; + + P.cr3.id++; + + /* Add to the interval list if appropriate */ + if(v->d->did != DEFAULT_DOMAIN) + interval_cr3_value_check(p); + } + + if(p->prealloc_unpin.now) { + fprintf(warn, "Re-promoting previously unpinned cr3 %llx!\n", + p->gmfn); + p->prealloc_unpin.now = 0; + h->inflight.cr_write.repromote = 1; + } + + /* Accounting for new toplevel */ + v->cr3.start_time = now; + p->switch_count++; + if(p->destroy.callback) + p->destroy.switch_count++; + v->cr3.data = p; + } else { + v->cr3.data = NULL; + } + + if (opt.scatterplot_cr3_switch) { + scatterplot_vs_time(h->exit_tsc, + v->cr3.data ? 
(v->cr3.data->cr3_id) : 0); + } + } else { + if (opt.scatterplot_cr3_switch) + scatterplot_vs_time(h->exit_tsc, gmfn); + } + + v->cr3.val = val; +}; + +void cr3_prealloc_unpin(struct vcpu_data *v, unsigned long long gmfn) { + struct cr3_value_struct *cr3; + + /* Look for it in the list */ + for(cr3 = v->d->cr3_value_head; cr3; cr3=cr3->next) + if(cr3->gmfn == gmfn) + break; + + if(!cr3) + return; + + if(cr3->prealloc_unpin.now) + fprintf(warn, "Strange, gmfn %llx multiple unpins w/o access!\n", + gmfn); + + cr3->prealloc_unpin.now = 1; + cr3->prealloc_unpin.count++; + + if(opt.dump_all) + printf(" cr3 %llx unpinned %d times\n", + gmfn, cr3->prealloc_unpin.count); +} + +void cr3_dump_list(struct cr3_value_struct *head){ + struct cr3_value_struct *p; + struct cr3_value_struct **qsort_array; + int i, N=0; + + int cr3_compare(const void *_a, const void *_b) { + struct cr3_value_struct *a=*(typeof(&a))_a; + struct cr3_value_struct *b=*(typeof(&a))_b; + + if(a->total_time.cycles < b->total_time.cycles) + return 1; + else if(b->total_time.cycles == a->total_time.cycles) { + if(a->total_time.count < b->total_time.count) + return 1; + else if(a->total_time.count == b->total_time.count) + return 0; + else + return -1; + } else + return -1; + } + + if(!head) + return; + + /* Count the number of elements */ + for(p=head; p; p=p->next) + N++; + + if(!N) + return; + + /* Alloc a struct of the right size */ + qsort_array = malloc(N * sizeof(struct eip_list_struct *)); + + /* Point the array into it */ + for(i=0, p=head; p; p=p->next, i++) + qsort_array[i]=p; + + /* Sort the array by time */ + qsort(qsort_array, N, sizeof(struct eip_list_struct *), + cr3_compare); + + /* WARNING: don't use N after this point unless you copy this variable */ +#if 0 + if(opt.summary_eip_limit && opt.summary_eip_limit < N) + N=opt.summary_eip_limit; +#endif + + /* Now print the results */ + printf(" cr3 values:\n"); + for(i=0; ifirst_time, &first); + abs_cycles_to_time(p->last_time, &last); + + + 
snprintf(desc, 30, " %8llx (id %d)", p->gmfn, p->cr3_id); + print_cycle_summary(&p->total_time, desc); + snprintf(desc, 30, " guest"); + print_cycle_percent_summary(&p->guest_time, p->run_time, desc); + snprintf(desc, 30, " hv "); + print_cycle_percent_summary(&p->hv_time, p->run_time, desc); + + hvm_short_summary(&p->hvm, p->run_time, " + "); +#if 0 + printf(" Seen: %4lu.%09lu-%4lu.%09lu switch %d flush %d\n", + first.s, first.ns, + last.s, last.ns, + p->switch_count, p->flush_count); + if(p->destroy.callback) + printf(" destroy: flush %u switch %u fixup %u emulate %u\n", + p->destroy.flush_count, + p->destroy.switch_count, + p->destroy.fixup_user, + p->destroy.emulate_corr_user); +#endif + } + + free(qsort_array); +} + +void hvm_cr3_write_summary(struct hvm_data *h) { + int j; + + for(j=0; jsummary.cr3_write_resyncs[j], + " *[%3d] ", j); + PRINT_SUMMARY(h->summary.cr3_write_resyncs[j], + " *[MAX] "); +} + +void hvm_cr_write_summary(struct hvm_data *h) +{ + int i; + + for(i=0; isummary.cr_write[i], + " cr%d ", i); + switch(i) { + case 3: + hvm_cr3_write_summary(h); + break; + } + } +} + +void hvm_cr_write_postprocess(struct hvm_data *h) +{ + if(h->inflight.cr_write.cr == 3) { + struct vcpu_data *v = h->v; + unsigned long long new_val = h->inflight.cr_write.val; + unsigned long long oval; + int flush=0; + + if(v->cr3.val) { + oval = v->cr3.val; + + if(new_val == oval) { + if(v->cr3.data) { + v->cr3.data->flush_count++; + if(v->cr3.data->destroy.callback) + v->cr3.data->destroy.flush_count++; + } + flush=1; + } + + if(opt.dump_cooked) + { + printf(" %s cr_write cr3 val %llx oval %llx (%d resyncs) %s\n", + h->dump_header, + new_val, + oval, + h->resyncs, + flush?"flush":""); + } + } else { + if(opt.dump_cooked) + { + printf(" %s cr_write cr3 val %llx (%d resyncs)\n", + h->dump_header, + h->inflight.cr_write.val, + h->resyncs); + } + } + + if(opt.summary_info) { + int resyncs = h->resyncs; + + if(resyncs > RESYNCS_MAX) + resyncs = RESYNCS_MAX; + + 
update_summary(&h->summary.cr3_write_resyncs[resyncs], + h->arc_cycles); + + update_summary(&h->summary.cr_write[3], + h->arc_cycles); + + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_CR3); + } + + if(!flush) + cr3_switch(new_val, h); + } else { + if(opt.dump_cooked) + { + printf(" %s cr_write cr%d val %llx\n", + h->dump_header, + h->inflight.cr_write.cr, + h->inflight.cr_write.val); + } + + if(opt.summary_info) + { + if(h->inflight.cr_write.cr < CR_MAX) + update_summary(&h->summary.cr_write[h->inflight.cr_write.cr], + h->arc_cycles); + + } + } + + /* Set summary handler */ + /* FIXME - deal with cr_read_summary */ + if(h->exit_reason < h->exit_reason_max) + { + if ( opt.svm_mode ) { + /* For svm, only need a summary for cr3 */ + if ( h->exit_reason == VMEXIT_CR3_WRITE ) + hvm_set_summary_handler(h, hvm_cr3_write_summary); + } else { + hvm_set_summary_handler(h, hvm_cr3_write_summary); + } + } + +} + +void hvm_cr_write_process(struct hvm_data *h) +{ + struct { + /* FIXME -- 64-bit values */ + unsigned cr; + unsigned val; + } *r = (typeof(r))h->d; + + h->inflight.cr_write.cr = r->cr; + h->inflight.cr_write.val = r->val; + + h->post_process = hvm_cr_write_postprocess; + + if(opt.dump_all) + { + if(r->cr == 3 && h->v->cr3.val) { + printf("]%s cr_write cr3 val %llx oval %llx %s\n", + h->dump_header, + (unsigned long long)r->val, + h->v->cr3.val, + (h->v->cr3.val == r->val)?"flush":"switch"); + } else { + printf(" %s cr_write cr%d val %llx\n", + h->dump_header, + r->cr, (unsigned long long)r->val); + + } + } + +} + +/* msr_write */ +void hvm_msr_write_summary(struct hvm_data *h) +{ +} + +void hvm_msr_write_postprocess(struct hvm_data *h) +{ + if(opt.dump_cooked) + { + printf(" %s msr_write msr %d val %llx\n", + h->dump_header, + h->inflight.msr.addr, + h->inflight.msr.val); + } + + if(opt.summary_info) { + } + + /* Set summary handler */ + hvm_set_summary_handler(h, hvm_msr_write_summary); +} + +void hvm_msr_write_process(struct record_info *ri, struct hvm_data *h) +{ 
+ struct { + unsigned long long val; + unsigned long addr; + } *r = (typeof(r))h->d; + + if(ri->extra_words != (sizeof(*r)/sizeof(unsigned long))) + { + fprintf(warn, "FATAL: msr_write extra_words %d, expected %d!\n", + ri->extra_words, sizeof(*r)/sizeof(unsigned long)); + exit(1); + } + + h->inflight.msr.addr = r->addr; + h->inflight.msr.val = r->val; + + if(opt.dump_all) + { + printf(" %s msr_write addr %lx val %llx\n", + h->dump_header, + r->addr, r->val); + } + + h->post_process = hvm_msr_write_postprocess; +} + +/* msr_read */ +void hvm_msr_read_summary(struct hvm_data *h) +{ +} + +void hvm_msr_read_postprocess(struct hvm_data *h) +{ + if(opt.dump_cooked) + { + printf(" %s msr_read msr %d val %llx\n", + h->dump_header, + h->inflight.msr.addr, + h->inflight.msr.val); + } + + if(opt.summary_info) { + } + + /* Set summary handler */ + hvm_set_summary_handler(h, hvm_msr_read_summary); +} + +void hvm_msr_read_process(struct record_info *ri, struct hvm_data *h) +{ + struct { + unsigned long long val; + unsigned long addr; + } *r = (typeof(r))h->d; + + if(ri->extra_words != (sizeof(*r)/sizeof(unsigned long))) + { + fprintf(warn, "FATAL: msr_read extra_words %d, expected %d!\n", + ri->extra_words, sizeof(*r)/sizeof(unsigned long)); + exit(1); + } + + h->inflight.msr.addr = r->addr; + h->inflight.msr.val = r->val; + + if(opt.dump_all) + { + printf(" %s msr_read addr %lx val %llx\n", + h->dump_header, + r->addr, r->val); + } + + h->post_process = hvm_msr_read_postprocess; +} + +void hvm_vmcall_process(struct record_info *ri, struct hvm_data *h) +{ + struct { + unsigned long eax; + } *r = (typeof(r))h->d; + + if(opt.dump_all) { + if(r->eax < HYPERCALL_MAX) + printf(" %s vmcall %2lx (%s)\n", + h->dump_header, + r->eax, + hypercall_name[r->eax]); + else + printf(" %s vmcall %2lx\n", + h->dump_header, + r->eax); + } + +#if 0 + if(opt.summary) + hvm_set_summary_handler(h, hvm_vmcall_summary); +#endif +} + +void hvm_inj_exc_process(struct record_info *ri, struct hvm_data *h) 
+{ + struct { + unsigned vec, ec; + } *r = (typeof(r))h->d; + + if ( opt.dump_cooked || opt.dump_all ) + { + if(r->vec < HVM_TRAP_MAX) + printf(" %3lu.%09lu %s inj_exc trap %s ec %x\n", + ri->t.s, ri->t.ns, pcpu_string(ri->cpu), + hvm_trap_name[r->vec], r->ec); + else + printf(" %3lu.%09lu %s inj_exc trap %u ec %x\n", + ri->t.s, ri->t.ns, pcpu_string(ri->cpu), + r->vec, r->ec); + } + +} + +void hvm_intr_summary(struct hvm_data *h) +{ + int i; + + for(i=0; isummary.extint[i]) + { + if(hvm_extint_vector_name[i]) + printf(" %10s(%3d): %d\n", + hvm_extint_vector_name[i], + i, + h->summary.extint[i]); + else + printf(" [%3d]: %d\n", + i, + h->summary.extint[i]); + } + if(h->summary.extint[EXTERNAL_INTERRUPT_MAX]) + printf(" Other: : %d\n", + h->summary.extint[EXTERNAL_INTERRUPT_MAX]); +} + + +void hvm_intr_process(struct hvm_data *h) +{ + unsigned vec = *(unsigned *)h->d; + + if( (h->rip >> ADDR_SPACE_BITS) != 00 + && (h->rip >> ADDR_SPACE_BITS) != ((0ULL-1)>> ADDR_SPACE_BITS) ) { + fprintf(stderr, "Unexpected rip %llx (shift %llx)\n", + h->rip, + h->rip >> ADDR_SPACE_BITS); + exit(1); + } + + if ( opt.dump_cooked || opt.dump_all ) + { + if ( vec < EXTERNAL_INTERRUPT_MAX && + hvm_extint_vector_name[vec] ) + printf(" %s intr vec %s(%x)\n", + h->dump_header, + hvm_extint_vector_name[vec], + vec); + else + printf(" %s intr vec %x\n", + h->dump_header, vec); + } + + if(opt.scatterplot_interrupt_eip + && vec == opt.scatterplot_interrupt_vector) + { + /* Truncate to 40 bits */ + unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1); + + /* Want absolute tsc to global tsc */ + scatterplot_vs_time(h->exit_tsc, rip); + } + + if(opt.histogram_interrupt_eip + && vec == opt.histogram_interrupt_vector) + { + /* Truncate to 40 bits */ + unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1); + unsigned index = rip / opt.histogram_interrupt_increment; + + h->summary.extint_histogram[index]++; + } + + if(opt.summary_info) { + if(opt.summary) + 
hvm_set_summary_handler(h, hvm_intr_summary); + + if(vec < EXTERNAL_INTERRUPT_MAX) + h->summary.extint[vec]++; + else + h->summary.extint[EXTERNAL_INTERRUPT_MAX]++; + } +} + +void hvm_pf_inject_process(struct record_info *ri, struct hvm_data *h) +{ + struct { + unsigned cr2, ec; + } *r = (typeof(r))h->d; + + if ( opt.dump_cooked || opt.dump_all ) + { + printf(" %3lu.%09lu %s pf_inject guest_cr2 %x guest_ec %x\n", + ri->t.s, ri->t.ns, pcpu_string(ri->cpu), + r->cr2, r->ec); + } +} + +void hvm_exception_nmi_generic_postprocess(struct hvm_data *h) +{ + static int warned = 0; + if(opt.dump_cooked) + { + printf(" %s exnmi no_handler\n", + h->dump_header); + } + + if(opt.summary_info) + update_summary(&h->summary.pf_xen[PF_XEN_NO_HANDLER], + h->arc_cycles); + + if(!warned) { + warned++; + fprintf(warn, "Strange, no handler for exnmi!\n"); + } +} + +void hvm_generic_postprocess(struct hvm_data *h) +{ + if(opt.summary_info) { + /* HLT checked at hvm_vmexit_close() */ + } + + if(opt.dump_cooked) + { + int i, evt = h->event_handler; + + if(evt < HVM_EVENT_HANDLER_MAX) + printf(" %s %s [", + h->dump_header, + hvm_event_handler_name[evt]); + else + printf(" %s %d [", + h->dump_header, + evt); + + for(i=0; i<4; i++) { + printf(" %x", h->inflight.generic.d[i]); + } + + printf(" ]\n"); + } +} + +void hvm_generic_dump(struct record_info *ri, char * prefix) +{ + struct { + unsigned vcpu:16, domain:16; + unsigned d[4]; + } *cr = (typeof(cr))ri->d; + + char *evt_string, evt_number[256]; + unsigned *d; + int i, evt = ri->event - TRC_HVM_HANDLER; + + if(evt < HVM_EVENT_HANDLER_MAX) + { + evt_string = hvm_event_handler_name[evt]; + } + else + { + snprintf(evt_number, 256, "hvm_handler %d", evt); + evt_string = evt_number; + } + + d = ri->d; + printf("%s%s %s [", + prefix, + ri->dump_header, + evt_string); + + for(i=0; i<4; i++) + { + printf(" %x", d[i]); + } + + printf(" ]\n"); +} + +void hvm_handler_process(struct record_info *ri, struct hvm_data *h) { + /* Wait for first vmexit to 
initialize */ + if(!h->init) + { + if(opt.dump_all) + hvm_generic_dump(ri,"!"); + return; + } + + h->d = ri->d; + + /* Handle things that don't need a vmexit */ + switch(ri->event) { + default: + goto needs_vmexit; + /* Records about changing guest state */ + case TRC_HVM_PF_INJECT: + hvm_pf_inject_process(ri, h); + break; + case TRC_HVM_REINJ_VIRQ: + if ( opt.dump_cooked || opt.dump_all ) + { + printf(" %3lu.%09lu %s inj_virq vec %u\n", + ri->t.s, ri->t.ns, pcpu_string(ri->cpu), + *(unsigned*)h->d); + } + break; + case TRC_HVM_INJ_EXC: + hvm_inj_exc_process(ri, h); + break; + case TRC_HVM_INJ_VIRQ: + hvm_inj_virq_process(ri, h); + break; + case TRC_HVM_OP_DESTROY_PROC: + if(h->v->cr3.data) { + struct cr3_value_struct *cur = h->v->cr3.data; + if(cur->destroy.callback) + fprintf(warn, "Strange, double callback for cr3 gmfn %llx!\n", + cur->gmfn); + cur->destroy.callback = 1; + } else if(opt.with_cr3_enumeration) { + fprintf(warn, "Warning: destroy_proc: don't know current cr3\n"); + } + if ( opt.dump_cooked || opt.dump_all ) + { + printf(" %3lu.%09lu %s destroy_proc cur_cr3 %llx\n", + ri->t.s, ri->t.ns, pcpu_string(ri->cpu), h->v->cr3.val); + } + break; + } + + return; + +needs_vmexit: + /* Wait for the next vmexit */ + if(!h->vmexit_valid) + { + if(opt.dump_all) + hvm_generic_dump(ri,"!"); + return; + } + + /* Keep generic "event handler" info */ + h->event_handler = ri->event - TRC_HVM_HANDLER; + + switch(ri->event) { + /* Records adding to the vmexit reason */ + case TRC_HVM_INTR: + hvm_intr_process(h); + break; + case TRC_HVM_PF_XEN: + case TRC_HVM_PF_XEN64: + hvm_pf_xen_process(ri, h); + break; + case TRC_HVM_IO_READ: + hvm_io_read_process(h); + break; + case TRC_HVM_IO_WRITE: + hvm_io_write_process(h); + break; + case TRC_HVM_IO_ASSIST: + hvm_io_assist_process(h); + break; + case TRC_HVM_MMIO_ASSIST: + case TRC_HVM_MMIO_ASSIST|HVM_MMIO_ASSIST_WRITE: + hvm_mmio_assist_process(ri, h); + break; + case TRC_HVM_CR_WRITE: + hvm_cr_write_process(h); + break; + case 
TRC_HVM_MSR_WRITE: + hvm_msr_write_process(ri, h); + break; + case TRC_HVM_MSR_READ: + hvm_msr_read_process(ri, h); + break; + case TRC_HVM_VMMCALL: + hvm_vmcall_process(ri, h); + break; + case TRC_HVM_CR_READ: + case TRC_HVM_DR_READ: + case TRC_HVM_DR_WRITE: + case TRC_HVM_CPUID: + case TRC_HVM_NMI: + case TRC_HVM_SMI: + case TRC_HVM_HLT: + case TRC_HVM_INVLPG: + case TRC_HVM_MCE: + case TRC_HVM_CLTS: + case TRC_HVM_LMSW: + if ( h->post_process != NULL ) + fprintf(warn, "Strange, h->postprocess already set!\n"); + h->inflight.generic.event = ri->event; + bcopy(h->d, h->inflight.generic.d, sizeof(unsigned int) * 4); + h->post_process = hvm_generic_postprocess; + if(opt.dump_all) + { + hvm_generic_dump(ri, "]"); + } + break; + } +} + +void vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc); +void vcpu_prev_update(struct pcpu_info *p, struct vcpu_data *prev, + tsc_t tsc, int new_runstate); +struct vcpu_data * vcpu_find(int did, int vid); +void lose_vcpu(struct vcpu_data *v, tsc_t tsc); + +int domain_runstate(struct domain_data *d) { + int i; + int runstates[RUNSTATE_MAX]; + int ret=-1; + + if(d->did == DEFAULT_DOMAIN) + return 0; + + for(i=0; imax_vid; i++) + if(d->vcpu[i]) + runstates[d->vcpu[i]->runstate.state]++; + + if(runstates[RUNSTATE_LOST]) + ret=DOMAIN_RUNSTATE_LOST; + else if(runstates[RUNSTATE_RUNNING]) + { + if(runstates[RUNSTATE_RUNNABLE]) + ret=DOMAIN_RUNSTATE_CONCURRENCY_HAZARD; + else if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE]) + ret= DOMAIN_RUNSTATE_PARTIAL_RUN; + else + ret= DOMAIN_RUNSTATE_FULL_RUN; + } + else if(runstates[RUNSTATE_RUNNABLE]) + { + if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE]) + ret= DOMAIN_RUNSTATE_PARTIAL_CONTENTION; + else + ret= DOMAIN_RUNSTATE_FULL_CONTENTION; + } + else if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE]) + { + ret= DOMAIN_RUNSTATE_BLOCKED; + } else { + fprintf(warn, "Strange, no meaningful runstates for d%d!\n", + d->did); + } + +#if 0 + printf(" 
Max vid: %d\n", d->max_vid); + for(i=0; i<=d->max_vid; i++) + if(d->vcpu[i]) + fprintf(warn, " v%d: %s\n", + i, runstate_name[d->vcpu[i]->runstate]); + + for(i=0; i= 0) + return ret; + else + exit(1); +} + +static inline void runstate_update(struct vcpu_data *v, int new_runstate, + tsc_t tsc) +{ + struct domain_data *d = v->d; + if(v->runstate.tsc > 0 && v->runstate.tsc < tsc) { + update_cycles(v->runstates + v->runstate.state, tsc - v->runstate.tsc); + + if(v->runstate.state == RUNSTATE_RUNNING) + update_cycles(&v->d->total_time, tsc - v->runstate.tsc); + + /* How much did dom0 run this buffer? */ + if(v->d->did == 0) { + int i; + for(i=0; iactive) + continue; + start_tsc = (p->volume.buffer_first_tsc > v->runstate.tsc) ? + p->volume.buffer_first_tsc : + v->runstate.tsc; + p->volume.buffer_dom0_runstate_cycles[v->runstate.state] + += tsc - start_tsc; +#if 0 + printf(" - updated p%d dom0_runstate %s to %lld cycles (+%lld)\n", + p->pid, runstate_name[v->runstate.state], + p->volume.buffer_dom0_runstate_cycles[v->runstate.state], + tsc - start_tsc); +#endif + p->volume.buffer_dom0_runstate = new_runstate; + p->volume.buffer_dom0_runstate_tsc = tsc; + } + } + } + v->runstate.state = new_runstate; + v->runstate.tsc = tsc; + + /* Determine the domain runstate */ + if(d->runstate_tsc > 0 && d->runstate_tsc < tsc) + update_cycles(d->runstates + d->runstate, tsc - d->runstate_tsc); + + d->runstate = domain_runstate(d); + + d->runstate_tsc = tsc; +} + +void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h, + struct vcpu_data *v) { + struct { + union { + struct { + unsigned long long rip; + unsigned long exit_reason; + } x64; + struct { + unsigned long eip; + unsigned long exit_reason; + } x32; + }; + } *r; + + if(ri->extra_words != 2 && ri->extra_words != 3) + { + fprintf(warn, "FATAL: vmexit has unexpected extra words %d!\n", + ri->extra_words); + dump_generic(warn, ri); + exit(1); + } + + + r = (typeof(r))ri->d; + + if(!h->init) + init_hvm_data(h, v); + + 
h->vmexit_valid=1; + + if(ri->event == TRC_HVM_VMEXIT64) { + if(v->guest_paging_levels != 4) + { + fprintf(warn, "%s: VMEXIT64, but guest_paging_levels %d. Switching to 4.\n", + __func__, v->guest_paging_levels); + v->guest_paging_levels = 4; + } + if(!is_valid_addr64(r->x64.rip)) + fprintf(warn, "%s: invalid va %llx", + __func__, r->x64.rip); + h->rip = r->x64.rip; + h->exit_reason = r->x64.exit_reason; + } else { + if(v->guest_paging_levels == 4) + { + int new_paging_levels = opt.default_guest_paging_levels; + if(new_paging_levels == 4) + new_paging_levels = 2; /* Wild guess */ + fprintf(warn, "%s: VMEXIT, but guest_paging_levels %d. Switching to %d(default).\n", + __func__, v->guest_paging_levels, new_paging_levels); + v->guest_paging_levels = new_paging_levels; + } + h->rip = r->x32.eip; + h->exit_reason = r->x32.exit_reason; + } + + if(h->exit_reason > h->exit_reason_max) + { + fprintf(warn, "h->exit_reason %lx > exit_reason_max %lx!\n", + (unsigned long)h->exit_reason, + (unsigned long)h->exit_reason_max); + exit(1); + } + + if(opt.dump_all) { + if ( h->exit_reason < h->exit_reason_max ) + printf("]%s vmexit exit_reason %s eip %llx%s\n", + ri->dump_header, + h->exit_reason_name[h->exit_reason], + h->rip, + find_symbol(h->rip)); + else + printf("]%s vmexit exit_reason %x eip %llx%s\n", + ri->dump_header, + h->exit_reason, + h->rip, + find_symbol(h->rip)); + } + + if(h->v->cr3.data && h->entry_tsc) { + update_cycles(&h->v->cr3.data->guest_time, + ri->tsc - h->entry_tsc); + h->v->cr3.data->run_time += (ri->tsc - h->entry_tsc); + } + + h->exit_tsc = ri->tsc; + h->entry_tsc = 0; + h->resyncs = 0; + h->prealloc_unpin = 0; + h->short_summary_done = 0; + + if(!opt.svm_mode && h->exit_reason == EXIT_REASON_EXCEPTION_NMI) + { + bzero(&h->inflight.pf_xen, sizeof(h->inflight.pf_xen)); + h->post_process = hvm_exception_nmi_generic_postprocess; + } else { + h->post_process = NULL; + } + + { + struct time_struct t; + + abs_cycles_to_time(h->exit_tsc, &t); + + if(t.time) + { 
+ snprintf(h->dump_header, + DUMP_HEADER_MAX, + "%3lu.%09lu %s", t.s, t.ns, pcpu_string(ri->cpu)); + } + else + { + snprintf(h->dump_header, + DUMP_HEADER_MAX, + " %s", pcpu_string(ri->cpu)); + } + } +} + +void hvm_close_vmexit(struct hvm_data *h, tsc_t tsc) { + + if(h->exit_tsc) { + if(h->exit_tsc > tsc) + h->arc_cycles = 0; + else { + h->arc_cycles = tsc - h->exit_tsc; + + if(opt.summary_info) { + update_summary(&h->summary.exit_reason[h->exit_reason], + h->arc_cycles); + h->summary_info = 1; + } + } + } + + if(h->post_process) + (h->post_process)(h); + + if(h->arc_cycles) { + if(opt.summary_info && !h->short_summary_done) { + switch(h->event_handler) { + case HVM_EVENT_HANDLER_VMCALL: + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_VMCALL); + break; + case HVM_EVENT_HANDLER_INTR: + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_INTERRUPT); + break; + case HVM_EVENT_HANDLER_HLT: + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_HLT); + break; + default: + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_OTHER); + break; + } + } + + + if(h->v->cr3.data) { + h->v->cr3.data->run_time += h->arc_cycles; + + if(opt.summary_info) + update_cycles(&h->v->cr3.data->hv_time, + h->arc_cycles); + } + } + + h->exit_tsc = 0; + h->vmexit_valid = 0; + h->post_process = NULL; + +} + +void hvm_vmentry_process(struct record_info *ri, struct hvm_data *h) { + if(!h->init) + { + if(opt.dump_all) + printf("!%s vmentry\n", + ri->dump_header); + return; + } + + /* Vista bug + * This has to be done here because irqs are injected on the path out + * to vmexit. 
*/ + if(h->v->vlapic.ipi_virq_injected) + { + struct vcpu_data *v = h->v; + struct vlapic_struct *vla = &v->vlapic; + unsigned long long lat=0; + + if(ri->tsc > vla->first_ipi_tsc) + lat = ri->tsc - vla->first_ipi_tsc; + else + fprintf(stderr, "Strange, first_ipi_tsc %lld > ri->tsc %lld!\n", + vla->first_ipi_tsc, ri->tsc); + + if((opt.dump_all || opt.dump_cooked) && vla->outstanding_ipis > 1) + { + struct time_struct t; + cycles_to_time(lat, &t); + printf(" d%dv%d received %d ipis, latency %lld (%lu.%09lu s)\n", + v->d->did, v->vid, vla->outstanding_ipis, lat, + t.s, t.ns); + } + + if(opt.summary_info) + { + update_summary(&h->summary.ipi_latency, lat); + h->summary.ipi_count[vla->outstanding_ipis]++; + + } + + if(opt.dump_cooked || opt.dump_ipi_latency) { + struct time_struct t; + abs_cycles_to_time(ri->tsc, &t); + printf("Ipis: %d latency: %lld (time %lu.%09lu s)\n", + vla->outstanding_ipis, lat, t.s, t.ns); + } + + clear_vlapic(vla); + } + + if(h->w2h.waking && opt.dump_all) + printf(" [w2h] d%dv%d Finishing waking\n", + h->v->d->did, h->v->vid); + + h->w2h.waking = 0; + + if ( h->w2h.interrupts_wanting_tsc ) { + int i; + for(i=0; isummary.guest_interrupt[i].start_tsc == 1 ) + { + if(opt.dump_all) + printf(" [w2h] d%dv%d Setting vec %d tsc to %lld\n", + h->v->d->did, h->v->vid, i, ri->tsc); + h->summary.guest_interrupt[i].start_tsc = ri->tsc; + h->w2h.interrupts_wanting_tsc--; + if ( h->w2h.interrupts_wanting_tsc == 0 ) + break; + } + } + } + + if(!h->vmexit_valid) + { + if(opt.dump_all) + printf("!%s vmentry\n", + ri->dump_header); + return; + } + + if(opt.dump_all) { + printf("]%s vmentry\n", + ri->dump_header); + } + + hvm_close_vmexit(h, ri->tsc); + h->entry_tsc = ri->tsc; +} + +void hvm_process(struct pcpu_info *p) +{ + struct record_info *ri = &p->ri; + struct vcpu_data *v = p->current; + struct hvm_data *h = &v->hvm; + + if(v->d->did == IDLE_DOMAIN) { + fprintf(warn, "%s: Strange, hvm record for idle domain!\n", + __func__); + return; + } + + 
if(!v->data_type) + v->data_type = VCPU_DATA_HVM; + else if(v->data_type != VCPU_DATA_HVM) { + fprintf(warn, "Non-hvm data type %d for d%dv%d on proc %d, skipping\n", + v->data_type, v->d->did, v->vid, p->pid); + dump_generic(warn, ri); + return; + } + + + if(ri->evt.sub == 2) + { + UPDATE_VOLUME(p, hvm[HVM_VOL_HANDLER], ri->size); + hvm_handler_process(ri, h); + } + else + { + switch(ri->event) { + /* HVM */ + case TRC_HVM_VMEXIT: + case TRC_HVM_VMEXIT64: + UPDATE_VOLUME(p, hvm[HVM_VOL_VMEXIT], ri->size); + hvm_vmexit_process(ri, h, v); + break; + case TRC_HVM_VMENTRY: + UPDATE_VOLUME(p, hvm[HVM_VOL_VMENTRY], ri->size); + hvm_vmentry_process(ri, &p->current->hvm); + break; + default: + fprintf(warn, "Unknown hvm event: %x", ri->event); + } + } +} + +void hvm_summary(struct hvm_data *h) { + int i; + + if(!h->summary_info) + return; + + printf("Exit reasons:\n"); + for(i=0; iexit_reason_max; i++) { + PRINT_SUMMARY(h->summary.exit_reason[i], + " %-20s ", h->exit_reason_name[i]); + if(h->exit_reason_summary_handler[i]) + h->exit_reason_summary_handler[i](h); + } + + printf("Guest interrupt counts:\n"); + for(i=0; isummary.guest_interrupt[i].count) { + int j; + printf(" [%3d] %d\n", + i, h->summary.guest_interrupt[i].count); + for(j=1; jsummary.guest_interrupt[i].runtime+j, desc); + } + } + if(h->summary.guest_interrupt[i].count) + printf(" [%d+] %d\n", + i, h->summary.guest_interrupt[i].count); + + if(opt.histogram_interrupt_eip) + { + unsigned max = ((1ULL<summary.extint_histogram[i]) + { + printf("[%llx-%llx]: %d\n", + opt.histogram_interrupt_increment * i, + (opt.histogram_interrupt_increment * (i+1)) - 1, + h->summary.extint_histogram[i]); + } + } + + PRINT_SUMMARY(h->summary.ipi_latency, + "IPI latency \n"); + for(i=0; i<256; i++) + if(h->summary.ipi_count[i]) + printf(" [%3d] %10d\n", + i, h->summary.ipi_count[i]); + hvm_io_address_summary(h->summary.pio, "IO address summary:"); +} + +/* ---- Shadow records ---- */ +union shadow_event +{ + unsigned event; + 
struct { + unsigned minor:8, + paging_levels:4; + }; +}; + +/* WARNING - not thread safe */ +#define FLAGSTRING(_name, _char) \ + if(e->flag_ ## _name) \ + flagstring[i] = _char; \ + i++; + +char * flag_string(struct pf_xen_extra *e) +{ + static char flagstring[32]; + int i=0; + + for(i=0; i<32; i++) + flagstring[i]='-'; + + i=0; + + if(e->flag_set_ad) + flagstring[i]='d'; + else if(e->flag_set_a) + flagstring[i]='a'; + i++; + + FLAGSTRING(shadow_l1_get_ref, 'g'); + FLAGSTRING(shadow_l1_put_ref, 'p'); + //FLAGSTRING(l2_propagate, '2'); + FLAGSTRING(demote, 'D'); + FLAGSTRING(promote, 'P'); + FLAGSTRING(wrmap, 'w'); + FLAGSTRING(wrmap_guess_found, 'G'); + //FLAGSTRING(wrmap_brute_force, 'b'); + FLAGSTRING(early_unshadow, 'e'); + FLAGSTRING(prealloc_unhook, 'H'); + FLAGSTRING(unsync, 'u'); + FLAGSTRING(oos_fixup_add, 'a'); + FLAGSTRING(oos_fixup_evict, 'x'); + + flagstring[i]=0; + + return flagstring; +} + +void shadow_emulate_postprocess(struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + + if ( opt.summary_info ) + { + update_eip(&h->v->d->emulate_eip_list, + h->rip, + h->arc_cycles, + 0, NULL); + update_summary(&h->summary.pf_xen[PF_XEN_EMULATE], h->arc_cycles); + update_summary(&h->summary.pf_xen_emul[e->pt_level], h->arc_cycles); + if(h->prealloc_unpin) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_PREALLOC_UNPIN], h->arc_cycles); + if(e->flag_prealloc_unhook) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_PREALLOC_UNHOOK], h->arc_cycles); + if(e->flag_early_unshadow) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_EARLY_UNSHADOW], h->arc_cycles); + if(e->flag_set_changed) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_CHANGED], h->arc_cycles); + else + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_UNCHANGED], h->arc_cycles); + if(e->flag_set_flush) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_FLUSH], h->arc_cycles); + if(e->flag_set_error) + 
update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_ERROR], h->arc_cycles); + if(e->flag_promote) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_PROMOTE], h->arc_cycles); + if(e->flag_demote) + update_summary(&h->summary.pf_xen_emul[PF_XEN_EMUL_DEMOTE], h->arc_cycles); + /* more summary info */ + + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_EMULATE); + } + + if ( opt.dump_cooked ) + { + printf(" %s emulate va %llx eip %llx%s lvl %d/%d corr %llx wval %llx flags %s\n", + h->dump_header, e->va, + h->rip, find_symbol(h->rip), + e->pt_level, h->v->guest_paging_levels, + e->corresponding_va, + e->wval, + flag_string(e)); + } + + + if(opt.scatterplot_unpin_promote) { + if(e->flag_early_unshadow) + scatterplot_vs_time(h->exit_tsc, -10); + if(h->prealloc_unpin) + scatterplot_vs_time(h->exit_tsc, 0); + if(e->flag_promote) { + if(opt.with_cr3_enumeration) { + if(h->v->cr3.data) + scatterplot_vs_time(h->exit_tsc, h->v->cr3.data->cr3_id); + } else + scatterplot_vs_time(h->exit_tsc, 2); + } + } + + +} + +void shadow_emulate_process(struct record_info *ri, struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + union { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. 
*/ + struct { + unsigned gl1e, write_val; + unsigned va; + unsigned flags:29, emulation_count:3; + } gpl2; + struct { + unsigned long long gl1e, write_val; + unsigned va; + unsigned flags:29, emulation_count:3; + } gpl3; + struct { + unsigned long long gl1e, write_val; + unsigned long long va; + unsigned flags:29, emulation_count:3; + } gpl4; + } *r = (typeof(r))ri->d; + + int gpl = h->v->guest_paging_levels; + + /* Fill in extended information */ + switch(gpl) + { + case 2: + if(sizeof(r->gpl2) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl2), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl2.va; + e->flags = r->gpl2.flags; + e->gl1e = r->gpl2.gl1e; + e->wval = r->gpl2.write_val; + break; + case 3: + if(sizeof(r->gpl3) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl3), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl3.va; + e->flags = r->gpl3.flags; + e->gl1e = r->gpl3.gl1e; + e->wval = r->gpl3.write_val; + break; + case 4: + if(sizeof(r->gpl4) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl4), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl4.va; + e->flags = r->gpl4.flags; + e->gl1e = r->gpl4.gl1e; + e->wval = r->gpl4.write_val; + break; + } + + pf_preprocess(e,gpl); + + if(opt.dump_all) + printf("]%s emulate va %llx gl1e %8llx wval %8llx flags %s(%x) pt_level %d corr %8llx\n", + ri->dump_header, + e->va, + e->gl1e, e->wval, + flag_string(e), e->flags, + e->pt_level, e->corresponding_va); + + h->post_process = shadow_emulate_postprocess; + +} + +struct shadow_emulate_other { + unsigned long long gfn, va; +}; + +#define SHADOW_OTHER_LOGS_GFN_NOT_GMFN 1 + +void shadow_parse_other(struct record_info *ri, + struct shadow_emulate_other *o, + int 
gpl) { + union { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. */ +#if SHADOW_OTHER_LOGS_GFN_NOT_GMFN + /* D'OH! Accidentally used mfn_t in the struct, so gmfns are always + 64-bit... :-/ */ + struct { + unsigned long gfn, va; + } gpl2; +#endif + struct { + unsigned long long gfn; + unsigned long va; + } gpl3; + struct { + unsigned long long gfn, va; + } gpl4; + } *r = (typeof(r))ri->d; + + + switch(gpl) + { +#if SHADOW_OTHER_LOGS_GFN_NOT_GMFN + case 2: + if(sizeof(r->gpl2) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl2), gpl, + ri->extra_words * 4); + break; + } + o->va = r->gpl2.va; + o->gfn = r->gpl2.gfn; + break; +#else + case 2: + /* FALLTHRU */ +#endif + case 3: + if(sizeof(r->gpl3) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl3), gpl, + ri->extra_words * 4); + break; + } + o->va = r->gpl3.va; + o->gfn = r->gpl3.gfn; + break; + case 4: + if(sizeof(r->gpl4) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl4), gpl, + ri->extra_words * 4); + break; + } + o->va = r->gpl4.va; + o->gfn = r->gpl4.gfn; + break; + } +} + +#if 0 +void shadow_unsync_postprocess(struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + + if(h->resyncs > 1) + fprintf(warn, "Strange, %d resyncs for an unsync!\n", + h->resyncs); + + if(opt.summary_info) { + update_summary(&h->summary.pf_xen[PF_XEN_EMULATE_UNSYNC], + h->arc_cycles); + if(h->resyncs <= 1) + update_summary(&h->summary.pf_xen_unsync[h->resyncs], + h->arc_cycles); + } + + if(opt.dump_cooked) + { + printf(" %s unsync gfn %llx %s (%d resyncs)\n", + h->dump_header, e->gfn, + h->resyncs?"(resync)":"", + h->resyncs); + } +} + + +void shadow_unsync_process(struct record_info *ri, struct hvm_data *h) +{ + struct pf_xen_extra *e = 
&h->inflight.pf_xen; + struct shadow_emulate_other r; + + shadow_parse_other(ri, &r, h->v->guest_paging_levels); + + e->gmfn = r.gmfn; + e->va = r.va; + + pf_preprocess(e, h->v->guest_paging_levels); + + if(opt.dump_all) + printf("]%s shadow unsync gmfn %llx va %llx pt_level %d corr %llx\n", + h->dump_header, + e->gmfn, + e->va, + e->pt_level, + e->corresponding_va); + + h->post_process = shadow_unsync_postprocess; +} +#endif + +void shadow_fault_generic_postprocess(struct hvm_data *h); + +void shadow_emulate_other_process(struct record_info *ri, struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + struct shadow_emulate_other r; + union shadow_event sevt = { .event = ri->event }; + + shadow_parse_other(ri, &r, h->v->guest_paging_levels); + + e->gfn = r.gfn; + e->va = r.va; + + pf_preprocess(e, h->v->guest_paging_levels); + + if(opt.dump_all) + printf("]%s shadow %s gfn %llx va %llx\n", + h->dump_header, + pf_xen_name[sevt.minor], + e->gfn, + e->va); + + h->post_process = shadow_fault_generic_postprocess; +} + +void shadow_fixup_postprocess(struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + + if ( opt.summary_info ) + { + update_summary(&h->summary.pf_xen[PF_XEN_FIXUP], h->arc_cycles); + if(h->prealloc_unpin) { + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_PREALLOC_UNPIN], h->arc_cycles); + } + if(e->flag_unsync) { + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_UNSYNC], h->arc_cycles); + if(h->resyncs < PF_XEN_FIXUP_UNSYNC_RESYNC_MAX) + update_summary(&h->summary.pf_xen_fixup_unsync_resync[h->resyncs], + h->arc_cycles); + else + update_summary(&h->summary.pf_xen_fixup_unsync_resync[PF_XEN_FIXUP_UNSYNC_RESYNC_MAX], + h->arc_cycles); + } + if(e->flag_oos_fixup_add) + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_OOS_ADD], h->arc_cycles); + if(e->flag_oos_fixup_evict) + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_OOS_EVICT], h->arc_cycles); + if(e->flag_promote) { + 
update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_PROMOTE], h->arc_cycles); + if(e->flag_wrmap) { + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_WRMAP], h->arc_cycles); + if(e->flag_wrmap_brute_force) + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_BRUTE_FORCE], h->arc_cycles); + } else if(e->flag_wrmap_brute_force) { + fprintf(warn, "Strange: wrmap_bf but not wrmap!\n"); + + } + + } else if(e->flag_wrmap) + fprintf(warn, "Strange, wrmap but not promote!\n"); + + if(!(e->flag_promote || h->prealloc_unpin || e->flag_unsync)) + update_summary(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_UPDATE_ONLY], h->arc_cycles); + /* more summary info */ + + if(e->flag_unsync) + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_UNSYNC); + else + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_FIXUP); + } + + if ( opt.dump_cooked ) + { + printf(" %s fixup%s va %llx eip %llx%s gl1e %llx flags %s\n", + h->dump_header, + e->flag_unsync?":unsync":"", + e->va, + h->rip, find_symbol(h->rip), + e->gl1e, + flag_string(e)); + } + + if(opt.scatterplot_unpin_promote) { + if(h->prealloc_unpin) + scatterplot_vs_time(h->exit_tsc, 0); + if(e->flag_promote) { + if(opt.with_cr3_enumeration) { + if(h->v->cr3.data) + scatterplot_vs_time(h->exit_tsc, h->v->cr3.data->cr3_id); + } else + scatterplot_vs_time(h->exit_tsc, 2); + } + } +} + +void shadow_fixup_process(struct record_info *ri, struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + union { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. 
*/ + struct { + unsigned long gl1e, va, flags; + } gpl2; + struct { + unsigned long long gl1e; + unsigned long va, flags; + } gpl3; + struct { + unsigned long long gl1e, va; + unsigned long flags; + } gpl4; + } *r = (typeof(r))ri->d; + int gpl = h->v->guest_paging_levels; + + switch(gpl) + { + case 2: + if(sizeof(r->gpl2) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl2), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl2.va; + e->flags = r->gpl2.flags; + e->gl1e = r->gpl2.gl1e; + break; + case 3: + if(sizeof(r->gpl3) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl3), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl3.va; + e->flags = r->gpl3.flags; + e->gl1e = r->gpl3.gl1e; + break; + case 4: + if(sizeof(r->gpl4) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl4), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl4.va; + e->flags = r->gpl4.flags; + e->gl1e = r->gpl4.gl1e; + break; + } + + if(opt.dump_all) + printf("]%s fixup%s va %llx gl1e %llx flags (%x)%s\n", + h->dump_header, + e->flag_unsync?":unsync":"", + e->va, e->gl1e, e->flags, + flag_string(e)); + + h->post_process = shadow_fixup_postprocess; +} + +void shadow_mmio_postprocess(struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + + if ( opt.summary_info ) + { + update_summary(&h->summary.pf_xen[e->pf_case], h->arc_cycles); + + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_MMIO); + } + + if ( opt.dump_cooked ) + { + if(e->mmio_data_valid) + printf(" %s %smmio %s va %llx eip %llx%s gpa %llx data %lx\n", + h->dump_header, + (e->pf_case==PF_XEN_FAST_MMIO)?"fast ":"", + e->mmio_is_write?"write":"read", + e->va, + h->rip, find_symbol(h->rip), + e->gpa, + e->data); + else + 
printf(" %s %smmio %s va %llx eip %llx%s gpa %llx (no data)\n", + h->dump_header, + (e->pf_case==PF_XEN_FAST_MMIO)?"fast ":"", + e->mmio_is_write?"write":"read", + e->va, + h->rip, find_symbol(h->rip), + e->gpa); + } + +} + +void shadow_mmio_process(struct record_info *ri, struct hvm_data *h) +{ + struct pf_xen_extra *e = &h->inflight.pf_xen; + union { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. */ + struct { + unsigned long va; + } gpl2; + struct { + unsigned long long va; + } gpl4; + } *r = (typeof(r))ri->d; + int gpl = h->v->guest_paging_levels; + + switch(gpl) + { + case 2: + case 3: + if(sizeof(r->gpl2) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl2), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl2.va; + break; + case 4: + if(sizeof(r->gpl4) != ri->extra_words * 4) + { + fprintf(warn, "%s: expected %d bytes for %d-level guest, got %d!\n", + __func__, sizeof(r->gpl4), h->v->guest_paging_levels, + ri->extra_words * 4); + break; + } + e->va = r->gpl4.va; + break; + } + + if(opt.dump_all) + printf("]%s %smmio va %llx\n", + h->dump_header, + (e->pf_case==PF_XEN_FAST_MMIO)?"fast ":"", + e->va); + + h->post_process = shadow_mmio_postprocess; +} + +void shadow_fault_generic_dump(unsigned long event, uint32_t *d, char *prefix, + char * dump_header) +{ + char *evt_string, evt_number[10]; + union shadow_event sevt = { .event = event }; + int i; + + if(sevt.minor < PF_XEN_MAX) + { + evt_string = pf_xen_name[sevt.minor]; + } + else + { + snprintf(evt_number, 10, "%d", sevt.minor); + evt_string = evt_number; + } + + printf("%s%s shadow %s gl %d [", + prefix, + dump_header, + evt_string, + sevt.paging_levels); + + for(i=0; i<4; i++) + { + printf(" %x", d[i]); + } + + printf(" ]\n"); +} + +void shadow_fault_generic_postprocess(struct hvm_data *h) +{ + union shadow_event sevt = { .event = 
h->inflight.generic.event }; + if(opt.summary_info) { + update_summary(&h->summary.pf_xen[sevt.minor], + h->arc_cycles); + + hvm_update_short_summary(h, HVM_SHORT_SUMMARY_PROPAGATE); + } + + if(opt.dump_cooked) + { + shadow_fault_generic_dump(h->inflight.generic.event, + h->inflight.generic.d, + " ", h->dump_header); + } +} + +void shadow_fault_generic_process(struct record_info *ri, struct hvm_data *h) +{ + union shadow_event sevt = { .event = ri->event }; + + /* pf-case traces, vs others */ + h->inflight.generic.event = ri->event; + bcopy(ri->d, h->inflight.generic.d, sizeof(unsigned int) * 4); + + if(opt.dump_all) + shadow_fault_generic_dump(h->inflight.generic.event, + h->inflight.generic.d, + "]", ri->dump_header); + + h->inflight.pf_xen.pf_case = sevt.minor; + h->post_process = shadow_fault_generic_postprocess; +} + +void shadow_resync_process(struct record_info *ri, struct hvm_data *h) +{ + struct { + unsigned long long gfn; + } *r = (typeof(r))ri->d; + + if(opt.dump_all || opt.dump_cooked) + printf(" %s oos resync %s gfn %llx\n", + ri->dump_header, + (ri->event == TRC_SHADOW_RESYNC_FULL)?"full":"only", + r->gfn); + + h->resyncs++; +} + +void shadow_prealloc_unpin_process(struct record_info *ri, struct hvm_data *h) { + struct { + unsigned long long gfn; + } *r = (typeof(r))ri->d; + + if(opt.dump_all || opt.dump_cooked) + printf(" %s prealloc-unpin gfn %llx\n", + ri->dump_header, r->gfn); + + if(h->prealloc_unpin) + fprintf(warn, "Strange, more than one prealloc_unpin per arc!\n"); + + h->prealloc_unpin = 1; + + if(opt.with_cr3_enumeration) + cr3_prealloc_unpin(h->v, r->gfn); +} + +void shadow_process(struct pcpu_info *p) +{ + struct record_info *ri = &p->ri; + struct hvm_data *h = &p->current->hvm; + + union shadow_event sevt = { .event = ri->event }; + + int gpl = sevt.paging_levels + 2; + + if(!h->init || !h->vmexit_valid) + { + if(opt.dump_all) + shadow_fault_generic_dump(ri->event, + ri->d, + "!", ri->dump_header); + return; + } + + if(sevt.minor <= 
PF_XEN_NOT_SHADOW) { + if(p->current->guest_paging_levels != gpl) + { + fprintf(warn, "%s: Changing guest paging levels to %d\n", + __func__, gpl); + p->current->guest_paging_levels = gpl; + } + } + + if(sevt.minor <= PF_XEN_LAST_FAULT) { + h->inflight.pf_xen.pf_case = sevt.minor; + if(opt.summary) { + hvm_set_summary_handler(h, hvm_pf_xen_summary); + } + } + + /* FIXME - mask out paging levels */ + switch(sevt.minor) + { + case PF_XEN_EMULATE: + shadow_emulate_process(ri, h); + break; + case PF_XEN_FIXUP: + shadow_fixup_process(ri, h); + break; + case PF_XEN_MMIO: + case PF_XEN_FAST_MMIO: + shadow_mmio_process(ri, h); + break; + case PF_XEN_EMULATE_UNSHADOW_USER: + case PF_XEN_EMULATE_UNSHADOW_EVTINJ: + case PF_XEN_EMULATE_UNSHADOW_UNHANDLED: + shadow_emulate_other_process(ri, h); + break; +#if 0 + case PF_XEN_EMULATE_UNSYNC: + shadow_unsync_process(ri, h); + break; +#endif + case SHADOW_RESYNC_FULL: + case SHADOW_RESYNC_ONLY: + shadow_resync_process(ri, h); + break; + case SHADOW_PREALLOC_UNPIN: + shadow_prealloc_unpin_process(ri, h); + break; + default: + if(sevt.minor <= PF_XEN_LAST_FAULT) { + shadow_fault_generic_process(ri, h); + } else { + fprintf(warn, "Warning: processing shadow as generic\n"); + process_generic(ri); + } + break; + } +} + +/* ---- PV guests ---- */ +union pv_event { + unsigned event; + struct { + unsigned minor:8, + x64:1, + unused1:3, + sub:4, + main:12, + unused:4; + }; +}; + +void pv_hypercall_process(struct record_info *ri, struct pv_data *pv) { + union { + struct { + uint32_t eip, eax; + } x32; + struct { + uint64_t eip; + uint32_t eax; + } x64; + } * r = (typeof(r)) ri->d; + union pv_event pevt = { .event = ri->event }; + unsigned long long eip; + unsigned long eax; + + if(pevt.x64) { + eip = r->x64.eip; + eax = r->x64.eax; + } else { + eip = r->x32.eip; + eax = r->x32.eax; + } + + if(opt.summary_info) { + if(eax < PV_HYPERCALL_MAX) + pv->hypercall_count[eax]++; + } + + if(opt.dump_cooked || opt.dump_all) { + if(eax < HYPERCALL_MAX) 
+ printf(" %s hypercall %2lx (%s) eip %llx\n", + ri->dump_header, eax, + hypercall_name[eax], eip); + else + printf(" %s hypercall %lx eip %llx\n", + ri->dump_header, eax, eip); + } +} + +void pv_trap_process(struct record_info *ri, struct pv_data *pv) { + union { + struct { + unsigned long eip; + unsigned trapnr:15, + use_error_code:1, + error_code:16; + } x32; + struct { + unsigned long long eip; + unsigned trapnr:15, + use_error_code:1, + error_code:16; + } x64; + } * r = (typeof(r)) ri->d; + union pv_event pevt = { .event = ri->event }; + unsigned long long eip; + unsigned trapnr, use_ec, ec; + + if(pevt.x64) { + eip = r->x64.eip; + trapnr = r->x64.trapnr; + use_ec = r->x64.use_error_code; + ec = r->x64.error_code; + } else { + eip = r->x32.eip; + trapnr = r->x32.trapnr; + use_ec = r->x32.use_error_code; + ec = r->x32.error_code; + } + + if(opt.summary_info) { + if(trapnr < PV_TRAP_MAX) + pv->trap_count[trapnr]++; + } + + if(opt.dump_cooked || opt.dump_all) { + printf(" %s trap %x eip %llx", + ri->dump_header, trapnr, eip); + if(use_ec) + printf(" ec %x\n", ec); + else + printf("\n"); + } +} + +void pv_generic_process(struct record_info *ri, struct pv_data *pv) { + union pv_event pevt = { .event = ri->event }; + if ( opt.dump_cooked || opt.dump_all ) { + if(pevt.minor < PV_MAX && pv_name[pevt.minor]) + printf(" %s %s", + ri->dump_header, + pv_name[pevt.minor]); + else + printf(" %s PV-%d ", + ri->dump_header, pevt.minor); + if(ri->extra_words) { + int i; + printf("[ "); + for(i=0; iextra_words; i++) { + printf("%x ", (unsigned)ri->d[i]); + } + printf("]\n"); + + } + } +} + +void pv_summary(struct pv_data *pv) { + int i, j; + + if(!pv->summary_info) + return; + + printf("PV events:\n"); + for(i=0; icount[i]) + printf(" %s %d\n", pv_name[i], pv->count[i]); + switch(i) { + case PV_HYPERCALL: + for(j=0; jhypercall_count[j]) + printf(" %-29s[%2d]: %6d\n", + hypercall_name[j], + j, + pv->hypercall_count[j]); + } + break; + case PV_TRAP: + for(j=0; jtrap_count[j]) + 
printf(" [%d] %d\n", + j, pv->trap_count[j]); + } + break; + } + } +} + +void pv_process(struct pcpu_info *p) +{ + struct record_info *ri = &p->ri; + struct vcpu_data *v = p->current; + struct pv_data *pv = &v->pv; + + union pv_event pevt = { .event = ri->event }; + + if(v->d->did == IDLE_DOMAIN) { + fprintf(warn, "%s: Strange, pv record for idle domain!\n", + __func__); + return; + } + + if(!v->data_type) + v->data_type = VCPU_DATA_PV; + else if(v->data_type != VCPU_DATA_PV) { + fprintf(warn, "ERROR: non-pv data type %d for d%dv%d\n", + v->data_type, v->d->did, v->vid); + exit(1); + } + + if(opt.summary_info) { + pv->summary_info=1; + + if(pevt.minor == PV_PTWR_EMULATION_PAE) + pv->count[PV_PTWR_EMULATION]++; + else + pv->count[pevt.minor]++; + } + + switch(pevt.minor) + { + case PV_HYPERCALL: + pv_hypercall_process(ri, pv); + break; + case PV_TRAP: + pv_trap_process(ri, pv); + break; + case PV_PAGE_FAULT: + //pv_pf_process(ri, pv); + //break; + case PV_PTWR_EMULATION: + case PV_PTWR_EMULATION_PAE: + //pv_ptwr_emulation_process(ri, pv); + //break; + default: + pv_generic_process(ri, pv); + break; + } +} + +/* ---- Schedule ---- */ +struct vcpu_data * vcpu_create(struct domain_data *d, int vid) +{ + struct vcpu_data *v; + + assert(d->vcpu[vid] == NULL); + + fprintf(warn, "Creating vcpu %d for dom %d\n", vid, d->did); + + if((v=malloc(sizeof(*v)))==NULL) + { + fprintf(stderr, "%s: malloc %d failed!\n", __func__, sizeof(*d)); + exit(1); + } + + bzero(v, sizeof(*v)); + + v->vid = vid; + v->d = d; + v->p = NULL; + v->runstate.state = RUNSTATE_INIT; + v->runstate.last_oldstate.wrong = RUNSTATE_INIT; + + d->vcpu[vid] = v; + + assert(v == v->d->vcpu[v->vid]); + + if(vid > d->max_vid) + d->max_vid = vid; + + return v; +} + +/* Called by both domain_create and sched_default_domain_init */ +void domain_init(struct domain_data *d, int did) +{ + bzero(d, sizeof(*d)); + + d->did = did; + d->next = NULL; + + if((opt.interval.output == INTERVAL_DOMAIN_SHORT_SUMMARY + || 
opt.interval.output == INTERVAL_DOMAIN_GUEST_INTERRUPT + || opt.interval.output == INTERVAL_DOMAIN_GRANT_MAPS) + && opt.interval.domain.did == did) { + P.interval.domain.d = d; + } +} + +struct domain_data * domain_create(int did) +{ + struct domain_data *d; + + fprintf(warn, "Creating domain %d\n", did); + + if((d=malloc(sizeof(*d)))==NULL) + { + fprintf(stderr, "%s: malloc %d failed!\n", __func__, sizeof(*d)); + exit(1); + } + + /* Initialize domain & vcpus */ + domain_init(d, did); + + return d; + } + +struct domain_data * domain_find(int did) +{ + struct domain_data *d, *n, **q; + + /* Look for domain, keeping track of the last pointer so we can add + a domain if we need to. */ + for ( d = domain_list, q=&domain_list ; + d && (d->did < did) ; + q = &d->next, d=d->next ) ; + + if(d && d->did == did) + return d; + + /* Make a new domain */ + n = domain_create(did); + + /* Insert it into the list */ + n->next = d; + *q = n; + + return n; +} + +struct vcpu_data * vcpu_find(int did, int vid) +{ + struct domain_data *d; + struct vcpu_data *v; + + d = domain_find(did); + + v = d->vcpu[vid]; + + if(!v) + v = vcpu_create(d, vid); + + return v; +} + +void vcpu_prev_update(struct pcpu_info *p, struct vcpu_data *prev, + tsc_t tsc, int new_runstate) +{ + assert(prev == prev->d->vcpu[prev->vid]); + + if(prev->p != p) + { + fprintf(warn, "Strange, sched_switch on pcpu %d, prev->pcpu %d!\n", + p->pid, prev->p->pid); + prev->runstate.tsc = 0; + goto set; + } + + assert(p->current); + + if(p->current != prev) + { + fprintf(warn, "Strange, sched_switch prev d%dv%d, pcpu %d current d%dv%d!\n", + prev->d->did, prev->vid, + p->pid, p->current->d->did, p->current->vid); + prev->runstate.tsc = 0; + goto set; + } + + if(prev->runstate.state != RUNSTATE_RUNNING) + { + fprintf(warn, "Strange, prev d%dv%d not running!\n", + prev->d->did, prev->vid); + prev->runstate.tsc = 0; + goto set; + } + +set: + p->current = NULL; + runstate_update(prev, new_runstate, tsc); +} + +void 
vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc) +{ + assert(next == next->d->vcpu[next->vid]); + assert(p->current == NULL); + + if(next->activated) + { + /* We may get lost records at start-of-day, so ignore + setting runstate of default vcpus */ + if(next->runstate.state == RUNSTATE_RUNNING + && next->d->did != DEFAULT_DOMAIN) + { + fprintf(warn, "Strange, next d%dv%d already running on proc %d!\n", + next->d->did, next->vid, + next->p->pid); + next->runstate.tsc = 0; + } + } + else + { + next->guest_paging_levels = opt.default_guest_paging_levels; + next->activated = 1; + } + + runstate_update(next, RUNSTATE_RUNNING, tsc); + + /* If we're moving from one pcpu to another, record change & update tsc */ + if(next->p != p) { + if(next->pcpu_tsc) + update_cycles(&next->cpu_affinity, tsc - next->pcpu_tsc); + next->pcpu_tsc = tsc; + } + + next->p = p; + p->current = next; +} + +/* If current is the default domain, we're fixing up from something + * like start-of-day. Update what we can. 
*/ +void vcpu_start(struct pcpu_info *p, struct vcpu_data *v) { + if( !p->current || p->current->d->did != DEFAULT_DOMAIN) { + fprintf(stderr, "Strange, p->current not default domain!\n"); + exit(1); + } + + if(!p->first_tsc) { + fprintf(stderr, "Strange, p%d first_tsc 0!\n", p->pid); + exit(1); + } + + if(p->first_tsc <= p->current->runstate.tsc) { + fprintf(stderr, "Strange, first_tsc %llx < default_domain runstate tsc %llx!\n", + p->first_tsc, + p->current->runstate.tsc); + exit(1); + } + + /* Change default domain to 'queued' */ + runstate_update(p->current, RUNSTATE_QUEUED, p->first_tsc); + + /* FIXME: Copy over data from the default domain this interval */ + fprintf(warn, "Using first_tsc for d%dv%d (%lld cycles)\n", + v->d->did, v->vid, p->last_tsc - p->first_tsc); + + /* Simulate the time since the first tsc */ + runstate_update(v, RUNSTATE_RUNNING, p->first_tsc); + p->current = v; + v->p = p; +} + +void sched_runstate_change_process(struct pcpu_info *p) +{ + struct vcpu_data *v; + struct record_info *ri = &p->ri; + struct { + unsigned vcpu:16, dom:16; + unsigned long long p1, p2; + } * r = (typeof(r))ri->d; + union { + unsigned long event; + struct { + unsigned lo:4, + new_runstate:4, + old_runstate:4, + sub:4, + main:12, + unused:4; + }; + } sevt = { .event = ri->event }; + int perfctrs; + struct last_oldstate_struct last_oldstate; + + perfctrs = (ri->extra_words == 5); + + if(opt.dump_cooked || opt.dump_all) { + if( perfctrs ) { + printf(" %s runstate_change_process {%lld,%lld} d%uv%u %s->%s\n", + ri->dump_header, + r->p1, r->p2, + r->dom, r->vcpu, + runstate_name[sevt.old_runstate], + runstate_name[sevt.new_runstate]); + } else { + printf(" %s runstate_change_process d%uv%u %s->%s\n", + ri->dump_header, + r->dom, r->vcpu, + runstate_name[sevt.old_runstate], + runstate_name[sevt.new_runstate]); + } + } + + /* Sanity check: expected transitions */ + if( (sevt.new_runstate == RUNSTATE_RUNNING + && sevt.old_runstate != RUNSTATE_RUNNABLE) + || 
(sevt.new_runstate == RUNSTATE_BLOCKED + && sevt.old_runstate == RUNSTATE_RUNNABLE ) ) + { + fprintf(warn, "Strange, d%dv%d unexpected runstate transition %s->%s\n", + r->dom, r->vcpu, + runstate_name[sevt.old_runstate], + runstate_name[sevt.new_runstate]); + } + + if(r->vcpu > MAX_CPUS) + { + fprintf(warn, "%s: vcpu %u > MAX_VCPUS %d!\n", + __func__, r->vcpu, MAX_CPUS); + return; + } + + v = vcpu_find(r->dom, r->vcpu); + + /* We want last_oldstate reset every time; so copy the last one and use + * that locally, clobbering the one in the vcpu struct. If it needs to + * be reset, it will be reset below. */ + last_oldstate = v->runstate.last_oldstate; + v->runstate.last_oldstate.wrong = RUNSTATE_INIT; + + /* Close vmexits when the putative reason for blocking / &c stops. + * This way, we don't account cpu contention to some other overhead. */ + if(sevt.new_runstate == RUNSTATE_RUNNABLE + && v->data_type == VCPU_DATA_HVM + && v->hvm.vmexit_valid) { + if(opt.dump_cooked) + printf("%s: d%dv%d changing to state runnable, closing vmexit\n", + __func__, r->dom, r->vcpu); + hvm_close_vmexit(&v->hvm, ri->tsc); + } + + /* Track waking state */ + if ( v->data_type == VCPU_DATA_HVM && v->runstate.state != RUNSTATE_LOST ) { + if ( sevt.new_runstate == RUNSTATE_RUNNABLE + && sevt.old_runstate == RUNSTATE_BLOCKED ) + { + /* Hmm... want to make sure we're not in some weird + vmexit state... have to look later. 
*/ + if(opt.dump_all) + printf(" [w2h] d%dv%d Setting waking\n", v->d->did, v->vid); + v->hvm.w2h.waking = 1; + } + else if ( sevt.new_runstate != RUNSTATE_RUNNING + || sevt.old_runstate != RUNSTATE_RUNNABLE ) + { + if( v->hvm.w2h.waking + && sevt.old_runstate == RUNSTATE_RUNNING + && sevt.new_runstate != RUNSTATE_OFFLINE ) + { + fprintf(warn, "Strange, unexpected waking transition for d%dv%d: %s -> %s\n", + v->d->did, v->vid, + runstate_name[sevt.old_runstate], + runstate_name[sevt.new_runstate]); + v->hvm.w2h.waking = 0; + } + + /* Close wake-to-halt summary */ + if (sevt.new_runstate == RUNSTATE_BLOCKED + && sevt.old_runstate == RUNSTATE_RUNNING + && v->hvm.w2h.interrupts ) { + int i; + for(i=0; ihvm.summary.guest_interrupt + i; + tsc_t start_tsc = g->start_tsc; + if(start_tsc) { + tsc_t t = (start_tsc == 1) ? 0 : ri->tsc - start_tsc; + if(opt.dump_all) + printf(" [w2h] Halting vec %d is_wake %d time %lld\n", + i, + g->is_wake, + t); + + if(opt.scatterplot_wake_to_halt + && t + && g->is_wake) + scatterplot_vs_time(ri->tsc, t); + + if(opt.summary && t) { + if(g->is_wake) { + if(v->hvm.w2h.interrupts==1) + update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE], + t); + update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY], + t); + } else { + update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT], + t); + } + } + g->start_tsc = 0; + g->is_wake = 0; + } + } + v->hvm.w2h.interrupts = 0; + v->hvm.w2h.vector = 0; + } + } + } + + /* Sanity checks / tsc skew detection */ + if( v->runstate.state != sevt.old_runstate + && v->runstate.state != RUNSTATE_INIT ) + { + if(v->runstate.state == RUNSTATE_LOST) { + if( sevt.new_runstate == RUNSTATE_RUNNING ) + goto update; + else if(opt.dump_all) + fprintf(warn, "%s: d%dv%d in runstate lost, not updating to %s\n", + __func__, v->d->did, v->vid, + runstate_name[sevt.new_runstate]); + goto no_update; + } else if (last_oldstate.wrong == sevt.new_runstate + && last_oldstate.actual == sevt.old_runstate) 
{ + tsc_t lag, old_offset; + struct pcpu_info *p2; + + if(ri->tsc < last_oldstate.tsc) { + fprintf(stderr, "FATAL: new tsc < detected runstate tsc!\n"); + exit(1); + } + + p2 = P.pcpu + last_oldstate.pid; + + lag = ri->tsc + - last_oldstate.tsc; + + old_offset = p2->tsc_skew.offset; + + cpumask_union(&p2->tsc_skew.downstream, &p->tsc_skew.downstream); + cpumask_set(&p2->tsc_skew.downstream, p->pid); + + if(cpumask_isset(&p2->tsc_skew.downstream, p2->pid)) { + fprintf(stderr, "FATAL: tsc skew dependency loop detected!\n"); + exit(1); + } + + p2->tsc_skew.offset += lag * 2; + + fprintf(warn, "TSC skew detected p%d->p%d, %lld cycles. Changing p%d offset from %lld to %lld\n", + p->pid, p2->pid, lag, + p2->pid, + old_offset, + p2->tsc_skew.offset); + + goto no_update; + } else { + fprintf(warn, "runstate_change old_runstate %s, d%dv%d runstate %s. Possible tsc skew.\n", + runstate_name[sevt.old_runstate], + v->d->did, v->vid, + runstate_name[v->runstate.state]); + + v->runstate.last_oldstate.wrong = sevt.old_runstate; + v->runstate.last_oldstate.actual = v->runstate.state; + v->runstate.last_oldstate.tsc = ri->tsc; + v->runstate.last_oldstate.pid = p->pid; + goto update; + } + fprintf(stderr, "FATAL: Logic hole in %s\n", __func__); + exit(1); + } + +update: + /* Actually update the runstate. Special things to do if we're starting + * or stopping actually running on a physical cpu. */ + if ( sevt.old_runstate == RUNSTATE_RUNNING + || v->runstate.state == RUNSTATE_RUNNING ) + { +#if 0 + /* A lot of traces include cpi that shouldn't... 
*/ + if(perfctrs && v->runstate.tsc) { + unsigned long long run_cycles, run_instr; + double cpi; + + //run_cycles = r->p1 - v->runstate_p1_start; + run_cycles = ri->tsc - v->runstate.tsc; + run_instr = r->p2 - v->runstate.p2_start; + + cpi = ((double)run_cycles) / run_instr; + + if(opt.dump_cooked || opt.dump_all) { + printf(" cpi: %2.2lf ( %lld / %lld )\n", + cpi, run_cycles, run_instr); + } + + if(opt.scatterplot_cpi && v->d->did == 1) + printf("%lld,%2.2lf\n", + ri->tsc, cpi); + + if(opt.summary_info) + update_cpi(&v->cpi, run_instr, run_cycles); + } +#endif + /* + * Cases: + * old running, v running: + * normal (prev update p, lost record check) + * v running, old ! running: + * tsc skew (prev update v->p, lost record check) + * old running, v init: + start-of-day (fake update, prev p, lost record) + * old running, v !{running,init}: + * # (should never happen) + */ + if( sevt.old_runstate == RUNSTATE_RUNNING ) { + if( v->runstate.state == RUNSTATE_INIT ) { + /* Start-of-day; account first tsc -> now to v */ + vcpu_start(p, v); + } else if( v->runstate.state != RUNSTATE_RUNNING) { + /* This should never happen. 
*/ + fprintf(stderr, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n", + v->d->did, v->vid, runstate_name[v->runstate.state]); + exit(1); + } + + vcpu_prev_update(p, v, ri->tsc, sevt.new_runstate); + } else { + vcpu_prev_update(v->p, v, ri->tsc, sevt.new_runstate); + } + + if(P.lost_cpus && v->d->did != IDLE_DOMAIN) { + if(opt.dump_all) + fprintf(warn, "%s: %d lost cpus, setting d%dv%d runstate to RUNSTATE_LOST\n", + __func__, P.lost_cpus, v->d->did, v->vid); + lose_vcpu(v, ri->tsc); + } + } + else if ( sevt.new_runstate == RUNSTATE_RUNNING ) + { + if(perfctrs) { + v->runstate.p1_start = r->p1; + v->runstate.p2_start = r->p2; + } + + vcpu_next_update(p, v, ri->tsc); + } + else + { + runstate_update(v, sevt.new_runstate, ri->tsc); + } + +no_update: + return; +} + +void sched_switch_process(struct pcpu_info *p) +{ + struct vcpu_data *prev, *next; + struct record_info *ri = &p->ri; + struct { + unsigned long prev_dom, prev_vcpu, next_dom, next_vcpu; + } * r = (typeof(r))ri->d; + + if(opt.dump_cooked || opt.dump_all) + printf("%s sched_switch prev d%luv%lu next d%luv%lu\n", + ri->dump_header, + r->prev_dom, r->prev_vcpu, + r->next_dom, r->next_vcpu); + + if(r->prev_vcpu > MAX_CPUS) + { + fprintf(warn, "%s: prev_vcpu %lu > MAX_VCPUS %d!\n", + __func__, r->prev_vcpu, MAX_CPUS); + return; + } + + if(r->next_vcpu > MAX_CPUS) + { + fprintf(warn, "%s: next_vcpu %lu > MAX_VCPUS %d!\n", + __func__, r->next_vcpu, MAX_CPUS); + return; + } + + prev = vcpu_find(r->prev_dom, r->prev_vcpu); + next = vcpu_find(r->next_dom, r->next_vcpu); + + vcpu_prev_update(p, prev, ri->tsc, RUNSTATE_QUEUED); /* FIXME */ + + vcpu_next_update(p, next, ri->tsc); +} + +void sched_default_vcpu_activate(struct pcpu_info *p) +{ + struct vcpu_data *v = default_domain.vcpu[p->pid]; + + if(!v) + v = vcpu_create(&default_domain, p->pid); + + assert(v == v->d->vcpu[v->vid]); + + v->activated = 1; + v->guest_paging_levels = opt.default_guest_paging_levels; + v->p = p; + v->runstate.state = 
RUNSTATE_RUNNING; + + p->current = v; +} + +void sched_default_domain_init(void) +{ + struct domain_data *d = &default_domain; + + domain_init(d, DEFAULT_DOMAIN); +} + +void runstate_clear(tsc_t * runstate_cycles) +{ + int i; + for(i=0; irunstates+i, desc); + } + print_cpi_summary(&v->cpi); + print_cpu_affinity(&v->cpu_affinity, " cpu affinity"); +} + +void sched_summary_domain(struct domain_data *d) +{ + int i; + char desc[30]; + + printf(" Runstates:\n"); + for(i=0; irunstates+i, desc); + } +} + + +void sched_process(struct pcpu_info *p) +{ + struct record_info *ri = &p->ri; + + if(ri->evt.sub == 0xf) { + switch(ri->event) + { + case TRC_SCHED_SWITCH: + sched_switch_process(p); + break; + default: + process_generic(&p->ri); + } + } else { + if(ri->evt.sub == 1) + sched_runstate_change_process(p); + else { + UPDATE_VOLUME(p, sched_verbose, ri->size); + process_generic(&p->ri); + } + } +} + +/* ---- Memory ---- */ +void mem_summary_domain(struct domain_data *d) { + int i; + + printf(" Grant table ops:\n"); + + printf(" Done by:\n"); + for(i=0; imemops.done[i]) + printf(" %-14s: %d\n", + mem_name[i], + d->memops.done[i]); + + printf(" Done for:\n"); + for(i=0; imemops.done_for[i]) + printf(" %-14s: %d\n", + mem_name[i], + d->memops.done_for[i]); +} + +void mem_process(struct pcpu_info *p) { + struct record_info *ri = &p->ri; + struct { + int dom; + } *r = (typeof(r))ri->d; + + int minor = ri->evt.minor; + + if(opt.dump_all || opt.dump_cooked) { + if(minor < MEM_MAX) + printf(" %s %s d%d\n", + ri->dump_header, + mem_name[minor], + r->dom); + else + dump_generic(stdout, ri); + } + + if(opt.summary_info && minor < MEM_MAX) { + struct domain_data *d; + + if(p->current->d) { + p->current->d->memops.done[minor]++; + p->current->d->memops.done_interval[minor]++; + } + if((d=domain_find(r->dom))) { + d->memops.done_for[minor]++; + d->memops.done_for_interval[minor]++; + } + } +} + +/* ---- Base ----- */ +void dump_generic(FILE * f, struct record_info *ri) +{ + int i; + + 
fprintf(f, "]%s %7x(%x:%x:%x) %u [", + ri->dump_header, + ri->event, + ri->evt.main, + ri->evt.sub, + ri->evt.minor, + ri->extra_words); + + for(i=0; iextra_words; i++) { + fprintf(f, " %x", ri->d[i]); + } + + fprintf(f, " ]\n"); +} + +void process_generic(struct record_info *ri) { + + if(opt.dump_cooked || opt.dump_all) { + dump_generic(stdout, ri); + } +} + +void lose_vcpu(struct vcpu_data *v, tsc_t tsc) +{ + if(v->data_type == VCPU_DATA_HVM) + v->hvm.vmexit_valid=0; + runstate_update(v, RUNSTATE_LOST, tsc); + clear_vlapic(&v->vlapic); + + if(v->data_type == VCPU_DATA_HVM) { + int i; + if(opt.dump_all) + printf(" [w2h] Clearing w2h state for d%dv%d\n", + v->d->did, v->vid); + v->hvm.w2h.interrupts=0; + v->hvm.w2h.vector=0; + v->hvm.w2h.waking = 0; + for(i=0; ihvm.summary.guest_interrupt[i].start_tsc) { + printf(" Interrupt %d clearing start_tsc %lld\n", + i, v->hvm.summary.guest_interrupt[i].start_tsc); + } + v->hvm.summary.guest_interrupt[i].start_tsc = 0; + } + } +} + +struct lost_record_struct { + int lost_records; + unsigned did:16,vid:16; + tsc_t first_tsc; +}; + +void process_lost_records(struct pcpu_info *p) +{ + struct record_info *ri = &p->ri; + struct lost_record_struct *r = (typeof(r))ri->d; + tsc_t first_tsc; /* TSC of first record that was lost */ + + /* Sanity checks */ + if(ri->extra_words != 4) + { + fprintf(warn, "FATAL: Lost record has unexpected extra words %d!\n", + ri->extra_words); + exit(1); + } + else + first_tsc = r->first_tsc; + + if(opt.dump_cooked || opt.dump_all) + { + printf(" %s lost_records count %d d%uv%u (cur d%dv%d) first_tsc %lld\n", + ri->dump_header, r->lost_records, + r->did, r->vid, + p->current->d->did, p->current->vid, + r->first_tsc); + } + +#if 0 + if(opt.dump_trace_volume_on_lost_record) + volume_summary(&p->volume.last_buffer); +#endif + + clear_vlapic(&p->current->vlapic); + if(p->current->data_type == VCPU_DATA_HVM) { + p->current->hvm.vmexit_valid=0; + cr3_switch(0, &p->current->hvm); + } + + /* We may lose 
scheduling records; so we need to: + * - Point all records until now to the next schedule in the + * "default" domain + * - Make sure there are no warnings / strangeness with the + * current vcpu (if it gets scheduled elsewhere). + */ + vcpu_prev_update(p, p->current, first_tsc, RUNSTATE_LOST); + +#if 0 + vcpu_next_update(p, default_domain.vcpu[p->pid], first_tsc); + if(p->current->data_type == VCPU_DATA_HVM) { + p->current->hvm.vmexit_valid=0; + } +#endif + + /* The lost record trace is processed early -- i.e., + * After the last good record, rather than when the next + * record is processed. Between the time it's processed and + * the time it actually went in, the vcpu may be scheduled on + * other processors. So we can't switch vcpus until the first + * TSC'd record after the lost record. */ + if(!p->lost_record.active) { + P.lost_cpus++; + if(P.lost_cpus > P.max_active_pcpu + 1) { + fprintf(warn, "ERROR: P.lost_cpus %d > P.max_active_pcpu + 1 %d!\n", + P.lost_cpus, P.max_active_pcpu + 1); + exit(1); + } + } else + fprintf(warn, "Strange, lost record for pcpu %d, but lost_record still active!\n", + p->pid); + + p->lost_record.active = 1; + p->lost_record.tsc = first_tsc; + + { + /* Any vcpu which is not actively running may be scheduled on the + * lost cpu. 
/* Handle the synthetic TRC_LOST_RECORDS_END event (manufactured by
 * process_records, which re-processes a lost-record trace a second
 * time at the tsc where it was written to the log).  Once we're past
 * the tsc of the loss, restore the pcpu's "current" vcpu from the
 * d/v ids saved in the lost record, and take this pcpu out of the
 * global lost-cpu count. */
void process_lost_records_end(struct pcpu_info *p)
{
    struct record_info *ri = &p->ri;
    struct lost_record_struct *r = (typeof(r))ri->d;

    if(!p->lost_record.active) {
        fprintf(warn, "FATAL: lost_records_end but pid %d not lost!\n",
                p->pid);
        exit(1);
    }

    /* Lost records.  If this is the first record on a pcpu after the loss,
     * Update the information. */
    if(ri->tsc > p->lost_record.tsc)
    {
        if(opt.dump_cooked || opt.dump_all)
            printf(" %s lost_records end ---\n",
                   pcpu_string(p->pid));
        if(p->lost_record.domain_valid) {
            int did = p->lost_record.did,
                vid = p->lost_record.vid;
            /* While lost, no vcpu should have been scheduled here;
             * a non-NULL current means the bookkeeping went wrong. */
            if(p->current)
            {
                fprintf(warn, "FATAL: lost_record valid (d%dv%d), but current d%dv%d!\n",
                        did, vid,
                        p->current->d->did, p->current->vid);
                exit(1);
            }

            if(opt.dump_all)
                fprintf(warn, "Changing p%d current to d%dv%d\n",
                        p->pid, did, vid);
            /* Re-install the vcpu recorded at the time of the loss. */
            vcpu_next_update(p,
                             vcpu_find(did, vid),
                             ri->tsc);
            p->lost_record.domain_valid=0;
        }


        p->lost_record.active = 0;
        P.lost_cpus--;
        /* lost_cpus is incremented once per pcpu in process_lost_records;
         * going negative means an unbalanced end event. */
        if(P.lost_cpus < 0) {
            fprintf(warn, "ERROR: lost_cpus fell below 0 for pcpu %d!\n",
                    p->pid);
            exit(1);
        }
    }
}
/* Peek at the record at @offset (expected to be a cpu_change record at
 * the start of a trace-buffer window) and, if it belongs to a pcpu we
 * have not seen yet, activate that pcpu and prime it with this record.
 *
 * Returns the offset just past the new pcpu's window on activation,
 * or 0 if there was nothing to read or the cpu was already active. */
loff_t scan_for_new_pcpu(loff_t offset) {
    ssize_t r;
    struct trace_record rec;
    struct cpu_change_data *cd;

    r=__read_record(G.fd, &rec, offset);

    if(r==0)
        return 0;

    /* Window boundaries must start with a tsc-less cpu_change record;
     * anything else means the file is corrupt or we mis-tracked offsets. */
    if(rec.event != TRC_TRACE_CPU_CHANGE
       || rec.cycle_flag)
    {
        fprintf(stderr, "%s: Unexpected record event %x!\n",
                __func__, rec.event);
        exit(1);
    }

    cd = (typeof(cd))rec.u.notsc.data;

    if(cd->cpu > P.max_active_pcpu) {
        struct pcpu_info *p = P.pcpu + cd->cpu;

        fprintf(warn, "%s: Activating pcpu %d at offset %lld\n",
                __func__, cd->cpu, offset);

        p->active = 1;
        /* Process this cpu_change record first */
        p->ri.rec = rec;
        p->ri.size = r;
        __fill_in_record_info(p);

        p->file_offset = offset;
        p->next_cpu_change_offset = offset;

        /* Skip past this record plus its whole data window. */
        offset += r + cd->window_size;

        sched_default_vcpu_activate(p);

        P.max_active_pcpu = cd->cpu;

        return offset;
    } else {
        return 0;
    }
}
/* Handle a TRC_TRACE_CPU_CHANGE record for pcpu @p.  See the block
 * comment above for the overall scan algorithm: each pcpu repeatedly
 * processes cpu_change records, skipping whole windows that belong to
 * other pcpus, until it finds its own window.  Along the way this
 * activates newly-seen pcpus and tracks "epoch" boundaries for
 * truncated-file handling. */
void process_cpu_change(struct pcpu_info *p) {
    struct record_info *ri = &p->ri;
    struct cpu_change_data *r = (typeof(r))ri->d;

    if(opt.dump_all) {
        printf("]%s cpu_change this-cpu %u record-cpu %u window_size %u(0x%08x)\n",
               ri->dump_header, p->pid, r->cpu, r->window_size,
               r->window_size);
    }

    /* File sanity check */
    /* NOTE(review): "offet" is a typo in this warning message. */
    if(p->file_offset != p->next_cpu_change_offset) {
        fprintf(warn, "Strange, pcpu %d expected offet %llx, actual %llx!\n",
                p->pid, p->next_cpu_change_offset, p->file_offset);
    }

    /* NOTE(review): > admits r->cpu == MAX_CPUS into P.pcpu[] below;
     * should probably be >= — confirm P.pcpu's declared size. */
    if(r->cpu > MAX_CPUS)
    {
        fprintf(stderr, "FATAL: cpu %d > MAX_CPUS %d.\n",
                r->cpu, MAX_CPUS);
        exit(1);
    }

    /* Detect beginning of new "epoch" while scanning thru file:
     * cpu ids within one epoch ascend, so a drop means wrap-around. */
    if((p->last_cpu_change_pid > r->cpu)
       && (p->file_offset > P.last_epoch_offset)) {
        P.last_epoch_offset = p->file_offset;
    }

    /* If that pcpu has never been activated, activate it. */
    if(!P.pcpu[r->cpu].active && P.pcpu[r->cpu].file_offset == 0)
    {
        struct pcpu_info * p2 = P.pcpu + r->cpu;

        p2->active = 1;
        if(r->cpu > P.max_active_pcpu)
            P.max_active_pcpu = r->cpu;

        fprintf(warn, "%s: Activating pcpu %d at offset %llx\n",
                __func__, r->cpu, p->file_offset);

        sched_default_vcpu_activate(p2);

        /* Taking this record as the first record should make everything
         * run swimmingly. */
        p2->ri = *ri;
        p2->ri.cpu = r->cpu;
        p2->ri.d = p2->ri.rec.u.notsc.data;
        p2->file_offset = p->file_offset;
        p2->next_cpu_change_offset = p->file_offset;
    }

    p->last_cpu_change_pid = r->cpu;

    /* If this isn't the cpu we're looking for, skip the whole bunch */
    if(p->pid != r->cpu)
    {
        p->file_offset += ri->size + r->window_size;
        p->next_cpu_change_offset = p->file_offset;

        if(p->file_offset > G.file_size) {
            activate_early_eof();
        } else if(P.early_eof && p->file_offset > P.last_epoch_offset) {
            fprintf(warn, "%s: early_eof activated, pcpu %d past last_epoch_offset %llx, deactivating.\n",
                    __func__, p->pid, P.last_epoch_offset);
            p->active = 0;
        }
    }
    else
    {
        /* Track information about dom0 scheduling and records */
        if(opt.dump_trace_volume_on_lost_record) {
            tsc_t cycles;
            struct time_struct t;

            /* Update dom0 runstates: charge the interval since the later
             * of buffer start / last runstate change to the current
             * dom0 runstate. */
            cycles = (p->volume.buffer_first_tsc > p->volume.buffer_dom0_runstate_tsc) ?
                p->volume.buffer_first_tsc :
                p->volume.buffer_dom0_runstate_tsc;
            p->volume.buffer_dom0_runstate_cycles[p->volume.buffer_dom0_runstate]
                += ri->tsc - cycles;

            printf(" - updated p%d dom0_runstate %s to %lld cycles (+%lld)\n",
                   p->pid, runstate_name[p->volume.buffer_dom0_runstate],
                   p->volume.buffer_dom0_runstate_cycles[p->volume.buffer_dom0_runstate],
                   ri->tsc - cycles);

            /* print info */
            cycles = ri->tsc - p->volume.buffer_first_tsc;
            cycles_to_time(cycles, &t);
            printf("Buffer time: %lu.%09lu (%lld cycles)\n",
                   t.s, t.ns, cycles);
            if(p->volume.buffer_size)
                printf("Rate: %lld cycles / byte\n",
                       cycles / p->volume.buffer_size);
            if(P.buffer_trace_virq_tsc)
            {
                cycles = ri->tsc - P.buffer_trace_virq_tsc;
                cycles_to_time(cycles, &t);
                printf("trace_virq latency: %lu.%09lu (%lld cycles)\n",
                       t.s, t.ns, cycles);
                P.buffer_trace_virq_tsc = 0;
            }
            else
            {
                printf("No trace_virq record found.\n");
            }
            printf("Dom0 runstates this buffer:\n");
            runstate_summary(p->volume.buffer_dom0_runstate_cycles);
            volume_summary(&p->volume.last_buffer);

            /* reset info */
            p->volume.buffer_first_tsc = 0;
            p->volume.buffer_size = r->window_size;
            runstate_clear(p->volume.buffer_dom0_runstate_cycles);
            volume_clear(&p->volume.last_buffer);
        }

        /* This window is ours: step over just the cpu_change record and
         * remember where the next window boundary will be. */
        p->file_offset += ri->size;
        p->next_cpu_change_offset = p->file_offset + r->window_size;

        if(p->next_cpu_change_offset > G.file_size)
            activate_early_eof();
        else if(p->pid == P.max_active_pcpu)
            /* Only the highest active pcpu probes ahead for brand-new
             * pcpus (see the algorithm comment above). */
            scan_for_new_pcpu(p->next_cpu_change_offset);

    }
}
ri->event, ri->extra_words); + } + + /* Find the first tsc set */ + if(ri->tsc && ri->tsc >= P.f.first_tsc) { + /* We use the order_tsc to account for the second processing of + * a lost record. */ + tsc_t tsc = p->order_tsc; + + if(P.f.first_tsc == 0) { + P.f.first_tsc = tsc; + if ( opt.interval_mode ) { + P.interval.start_tsc = tsc; + } + } else { + if ( opt.interval_mode ) { + if(P.interval.start_tsc > tsc) { + fprintf(warn, "FATAL: order_tsc %lld < interval.start_tsc %lld!\n", + tsc, P.interval.start_tsc); + exit(1); + } + while ( tsc - P.interval.start_tsc > opt.interval.cycles ) { + interval_callback(); + P.interval.start_tsc += opt.interval.cycles; + } + } + } + + P.f.last_tsc=tsc; + + P.f.total_cycles = P.f.last_tsc - P.f.first_tsc; + + P.now = tsc; + } + + abs_cycles_to_time(ri->tsc, &ri->t); + + if(opt.dump_cooked || opt.dump_all) { + if(ri->t.time) { + snprintf(ri->dump_header, + DUMP_HEADER_MAX, + "%3lu.%09lu %s ", ri->t.s, ri->t.ns, pcpu_string(ri->cpu)); + } else { + snprintf(ri->dump_header, + DUMP_HEADER_MAX, + " %s ", pcpu_string(ri->cpu)); + } + } + + { + unsigned int i, count; + for(i=0, count=0; ievt.main & (1<event, + ri->evt.main, ri->evt.sub, ri->evt.minor); + exit(1); + } + } + + switch(toplevel) { + case TRC_GEN_MAIN: + base_process(p); + break; + case TRC_SCHED_MAIN: + sched_process(p); + break; + case TRC_HVM_MAIN: + hvm_process(p); + break; + case TRC_SHADOW_MAIN: + shadow_process(p); + break; + case TRC_PV_MAIN: + pv_process(p); + break; + case TRC_MEM_MAIN: + mem_process(p); + break; + case TRC_DOM0OP_MAIN: + default: + process_generic(ri); + } + + UPDATE_VOLUME(p, toplevel[toplevel], ri->size); + + if(!p->volume.buffer_first_tsc) + p->volume.buffer_first_tsc = ri->tsc; + + out: + /* Lost records gets processed twice */ + if(ri->event != TRC_LOST_RECORDS) + p->file_offset += ri->size; +} + +static inline ssize_t get_rec_size(struct trace_record *rec) { + ssize_t s; + + s = sizeof(uint32_t); + + if(rec->cycle_flag) + s += sizeof(tsc_t); 
+ + s += rec->extra_words * sizeof(uint32_t); + + return s; +} + +#define STDIN 0 + +void progress_child_exec(void) { + fclose(stdin); + dup2(G.progress.pipe[0], STDIN); + + execlp("zenity", "zenity", "--progress", "--auto-close", "--title", + "Analyzing", "--text", G.trace_file, "--auto-kill", NULL); +} + +void progress_init(void) { + int pid; + + pipe(G.progress.pipe); + + if(!(pid = fork())) { + progress_child_exec(); + + fprintf(stderr, "%s: exec failed (%s), disabling progress bar\n", + __func__, strerror(errno)); + opt.progress = 0; + exit(1); + } else if( pid < 0 ) { + fprintf(stderr, "%s: could not fork: %s, disabling progress bar\n", + __func__, strerror(errno)); + opt.progress = 0; + } + + if( (G.progress.out = fdopen(G.progress.pipe[1], "w")) < 0 ) { + fprintf(stderr, "%s: could not fdopen pipe: %s, disabling progress bar\n", + __func__, strerror(errno)); + opt.progress = 0; + } + +} + +void progress_update(loff_t offset) { + long long p; + + p = ( offset * 100 ) / G.file_size; + + fprintf(G.progress.out, "%lld\n", p); + fflush(G.progress.out); + + p += 1; + + G.progress.update_offset = ( G.file_size * p ) / 100; + +#if 0 + fprintf(stderr, "Progress: %lld %% Next update_offset: %lld\n", + p-1, + G.progress.update_offset); +#endif +} + +void progress_finish(void) { + int pid; + + fprintf(G.progress.out, "100\n"); + fflush(G.progress.out); + fclose(G.progress.out); + + wait(NULL); + + if(!(pid = fork())) { + /* Child */ + char text[128]; + + snprintf(text, 128, "Finished analyzing %s", + G.trace_file); + execlp("zenity", "zenity", "--info", "--text", text, NULL); + } +} + +ssize_t __read_record(int fd, struct trace_record *rec, loff_t offset) +{ + ssize_t r, rsize; + + r=pread64(G.fd, rec, sizeof(*rec), offset); + + if(r < 0) { + /* Read error */ + perror("read"); + fprintf(stderr, "offset %llx\n", (unsigned long long)offset); + return 0; + } else if(r==0) { + /* End-of-file */ + return 0; + } else if(r < sizeof(uint32_t)) { + /* Full header not read */ + 
fprintf(stderr, "%s: short read (%d bytes)\n", + __func__, r); + exit(1); + } + + rsize=get_rec_size(rec); + + if(r < rsize) { + /* Full record not read */ + fprintf(stderr, "%s: short read (%d, expected %d)\n", + __func__, r, rsize); + return 0; + } + + return rsize; +} + +void __fill_in_record_info(struct pcpu_info *p) +{ + struct record_info *ri; + tsc_t tsc=0; + + ri = &p->ri; + + ri->event = ri->rec.event; + ri->extra_words = ri->rec.extra_words; + + if(ri->rec.cycle_flag) { + tsc = (((tsc_t)ri->rec.u.tsc.tsc_hi) << 32) + | ri->rec.u.tsc.tsc_lo; + + tsc += p->tsc_skew.offset; + + ri->tsc = tsc; + ri->d = ri->rec.u.tsc.data; + + if(p->first_tsc == 0) + p->first_tsc = tsc; + + /* We process lost record twice: once at the first_tsc, + once at the time it was placed in the log */ + if(ri->event == TRC_LOST_RECORDS && ri->extra_words == 4) { + struct lost_record_struct *r = (typeof(r))ri->d; + p->order_tsc = r->first_tsc + p->tsc_skew.offset; + } else + p->order_tsc = tsc; + + p->last_tsc = tsc; + } else { + ri->tsc = p->last_tsc; + ri->d = ri->rec.u.notsc.data; + } + + if ( opt.dump_raw_reads ) { + int i; + printf("R p%2d o%016llx %8lx %d ", + p->pid, p->file_offset, + (unsigned long)ri->rec.event, ri->rec.extra_words); + + if(ri->rec.cycle_flag) + printf("t%016llx [ ", tsc); + else + printf(" [ "); + + for(i=0; irec.extra_words; i++) + printf("%x ", + ri->d[i]); + printf("]\n"); + } + + /* Updated tracing uses CPU_CHANGE. If we hit one of these, + * it will process very next (since the tsc isn't updated), and + * we'll skip forward appropriately. 
*/ + ri->cpu = p->pid; +} + +ssize_t read_record(int fd, struct pcpu_info * p) { + loff_t * offset; + struct record_info *ri; + + offset = &p->file_offset; + ri = &p->ri; + + ri->size = __read_record(fd, &ri->rec, *offset); + if(ri->size) + { + __fill_in_record_info(p); + } + else + { + fprintf(warn, "%s: read returned zero, deactivating pcpu %d\n", + __func__, p->pid); + p->active = 0; + } + + return ri->size; +} + +/* WARNING not thread-safe */ +char * pcpu_string(int pcpu) +{ + static char s[MAX_CPUS+1] = { 0 }; + static int max_active_pcpu=-1, last_pcpu=-1; + + assert(P.max_active_pcpu < MAX_CPUS); + assert(pcpu <= P.max_active_pcpu); + + if(P.max_active_pcpu > max_active_pcpu) + { + int i; + for(i=max_active_pcpu + 1; i<= P.max_active_pcpu; i++) { + if(P.pcpu[i].lost_record.active) + s[i]='l'; + else + s[i]='-'; + } + } + + if(last_pcpu >= 0) + s[last_pcpu]='-'; + + s[pcpu]='x'; + last_pcpu = pcpu; + + return s; +} + +struct pcpu_info * choose_next_record(void) +{ + int i, max_active_pcpu = -1; + struct pcpu_info * p, *min_p=NULL; + loff_t min_offset = 0; + + for(i=0; i<=P.max_active_pcpu; i++) + { + p = P.pcpu+i; + if(!p->active) + continue; + + max_active_pcpu = i; + if(!min_p || p->order_tsc < min_p->order_tsc) + min_p = p; + + if(!min_offset || p->file_offset < min_offset) + min_offset = p->file_offset; + } + + P.max_active_pcpu = max_active_pcpu; + + if(opt.progress && min_offset >= G.progress.update_offset) + progress_update(min_offset); + + /* If there are active pcpus, make sure we chose one */ + assert(min_p || (max_active_pcpu==-1)); + + return min_p; +} + +void process_records(void) { + while(1) { + struct pcpu_info *p = NULL; + + if(!(p=choose_next_record())) + return; + + process_record(p); + + /* Lost records gets processed twice. 
*/ + if(p->ri.event == TRC_LOST_RECORDS) { + p->ri.event = TRC_LOST_RECORDS_END; + if(p->ri.tsc > p->order_tsc) + p->order_tsc = p->ri.tsc; + else { + fprintf(warn, "Strange, lost_record ri->tsc %lld !> p->order_tsc %lld!\n", + p->ri.tsc, p->order_tsc); + exit(1); + } + } + else + read_record(G.fd, p); + + } +} + +void vcpu_summary(struct vcpu_data *v) +{ + printf("-- v%d --\n", v->vid); + sched_summary_vcpu(v); + switch(v->data_type) { + case VCPU_DATA_HVM: + hvm_summary(&v->hvm); + break; + case VCPU_DATA_PV: + pv_summary(&v->pv); + break; + default: + break; + } +} + +void domain_summary(void) +{ + struct domain_data * d; + int i; + + if(opt.show_default_domain_summary) { + d = &default_domain; + printf("|-- Default domain --|\n"); + + for( i = 0; i < MAX_CPUS ; i++ ) + { + if(d->vcpu[i]) + vcpu_summary(d->vcpu[i]); + } + } + + for ( d = domain_list ; d ; d=d->next ) + { + int i; + printf("|-- Domain %d --|\n", d->did); + + sched_summary_domain(d); + + mem_summary_domain(d); + + for( i = 0; i < MAX_CPUS ; i++ ) + { + if(d->vcpu[i]) + vcpu_summary(d->vcpu[i]); + } + + printf("Emulate eip list\n"); + dump_eip(d->emulate_eip_list); + + cr3_dump_list(d->cr3_value_head); + } +} + +char * stringify_cpu_hz(long long cpu_hz); + +void summary(void) { + int i; + printf("Total time: %.2lf seconds (using cpu speed %s)\n", + ((double)(P.f.total_cycles))/opt.cpu_hz, + stringify_cpu_hz(opt.cpu_hz)); + printf("--- Log volume summary ---\n"); + for(i=0; isummary) + continue; + printf(" - cpu %d -\n", i); + volume_summary(&p->volume.total); + } + domain_summary(); +} + +void init_pcpus(void) { + int i=0; + loff_t offset = 0; + + for(i=0; i GHZ) { + hz = (float)cpu_hz / GHZ; + suffix = 'G'; + } else if(cpu_hz > MHZ) { + hz = (float)cpu_hz / MHZ; + suffix = 'M'; + } else if(cpu_hz > KHZ) { + hz = (float)cpu_hz / KHZ; + suffix = 'k'; + } else { + hz = cpu_hz; + suffix = ' '; + } + + snprintf(cpu_string, 20, "%1.2lf %cHz", hz, suffix); + + return cpu_string; +} + +int 
parse_cr3_array(char *arg) { + char *p, *q; + int n=1, i; + + /* Count the number of commas (and thus the number of elements) */ + for(p=arg; *p; p++) + if(*p == ',') + n++; + + opt.interval.cr3.count = n; + opt.interval.cr3.values = malloc(n * sizeof(unsigned long long)); + P.interval.cr3.array = malloc(n * sizeof(struct cr3_value_struct *)); + + if(!opt.interval.cr3.values || !P.interval.cr3.array) { + fprintf(stderr, "Malloc failed!\n"); + exit(1); + } + + bzero(P.interval.cr3.array, n*sizeof(struct cr3_value_struct *)); + + /* Now parse the elements */ + p = q = arg; + for(i=0; iarg_num == 0) + G.trace_file = arg; + else + argp_usage(state); + } + break; + case ARGP_KEY_END: + { + if(opt.interval_mode) { + opt.interval.cycles = ( opt.interval.msec * opt.cpu_hz ) / 1000 ; + interval_header(); + } + + if(!G.output_defined) + { + fprintf(stderr, "No output defined, using summary.\n"); + opt.summary = 1; + opt.summary_info = 1; + } + fprintf(stderr, "Using %s hardware-assisted virtualization.\n", + opt.svm_mode?"SVM":"VMX"); + } + break; + + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp_option cmd_opts[] = { + /* Dump group */ + { .name = "dump-cooked", + .key = OPT_DUMP_COOKED, + .group = OPT_GROUP_DUMP, + .doc = "Dump a sanitized summary of vmexit/vmentry.", }, + + { .name = "dump-all", + .key = OPT_DUMP_ALL, + .group = OPT_GROUP_DUMP, + .doc = "Dump all records as they come in.", }, + + { .name = "dump-raw-reads", + .key = OPT_DUMP_RAW_READS, + .group = OPT_GROUP_DUMP, + .doc = "Dump raw data as it's read from disk. Useful mainly for debugging the analysis tool.", }, + + { .name = "dump-no-processing", + .key = OPT_DUMP_NO_PROCESSING, + .group = OPT_GROUP_DUMP, + .doc = "Don't do any processing on records other than cpu changes. Implies dump-raw-reads (or you wouldn't get anything).", }, + + { .name = "dump-raw-process", + .key = OPT_DUMP_RAW_PROCESS, + .group = OPT_GROUP_DUMP, + .doc = "Dump raw data as it's processed. 
Useful mainly for debugging the analysis tool.", }, + + { .name = "dump-ipi-latency", + .key = OPT_DUMP_IPI_LATENCY, + .group = OPT_GROUP_DUMP, + .doc = "Dump IPI latency info as IPIs are delivered (vector 0xd1 only).", }, + + { .name = "dump-trace-volume-on-lost-record", + .key = OPT_DUMP_TRACE_VOLUME_ON_LOST_RECORD, + .group = OPT_GROUP_DUMP, + .doc = "Dump the volume of trace types in the previous cpu buffer when a lost record is created.", }, + + /* Extra processing group */ + { .name = "with-cr3-enumeration", + .key = OPT_WITH_CR3_ENUMERATION, + .group = OPT_GROUP_EXTRA, + .doc = "Keep track of per-cr3 values", }, + + { .name = "with-pio-enumeration", + .key = OPT_WITH_PIO_ENUMERATION, + .group = OPT_GROUP_EXTRA, + .doc = "Report summary info on indiviaul IO addresses", }, + + { .name = "scatterplot-interrupt-eip", + .key = OPT_SCATTERPLOT_INTERRUPT_EIP, + .arg = "vector", + .group = OPT_GROUP_EXTRA, + .doc = "Output scatterplot of eips as a function of time.", }, + + { .name = "scatterplot-cpi", + .key = OPT_SCATTERPLOT_CPI, + .group = OPT_GROUP_EXTRA, + .doc = "Output scatterplot of cpi.", }, + + { .name = "scatterplot-unpin-promote", + .key = OPT_SCATTERPLOT_UNPIN_PROMOTE, + .group = OPT_GROUP_EXTRA, + .doc = "Output scatterplot of unpins and promotions. 
If " \ + "--with-cr3-enumeration is included, promotions include current cr3.", }, + + { .name = "scatterplot-cr3-switch", + .key = OPT_SCATTERPLOT_CR3_SWITCH, + .group = OPT_GROUP_EXTRA, + .doc = "Output scatterplot of cr3 switches.", }, + + { .name = "scatterplot-wake-to-halt", + .key = OPT_SCATTERPLOT_WAKE_TO_HALT, + .group = OPT_GROUP_EXTRA, + .doc = "Output scatterplot of wake-to-halt.", }, + + { .name = "scatterplot-io", + .key = OPT_SCATTERPLOT_IO, + .arg = "port", + .group = OPT_GROUP_EXTRA, + .doc = "Output scatterplot of io latencies for givein address as a function of time.", }, + + { .name = "histogram-interrupt-eip", + .key = OPT_HISTOGRAM_INTERRUPT_EIP, + .arg = "vector[,increment]", + .group = OPT_GROUP_EXTRA, + .doc = "Output histograms of eips.", }, + + { .name = "interval", + .key = OPT_INTERVAL_LENGTH, + .arg = "sec", + .group = OPT_GROUP_INTERVAL, + .doc = "Interval length to do time-based graphs, in seconds", }, + + { .name = "interval-cr3-schedule-time", + .key = OPT_INTERVAL_CR3_SCHEDULE_TIME, + .arg = "gmfn[,gmfn...]", + .group = OPT_GROUP_INTERVAL, + .doc = "Print a csv with the listed cr3 value(s) every interval.", }, + + { .name = "interval-cr3-schedule-time-all", + .key = OPT_INTERVAL_CR3_SCHEDULE_TIME_ALL, + .group = OPT_GROUP_INTERVAL, + .doc = "Print a csv with all cr3 values every interval.", }, + + { .name = "interval-cr3-schedule-ordered", + .key = OPT_INTERVAL_CR3_SCHEDULE_ORDERED, + .group = OPT_GROUP_INTERVAL, + .doc = "Print summary with the top 10 cr3 values every interval.", }, + + { .name = "interval-cr3-short-summary", + .key = OPT_INTERVAL_CR3_SHORT_SUMMARY, + .arg = "gmfn", + .group = OPT_GROUP_INTERVAL, + .doc = "Print a csv with the hvm short summary of cr3 value every interval.", }, + + { .name = "interval-domain-short-summary", + .key = OPT_INTERVAL_DOMAIN_SHORT_SUMMARY, + .arg = "domain-id", + .group = OPT_GROUP_INTERVAL, + .doc = "Print a csv with the hvm short summary of given domain every interval.", }, + + { 
.name = "interval-domain-guest-interrupt", + .key = OPT_INTERVAL_DOMAIN_GUEST_INTERRUPT, + .arg = "domain-id", + .group = OPT_GROUP_INTERVAL, + .doc = "Print a csv with the guest interrupt count of given domain every interval.", }, + + { .name = "interval-domain-grant-maps", + .key = OPT_INTERVAL_DOMAIN_GRANT_MAPS, + .arg = "domain-id", + .group = OPT_GROUP_INTERVAL, + .doc = "Print a csv with the grant maps done on behalf of a given domain every interval.", }, + + /* Summary group */ + { .name = "show-default-domain-summary", + .key = OPT_SHOW_DEFAULT_DOMAIN_SUMMARY, + .group = OPT_GROUP_SUMMARY, + .doc = "Show default domain information on summary", }, + + { .name = "sample-size", + .key = OPT_SAMPLE_SIZE, + .arg = "size", + .group = OPT_GROUP_SUMMARY, + .doc = "Keep [size] samples for percentile purposes. Enter 0 to " \ + "disable. Default 10240.", }, + + { .name = "summary", + .key = OPT_SUMMARY, + .group = OPT_GROUP_SUMMARY, + .doc = "Output a summary", }, + + /* Guest info */ + { .name = "default-guest-paging-levels", + .key = OPT_DEFAULT_GUEST_PAGING_LEVELS, + .group = OPT_GROUP_GUEST, + .arg = "L", + .doc = "Default guest paging levels. Mainly necessary for Rio, as Miami traces include guest paging levels where appropriate.", }, + + { .name = "symbol-file", + .key = OPT_SYMBOL_FILE, + .group = OPT_GROUP_GUEST, + .arg = "filename", + .doc = "A symbol file for interpreting guest eips.", }, + + /* Hardware info */ + { .name = "cpu-hz", + .key = OPT_CPU_HZ, + .group = OPT_GROUP_HARDWARE, + .arg = "HZ", + .doc = "Cpu speed of the tracing host, used to convert tsc into seconds.", }, + + { .name = "svm-mode", + .key = OPT_SVM_MODE, + .group = OPT_GROUP_HARDWARE, + .doc = "Assume AMD SVM-style vmexit error codes. (Default is Intel VMX.)", }, + + { .name = "progress", + .key = OPT_PROGRESS, + .doc = "Progress dialog. 
Requires the zenity (GTK+) executable.", }, + + { 0 }, +}; + +const struct argp parser_def = { + .options = cmd_opts, + .parser = cmd_parser, + .args_doc = "[trace file]", + .doc = "", +}; + +const char *argp_program_version = "xenalyze - XenServer Trunk"; +const char *argp_program_bug_address = "George Dunlap "; + + +int main(int argc, char *argv[]) { + + argp_parse(&parser_def, argc, argv, 0, NULL, NULL); + + if (G.trace_file == NULL) + exit(1); + + if ( (G.fd = open(G.trace_file, O_RDONLY|O_LARGEFILE)) < 0) { + perror("open"); + exit(1); + } else { + struct stat s; + fstat(G.fd, &s); + G.file_size = s.st_size; + } + if (G.symbol_file != NULL) + parse_symbol_file(G.symbol_file); + + if(opt.dump_cooked || opt.dump_all) + warn = stdout; + else + warn = stderr; + + init_pcpus(); + + if(opt.progress) + progress_init(); + + process_records(); + + if(opt.interval_mode) + interval_tail(); + + if(opt.summary) + summary(); + + if(opt.progress) + progress_finish(); + + return 0; +} +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/analyze.h b/analyze.h new file mode 100644 index 0000000..a2d33c4 --- /dev/null +++ b/analyze.h @@ -0,0 +1,36 @@ +#ifndef __ANALYZE_H +# define __ANALYZE_H +#define TRC_GEN_MAIN 0 +#define TRC_SCHED_MAIN 1 +#define TRC_DOM0OP_MAIN 2 +#define TRC_HVM_MAIN 3 +#define TRC_MEM_MAIN 4 +#define TRC_PV_MAIN 5 +#define TRC_SHADOW_MAIN 6 + +#define TRC_LOST_RECORDS_END (TRC_GEN + 50) + +enum { + TRCE_SFLAG_SET_AD, + TRCE_SFLAG_SET_A, + TRCE_SFLAG_SHADOW_L1_GET_REF, + TRCE_SFLAG_SHADOW_L1_PUT_REF, + TRCE_SFLAG_L2_PROPAGATE, + TRCE_SFLAG_SET_CHANGED, + TRCE_SFLAG_SET_FLUSH, + TRCE_SFLAG_SET_ERROR, + TRCE_SFLAG_DEMOTE, + TRCE_SFLAG_PROMOTE, + TRCE_SFLAG_WRMAP, + TRCE_SFLAG_WRMAP_GUESS_FOUND, + TRCE_SFLAG_WRMAP_BRUTE_FORCE, + TRCE_SFLAG_EARLY_UNSHADOW, + TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN, + TRCE_SFLAG_EMULATION_LAST_FAILED, + TRCE_SFLAG_EMULATE_FULL_PT, + 
TRCE_SFLAG_PREALLOC_UNPIN, + TRCE_SFLAG_PREALLOC_UNHOOK +}; + +#define TRC_HVM_OP_DESTROY_PROC (TRC_HVM_HANDLER + 0x100) +#endif diff --git a/code-intro.txt b/code-intro.txt new file mode 100644 index 0000000..991a2e4 --- /dev/null +++ b/code-intro.txt @@ -0,0 +1,75 @@ +Analyze is designed to gather useful data from traces, mostly for tracking +down performance issues in hvm domains. + +There are two key things that it looks at: vmexits and runstate changes. + +For HVM domains, we take a trace with a timestamp (i.e. rdtsc) at +every vmexit and vmenter. We also take a trace in between about what +the vmexit was about: why it happened, and what Xen did as a result. +This allows us to track how much time we spent inside Xen vs inside +the guest, and for what reason. + +The key data structure for storing this kind of "summary" information +is struct event_cycle_summary. It contains a total count, total +number of cycles, and a pseudo-random sample of cycles. Each vmexit +gets classified into a certain number of non-exclusive buckets, with +one summary struct for each classification. For example, we may take +a page fault for which Xen promotes a page to being a shadow and fills +the entry in, and it takes 8995 cycles. This sample gets added into +the following event_cycle_summary structs: + * EXNMI vmexit (the vmexit that a page fault causes) + * page_fault + * shadow fixup + * shadow fixup promotion + +When the tracing is finished, we report the count, total time, average +number of cycles per exit, and 5th, 50th, and 95th percentiles of +vmexits from the sample. + +Runstate changes allow us to tell which domain and vcpu to attribute +to a certain vmexit. It also allows us to see how often a domain was +in what state -- running, waiting for a cpu, blocked, &c. We can +collect similar kinds of samples for runstate changes as for vmexits: +i.e., how long after waking up did it take, on average, to get the +scheduler? And so on. 
+ +There are two things that make this complicated. First is that the +files are generally so huge that we can't really afford to load the +data into memory; we can only afford to pass through the file once. +The second is that we are working in a multi-cpu world, where several +things happen more or less concurrently -- however, the traces are +written as "clumps" into the trace file by pcpu, and are therefore not +in the file in the exact order that they were put into the trace buffer. + +Our first order of business, then, is to attempt to re-construct the +order of traces as they were generated, and process them in that +order. This is the job of process_records(). We keep per host +processor a pointer in the file to the next record from that processor +to be processed. We then look through each one, determine the next +one to process (usually based on the tsc of the record), and call +process_record(). Then, we search for the next record of that +processor and write it into the struct, repeating until the file is +empty. + +Process_record looks at the record, and classifies it according to its +contents. The processor struct also contains a pointer to "current", +a record of what domain is currently running on the pcpu. We +determine this using the runstate change traces. + +Because we can only make one pass, we must build information about +vmexits as we go. The hvm data struct in the vcpu has "in-flight" +information about the current vmexit which it fills in as it goes. We +don't know everything about a vmexit until the next vmenter* -- it's +not until the vmenter that we know how many cycles this vmexit took. + +* I said vmenter here for simplicity, but it's really the vmexit + "close". The close is either on the next vmenter, or when the vcpu + is scheduled out. + +One of the things it fills in is a pointer to a function to call when +the vmexit is over (either on vmenter, or when it gets preempted by +the scheduler). 
That way we can act on all the information. An +example of this is when we keep lists of eips at which emulations +happened, and we keep summary information on the individual eips in +the list. + diff --git a/dump-raw.c b/dump-raw.c new file mode 100644 index 0000000..00bc26b --- /dev/null +++ b/dump-raw.c @@ -0,0 +1,211 @@ +#define _XOPEN_SOURCE 600 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +typedef unsigned long long tsc_t; + +struct { + int fd; + int cur_cpu; + char * trace_file; +} G = { + .fd=-1, + .cur_cpu = -1, + .trace_file = NULL, +}; + +/* -- on-disk trace buffer definitions -- */ +struct trace_record { + unsigned event:28, + extra_words:3, + cycle_flag:1; + union { + struct { + uint32_t tsc_lo, tsc_hi; + uint32_t data[7]; + } tsc; + struct { + uint32_t data[7]; + } notsc; + } u; +}; + +struct record_info { + int cpu; + tsc_t tsc; + union { + unsigned event; + struct { + unsigned minor:12, + sub:4, + main:12, + unused:4; + } evt; + }; + int extra_words; + int size; + uint32_t *d; + + struct trace_record rec; +}; + +struct data { + loff_t file_offset, next_cpu_change_offset; + int pid; + struct record_info ri; +}; + +static inline ssize_t get_rec_size(struct trace_record *rec) { + ssize_t s; + + s = sizeof(uint32_t); + + if(rec->cycle_flag) + s += sizeof(tsc_t); + + s += rec->extra_words * sizeof(uint32_t); + + return s; +} + +ssize_t __read_record(int fd, struct trace_record *rec, loff_t offset) +{ + ssize_t r, rsize; + + r=pread64(G.fd, rec, sizeof(*rec), offset); + + if(r < 0) { + /* Read error */ + perror("read"); + fprintf(stderr, "offset %llx\n", (unsigned long long)offset); + return 0; + } else if(r==0) { + /* End-of-file */ + return 0; + } else if(r < sizeof(uint32_t)) { + /* Full header not read */ + fprintf(stderr, "%s: short read (%d bytes)\n", + __func__, r); + exit(1); + } + + rsize=get_rec_size(rec); + + if(r < rsize) { + /* Full record not read */ + 
fprintf(stderr, "%s: short read (%d, expected %d)\n", + __func__, r, rsize); + return 0; + } + + return rsize; +} + +void __fill_in_record_info(struct record_info *ri) +{ + tsc_t tsc=0; + + ri->event = ri->rec.event; + ri->extra_words = ri->rec.extra_words; + + if(ri->rec.cycle_flag) { + tsc = (((tsc_t)ri->rec.u.tsc.tsc_hi) << 32) + | ri->rec.u.tsc.tsc_lo; + + ri->tsc = tsc; + ri->d = ri->rec.u.tsc.data; + } else { + ri->tsc = 0; + ri->d = ri->rec.u.notsc.data; + } +} + +struct cpu_change_data { + int cpu; + unsigned window_size; +}; + +void process_cpu_change(struct data *p) { + struct record_info *ri = &p->ri; + struct cpu_change_data *r = (typeof(r))ri->d; + + printf(" cpu_change this-cpu %u record-cpu %u window_size %u(0x%08x)\n", + p->pid, r->cpu, r->window_size, + r->window_size); + + /* File sanity check */ + if(p->file_offset != p->next_cpu_change_offset) { + printf("Strange, pcpu %d expected offet %llx, actual %llx!\n", + p->pid, p->next_cpu_change_offset, p->file_offset); + } + + p->next_cpu_change_offset = p->file_offset + ri->size + r->window_size; + + p->pid = r->cpu; +} + +void print_raw_record(struct data *p) { + struct record_info *ri = &p->ri; + int i; + + printf("R p%2d o%016llx %8lx %d ", + p->pid, p->file_offset, + (unsigned long)ri->rec.event, ri->rec.extra_words); + + if(ri->rec.cycle_flag) + printf("t%016llx [ ", ri->tsc); + else + printf(" [ "); + + for(i=0; irec.extra_words; i++) + printf("%x ", + ri->d[i]); + printf("]\n"); +} + +void read_records(int fd) { + struct data p = { + .file_offset = 0, + .next_cpu_change_offset = 0, + .pid = -1, + .ri = { 0 } + }; + struct record_info *ri=&p.ri; + + do { + ri->size = __read_record(fd, &ri->rec, p.file_offset); + + if(ri->size) + { + __fill_in_record_info(ri); + + print_raw_record(&p); + + if(ri->event == TRC_TRACE_CPU_CHANGE) + process_cpu_change(&p); + + p.file_offset += ri->size; + } + } while ( ri->size ); +} + +int main(int argc, char * argv[]) { + if(argc != 2) { + fprintf(stderr, "Usage: 
%s tracefile\n", argv[0]); + exit(1); + } + + G.fd = open(argv[1], O_RDONLY|O_LARGEFILE); + + read_records(G.fd); +} diff --git a/trace.h b/trace.h new file mode 100644 index 0000000..6f04260 --- /dev/null +++ b/trace.h @@ -0,0 +1,182 @@ +/****************************************************************************** + * include/public/trace.h + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Mark Williamson, (C) 2004 Intel Research Cambridge + * Copyright (C) 2005 Bin Ren + */ + +#ifndef __XEN_PUBLIC_TRACE_H__ +#define __XEN_PUBLIC_TRACE_H__ + +#define TRACE_EXTRA_MAX 7 +#define TRACE_EXTRA_SHIFT 28 + +/* Trace classes */ +#define TRC_CLS_SHIFT 16 +#define TRC_GEN 0x0001f000 /* General trace */ +#define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */ +#define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */ +#define TRC_HVM 0x0008f000 /* Xen HVM trace */ +#define TRC_MEM 0x0010f000 /* Xen memory trace */ +#define TRC_PV 0x0020f000 /* Xen PV traces */ +#define TRC_SHADOW 0x0040f000 /* Xen shadow tracing */ +#define TRC_ALL 0x0ffff000 +#define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff) +#define TRC_HD_CYCLE_FLAG (1UL<<31) +#define TRC_HD_INCLUDES_CYCLE_COUNT(x) ( !!( (x) & TRC_HD_CYCLE_FLAG ) ) +#define TRC_HD_EXTRA(x) (((x)>>TRACE_EXTRA_SHIFT)&TRACE_EXTRA_MAX) + +/* Trace subclasses */ +#define TRC_SUBCLS_SHIFT 12 + +/* trace subclasses for SVM */ +#define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */ +#define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */ + +/* Trace events per class */ +#define TRC_LOST_RECORDS (TRC_GEN + 1) +#define TRC_TRACE_WRAP_BUFFER (TRC_GEN + 2) +#define TRC_TRACE_CPU_CHANGE (TRC_GEN + 3) +#define TRC_TRACE_VIRQ (TRC_GEN + 4) + +#define TRC_SCHED_MIN 0x0081000 + +#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1) +#define TRC_SCHED_DOM_REM (TRC_SCHED + 2) +#define TRC_SCHED_SLEEP (TRC_SCHED + 3) +#define TRC_SCHED_WAKE (TRC_SCHED + 4) +#define TRC_SCHED_YIELD (TRC_SCHED + 5) +#define TRC_SCHED_BLOCK (TRC_SCHED + 6) +#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7) +#define TRC_SCHED_CTL (TRC_SCHED + 8) +#define TRC_SCHED_ADJDOM (TRC_SCHED + 9) +#define TRC_SCHED_SWITCH (TRC_SCHED + 10) +#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11) +#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12) +#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13) +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14) +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 
15) +#define TRC_SCHED_SHUTDOWN_CODE (TRC_SCHED + 16) + +#define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1) +#define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2) +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3) + +#define TRC_PV_HYPERCALL (TRC_PV + 1) +#define TRC_PV_TRAP (TRC_PV + 3) +#define TRC_PV_PAGE_FAULT (TRC_PV + 4) +#define TRC_PV_FORCED_INVALID_OP (TRC_PV + 5) +#define TRC_PV_EMULATE_PRIVOP (TRC_PV + 6) +#define TRC_PV_EMULATE_4GB (TRC_PV + 7) +#define TRC_PV_MATH_STATE_RESTORE (TRC_PV + 8) +#define TRC_PV_PAGING_FIXUP (TRC_PV + 9) +#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10) +#define TRC_PV_PTWR_EMULATION (TRC_PV + 11) +#define TRC_PV_PTWR_EMULATION_PAE (TRC_PV + 12) + /* Indicates that addresses in trace record are 64 bits */ +#define TRC_PV_64_FLAG (0x100) + +#define TRC_SHADOW_NOT_SHADOW (TRC_SHADOW + 1) +#define TRC_SHADOW_FAST_PROPAGATE (TRC_SHADOW + 2) +#define TRC_SHADOW_FAST_MMIO (TRC_SHADOW + 3) +#define TRC_SHADOW_FALSE_FAST_PATH (TRC_SHADOW + 4) +#define TRC_SHADOW_MMIO (TRC_SHADOW + 5) +#define TRC_SHADOW_FIXUP (TRC_SHADOW + 6) +#define TRC_SHADOW_DOMF_DYING (TRC_SHADOW + 7) +#define TRC_SHADOW_EMULATE (TRC_SHADOW + 8) +#define TRC_SHADOW_EMULATE_UNSHADOW_USER (TRC_SHADOW + 9) +#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ (TRC_SHADOW + 10) +#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11) +#define TRC_SHADOW_EMULATE_UNSHADOW_HEURISTIC (TRC_SHADOW + 12) +#define TRC_SHADOW_PREALLOC_UNPIN (TRC_SHADOW + 13) +#define TRC_SHADOW_RESYNC_FULL (TRC_SHADOW + 24) +#define TRC_SHADOW_RESYNC_ONLY (TRC_SHADOW + 25) + +/* trace events per subclass */ +#define TRC_HVM_VMENTRY (TRC_HVM_ENTRYEXIT + 0x01) +#define TRC_HVM_VMEXIT (TRC_HVM_ENTRYEXIT + 0x02) +#define TRC_HVM_VMEXIT64 (TRC_HVM_ENTRYEXIT + 0x03) +#define TRC_HVM_PF_XEN (TRC_HVM_HANDLER + 0x01) +#define TRC_HVM_PF_INJECT (TRC_HVM_HANDLER + 0x02) +#define TRC_HVM_INJ_EXC (TRC_HVM_HANDLER + 0x03) +#define TRC_HVM_INJ_VIRQ (TRC_HVM_HANDLER + 0x04) +#define TRC_HVM_REINJ_VIRQ (TRC_HVM_HANDLER 
+ 0x05) +#define TRC_HVM_IO_READ (TRC_HVM_HANDLER + 0x06) +#define TRC_HVM_IO_WRITE (TRC_HVM_HANDLER + 0x07) +#define TRC_HVM_CR_READ (TRC_HVM_HANDLER + 0x08) +#define TRC_HVM_CR_WRITE (TRC_HVM_HANDLER + 0x09) +#define TRC_HVM_DR_READ (TRC_HVM_HANDLER + 0x0A) +#define TRC_HVM_DR_WRITE (TRC_HVM_HANDLER + 0x0B) +#define TRC_HVM_MSR_READ (TRC_HVM_HANDLER + 0x0C) +#define TRC_HVM_MSR_WRITE (TRC_HVM_HANDLER + 0x0D) +#define TRC_HVM_CPUID (TRC_HVM_HANDLER + 0x0E) +#define TRC_HVM_INTR (TRC_HVM_HANDLER + 0x0F) +#define TRC_HVM_NMI (TRC_HVM_HANDLER + 0x10) +#define TRC_HVM_SMI (TRC_HVM_HANDLER + 0x11) +#define TRC_HVM_VMMCALL (TRC_HVM_HANDLER + 0x12) +#define TRC_HVM_HLT (TRC_HVM_HANDLER + 0x13) +#define TRC_HVM_INVLPG (TRC_HVM_HANDLER + 0x14) +#define TRC_HVM_MCE (TRC_HVM_HANDLER + 0x15) +#define TRC_HVM_IO_ASSIST (TRC_HVM_HANDLER + 0x16) +#define TRC_HVM_MMIO_ASSIST (TRC_HVM_HANDLER + 0x17) +#define TRC_HVM_CLTS (TRC_HVM_HANDLER + 0x18) +#define TRC_HVM_LMSW (TRC_HVM_HANDLER + 0x19) +#define TRC_HVM_PF_XEN64 (TRC_HVM_HANDLER + 0x20) +#define TRC_HVM_OP_DESTROY_PROC (TRC_HVM_HANDLER + 0x100) + + +/* This structure represents a single trace buffer record. */ +struct t_rec { + uint32_t event:28; + uint32_t extra_u32:3; /* # entries in trailing extra_u32[] array */ + uint32_t cycles_included:1; /* u.cycles or u.no_cycles? */ + union { + struct { + uint32_t cycles_lo, cycles_hi; /* cycle counter timestamp */ + uint32_t extra_u32[7]; /* event data items */ + } cycles; + struct { + uint32_t extra_u32[7]; /* event data items */ + } nocycles; + } u; +}; + +/* + * This structure contains the metadata for a single trace buffer. The head + * field, indexes into an array of struct t_rec's. + */ +struct t_buf { + uint32_t cons; /* Offset of next item to be consumed by control tools. */ + uint32_t prod; /* Offset of next item to be produced by Xen. */ + /* Records follow immediately after the meta-data header. 
*/ +}; + +#endif /* __XEN_PUBLIC_TRACE_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */