direct-io.hg
changeset 2103:47f04385bb81
bitkeeper revision 1.1158 (41137071d88TgpVO07tfpWOGGvxo8A)
Dynamic binary rewriting of -ve segment accesses is now enabled by
default in Linux 2.6. Removes the approx 100-percent overhead incurred
by pure emulation on library-intensive benchmarks.
Dynamic binary rewriting of negative-offset ("-ve") segment accesses is
now enabled by default in Linux 2.6. This removes the approximately 100%
overhead that pure emulation incurred on library-intensive benchmarks.
author | kaf24@scramble.cl.cam.ac.uk |
---|---|
date | Fri Aug 06 11:50:09 2004 +0000 (2004-08-06) |
parents | 1d27d2477ab7 |
children | 4c3d4467e243 |
files | linux-2.6.7-xen-sparse/arch/xen/i386/kernel/setup.c linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c |
line diff
1.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/setup.c Fri Aug 06 10:53:33 2004 +0000 1.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/setup.c Fri Aug 06 11:50:09 2004 +0000 1.3 @@ -1100,8 +1100,6 @@ void __init setup_arch(char **cmdline_p) 1.4 1.5 HYPERVISOR_vm_assist(VMASST_CMD_enable, 1.6 VMASST_TYPE_4gb_segments); 1.7 - HYPERVISOR_vm_assist(VMASST_CMD_enable, 1.8 - VMASST_TYPE_4gb_segments_notify); 1.9 #if 0 1.10 HYPERVISOR_vm_assist(VMASST_CMD_enable, 1.11 VMASST_TYPE_writeable_pagetables);
2.1 --- a/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c Fri Aug 06 10:53:33 2004 +0000 2.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c Fri Aug 06 11:50:09 2004 +0000 2.3 @@ -110,7 +110,7 @@ static unsigned char insn_decode[256] = 2.4 X, X, X, X, X, X, X, X, 2.5 /* 0xE0 - 0xEF */ 2.6 X, X, X, X, X, X, X, X, 2.7 - X, X, X, O|1, X, X, X, X, 2.8 + X, O|4, X, O|1, X, X, X, X, 2.9 /* 0xF0 - 0xFF */ 2.10 P, X, P, P, O, O, O|M|1, O|M|4, 2.11 O, O, O, O, O, O, O|M, O|M 2.12 @@ -133,10 +133,7 @@ static unsigned int get_insn_len(unsigne 2.13 2.14 /* 2. Ensure we have a valid opcode byte. */ 2.15 if ( !(d & OPCODE_BYTE) ) 2.16 - { 2.17 - printk(KERN_ALERT " !!! 0x%02x 0x%02x\n", b, *(pb+1)); 2.18 return 0; 2.19 - } 2.20 2.21 /* 3. Process Mod/RM if there is one. */ 2.22 if ( d & HAS_MODRM ) 2.23 @@ -166,21 +163,32 @@ static unsigned int get_insn_len(unsigne 2.24 return ((pb - insn) + 1 + (d & INSN_SUFFIX_BYTES)); 2.25 } 2.26 2.27 +static unsigned char handleable_code[32] = { 2.28 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 2.29 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 2.30 + /* 0x80-0x83, 0x89, 0x8B */ 2.31 + 0x0F, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 2.32 + /* 0xC7 */ 2.33 + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 2.34 +}; 2.35 + 2.36 asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) 2.37 { 2.38 static unsigned int fixup_idx = 0; 2.39 - int relbyte_idx = -1; 2.40 + int relbyte_idx = -1, relword_idx = -1, save_indirect_reg; 2.41 unsigned int insn_len = (unsigned int)error_code, new_insn_len; 2.42 - unsigned char b[20], modrm, mod, reg, rm, *fixup_buf, patch[5], opcode; 2.43 - unsigned long fixup_buf_user, eip = regs->eip - insn_len; 2.44 + unsigned char b[20], modrm, mod, reg, rm, patch[5], opcode; 2.45 + unsigned char *fixup_buf = 2.46 + (unsigned char *)fix_to_virt(FIX_4GB_SEGMENT_FIXUP_RW); 2.47 + unsigned long fixup_buf_user = 2.48 + fix_to_virt(FIX_4GB_SEGMENT_FIXUP_RO); 2.49 + unsigned long eip = regs->eip - 
insn_len; 2.50 struct fixup_entry *fe; 2.51 pte_t *pte; 2.52 pmd_t *pmd; 2.53 pgd_t *pgd; 2.54 void *veip; 2.55 2.56 - return; /* XXX */ 2.57 - 2.58 /* Easy check that code segment has base 0, max limit. */ 2.59 if ( unlikely(regs->xcs != __USER_CS) ) 2.60 { 2.61 @@ -190,7 +198,17 @@ asmlinkage void do_fixup_4gb_segment(str 2.62 2.63 if ( unlikely(eip >= (PAGE_OFFSET-32)) ) 2.64 { 2.65 - DPRINTK("User executing out of kernel space?!"); 2.66 + if ( (eip < fixup_buf_user) || (eip >= (fixup_buf_user+PAGE_SIZE-32)) ) 2.67 + { 2.68 + DPRINTK("User executing out of kernel space?!"); 2.69 + return; 2.70 + } 2.71 + /* We know it's safe to directly copy teh bytes into our buffer. */ 2.72 + memcpy(b, (void *)eip, sizeof(b)); 2.73 + } 2.74 + else if ( unlikely(copy_from_user(b, (void *)eip, sizeof(b)) != 0) ) 2.75 + { 2.76 + DPRINTK("Could not read instruction bytes from user space."); 2.77 return; 2.78 } 2.79 2.80 @@ -203,18 +221,16 @@ asmlinkage void do_fixup_4gb_segment(str 2.81 /* Guaranteed enough room to patch? */ 2.82 if ( unlikely(fixup_idx > (PAGE_SIZE-32)) ) 2.83 { 2.84 - DPRINTK("Out of room in fixup page."); 2.85 + static int printed = 0; 2.86 + if ( !printed ) 2.87 + printk(KERN_ALERT "WARNING: Out of room in segment-fixup page.\n"); 2.88 + printed = 1; 2.89 return; 2.90 } 2.91 2.92 - if ( unlikely(copy_from_user(b, (void *)eip, sizeof(b)) != 0) ) 2.93 - { 2.94 - DPRINTK("Could not read instruction bytes from user space."); 2.95 - return; 2.96 - } 2.97 - 2.98 - /* Must be 'mov %gs:m32,r32' or 'mov r32,%gs:m32'. */ 2.99 - if ( (b[0] != 0x65) || ((b[1] != 0x89) && (b[1] != 0x8b)) ) 2.100 + /* Must be a handleable opcode with GS override. 
*/ 2.101 + if ( (b[0] != 0x65) || 2.102 + !test_bit((unsigned int)b[1], (unsigned long *)handleable_code) ) 2.103 { 2.104 DPRINTK("No GS override, or not a MOV (%02x %02x).", b[0], b[1]); 2.105 return; 2.106 @@ -225,6 +241,9 @@ asmlinkage void do_fixup_4gb_segment(str 2.107 reg = (modrm >> 3) & 7; 2.108 rm = (modrm >> 0) & 7; 2.109 2.110 + /* If indirect register isn't clobbered then we must push/pop it. */ 2.111 + save_indirect_reg = !((b[1] == 0x8b) && (reg == rm)); 2.112 + 2.113 /* We don't grok SIB bytes. */ 2.114 if ( rm == 4 ) 2.115 { 2.116 @@ -236,26 +255,20 @@ asmlinkage void do_fixup_4gb_segment(str 2.117 switch ( mod ) 2.118 { 2.119 case 0: 2.120 - if ( (rm == 5) || unlikely(insn_len != 3) ) 2.121 + if ( rm == 5 ) 2.122 { 2.123 - DPRINTK("Unhandleable disp32 EA, or bad insn_len (%d, %d).", 2.124 - rm, insn_len); 2.125 + DPRINTK("Unhandleable disp32 EA %d.", rm); 2.126 return; 2.127 } 2.128 break; /* m32 == (r32) */ 2.129 case 1: 2.130 - if ( unlikely(insn_len != 4) ) 2.131 - { 2.132 - DPRINTK("Bad insn_len (%d).", insn_len); 2.133 - return; 2.134 - } 2.135 break; /* m32 == disp8(r32) */ 2.136 default: 2.137 DPRINTK("Unhandleable Mod value %d.", mod); 2.138 return; 2.139 } 2.140 2.141 - for ( ; ; ) 2.142 + while ( insn_len < 5 ) 2.143 { 2.144 /* Bail if can't decode the following instruction. 
*/ 2.145 if ( unlikely((new_insn_len = 2.146 @@ -270,7 +283,7 @@ asmlinkage void do_fixup_4gb_segment(str 2.147 { 2.148 if ( relbyte_idx != -1 ) 2.149 { 2.150 - printk(KERN_ALERT "Multiple relative offsets in patch seq!"); 2.151 + DPRINTK("Multiple relative offsets in patch seq!"); 2.152 return; 2.153 } 2.154 relbyte_idx = insn_len; 2.155 @@ -278,6 +291,18 @@ asmlinkage void do_fixup_4gb_segment(str 2.156 relbyte_idx++; 2.157 relbyte_idx++; 2.158 } 2.159 + else if ( opcode == 0xe9 ) 2.160 + { 2.161 + if ( relword_idx != -1 ) 2.162 + { 2.163 + DPRINTK("Multiple relative offsets in patch seq!"); 2.164 + return; 2.165 + } 2.166 + relword_idx = insn_len; 2.167 + while ( b[relword_idx] != opcode ) 2.168 + relword_idx++; 2.169 + relword_idx++; 2.170 + } 2.171 2.172 if ( (insn_len += new_insn_len) > 20 ) 2.173 { 2.174 @@ -290,16 +315,13 @@ asmlinkage void do_fixup_4gb_segment(str 2.175 break; 2.176 2.177 /* Can't have a RET in the middle of a patch sequence. */ 2.178 - if ( (opcode == 0xc4) || (relbyte_idx != -1) ) 2.179 + if ( opcode == 0xc4 ) 2.180 { 2.181 - printk(KERN_ALERT "RET or rel. off. in middle of patch seq!\n"); 2.182 + DPRINTK("RET in middle of patch seq!\n"); 2.183 return; 2.184 } 2.185 } 2.186 2.187 - fixup_buf = (unsigned char *)fix_to_virt(FIX_4GB_SEGMENT_FIXUP_RW); 2.188 - fixup_buf_user = fix_to_virt(FIX_4GB_SEGMENT_FIXUP_RO); 2.189 - 2.190 /* Already created a fixup for this address and code sequence? */ 2.191 for ( fe = fixup_hash[FIXUP_HASH(eip)]; 2.192 fe != NULL; fe = fe->next ) 2.193 @@ -307,13 +329,7 @@ asmlinkage void do_fixup_4gb_segment(str 2.194 if ( (fe->patch_addr == eip) && 2.195 (fe->patched_code_len == insn_len) && 2.196 (memcmp(fe->patched_code, b, insn_len) == 0) ) 2.197 - { 2.198 -#if 0 2.199 - if ( fe->fixup_idx == 10000 ) 2.200 - return; 2.201 -#endif 2.202 goto do_the_patch; 2.203 - } 2.204 } 2.205 2.206 /* No existing patch -- create an entry for one. 
*/ 2.207 @@ -330,22 +346,8 @@ asmlinkage void do_fixup_4gb_segment(str 2.208 fe->next = fixup_hash[FIXUP_HASH(eip)]; 2.209 fixup_hash[FIXUP_HASH(eip)] = fe; 2.210 2.211 -#if 0 2.212 - if ( (eip & 0x3f) == 0x38 ) 2.213 - { 2.214 - int i; 2.215 - static int ii = 0; 2.216 - printk(KERN_ALERT " !!!!!!! %d'th reject\n"KERN_ALERT" .byte ", ++ii); 2.217 - for ( i = 0; i < insn_len; i++ ) 2.218 - printk("0x%02x,", b[i]); 2.219 - printk("\n"); 2.220 - fe->fixup_idx = 10000; 2.221 - return; 2.222 - } 2.223 -#endif 2.224 - 2.225 /* push <r32> */ 2.226 - if ( reg != rm ) 2.227 + if ( save_indirect_reg ) 2.228 fixup_buf[fixup_idx++] = 0x50 + rm; 2.229 2.230 /* add %gs:0,<r32> */ 2.231 @@ -360,12 +362,15 @@ asmlinkage void do_fixup_4gb_segment(str 2.232 fixup_idx += error_code - 1; 2.233 2.234 /* pop <r32> */ 2.235 - if ( reg != rm ) 2.236 + if ( save_indirect_reg ) 2.237 fixup_buf[fixup_idx++] = 0x58 + rm; 2.238 2.239 - /* Relocated instructions, minus the initial GS override. */ 2.240 - memcpy(&fixup_buf[fixup_idx], &b[error_code], insn_len - error_code); 2.241 - fixup_idx += insn_len - error_code; 2.242 + if ( insn_len != error_code ) 2.243 + { 2.244 + /* Relocated instructions. */ 2.245 + memcpy(&fixup_buf[fixup_idx], &b[error_code], insn_len - error_code); 2.246 + fixup_idx += insn_len - error_code; 2.247 + } 2.248 2.249 /* jmp <rel32> */ 2.250 fixup_buf[fixup_idx++] = 0xe9; 2.251 @@ -376,8 +381,8 @@ asmlinkage void do_fixup_4gb_segment(str 2.252 if ( relbyte_idx != -1 ) 2.253 { 2.254 /* Patch the 8-bit relative offset. 
*/ 2.255 - int idx = relbyte_idx + 6; 2.256 - if ( reg != rm ) 2.257 + int idx = fe->fixup_idx + relbyte_idx + 6; 2.258 + if ( save_indirect_reg ) 2.259 idx += 2; 2.260 fixup_buf[idx] = fixup_idx - (idx + 1); 2.261 2.262 @@ -385,9 +390,17 @@ asmlinkage void do_fixup_4gb_segment(str 2.263 fixup_buf[fixup_idx++] = 0xe9; 2.264 fixup_idx += 4; 2.265 *(unsigned long *)&fixup_buf[fixup_idx-4] = 2.266 - (eip + relbyte_idx + 1 + b[relbyte_idx]) - 2.267 + (eip + relbyte_idx + 1 + (long)(char)b[relbyte_idx]) - 2.268 (fixup_buf_user + fixup_idx); 2.269 - 2.270 + } 2.271 + else if ( relword_idx != -1 ) 2.272 + { 2.273 + /* Patch the 32-bit relative offset by subtracting the code disp. */ 2.274 + int idx = fe->fixup_idx + relword_idx + 6; 2.275 + if ( save_indirect_reg ) 2.276 + idx += 2; 2.277 + *(unsigned long *)&fixup_buf[idx] += 2.278 + (eip + relword_idx) - (fixup_buf_user + idx); 2.279 } 2.280 2.281 do_the_patch: 2.282 @@ -405,18 +418,36 @@ asmlinkage void do_fixup_4gb_segment(str 2.283 2.284 /* Success! Return to user land to execute 2nd insn of the pair. 
*/ 2.285 regs->eip = fixup_buf_user + fe->fixup_idx + error_code + 6; 2.286 - if ( reg != rm ) 2.287 - regs->eip += 2; /* account for push/pop pair */ 2.288 + if ( save_indirect_reg ) 2.289 + regs->eip += 2; 2.290 return; 2.291 } 2.292 2.293 +static int nosegfixup = 0; 2.294 + 2.295 static int __init fixup_init(void) 2.296 { 2.297 - unsigned long page = get_zeroed_page(GFP_ATOMIC); 2.298 + unsigned long page; 2.299 + 2.300 + if ( nosegfixup ) 2.301 + return 0; 2.302 + 2.303 + HYPERVISOR_vm_assist(VMASST_CMD_enable, 2.304 + VMASST_TYPE_4gb_segments_notify); 2.305 + 2.306 + page = get_zeroed_page(GFP_ATOMIC); 2.307 __set_fixmap(FIX_4GB_SEGMENT_FIXUP_RO, __pa(page), PAGE_READONLY); 2.308 __set_fixmap(FIX_4GB_SEGMENT_FIXUP_RW, __pa(page), PAGE_KERNEL); 2.309 + 2.310 memset(fixup_hash, 0, sizeof(fixup_hash)); 2.311 + 2.312 return 0; 2.313 } 2.314 +__initcall(fixup_init); 2.315 2.316 -__initcall(fixup_init); 2.317 +static int __init fixup_setup(char *str) 2.318 +{ 2.319 + nosegfixup = 1; 2.320 + return 0; 2.321 +} 2.322 +__setup("nosegfixup", fixup_setup);