ia64/xen-unstable

changeset 1393:85bffd6ccfcb

bitkeeper revision 1.906 (40a5e91cnvIS_3gLwfnD2G3HV3odHA)

manual merge
author iap10@labyrinth.cl.cam.ac.uk
date Sat May 15 09:55:40 2004 +0000 (2004-05-15)
parents e8e20ed30675 769e154137df
children 996c4e53641e
files .rootkeys xen/arch/i386/entry.S xen/arch/i386/pdb-linux.c xen/arch/i386/pdb-stub.c xen/arch/i386/smp.c xen/arch/i386/traps.c xen/common/dom0_ops.c xen/common/domain.c xen/common/memory.c xen/include/asm-i386/pdb.h xen/include/asm-i386/processor.h xen/include/xen/perfc_defn.h xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c
line diff
     1.1 --- a/.rootkeys	Thu May 13 16:48:30 2004 +0000
     1.2 +++ b/.rootkeys	Sat May 15 09:55:40 2004 +0000
     1.3 @@ -144,6 +144,7 @@ 3ddb79bdeJ7_86z03yTAPIeeywOg3Q xen/arch/
     1.4  3ddb79bdIKgipvGoqExEQ7jawfVowA xen/arch/i386/pci-i386.h
     1.5  3ddb79bdHe6_Uij4-glW91vInNtBYQ xen/arch/i386/pci-irq.c
     1.6  3ddb79bcZ_2FxINljqNSkqa17ISyJw xen/arch/i386/pci-pc.c
     1.7 +40a4dfced2dnSzbKgJFlD3chKHexjQ xen/arch/i386/pdb-linux.c
     1.8  4022a73czgX7d-2zfF_cb33oVemApQ xen/arch/i386/pdb-stub.c
     1.9  3ddb79bc1_2bAt67x9MFCP4AZrQnvQ xen/arch/i386/process.c
    1.10  3ddb79bc7KxGCEJsgBnkDX7XjD_ZEQ xen/arch/i386/rwlock.c
     2.1 --- a/xen/arch/i386/entry.S	Thu May 13 16:48:30 2004 +0000
     2.2 +++ b/xen/arch/i386/entry.S	Sat May 15 09:55:40 2004 +0000
     2.3 @@ -537,14 +537,7 @@ error_code:
     2.4  	movl  %edx,%es
     2.5  	movl  %edx,%fs
     2.6  	movl  %edx,%gs
     2.7 -        # We force a STI here. In most cases it is illegal to fault with
     2.8 -        # interrupts disabled, so no need to check EFLAGS. There is one
     2.9 -        # case when it /is/ valid -- on final return to guest context, we
    2.10 -        # CLI so we can atomically check for events to notify guest about and
    2.11 -        # return, all in one go. If we fault it is necessary to STI and the
    2.12 -        # worst that will happen is that our return code is no longer atomic.
    2.13 -        # This will do -- noone will ever notice. :-)
    2.14 -	sti
    2.15 +	movl  %esp,%edx
    2.16  	pushl %esi			# push the error code
    2.17  	pushl %edx			# push the pt_regs pointer
    2.18  	GET_CURRENT(%ebx)
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/i386/pdb-linux.c	Sat May 15 09:55:40 2004 +0000
     3.3 @@ -0,0 +1,100 @@
     3.4 +
     3.5 +/*
     3.6 + * pervasive debugger
     3.7 + * www.cl.cam.ac.uk/netos/pdb
     3.8 + *
     3.9 + * alex ho
    3.10 + * 2004
    3.11 + * university of cambridge computer laboratory
    3.12 + *
    3.13 + * linux & i386 dependent code. bleech.
    3.14 + */
    3.15 +
    3.16 +#include <asm/pdb.h>
    3.17 +
    3.18 +/* offset to the first instruction in the linux system call code
    3.19 +   where we can safely set a breakpoint */
    3.20 +unsigned int pdb_linux_syscall_enter_bkpt_offset = 20;
    3.21 +
    3.22 +/* offset to eflags saved on the stack after an int 80 */
    3.23 +unsigned int pdb_linux_syscall_eflags_offset     = 48;
    3.24 +
    3.25 +/* offset to the instruction pointer saved on the stack after an int 80 */
    3.26 +unsigned int pdb_linux_syscall_eip_offset        = 40;
    3.27 +
    3.28 +unsigned char
    3.29 +pdb_linux_set_bkpt (unsigned long addr)
    3.30 +{
    3.31 +    unsigned char old_instruction = *(unsigned char *)addr;
    3.32 +    *(unsigned char *)addr = 0xcc;
    3.33 +    return old_instruction;
    3.34 +}
    3.35 +
    3.36 +void
    3.37 +pdb_linux_clr_bkpt (unsigned long addr, unsigned char value)
    3.38 +{
    3.39 +    *(unsigned char *)addr = value;
    3.40 +}
    3.41 +
    3.42 +void
    3.43 +pdb_linux_syscall_enter_bkpt (struct pt_regs *regs, long error_code,
    3.44 +			      trap_info_t *ti)
    3.45 +{
     3.46 +    /* set a breakpoint at the beginning of the 
    3.47 +       system call in the target domain */
    3.48 + 
    3.49 +    pdb_system_call_enter_instr = pdb_linux_set_bkpt(ti->address +
    3.50 +				    pdb_linux_syscall_enter_bkpt_offset);
    3.51 +    pdb_system_call = 1;
    3.52 +}
    3.53 +
    3.54 +void
    3.55 +pdb_linux_syscall_exit_bkpt (struct pt_regs *regs, struct pdb_context *pdb_ctx)
    3.56 +{
    3.57 +    /*
    3.58 +      we've hit an int 0x80 in a user's program, jumped into xen
    3.59 +      (traps.c::do_general_protection()) which re-wrote the next
    3.60 +      instruction in the os kernel to 0xcc, and then hit that 
    3.61 +      exception.
    3.62 +
    3.63 +      we need to re-write the return instruction in the user's
    3.64 +      program so that we know when we have finished the system call
    3.65 +      and are back in the user's program.
    3.66 +
    3.67 +      at this point our stack should look something like this:
    3.68 +
    3.69 +      esp      = 0x80a59f0
    3.70 +      esp + 4  = 0x0
    3.71 +      esp + 8  = 0x80485a0
    3.72 +      esp + 12 = 0x2d
    3.73 +      esp + 16 = 0x80485f4
    3.74 +      esp + 20 = 0xbffffa48
    3.75 +      esp + 24 = 0xd
    3.76 +      esp + 28 = 0xc00a0833
    3.77 +      esp + 32 = 0x833
    3.78 +      esp + 36 = 0xd
    3.79 +      esp + 40 = 0x804dcdd     saved eip
    3.80 +      esp + 44 = 0x82b         saved cs
    3.81 +      esp + 48 = 0x213392      saved eflags
    3.82 +      esp + 52 = 0xbffffa2c    saved esp
    3.83 +      esp + 56 = 0x833         saved ss
    3.84 +      esp + 60 = 0x1000000
    3.85 +    */
    3.86 +
    3.87 +    /* restore the entry instruction for the system call */
    3.88 +    pdb_linux_clr_bkpt(regs->eip - 1, pdb_system_call_enter_instr);
    3.89 +
    3.90 +    /* save the address of eflags that was saved on the stack */
    3.91 +    pdb_system_call_eflags_addr = (regs->esp +
    3.92 +				   pdb_linux_syscall_eflags_offset);
    3.93 + 
    3.94 +    /* muck with the return instruction so that we trap back into the
    3.95 +       debugger when re-entering user space */
    3.96 +    pdb_system_call_next_addr = *(unsigned long *)(regs->esp + 
    3.97 +						 pdb_linux_syscall_eip_offset);
    3.98 +    pdb_linux_get_values (&pdb_system_call_leave_instr, 1, 
    3.99 +			  pdb_system_call_next_addr,
   3.100 +			  pdb_ctx->process, pdb_ctx->ptbr);
   3.101 +    pdb_linux_set_values ("cc", 1, pdb_system_call_next_addr,
   3.102 +			  pdb_ctx->process, pdb_ctx->ptbr);
   3.103 +}
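
The two helpers above plant and remove a breakpoint by swapping the first byte of the target instruction with the int3 opcode (0xcc). A minimal stand-alone sketch of the same byte-patching idea, assuming ordinary user-space C against a writable buffer rather than guest kernel text:

    #include <stdio.h>

    /* mirror pdb_linux_set_bkpt(): remember the original byte, write int3 */
    static unsigned char set_bkpt(unsigned char *addr)
    {
        unsigned char old = *addr;
        *addr = 0xcc;                          /* int3 opcode */
        return old;
    }

    /* mirror pdb_linux_clr_bkpt(): put the original byte back */
    static void clr_bkpt(unsigned char *addr, unsigned char old)
    {
        *addr = old;
    }

    int main(void)
    {
        unsigned char text[4] = { 0x90, 0x90, 0x90, 0xc3 };   /* nop; nop; nop; ret */
        unsigned char saved = set_bkpt(&text[1]);
        printf("patched byte 0x%02x, saved byte 0x%02x\n", text[1], saved);
        clr_bkpt(&text[1], saved);
        return 0;
    }
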
     4.1 --- a/xen/arch/i386/pdb-stub.c	Thu May 13 16:48:30 2004 +0000
     4.2 +++ b/xen/arch/i386/pdb-stub.c	Sat May 15 09:55:40 2004 +0000
     4.3 @@ -47,26 +47,13 @@ static int  pdb_in_buffer_ptr;
     4.4  static unsigned char  pdb_in_checksum;
     4.5  static unsigned char  pdb_xmit_checksum;
     4.6  
     4.7 -/* function pointers in the near future... */
     4.8 -unsigned long pdb_linux_pid_ptbr (unsigned long cr3, int pid);
     4.9 -void pdb_linux_get_values(char *buffer, int length, unsigned long address,
    4.10 -			  int pid, unsigned long cr3);
    4.11 -void pdb_linux_set_values(char *buffer, int length, unsigned long address,
    4.12 -			  int pid, unsigned long cr3);
    4.13 -
    4.14 -struct pdb_context
    4.15 -{
    4.16 -    int valid;
    4.17 -    int domain;
    4.18 -    int process;
    4.19 -    unsigned long ptbr;                   /* cached page table base register */
    4.20 -};
    4.21  struct pdb_context pdb_ctx;
    4.22 -
    4.23  int pdb_continue_thread = 0;
    4.24  int pdb_general_thread = 0;
    4.25  
    4.26  void pdb_put_packet (unsigned char *buffer, int ack);
    4.27 +void pdb_bkpt_check (u_char *buffer, int length,
    4.28 +		     unsigned long cr3, unsigned long addr);
    4.29  
    4.30  int pdb_initialized = 0;
    4.31  int pdb_page_fault_possible = 0;
    4.32 @@ -75,6 +62,12 @@ int pdb_page_fault = 0;
    4.33  static int pdb_serhnd = -1;
    4.34  static int pdb_stepping = 0;
    4.35  
    4.36 +int pdb_system_call = 0;
    4.37 +unsigned char pdb_system_call_enter_instr = 0;       /* original enter instr */
    4.38 +unsigned char pdb_system_call_leave_instr = 0;        /* original next instr */
    4.39 +unsigned long pdb_system_call_next_addr = 0;         /* instr after int 0x80 */
    4.40 +unsigned long pdb_system_call_eflags_addr = 0;      /* saved eflags on stack */
    4.41 +
    4.42  static inline void pdb_put_char(unsigned char c)
    4.43  {
    4.44      serial_putc(pdb_serhnd, c);
    4.45 @@ -406,15 +399,49 @@ pdb_process_command (char *ptr, struct p
    4.46          break;
    4.47      case 'S':                                            /* step with signal */
    4.48      case 's':                                                        /* step */
    4.49 +    {
    4.50 +        if ( pdb_system_call_eflags_addr != 0 )
    4.51 +	{
    4.52 +	    unsigned long eflags;
    4.53 +	    char eflags_buf[sizeof(eflags)*2];       /* STUPID STUPID STUPID */
    4.54 +
    4.55 +	    pdb_linux_get_values((u_char*)&eflags, sizeof(eflags), 
    4.56 +				 pdb_system_call_eflags_addr, 
    4.57 +				 pdb_ctx.process, pdb_ctx.ptbr);
    4.58 +	    eflags |= X86_EFLAGS_TF;
    4.59 +	    mem2hex ((u_char *)&eflags, eflags_buf, sizeof(eflags)); 
    4.60 +	    pdb_linux_set_values(eflags_buf, sizeof(eflags),
    4.61 +				 pdb_system_call_eflags_addr,
    4.62 +				 pdb_ctx.process, pdb_ctx.ptbr);
    4.63 +	}
    4.64 +
    4.65          regs->eflags |= X86_EFLAGS_TF;
    4.66          pdb_stepping = 1;
    4.67          return 1;                                        
    4.68          /* not reached */
    4.69 +    }
    4.70      case 'C':                                        /* continue with signal */
    4.71      case 'c':                                                    /* continue */
    4.72 +    {
    4.73 +        if ( pdb_system_call_eflags_addr != 0 )
    4.74 +	{
    4.75 +	    unsigned long eflags;
    4.76 +	    char eflags_buf[sizeof(eflags)*2];       /* STUPID STUPID STUPID */
    4.77 +
    4.78 +	    pdb_linux_get_values((u_char*)&eflags, sizeof(eflags), 
    4.79 +				 pdb_system_call_eflags_addr, 
    4.80 +				 pdb_ctx.process, pdb_ctx.ptbr);
    4.81 +	    eflags &= ~X86_EFLAGS_TF;
    4.82 +	    mem2hex ((u_char *)&eflags, eflags_buf, sizeof(eflags)); 
    4.83 +	    pdb_linux_set_values(eflags_buf, sizeof(eflags),
    4.84 +				 pdb_system_call_eflags_addr,
    4.85 +				 pdb_ctx.process, pdb_ctx.ptbr);
    4.86 +	}
    4.87 +
    4.88          regs->eflags &= ~X86_EFLAGS_TF;
    4.89          return 1;                         /* jump out before replying to gdb */
    4.90          /* not reached */
    4.91 +    }
    4.92      case 'd':
    4.93          remote_debug = !(remote_debug);                 /* toggle debug flag */
    4.94          break;
    4.95 @@ -424,54 +451,11 @@ pdb_process_command (char *ptr, struct p
    4.96      case 'g':                       /* return the value of the CPU registers */
    4.97      {
    4.98          pdb_x86_to_gdb_regs (pdb_out_buffer, regs);
    4.99 -
   4.100 -	/*
   4.101 -	printk ("  reg: %s",   pdb_out_buffer);
   4.102 -	printk ("\n");
   4.103 -	printk ("  eax: 0x%08lx\n", regs->eax);
   4.104 -	printk ("  ecx: 0x%08lx\n", regs->ecx);
   4.105 -	printk ("  edx: 0x%08lx\n", regs->edx);
   4.106 -	printk ("  ebx: 0x%08lx\n", regs->ebx);
   4.107 -	printk ("  esp: 0x%08lx\n", regs->esp);
   4.108 -	printk ("  ebp: 0x%08lx\n", regs->ebp);
   4.109 -	printk ("  esi: 0x%08lx\n", regs->esi);
   4.110 -	printk ("  edi: 0x%08lx\n", regs->edi);
   4.111 -	printk ("  eip: 0x%08lx\n", regs->eip);
   4.112 -	printk ("  efl: 0x%08lx\n", regs->eflags);
   4.113 -	printk ("  xcs: 0x%08x\n",  regs->xcs);
   4.114 -	printk ("  xss: 0x%08x\n",  regs->xss);
   4.115 -	printk ("  xds: 0x%08x\n",  regs->xds);
   4.116 -	printk ("  xes: 0x%08x\n",  regs->xes);
   4.117 -	printk ("  xfs: 0x%08x\n",  regs->xfs);
   4.118 -	printk ("  xgs: 0x%08x\n",  regs->xgs);
   4.119 -	*/
   4.120 -
   4.121          break;
   4.122      }
   4.123      case 'G':              /* set the value of the CPU registers - return OK */
   4.124      {
   4.125          pdb_gdb_to_x86_regs (regs, ptr);
   4.126 -
   4.127 -	/*
   4.128 -	printk ("  ptr: %s \n\n",   ptr);
   4.129 -	printk ("  eax: 0x%08lx\n", regs->eax);
   4.130 -	printk ("  ecx: 0x%08lx\n", regs->ecx);
   4.131 -	printk ("  edx: 0x%08lx\n", regs->edx);
   4.132 -	printk ("  ebx: 0x%08lx\n", regs->ebx);
   4.133 -	printk ("  esp: 0x%08lx\n", regs->esp);
   4.134 -	printk ("  ebp: 0x%08lx\n", regs->ebp);
   4.135 -	printk ("  esi: 0x%08lx\n", regs->esi);
   4.136 -	printk ("  edi: 0x%08lx\n", regs->edi);
   4.137 -	printk ("  eip: 0x%08lx\n", regs->eip);
   4.138 -	printk ("  efl: 0x%08lx\n", regs->eflags);
   4.139 -	printk ("  xcs: 0x%08x\n",  regs->xcs);
   4.140 -	printk ("  xss: 0x%08x\n",  regs->xss);
   4.141 -	printk ("  xds: 0x%08x\n",  regs->xds);
   4.142 -	printk ("  xes: 0x%08x\n",  regs->xes);
   4.143 -	printk ("  xfs: 0x%08x\n",  regs->xfs);
   4.144 -	printk ("  xgs: 0x%08x\n",  regs->xgs);
   4.145 -	*/
   4.146 -
   4.147          break;
   4.148      }
   4.149      case 'H':
   4.150 @@ -572,17 +556,20 @@ pdb_process_command (char *ptr, struct p
   4.151  			if (addr >= PAGE_OFFSET)
   4.152  			{
   4.153  			    hex2mem (ptr, (char *)addr, length);
   4.154 +			    pdb_bkpt_check(ptr, length, pdb_ctx.ptbr, addr);
   4.155  			}
   4.156  			else if (pdb_ctx.process != -1)
   4.157  			{
   4.158  			    pdb_linux_set_values(ptr, length, addr,
   4.159  						 pdb_ctx.process, 
   4.160  						 pdb_ctx.ptbr);
   4.161 +			    pdb_bkpt_check(ptr, length, pdb_ctx.ptbr, addr);
   4.162  			}
   4.163  			else
   4.164  			{
   4.165  			    pdb_set_values (ptr, length,
   4.166  					    pdb_ctx.ptbr, addr);
   4.167 +			    pdb_bkpt_check(ptr, length, pdb_ctx.ptbr, addr);
   4.168  			}
   4.169  			pdb_page_fault_possible = 0;
   4.170                          if (pdb_page_fault)
   4.171 @@ -936,7 +923,6 @@ int pdb_set_values(u_char *buffer, int l
   4.172  		   unsigned long cr3, unsigned long addr)
   4.173  {
   4.174      int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL);
   4.175 -    pdb_bkpt_check(buffer, length, cr3, addr);
   4.176      return count;
   4.177  }
   4.178  
   4.179 @@ -1176,16 +1162,35 @@ int pdb_handle_exception(int exceptionVe
   4.180  
   4.181      __asm__ __volatile__ ("movl %%cr3,%0" : "=r" (cr3) : );
   4.182  
   4.183 +    /* If the exception is an int3 from user space then pdb is only
    4.184 +       interested if it re-wrote an instruction to set the breakpoint.
   4.185 +       This occurs when leaving a system call from a domain.
   4.186 +    */
   4.187 +    if ( exceptionVector == 3 &&
   4.188 +	 (xen_regs->xcs & 3) == 3 && 
   4.189 +	 xen_regs->eip != pdb_system_call_next_addr + 1)
   4.190 +    {
   4.191 +        TRC(printf("pdb: user bkpt (0x%x) at 0x%x:0x%lx:0x%lx\n", 
   4.192 +		   exceptionVector, xen_regs->xcs & 3, cr3, xen_regs->eip));
   4.193 +	return 1;
   4.194 +    }
   4.195 +
   4.196      /*
   4.197 -     * If PDB didn't set the breakpoint, is not single stepping, and the user
   4.198 -     * didn't press the magic debug key, then we don't handle the exception.
   4.199 +     * If PDB didn't set the breakpoint, is not single stepping, 
   4.200 +     * is not entering a system call in a domain,
    4.201 +     * and the user didn't press the magic debug key, 
   4.202 +     * then we don't handle the exception.
   4.203       */
   4.204      bkpt = pdb_bkpt_search(cr3, xen_regs->eip - 1);
   4.205      if ( (bkpt == NULL) &&
   4.206 -         !pdb_stepping && (exceptionVector != KEYPRESS_EXCEPTION) &&
   4.207 +         !pdb_stepping && 
   4.208 +	 !pdb_system_call &&
   4.209 +	 xen_regs->eip != pdb_system_call_next_addr + 1 &&
   4.210 +	 (exceptionVector != KEYPRESS_EXCEPTION) &&
   4.211  	 xen_regs->eip < 0xc0000000)                   /* xenolinux for now! */
   4.212      {
   4.213 -        TRC(printf("pdb: user bkpt at 0x%lx:0x%lx\n", cr3, xen_regs->eip));
   4.214 +        TRC(printf("pdb: user bkpt (0x%x) at 0x%lx:0x%lx\n", 
   4.215 +		   exceptionVector, cr3, xen_regs->eip));
   4.216  	return 1;
   4.217      }
   4.218  
   4.219 @@ -1199,12 +1204,54 @@ int pdb_handle_exception(int exceptionVe
   4.220          pdb_stepping = 0;
   4.221      }
   4.222  
   4.223 +    if ( pdb_system_call )
   4.224 +    {
   4.225 +	pdb_system_call = 0;
   4.226 +
   4.227 +	pdb_linux_syscall_exit_bkpt (xen_regs, &pdb_ctx);
   4.228 +
   4.229 +	/* we don't have a saved breakpoint so we need to rewind eip */
   4.230 +	xen_regs->eip--;
   4.231 +	
    4.232 +	/* if the user doesn't care about breaking when entering a
   4.233 +	   system call then we'll just ignore the exception */
   4.234 +	if ( (pdb_ctx.system_call & 0x01) == 0 )
   4.235 +	{
   4.236 +	    return 0;
   4.237 +	}
   4.238 +    }
   4.239 +
   4.240      if ( exceptionVector == BREAKPT_EXCEPTION && bkpt != NULL)
   4.241      {
   4.242          /* Executed Int3: replace breakpoint byte with real program byte. */
   4.243          xen_regs->eip--;
   4.244      }
   4.245  
   4.246 +    /* returning to user space after a system call */
   4.247 +    if ( xen_regs->eip == pdb_system_call_next_addr + 1)
   4.248 +    {
   4.249 +        u_char instr[2];                      /* REALLY REALLY REALLY STUPID */
   4.250 +
   4.251 +	mem2hex (&pdb_system_call_leave_instr, instr, sizeof(instr)); 
   4.252 +
   4.253 +	pdb_linux_set_values (instr, 1, pdb_system_call_next_addr,
   4.254 +			      pdb_ctx.process, pdb_ctx.ptbr);
   4.255 +
   4.256 +	pdb_system_call_next_addr = 0;
   4.257 +	pdb_system_call_leave_instr = 0;
   4.258 +
   4.259 +	/* manually rewind eip */
   4.260 +	xen_regs->eip--;
   4.261 +
   4.262 +	/* if the user doesn't care about breaking when returning 
   4.263 +	   to user space after a system call then we'll just ignore 
   4.264 +	   the exception */
   4.265 +	if ( (pdb_ctx.system_call & 0x02) == 0 )
   4.266 +	{
   4.267 +	    return 0;
   4.268 +	}
   4.269 +    }
   4.270 +
   4.271      /* Generate a signal for GDB. */
   4.272      switch ( exceptionVector )
   4.273      {
   4.274 @@ -1267,6 +1314,7 @@ void initialize_pdb()
   4.275      pdb_ctx.valid = 1;
   4.276      pdb_ctx.domain = -1;
   4.277      pdb_ctx.process = -1;
   4.278 +    pdb_ctx.system_call = 0;
   4.279      pdb_ctx.ptbr = 0;
   4.280  
   4.281      printk("pdb: pervasive debugger (%s)   www.cl.cam.ac.uk/netos/pdb\n", 
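
The 's' (step) and 'c' (continue) cases above manipulate the eflags image that Linux saved on the guest kernel stack after the int 0x80, setting or clearing the trap flag so that the next guest instruction does or does not fault back into the debugger. A small sketch of that flag manipulation, assuming a local copy of the saved word rather than pdb_linux_get_values()/pdb_linux_set_values() against the guest stack:

    #include <stdio.h>

    #define X86_EFLAGS_TF 0x00000100           /* trap flag: fault after one instruction */

    int main(void)
    {
        unsigned long saved_eflags = 0x213392; /* example value from the stack dump above */

        saved_eflags |= X86_EFLAGS_TF;         /* 's': single-step through the system call */
        printf("step:     %#lx\n", saved_eflags);

        saved_eflags &= ~X86_EFLAGS_TF;        /* 'c': run until the next breakpoint */
        printf("continue: %#lx\n", saved_eflags);
        return 0;
    }
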
     5.1 --- a/xen/arch/i386/smp.c	Thu May 13 16:48:30 2004 +0000
     5.2 +++ b/xen/arch/i386/smp.c	Sat May 15 09:55:40 2004 +0000
     5.3 @@ -21,15 +21,6 @@
     5.4  #ifdef CONFIG_SMP
     5.5  
     5.6  /*
     5.7 - * This lock must be acquired before sending a synchronous IPI to another
     5.8 - * CPU (i.e., IPI + spin waiting for acknowledgement). The only safe ways of
     5.9 - * acquiring the lock are spin_lock() and spin_trylock(). The former is only
    5.10 - * safe if local interrupts are enabled (otherwise we will never see an IPI
    5.11 - * destined for us which we must acknowledge for the lock to be released).
    5.12 - */
    5.13 -static spinlock_t synchronous_ipi_lock = SPIN_LOCK_UNLOCKED;
    5.14 -
    5.15 -/*
    5.16   *	Some notes on x86 processor bugs affecting SMP operation:
    5.17   *
    5.18   *	Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
    5.19 @@ -220,16 +211,18 @@ static inline void send_IPI_allbutself(i
    5.20   * 2) Leave the mm if we are in the lazy tlb mode.
    5.21   */
    5.22  
    5.23 +static spinlock_t flush_lock = SPIN_LOCK_UNLOCKED;
    5.24  static volatile unsigned long flush_cpumask;
    5.25 -#define FLUSH_ALL	0xffffffff
    5.26  
    5.27  asmlinkage void smp_invalidate_interrupt(void)
    5.28  {
    5.29      ack_APIC_irq();
    5.30 -    local_flush_tlb();
    5.31 -    clear_bit(smp_processor_id(), &flush_cpumask);
    5.32 +    perfc_incrc(ipis);
    5.33 +    if ( likely(test_and_clear_bit(smp_processor_id(), &flush_cpumask)) )
    5.34 +        local_flush_tlb();
    5.35  }
    5.36  
    5.37 +#ifndef NO_DEVICES_IN_XEN
    5.38  int try_flush_tlb_mask(unsigned long mask)
    5.39  {
    5.40      if ( mask & (1 << smp_processor_id()) )
    5.41 @@ -240,7 +233,7 @@ int try_flush_tlb_mask(unsigned long mas
    5.42  
    5.43      if ( mask != 0 )
    5.44      {
    5.45 -        if ( unlikely(!spin_trylock(&synchronous_ipi_lock)) )
    5.46 +        if ( unlikely(!spin_trylock(&flush_lock)) )
    5.47              return 0;
    5.48          flush_cpumask = mask;
    5.49          send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
    5.50 @@ -249,15 +242,16 @@ int try_flush_tlb_mask(unsigned long mas
    5.51              rep_nop();
    5.52              barrier();
    5.53          }
    5.54 -        spin_unlock(&synchronous_ipi_lock);
    5.55 +        spin_unlock(&flush_lock);
    5.56      }
    5.57  
    5.58      return 1;
    5.59  }
    5.60 +#endif
    5.61  
    5.62  void flush_tlb_mask(unsigned long mask)
    5.63  {
    5.64 -    ASSERT(local_irq_is_enabled());
    5.65 +    ASSERT(!in_irq());
    5.66      
    5.67      if ( mask & (1 << smp_processor_id()) )
    5.68      {
    5.69 @@ -267,7 +261,21 @@ void flush_tlb_mask(unsigned long mask)
    5.70  
    5.71      if ( mask != 0 )
    5.72      {
    5.73 -        spin_lock(&synchronous_ipi_lock);
    5.74 +        /*
    5.75 +         * We are certainly not reentering a flush_lock region on this CPU
    5.76 +         * because we are not in an IRQ context. We can therefore wait for the
    5.77 +         * other guy to release the lock. This is harder than it sounds because
    5.78 +         * local interrupts might be disabled, and he may be waiting for us to
    5.79 +         * execute smp_invalidate_interrupt(). We deal with this possibility by
    5.80 +         * inlining the meat of that function here.
    5.81 +         */
    5.82 +        while ( unlikely(!spin_trylock(&flush_lock)) )
    5.83 +        {
    5.84 +            if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
    5.85 +                local_flush_tlb();
    5.86 +            rep_nop();
    5.87 +        }
    5.88 +
    5.89          flush_cpumask = mask;
    5.90          send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
    5.91          while ( flush_cpumask != 0 )
    5.92 @@ -275,13 +283,15 @@ void flush_tlb_mask(unsigned long mask)
    5.93              rep_nop();
    5.94              barrier();
    5.95          }
    5.96 -        spin_unlock(&synchronous_ipi_lock);
    5.97 +
    5.98 +        spin_unlock(&flush_lock);
    5.99      }
   5.100  }
   5.101  
   5.102  void new_tlbflush_clock_period(void)
   5.103  {
   5.104 -    if ( unlikely(!spin_trylock(&synchronous_ipi_lock)) )
   5.105 +    /* Avoid deadlock because we might be reentering a flush_lock region. */
   5.106 +    if ( unlikely(!spin_trylock(&flush_lock)) )
   5.107          return;
   5.108  
   5.109      /* Someone may acquire the lock and execute the flush before us. */
   5.110 @@ -304,7 +314,7 @@ void new_tlbflush_clock_period(void)
   5.111      tlbflush_clock++;
   5.112  
   5.113   out:
   5.114 -    spin_unlock(&synchronous_ipi_lock);
   5.115 +    spin_unlock(&flush_lock);
   5.116  }
   5.117  
   5.118  static void flush_tlb_all_pge_ipi(void* info)
   5.119 @@ -323,6 +333,12 @@ void smp_send_event_check_mask(unsigned 
   5.120      send_IPI_mask(cpu_mask, EVENT_CHECK_VECTOR);
   5.121  }
   5.122  
   5.123 +/*
   5.124 + * Structure and data for smp_call_function(). This is designed to minimise
   5.125 + * static memory requirements. It also looks cleaner.
   5.126 + */
   5.127 +static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
   5.128 +
   5.129  struct call_data_struct {
   5.130      void (*func) (void *info);
   5.131      void *info;
   5.132 @@ -368,7 +384,8 @@ int smp_call_function (void (*func) (voi
   5.133  
   5.134      ASSERT(local_irq_is_enabled());
   5.135  
   5.136 -    spin_lock(&synchronous_ipi_lock);
   5.137 +    spin_lock(&call_lock);
   5.138 +
   5.139      call_data = &data;
   5.140      wmb();
   5.141      /* Send a message to all other CPUs and wait for them to respond */
   5.142 @@ -382,7 +399,7 @@ int smp_call_function (void (*func) (voi
   5.143          while (atomic_read(&data.finished) != cpus)
   5.144              barrier();
   5.145  
   5.146 -    spin_unlock(&synchronous_ipi_lock);
   5.147 +    spin_unlock(&call_lock);
   5.148  
   5.149      return 0;
   5.150  }
   5.151 @@ -419,6 +436,7 @@ void smp_send_stop(void)
   5.152  asmlinkage void smp_event_check_interrupt(void)
   5.153  {
   5.154      ack_APIC_irq();
   5.155 +    perfc_incrc(ipis);
   5.156  }
   5.157  
   5.158  asmlinkage void smp_call_function_interrupt(void)
   5.159 @@ -428,6 +446,8 @@ asmlinkage void smp_call_function_interr
   5.160      int wait = call_data->wait;
   5.161  
   5.162      ack_APIC_irq();
   5.163 +    perfc_incrc(ipis);
   5.164 +
   5.165      /*
   5.166       * Notify initiating CPU that I've grabbed the data and am
   5.167       * about to execute the function
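
The comment in flush_tlb_mask() above explains why a plain spin_lock() on flush_lock could deadlock: the current holder may be spinning until this CPU clears its bit in flush_cpumask, and this CPU may have interrupts disabled, so the IPI handler would never run. A small sketch of that acquisition pattern, using C11 atomics as stand-ins for the Xen spinlock and bitmask primitives (the names below are illustrative, not the Xen API):

    #include <stdatomic.h>

    static atomic_flag flush_lock = ATOMIC_FLAG_INIT;
    static _Atomic unsigned long flush_cpumask;

    static void local_flush_tlb(void) { /* stand-in for the real TLB flush */ }

    static void acquire_flush_lock(int cpu)
    {
        unsigned long bit = 1UL << cpu;
        while (atomic_flag_test_and_set(&flush_lock)) {
            /* The holder may be waiting for us to run smp_invalidate_interrupt(),
               so do its work inline while spinning, as the patch does. */
            if (atomic_fetch_and(&flush_cpumask, ~bit) & bit)
                local_flush_tlb();
        }
    }

    int main(void)
    {
        acquire_flush_lock(0);                 /* uncontended case: acquires at once */
        atomic_flag_clear(&flush_lock);
        return 0;
    }
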
     6.1 --- a/xen/arch/i386/traps.c	Thu May 13 16:48:30 2004 +0000
     6.2 +++ b/xen/arch/i386/traps.c	Sat May 15 09:55:40 2004 +0000
     6.3 @@ -107,36 +107,9 @@ static inline int kernel_text_address(un
     6.4  
     6.5  }
     6.6  
     6.7 -
     6.8 -void show_trace(unsigned long * stack)
     6.9 -{
    6.10 -    int i;
    6.11 -    unsigned long addr;
    6.12 -
    6.13 -    printk("Call Trace: ");
    6.14 -    i = 1;
    6.15 -    while (((long) stack & (STACK_SIZE-1)) != 0) {
    6.16 -        addr = *stack++;
    6.17 -        if (kernel_text_address(addr)) {
    6.18 -            if (i && ((i % 6) == 0))
    6.19 -                printk("\n   ");
    6.20 -            printk("[<%08lx>] ", addr);
    6.21 -            i++;
    6.22 -        }
    6.23 -    }
    6.24 -    printk("\n");
    6.25 -}
    6.26 -
    6.27 -void show_traceX(void)
    6.28 -{
    6.29 -    unsigned long *addr;
    6.30 -    __asm__ __volatile__ ("movl %%esp,%0" : "=r" (addr) : );
    6.31 -    show_trace(addr);
    6.32 -}
    6.33 -
    6.34  void show_stack(unsigned long *esp)
    6.35  {
    6.36 -    unsigned long *stack;
    6.37 +    unsigned long *stack, addr;
    6.38      int i;
    6.39  
    6.40      printk("Stack trace from ESP=%p:\n", esp);
    6.41 @@ -154,6 +127,20 @@ void show_stack(unsigned long *esp)
    6.42              printk("%08lx ", *stack++);            
    6.43      }
    6.44      printk("\n");
    6.45 +
    6.46 +    printk("Call Trace from ESP=%p: ", esp);
    6.47 +    stack = esp;
    6.48 +    i = 0;
    6.49 +    while (((long) stack & (STACK_SIZE-1)) != 0) {
    6.50 +        addr = *stack++;
    6.51 +        if (kernel_text_address(addr)) {
    6.52 +            if (i && ((i % 6) == 0))
    6.53 +                printk("\n   ");
    6.54 +            printk("[<%08lx>] ", addr);
    6.55 +            i++;
    6.56 +        }
    6.57 +    }
    6.58 +    printk("\n");
    6.59  }
    6.60  
    6.61  void show_registers(struct pt_regs *regs)
    6.62 @@ -250,7 +237,6 @@ DO_ERROR_NOCODE( 0, "divide error", divi
    6.63  DO_ERROR_NOCODE( 4, "overflow", overflow)
    6.64  DO_ERROR_NOCODE( 5, "bounds", bounds)
    6.65  DO_ERROR_NOCODE( 6, "invalid operand", invalid_op)
    6.66 -DO_ERROR_NOCODE( 7, "device not available", device_not_available)
    6.67  DO_ERROR_NOCODE( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
    6.68  DO_ERROR(10, "invalid TSS", invalid_TSS)
    6.69  DO_ERROR(11, "segment not present", segment_not_present)
    6.70 @@ -267,10 +253,10 @@ asmlinkage void do_int3(struct pt_regs *
    6.71      struct guest_trap_bounce *gtb = guest_trap_bounce+smp_processor_id();
    6.72      trap_info_t *ti;
    6.73  
    6.74 +    if ( pdb_handle_exception(3, regs) == 0 )
    6.75 +        return;
    6.76      if ( (regs->xcs & 3) != 3 )
    6.77      {
    6.78 -        if ( pdb_handle_exception(3, regs) == 0 )
    6.79 -             return;
    6.80          if ( unlikely((regs->xcs & 3) == 0) )
    6.81          {
    6.82              show_registers(regs);
    6.83 @@ -445,6 +431,15 @@ asmlinkage void do_general_protection(st
    6.84          ti = current->thread.traps + (error_code>>3);
    6.85          if ( TI_GET_DPL(ti) >= (regs->xcs & 3) )
    6.86          {
    6.87 +	    unsigned long cr3;
    6.88 +	
    6.89 +	    __asm__ __volatile__ ("movl %%cr3,%0" : "=r" (cr3) : );
    6.90 +	    if (pdb_initialized && pdb_ctx.system_call != 0 &&
    6.91 +		cr3 == pdb_ctx.ptbr)
    6.92 +	    {
    6.93 +	        pdb_linux_syscall_enter_bkpt(regs, error_code, ti);
    6.94 +	    }
    6.95 +
    6.96              gtb->flags = GTBF_TRAP_NOCODE;
    6.97              regs->eip += 2;
    6.98              goto finish_propagation;
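
In do_general_protection() a guest's "int $0x80" arrives as a #GP whose error code carries the interrupt vector shifted left by three with the IDT flag set, which is why the existing code indexes the guest trap table with error_code>>3; the new lines then plant the pdb system-call entry breakpoint when the faulting page-table base matches pdb_ctx.ptbr. A tiny illustration of the error-code decoding (the values are illustrative, not taken from a trace):

    #include <stdio.h>

    int main(void)
    {
        /* #GP from "int $0x80" issued with insufficient privilege:
           error code = (vector << 3) | 2, where bit 1 marks an IDT entry */
        unsigned long error_code = (0x80 << 3) | 2;

        printf("error code  = %#lx\n", error_code);        /* 0x402 */
        printf("trap vector = %#lx\n", error_code >> 3);   /* 0x80, the Linux syscall gate */
        return 0;
    }
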
     7.1 --- a/xen/common/dom0_ops.c	Thu May 13 16:48:30 2004 +0000
     7.2 +++ b/xen/common/dom0_ops.c	Sat May 15 09:55:40 2004 +0000
     7.3 @@ -28,15 +28,6 @@
     7.4  
     7.5  extern unsigned int alloc_new_dom_mem(struct task_struct *, unsigned int);
     7.6  
     7.7 -/* Basically used to protect the domain-id space. */
     7.8 -static spinlock_t create_dom_lock = SPIN_LOCK_UNLOCKED;
     7.9 -
    7.10 -static domid_t get_domnr(void)
    7.11 -{
    7.12 -    static domid_t domnr = 0;
    7.13 -    return ++domnr;
    7.14 -}
    7.15 -
    7.16  static int msr_cpu_mask;
    7.17  static unsigned long msr_addr;
    7.18  static unsigned long msr_lo;
    7.19 @@ -117,23 +108,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
    7.20      case DOM0_CREATEDOMAIN:
    7.21      {
    7.22          struct task_struct *p;
    7.23 -        static unsigned int pro = 0;
    7.24 +        static domid_t    domnr = 0;
    7.25 +        static spinlock_t domnr_lock = SPIN_LOCK_UNLOCKED;
    7.26 +        unsigned int pro;
    7.27          domid_t dom;
    7.28          ret = -ENOMEM;
    7.29  
    7.30 -        spin_lock_irq(&create_dom_lock);
    7.31 -        
    7.32 -        if ( (dom = get_domnr()) == 0 ) 
    7.33 -            goto exit_create;
    7.34 +        spin_lock(&domnr_lock);
    7.35 +        dom = ++domnr;
    7.36 +        spin_unlock(&domnr_lock);
    7.37  
    7.38  	if (op->u.createdomain.cpu == -1 )
    7.39 -	    pro = (pro+1) % smp_num_cpus;
    7.40 +	    pro = (unsigned int)dom % smp_num_cpus;
    7.41  	else
    7.42  	    pro = op->u.createdomain.cpu % smp_num_cpus;
    7.43  
    7.44          p = do_createdomain(dom, pro);
    7.45          if ( p == NULL ) 
    7.46 -            goto exit_create;
    7.47 +            break;
    7.48  
    7.49  	if ( op->u.createdomain.name[0] )
    7.50          {
    7.51 @@ -145,16 +137,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
    7.52          if ( ret != 0 ) 
    7.53          {
    7.54              __kill_domain(p);
    7.55 -            goto exit_create;
    7.56 +            break;
    7.57          }
    7.58  
    7.59          ret = 0;
    7.60          
    7.61          op->u.createdomain.domain = p->domain;
    7.62          copy_to_user(u_dom0_op, op, sizeof(*op));
    7.63 - 
    7.64 -    exit_create:
    7.65 -        spin_unlock_irq(&create_dom_lock);
    7.66      }
    7.67      break;
    7.68  
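
DOM0_CREATEDOMAIN now draws the domain id from a static counter guarded by its own spinlock and, when no CPU was requested, places the new domain on dom % smp_num_cpus instead of advancing a shared round-robin counter under the old create_dom_lock. A small sketch of that allocation and placement rule, using a C11 atomic as a stand-in for the spinlock-protected counter:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long domnr;        /* stand-in for the locked counter */

    int main(void)
    {
        unsigned int smp_num_cpus = 4;         /* illustrative CPU count */

        for (int i = 0; i < 8; i++) {
            unsigned long dom = atomic_fetch_add(&domnr, 1) + 1;   /* ids start at 1 */
            printf("domain %lu -> cpu %lu\n", dom, dom % smp_num_cpus);
        }
        return 0;
    }
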
     8.1 --- a/xen/common/domain.c	Thu May 13 16:48:30 2004 +0000
     8.2 +++ b/xen/common/domain.c	Sat May 15 09:55:40 2004 +0000
     8.3 @@ -292,6 +292,12 @@ struct pfn_info *alloc_domain_page(struc
     8.4      unsigned long flags, mask, pfn_stamp, cpu_stamp;
     8.5      int i;
     8.6  
     8.7 +#ifdef NO_DEVICES_IN_XEN
     8.8 +    ASSERT(!in_irq());
     8.9 +#else
    8.10 +    ASSERT((p != NULL) || !in_irq());
    8.11 +#endif
    8.12 +
    8.13      spin_lock_irqsave(&free_list_lock, flags);
    8.14      if ( likely(!list_empty(&free_list)) )
    8.15      {
    8.16 @@ -307,7 +313,7 @@ struct pfn_info *alloc_domain_page(struc
    8.17      if ( (mask = page->u.cpu_mask) != 0 )
    8.18      {
    8.19          pfn_stamp = page->tlbflush_timestamp;
    8.20 -        for ( i = 0; (mask != 0) && (i < NR_CPUS); i++ )
    8.21 +        for ( i = 0; (mask != 0) && (i < smp_num_cpus); i++ )
    8.22          {
    8.23              if ( mask & (1<<i) )
    8.24              {
    8.25 @@ -319,11 +325,15 @@ struct pfn_info *alloc_domain_page(struc
    8.26  
    8.27          if ( unlikely(mask != 0) )
    8.28          {
    8.29 +#ifdef NO_DEVICES_IN_XEN
    8.30 +            flush_tlb_mask(mask);
    8.31 +#else
    8.32              /* In IRQ ctxt, flushing is best-effort only, to avoid deadlock. */
    8.33              if ( likely(!in_irq()) )
    8.34                  flush_tlb_mask(mask);
    8.35              else if ( unlikely(!try_flush_tlb_mask(mask)) )
    8.36                  goto free_and_exit;
    8.37 +#endif
    8.38              perfc_incrc(need_flush_tlb_flush);
    8.39          }
    8.40      }
    8.41 @@ -332,7 +342,6 @@ struct pfn_info *alloc_domain_page(struc
    8.42      page->type_and_flags = 0;
    8.43      if ( p != NULL )
    8.44      {
    8.45 -        ASSERT(!in_irq());
    8.46          wmb(); /* Domain pointer must be visible before updating refcnt. */
    8.47          spin_lock(&p->page_list_lock);
    8.48          if ( unlikely(p->tot_pages >= p->max_pages) )
    8.49 @@ -363,8 +372,7 @@ void free_domain_page(struct pfn_info *p
    8.50      unsigned long flags;
    8.51      struct task_struct *p = page->u.domain;
    8.52  
    8.53 -    if ( unlikely(in_irq()) )
    8.54 -        BUG();
    8.55 +    ASSERT(!in_irq());
    8.56  
    8.57      if ( likely(!IS_XEN_HEAP_FRAME(page)) )
    8.58      {
     9.1 --- a/xen/common/memory.c	Thu May 13 16:48:30 2004 +0000
     9.2 +++ b/xen/common/memory.c	Sat May 15 09:55:40 2004 +0000
     9.3 @@ -804,6 +804,7 @@ static int do_extended_command(unsigned 
     9.4      unsigned long pfn = ptr >> PAGE_SHIFT;
     9.5      unsigned long old_base_pfn;
     9.6      struct pfn_info *page = &frame_table[pfn];
     9.7 +    struct task_struct *p = current, *q;
     9.8  
     9.9      switch ( cmd )
    9.10      {
    9.11 @@ -852,18 +853,18 @@ static int do_extended_command(unsigned 
    9.12          break;
    9.13  
    9.14      case MMUEXT_NEW_BASEPTR:
    9.15 -        okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, current);
    9.16 +        okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, p);
    9.17          if ( likely(okay) )
    9.18          {
    9.19              invalidate_shadow_ldt();
    9.20  
    9.21              percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
    9.22 -            old_base_pfn = pagetable_val(current->mm.pagetable) >> PAGE_SHIFT;
    9.23 -            current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
    9.24 +            old_base_pfn = pagetable_val(p->mm.pagetable) >> PAGE_SHIFT;
    9.25 +            p->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
    9.26  
    9.27 -            shadow_mk_pagetable(&current->mm);
    9.28 +            shadow_mk_pagetable(&p->mm);
    9.29  
    9.30 -            write_ptbase(&current->mm);
    9.31 +            write_ptbase(&p->mm);
    9.32  
    9.33              put_page_and_type(&frame_table[old_base_pfn]);    
    9.34  
    9.35 @@ -899,13 +900,13 @@ static int do_extended_command(unsigned 
    9.36              okay = 0;
    9.37              MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
    9.38          }
    9.39 -        else if ( (current->mm.ldt_ents != ents) || 
    9.40 -                  (current->mm.ldt_base != ptr) )
    9.41 +        else if ( (p->mm.ldt_ents != ents) || 
    9.42 +                  (p->mm.ldt_base != ptr) )
    9.43          {
    9.44              invalidate_shadow_ldt();
    9.45 -            current->mm.ldt_base = ptr;
    9.46 -            current->mm.ldt_ents = ents;
    9.47 -            load_LDT(current);
    9.48 +            p->mm.ldt_base = ptr;
    9.49 +            p->mm.ldt_ents = ents;
    9.50 +            load_LDT(p);
    9.51              percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
    9.52              if ( ents != 0 )
    9.53                  percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
    9.54 @@ -921,10 +922,10 @@ static int do_extended_command(unsigned 
    9.55          percpu_info[cpu].subject_id |= 
    9.56              ((domid_t)((ptr&~0xFFFF)|(val>>16)))<<32;
    9.57  
    9.58 -        if ( !IS_PRIV(current) )
    9.59 +        if ( !IS_PRIV(p) )
    9.60          {
    9.61              MEM_LOG("Dom %llu has no privilege to set subject domain",
    9.62 -                    current->domain);
    9.63 +                    p->domain);
    9.64              okay = 0;
    9.65          }
    9.66          else
    9.67 @@ -943,19 +944,26 @@ static int do_extended_command(unsigned 
    9.68          }
    9.69          break;
    9.70  
    9.71 -        /* XXX This function is racey! */
    9.72      case MMUEXT_REASSIGN_PAGE:
    9.73 -        if ( unlikely(!IS_PRIV(current)) )
    9.74 +        if ( unlikely(!IS_PRIV(p)) )
    9.75          {
    9.76              MEM_LOG("Dom %llu has no privilege to reassign page ownership",
    9.77 -                    current->domain);
    9.78 +                    p->domain);
    9.79              okay = 0;
    9.80          }
    9.81 -        else if ( likely(percpu_info[cpu].gps != NULL) )
    9.82 +        else if ( likely((q = percpu_info[cpu].gps) != NULL) &&
    9.83 +                  likely(test_bit(_PGC_allocated, &page->count_and_flags)) &&
    9.84 +                  likely(page->u.domain == p) ) /* won't be smp-guest safe */
    9.85          {
    9.86 -            current->tot_pages--;
    9.87 -            percpu_info[cpu].gps->tot_pages++;
    9.88 -            page->u.domain = percpu_info[cpu].gps;
    9.89 +            spin_lock(&p->page_list_lock);
    9.90 +            p->tot_pages--;
    9.91 +            list_del(&page->list);
    9.92 +            spin_unlock(&p->page_list_lock);
    9.93 +            page->u.domain = q;
    9.94 +            spin_lock(&q->page_list_lock);
    9.95 +            q->tot_pages++;
    9.96 +            list_add_tail(&page->list, &q->page_list);
    9.97 +            spin_unlock(&q->page_list_lock);
    9.98          }
    9.99          else
   9.100          {
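
MMUEXT_REASSIGN_PAGE, which the removed comment flagged as racy, now only accepts pages the calling domain actually owns and moves them between the two domains' page lists with each domain's page_list_lock held. A sketch of that hand-over ordering, with pthread mutexes and a bare page counter standing in for the Xen lock and list primitives:

    #include <pthread.h>
    #include <stdio.h>

    struct domain {
        pthread_mutex_t page_list_lock;
        unsigned long   tot_pages;
    };

    /* Hand one page from p to q, touching each domain's accounting only while
       holding that domain's own lock, mirroring the new code above. */
    static void reassign_page(struct domain *p, struct domain *q)
    {
        pthread_mutex_lock(&p->page_list_lock);
        p->tot_pages--;                        /* list_del(&page->list) goes here */
        pthread_mutex_unlock(&p->page_list_lock);

        /* page->u.domain = q; */

        pthread_mutex_lock(&q->page_list_lock);
        q->tot_pages++;                        /* list_add_tail(&page->list, &q->page_list) */
        pthread_mutex_unlock(&q->page_list_lock);
    }

    int main(void)
    {
        struct domain p = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct domain q = { PTHREAD_MUTEX_INITIALIZER, 0 };
        reassign_page(&p, &q);
        printf("p.tot_pages=%lu q.tot_pages=%lu\n", p.tot_pages, q.tot_pages);
        return 0;
    }
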
    10.1 --- a/xen/include/asm-i386/pdb.h	Thu May 13 16:48:30 2004 +0000
    10.2 +++ b/xen/include/asm-i386/pdb.h	Sat May 15 09:55:40 2004 +0000
    10.3 @@ -14,6 +14,7 @@
    10.4  
    10.5  #include <asm/ptrace.h>
    10.6  #include <xen/list.h>
    10.7 +#include <hypervisor-ifs/dom0_ops.h>
    10.8  #include <hypervisor-ifs/hypervisor-if.h>                   /* for domain id */
    10.9  
   10.10  extern int pdb_initialized;
   10.11 @@ -37,6 +38,17 @@ extern int pdb_handle_exception(int exce
   10.12  extern int pdb_serial_input(u_char c, struct pt_regs *regs);
   10.13  extern void pdb_do_debug(dom0_op_t *op);
   10.14  
   10.15 +/* PDB Context. */
   10.16 +struct pdb_context
   10.17 +{
   10.18 +    int valid;
   10.19 +    int domain;
   10.20 +    int process;
   10.21 +    int system_call;              /* 0x01 break on enter, 0x02 break on exit */
   10.22 +    unsigned long ptbr;
   10.23 +};
   10.24 +extern struct pdb_context pdb_ctx;
   10.25 +
   10.26  /* Breakpoints. */
   10.27  struct pdb_breakpoint
   10.28  {
   10.29 @@ -56,4 +68,21 @@ extern char *mem2hex (char *, char *, in
   10.30  extern char *hex2mem (char *, char *, int);
   10.31  extern int   hexToInt (char **ptr, int *intValue);
   10.32  
   10.33 +/* Temporary Linux specific definitions */
   10.34 +extern int pdb_system_call;
   10.35 +extern unsigned char pdb_system_call_enter_instr;    /* original enter instr */
   10.36 +extern unsigned char pdb_system_call_leave_instr;     /* original next instr */
   10.37 +extern unsigned long pdb_system_call_next_addr;      /* instr after int 0x80 */
   10.38 +extern unsigned long pdb_system_call_eflags_addr;   /* saved eflags on stack */
   10.39 +
   10.40 +unsigned long pdb_linux_pid_ptbr (unsigned long cr3, int pid);
   10.41 +void pdb_linux_get_values(char *buffer, int length, unsigned long address,
   10.42 +			  int pid, unsigned long cr3);
   10.43 +void pdb_linux_set_values(char *buffer, int length, unsigned long address,
   10.44 +			  int pid, unsigned long cr3);
   10.45 +void pdb_linux_syscall_enter_bkpt (struct pt_regs *regs, long error_code,
   10.46 +				   trap_info_t *ti);
   10.47 +void pdb_linux_syscall_exit_bkpt (struct pt_regs *regs, 
   10.48 +				  struct pdb_context *pdb_ctx);
   10.49 +
   10.50  #endif  /* __PDB_H__ */
    11.1 --- a/xen/include/asm-i386/processor.h	Thu May 13 16:48:30 2004 +0000
    11.2 +++ b/xen/include/asm-i386/processor.h	Sat May 15 09:55:40 2004 +0000
    11.3 @@ -12,6 +12,7 @@
    11.4  #include <asm/cpufeature.h>
    11.5  #include <asm/desc.h>
    11.6  #include <asm/flushtlb.h>
    11.7 +#include <asm/pdb.h>
    11.8  #include <xen/config.h>
    11.9  #include <xen/spinlock.h>
   11.10  #include <hypervisor-ifs/hypervisor-if.h>
   11.11 @@ -406,8 +407,9 @@ extern struct desc_struct *idt_tables[];
   11.12       0, 8))
   11.13  
   11.14  #define SET_FAST_TRAP(_p)   \
   11.15 -    (memcpy(idt_tables[smp_processor_id()] + (_p)->fast_trap_idx, \
   11.16 -     &((_p)->fast_trap_desc), 8))
   11.17 +    (pdb_initialized ? (void *) 0 : \
   11.18 +       (memcpy(idt_tables[smp_processor_id()] + (_p)->fast_trap_idx, \
   11.19 +	       &((_p)->fast_trap_desc), 8)))
   11.20  
   11.21  long set_fast_trap(struct task_struct *p, int idx);
   11.22  
    12.1 --- a/xen/include/xen/perfc_defn.h	Thu May 13 16:48:30 2004 +0000
    12.2 +++ b/xen/include/xen/perfc_defn.h	Sat May 15 09:55:40 2004 +0000
    12.3 @@ -1,5 +1,6 @@
    12.4  
    12.5  PERFCOUNTER_CPU( irqs,         "#interrupts" )
    12.6 +PERFCOUNTER_CPU( ipis,         "#IPIs" )
    12.7  PERFCOUNTER_CPU( irq_time,     "cycles spent in irq handler" )
    12.8  
    12.9  PERFCOUNTER_CPU( apic_timer,   "apic timer interrupts" )
    13.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c	Thu May 13 16:48:30 2004 +0000
    13.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c	Sat May 15 09:55:40 2004 +0000
    13.3 @@ -13,19 +13,33 @@
    13.4  #include "common.h"
    13.5  #include <asm/hypervisor-ifs/dom_mem_ops.h>
    13.6  
    13.7 -static void net_tx_action(unsigned long unused);
    13.8  static void netif_page_release(struct page *page);
    13.9  static void make_tx_response(netif_t *netif, 
   13.10                               u16      id,
   13.11                               s8       st);
   13.12 -static void make_rx_response(netif_t     *netif, 
   13.13 +static int  make_rx_response(netif_t     *netif, 
   13.14                               u16          id, 
   13.15                               s8           st,
   13.16                               netif_addr_t addr,
   13.17                               u16          size);
   13.18  
   13.19 +static void net_tx_action(unsigned long unused);
   13.20  static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
   13.21  
   13.22 +static void net_rx_action(unsigned long unused);
   13.23 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
   13.24 +
   13.25 +typedef struct {
   13.26 +    u16 id;
   13.27 +    unsigned long old_mach_ptr;
   13.28 +    unsigned long new_mach_pfn;
   13.29 +    netif_t *netif;
   13.30 +} rx_info_t;
   13.31 +static struct sk_buff_head rx_queue;
   13.32 +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2];
   13.33 +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*4];
   13.34 +static unsigned char rx_notify[NR_EVENT_CHANNELS];
   13.35 +
   13.36  /* Don't currently gate addition of an interface to the tx scheduling list. */
   13.37  #define tx_work_exists(_if) (1)
   13.38  
   13.39 @@ -38,12 +52,24 @@ static unsigned long mmap_vstart;
   13.40  static u16 pending_id[MAX_PENDING_REQS];
   13.41  static netif_t *pending_netif[MAX_PENDING_REQS];
   13.42  static u16 pending_ring[MAX_PENDING_REQS];
   13.43 -static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
   13.44  typedef unsigned int PEND_RING_IDX;
   13.45  #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
   13.46  static PEND_RING_IDX pending_prod, pending_cons;
   13.47  #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
   13.48  
   13.49 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
   13.50 +static u16 dealloc_ring[MAX_PENDING_REQS];
   13.51 +static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED;
   13.52 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
   13.53 +
   13.54 +typedef struct {
   13.55 +    u16 idx;
   13.56 +    netif_tx_request_t req;
   13.57 +    netif_t *netif;
   13.58 +} tx_info_t;
   13.59 +static struct sk_buff_head tx_queue;
   13.60 +static multicall_entry_t tx_mcl[MAX_PENDING_REQS];
   13.61 +
   13.62  static struct list_head net_schedule_list;
   13.63  static spinlock_t net_schedule_list_lock;
   13.64  
   13.65 @@ -98,22 +124,12 @@ static inline void maybe_schedule_tx_act
   13.66  int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
   13.67  {
   13.68      netif_t *netif = (netif_t *)dev->priv;
   13.69 -    s8 status = NETIF_RSP_OKAY;
   13.70 -    u16 size=0, id;
   13.71 -    mmu_update_t mmu[4];
   13.72 -    multicall_entry_t mcl[2];
   13.73 -    unsigned long vdata, mdata=0, new_mfn;
   13.74  
   13.75      /* Drop the packet if the target domain has no receive buffers. */
   13.76      if ( (netif->rx_req_cons == netif->rx->req_prod) ||
   13.77           ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
   13.78 -    {
   13.79 -        dev_kfree_skb(skb);
   13.80 -        return 0;
   13.81 -    }
   13.82 +        goto drop;
   13.83  
   13.84 -    id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_req_cons++)].req.id;
   13.85 - 
   13.86      /*
   13.87       * We do not copy the packet unless:
   13.88       *  1. The data is shared; or
   13.89 @@ -130,11 +146,7 @@ int netif_be_start_xmit(struct sk_buff *
   13.90          struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
   13.91          int hlen = skb->data - skb->head;
   13.92          if ( unlikely(nskb == NULL) )
   13.93 -        {
   13.94 -            DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid);
   13.95 -            status = NETIF_RSP_ERROR;
   13.96 -            goto out;
   13.97 -        }
   13.98 +            goto drop;
   13.99          skb_reserve(nskb, hlen);
  13.100          __skb_put(nskb, skb->len);
  13.101          (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
  13.102 @@ -142,63 +154,164 @@ int netif_be_start_xmit(struct sk_buff *
  13.103          skb = nskb;
  13.104      }
  13.105  
  13.106 -    vdata = (unsigned long)skb->data;
  13.107 -    mdata = virt_to_machine(vdata);
  13.108 -    size  = skb->tail - skb->data;
  13.109 +    ((rx_info_t *)&skb->cb[0])->id    =
  13.110 +        netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_req_cons++)].req.id;
  13.111 +    ((rx_info_t *)&skb->cb[0])->netif = netif;
  13.112 +        
  13.113 +    __skb_queue_tail(&rx_queue, skb);
  13.114 +    tasklet_schedule(&net_rx_tasklet);
  13.115  
  13.116 -    new_mfn = get_new_mfn();
  13.117 +    return 0;
  13.118  
  13.119 -    mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
  13.120 -    mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;
  13.121 + drop:
  13.122 +    netif->stats.rx_dropped++;
  13.123 +    dev_kfree_skb(skb);
  13.124 +    return 0;
  13.125 +}
  13.126  
  13.127 -    mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
  13.128 -    mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
  13.129 -    mmu[2].val  = (unsigned long)(netif->domid>>16) & ~0xFFFFUL;
  13.130 -    mmu[2].ptr  = (unsigned long)(netif->domid>>32) & ~0xFFFFUL;
  13.131 -    mmu[1].ptr |= MMU_EXTENDED_COMMAND;
  13.132 -    mmu[1].val |= MMUEXT_SET_SUBJECTDOM_L;
  13.133 -    mmu[2].ptr |= MMU_EXTENDED_COMMAND;
  13.134 -    mmu[2].val |= MMUEXT_SET_SUBJECTDOM_H;
  13.135 +#if 0
  13.136 +static void xen_network_done_notify(void)
  13.137 +{
  13.138 +    static struct net_device *eth0_dev = NULL;
  13.139 +    if ( unlikely(eth0_dev == NULL) )
  13.140 +        eth0_dev = __dev_get_by_name("eth0");
  13.141 +    netif_rx_schedule(eth0_dev);
  13.142 +}
  13.143 +/* 
  13.144 + * Add following to poll() function in NAPI driver (Tigon3 is example):
  13.145 + *  if ( xen_network_done() )
   13.146 + *      tg3_enable_ints(tp); 
  13.147 + */
  13.148 +int xen_network_done(void)
  13.149 +{
  13.150 +    return skb_queue_empty(&rx_queue);
  13.151 +}
  13.152 +#endif
  13.153  
  13.154 -    mmu[3].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
  13.155 -    mmu[3].val  = MMUEXT_REASSIGN_PAGE;
  13.156 +static void net_rx_action(unsigned long unused)
  13.157 +{
  13.158 +    netif_t *netif;
  13.159 +    s8 status;
  13.160 +    u16 size, id, evtchn;
  13.161 +    mmu_update_t *mmu = rx_mmu;
  13.162 +    multicall_entry_t *mcl;
  13.163 +    unsigned long vdata, mdata, new_mfn;
  13.164 +    struct sk_buff_head rxq;
  13.165 +    struct sk_buff *skb;
  13.166 +    u16 notify_list[NETIF_RX_RING_SIZE];
  13.167 +    int notify_nr = 0;
  13.168  
  13.169 -    mcl[0].op = __HYPERVISOR_mmu_update;
  13.170 -    mcl[0].args[0] = (unsigned long)mmu;
  13.171 -    mcl[0].args[1] = 4;
  13.172 -    mcl[1].op = __HYPERVISOR_update_va_mapping;
  13.173 -    mcl[1].args[0] = vdata >> PAGE_SHIFT;
  13.174 -    mcl[1].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
  13.175 -    mcl[1].args[2] = UVMF_INVLPG;
  13.176 +    skb_queue_head_init(&rxq);
  13.177  
  13.178 -    (void)HYPERVISOR_multicall(mcl, 2);
  13.179 -    if ( mcl[0].args[5] != 0 )
  13.180 +    mcl = rx_mcl;
  13.181 +    while ( (skb = __skb_dequeue(&rx_queue)) != NULL )
  13.182      {
  13.183 -        DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid);
  13.184 -        (void)HYPERVISOR_update_va_mapping(
  13.185 -            vdata >> PAGE_SHIFT,
  13.186 -            (pte_t) { (mdata & PAGE_MASK) | __PAGE_KERNEL },
  13.187 -            UVMF_INVLPG);
  13.188 -        dealloc_mfn(new_mfn);
  13.189 -        status = NETIF_RSP_ERROR;
  13.190 -        goto out;
  13.191 +        netif   = ((rx_info_t *)&skb->cb[0])->netif;
  13.192 +        vdata   = (unsigned long)skb->data;
  13.193 +        mdata   = virt_to_machine(vdata);
  13.194 +        new_mfn = get_new_mfn();
  13.195 +        
  13.196 +        mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
  13.197 +        mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;        
  13.198 +        mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
  13.199 +        mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
  13.200 +        mmu[2].val  = (unsigned long)(netif->domid>>16) & ~0xFFFFUL;
  13.201 +        mmu[2].ptr  = (unsigned long)(netif->domid>>32) & ~0xFFFFUL;
  13.202 +        mmu[1].ptr |= MMU_EXTENDED_COMMAND;
  13.203 +        mmu[1].val |= MMUEXT_SET_SUBJECTDOM_L;
  13.204 +        mmu[2].ptr |= MMU_EXTENDED_COMMAND;
  13.205 +        mmu[2].val |= MMUEXT_SET_SUBJECTDOM_H;
  13.206 +        mmu[3].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
  13.207 +        mmu[3].val  = MMUEXT_REASSIGN_PAGE;
  13.208 +
  13.209 +        mcl[0].op = __HYPERVISOR_update_va_mapping;
  13.210 +        mcl[0].args[0] = vdata >> PAGE_SHIFT;
  13.211 +        mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
  13.212 +        mcl[0].args[2] = 0;
  13.213 +        mcl[1].op = __HYPERVISOR_mmu_update;
  13.214 +        mcl[1].args[0] = (unsigned long)mmu;
  13.215 +        mcl[1].args[1] = 4;
  13.216 +
  13.217 +        mmu += 4;
  13.218 +        mcl += 2;
  13.219 +
  13.220 +        ((rx_info_t *)&skb->cb[0])->old_mach_ptr = mdata;
  13.221 +        ((rx_info_t *)&skb->cb[0])->new_mach_pfn = new_mfn;
  13.222 +        __skb_queue_tail(&rxq, skb);
  13.223 +
  13.224 +        /* Filled the batch queue? */
  13.225 +        if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) )
  13.226 +            break;
  13.227      }
  13.228  
  13.229 -    phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn;
  13.230 +    if ( mcl == rx_mcl )
  13.231 +        return;
  13.232 +
  13.233 +    mcl[-2].args[2] = UVMF_FLUSH_TLB;
  13.234 +    (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
  13.235 +
  13.236 +    mcl = rx_mcl;
  13.237 +    while ( (skb = __skb_dequeue(&rxq)) != NULL )
  13.238 +    {
  13.239 +        netif   = ((rx_info_t *)&skb->cb[0])->netif;
  13.240 +        size    = skb->tail - skb->data;
  13.241 +        id      = ((rx_info_t *)&skb->cb[0])->id;
  13.242 +        new_mfn = ((rx_info_t *)&skb->cb[0])->new_mach_pfn;
  13.243 +        mdata   = ((rx_info_t *)&skb->cb[0])->old_mach_ptr;
  13.244  
  13.245 -    atomic_set(&(skb_shinfo(skb)->dataref), 1);
  13.246 -    skb_shinfo(skb)->nr_frags = 0;
  13.247 -    skb_shinfo(skb)->frag_list = NULL;
  13.248 +        /* Check the reassignment error code. */
  13.249 +        if ( unlikely(mcl[1].args[5] != 0) )
  13.250 +        {
  13.251 +            DPRINTK("Failed MMU update transferring to DOM%llu\n",
  13.252 +                    netif->domid);
  13.253 +            (void)HYPERVISOR_update_va_mapping(
  13.254 +                (unsigned long)skb->head >> PAGE_SHIFT,
  13.255 +                (pte_t) { (mdata & PAGE_MASK) | __PAGE_KERNEL },
  13.256 +                UVMF_INVLPG);
  13.257 +            dealloc_mfn(new_mfn);
  13.258 +            status = NETIF_RSP_ERROR;
  13.259 +        }
  13.260 +        else
  13.261 +        {
  13.262 +            phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
  13.263  
  13.264 -    netif->stats.rx_bytes += size;
  13.265 -    netif->stats.rx_packets++;
  13.266 +            atomic_set(&(skb_shinfo(skb)->dataref), 1);
  13.267 +            skb_shinfo(skb)->nr_frags = 0;
  13.268 +            skb_shinfo(skb)->frag_list = NULL;
  13.269 +
  13.270 +            netif->stats.rx_bytes += size;
  13.271 +            netif->stats.rx_packets++;
  13.272 +
  13.273 +            status = NETIF_RSP_OKAY;
  13.274 +        }
  13.275 +
  13.276 +        evtchn = netif->evtchn;
  13.277 +        if ( make_rx_response(netif, id, status, mdata, size) &&
  13.278 +             (rx_notify[evtchn] == 0) )
  13.279 +        {
  13.280 +            rx_notify[evtchn] = 1;
  13.281 +            notify_list[notify_nr++] = evtchn;
  13.282 +        }
  13.283  
  13.284 - out:
  13.285 -    spin_lock(&netif->rx_lock);
  13.286 -    make_rx_response(netif, id, status, mdata, size);
  13.287 -    spin_unlock(&netif->rx_lock);    
  13.288 -    dev_kfree_skb(skb);
  13.289 -    return 0;
  13.290 +        dev_kfree_skb(skb);
  13.291 +
  13.292 +        mcl += 2;
  13.293 +    }
  13.294 +
  13.295 +    while ( notify_nr != 0 )
  13.296 +    {
  13.297 +        evtchn = notify_list[--notify_nr];
  13.298 +        rx_notify[evtchn] = 0;
  13.299 +        notify_via_evtchn(evtchn);
  13.300 +    }
  13.301 +
  13.302 +    /* More work to do? */
  13.303 +    if ( !skb_queue_empty(&rx_queue) )
  13.304 +        tasklet_schedule(&net_rx_tasklet);
  13.305 +#if 0
  13.306 +    else
  13.307 +        xen_network_done_notify();
  13.308 +#endif
  13.309  }
  13.310  
  13.311  struct net_device_stats *netif_be_get_stats(struct net_device *dev)
  13.312 @@ -215,10 +328,12 @@ static int __on_net_schedule_list(netif_
  13.313  static void remove_from_net_schedule_list(netif_t *netif)
  13.314  {
  13.315      spin_lock(&net_schedule_list_lock);
  13.316 -    ASSERT(__on_net_schedule_list(netif));
  13.317 -    list_del(&netif->list);
  13.318 -    netif->list.next = NULL;
  13.319 -    netif_put(netif);
  13.320 +    if ( likely(__on_net_schedule_list(netif)) )
  13.321 +    {
  13.322 +        list_del(&netif->list);
  13.323 +        netif->list.next = NULL;
  13.324 +        netif_put(netif);
  13.325 +    }
  13.326      spin_unlock(&net_schedule_list_lock);
  13.327  }
  13.328  
  13.329 @@ -269,7 +384,51 @@ static void net_tx_action(unsigned long 
  13.330      u16 pending_idx;
  13.331      NETIF_RING_IDX i;
  13.332      struct page *page;
  13.333 +    multicall_entry_t *mcl;
  13.334  
  13.335 +    if ( (i = dealloc_cons) == dealloc_prod )
  13.336 +        goto skip_dealloc;
  13.337 +
  13.338 +    mcl = tx_mcl;
  13.339 +    while ( i != dealloc_prod )
  13.340 +    {
  13.341 +        pending_idx = dealloc_ring[MASK_PEND_IDX(i++)];
  13.342 +        mcl[0].op = __HYPERVISOR_update_va_mapping;
  13.343 +        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
  13.344 +        mcl[0].args[1] = 0;
  13.345 +        mcl[0].args[2] = 0;
  13.346 +        mcl++;        
  13.347 +    }
  13.348 +
  13.349 +    mcl[-1].args[2] = UVMF_FLUSH_TLB;
  13.350 +    (void)HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl);
  13.351 +
  13.352 +    while ( dealloc_cons != dealloc_prod )
  13.353 +    {
  13.354 +        pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
  13.355 +
  13.356 +        netif = pending_netif[pending_idx];
  13.357 +
  13.358 +        spin_lock(&netif->tx_lock);
  13.359 +        make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY);
  13.360 +        spin_unlock(&netif->tx_lock);
  13.361 +        
  13.362 +        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  13.363 +        
  13.364 +        /*
  13.365 +         * Scheduling checks must happen after the above response is posted.
  13.366 +         * This avoids a possible race with a guest OS on another CPU.
  13.367 +         */
  13.368 +        mb();
  13.369 +        if ( (netif->tx_req_cons != netif->tx->req_prod) &&
  13.370 +             ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
  13.371 +            add_to_net_schedule_list_tail(netif);
  13.372 +        
  13.373 +        netif_put(netif);
  13.374 +    }
  13.375 +
  13.376 + skip_dealloc:
  13.377 +    mcl = tx_mcl;
  13.378      while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
  13.379              !list_empty(&net_schedule_list) )
  13.380      {
  13.381 @@ -340,29 +499,61 @@ static void net_tx_action(unsigned long 
  13.382  
  13.383          pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
  13.384  
  13.385 -        if ( HYPERVISOR_update_va_mapping_otherdomain(
  13.386 -            MMAP_VADDR(pending_idx) >> PAGE_SHIFT,
  13.387 -            (pte_t) { (txreq.addr & PAGE_MASK) | __PAGE_KERNEL },
  13.388 -            0, netif->domid) != 0 )
  13.389 -        {
  13.390 -            DPRINTK("Bad page frame\n");
  13.391 -            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  13.392 -            netif_put(netif);
  13.393 -            continue;
  13.394 -        }
  13.395 -        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
  13.396 -            txreq.addr >> PAGE_SHIFT;
  13.397 -
  13.398          if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) )
  13.399          {
  13.400              DPRINTK("Can't allocate a skb in start_xmit.\n");
  13.401              make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  13.402              netif_put(netif);
  13.403 -            HYPERVISOR_update_va_mapping(MMAP_VADDR(pending_idx) >> PAGE_SHIFT,
  13.404 -                                         (pte_t) { 0 }, UVMF_INVLPG);
  13.405              break;
  13.406          }
  13.407 +
  13.408 +        mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
  13.409 +        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
  13.410 +        mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
  13.411 +        mcl[0].args[2] = 0;
  13.412 +        mcl[0].args[3] = (unsigned long)netif->domid;
  13.413 +        mcl[0].args[4] = (unsigned long)(netif->domid>>32);
  13.414 +        mcl++;
  13.415          
  13.416 +        ((tx_info_t *)&skb->cb[0])->idx = pending_idx;
  13.417 +        ((tx_info_t *)&skb->cb[0])->netif = netif;
  13.418 +        memcpy(&((tx_info_t *)&skb->cb[0])->req, &txreq, sizeof(txreq));
  13.419 +        __skb_queue_tail(&tx_queue, skb);
  13.420 +
  13.421 +        pending_cons++;
  13.422 +
  13.423 +        /* Filled the batch queue? */
  13.424 +        if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
  13.425 +            break;
  13.426 +    }
  13.427 +
  13.428 +    if ( mcl == tx_mcl )
  13.429 +        return;
  13.430 +
  13.431 +    (void)HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl);
  13.432 +
  13.433 +    mcl = tx_mcl;
  13.434 +    while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
  13.435 +    {
  13.436 +        pending_idx = ((tx_info_t *)&skb->cb[0])->idx;
  13.437 +        netif       = ((tx_info_t *)&skb->cb[0])->netif;
  13.438 +        memcpy(&txreq, &((tx_info_t *)&skb->cb[0])->req, sizeof(txreq));
  13.439 +
  13.440 +        /* Check the remap error code. */
  13.441 +        if ( unlikely(mcl[0].args[5] != 0) )
  13.442 +        {
  13.443 +            DPRINTK("Bad page frame\n");
  13.444 +            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
  13.445 +            netif_put(netif);
  13.446 +            kfree_skb(skb);
  13.447 +            mcl++;
  13.448 +            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  13.449 +            continue;
  13.450 +        }
  13.451 +
  13.452 +        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
  13.453 +            txreq.addr >> PAGE_SHIFT;
  13.454 +
  13.455          __skb_put(skb, PKT_PROT_LEN);
  13.456          memcpy(skb->data, 
  13.457                 (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
  13.458 @@ -391,42 +582,23 @@ static void net_tx_action(unsigned long 
  13.459          netif->stats.tx_bytes += txreq.size;
  13.460          netif->stats.tx_packets++;
  13.461  
  13.462 -        pending_cons++;
  13.463 -
  13.464          netif_rx(skb);
  13.465          netif->dev->last_rx = jiffies;
  13.466 +
  13.467 +        mcl++;
  13.468      }
  13.469  }
  13.470  
  13.471  static void netif_page_release(struct page *page)
  13.472  {
  13.473      unsigned long flags;
  13.474 -    netif_t *netif;
  13.475 -    u16 pending_idx;
  13.476 -
  13.477 -    pending_idx = page - virt_to_page(mmap_vstart);
  13.478 -
  13.479 -    netif = pending_netif[pending_idx];
  13.480 +    u16 pending_idx = page - virt_to_page(mmap_vstart);
  13.481  
  13.482 -    HYPERVISOR_update_va_mapping(MMAP_VADDR(pending_idx) >> PAGE_SHIFT,
  13.483 -                                 (pte_t) { 0 }, UVMF_INVLPG);
  13.484 -        
  13.485 -    spin_lock(&netif->tx_lock);
  13.486 -    make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY);
  13.487 -    spin_unlock(&netif->tx_lock);
  13.488 +    spin_lock_irqsave(&dealloc_lock, flags);
  13.489 +    dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
  13.490 +    spin_unlock_irqrestore(&dealloc_lock, flags);
  13.491  
  13.492 -    /*
  13.493 -     * Scheduling checks must happen after the above response is posted.
  13.494 -     * This avoids a possible race with a guest OS on another CPU.
  13.495 -     */
  13.496 -    mb();
  13.497 -    netif_schedule_work(netif);
  13.498 -
  13.499 -    netif_put(netif);
  13.500 - 
  13.501 -    spin_lock_irqsave(&pend_prod_lock, flags);
  13.502 -    pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
  13.503 -    spin_unlock_irqrestore(&pend_prod_lock, flags);
  13.504 +    tasklet_schedule(&net_tx_tasklet);
  13.505  }
  13.506  
  13.507  #if 0
  13.508 @@ -497,11 +669,11 @@ static void make_tx_response(netif_t *ne
  13.509          notify_via_evtchn(netif->evtchn);
  13.510  }
  13.511  
  13.512 -static void make_rx_response(netif_t     *netif, 
  13.513 -                             u16          id, 
  13.514 -                             s8           st,
  13.515 -                             netif_addr_t addr,
  13.516 -                             u16          size)
  13.517 +static int make_rx_response(netif_t     *netif, 
  13.518 +                            u16          id, 
  13.519 +                            s8           st,
  13.520 +                            netif_addr_t addr,
  13.521 +                            u16          size)
  13.522  {
  13.523      NET_RING_IDX i = netif->rx_resp_prod;
  13.524      netif_rx_response_t *resp;
  13.525 @@ -516,8 +688,7 @@ static void make_rx_response(netif_t    
  13.526      netif->rx->resp_prod = netif->rx_resp_prod = ++i;
  13.527  
  13.528      mb(); /* Update producer before checking event threshold. */
  13.529 -    if ( i == netif->rx->event )
  13.530 -        notify_via_evtchn(netif->evtchn);
  13.531 +    return (i == netif->rx->event);
  13.532  }
  13.533  
  13.534  static int __init init_module(void)
  13.535 @@ -527,6 +698,9 @@ static int __init init_module(void)
  13.536      if ( !(start_info.flags & SIF_INITDOMAIN) )
  13.537          return 0;
  13.538  
  13.539 +    skb_queue_head_init(&rx_queue);
  13.540 +    skb_queue_head_init(&tx_queue);
  13.541 +
  13.542      netif_interface_init();
  13.543  
  13.544      if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
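
The backend hunks above replace the per-packet HYPERVISOR_update_va_mapping / update_va_mapping_otherdomain hypercalls with batched multicalls: entries are queued into a multicall_entry_t array (tx_mcl), issued in one HYPERVISOR_multicall(), and each entry's status is then read back from args[5] (see the "Check the reassignment error code" and "Check the remap error code" tests). Below is a minimal, self-contained sketch of that batch-then-check pattern only; mc_entry_t, fake_multicall() and the op values are invented stand-ins for illustration and are not the real Xen ABI.

/* Sketch of the batch-then-check pattern used in net_rx_action()/net_tx_action().
 * All names here are illustrative stand-ins, not Xen interfaces. */
#include <stdio.h>

#define BATCH_SIZE 8

typedef struct {
    unsigned long op;       /* which operation to perform             */
    unsigned long args[6];  /* args[5] doubles as the per-call result */
} mc_entry_t;

/* Stand-in for a batched hypercall: execute each queued entry and
 * record its status in args[5] (0 = success, non-zero = failure). */
static void fake_multicall(mc_entry_t *calls, int nr)
{
    for (int i = 0; i < nr; i++)
        calls[i].args[5] = (calls[i].args[0] == 0) ? 1 : 0; /* fail on a "bad" address */
}

int main(void)
{
    mc_entry_t mcl_array[BATCH_SIZE], *mcl = mcl_array;
    unsigned long addrs[] = { 0x1000, 0x0, 0x3000 };   /* second entry will fail */

    /* Queue one entry per buffer instead of issuing one call per packet. */
    for (unsigned i = 0; i < sizeof(addrs)/sizeof(addrs[0]); i++) {
        mcl->op      = 1;          /* stand-in op code */
        mcl->args[0] = addrs[i];
        mcl->args[1] = 0;
        mcl->args[2] = 0;
        mcl++;
    }

    /* One batched call covers the whole queue... */
    fake_multicall(mcl_array, mcl - mcl_array);

    /* ...then walk the queue again and check each per-entry status,
     * as the backend does with mcl[0].args[5] after the multicall. */
    for (mc_entry_t *m = mcl_array; m != mcl; m++) {
        if (m->args[5] != 0)
            printf("entry %ld failed\n", (long)(m - mcl_array));
        else
            printf("entry %ld ok\n", (long)(m - mcl_array));
    }
    return 0;
}

Batching this way amortizes the hypervisor-call overhead across the whole ring, and the per-entry status lets the error paths (dealloc_mfn()/NETIF_RSP_ERROR on the rx side, kfree_skb()/NETIF_RSP_ERROR on the tx side) be handled after the fact from the recorded result rather than at each individual call site.
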
    14.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c	Thu May 13 16:48:30 2004 +0000
    14.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c	Sat May 15 09:55:40 2004 +0000
    14.3 @@ -37,6 +37,10 @@ static void network_tx_buf_gc(struct net
    14.4  static void network_alloc_rx_buffers(struct net_device *dev);
    14.5  static void cleanup_module(void);
    14.6  
    14.7 +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
    14.8 +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
    14.9 +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
   14.10 +
   14.11  static struct list_head dev_list;
   14.12  
   14.13  struct net_private
   14.14 @@ -178,8 +182,7 @@ static void network_alloc_rx_buffers(str
   14.15      struct sk_buff *skb;
   14.16      NETIF_RING_IDX i = np->rx->req_prod;
   14.17      dom_mem_op_t op;
   14.18 -    unsigned long pfn_array[NETIF_RX_RING_SIZE];
   14.19 -    int ret, nr_pfns = 0;
   14.20 +    int nr_pfns = 0;
   14.21  
   14.22      /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
   14.23      if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
   14.24 @@ -201,9 +204,14 @@ static void network_alloc_rx_buffers(str
   14.25  
   14.26          np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id;
   14.27          
   14.28 -        pfn_array[nr_pfns++] = virt_to_machine(skb->head) >> PAGE_SHIFT;
   14.29 -        HYPERVISOR_update_va_mapping((unsigned long)skb->head >> PAGE_SHIFT,
   14.30 -                                     (pte_t) { 0 }, UVMF_INVLPG);
   14.31 +        rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
   14.32 +
   14.33 +        rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
   14.34 +        rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
   14.35 +        rx_mcl[nr_pfns].args[1] = 0;
   14.36 +        rx_mcl[nr_pfns].args[2] = 0;
   14.37 +
   14.38 +        nr_pfns++;
   14.39      }
   14.40      while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
   14.41  
   14.42 @@ -213,14 +221,22 @@ static void network_alloc_rx_buffers(str
   14.43       */
   14.44      flush_page_update_queue();
   14.45  
   14.46 +    /* After all PTEs have been zapped we blow away stale TLB entries. */
   14.47 +    rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
   14.48 +
   14.49 +    /* Give away a batch of pages. */
   14.50      op.op = MEMOP_RESERVATION_DECREASE;
   14.51      op.u.decrease.size  = nr_pfns;
   14.52 -    op.u.decrease.pages = pfn_array;
   14.53 -    if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns )
   14.54 -    {
   14.55 -        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
   14.56 -        BUG();
   14.57 -    }
   14.58 +    op.u.decrease.pages = rx_pfn_array;
   14.59 +    rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
   14.60 +    rx_mcl[nr_pfns].args[0] = (unsigned long)&op;
   14.61 +
   14.62 +    /* Zap PTEs and give away pages in one big multicall. */
   14.63 +    (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
   14.64 +
   14.65 +    /* Check return status of HYPERVISOR_dom_mem_op(). */
   14.66 +    if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
   14.67 +        panic("Unable to reduce memory reservation\n");
   14.68  
   14.69      np->rx->req_prod = i;
   14.70  }
   14.71 @@ -295,17 +311,36 @@ static void netif_int(int irq, void *dev
   14.72      struct net_device *dev = dev_id;
   14.73      struct net_private *np = dev->priv;
   14.74      unsigned long flags;
   14.75 -    struct sk_buff *skb;
   14.76 -    netif_rx_response_t *rx;
   14.77 -    NETIF_RING_IDX i;
   14.78 -    mmu_update_t mmu;
   14.79  
   14.80      spin_lock_irqsave(&np->tx_lock, flags);
   14.81      network_tx_buf_gc(dev);
   14.82      spin_unlock_irqrestore(&np->tx_lock, flags);
   14.83  
   14.84 - again:
   14.85 -    for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ )
   14.86 +    if ( np->rx_resp_cons != np->rx->resp_prod )
   14.87 +        netif_rx_schedule(dev);
   14.88 +}
   14.89 +
   14.90 +
   14.91 +static int netif_poll(struct net_device *dev, int *pbudget)
   14.92 +{
   14.93 +    struct net_private *np = dev->priv;
   14.94 +    struct sk_buff *skb;
   14.95 +    netif_rx_response_t *rx;
   14.96 +    NETIF_RING_IDX i;
   14.97 +    mmu_update_t *mmu = rx_mmu;
   14.98 +    multicall_entry_t *mcl = rx_mcl;
   14.99 +    int work_done, budget, more_to_do = 1;
  14.100 +    struct sk_buff_head rxq;
  14.101 +    unsigned long flags;
  14.102 +
  14.103 +    skb_queue_head_init(&rxq);
  14.104 +
  14.105 +    if ( (budget = *pbudget) > dev->quota )
  14.106 +        budget = dev->quota;
  14.107 +
  14.108 +    for ( i = np->rx_resp_cons, work_done = 0; 
  14.109 +          (i != np->rx->resp_prod) && (work_done < budget); 
  14.110 +          i++, work_done++ )
  14.111      {
  14.112          rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp;
  14.113  
  14.114 @@ -317,38 +352,53 @@ static void netif_int(int irq, void *dev
  14.115              /* Gate this error. We get a (valid) slew of them on suspend. */
  14.116              if ( np->state == NETIF_STATE_ACTIVE )
  14.117                  printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
  14.118 -            dev_kfree_skb_any(skb);
  14.119 +            dev_kfree_skb(skb);
  14.120              continue;
  14.121          }
  14.122  
  14.123 +        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
  14.124 +        skb_put(skb, rx->status);
  14.125 +
  14.126 +        np->stats.rx_packets++;
  14.127 +        np->stats.rx_bytes += rx->status;
  14.128 +
  14.129          /* Remap the page. */
  14.130 -        mmu.ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
  14.131 -        mmu.val  = __pa(skb->head) >> PAGE_SHIFT;
  14.132 -        if ( HYPERVISOR_mmu_update(&mmu, 1) != 0 )
  14.133 -            BUG();
  14.134 -        HYPERVISOR_update_va_mapping((unsigned long)skb->head >> PAGE_SHIFT,
  14.135 -                                     (pte_t) { (rx->addr & PAGE_MASK) | 
  14.136 -                                                   __PAGE_KERNEL },
  14.137 -                                     0);
  14.138 +        mmu->ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
  14.139 +        mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
  14.140 +        mmu++;
  14.141 +        mcl->op = __HYPERVISOR_update_va_mapping;
  14.142 +        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
  14.143 +        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
  14.144 +        mcl->args[2] = 0;
  14.145 +        mcl++;
  14.146 +
  14.147          phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
  14.148              rx->addr >> PAGE_SHIFT;
  14.149  
  14.150 -        /*
  14.151 -         * Set up shinfo -- from alloc_skb This was particularily nasty:  the
  14.152 -         * shared info is hidden at the back of the data area (presumably so it
  14.153 -         * can be shared), but on page flip it gets very spunked.
  14.154 -         */
  14.155 +        __skb_queue_tail(&rxq, skb);
  14.156 +    }
  14.157 +
  14.158 +    /* Do all the remapping work, and M->P updates, in one big hypercall. */
  14.159 +    if ( likely((mcl - rx_mcl) != 0) )
  14.160 +    {
  14.161 +        mcl->op = __HYPERVISOR_mmu_update;
  14.162 +        mcl->args[0] = (unsigned long)rx_mmu;
  14.163 +        mcl->args[1] = mmu - rx_mmu;
  14.164 +        mcl++;
  14.165 +        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
  14.166 +    }
  14.167 +
  14.168 +    while ( (skb = __skb_dequeue(&rxq)) != NULL )
  14.169 +    {
  14.170 +        /* Set the shared-info area, which is hidden behind the real data. */
  14.171          atomic_set(&(skb_shinfo(skb)->dataref), 1);
  14.172          skb_shinfo(skb)->nr_frags = 0;
  14.173          skb_shinfo(skb)->frag_list = NULL;
  14.174  
  14.175 -        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
  14.176 -        skb_put(skb, rx->status);
  14.177 +        /* Ethernet-specific work. Delayed to here as it peeks the header. */
  14.178          skb->protocol = eth_type_trans(skb, dev);
  14.179  
  14.180 -        np->stats.rx_packets++;
  14.181 -
  14.182 -        np->stats.rx_bytes += rx->status;
  14.183 +        /* Pass it up. */
  14.184          netif_rx(skb);
  14.185          dev->last_rx = jiffies;
  14.186      }
  14.187 @@ -356,12 +406,28 @@ static void netif_int(int irq, void *dev
  14.188      np->rx_resp_cons = i;
  14.189  
  14.190      network_alloc_rx_buffers(dev);
  14.191 -    np->rx->event = np->rx_resp_cons + 1;
  14.192 +
  14.193 +    *pbudget   -= work_done;
  14.194 +    dev->quota -= work_done;
  14.195 +
  14.196 +    if ( work_done < budget )
  14.197 +    {
  14.198 +        local_irq_save(flags);
  14.199 +
  14.200 +        np->rx->event = i + 1;
  14.201      
  14.202 -    /* Deal with hypervisor racing our resetting of rx_event. */
  14.203 -    mb();
  14.204 -    if ( np->rx->resp_prod != i )
  14.205 -        goto again;
  14.206 +        /* Deal with hypervisor racing our resetting of rx_event. */
  14.207 +        mb();
  14.208 +        if ( np->rx->resp_prod == i )
  14.209 +        {
  14.210 +            __netif_rx_complete(dev);
  14.211 +            more_to_do = 0;
  14.212 +        }
  14.213 +
  14.214 +        local_irq_restore(flags);
  14.215 +    }
  14.216 +
  14.217 +    return more_to_do;
  14.218  }
  14.219  
  14.220  
  14.221 @@ -524,6 +590,8 @@ static int __init init_module(void)
  14.222      dev->hard_start_xmit = network_start_xmit;
  14.223      dev->stop            = network_close;
  14.224      dev->get_stats       = network_get_stats;
  14.225 +    dev->poll            = netif_poll;
  14.226 +    dev->weight          = 64;
  14.227      
  14.228      if ( (err = register_netdev(dev)) != 0 )
  14.229      {