ia64/xen-unstable

changeset 1060:a4c34e4a7c07

bitkeeper revision 1.699 (401c05c9TV2zsaZ_e3zpy-zaKxCetw)

timer.c, timer.h, sched.h:
new file
Many files:
Rolf's new timer interface, plus various cleanups.
author kaf24@scramble.cl.cam.ac.uk
date Sat Jan 31 19:45:13 2004 +0000 (2004-01-31)
parents 871510593652
children 9ccd60698e09
files .rootkeys docs/interface.tex extras/mini-os/h/hypervisor.h extras/mini-os/time.c tools/misc/Makefile tools/misc/xen_read_console.c tools/xc/lib/Makefile xen/arch/i386/entry.S xen/arch/i386/time.c xen/common/ac_timer.c xen/common/keyhandler.c xen/common/schedule.c xen/drivers/block/xen_vbd.c xen/include/hypervisor-ifs/hypervisor-if.h xen/include/xeno/sched.h xen/net/dev.c xenolinux-2.4.24-sparse/arch/xeno/config.in xenolinux-2.4.24-sparse/arch/xeno/defconfig xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h xenolinux-2.4.24-sparse/include/linux/sched.h xenolinux-2.4.24-sparse/include/linux/timer.h xenolinux-2.4.24-sparse/kernel/panic.c xenolinux-2.4.24-sparse/kernel/timer.c
line diff
     1.1 --- a/.rootkeys	Fri Jan 30 15:55:06 2004 +0000
     1.2 +++ b/.rootkeys	Sat Jan 31 19:45:13 2004 +0000
     1.3 @@ -581,11 +581,14 @@ 3f1056a9L_kqHcFheV00KbKBzv9j5w xenolinux
     1.4  3f689063nhrIRsMMZjZxMFk7iEINqQ xenolinux-2.4.24-sparse/include/asm-xeno/xeno_proc.h
     1.5  3f056927gMHl7mWB89rb73JahbhQIA xenolinux-2.4.24-sparse/include/linux/blk.h
     1.6  3e5a4e68WLX3B8owTvktP3HHOtznPQ xenolinux-2.4.24-sparse/include/linux/major.h
     1.7 +401c0590D_kwJDU59X8NyvqSv_Cl2A xenolinux-2.4.24-sparse/include/linux/sched.h
     1.8  3e5a4e686V0nioX2ZpFf056sgvdiQw xenolinux-2.4.24-sparse/include/linux/sunrpc/debug.h
     1.9 +401c0592pLrp_aCbQRo9GXiYQQaVVA xenolinux-2.4.24-sparse/include/linux/timer.h
    1.10  3e5a4e68W_hpMlM3u_-QOKMp3gzcwQ xenolinux-2.4.24-sparse/init/do_mounts.c
    1.11  3e5a4e68TJJavrunYwTAnLRSBxSYqQ xenolinux-2.4.24-sparse/kernel/panic.c
    1.12  3f1056a9LXNTgSzITNh1mb-MIKV1Ng xenolinux-2.4.24-sparse/kernel/printk.c
    1.13  3f9d4b44247udoqWEgFkaHiWv6Uvyg xenolinux-2.4.24-sparse/kernel/time.c
    1.14 +401c059bjLBFYHRD4Py2uM3eA1D4zQ xenolinux-2.4.24-sparse/kernel/timer.c
    1.15  3eba8f878XjouY21EkQBXwYBsPsipQ xenolinux-2.4.24-sparse/lndir-rel
    1.16  3e6e7c1efbQe93xCvOpOVCnXTMmQ5w xenolinux-2.4.24-sparse/mkbuildtree
    1.17  3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.24-sparse/mm/memory.c
     2.1 --- a/docs/interface.tex	Fri Jan 30 15:55:06 2004 +0000
     2.2 +++ b/docs/interface.tex	Sat Jan 31 19:45:13 2004 +0000
     2.3 @@ -117,18 +117,20 @@ time.
     2.4  
     2.5  
     2.6  \section{Cycle counter time}
     2.7 -This provides the finest-grained, free-running time reference, with the approximate
     2.8 -frequency being publicly accessible. The cycle counter time is used to accurately
     2.9 -extrapolate the other time references. On SMP machines it is currently assumed
    2.10 -that the cycle counter time is synchronised between CPUs. The current x86-based
    2.11 -implementation achieves this within inter-CPU communication latencies.
    2.12 +This provides the finest-grained, free-running time reference, with the
    2.13 +approximate frequency being publicly accessible. The cycle counter time is
    2.14 +used to accurately extrapolate the other time references. On SMP machines
    2.15 +it is currently assumed that the cycle counter time is synchronised between
    2.16 +CPUs. The current x86-based implementation achieves this within inter-CPU
    2.17 +communication latencies.
    2.18  
    2.19  \section{System time}
    2.20 -This is a 64-bit value containing the nanoseconds elapsed since boot time. Unlike
    2.21 -cycle counter time, system time accurately reflects the passage of real time, i.e.
    2.22 -it is adjusted several times a second for timer drift. This is done by running an
    2.23 -NTP client in {\it domain0} on behalf of the machine, feeding updates to the 
    2.24 -hypervisor. Intermediate values can be extrapolated using the cycle counter. 
    2.25 +This is a 64-bit value containing the nanoseconds elapsed since boot
    2.26 +time. Unlike cycle counter time, system time accurately reflects the
    2.27 +passage of real time, i.e.  it is adjusted several times a second for timer
    2.28 +drift. This is done by running an NTP client in {\it domain0} on behalf of
    2.29 +the machine, feeding updates to the hypervisor. Intermediate values can be
    2.30 +extrapolated using the cycle counter.
    2.31  
    2.32  \section{Wall clock time}
    2.33  This is the actual ``time of day'' Unix style struct timeval (i.e. seconds and
    2.34 @@ -140,10 +142,39 @@ and remain perfectly in time.
    2.35  
    2.36  
    2.37  \section{Domain virtual time}
    2.38 -This progresses at the same pace as cycle counter time, but only while a domain
    2.39 -is executing. It stops while a domain is de-scheduled. Therefore the share of the 
    2.40 -CPU that a domain receives is indicated by the rate at which its domain virtual
    2.41 -time increases, relative to the rate at which cycle counter time does so.
    2.42 +This progresses at the same pace as cycle counter time, but only while a
    2.43 +domain is executing. It stops while a domain is de-scheduled. Therefore the
    2.44 +share of the CPU that a domain receives is indicated by the rate at which
    2.45 +its domain virtual time increases, relative to the rate at which cycle
    2.46 +counter time does so.
    2.47 +
    2.48 +\section{Time interface}
    2.49 +Xen exports some timestamps to guest operating systems through their shared
    2.50 +info page. Timestamps are provided for system time and wall-clock time. Xen
     2.51 +also provides the cycle counter values at the time of the last update,
     2.52 +allowing guests to calculate the current values. The CPU frequency and a
     2.53 +scaling factor are provided for guests to convert cycle counter values to
     2.54 +real time. Since all timestamps need to be updated and read
     2.55 +\emph{atomically}, two version numbers are also stored in the shared info
    2.56 +page.
    2.57 +
     2.58 +Xen will ensure that the timestamps are updated frequently enough to avoid
     2.59 +an overflow of the cycle counter values. A guest can check whether its
     2.60 +notion of time is up-to-date by comparing the version numbers.
    2.61 +
    2.62 +\section{Timer events}
    2.63 +
    2.64 +Xen maintains a periodic timer (currently with a 10ms period) which sends a
    2.65 +timer event to the currently executing domain. This allows Guest OSes to
     2.66 +keep track of the passage of time while executing. The scheduler also
     2.67 +arranges for a newly activated domain to receive a timer event when it is
     2.68 +scheduled, so that the Guest OS can account for the time that has passed
     2.69 +while it was inactive.
    2.70 +
    2.71 +In addition, Xen exports a hypercall interface to each domain which allows
     2.72 +it to request that a timer event be sent to it at the specified system
     2.73 +time. Guest OSes may use this timer to implement timeout values when they
    2.74 +block.
    2.75  
    2.76  \chapter{Memory}
    2.77  
    2.78 @@ -371,7 +402,15 @@ Notify hypervisor of updates to transmit
    2.79  Notify hypervisor that fpu registers needed to be save on context switch.
    2.80  
    2.81  \section{ sched\_op(unsigned long op)} 
    2.82 -Request scheduling operation from hypervisor. The options are: yield, stop, and exit.
     2.83 +Request a scheduling operation from the hypervisor. The options are:
     2.84 +{\it yield}, {\it block}, {\it stop}, and {\it exit}. {\it yield} keeps the
     2.85 +calling domain runnable but may cause a reschedule if other domains are
     2.86 +runnable. {\it block} removes the calling domain from the run queue and the
     2.87 +domain sleeps until an event is delivered to it. {\it stop} and {\it exit}
    2.88 +should be self-explanatory.
    2.89 +
    2.90 +\section{ set\_dom\_timer(dom\_timer\_arg\_t *timer\_arg)} 
    2.91 +Request a timer event to be sent at the specified system time.
    2.92  
    2.93  \section{ dom0\_op(dom0\_op\_t *op)} 
    2.94  Administrative domain operations for domain management. The options are:
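The shared-info time interface described above is essentially a seqlock-style protocol. A minimal guest-side sketch of the consistent read, assuming Xen bumps time_version1 before and time_version2 after rewriting the stamps (the struct and field names below are illustrative, not the exact shared_info_t layout):

    #include <stdint.h>

    /* Illustrative layout only; the real fields live in shared_info_t. */
    struct shared_time {
        uint32_t time_version1;   /* assumed: bumped by Xen before an update */
        uint32_t time_version2;   /* assumed: bumped by Xen after an update  */
        uint64_t system_time;     /* ns since boot at the last update        */
        uint64_t tsc_timestamp;   /* cycle counter value at the last update  */
    };

    static struct shared_time shadow;

    /* Copy a consistent snapshot of the time stamps. If Xen updated them
     * while we were reading, the two version numbers disagree and we retry.
     * (A real implementation would also need read memory barriers.) */
    static void get_time_values(volatile struct shared_time *s)
    {
        do {
            shadow.time_version2 = s->time_version2;
            shadow.system_time   = s->system_time;
            shadow.tsc_timestamp = s->tsc_timestamp;
        } while ( shadow.time_version2 != s->time_version1 );
    }

The __get_time_values_from_xen() and get_s_time() changes later in this changeset follow the same retry pattern.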
     3.1 --- a/extras/mini-os/h/hypervisor.h	Fri Jan 30 15:55:06 2004 +0000
     3.2 +++ b/extras/mini-os/h/hypervisor.h	Sat Jan 31 19:45:13 2004 +0000
     3.3 @@ -1,3 +1,10 @@
     3.4 +/******************************************************************************
     3.5 + * hypervisor.h
     3.6 + * 
     3.7 + * Linux-specific hypervisor handling.
     3.8 + * 
     3.9 + * Copyright (c) 2002, K A Fraser
    3.10 + */
    3.11  
    3.12  #ifndef _HYPERVISOR_H_
    3.13  #define _HYPERVISOR_H_
    3.14 @@ -135,6 +142,17 @@ static __inline__ int HYPERVISOR_yield(v
    3.15      return ret;
    3.16  }
    3.17  
    3.18 +static __inline__ int HYPERVISOR_block(void)
    3.19 +{
    3.20 +    int ret;
    3.21 +    __asm__ __volatile__ (
    3.22 +        TRAP_INSTR
    3.23 +        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
    3.24 +        "b" (SCHEDOP_block) );
    3.25 +
    3.26 +    return ret;
    3.27 +}
    3.28 +
    3.29  static __inline__ int HYPERVISOR_exit(void)
    3.30  {
    3.31      int ret;
    3.32 @@ -146,13 +164,25 @@ static __inline__ int HYPERVISOR_exit(vo
    3.33      return ret;
    3.34  }
    3.35  
    3.36 -static __inline__ int HYPERVISOR_stop(void)
    3.37 +static __inline__ int HYPERVISOR_stop(unsigned long srec)
    3.38 +{
    3.39 +    int ret;
    3.40 +    /* NB. On suspend, control software expects a suspend record in %esi. */
    3.41 +    __asm__ __volatile__ (
    3.42 +        TRAP_INSTR
    3.43 +        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
    3.44 +        "b" (SCHEDOP_stop), "S" (srec) : "memory" );
    3.45 +
    3.46 +    return ret;
    3.47 +}
    3.48 +
    3.49 +static __inline__ long HYPERVISOR_set_dom_timer(void *timer_arg)
    3.50  {
    3.51      int ret;
    3.52      __asm__ __volatile__ (
    3.53          TRAP_INSTR
    3.54 -        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
    3.55 -        "b" (SCHEDOP_stop) );
    3.56 +        : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer),
    3.57 +        "b" (timer_arg) : "memory" );
    3.58  
    3.59      return ret;
    3.60  }
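Together with the one-shot domain timer described in the interface document, the wrappers above let a guest sleep until either an event or a deadline arrives. A rough sketch, assuming the timer argument is simply a pointer to the desired expiry in system time (the real dom_timer_arg_t layout is defined by the hypervisor interface, and block_until() is a hypothetical helper):

    #include <stdint.h>

    typedef int64_t s_time_t;                /* system time: ns since boot */

    /* Wrappers as declared in hypervisor.h above. */
    extern long HYPERVISOR_set_dom_timer(void *timer_arg);
    extern int  HYPERVISOR_block(void);
    extern s_time_t get_s_time(void);        /* see mini-os time.c below */

    /* Sleep until 'deadline' or until any event is delivered, whichever
     * comes first. Illustration only. */
    static void block_until(s_time_t deadline)
    {
        if ( get_s_time() >= deadline )
            return;                          /* deadline already passed */

        /* Arm the per-domain one-shot timer, then give up the CPU; the
         * timer event (or any other event) makes the domain runnable. */
        HYPERVISOR_set_dom_timer(&deadline);
        HYPERVISOR_block();
    }

The XenoLinux cpu_idle() change later in this changeset uses the same set-timer-then-block idiom, adding a need_resched check under __cli() to close the race against events that arrive in between.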
     4.1 --- a/extras/mini-os/time.c	Fri Jan 30 15:55:06 2004 +0000
     4.2 +++ b/extras/mini-os/time.c	Sat Jan 31 19:45:13 2004 +0000
     4.3 @@ -1,20 +1,14 @@
     4.4  /* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
     4.5   ****************************************************************************
     4.6   * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
     4.7 + * (C) 2002-2003 - Keir Fraser - University of Cambridge 
     4.8   ****************************************************************************
     4.9   *
    4.10   *        File: time.c
    4.11 - *      Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk)
    4.12 - *     Changes: 
    4.13 - *              
    4.14 - *        Date: Jul 2003
    4.15 - * 
    4.16 - * Environment: Xen Minimal OS
    4.17 + *      Author: Rolf Neugebauer and Keir Fraser
    4.18 + *
    4.19   * Description: Simple time and timer functions
    4.20   *
    4.21 - ****************************************************************************
    4.22 - * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
    4.23 - ****************************************************************************
    4.24   * Permission is hereby granted, free of charge, to any person obtaining a copy
    4.25   * of this software and associated documentation files (the "Software"), to
    4.26   * deal in the Software without restriction, including without limitation the
    4.27 @@ -105,6 +99,29 @@ static __inline__ unsigned long get_time
    4.28      return (unsigned long)delta;
    4.29  }
    4.30  
    4.31 +s64 get_s_time (void)
    4.32 +{
    4.33 +    u64 u_delta;
    4.34 +    s64 ret;
    4.35 +
    4.36 + again:
    4.37 +
    4.38 +    u_delta = get_time_delta_usecs();
    4.39 +    ret = shadow_system_time + (1000 * u_delta);
    4.40 +
    4.41 +    if ( unlikely(!TIME_VALUES_UP_TO_DATE) )
    4.42 +    {
    4.43 +        /*
    4.44 +         * We may have blocked for a long time, rendering our calculations
    4.45 +         * invalid (e.g. the time delta may have overflowed). Detect that
    4.46 +         * and recalculate with fresh values.
    4.47 +         */
    4.48 +        get_time_values_from_xen();
    4.49 +        goto again;
    4.50 +    }
    4.51 +
    4.52 +    return ret;
    4.53 +}
    4.54  
    4.55  void gettimeofday(struct timeval *tv)
    4.56  {
    4.57 @@ -123,11 +140,16 @@ void gettimeofday(struct timeval *tv)
    4.58  }
    4.59  
    4.60  
    4.61 +/*
    4.62 + * Just a dummy 
    4.63 + */
    4.64  static void timer_handler(int ev, struct pt_regs *regs)
    4.65  {
    4.66      static int i;
    4.67      struct timeval tv;
    4.68  
    4.69 +    get_time_values_from_xen();
    4.70 +
    4.71      i++;
    4.72      if (i >= 1000) {
    4.73          gettimeofday(&tv);
     5.1 --- a/tools/misc/Makefile	Fri Jan 30 15:55:06 2004 +0000
     5.2 +++ b/tools/misc/Makefile	Sat Jan 31 19:45:13 2004 +0000
     5.3 @@ -16,7 +16,7 @@ all: $(TARGETS)
     5.4  
     5.5  install: all
     5.6  	mkdir -p /usr/bin
     5.7 -	cp -a $(INSTALL) /usr/bin
     5.8 +	cp $(INSTALL) /usr/bin
     5.9  	chmod 755 /usr/bin/xen-mkdevnodes
    5.10  	chmod 755 /usr/bin/xen_nat_enable
    5.11  	chmod 755 /usr/bin/xen-clone
    5.12 @@ -24,7 +24,7 @@ install: all
    5.13  
    5.14  dist: all
    5.15  	mkdir -p ../../../install/bin
    5.16 -	cp -a $(INSTALL) ../../../install/bin
    5.17 +	cp $(INSTALL) ../../../install/bin
    5.18  	chmod 755 ../../../install/bin/xen-mkdevnodes
    5.19  	chmod 755 ../../../install/bin/xen_nat_enable
    5.20  	chmod 755 ../../../install/bin/xen-clone
     6.1 --- a/tools/misc/xen_read_console.c	Fri Jan 30 15:55:06 2004 +0000
     6.2 +++ b/tools/misc/xen_read_console.c	Sat Jan 31 19:45:13 2004 +0000
     6.3 @@ -11,9 +11,9 @@
     6.4  
     6.5  int main(void)
     6.6  {
     6.7 -    unsigned char buf[208];
     6.8 +    unsigned char buf[208], filtered[208];
     6.9      struct sockaddr_in addr, from;
    6.10 -    int fromlen = sizeof(from);
    6.11 +    int fromlen = sizeof(from), i, j;
    6.12      int len, fd = socket(PF_INET, SOCK_DGRAM, 0);
    6.13      
    6.14      if ( fd < 0 )
    6.15 @@ -46,7 +46,12 @@ int main(void)
    6.16          if ( buf[len-1] != '\n' ) { buf[len] = '\n'; len++; }
    6.17          buf[len] = '\0';
    6.18  
    6.19 -        printf("[%d] %s", ntohs(from.sin_port),buf);
    6.20 +        for ( i = 0, j = 0; i < len; i++ )
    6.21 +            if ( (buf[i] == '\n') || (buf[i] == '\0') ||
    6.22 +                 ((buf[i] >= 32) && (buf[i] <= 126)) )
    6.23 +                filtered[j++] = buf[i];
     6.24 +        filtered[j] = '\0';
    6.25 +        printf("[%d] %s", ntohs(from.sin_port), filtered);
    6.26  
    6.27          fromlen = sizeof(from);
    6.28      }
     7.1 --- a/tools/xc/lib/Makefile	Fri Jan 30 15:55:06 2004 +0000
     7.2 +++ b/tools/xc/lib/Makefile	Sat Jan 31 19:45:13 2004 +0000
     7.3 @@ -21,17 +21,17 @@ check-for-zlib:
     7.4  install: all
     7.5  	mkdir -p /usr/lib
     7.6  	mkdir -p /usr/include
     7.7 -	cp -a $(LIB) /usr/lib
     7.8 +	cp $(LIB) /usr/lib
     7.9  	chmod 755 /usr/lib/$(LIB)
    7.10 -	cp -a xc.h /usr/include
    7.11 +	cp xc.h /usr/include
    7.12  	chmod 644 /usr/include/xc.h
    7.13  
    7.14  dist: all
    7.15  	mkdir -p ../../../../install/lib
    7.16  	mkdir -p ../../../../install/include
    7.17 -	cp -a $(LIB) ../../../../install/lib
    7.18 +	cp $(LIB) ../../../../install/lib
    7.19  	chmod 755 ../../../../install/lib/$(LIB)
    7.20 -	cp -a xc.h ../../../../install/include
    7.21 +	cp xc.h ../../../../install/include
    7.22  	chmod 644 ../../../../install/include/xc.h
    7.23  
    7.24  clean:
     8.1 --- a/xen/arch/i386/entry.S	Fri Jan 30 15:55:06 2004 +0000
     8.2 +++ b/xen/arch/i386/entry.S	Sat Jan 31 19:45:13 2004 +0000
     8.3 @@ -713,6 +713,7 @@ ENTRY(hypervisor_call_table)
     8.4          .long SYMBOL_NAME(do_net_io_op)
     8.5          .long SYMBOL_NAME(do_fpu_taskswitch)
     8.6          .long SYMBOL_NAME(do_sched_op)
     8.7 +        .long SYMBOL_NAME(do_set_timer_op)
     8.8          .long SYMBOL_NAME(do_dom0_op)
     8.9          .long SYMBOL_NAME(do_network_op)
    8.10          .long SYMBOL_NAME(do_block_io_op)
     9.1 --- a/xen/arch/i386/time.c	Fri Jan 30 15:55:06 2004 +0000
     9.2 +++ b/xen/arch/i386/time.c	Sat Jan 31 19:45:13 2004 +0000
     9.3 @@ -37,12 +37,6 @@
     9.4  #include <asm/fixmap.h>
     9.5  #include <asm/mc146818rtc.h>
     9.6  
     9.7 -#ifdef TIME_TRACE
     9.8 -#define TRC(_x) _x
     9.9 -#else
    9.10 -#define TRC(_x)
    9.11 -#endif
    9.12 -
    9.13  extern rwlock_t xtime_lock;
    9.14  extern unsigned long wall_jiffies;
    9.15  
    10.1 --- a/xen/common/ac_timer.c	Fri Jan 30 15:55:06 2004 +0000
    10.2 +++ b/xen/common/ac_timer.c	Sat Jan 31 19:45:13 2004 +0000
    10.3 @@ -27,12 +27,6 @@
    10.4  #include <asm/system.h>
    10.5  #include <asm/desc.h>
    10.6  
    10.7 -#ifdef AC_TIMER_TRACE
    10.8 -#define TRC(_x) _x
    10.9 -#else
   10.10 -#define TRC(_x)
   10.11 -#endif
   10.12 -
   10.13  /*
   10.14   * We pull handlers off the timer list this far in future,
   10.15   * rather than reprogramming the time hardware.
    11.1 --- a/xen/common/keyhandler.c	Fri Jan 30 15:55:06 2004 +0000
    11.2 +++ b/xen/common/keyhandler.c	Sat Jan 31 19:45:13 2004 +0000
    11.3 @@ -1,5 +1,7 @@
    11.4 +
    11.5  #include <xeno/keyhandler.h> 
    11.6  #include <xeno/reboot.h>
    11.7 +#include <xeno/event.h>
    11.8  
    11.9  #define KEY_MAX 256
   11.10  #define STR_MAX  64
   11.11 @@ -80,40 +82,48 @@ static void kill_dom0(u_char key, void *
   11.12  /* XXX SMH: this is keir's fault */
   11.13  static char *task_states[] = 
   11.14  { 
   11.15 -    "Runnable", 
   11.16 -    "Interruptible Sleep", 
   11.17 -    "Uninterruptible Sleep", 
   11.18 -    NULL, "Stopped", 
   11.19 -    NULL, NULL, NULL, "Dying", 
   11.20 +    "Runnable  ", 
   11.21 +    "Int Sleep ", 
   11.22 +    "UInt Sleep", 
   11.23 +    NULL,
   11.24 +    "Stopped   ", 
   11.25 +    NULL,
   11.26 +    NULL,
   11.27 +    NULL,
   11.28 +    "Dying     ", 
   11.29  }; 
   11.30  
   11.31  void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs) 
   11.32  {
   11.33 -    unsigned long       flags; 
   11.34 +    unsigned long       flags, cpu_mask = 0; 
   11.35      struct task_struct *p; 
   11.36      shared_info_t      *s; 
   11.37 +    s_time_t            now = NOW();
   11.38  
   11.39 -    printk("'%c' pressed -> dumping task queues\n", key); 
   11.40 +    printk("'%c' pressed -> dumping task queues (now=0x%X:%08X)\n", key,
   11.41 +           (u32)(now>>32), (u32)now); 
   11.42  
   11.43      read_lock_irqsave(&tasklist_lock, flags); 
   11.44  
   11.45      p = &idle0_task;
   11.46      do {
   11.47          printk("Xen: DOM %d, CPU %d [has=%c], state = %s, "
   11.48 -	       "hyp_events = %08x\n", 
   11.49 -	       p->domain, p->processor, p->has_cpu ? 'T':'F', 
   11.50 -	       task_states[p->state], p->hyp_events); 
   11.51 -	s = p->shared_info; 
   11.52 -	if( !is_idle_task(p) )
   11.53 +               "hyp_events = %08x\n", 
   11.54 +               p->domain, p->processor, p->has_cpu ? 'T':'F', 
   11.55 +               task_states[p->state], p->hyp_events); 
   11.56 +        s = p->shared_info; 
   11.57 +        if( !is_idle_task(p) )
   11.58          {
   11.59 -	    printk("Guest: events = %08lx, events_mask = %08lx\n", 
   11.60 -		   s->events, s->events_mask); 
   11.61 -	    printk("Notifying guest...\n"); 
   11.62 -	    set_bit(_EVENT_DEBUG, &s->events); 
   11.63 -	}
   11.64 +            printk("Guest: events = %08lx, events_mask = %08lx\n", 
   11.65 +                   s->events, s->events_mask); 
   11.66 +            printk("Notifying guest...\n"); 
   11.67 +            cpu_mask |= mark_guest_event(p, _EVENT_DEBUG);
   11.68 +        }
   11.69      } while ( (p = p->next_task) != &idle0_task );
   11.70  
   11.71      read_unlock_irqrestore(&tasklist_lock, flags); 
   11.72 +
   11.73 +    guest_event_notify(cpu_mask);
   11.74  }
   11.75  
   11.76  extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs);
    12.1 --- a/xen/common/schedule.c	Fri Jan 30 15:55:06 2004 +0000
    12.2 +++ b/xen/common/schedule.c	Sat Jan 31 19:45:13 2004 +0000
    12.3 @@ -5,7 +5,7 @@
    12.4   ****************************************************************************
    12.5   *
    12.6   *        File: common/schedule.c
    12.7 - *      Author: Rolf Neugebar & Keir Fraser
    12.8 + *      Author: Rolf Neugebauer & Keir Fraser
    12.9   * 
   12.10   * Description: CPU scheduling
   12.11   *              implements A Borrowed Virtual Time scheduler.
   12.12 @@ -24,16 +24,13 @@
   12.13  #include <xeno/timer.h>
   12.14  #include <xeno/perfc.h>
   12.15  
   12.16 -#undef SCHEDULER_TRACE
   12.17 -#ifdef SCHEDULER_TRACE
   12.18 -#define TRC(_x) _x
   12.19 -#else
   12.20 -#define TRC(_x)
   12.21 -#endif
   12.22 +/*#define WAKEUP_HISTO*/
   12.23 +/*#define BLOCKTIME_HISTO*/
   12.24  
   12.25 -/*#define SCHED_HISTO*/
   12.26 -#ifdef SCHED_HISTO
   12.27 +#if defined(WAKEUP_HISTO)
   12.28  #define BUCKETS 31
   12.29 +#elif defined(BLOCKTIME_HISTO)
   12.30 +#define BUCKETS 200
   12.31  #endif
   12.32  
   12.33  #define MCU            (s32)MICROSECS(100)    /* Minimum unit */
   12.34 @@ -48,7 +45,7 @@ typedef struct schedule_data_st
   12.35      struct task_struct *idle;           /* idle task for this cpu */
   12.36      u32                 svt;            /* system virtual time. per CPU??? */
   12.37      struct ac_timer     s_timer;        /* scheduling timer  */
   12.38 -#ifdef SCHED_HISTO
   12.39 +#ifdef BUCKETS
   12.40      u32                 hist[BUCKETS];  /* for scheduler latency histogram */
   12.41  #endif
   12.42  } __cacheline_aligned schedule_data_t;
   12.43 @@ -56,19 +53,25 @@ static schedule_data_t schedule_data[NR_
   12.44  
   12.45  spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
   12.46  
   12.47 -/* Skanky periodic event to all guests. This must die in the next release! */
   12.48 -static struct ac_timer v_timer; 
   12.49 +/* Per-CPU periodic timer sends an event to the currently-executing domain. */
   12.50 +static struct ac_timer t_timer[NR_CPUS]; 
   12.51  
   12.52  /*
   12.53 - * Per-CPU timer to ensure that even guests with very long quantums get
   12.54 + * Per-CPU timer which ensures that even guests with very long quantums get
   12.55   * their time-of-day state updated often enough to avoid wrapping.
   12.56   */
   12.57  static struct ac_timer fallback_timer[NR_CPUS];
   12.58  
   12.59 -static void virt_timer(unsigned long foo);
   12.60 -static void dump_rqueue(struct list_head *queue, char *name);
   12.61 +/* Various timer handlers. */
   12.62 +static void s_timer_fn(unsigned long unused);
   12.63 +static void t_timer_fn(unsigned long unused);
   12.64 +static void dom_timer_fn(unsigned long data);
   12.65 +static void fallback_timer_fn(unsigned long unused);
   12.66  
   12.67 -
   12.68 +/*
   12.69 + * Wrappers for run-queue management. Must be called with the schedule_lock
   12.70 + * held.
   12.71 + */
   12.72  static inline void __add_to_runqueue_head(struct task_struct * p)
   12.73  {    
   12.74      list_add(&p->run_list, &schedule_data[p->processor].runqueue);
   12.75 @@ -93,6 +96,10 @@ static inline int __task_on_runqueue(str
    12.76  #define next_domain(p) \
   12.77          list_entry((p)->run_list.next, struct task_struct, run_list)
   12.78  
   12.79 +/*
   12.80 + * Calculate the effective virtual time for a domain. Take into account 
   12.81 + * warping limits
   12.82 + */
   12.83  static void __calc_evt(struct task_struct *p)
   12.84  {
   12.85      s_time_t now = NOW();
   12.86 @@ -134,14 +141,21 @@ void sched_add_domain(struct task_struct
   12.87      } 
   12.88      else 
   12.89      {
   12.90 -        /* set avt end evt to system virtual time */
    12.91 +        /* Set avt and evt to system virtual time. */
   12.92          p->avt         = schedule_data[p->processor].svt;
   12.93          p->evt         = schedule_data[p->processor].svt;
   12.94 -        /* set some default values here */
   12.95 +        /* Set some default values here. */
   12.96          p->warpback    = 0;
   12.97          p->warp        = 0;
   12.98          p->warpl       = 0;
   12.99          p->warpu       = 0;
  12.100 +
  12.101 +        /* Initialise the per-domain timer. */
  12.102 +        init_ac_timer(&p->timer);
  12.103 +        p->timer.cpu      =  p->processor;
  12.104 +        p->timer.data     = (unsigned long)p;
  12.105 +        p->timer.function = &dom_timer_fn;
  12.106 +
  12.107      }
  12.108  }
  12.109  
  12.110 @@ -187,7 +201,7 @@ void __wake_up(struct task_struct *p)
  12.111      p->warped    = NOW();
  12.112      __calc_evt(p);
  12.113  
  12.114 -#ifdef SCHED_HISTO
  12.115 +#ifdef WAKEUP_HISTO
  12.116      p->wokenup = NOW();
  12.117  #endif
  12.118  }
  12.119 @@ -200,16 +214,31 @@ void wake_up(struct task_struct *p)
  12.120      spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
  12.121  }
  12.122  
  12.123 -/* Voluntarily yield the processor to another domain, until an event occurs. */
  12.124 -long do_yield(void)
  12.125 +/* 
  12.126 + * Block the currently-executing domain until a pertinent event occurs.
  12.127 + */
  12.128 +static long do_block(void)
  12.129  {
  12.130 +    set_bit(EVENTS_MASTER_ENABLE_BIT, &current->shared_info->events_mask);
  12.131      current->state = TASK_INTERRUPTIBLE;
  12.132 -    current->warpback = 0; /* XXX should only do this when blocking */
  12.133 +    current->warpback = 0; 
  12.134      __enter_scheduler();
  12.135      return 0;
  12.136  }
  12.137  
  12.138 -/* Demultiplex scheduler-related hypercalls. */
  12.139 +/*
  12.140 + * Voluntarily yield the processor for this allocation.
  12.141 + */
  12.142 +static long do_yield(void)
  12.143 +{
  12.144 +    __enter_scheduler();
  12.145 +    return 0;
  12.146 +}
  12.147 +
  12.148 +
  12.149 +/*
  12.150 + * Demultiplex scheduler-related hypercalls.
  12.151 + */
  12.152  long do_sched_op(unsigned long op)
  12.153  {
  12.154      long ret = 0;
  12.155 @@ -223,14 +252,24 @@ long do_sched_op(unsigned long op)
  12.156          break;
  12.157      }
  12.158  
  12.159 +    case SCHEDOP_block:
  12.160 +    {
  12.161 +        ret = do_block();
  12.162 +        break;
  12.163 +    }
  12.164 +
  12.165      case SCHEDOP_exit:
  12.166      {
  12.167 +        DPRINTK("DOM%d killed itself!\n", current->domain);
  12.168 +        DPRINTK(" EIP == %08lx\n", get_execution_context()->eip);
  12.169          kill_domain();
  12.170          break;
  12.171      }
  12.172  
  12.173      case SCHEDOP_stop:
  12.174      {
  12.175 +        DPRINTK("DOM%d stopped itself!\n", current->domain);
  12.176 +        DPRINTK(" EIP == %08lx\n", get_execution_context()->eip);
  12.177          stop_domain();
  12.178          break;
  12.179      }
  12.180 @@ -242,6 +281,23 @@ long do_sched_op(unsigned long op)
  12.181      return ret;
  12.182  }
  12.183  
  12.184 +/* Per-domain one-shot-timer hypercall. */
  12.185 +long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
  12.186 +{
  12.187 +    struct task_struct *p = current;
  12.188 +
  12.189 +    rem_ac_timer(&p->timer);
  12.190 +    
  12.191 +    if ( (timeout_hi != 0) || (timeout_lo != 0) )
  12.192 +    {
  12.193 +        p->timer.expires = ((s_time_t)timeout_hi<<32) | ((s_time_t)timeout_lo);
  12.194 +        add_ac_timer(&p->timer);
  12.195 +    }
  12.196 +
  12.197 +    return 0;
  12.198 +}
  12.199 +
  12.200 +
  12.201  /* Control the scheduler. */
  12.202  long sched_bvtctl(unsigned long c_allow)
  12.203  {
  12.204 @@ -330,7 +386,7 @@ asmlinkage void __enter_scheduler(void)
  12.205  {
  12.206      struct task_struct *prev = current, *next = NULL, *next_prime, *p;
  12.207      struct list_head   *tmp;
  12.208 -    int                 this_cpu = prev->processor;
  12.209 +    int                 cpu = prev->processor;
  12.210      s_time_t            now;
  12.211      s32                 r_time;     /* time for new dom to run */
  12.212      s32                 ranfor;     /* assume we never run longer than 2.1s! */
  12.213 @@ -339,11 +395,11 @@ asmlinkage void __enter_scheduler(void)
  12.214  
  12.215      perfc_incrc(sched_run);
  12.216  
  12.217 -    spin_lock_irq(&schedule_lock[this_cpu]);
  12.218 +    spin_lock_irq(&schedule_lock[cpu]);
  12.219  
  12.220      now = NOW();
  12.221  
  12.222 -    rem_ac_timer(&schedule_data[this_cpu].s_timer);
  12.223 +    rem_ac_timer(&schedule_data[cpu].s_timer);
  12.224  
  12.225      ASSERT(!in_interrupt());
  12.226      ASSERT(__task_on_runqueue(prev));
  12.227 @@ -374,21 +430,21 @@ asmlinkage void __enter_scheduler(void)
  12.228      clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
  12.229  
  12.230      /* We should at least have the idle task */
  12.231 -    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
  12.232 +    ASSERT(!list_empty(&schedule_data[cpu].runqueue));
  12.233  
  12.234      /*
  12.235       * scan through the run queue and pick the task with the lowest evt
  12.236       * *and* the task the second lowest evt.
  12.237       * this code is O(n) but we expect n to be small.
  12.238       */
  12.239 -    next       = schedule_data[this_cpu].idle;
  12.240 +    next       = schedule_data[cpu].idle;
  12.241      next_prime = NULL;
  12.242  
  12.243      next_evt       = ~0U;
  12.244      next_prime_evt = ~0U;
  12.245      min_avt        = ~0U;
  12.246  
  12.247 -    list_for_each ( tmp, &schedule_data[this_cpu].runqueue )
  12.248 +    list_for_each ( tmp, &schedule_data[cpu].runqueue )
  12.249      {
  12.250          p = list_entry(tmp, struct task_struct, run_list);
  12.251          if ( p->evt < next_evt )
  12.252 @@ -416,16 +472,16 @@ asmlinkage void __enter_scheduler(void)
  12.253  
  12.254      /* Update system virtual time. */
  12.255      if ( min_avt != ~0U )
  12.256 -        schedule_data[this_cpu].svt = min_avt;
  12.257 +        schedule_data[cpu].svt = min_avt;
  12.258  
  12.259      /* check for virtual time overrun on this cpu */
  12.260 -    if ( schedule_data[this_cpu].svt >= 0xf0000000 )
  12.261 +    if ( schedule_data[cpu].svt >= 0xf0000000 )
  12.262      {
  12.263          u_long t_flags; 
  12.264          write_lock_irqsave(&tasklist_lock, t_flags); 
  12.265          p = &idle0_task;
  12.266          do {
  12.267 -            if ( (p->processor == this_cpu) && !is_idle_task(p) )
  12.268 +            if ( (p->processor == cpu) && !is_idle_task(p) )
  12.269              {
  12.270                  p->evt -= 0xe0000000;
  12.271                  p->avt -= 0xe0000000;
  12.272 @@ -433,7 +489,7 @@ asmlinkage void __enter_scheduler(void)
  12.273          } 
  12.274          while ( (p = p->next_task) != &idle0_task );
  12.275          write_unlock_irqrestore(&tasklist_lock, t_flags); 
  12.276 -        schedule_data[this_cpu].svt -= 0xe0000000;
  12.277 +        schedule_data[cpu].svt -= 0xe0000000;
  12.278      }
  12.279  
  12.280      /* work out time for next run through scheduler */
  12.281 @@ -461,46 +517,43 @@ asmlinkage void __enter_scheduler(void)
  12.282   sched_done:
  12.283      ASSERT(r_time >= ctx_allow);
  12.284  
  12.285 -#ifndef NDEBUG
  12.286 -    if ( r_time < ctx_allow )
  12.287 -    {
  12.288 -        printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time);
  12.289 -        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
  12.290 -    }
  12.291 -#endif
  12.292 -
  12.293      prev->has_cpu = 0;
  12.294      next->has_cpu = 1;
  12.295  
  12.296 -    schedule_data[this_cpu].curr = next;
  12.297 +    schedule_data[cpu].curr = next;
  12.298  
  12.299      next->lastschd = now;
  12.300  
  12.301      /* reprogramm the timer */
  12.302 -    schedule_data[this_cpu].s_timer.expires  = now + r_time;
  12.303 -    add_ac_timer(&schedule_data[this_cpu].s_timer);
  12.304 +    schedule_data[cpu].s_timer.expires  = now + r_time;
  12.305 +    add_ac_timer(&schedule_data[cpu].s_timer);
  12.306  
  12.307 -    spin_unlock_irq(&schedule_lock[this_cpu]);
  12.308 +    spin_unlock_irq(&schedule_lock[cpu]);
  12.309  
  12.310 -    /* done, switch tasks */
  12.311 +    /* Ensure that the domain has an up-to-date time base. */
  12.312 +    if ( !is_idle_task(next) )
  12.313 +        update_dom_time(next->shared_info);
  12.314 +
  12.315      if ( unlikely(prev == next) )
  12.316 -    {
  12.317 -        /* We won't go through the normal tail, so do this by hand */
  12.318 -        update_dom_time(prev->shared_info);
  12.319          return;
  12.320 -    }
  12.321  
  12.322      perfc_incrc(sched_ctx);
  12.323 -#ifdef SCHED_HISTO
  12.324 +
  12.325 +#if defined(WAKEUP_HISTO)
  12.326 +    if ( !is_idle_task(next) && next->wokenup ) {
  12.327 +        ulong diff = (ulong)(now - next->wokenup);
  12.328 +        diff /= (ulong)MILLISECS(1);
  12.329 +        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
  12.330 +        else                    schedule_data[cpu].hist[BUCKETS-1]++;
  12.331 +    }
  12.332 +    next->wokenup = (s_time_t)0;
  12.333 +#elif defined(BLOCKTIME_HISTO)
  12.334 +    prev->lastdeschd = now;
  12.335 +    if ( !is_idle_task(next) )
  12.336      {
  12.337 -        ulong diff; /* should fit in 32bits */
  12.338 -        if (!is_idle_task(next) && next->wokenup) {
  12.339 -            diff = (ulong)(now - next->wokenup);
  12.340 -            diff /= (ulong)MILLISECS(1);
  12.341 -            if (diff <= BUCKETS-2)  schedule_data[this_cpu].hist[diff]++;
  12.342 -            else                    schedule_data[this_cpu].hist[BUCKETS-1]++;
  12.343 -        }
  12.344 -        next->wokenup = (s_time_t)0;
  12.345 +        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
  12.346 +        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
  12.347 +        else                    schedule_data[cpu].hist[BUCKETS-1]++;
  12.348      }
  12.349  #endif
  12.350  
  12.351 @@ -509,8 +562,10 @@ asmlinkage void __enter_scheduler(void)
  12.352      if ( unlikely(prev->state == TASK_DYING) ) 
  12.353          put_task_struct(prev);
  12.354  
  12.355 -    update_dom_time(next->shared_info);
  12.356 -
  12.357 +    /* Mark a timer event for the newly-scheduled domain. */
  12.358 +    if ( !is_idle_task(next) )
  12.359 +        set_bit(_EVENT_TIMER, &next->shared_info->events);
  12.360 +    
  12.361      schedule_tail(next);
  12.362  
  12.363      BUG();
  12.364 @@ -524,55 +579,57 @@ int idle_cpu(int cpu)
  12.365  }
  12.366  
  12.367  
  12.368 -/* The scheduler timer. */
  12.369 -static void sched_timer(unsigned long unused)
  12.370 +/****************************************************************************
  12.371 + * Timers: the scheduler utilises a number of timers
  12.372 + * - s_timer: per CPU timer for preemption and scheduling decisions
  12.373 + * - t_timer: per CPU periodic timer to send timer interrupt to current dom
   12.374 + * - dom_timer: per domain timer to specify timeout values
  12.375 + * - fallback_timer: safeguard to ensure time is up to date
  12.376 + ****************************************************************************/
  12.377 +
   12.378 +/* The scheduler timer: force a run through the scheduler */
  12.379 +static void s_timer_fn(unsigned long unused)
  12.380  {
  12.381 -    int                 cpu  = smp_processor_id();
  12.382 -    struct task_struct *curr = schedule_data[cpu].curr;
  12.383 -    /* cause a reschedule */
  12.384 -    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
  12.385 +    set_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events);
  12.386      perfc_incrc(sched_irq);
  12.387  }
  12.388  
  12.389 -/* The Domain virtual time timer */
  12.390 -static void virt_timer(unsigned long unused)
   12.391 +/* Periodic tick timer: send timer event to current domain */
  12.392 +static void t_timer_fn(unsigned long unused)
  12.393  {
  12.394 -    unsigned long flags, cpu_mask = 0;
  12.395 -    struct task_struct *p;
  12.396 -    s_time_t now;
  12.397 +    struct task_struct *p = current;
  12.398 +
  12.399 +    if ( !is_idle_task(p) ) 
  12.400 +        set_bit(_EVENT_TIMER, &p->shared_info->events);
  12.401  
  12.402 -    /* send virtual timer interrupt */
  12.403 -    read_lock_irqsave(&tasklist_lock, flags);
  12.404 -    p = &idle0_task;
  12.405 -    do {
  12.406 -        if ( is_idle_task(p) ) continue;
  12.407 -        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
  12.408 -    }
  12.409 -    while ( (p = p->next_task) != &idle0_task );
  12.410 -    read_unlock_irqrestore(&tasklist_lock, flags);
  12.411 +    t_timer[p->processor].expires = NOW() + MILLISECS(10);
  12.412 +    add_ac_timer(&t_timer[p->processor]);
  12.413 +}
  12.414 +
  12.415 +/* Domain timer function, sends a virtual timer interrupt to domain */
  12.416 +static void dom_timer_fn(unsigned long data)
  12.417 +{
  12.418 +    unsigned long cpu_mask = 0;
  12.419 +    struct task_struct *p = (struct task_struct *)data;
  12.420 +
  12.421 +    cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
  12.422      guest_event_notify(cpu_mask);
  12.423 +}
  12.424  
  12.425 -    now = NOW();
  12.426 -    v_timer.expires = now + MILLISECS(20);
  12.427 -    add_ac_timer(&v_timer);
  12.428 -}
  12.429  
  12.430  /* Fallback timer to ensure guests get time updated 'often enough'. */
  12.431  static void fallback_timer_fn(unsigned long unused)
  12.432  {
  12.433      struct task_struct *p = current;
  12.434 -    unsigned int cpu = p->processor;
  12.435  
  12.436      if ( !is_idle_task(p) )
  12.437          update_dom_time(p->shared_info);
  12.438  
  12.439 -    fallback_timer[cpu].expires = NOW() + MILLISECS(500);
  12.440 -    add_ac_timer(&fallback_timer[cpu]);
  12.441 +    fallback_timer[p->processor].expires = NOW() + MILLISECS(500);
  12.442 +    add_ac_timer(&fallback_timer[p->processor]);
  12.443  }
  12.444  
  12.445 -/*
  12.446 - * Initialise the data structures
  12.447 - */
  12.448 +/* Initialise the data structures. */
  12.449  void __init scheduler_init(void)
  12.450  {
  12.451      int i;
  12.452 @@ -588,20 +645,20 @@ void __init scheduler_init(void)
  12.453          init_ac_timer(&schedule_data[i].s_timer);
  12.454          schedule_data[i].s_timer.cpu      = i;
  12.455          schedule_data[i].s_timer.data     = 2;
  12.456 -        schedule_data[i].s_timer.function = &sched_timer;
  12.457 +        schedule_data[i].s_timer.function = &s_timer_fn;
  12.458 +
  12.459 +        init_ac_timer(&t_timer[i]);
  12.460 +        t_timer[i].cpu      = i;
  12.461 +        t_timer[i].data     = 3;
  12.462 +        t_timer[i].function = &t_timer_fn;
  12.463  
  12.464          init_ac_timer(&fallback_timer[i]);
  12.465          fallback_timer[i].cpu      = i;
  12.466 -        fallback_timer[i].data     = 0;
  12.467 +        fallback_timer[i].data     = 4;
  12.468          fallback_timer[i].function = &fallback_timer_fn;
  12.469      }
  12.470  
  12.471      schedule_data[0].idle = &idle0_task;
  12.472 -
  12.473 -    init_ac_timer(&v_timer);
  12.474 -    v_timer.cpu      = 0;
  12.475 -    v_timer.data     = 0;
  12.476 -    v_timer.function = &virt_timer;
  12.477  }
  12.478  
  12.479  /*
  12.480 @@ -612,10 +669,11 @@ void schedulers_start(void)
  12.481  {   
  12.482      printk("Start schedulers\n");
  12.483  
  12.484 -    virt_timer(0);
  12.485 +    s_timer_fn(0);
  12.486 +    smp_call_function((void *)s_timer_fn, NULL, 1, 1);
  12.487  
  12.488 -    sched_timer(0);
  12.489 -    smp_call_function((void *)sched_timer, NULL, 1, 1);
  12.490 +    t_timer_fn(0);
  12.491 +    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
  12.492  
  12.493      fallback_timer_fn(0);
  12.494      smp_call_function((void *)fallback_timer_fn, NULL, 1, 1);
  12.495 @@ -668,7 +726,7 @@ void dump_runq(u_char key, void *dev_id,
  12.496      return; 
  12.497  }
  12.498  
  12.499 -#ifdef SCHED_HISTO
  12.500 +#if defined(WAKEUP_HISTO) || defined(BLOCKTIME_HISTO)
  12.501  void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
  12.502  {
  12.503      int loop, i, j;
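do_set_timer_op() above takes the 64-bit expiry split into two machine words, with a zero timeout disarming the timer. A guest-side caller would therefore package an s_time_t roughly as in the sketch below; hypercall2() is a hypothetical stand-in for a TRAP_INSTR stub in the style of the hypervisor.h wrappers:

    #include <stdint.h>

    typedef int64_t s_time_t;                    /* ns since boot */

    #define __HYPERVISOR_set_dom_timer 9         /* see hypervisor-if.h below */

    /* Hypothetical two-argument hypercall stub. */
    extern long hypercall2(unsigned int op, unsigned long arg1,
                           unsigned long arg2);

    /* Arm the per-domain one-shot timer for 'expiry', or disarm it with 0. */
    static long set_dom_timer(s_time_t expiry)
    {
        unsigned long hi = (unsigned long)((uint64_t)expiry >> 32);
        unsigned long lo = (unsigned long)((uint64_t)expiry & 0xFFFFFFFFUL);
        return hypercall2(__HYPERVISOR_set_dom_timer, hi, lo);
    }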
    13.1 --- a/xen/drivers/block/xen_vbd.c	Fri Jan 30 15:55:06 2004 +0000
    13.2 +++ b/xen/drivers/block/xen_vbd.c	Sat Jan 31 19:45:13 2004 +0000
    13.3 @@ -89,7 +89,7 @@ long vbd_create(vbd_create_t *create)
    13.4      if ( unlikely((p = find_domain_by_id(create->domain)) == NULL) )
    13.5      {
    13.6          DPRINTK("vbd_create attempted for non-existent domain %d\n", 
    13.7 -                domain); 
    13.8 +                create->domain); 
    13.9          return -EINVAL; 
   13.10      }
   13.11  
    14.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h	Fri Jan 30 15:55:06 2004 +0000
    14.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h	Sat Jan 31 19:45:13 2004 +0000
    14.3 @@ -49,18 +49,19 @@
    14.4  #define __HYPERVISOR_net_io_op             6
    14.5  #define __HYPERVISOR_fpu_taskswitch        7
    14.6  #define __HYPERVISOR_sched_op              8
    14.7 -#define __HYPERVISOR_dom0_op               9
    14.8 -#define __HYPERVISOR_network_op           10
    14.9 -#define __HYPERVISOR_block_io_op          11
   14.10 -#define __HYPERVISOR_set_debugreg         12
   14.11 -#define __HYPERVISOR_get_debugreg         13
   14.12 -#define __HYPERVISOR_update_descriptor    14
   14.13 -#define __HYPERVISOR_set_fast_trap        15
   14.14 -#define __HYPERVISOR_dom_mem_op           16
   14.15 -#define __HYPERVISOR_multicall            17
   14.16 -#define __HYPERVISOR_kbd_op               18
   14.17 -#define __HYPERVISOR_update_va_mapping    19
   14.18 -#define __HYPERVISOR_event_channel_op     20
   14.19 +#define __HYPERVISOR_set_dom_timer         9
   14.20 +#define __HYPERVISOR_dom0_op              10
   14.21 +#define __HYPERVISOR_network_op           11
   14.22 +#define __HYPERVISOR_block_io_op          12
   14.23 +#define __HYPERVISOR_set_debugreg         13
   14.24 +#define __HYPERVISOR_get_debugreg         14
   14.25 +#define __HYPERVISOR_update_descriptor    15
   14.26 +#define __HYPERVISOR_set_fast_trap        16
   14.27 +#define __HYPERVISOR_dom_mem_op           17
   14.28 +#define __HYPERVISOR_multicall            18
   14.29 +#define __HYPERVISOR_kbd_op               19
   14.30 +#define __HYPERVISOR_update_va_mapping    20
   14.31 +#define __HYPERVISOR_event_channel_op     21
   14.32  
   14.33  /* And the trap vector is... */
   14.34  #define TRAP_INSTR "int $0x82"
   14.35 @@ -161,9 +162,10 @@
   14.36  /*
   14.37   * SCHEDOP_* - Scheduler hypercall operations.
   14.38   */
   14.39 -#define SCHEDOP_yield           0
   14.40 -#define SCHEDOP_exit            1
   14.41 -#define SCHEDOP_stop            2
   14.42 +#define SCHEDOP_yield           0   /* Give up the CPU voluntarily.      */
   14.43 +#define SCHEDOP_block           1   /* Block until an event is received. */
   14.44 +#define SCHEDOP_exit            3   /* Exit and kill this domain.        */
   14.45 +#define SCHEDOP_stop            4   /* Stop executing this domain.       */
   14.46  
   14.47  /*
   14.48   * EVTCHNOP_* - Event channel operations.
    15.1 --- a/xen/include/xeno/sched.h	Fri Jan 30 15:55:06 2004 +0000
    15.2 +++ b/xen/include/xeno/sched.h	Sat Jan 31 19:45:13 2004 +0000
    15.3 @@ -100,25 +100,27 @@ struct task_struct
    15.4      unsigned int     tot_pages; /* number of pages currently possesed */
    15.5      unsigned int     max_pages; /* max number of pages that can be possesed */
    15.6  
    15.7 -    /* scheduling */
    15.8 +    /* Scheduling. */
    15.9      struct list_head run_list;
   15.10      int              has_cpu;
   15.11 -    int state;                  /* current run state */
   15.12 -    int cpupinned;              /* true if pinned to curent CPU */
   15.13 +    int              state;         /* current run state */
    15.14 +    int              cpupinned;     /* true if pinned to current CPU */
   15.15 +    s_time_t         lastschd;      /* time this domain was last scheduled */
   15.16 +    s_time_t         lastdeschd;    /* time this domain was last descheduled */
   15.17 +    s_time_t         cpu_time;      /* total CPU time received till now */
   15.18 +    s_time_t         wokenup;       /* time domain got woken up */
   15.19 +    struct ac_timer  timer;         /* one-shot timer for timeout values */
   15.20  
   15.21 -    s_time_t lastschd;              /* time this domain was last scheduled */
   15.22 -    s_time_t cpu_time;              /* total CPU time received till now */
   15.23 -    s_time_t wokenup;               /* time domain got woken up */
   15.24 -
   15.25 +    /* BVT scheduler specific. */
   15.26      unsigned long mcu_advance;      /* inverse of weight */
   15.27 -    u32  avt;                       /* actual virtual time */
   15.28 -    u32  evt;                       /* effective virtual time */
   15.29 -    int  warpback;                  /* warp?  */
   15.30 -    long warp;                      /* virtual time warp */
   15.31 -    long warpl;                     /* warp limit */
   15.32 -    long warpu;                     /* unwarp time requirement */
   15.33 -    s_time_t warped;                /* time it ran warped last time */
   15.34 -    s_time_t uwarped;               /* time it ran unwarped last time */
   15.35 +    u32           avt;              /* actual virtual time */
   15.36 +    u32           evt;              /* effective virtual time */
   15.37 +    int           warpback;         /* warp?  */
   15.38 +    long          warp;             /* virtual time warp */
   15.39 +    long          warpl;            /* warp limit */
   15.40 +    long          warpu;            /* unwarp time requirement */
   15.41 +    s_time_t      warped;           /* time it ran warped last time */
   15.42 +    s_time_t      uwarped;          /* time it ran unwarped last time */
   15.43  
   15.44      /* Network I/O */
   15.45      net_vif_t *net_vif_list[MAX_DOMAIN_VIFS];
   15.46 @@ -250,7 +252,6 @@ long sched_adjdom(int dom, unsigned long
   15.47  void init_idle_task(void);
   15.48  void __wake_up(struct task_struct *p);
   15.49  void wake_up(struct task_struct *p);
   15.50 -long do_yield(void);
   15.51  unsigned long __reschedule(struct task_struct *p);
   15.52  void reschedule(struct task_struct *p);
   15.53  
   15.54 @@ -271,8 +272,9 @@ static inline long schedule_timeout(long
   15.55      return 0;
   15.56  }
   15.57  
   15.58 -#define signal_pending(_p) ((_p)->hyp_events || \
   15.59 -                            (_p)->shared_info->events)
   15.60 +#define signal_pending(_p) \
   15.61 +    ((_p)->hyp_events ||   \
   15.62 +     ((_p)->shared_info->events & (_p)->shared_info->events_mask))
   15.63  
   15.64  void domain_init(void);
   15.65  
    16.1 --- a/xen/net/dev.c	Fri Jan 30 15:55:06 2004 +0000
    16.2 +++ b/xen/net/dev.c	Sat Jan 31 19:45:13 2004 +0000
    16.3 @@ -1972,6 +1972,16 @@ static int get_tx_bufs(net_vif_t *vif)
    16.4          }
    16.5          else if ( (target == VIF_PHYS) || IS_PRIV(p) )
    16.6          {
    16.7 +            /*
    16.8 +             * XXX HACK XXX: Our wildcard rule for domain-0 incorrectly puts 
    16.9 +             * some 169.254.* (ie. link-local) packets on the wire unless we 
   16.10 +             * include this explicit test. :-(
   16.11 +             */
   16.12 +            if ( (ntohs(*(unsigned short *)(g_data + 12)) == ETH_P_IP) &&
   16.13 +                 ((ntohl(*(unsigned long *)(g_data + 26)) & 0xFFFF0000) == 
   16.14 +                  0xA9FE0000) )
   16.15 +                goto disallow_linklocal_packets;
   16.16 +
   16.17              stx = &vif->tx_shadow_ring[MASK_NET_TX_IDX(j)];
   16.18              stx->id     = tx.id;
   16.19              stx->size   = tx.size;
   16.20 @@ -1990,6 +2000,7 @@ static int get_tx_bufs(net_vif_t *vif)
   16.21          }
   16.22          else
   16.23          {
   16.24 +        disallow_linklocal_packets:
   16.25              make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
   16.26          }
   16.27  
    17.1 --- a/xenolinux-2.4.24-sparse/arch/xeno/config.in	Fri Jan 30 15:55:06 2004 +0000
    17.2 +++ b/xenolinux-2.4.24-sparse/arch/xeno/config.in	Sat Jan 31 19:45:13 2004 +0000
    17.3 @@ -13,9 +13,11 @@ define_bool CONFIG_SBUS n
    17.4  define_bool CONFIG_UID16 y
    17.5  
    17.6  mainmenu_option next_comment
    17.7 -comment 'Privileged guest OS'
    17.8 +comment 'XenoLinux'
    17.9  bool 'Support for privileged operations (domain 0)' CONFIG_XENO_PRIV
   17.10  endmenu
   17.11 +# the IBM S/390 patch needs this.
   17.12 +define_bool CONFIG_NO_IDLE_HZ y
   17.13  
   17.14  mainmenu_option next_comment
   17.15  comment 'Code maturity level options'
    18.1 --- a/xenolinux-2.4.24-sparse/arch/xeno/defconfig	Fri Jan 30 15:55:06 2004 +0000
    18.2 +++ b/xenolinux-2.4.24-sparse/arch/xeno/defconfig	Sat Jan 31 19:45:13 2004 +0000
    18.3 @@ -8,9 +8,12 @@ CONFIG_ISA=y
    18.4  CONFIG_UID16=y
    18.5  
    18.6  #
    18.7 -# Privileged guest OS
    18.8 +# XenoLinux Options
    18.9  #
    18.10 +# support for privileged domains
   18.11  CONFIG_XENO_PRIV=y
   18.12 +# On demand timer setting (taken from s390 patch set)
   18.13 +CONFIG_NO_IDLE_HZ=y
   18.14  
   18.15  #
   18.16  # Code maturity level options
    19.1 --- a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c	Fri Jan 30 15:55:06 2004 +0000
    19.2 +++ b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c	Sat Jan 31 19:45:13 2004 +0000
    19.3 @@ -81,15 +81,15 @@ static void _dbg_network_int(struct net_
    19.4      if ( np->state == STATE_CLOSED )
    19.5          return;
    19.6      
    19.7 -    printk(KERN_ALERT "tx_full = %d, tx_resp_cons = 0x%08x,"
    19.8 -           " tx_req_prod = 0x%08x, tx_resp_prod = 0x%08x,"
    19.9 -           " tx_event = 0x%08x, state=%d\n",
   19.10 +    printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x,"
   19.11 +           " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x,"
   19.12 +           " tx_event=0x%08x, state=%d\n",
   19.13             np->tx_full, np->tx_resp_cons, 
   19.14             np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, 
   19.15             np->net_idx->tx_event,
   19.16             test_bit(__LINK_STATE_XOFF, &dev->state));
   19.17 -    printk(KERN_ALERT "rx_resp_cons = 0x%08x,"
   19.18 -           " rx_req_prod = 0x%08x, rx_resp_prod = 0x%08x, rx_event = 0x%08x\n",
   19.19 +    printk(KERN_ALERT "net: rx_resp_cons=0x%08x,"
   19.20 +           " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n",
   19.21             np->rx_resp_cons, np->net_idx->rx_req_prod,
   19.22             np->net_idx->rx_resp_prod, np->net_idx->rx_event);
   19.23  }
   19.24 @@ -550,7 +550,8 @@ int __init init_module(void)
   19.25          goto fail;
   19.26      }
   19.27      
   19.28 -    err = request_irq(_EVENT_DEBUG, dbg_network_int, 0, "debug", NULL);
   19.29 +    err = request_irq(_EVENT_DEBUG, dbg_network_int, SA_SHIRQ, "net_dbg", 
   19.30 +                      &dbg_network_int);
   19.31      if ( err )
   19.32          printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
   19.33  
    20.1 --- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c	Fri Jan 30 15:55:06 2004 +0000
    20.2 +++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c	Sat Jan 31 19:45:13 2004 +0000
    20.3 @@ -80,14 +80,36 @@ void enable_hlt(void)
    20.4   */
    20.5  void cpu_idle (void)
    20.6  {
    20.7 -    /* endless idle loop with no priority at all */
    20.8 +    extern int set_timeout_timer(void);
    20.9 +    
   20.10 +    /* Endless idle loop with no priority at all. */
   20.11      init_idle();
   20.12      current->nice = 20;
   20.13      current->counter = -100;
   20.14  
   20.15 -    while (1) {
   20.16 -        while (!current->need_resched)
   20.17 -            HYPERVISOR_yield();
   20.18 +    for ( ; ; )
   20.19 +    {
   20.20 +        while ( !current->need_resched )
   20.21 +        {
   20.22 +            __cli();
   20.23 +            if ( current->need_resched )
   20.24 +            {
   20.25 +                /* The race-free check for events failed. */
   20.26 +                __sti();
   20.27 +                break;
   20.28 +            }
   20.29 +            else if ( set_timeout_timer() == 0 )
   20.30 +            {
    20.31 +                /* NB. Blocking reenables events in a race-free manner. */
   20.32 +                HYPERVISOR_block();
   20.33 +            }
   20.34 +            else
   20.35 +            {
   20.36 +                /* No race here: yielding will get us the CPU again anyway. */
   20.37 +                __sti();
   20.38 +                HYPERVISOR_yield();
   20.39 +            }
   20.40 +        }
   20.41          schedule();
   20.42          check_pgt_cache();
   20.43      }
    21.1 --- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c	Fri Jan 30 15:55:06 2004 +0000
    21.2 +++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c	Sat Jan 31 19:45:13 2004 +0000
    21.3 @@ -75,7 +75,7 @@ static u32 st_scale_i; /* convert ticks 
    21.4  
    21.5  /* These are peridically updated in shared_info, and then copied here. */
    21.6  static u32 shadow_tsc_stamp;
    21.7 -static s64 shadow_system_time;
    21.8 +static u64 shadow_system_time;
    21.9  static u32 shadow_time_version;
   21.10  static struct timeval shadow_tv;
   21.11  
   21.12 @@ -91,9 +91,12 @@ static long last_update_to_rtc, last_upd
   21.13  #endif
   21.14  
   21.15  /* Periodically take synchronised time base from Xen, if we need it. */
   21.16 -static long last_update_from_xen;
   21.17 +static long last_update_from_xen;   /* UTC seconds when last read Xen clock. */
   21.18  
   21.19 -static u64 processed_system_time;
   21.20 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
   21.21 +static u64 processed_system_time;   /* System time (ns) at last processing. */
   21.22 +
   21.23 +#define NS_PER_TICK (1000000000ULL/HZ)
   21.24  
   21.25  #define HANDLE_USEC_UNDERFLOW(_tv)         \
   21.26      do {                                   \
   21.27 @@ -197,8 +200,11 @@ static int set_rtc_mmss(unsigned long no
   21.28  #endif
   21.29  
   21.30  
   21.31 -/* Must be called with the xtime_lock held for writing. */
   21.32 -static void get_time_values_from_xen(void)
   21.33 +/*
   21.34 + * Reads a consistent set of time-base values from Xen, into a shadow data
   21.35 + * area. Must be called with the xtime_lock held for writing.
   21.36 + */
   21.37 +static void __get_time_values_from_xen(void)
   21.38  {
   21.39      do {
   21.40          shadow_time_version = HYPERVISOR_shared_info->time_version2;
   21.41 @@ -216,7 +222,11 @@ static void get_time_values_from_xen(voi
   21.42      (shadow_time_version == HYPERVISOR_shared_info->time_version2)
   21.43  
   21.44  
   21.45 -static inline unsigned long get_time_delta_usecs(void)
   21.46 +/*
    21.47 + * Returns the time elapsed, in usecs, since the shadow time values were
    21.48 + * last read from Xen. Must be called with the xtime_lock held for reading.
   21.49 + */
   21.50 +static inline unsigned long __get_time_delta_usecs(void)
   21.51  {
   21.52      s32      delta_tsc;
   21.53      u32      low;
   21.54 @@ -234,6 +244,9 @@ static inline unsigned long get_time_del
   21.55  }
   21.56  
   21.57  
   21.58 +/*
   21.59 + * Returns the current time-of-day in UTC timeval format.
   21.60 + */
   21.61  void do_gettimeofday(struct timeval *tv)
   21.62  {
   21.63  	unsigned long flags, lost;
   21.64 @@ -242,7 +255,7 @@ void do_gettimeofday(struct timeval *tv)
   21.65   again:
   21.66      read_lock_irqsave(&xtime_lock, flags);
   21.67  
   21.68 -    _tv.tv_usec = get_time_delta_usecs();
   21.69 +    _tv.tv_usec = __get_time_delta_usecs();
   21.70      if ( (lost = (jiffies - wall_jiffies)) != 0 )
   21.71          _tv.tv_usec += lost * (1000000 / HZ);
   21.72      _tv.tv_sec   = xtime.tv_sec;
   21.73 @@ -257,7 +270,7 @@ void do_gettimeofday(struct timeval *tv)
   21.74           */
   21.75          read_unlock_irqrestore(&xtime_lock, flags);
   21.76          write_lock_irqsave(&xtime_lock, flags);
   21.77 -        get_time_values_from_xen();
   21.78 +        __get_time_values_from_xen();
   21.79          write_unlock_irqrestore(&xtime_lock, flags);
   21.80          goto again;
   21.81      }
   21.82 @@ -276,6 +289,10 @@ void do_gettimeofday(struct timeval *tv)
   21.83      *tv = _tv;
   21.84  }
   21.85  
   21.86 +
   21.87 +/*
    21.88 + * Sets the current time-of-day based on the passed-in UTC timeval parameter.
   21.89 + */
   21.90  void do_settimeofday(struct timeval *tv)
   21.91  {
   21.92      struct timeval newtv;
   21.93 @@ -291,10 +308,10 @@ void do_settimeofday(struct timeval *tv)
   21.94       * be stale, so we can retry with fresh ones.
   21.95       */
   21.96   again:
   21.97 -    tv->tv_usec -= get_time_delta_usecs();
   21.98 +    tv->tv_usec -= __get_time_delta_usecs();
   21.99      if ( unlikely(!TIME_VALUES_UP_TO_DATE) )
  21.100      {
  21.101 -        get_time_values_from_xen();
  21.102 +        __get_time_values_from_xen();
  21.103          goto again;
  21.104      }
  21.105      
  21.106 @@ -334,6 +351,7 @@ void do_settimeofday(struct timeval *tv)
  21.107      }
  21.108  }
  21.109  
  21.110 +
  21.111  asmlinkage long sys_stime(int *tptr)
  21.112  {
  21.113  	int value;
  21.114 @@ -353,14 +371,22 @@ asmlinkage long sys_stime(int *tptr)
  21.115  	return 0;
  21.116  }
  21.117  
  21.118 -#define NS_PER_TICK (1000000000ULL/HZ)
  21.119 +
  21.120 +/* Convert jiffies to system time. Call with xtime_lock held for reading. */
  21.121 +static inline u64 __jiffies_to_st(unsigned long j) 
  21.122 +{
  21.123 +    return processed_system_time + ((j - jiffies) * NS_PER_TICK);
  21.124 +}
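As a worked example of the conversion above: with HZ=100 (so NS_PER_TICK = 10,000,000 ns), a timer due 50 jiffies from now gives __jiffies_to_st(jiffies + 50) = processed_system_time + 500,000,000 ns, i.e. the absolute system time at which the one-shot domain timer should fire.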
  21.125 +
  21.126 +
  21.127  static inline void do_timer_interrupt(int irq, void *dev_id,
  21.128                                        struct pt_regs *regs)
  21.129  {
  21.130      s64 delta;
  21.131 +    unsigned long ticks = 0;
  21.132      long sec_diff;
  21.133  
  21.134 -    get_time_values_from_xen();
  21.135 +    __get_time_values_from_xen();
  21.136  
  21.137      if ( (delta = (s64)(shadow_system_time - processed_system_time)) < 0 )
  21.138      {
  21.139 @@ -368,13 +394,24 @@ static inline void do_timer_interrupt(in
  21.140          return;
  21.141      }
  21.142  
  21.143 +    /* Process elapsed jiffies since last call. */
  21.144      while ( delta >= NS_PER_TICK )
  21.145      {
  21.146 -        do_timer(regs);
  21.147 +        ticks++;
  21.148          delta -= NS_PER_TICK;
  21.149          processed_system_time += NS_PER_TICK;
  21.150      }
  21.151 -    
  21.152 +
  21.153 +    if ( ticks != 0 )
  21.154 +    {
  21.155 +        do_timer_ticks(ticks);
  21.156 +
  21.157 +        if ( user_mode(regs) )
  21.158 +            update_process_times_us(ticks, 0);
  21.159 +        else
  21.160 +            update_process_times_us(0, ticks);
  21.161 +    }
  21.162 +
  21.163      /*
  21.164       * Take synchronised time from Xen once a minute if we're not
  21.165       * synchronised ourselves, and we haven't chosen to keep an independent
  21.166 @@ -446,6 +483,7 @@ static inline void do_timer_interrupt(in
  21.167  #endif
  21.168  }
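For concreteness, again with HZ=100 (NS_PER_TICK = 10,000,000 ns): if shadow_system_time has run 25 ms ahead of processed_system_time, the loop above accumulates ticks = 2 and advances processed_system_time by 20 ms, carrying the remaining 5 ms to the next interrupt; the two ticks are then accounted in one go via do_timer_ticks() and update_process_times_us(), rather than one do_timer() call per tick as before.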
  21.169  
  21.170 +
  21.171  static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  21.172  {
  21.173      write_lock(&xtime_lock);
  21.174 @@ -463,6 +501,89 @@ static struct irqaction irq_timer = {
  21.175      NULL
  21.176  };
  21.177  
  21.178 +
  21.179 +/*
   21.180 + * This function works out when the next timer function has to be
  21.181 + * executed (by looking at the timer list) and sets the Xen one-shot
  21.182 + * domain timer to the appropriate value. This is typically called in
  21.183 + * cpu_idle() before the domain blocks.
  21.184 + * 
  21.185 + * The function returns a non-0 value on error conditions.
  21.186 + * 
  21.187 + * It must be called with interrupts disabled.
  21.188 + */
  21.189 +extern spinlock_t timerlist_lock;
  21.190 +int set_timeout_timer(void)
  21.191 +{
  21.192 +    struct timer_list *timer;
  21.193 +    u64 alarm = 0;
  21.194 +    int ret = 0;
  21.195 +
  21.196 +    spin_lock(&timerlist_lock);
  21.197 +
  21.198 +    /*
  21.199 +     * This is safe against long blocking (since calculations are not based on 
  21.200 +     * TSC deltas). It is also safe against warped system time since
  21.201 +     * suspend-resume is cooperative and we would first get locked out. It is 
  21.202 +     * safe against normal updates of jiffies since interrupts are off.
  21.203 +     */
  21.204 +    if ( (timer = next_timer_event()) != NULL )
  21.205 +        alarm = __jiffies_to_st(timer->expires);
  21.206 +
  21.207 +    /* Failure is pretty bad, but we'd best soldier on. */
  21.208 +    if ( HYPERVISOR_set_dom_timer(alarm) != 0 )
  21.209 +        ret = -1;
  21.210 +    
  21.211 +    spin_unlock(&timerlist_lock);
  21.212 +
  21.213 +    return ret;
  21.214 +}
  21.215 +
  21.216 +
  21.217 +/* Time debugging. */
  21.218 +static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs)
  21.219 +{
  21.220 +    unsigned long flags, j;
  21.221 +    u64 s_now, j_st;
  21.222 +    struct timeval s_tv, tv;
  21.223 +
  21.224 +    struct timer_list *timer;
  21.225 +    u64 t_st;
  21.226 +
  21.227 +    read_lock_irqsave(&xtime_lock, flags);
  21.228 +    s_tv.tv_sec  = shadow_tv.tv_sec;
  21.229 +    s_tv.tv_usec = shadow_tv.tv_usec;
  21.230 +    s_now        = shadow_system_time;
  21.231 +    read_unlock_irqrestore(&xtime_lock, flags);
  21.232 +
  21.233 +    do_gettimeofday(&tv);
  21.234 +
  21.235 +    j = jiffies;
  21.236 +    j_st = __jiffies_to_st(j);
  21.237 +
  21.238 +    timer = next_timer_event();
  21.239 +    t_st = __jiffies_to_st(timer->expires);
  21.240 +
  21.241 +    printk(KERN_ALERT "time: shadow_st=0x%X:%08X\n",
  21.242 +           (u32)(s_now>>32), (u32)s_now);
  21.243 +    printk(KERN_ALERT "time: wct=%lds %ldus shadow_wct=%lds %ldus\n",
  21.244 +           tv.tv_sec, tv.tv_usec, s_tv.tv_sec, s_tv.tv_usec);
  21.245 +    printk(KERN_ALERT "time: jiffies=%lu(0x%X:%08X) timeout=%lu(0x%X:%08X)\n",
  21.246 +           jiffies,(u32)(j_st>>32), (u32)j_st,
  21.247 +           timer->expires,(u32)(t_st>>32), (u32)t_st);
  21.248 +    printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n",
  21.249 +           (u32)(processed_system_time>>32), (u32)processed_system_time);
  21.250 +}
  21.251 +
  21.252 +static struct irqaction dbg_time = {
  21.253 +    dbg_time_int, 
  21.254 +    SA_SHIRQ, 
  21.255 +    0, 
  21.256 +    "timer_dbg", 
  21.257 +    &dbg_time_int,
  21.258 +    NULL
  21.259 +};
  21.260 +
  21.261  void __init time_init(void)
  21.262  {
  21.263      unsigned long long alarm;
  21.264 @@ -494,10 +615,12 @@ void __init time_init(void)
  21.265      st_scale_f = scale & 0xffffffff;
  21.266      st_scale_i = scale >> 32;
  21.267  
  21.268 -    get_time_values_from_xen();
  21.269 +    __get_time_values_from_xen();
  21.270      processed_system_time = shadow_system_time;
  21.271  
  21.272 -    setup_irq(TIMER_IRQ, &irq_timer);
  21.273 +    (void)setup_irq(TIMER_IRQ, &irq_timer);
  21.274 +
  21.275 +    (void)setup_irq(_EVENT_DEBUG, &dbg_time);
  21.276  
  21.277      rdtscll(alarm);
  21.278  
    22.1 --- a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h	Fri Jan 30 15:55:06 2004 +0000
    22.2 +++ b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h	Sat Jan 31 19:45:13 2004 +0000
    22.3 @@ -256,6 +256,17 @@ static inline int HYPERVISOR_yield(void)
    22.4      return ret;
    22.5  }
    22.6  
    22.7 +static inline int HYPERVISOR_block(void)
    22.8 +{
    22.9 +    int ret;
   22.10 +    __asm__ __volatile__ (
   22.11 +        TRAP_INSTR
   22.12 +        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
   22.13 +        "b" (SCHEDOP_block) );
   22.14 +
   22.15 +    return ret;
   22.16 +}
   22.17 +
   22.18  static inline int HYPERVISOR_exit(void)
   22.19  {
   22.20      int ret;
   22.21 @@ -279,6 +290,19 @@ static inline int HYPERVISOR_stop(unsign
   22.22      return ret;
   22.23  }
   22.24  
   22.25 +static inline long HYPERVISOR_set_dom_timer(u64 timeout)
   22.26 +{
   22.27 +    int ret;
   22.28 +    unsigned long timeout_hi = (unsigned long)(timeout>>32);
   22.29 +    unsigned long timeout_lo = (unsigned long)timeout;
   22.30 +    __asm__ __volatile__ (
   22.31 +        TRAP_INSTR
   22.32 +        : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer),
   22.33 +        "b" (timeout_hi), "c" (timeout_lo) : "memory" );
   22.34 +
   22.35 +    return ret;
   22.36 +}
   22.37 +
   22.38  static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op)
   22.39  {
   22.40      int ret;
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/xenolinux-2.4.24-sparse/include/linux/sched.h	Sat Jan 31 19:45:13 2004 +0000
    23.3 @@ -0,0 +1,966 @@
    23.4 +#ifndef _LINUX_SCHED_H
    23.5 +#define _LINUX_SCHED_H
    23.6 +
    23.7 +#include <asm/param.h>	/* for HZ */
    23.8 +
    23.9 +extern unsigned long event;
   23.10 +
   23.11 +#include <linux/config.h>
   23.12 +#include <linux/binfmts.h>
   23.13 +#include <linux/threads.h>
   23.14 +#include <linux/kernel.h>
   23.15 +#include <linux/types.h>
   23.16 +#include <linux/times.h>
   23.17 +#include <linux/timex.h>
   23.18 +#include <linux/rbtree.h>
   23.19 +
   23.20 +#include <asm/system.h>
   23.21 +#include <asm/semaphore.h>
   23.22 +#include <asm/page.h>
   23.23 +#include <asm/ptrace.h>
   23.24 +#include <asm/mmu.h>
   23.25 +
   23.26 +#include <linux/smp.h>
   23.27 +#include <linux/tty.h>
   23.28 +#include <linux/sem.h>
   23.29 +#include <linux/signal.h>
   23.30 +#include <linux/securebits.h>
   23.31 +#include <linux/fs_struct.h>
   23.32 +
   23.33 +struct exec_domain;
   23.34 +
   23.35 +/*
   23.36 + * cloning flags:
   23.37 + */
   23.38 +#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
   23.39 +#define CLONE_VM	0x00000100	/* set if VM shared between processes */
   23.40 +#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
   23.41 +#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
   23.42 +#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
   23.43 +#define CLONE_PID	0x00001000	/* set if pid shared */
   23.44 +#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
   23.45 +#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
   23.46 +#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
   23.47 +#define CLONE_THREAD	0x00010000	/* Same thread group? */
   23.48 +#define CLONE_NEWNS	0x00020000	/* New namespace group? */
   23.49 +
   23.50 +#define CLONE_SIGNAL	(CLONE_SIGHAND | CLONE_THREAD)
   23.51 +
   23.52 +/*
    23.53 + * These are the constants used to fake the fixed-point load-average
   23.54 + * counting. Some notes:
   23.55 + *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
   23.56 + *    a load-average precision of 10 bits integer + 11 bits fractional
   23.57 + *  - if you want to count load-averages more often, you need more
   23.58 + *    precision, or rounding will get you. With 2-second counting freq,
   23.59 + *    the EXP_n values would be 1981, 2034 and 2043 if still using only
   23.60 + *    11 bit fractions.
   23.61 + */
   23.62 +extern unsigned long avenrun[];		/* Load averages */
   23.63 +
   23.64 +#define FSHIFT		11		/* nr of bits of precision */
   23.65 +#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
   23.66 +#define LOAD_FREQ	(5*HZ)		/* 5 sec intervals */
   23.67 +#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
   23.68 +#define EXP_5		2014		/* 1/exp(5sec/5min) */
   23.69 +#define EXP_15		2037		/* 1/exp(5sec/15min) */
   23.70 +
   23.71 +#define CALC_LOAD(load,exp,n) \
   23.72 +	load *= exp; \
   23.73 +	load += n*(FIXED_1-exp); \
   23.74 +	load >>= FSHIFT;
   23.75 +
   23.76 +#define CT_TO_SECS(x)	((x) / HZ)
   23.77 +#define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
   23.78 +
   23.79 +extern int nr_running, nr_threads;
   23.80 +extern int last_pid;
   23.81 +
   23.82 +#include <linux/fs.h>
   23.83 +#include <linux/time.h>
   23.84 +#include <linux/param.h>
   23.85 +#include <linux/resource.h>
   23.86 +#ifdef __KERNEL__
   23.87 +#include <linux/timer.h>
   23.88 +#endif
   23.89 +
   23.90 +#include <asm/processor.h>
   23.91 +
   23.92 +#define TASK_RUNNING		0
   23.93 +#define TASK_INTERRUPTIBLE	1
   23.94 +#define TASK_UNINTERRUPTIBLE	2
   23.95 +#define TASK_ZOMBIE		4
   23.96 +#define TASK_STOPPED		8
   23.97 +
   23.98 +#define __set_task_state(tsk, state_value)		\
   23.99 +	do { (tsk)->state = (state_value); } while (0)
  23.100 +#define set_task_state(tsk, state_value)		\
  23.101 +	set_mb((tsk)->state, (state_value))
  23.102 +
  23.103 +#define __set_current_state(state_value)			\
  23.104 +	do { current->state = (state_value); } while (0)
  23.105 +#define set_current_state(state_value)		\
  23.106 +	set_mb(current->state, (state_value))
  23.107 +
  23.108 +/*
  23.109 + * Scheduling policies
  23.110 + */
  23.111 +#define SCHED_OTHER		0
  23.112 +#define SCHED_FIFO		1
  23.113 +#define SCHED_RR		2
  23.114 +
  23.115 +/*
  23.116 + * This is an additional bit set when we want to
  23.117 + * yield the CPU for one re-schedule..
  23.118 + */
  23.119 +#define SCHED_YIELD		0x10
  23.120 +
  23.121 +struct sched_param {
  23.122 +	int sched_priority;
  23.123 +};
  23.124 +
  23.125 +struct completion;
  23.126 +
  23.127 +#ifdef __KERNEL__
  23.128 +
  23.129 +#include <linux/spinlock.h>
  23.130 +
  23.131 +/*
  23.132 + * This serializes "schedule()" and also protects
  23.133 + * the run-queue from deletions/modifications (but
  23.134 + * _adding_ to the beginning of the run-queue has
  23.135 + * a separate lock).
  23.136 + */
  23.137 +extern rwlock_t tasklist_lock;
  23.138 +extern spinlock_t runqueue_lock;
  23.139 +extern spinlock_t mmlist_lock;
  23.140 +
  23.141 +extern void sched_init(void);
  23.142 +extern void init_idle(void);
  23.143 +extern void show_state(void);
  23.144 +extern void cpu_init (void);
  23.145 +extern void trap_init(void);
  23.146 +extern void update_process_times(int user);
  23.147 +#ifdef CONFIG_NO_IDLE_HZ
  23.148 +extern void update_process_times_us(int user, int system);
  23.149 +#endif
  23.150 +extern void update_one_process(struct task_struct *p, unsigned long user,
  23.151 +			       unsigned long system, int cpu);
  23.152 +
  23.153 +#define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
  23.154 +extern signed long FASTCALL(schedule_timeout(signed long timeout));
  23.155 +asmlinkage void schedule(void);
  23.156 +
  23.157 +extern int schedule_task(struct tq_struct *task);
  23.158 +extern void flush_scheduled_tasks(void);
  23.159 +extern int start_context_thread(void);
  23.160 +extern int current_is_keventd(void);
  23.161 +
  23.162 +#if CONFIG_SMP
  23.163 +extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask);
  23.164 +#else
  23.165 +# define set_cpus_allowed(p, new_mask) do { } while (0)
  23.166 +#endif
  23.167 +
  23.168 +/*
  23.169 + * The default fd array needs to be at least BITS_PER_LONG,
  23.170 + * as this is the granularity returned by copy_fdset().
  23.171 + */
  23.172 +#define NR_OPEN_DEFAULT BITS_PER_LONG
  23.173 +
  23.174 +struct namespace;
  23.175 +/*
  23.176 + * Open file table structure
  23.177 + */
  23.178 +struct files_struct {
  23.179 +	atomic_t count;
  23.180 +	rwlock_t file_lock;	/* Protects all the below members.  Nests inside tsk->alloc_lock */
  23.181 +	int max_fds;
  23.182 +	int max_fdset;
  23.183 +	int next_fd;
  23.184 +	struct file ** fd;	/* current fd array */
  23.185 +	fd_set *close_on_exec;
  23.186 +	fd_set *open_fds;
  23.187 +	fd_set close_on_exec_init;
  23.188 +	fd_set open_fds_init;
  23.189 +	struct file * fd_array[NR_OPEN_DEFAULT];
  23.190 +};
  23.191 +
  23.192 +#define INIT_FILES \
  23.193 +{ 							\
  23.194 +	count:		ATOMIC_INIT(1), 		\
  23.195 +	file_lock:	RW_LOCK_UNLOCKED, 		\
  23.196 +	max_fds:	NR_OPEN_DEFAULT, 		\
  23.197 +	max_fdset:	__FD_SETSIZE, 			\
  23.198 +	next_fd:	0, 				\
  23.199 +	fd:		&init_files.fd_array[0], 	\
  23.200 +	close_on_exec:	&init_files.close_on_exec_init, \
  23.201 +	open_fds:	&init_files.open_fds_init, 	\
  23.202 +	close_on_exec_init: { { 0, } }, 		\
  23.203 +	open_fds_init:	{ { 0, } }, 			\
  23.204 +	fd_array:	{ NULL, } 			\
  23.205 +}
  23.206 +
  23.207 +/* Maximum number of active map areas.. This is a random (large) number */
  23.208 +#define DEFAULT_MAX_MAP_COUNT	(65536)
  23.209 +
  23.210 +extern int max_map_count;
  23.211 +
  23.212 +struct mm_struct {
  23.213 +	struct vm_area_struct * mmap;		/* list of VMAs */
  23.214 +	rb_root_t mm_rb;
  23.215 +	struct vm_area_struct * mmap_cache;	/* last find_vma result */
  23.216 +	pgd_t * pgd;
  23.217 +	atomic_t mm_users;			/* How many users with user space? */
  23.218 +	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
  23.219 +	int map_count;				/* number of VMAs */
  23.220 +	struct rw_semaphore mmap_sem;
  23.221 +	spinlock_t page_table_lock;		/* Protects task page tables and mm->rss */
  23.222 +
  23.223 +	struct list_head mmlist;		/* List of all active mm's.  These are globally strung
  23.224 +						 * together off init_mm.mmlist, and are protected
  23.225 +						 * by mmlist_lock
  23.226 +						 */
  23.227 +
  23.228 +	unsigned long start_code, end_code, start_data, end_data;
  23.229 +	unsigned long start_brk, brk, start_stack;
  23.230 +	unsigned long arg_start, arg_end, env_start, env_end;
  23.231 +	unsigned long rss, total_vm, locked_vm;
  23.232 +	unsigned long def_flags;
  23.233 +	unsigned long cpu_vm_mask;
  23.234 +	unsigned long swap_address;
  23.235 +
  23.236 +	unsigned dumpable:1;
  23.237 +
  23.238 +	/* Architecture-specific MM context */
  23.239 +	mm_context_t context;
  23.240 +};
  23.241 +
  23.242 +extern int mmlist_nr;
  23.243 +
  23.244 +#define INIT_MM(name) \
  23.245 +{			 				\
  23.246 +	mm_rb:		RB_ROOT,			\
  23.247 +	pgd:		swapper_pg_dir, 		\
  23.248 +	mm_users:	ATOMIC_INIT(2), 		\
  23.249 +	mm_count:	ATOMIC_INIT(1), 		\
  23.250 +	mmap_sem:	__RWSEM_INITIALIZER(name.mmap_sem), \
  23.251 +	page_table_lock: SPIN_LOCK_UNLOCKED, 		\
  23.252 +	mmlist:		LIST_HEAD_INIT(name.mmlist),	\
  23.253 +}
  23.254 +
  23.255 +struct signal_struct {
  23.256 +	atomic_t		count;
  23.257 +	struct k_sigaction	action[_NSIG];
  23.258 +	spinlock_t		siglock;
  23.259 +};
  23.260 +
  23.261 +
  23.262 +#define INIT_SIGNALS {	\
  23.263 +	count:		ATOMIC_INIT(1), 		\
  23.264 +	action:		{ {{0,}}, }, 			\
  23.265 +	siglock:	SPIN_LOCK_UNLOCKED 		\
  23.266 +}
  23.267 +
  23.268 +/*
  23.269 + * Some day this will be a full-fledged user tracking system..
  23.270 + */
  23.271 +struct user_struct {
  23.272 +	atomic_t __count;	/* reference count */
  23.273 +	atomic_t processes;	/* How many processes does this user have? */
  23.274 +	atomic_t files;		/* How many open files does this user have? */
  23.275 +
  23.276 +	/* Hash table maintenance information */
  23.277 +	struct user_struct *next, **pprev;
  23.278 +	uid_t uid;
  23.279 +};
  23.280 +
  23.281 +#define get_current_user() ({ 				\
  23.282 +	struct user_struct *__user = current->user;	\
  23.283 +	atomic_inc(&__user->__count);			\
  23.284 +	__user; })
  23.285 +
  23.286 +extern struct user_struct root_user;
  23.287 +#define INIT_USER (&root_user)
  23.288 +
  23.289 +struct task_struct {
  23.290 +	/*
  23.291 +	 * offsets of these are hardcoded elsewhere - touch with care
  23.292 +	 */
  23.293 +	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
  23.294 +	unsigned long flags;	/* per process flags, defined below */
  23.295 +	int sigpending;
  23.296 +	mm_segment_t addr_limit;	/* thread address space:
   23.297 +					 	0-0xBFFFFFFF for user-thread
  23.298 +						0-0xFFFFFFFF for kernel-thread
  23.299 +					 */
  23.300 +	struct exec_domain *exec_domain;
  23.301 +	volatile long need_resched;
  23.302 +	unsigned long ptrace;
  23.303 +
  23.304 +	int lock_depth;		/* Lock depth */
  23.305 +
  23.306 +/*
  23.307 + * offset 32 begins here on 32-bit platforms. We keep
  23.308 + * all fields in a single cacheline that are needed for
  23.309 + * the goodness() loop in schedule().
  23.310 + */
  23.311 +	long counter;
  23.312 +	long nice;
  23.313 +	unsigned long policy;
  23.314 +	struct mm_struct *mm;
  23.315 +	int processor;
  23.316 +	/*
  23.317 +	 * cpus_runnable is ~0 if the process is not running on any
  23.318 +	 * CPU. It's (1 << cpu) if it's running on a CPU. This mask
  23.319 +	 * is updated under the runqueue lock.
  23.320 +	 *
  23.321 +	 * To determine whether a process might run on a CPU, this
  23.322 +	 * mask is AND-ed with cpus_allowed.
  23.323 +	 */
  23.324 +	unsigned long cpus_runnable, cpus_allowed;
  23.325 +	/*
  23.326 +	 * (only the 'next' pointer fits into the cacheline, but
  23.327 +	 * that's just fine.)
  23.328 +	 */
  23.329 +	struct list_head run_list;
  23.330 +	unsigned long sleep_time;
  23.331 +
  23.332 +	struct task_struct *next_task, *prev_task;
  23.333 +	struct mm_struct *active_mm;
  23.334 +	struct list_head local_pages;
  23.335 +	unsigned int allocation_order, nr_local_pages;
  23.336 +
  23.337 +/* task state */
  23.338 +	struct linux_binfmt *binfmt;
  23.339 +	int exit_code, exit_signal;
  23.340 +	int pdeath_signal;  /*  The signal sent when the parent dies  */
  23.341 +	/* ??? */
  23.342 +	unsigned long personality;
  23.343 +	int did_exec:1;
  23.344 +	unsigned task_dumpable:1;
  23.345 +	pid_t pid;
  23.346 +	pid_t pgrp;
  23.347 +	pid_t tty_old_pgrp;
  23.348 +	pid_t session;
  23.349 +	pid_t tgid;
  23.350 +	/* boolean value for session group leader */
  23.351 +	int leader;
  23.352 +	/* 
  23.353 +	 * pointers to (original) parent process, youngest child, younger sibling,
  23.354 +	 * older sibling, respectively.  (p->father can be replaced with 
  23.355 +	 * p->p_pptr->pid)
  23.356 +	 */
  23.357 +	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
  23.358 +	struct list_head thread_group;
  23.359 +
  23.360 +	/* PID hash table linkage. */
  23.361 +	struct task_struct *pidhash_next;
  23.362 +	struct task_struct **pidhash_pprev;
  23.363 +
  23.364 +	wait_queue_head_t wait_chldexit;	/* for wait4() */
  23.365 +	struct completion *vfork_done;		/* for vfork() */
  23.366 +	unsigned long rt_priority;
  23.367 +	unsigned long it_real_value, it_prof_value, it_virt_value;
  23.368 +	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
  23.369 +	struct timer_list real_timer;
  23.370 +	struct tms times;
  23.371 +	unsigned long start_time;
  23.372 +	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
  23.373 +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
  23.374 +	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
  23.375 +	int swappable:1;
  23.376 +/* process credentials */
  23.377 +	uid_t uid,euid,suid,fsuid;
  23.378 +	gid_t gid,egid,sgid,fsgid;
  23.379 +	int ngroups;
  23.380 +	gid_t	groups[NGROUPS];
  23.381 +	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
  23.382 +	int keep_capabilities:1;
  23.383 +	struct user_struct *user;
  23.384 +/* limits */
  23.385 +	struct rlimit rlim[RLIM_NLIMITS];
  23.386 +	unsigned short used_math;
  23.387 +	char comm[16];
  23.388 +/* file system info */
  23.389 +	int link_count, total_link_count;
  23.390 +	struct tty_struct *tty; /* NULL if no tty */
  23.391 +	unsigned int locks; /* How many file locks are being held */
  23.392 +/* ipc stuff */
  23.393 +	struct sem_undo *semundo;
  23.394 +	struct sem_queue *semsleeping;
  23.395 +/* CPU-specific state of this task */
  23.396 +	struct thread_struct thread;
  23.397 +/* filesystem information */
  23.398 +	struct fs_struct *fs;
  23.399 +/* open file information */
  23.400 +	struct files_struct *files;
  23.401 +/* namespace */
  23.402 +	struct namespace *namespace;
  23.403 +/* signal handlers */
  23.404 +	spinlock_t sigmask_lock;	/* Protects signal and blocked */
  23.405 +	struct signal_struct *sig;
  23.406 +
  23.407 +	sigset_t blocked;
  23.408 +	struct sigpending pending;
  23.409 +
  23.410 +	unsigned long sas_ss_sp;
  23.411 +	size_t sas_ss_size;
  23.412 +	int (*notifier)(void *priv);
  23.413 +	void *notifier_data;
  23.414 +	sigset_t *notifier_mask;
  23.415 +	
  23.416 +/* Thread group tracking */
  23.417 +   	u32 parent_exec_id;
  23.418 +   	u32 self_exec_id;
  23.419 +/* Protection of (de-)allocation: mm, files, fs, tty */
  23.420 +	spinlock_t alloc_lock;
  23.421 +
  23.422 +/* journalling filesystem info */
  23.423 +	void *journal_info;
  23.424 +};
  23.425 +
  23.426 +/*
  23.427 + * Per process flags
  23.428 + */
  23.429 +#define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
  23.430 +					/* Not implemented yet, only for 486*/
  23.431 +#define PF_STARTING	0x00000002	/* being created */
  23.432 +#define PF_EXITING	0x00000004	/* getting shut down */
  23.433 +#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
  23.434 +#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
  23.435 +#define PF_DUMPCORE	0x00000200	/* dumped core */
  23.436 +#define PF_SIGNALED	0x00000400	/* killed by a signal */
  23.437 +#define PF_MEMALLOC	0x00000800	/* Allocating memory */
  23.438 +#define PF_FREE_PAGES	0x00002000	/* per process page freeing */
  23.439 +#define PF_NOIO		0x00004000	/* avoid generating further I/O */
  23.440 +
  23.441 +#define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
  23.442 +
  23.443 +/*
  23.444 + * Ptrace flags
  23.445 + */
  23.446 +
  23.447 +#define PT_PTRACED	0x00000001
  23.448 +#define PT_TRACESYS	0x00000002
  23.449 +#define PT_DTRACE	0x00000004	/* delayed trace (used on m68k, i386) */
  23.450 +#define PT_TRACESYSGOOD	0x00000008
  23.451 +#define PT_PTRACE_CAP	0x00000010	/* ptracer can follow suid-exec */
  23.452 +
  23.453 +#define is_dumpable(tsk)    ((tsk)->task_dumpable && (tsk)->mm && (tsk)->mm->dumpable)
  23.454 +
  23.455 +/*
   23.456 + * Limit the stack to some sane default: root can always
   23.457 + * increase this limit if needed.  8MB seems reasonable.
  23.458 + */
  23.459 +#define _STK_LIM	(8*1024*1024)
  23.460 +
  23.461 +#define DEF_COUNTER	(10*HZ/100)	/* 100 ms time slice */
  23.462 +#define MAX_COUNTER	(20*HZ/100)
  23.463 +#define DEF_NICE	(0)
  23.464 +
  23.465 +extern void yield(void);
  23.466 +
  23.467 +/*
  23.468 + * The default (Linux) execution domain.
  23.469 + */
  23.470 +extern struct exec_domain	default_exec_domain;
  23.471 +
  23.472 +/*
  23.473 + *  INIT_TASK is used to set up the first task table, touch at
   23.474 + * your own risk! Base=0, limit=0x1fffff (=2MB)
  23.475 + */
  23.476 +#define INIT_TASK(tsk)	\
  23.477 +{									\
  23.478 +    state:		0,						\
  23.479 +    flags:		0,						\
  23.480 +    sigpending:		0,						\
  23.481 +    addr_limit:		KERNEL_DS,					\
  23.482 +    exec_domain:	&default_exec_domain,				\
  23.483 +    lock_depth:		-1,						\
  23.484 +    counter:		DEF_COUNTER,					\
  23.485 +    nice:		DEF_NICE,					\
  23.486 +    policy:		SCHED_OTHER,					\
  23.487 +    mm:			NULL,						\
  23.488 +    active_mm:		&init_mm,					\
  23.489 +    cpus_runnable:	~0UL,						\
  23.490 +    cpus_allowed:	~0UL,						\
  23.491 +    run_list:		LIST_HEAD_INIT(tsk.run_list),			\
  23.492 +    next_task:		&tsk,						\
  23.493 +    prev_task:		&tsk,						\
  23.494 +    p_opptr:		&tsk,						\
  23.495 +    p_pptr:		&tsk,						\
  23.496 +    thread_group:	LIST_HEAD_INIT(tsk.thread_group),		\
  23.497 +    wait_chldexit:	__WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
  23.498 +    real_timer:		{						\
  23.499 +	function:		it_real_fn				\
  23.500 +    },									\
  23.501 +    cap_effective:	CAP_INIT_EFF_SET,				\
  23.502 +    cap_inheritable:	CAP_INIT_INH_SET,				\
  23.503 +    cap_permitted:	CAP_FULL_SET,					\
  23.504 +    keep_capabilities:	0,						\
  23.505 +    rlim:		INIT_RLIMITS,					\
  23.506 +    user:		INIT_USER,					\
  23.507 +    comm:		"swapper",					\
  23.508 +    thread:		INIT_THREAD,					\
  23.509 +    fs:			&init_fs,					\
  23.510 +    files:		&init_files,					\
  23.511 +    sigmask_lock:	SPIN_LOCK_UNLOCKED,				\
  23.512 +    sig:		&init_signals,					\
  23.513 +    pending:		{ NULL, &tsk.pending.head, {{0}}},		\
  23.514 +    blocked:		{{0}},						\
  23.515 +    alloc_lock:		SPIN_LOCK_UNLOCKED,				\
  23.516 +    journal_info:	NULL,						\
  23.517 +}
  23.518 +
  23.519 +
  23.520 +#ifndef INIT_TASK_SIZE
  23.521 +# define INIT_TASK_SIZE	2048*sizeof(long)
  23.522 +#endif
  23.523 +
  23.524 +union task_union {
  23.525 +	struct task_struct task;
  23.526 +	unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
  23.527 +};
  23.528 +
  23.529 +extern union task_union init_task_union;
  23.530 +
  23.531 +extern struct   mm_struct init_mm;
  23.532 +extern struct task_struct *init_tasks[NR_CPUS];
  23.533 +
   23.534 +/* PID hashing. (shouldn't this be dynamic?) */
  23.535 +#define PIDHASH_SZ (4096 >> 2)
  23.536 +extern struct task_struct *pidhash[PIDHASH_SZ];
  23.537 +
  23.538 +#define pid_hashfn(x)	((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
  23.539 +
  23.540 +static inline void hash_pid(struct task_struct *p)
  23.541 +{
  23.542 +	struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
  23.543 +
  23.544 +	if((p->pidhash_next = *htable) != NULL)
  23.545 +		(*htable)->pidhash_pprev = &p->pidhash_next;
  23.546 +	*htable = p;
  23.547 +	p->pidhash_pprev = htable;
  23.548 +}
  23.549 +
  23.550 +static inline void unhash_pid(struct task_struct *p)
  23.551 +{
  23.552 +	if(p->pidhash_next)
  23.553 +		p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
  23.554 +	*p->pidhash_pprev = p->pidhash_next;
  23.555 +}
  23.556 +
  23.557 +static inline struct task_struct *find_task_by_pid(int pid)
  23.558 +{
  23.559 +	struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
  23.560 +
  23.561 +	for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
  23.562 +		;
  23.563 +
  23.564 +	return p;
  23.565 +}
  23.566 +
  23.567 +#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
  23.568 +
  23.569 +static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
  23.570 +{
  23.571 +	tsk->processor = cpu;
  23.572 +	tsk->cpus_runnable = 1UL << cpu;
  23.573 +}
  23.574 +
  23.575 +static inline void task_release_cpu(struct task_struct *tsk)
  23.576 +{
  23.577 +	tsk->cpus_runnable = ~0UL;
  23.578 +}
  23.579 +
  23.580 +/* per-UID process charging. */
  23.581 +extern struct user_struct * alloc_uid(uid_t);
  23.582 +extern void free_uid(struct user_struct *);
  23.583 +extern void switch_uid(struct user_struct *);
  23.584 +
  23.585 +#include <asm/current.h>
  23.586 +
  23.587 +extern unsigned long volatile jiffies;
  23.588 +extern unsigned long itimer_ticks;
  23.589 +extern unsigned long itimer_next;
  23.590 +extern struct timeval xtime;
  23.591 +extern void do_timer(struct pt_regs *);
  23.592 +#ifdef CONFIG_NO_IDLE_HZ
  23.593 +extern void do_timer_ticks(int ticks);
  23.594 +#endif
  23.595 +
  23.596 +extern unsigned int * prof_buffer;
  23.597 +extern unsigned long prof_len;
  23.598 +extern unsigned long prof_shift;
  23.599 +
  23.600 +#define CURRENT_TIME (xtime.tv_sec)
  23.601 +
  23.602 +extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
  23.603 +extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
  23.604 +extern void FASTCALL(sleep_on(wait_queue_head_t *q));
  23.605 +extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
  23.606 +				      signed long timeout));
  23.607 +extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
  23.608 +extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
  23.609 +						    signed long timeout));
  23.610 +extern int FASTCALL(wake_up_process(struct task_struct * tsk));
  23.611 +
  23.612 +#define wake_up(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
  23.613 +#define wake_up_nr(x, nr)		__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
  23.614 +#define wake_up_all(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
  23.615 +#define wake_up_sync(x)			__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
  23.616 +#define wake_up_sync_nr(x, nr)		__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
  23.617 +#define wake_up_interruptible(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
  23.618 +#define wake_up_interruptible_nr(x, nr)	__wake_up((x),TASK_INTERRUPTIBLE, nr)
  23.619 +#define wake_up_interruptible_all(x)	__wake_up((x),TASK_INTERRUPTIBLE, 0)
  23.620 +#define wake_up_interruptible_sync(x)	__wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
  23.621 +#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE,  nr)
  23.622 +asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
  23.623 +
  23.624 +extern int in_group_p(gid_t);
  23.625 +extern int in_egroup_p(gid_t);
  23.626 +
  23.627 +extern void proc_caches_init(void);
  23.628 +extern void flush_signals(struct task_struct *);
  23.629 +extern void flush_signal_handlers(struct task_struct *);
  23.630 +extern void sig_exit(int, int, struct siginfo *);
  23.631 +extern int dequeue_signal(sigset_t *, siginfo_t *);
  23.632 +extern void block_all_signals(int (*notifier)(void *priv), void *priv,
  23.633 +			      sigset_t *mask);
  23.634 +extern void unblock_all_signals(void);
  23.635 +extern int send_sig_info(int, struct siginfo *, struct task_struct *);
  23.636 +extern int force_sig_info(int, struct siginfo *, struct task_struct *);
  23.637 +extern int kill_pg_info(int, struct siginfo *, pid_t);
  23.638 +extern int kill_sl_info(int, struct siginfo *, pid_t);
  23.639 +extern int kill_proc_info(int, struct siginfo *, pid_t);
  23.640 +extern void notify_parent(struct task_struct *, int);
  23.641 +extern void do_notify_parent(struct task_struct *, int);
  23.642 +extern void force_sig(int, struct task_struct *);
  23.643 +extern int send_sig(int, struct task_struct *, int);
  23.644 +extern int kill_pg(pid_t, int, int);
  23.645 +extern int kill_sl(pid_t, int, int);
  23.646 +extern int kill_proc(pid_t, int, int);
  23.647 +extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
  23.648 +extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
  23.649 +
  23.650 +static inline int signal_pending(struct task_struct *p)
  23.651 +{
  23.652 +	return (p->sigpending != 0);
  23.653 +}
  23.654 +
  23.655 +/*
  23.656 + * Re-calculate pending state from the set of locally pending
  23.657 + * signals, globally pending signals, and blocked signals.
  23.658 + */
  23.659 +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
  23.660 +{
  23.661 +	unsigned long ready;
  23.662 +	long i;
  23.663 +
  23.664 +	switch (_NSIG_WORDS) {
  23.665 +	default:
  23.666 +		for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
  23.667 +			ready |= signal->sig[i] &~ blocked->sig[i];
  23.668 +		break;
  23.669 +
  23.670 +	case 4: ready  = signal->sig[3] &~ blocked->sig[3];
  23.671 +		ready |= signal->sig[2] &~ blocked->sig[2];
  23.672 +		ready |= signal->sig[1] &~ blocked->sig[1];
  23.673 +		ready |= signal->sig[0] &~ blocked->sig[0];
  23.674 +		break;
  23.675 +
  23.676 +	case 2: ready  = signal->sig[1] &~ blocked->sig[1];
  23.677 +		ready |= signal->sig[0] &~ blocked->sig[0];
  23.678 +		break;
  23.679 +
  23.680 +	case 1: ready  = signal->sig[0] &~ blocked->sig[0];
  23.681 +	}
  23.682 +	return ready !=	0;
  23.683 +}
  23.684 +
  23.685 +/* Reevaluate whether the task has signals pending delivery.
  23.686 +   This is required every time the blocked sigset_t changes.
  23.687 +   All callers should have t->sigmask_lock.  */
  23.688 +
  23.689 +static inline void recalc_sigpending(struct task_struct *t)
  23.690 +{
  23.691 +	t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
  23.692 +}
  23.693 +
  23.694 +/* True if we are on the alternate signal stack.  */
  23.695 +
  23.696 +static inline int on_sig_stack(unsigned long sp)
  23.697 +{
  23.698 +	return (sp - current->sas_ss_sp < current->sas_ss_size);
  23.699 +}
  23.700 +
  23.701 +static inline int sas_ss_flags(unsigned long sp)
  23.702 +{
  23.703 +	return (current->sas_ss_size == 0 ? SS_DISABLE
  23.704 +		: on_sig_stack(sp) ? SS_ONSTACK : 0);
  23.705 +}
  23.706 +
  23.707 +extern int request_irq(unsigned int,
  23.708 +		       void (*handler)(int, void *, struct pt_regs *),
  23.709 +		       unsigned long, const char *, void *);
  23.710 +extern void free_irq(unsigned int, void *);
  23.711 +
  23.712 +/*
  23.713 + * This has now become a routine instead of a macro, it sets a flag if
  23.714 + * it returns true (to do BSD-style accounting where the process is flagged
  23.715 + * if it uses root privs). The implication of this is that you should do
  23.716 + * normal permissions checks first, and check suser() last.
  23.717 + *
  23.718 + * [Dec 1997 -- Chris Evans]
  23.719 + * For correctness, the above considerations need to be extended to
  23.720 + * fsuser(). This is done, along with moving fsuser() checks to be
  23.721 + * last.
  23.722 + *
  23.723 + * These will be removed, but in the mean time, when the SECURE_NOROOT 
  23.724 + * flag is set, uids don't grant privilege.
  23.725 + */
  23.726 +static inline int suser(void)
  23.727 +{
  23.728 +	if (!issecure(SECURE_NOROOT) && current->euid == 0) { 
  23.729 +		current->flags |= PF_SUPERPRIV;
  23.730 +		return 1;
  23.731 +	}
  23.732 +	return 0;
  23.733 +}
  23.734 +
  23.735 +static inline int fsuser(void)
  23.736 +{
  23.737 +	if (!issecure(SECURE_NOROOT) && current->fsuid == 0) {
  23.738 +		current->flags |= PF_SUPERPRIV;
  23.739 +		return 1;
  23.740 +	}
  23.741 +	return 0;
  23.742 +}
  23.743 +
  23.744 +/*
  23.745 + * capable() checks for a particular capability.  
  23.746 + * New privilege checks should use this interface, rather than suser() or
  23.747 + * fsuser(). See include/linux/capability.h for defined capabilities.
  23.748 + */
  23.749 +
  23.750 +static inline int capable(int cap)
  23.751 +{
  23.752 +#if 1 /* ok now */
  23.753 +	if (cap_raised(current->cap_effective, cap))
  23.754 +#else
  23.755 +	if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0)
  23.756 +#endif
  23.757 +	{
  23.758 +		current->flags |= PF_SUPERPRIV;
  23.759 +		return 1;
  23.760 +	}
  23.761 +	return 0;
  23.762 +}
  23.763 +
  23.764 +/*
  23.765 + * Routines for handling mm_structs
  23.766 + */
  23.767 +extern struct mm_struct * mm_alloc(void);
  23.768 +
  23.769 +extern struct mm_struct * start_lazy_tlb(void);
  23.770 +extern void end_lazy_tlb(struct mm_struct *mm);
  23.771 +
  23.772 +/* mmdrop drops the mm and the page tables */
  23.773 +extern inline void FASTCALL(__mmdrop(struct mm_struct *));
  23.774 +static inline void mmdrop(struct mm_struct * mm)
  23.775 +{
  23.776 +	if (atomic_dec_and_test(&mm->mm_count))
  23.777 +		__mmdrop(mm);
  23.778 +}
  23.779 +
  23.780 +/* mmput gets rid of the mappings and all user-space */
  23.781 +extern void mmput(struct mm_struct *);
  23.782 +/* Remove the current tasks stale references to the old mm_struct */
  23.783 +extern void mm_release(void);
  23.784 +
  23.785 +/*
  23.786 + * Routines for handling the fd arrays
  23.787 + */
  23.788 +extern struct file ** alloc_fd_array(int);
  23.789 +extern int expand_fd_array(struct files_struct *, int nr);
  23.790 +extern void free_fd_array(struct file **, int);
  23.791 +
  23.792 +extern fd_set *alloc_fdset(int);
  23.793 +extern int expand_fdset(struct files_struct *, int nr);
  23.794 +extern void free_fdset(fd_set *, int);
  23.795 +
  23.796 +extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
  23.797 +extern void flush_thread(void);
  23.798 +extern void exit_thread(void);
  23.799 +
  23.800 +extern void exit_mm(struct task_struct *);
  23.801 +extern void exit_files(struct task_struct *);
  23.802 +extern void exit_sighand(struct task_struct *);
  23.803 +
  23.804 +extern void reparent_to_init(void);
  23.805 +extern void daemonize(void);
  23.806 +
  23.807 +extern int do_execve(char *, char **, char **, struct pt_regs *);
  23.808 +extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
  23.809 +
  23.810 +extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
  23.811 +extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
  23.812 +extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
  23.813 +
  23.814 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
  23.815 +
  23.816 +#define __wait_event(wq, condition) 					\
  23.817 +do {									\
  23.818 +	wait_queue_t __wait;						\
  23.819 +	init_waitqueue_entry(&__wait, current);				\
  23.820 +									\
  23.821 +	add_wait_queue(&wq, &__wait);					\
  23.822 +	for (;;) {							\
  23.823 +		set_current_state(TASK_UNINTERRUPTIBLE);		\
  23.824 +		if (condition)						\
  23.825 +			break;						\
  23.826 +		schedule();						\
  23.827 +	}								\
  23.828 +	current->state = TASK_RUNNING;					\
  23.829 +	remove_wait_queue(&wq, &__wait);				\
  23.830 +} while (0)
  23.831 +
  23.832 +#define wait_event(wq, condition) 					\
  23.833 +do {									\
  23.834 +	if (condition)	 						\
  23.835 +		break;							\
  23.836 +	__wait_event(wq, condition);					\
  23.837 +} while (0)
  23.838 +
  23.839 +#define __wait_event_interruptible(wq, condition, ret)			\
  23.840 +do {									\
  23.841 +	wait_queue_t __wait;						\
  23.842 +	init_waitqueue_entry(&__wait, current);				\
  23.843 +									\
  23.844 +	add_wait_queue(&wq, &__wait);					\
  23.845 +	for (;;) {							\
  23.846 +		set_current_state(TASK_INTERRUPTIBLE);			\
  23.847 +		if (condition)						\
  23.848 +			break;						\
  23.849 +		if (!signal_pending(current)) {				\
  23.850 +			schedule();					\
  23.851 +			continue;					\
  23.852 +		}							\
  23.853 +		ret = -ERESTARTSYS;					\
  23.854 +		break;							\
  23.855 +	}								\
  23.856 +	current->state = TASK_RUNNING;					\
  23.857 +	remove_wait_queue(&wq, &__wait);				\
  23.858 +} while (0)
  23.859 +	
  23.860 +#define wait_event_interruptible(wq, condition)				\
  23.861 +({									\
  23.862 +	int __ret = 0;							\
  23.863 +	if (!(condition))						\
  23.864 +		__wait_event_interruptible(wq, condition, __ret);	\
  23.865 +	__ret;								\
  23.866 +})
  23.867 +
  23.868 +#define REMOVE_LINKS(p) do { \
  23.869 +	(p)->next_task->prev_task = (p)->prev_task; \
  23.870 +	(p)->prev_task->next_task = (p)->next_task; \
  23.871 +	if ((p)->p_osptr) \
  23.872 +		(p)->p_osptr->p_ysptr = (p)->p_ysptr; \
  23.873 +	if ((p)->p_ysptr) \
  23.874 +		(p)->p_ysptr->p_osptr = (p)->p_osptr; \
  23.875 +	else \
  23.876 +		(p)->p_pptr->p_cptr = (p)->p_osptr; \
  23.877 +	} while (0)
  23.878 +
  23.879 +#define SET_LINKS(p) do { \
  23.880 +	(p)->next_task = &init_task; \
  23.881 +	(p)->prev_task = init_task.prev_task; \
  23.882 +	init_task.prev_task->next_task = (p); \
  23.883 +	init_task.prev_task = (p); \
  23.884 +	(p)->p_ysptr = NULL; \
  23.885 +	if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
  23.886 +		(p)->p_osptr->p_ysptr = p; \
  23.887 +	(p)->p_pptr->p_cptr = p; \
  23.888 +	} while (0)
  23.889 +
  23.890 +#define for_each_task(p) \
  23.891 +	for (p = &init_task ; (p = p->next_task) != &init_task ; )
  23.892 +
  23.893 +#define for_each_thread(task) \
  23.894 +	for (task = next_thread(current) ; task != current ; task = next_thread(task))
  23.895 +
  23.896 +#define next_thread(p) \
  23.897 +	list_entry((p)->thread_group.next, struct task_struct, thread_group)
  23.898 +
  23.899 +#define thread_group_leader(p)	(p->pid == p->tgid)
  23.900 +
  23.901 +static inline void del_from_runqueue(struct task_struct * p)
  23.902 +{
  23.903 +	nr_running--;
  23.904 +	p->sleep_time = jiffies;
  23.905 +	list_del(&p->run_list);
  23.906 +	p->run_list.next = NULL;
  23.907 +}
  23.908 +
  23.909 +static inline int task_on_runqueue(struct task_struct *p)
  23.910 +{
  23.911 +	return (p->run_list.next != NULL);
  23.912 +}
  23.913 +
  23.914 +static inline void unhash_process(struct task_struct *p)
  23.915 +{
  23.916 +	if (task_on_runqueue(p))
  23.917 +		out_of_line_bug();
  23.918 +	write_lock_irq(&tasklist_lock);
  23.919 +	nr_threads--;
  23.920 +	unhash_pid(p);
  23.921 +	REMOVE_LINKS(p);
  23.922 +	list_del(&p->thread_group);
  23.923 +	write_unlock_irq(&tasklist_lock);
  23.924 +}
  23.925 +
  23.926 +/* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
  23.927 +static inline void task_lock(struct task_struct *p)
  23.928 +{
  23.929 +	spin_lock(&p->alloc_lock);
  23.930 +}
  23.931 +
  23.932 +static inline void task_unlock(struct task_struct *p)
  23.933 +{
  23.934 +	spin_unlock(&p->alloc_lock);
  23.935 +}
  23.936 +
  23.937 +/* write full pathname into buffer and return start of pathname */
  23.938 +static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
  23.939 +				char *buf, int buflen)
  23.940 +{
  23.941 +	char *res;
  23.942 +	struct vfsmount *rootmnt;
  23.943 +	struct dentry *root;
  23.944 +	read_lock(&current->fs->lock);
  23.945 +	rootmnt = mntget(current->fs->rootmnt);
  23.946 +	root = dget(current->fs->root);
  23.947 +	read_unlock(&current->fs->lock);
  23.948 +	spin_lock(&dcache_lock);
  23.949 +	res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
  23.950 +	spin_unlock(&dcache_lock);
  23.951 +	dput(root);
  23.952 +	mntput(rootmnt);
  23.953 +	return res;
  23.954 +}
  23.955 +
  23.956 +static inline int need_resched(void)
  23.957 +{
  23.958 +	return (unlikely(current->need_resched));
  23.959 +}
  23.960 +
  23.961 +extern void __cond_resched(void);
  23.962 +static inline void cond_resched(void)
  23.963 +{
  23.964 +	if (need_resched())
  23.965 +		__cond_resched();
  23.966 +}
  23.967 +
  23.968 +#endif /* __KERNEL__ */
  23.969 +#endif
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/xenolinux-2.4.24-sparse/include/linux/timer.h	Sat Jan 31 19:45:13 2004 +0000
    24.3 @@ -0,0 +1,77 @@
    24.4 +#ifndef _LINUX_TIMER_H
    24.5 +#define _LINUX_TIMER_H
    24.6 +
    24.7 +#include <linux/config.h>
    24.8 +#include <linux/list.h>
    24.9 +
   24.10 +/*
   24.11 + * In Linux 2.4, static timers have been removed from the kernel.
   24.12 + * Timers may be dynamically created and destroyed, and should be initialized
   24.13 + * by a call to init_timer() upon creation.
   24.14 + *
   24.15 + * The "data" field enables use of a common timeout function for several
   24.16 + * timeouts. You can use this field to distinguish between the different
   24.17 + * invocations.
   24.18 + */
   24.19 +struct timer_list {
   24.20 +	struct list_head list;
   24.21 +	unsigned long expires;
   24.22 +	unsigned long data;
   24.23 +	void (*function)(unsigned long);
   24.24 +};
   24.25 +
   24.26 +extern void add_timer(struct timer_list * timer);
   24.27 +extern int del_timer(struct timer_list * timer);
   24.28 +#ifdef CONFIG_NO_IDLE_HZ
   24.29 +extern struct timer_list *next_timer_event(void);
   24.30 +#endif
   24.31 +
   24.32 +#ifdef CONFIG_SMP
   24.33 +extern int del_timer_sync(struct timer_list * timer);
   24.34 +extern void sync_timers(void);
   24.35 +#else
   24.36 +#define del_timer_sync(t)	del_timer(t)
   24.37 +#define sync_timers()		do { } while (0)
   24.38 +#endif
   24.39 +
   24.40 +/*
   24.41 + * mod_timer is a more efficient way to update the expire field of an
   24.42 + * active timer (if the timer is inactive it will be activated)
   24.43 + * mod_timer(a,b) is equivalent to del_timer(a); a->expires = b; add_timer(a).
   24.44 + * If the timer is known to be not pending (ie, in the handler), mod_timer
   24.45 + * is less efficient than a->expires = b; add_timer(a).
   24.46 + */
   24.47 +int mod_timer(struct timer_list *timer, unsigned long expires);
   24.48 +
   24.49 +extern void it_real_fn(unsigned long);
   24.50 +
   24.51 +static inline void init_timer(struct timer_list * timer)
   24.52 +{
   24.53 +	timer->list.next = timer->list.prev = NULL;
   24.54 +}
   24.55 +
   24.56 +static inline int timer_pending (const struct timer_list * timer)
   24.57 +{
   24.58 +	return timer->list.next != NULL;
   24.59 +}
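Since linux/timer.h is newly added to the sparse tree, a brief usage sketch of the interface declared above may help; the callback name, the cookie value and the one-second period are illustrative only:

    static void my_timeout(unsigned long data)
    {
        /* 'data' distinguishes invocations that share this handler */
        printk("timeout for object %lu\n", data);
    }

    static struct timer_list my_timer;

    static void arm_example(void)
    {
        init_timer(&my_timer);
        my_timer.function = my_timeout;
        my_timer.data     = 42;               /* example cookie */
        my_timer.expires  = jiffies + HZ;     /* roughly one second from now */
        add_timer(&my_timer);
        /* later: mod_timer(&my_timer, jiffies + 2*HZ); or del_timer(&my_timer); */
    }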
   24.60 +
   24.61 +/*
   24.62 + *	These inlines deal with timer wrapping correctly. You are 
    24.63 + *	strongly encouraged to use them:
    24.64 + *	1. Because people otherwise forget
    24.65 + *	2. Because if the timer wrap changes in future you won't have to
   24.66 + *	   alter your driver code.
   24.67 + *
   24.68 + * time_after(a,b) returns true if the time a is after time b.
   24.69 + *
   24.70 + * Do this with "<0" and ">=0" to only test the sign of the result. A
   24.71 + * good compiler would generate better code (and a really good compiler
   24.72 + * wouldn't care). Gcc is currently neither.
   24.73 + */
   24.74 +#define time_after(a,b)		((long)(b) - (long)(a) < 0)
   24.75 +#define time_before(a,b)	time_after(b,a)
   24.76 +
   24.77 +#define time_after_eq(a,b)	((long)(a) - (long)(b) >= 0)
   24.78 +#define time_before_eq(a,b)	time_after_eq(b,a)
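A small illustration of why the signed-difference form matters near a jiffies wrap (values chosen only to sit either side of the 32-bit boundary):

    unsigned long now      = 0xfffffff0UL;   /* just before the wrap */
    unsigned long deadline = now + 0x20;     /* wraps around to 0x10 */

    /* deadline < now              -> a naive comparison says the deadline has passed */
    /* time_after(now, deadline) == 0 -> correctly reports the deadline still ahead   */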
   24.79 +
   24.80 +#endif
    25.1 --- a/xenolinux-2.4.24-sparse/kernel/panic.c	Fri Jan 30 15:55:06 2004 +0000
    25.2 +++ b/xenolinux-2.4.24-sparse/kernel/panic.c	Sat Jan 31 19:45:13 2004 +0000
    25.3 @@ -110,7 +110,8 @@ NORET_TYPE void panic(const char * fmt, 
    25.4  #endif
    25.5  		CHECK_EMERGENCY_SYNC
    25.6  #if defined(CONFIG_XENO)
    25.7 -                HYPERVISOR_exit();
    25.8 +		HYPERVISOR_console_write(buf, strlen(buf));
    25.9 +		HYPERVISOR_exit();
   25.10  #endif
   25.11  	}
   25.12  }
    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/xenolinux-2.4.24-sparse/kernel/timer.c	Sat Jan 31 19:45:13 2004 +0000
    26.3 @@ -0,0 +1,968 @@
    26.4 +/*
    26.5 + *  linux/kernel/timer.c
    26.6 + *
    26.7 + *  Kernel internal timers, kernel timekeeping, basic process system calls
    26.8 + *
    26.9 + *  Copyright (C) 1991, 1992  Linus Torvalds
   26.10 + *
   26.11 + *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
   26.12 + *
   26.13 + *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
   26.14 + *              "A Kernel Model for Precision Timekeeping" by Dave Mills
   26.15 + *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
   26.16 + *              serialize accesses to xtime/lost_ticks).
   26.17 + *                              Copyright (C) 1998  Andrea Arcangeli
   26.18 + *  1999-03-10  Improved NTP compatibility by Ulrich Windl
   26.19 + */
   26.20 +
   26.21 +#include <linux/config.h>
   26.22 +#include <linux/mm.h>
   26.23 +#include <linux/timex.h>
   26.24 +#include <linux/delay.h>
   26.25 +#include <linux/smp_lock.h>
   26.26 +#include <linux/interrupt.h>
   26.27 +#include <linux/kernel_stat.h>
   26.28 +
   26.29 +#include <asm/uaccess.h>
   26.30 +
   26.31 +/*
   26.32 + * Timekeeping variables
   26.33 + */
   26.34 +
   26.35 +long tick = (1000000 + HZ/2) / HZ;	/* timer interrupt period */
   26.36 +
   26.37 +/* The current time */
   26.38 +struct timeval xtime __attribute__ ((aligned (16)));
   26.39 +
   26.40 +/* Don't completely fail for HZ > 500.  */
   26.41 +int tickadj = 500/HZ ? : 1;		/* microsecs */
   26.42 +
   26.43 +DECLARE_TASK_QUEUE(tq_timer);
   26.44 +DECLARE_TASK_QUEUE(tq_immediate);
   26.45 +
   26.46 +/*
   26.47 + * phase-lock loop variables
   26.48 + */
   26.49 +/* TIME_ERROR prevents overwriting the CMOS clock */
   26.50 +int time_state = TIME_OK;		/* clock synchronization status	*/
   26.51 +int time_status = STA_UNSYNC;		/* clock status bits		*/
   26.52 +long time_offset;			/* time adjustment (us)		*/
   26.53 +long time_constant = 2;			/* pll time constant		*/
   26.54 +long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
   26.55 +long time_precision = 1;		/* clock precision (us)		*/
   26.56 +long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
   26.57 +long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
   26.58 +long time_phase;			/* phase offset (scaled us)	*/
   26.59 +long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
   26.60 +					/* frequency offset (scaled ppm)*/
   26.61 +long time_adj;				/* tick adjust (scaled 1 / HZ)	*/
   26.62 +long time_reftime;			/* time at last adjustment (s)	*/
   26.63 +
   26.64 +long time_adjust;
   26.65 +long time_adjust_step;
   26.66 +
   26.67 +unsigned long event;
   26.68 +
   26.69 +extern int do_setitimer(int, struct itimerval *, struct itimerval *);
   26.70 +
   26.71 +unsigned long volatile jiffies;
   26.72 +
   26.73 +unsigned int * prof_buffer;
   26.74 +unsigned long prof_len;
   26.75 +unsigned long prof_shift;
   26.76 +
   26.77 +/*
   26.78 + * Event timer code
   26.79 + */
   26.80 +#define TVN_BITS 6
   26.81 +#define TVR_BITS 8
   26.82 +#define TVN_SIZE (1 << TVN_BITS)
   26.83 +#define TVR_SIZE (1 << TVR_BITS)
   26.84 +#define TVN_MASK (TVN_SIZE - 1)
   26.85 +#define TVR_MASK (TVR_SIZE - 1)
   26.86 +
   26.87 +struct timer_vec {
   26.88 +	int index;
   26.89 +	struct list_head vec[TVN_SIZE];
   26.90 +};
   26.91 +
   26.92 +struct timer_vec_root {
   26.93 +	int index;
   26.94 +	struct list_head vec[TVR_SIZE];
   26.95 +};
   26.96 +
   26.97 +static struct timer_vec tv5;
   26.98 +static struct timer_vec tv4;
   26.99 +static struct timer_vec tv3;
  26.100 +static struct timer_vec tv2;
  26.101 +static struct timer_vec_root tv1;
  26.102 +
  26.103 +static struct timer_vec * const tvecs[] = {
  26.104 +	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
  26.105 +};
  26.106 +
  26.107 +static struct list_head * run_timer_list_running;
  26.108 +
  26.109 +#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
  26.110 +
  26.111 +void init_timervecs (void)
  26.112 +{
  26.113 +	int i;
  26.114 +
  26.115 +	for (i = 0; i < TVN_SIZE; i++) {
  26.116 +		INIT_LIST_HEAD(tv5.vec + i);
  26.117 +		INIT_LIST_HEAD(tv4.vec + i);
  26.118 +		INIT_LIST_HEAD(tv3.vec + i);
  26.119 +		INIT_LIST_HEAD(tv2.vec + i);
  26.120 +	}
  26.121 +	for (i = 0; i < TVR_SIZE; i++)
  26.122 +		INIT_LIST_HEAD(tv1.vec + i);
  26.123 +}
  26.124 +
  26.125 +static unsigned long timer_jiffies;
  26.126 +
  26.127 +static inline void internal_add_timer(struct timer_list *timer)
  26.128 +{
  26.129 +	/*
  26.130 +	 * must be cli-ed when calling this
  26.131 +	 */
  26.132 +	unsigned long expires = timer->expires;
  26.133 +	unsigned long idx = expires - timer_jiffies;
  26.134 +	struct list_head * vec;
  26.135 +
  26.136 +	if (run_timer_list_running)
  26.137 +		vec = run_timer_list_running;
  26.138 +	else if (idx < TVR_SIZE) {
  26.139 +		int i = expires & TVR_MASK;
  26.140 +		vec = tv1.vec + i;
  26.141 +	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
  26.142 +		int i = (expires >> TVR_BITS) & TVN_MASK;
  26.143 +		vec = tv2.vec + i;
  26.144 +	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
  26.145 +		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
  26.146 +		vec =  tv3.vec + i;
  26.147 +	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
  26.148 +		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
  26.149 +		vec = tv4.vec + i;
  26.150 +	} else if ((signed long) idx < 0) {
  26.151 +		/* can happen if you add a timer with expires == jiffies,
  26.152 +		 * or you set a timer to go off in the past
  26.153 +		 */
  26.154 +		vec = tv1.vec + tv1.index;
  26.155 +	} else if (idx <= 0xffffffffUL) {
  26.156 +		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
  26.157 +		vec = tv5.vec + i;
  26.158 +	} else {
  26.159 +		/* Can only get here on architectures with 64-bit jiffies */
  26.160 +		INIT_LIST_HEAD(&timer->list);
  26.161 +		return;
  26.162 +	}
  26.163 +	/*
  26.164 +	 * Timers are FIFO!
  26.165 +	 */
  26.166 +	list_add(&timer->list, vec->prev);
  26.167 +}
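
The vectors above implement the classic cascading timer wheel: tv1 resolves the next TVR_SIZE (256) jiffies exactly, and each further vector covers TVN_BITS (6) more bits of the expiry, with far-future timers cascaded down a level as the index below wraps. A minimal standalone sketch of the same bucket arithmetic, mirroring internal_add_timer() above (classify_expiry() is a hypothetical helper; the in-the-past and 64-bit-overflow cases are omitted):

	/* Sketch: which wheel level (0 == tv1 ... 4 == tv5) and slot would
	 * internal_add_timer() pick for a given expiry?  Assumes
	 * base_jiffies <= expires. */
	static int classify_expiry(unsigned long expires,
	                           unsigned long base_jiffies, int *slot)
	{
		unsigned long idx = expires - base_jiffies;

		if (idx < TVR_SIZE) {
			*slot = expires & TVR_MASK;		/* next 256 ticks */
			return 0;
		} else if (idx < 1UL << (TVR_BITS + TVN_BITS)) {
			*slot = (expires >> TVR_BITS) & TVN_MASK;
			return 1;
		} else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS)) {
			*slot = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
			return 2;
		} else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS)) {
			*slot = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
			return 3;
		}
		*slot = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		return 4;
	}
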
  26.168 +
  26.169 +/* Initialize both explicitly - let's try to have them in the same cache line */
  26.170 +spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
  26.171 +
  26.172 +#ifdef CONFIG_SMP
  26.173 +volatile struct timer_list * volatile running_timer;
  26.174 +#define timer_enter(t) do { running_timer = t; mb(); } while (0)
  26.175 +#define timer_exit() do { running_timer = NULL; } while (0)
  26.176 +#define timer_is_running(t) (running_timer == t)
  26.177 +#define timer_synchronize(t) while (timer_is_running(t)) barrier()
  26.178 +#else
  26.179 +#define timer_enter(t)		do { } while (0)
  26.180 +#define timer_exit()		do { } while (0)
  26.181 +#endif
  26.182 +
  26.183 +void add_timer(struct timer_list *timer)
  26.184 +{
  26.185 +	unsigned long flags;
  26.186 +
  26.187 +	spin_lock_irqsave(&timerlist_lock, flags);
  26.188 +	if (timer_pending(timer))
  26.189 +		goto bug;
  26.190 +	internal_add_timer(timer);
  26.191 +	spin_unlock_irqrestore(&timerlist_lock, flags);
  26.192 +	return;
  26.193 +bug:
  26.194 +	spin_unlock_irqrestore(&timerlist_lock, flags);
  26.195 +	printk("bug: kernel timer added twice at %p.\n",
  26.196 +			__builtin_return_address(0));
  26.197 +}
  26.198 +
  26.199 +static inline int detach_timer (struct timer_list *timer)
  26.200 +{
  26.201 +	if (!timer_pending(timer))
  26.202 +		return 0;
  26.203 +	list_del(&timer->list);
  26.204 +	return 1;
  26.205 +}
  26.206 +
  26.207 +int mod_timer(struct timer_list *timer, unsigned long expires)
  26.208 +{
  26.209 +	int ret;
  26.210 +	unsigned long flags;
  26.211 +
  26.212 +	spin_lock_irqsave(&timerlist_lock, flags);
  26.213 +	timer->expires = expires;
  26.214 +	ret = detach_timer(timer);
  26.215 +	internal_add_timer(timer);
  26.216 +	spin_unlock_irqrestore(&timerlist_lock, flags);
  26.217 +	return ret;
  26.218 +}
  26.219 +
  26.220 +int del_timer(struct timer_list * timer)
  26.221 +{
  26.222 +	int ret;
  26.223 +	unsigned long flags;
  26.224 +
  26.225 +	spin_lock_irqsave(&timerlist_lock, flags);
  26.226 +	ret = detach_timer(timer);
  26.227 +	timer->list.next = timer->list.prev = NULL;
  26.228 +	spin_unlock_irqrestore(&timerlist_lock, flags);
  26.229 +	return ret;
  26.230 +}
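
For context, this is the whole public interface a 2.4 driver sees: init_timer(), add_timer(), mod_timer() and del_timer() (plus del_timer_sync() on SMP, below). A minimal usage sketch, with a hypothetical callback and a one-second period assumed for illustration:

	static struct timer_list my_timer;

	static void my_timeout(unsigned long data)
	{
		printk(KERN_DEBUG "timer fired, data=%lu\n", data);
		mod_timer(&my_timer, jiffies + HZ);	/* re-arm ~1s from now */
	}

	static void my_timer_start(void)
	{
		init_timer(&my_timer);			/* from <linux/timer.h> */
		my_timer.function = my_timeout;
		my_timer.data     = 42;
		my_timer.expires  = jiffies + HZ;
		add_timer(&my_timer);
	}

	static void my_timer_stop(void)
	{
		del_timer(&my_timer);			/* del_timer_sync() on SMP */
	}
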
  26.231 +
  26.232 +#ifdef CONFIG_SMP
  26.233 +void sync_timers(void)
  26.234 +{
  26.235 +	spin_unlock_wait(&global_bh_lock);
  26.236 +}
  26.237 +
  26.238 +/*
   26.239 + * SMP-specific function to delete a periodic timer.
   26.240 + * The caller must ensure, by some means, that the timer cannot be
   26.241 + * re-armed. On return the timer is not queued and its handler is not
   26.242 + * running on any CPU. Returns the number of times the timer was
   26.243 + * deleted (for reference counting).
  26.244 + */
  26.245 +
  26.246 +int del_timer_sync(struct timer_list * timer)
  26.247 +{
  26.248 +	int ret = 0;
  26.249 +
  26.250 +	for (;;) {
  26.251 +		unsigned long flags;
  26.252 +		int running;
  26.253 +
  26.254 +		spin_lock_irqsave(&timerlist_lock, flags);
  26.255 +		ret += detach_timer(timer);
   26.256 +		timer->list.next = timer->list.prev = NULL;
  26.257 +		running = timer_is_running(timer);
  26.258 +		spin_unlock_irqrestore(&timerlist_lock, flags);
  26.259 +
  26.260 +		if (!running)
  26.261 +			break;
  26.262 +
  26.263 +		timer_synchronize(timer);
  26.264 +	}
  26.265 +
  26.266 +	return ret;
  26.267 +}
  26.268 +#endif
  26.269 +
  26.270 +
  26.271 +static inline void cascade_timers(struct timer_vec *tv)
  26.272 +{
  26.273 +	/* cascade all the timers from tv up one level */
  26.274 +	struct list_head *head, *curr, *next;
  26.275 +
  26.276 +	head = tv->vec + tv->index;
  26.277 +	curr = head->next;
  26.278 +	/*
  26.279 +	 * We are removing _all_ timers from the list, so we don't  have to
  26.280 +	 * detach them individually, just clear the list afterwards.
  26.281 +	 */
  26.282 +	while (curr != head) {
  26.283 +		struct timer_list *tmp;
  26.284 +
  26.285 +		tmp = list_entry(curr, struct timer_list, list);
  26.286 +		next = curr->next;
   26.287 +		list_del(curr); /* not needed */
  26.288 +		internal_add_timer(tmp);
  26.289 +		curr = next;
  26.290 +	}
  26.291 +	INIT_LIST_HEAD(head);
  26.292 +	tv->index = (tv->index + 1) & TVN_MASK;
  26.293 +}
  26.294 +
  26.295 +static inline void run_timer_list(void)
  26.296 +{
  26.297 +	spin_lock_irq(&timerlist_lock);
  26.298 +	while ((long)(jiffies - timer_jiffies) >= 0) {
  26.299 +		LIST_HEAD(queued);
  26.300 +		struct list_head *head, *curr;
  26.301 +		if (!tv1.index) {
  26.302 +			int n = 1;
  26.303 +			do {
  26.304 +				cascade_timers(tvecs[n]);
  26.305 +			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
  26.306 +		}
  26.307 +		run_timer_list_running = &queued;
  26.308 +repeat:
  26.309 +		head = tv1.vec + tv1.index;
  26.310 +		curr = head->next;
  26.311 +		if (curr != head) {
  26.312 +			struct timer_list *timer;
  26.313 +			void (*fn)(unsigned long);
  26.314 +			unsigned long data;
  26.315 +
  26.316 +			timer = list_entry(curr, struct timer_list, list);
   26.317 +			fn = timer->function;
   26.318 +			data = timer->data;
  26.319 +
  26.320 +			detach_timer(timer);
  26.321 +			timer->list.next = timer->list.prev = NULL;
  26.322 +			timer_enter(timer);
  26.323 +			spin_unlock_irq(&timerlist_lock);
  26.324 +			fn(data);
  26.325 +			spin_lock_irq(&timerlist_lock);
  26.326 +			timer_exit();
  26.327 +			goto repeat;
  26.328 +		}
  26.329 +		run_timer_list_running = NULL;
  26.330 +		++timer_jiffies; 
  26.331 +		tv1.index = (tv1.index + 1) & TVR_MASK;
  26.332 +
  26.333 +		curr = queued.next;
  26.334 +		while (curr != &queued) {
  26.335 +			struct timer_list *timer;
  26.336 +
  26.337 +			timer = list_entry(curr, struct timer_list, list);
  26.338 +			curr = curr->next;
  26.339 +			internal_add_timer(timer);
  26.340 +		}			
  26.341 +	}
  26.342 +	spin_unlock_irq(&timerlist_lock);
  26.343 +}
  26.344 +
  26.345 +#ifdef CONFIG_NO_IDLE_HZ
  26.346 +/*
  26.347 + * Find out when the next timer event is due to happen. This
  26.348 + * is used on S/390 to stop all activity when all cpus are idle.
  26.349 + * And in XenoLinux to achieve the same.
  26.350 + * The timerlist_lock must be acquired before calling this function.
  26.351 + */
  26.352 +struct timer_list *next_timer_event(void)
  26.353 +{
  26.354 +	struct timer_list *nte, *tmp;
  26.355 +	struct list_head *lst;
  26.356 +	int i, j;
  26.357 +
  26.358 +	/* Look for the next timer event in tv1. */
  26.359 +	i = 0;
  26.360 +	j = tvecs[0]->index;
  26.361 +	do {
  26.362 +		struct list_head *head = tvecs[0]->vec + j;
  26.363 +		if (!list_empty(head)) {
  26.364 +			nte = list_entry(head->next, struct timer_list, list);
  26.365 +			goto found;
  26.366 +		}
  26.367 +		j = (j + 1) & TVR_MASK;
  26.368 +	} while (j != tv1.index);
  26.369 +
  26.370 +	/* No event found in tv1. Check tv2-tv5. */
  26.371 +	for (i = 1; i < NOOF_TVECS; i++) {
  26.372 +		j = tvecs[i]->index;
  26.373 +		do {
  26.374 +			nte = NULL;
  26.375 +			list_for_each(lst, tvecs[i]->vec + j) {
  26.376 +				tmp = list_entry(lst, struct timer_list, list);
  26.377 +				if (nte == NULL ||
  26.378 +				    time_before(tmp->expires, nte->expires))
  26.379 +					nte = tmp;
  26.380 +			}
  26.381 +			if (nte)
  26.382 +				goto found;
  26.383 +			j = (j + 1) & TVN_MASK;
  26.384 +		} while (j != tvecs[i]->index);
  26.385 +	}
  26.386 +	return NULL;
  26.387 +found:
  26.388 +	/* Found timer event in tvecs[i]->vec[j] */
  26.389 +	if (j < tvecs[i]->index && i < NOOF_TVECS-1) {
  26.390 +		/* 
  26.391 +		 * The search wrapped. We need to look at the next list
  26.392 +		 * from tvecs[i+1] that would cascade into tvecs[i].
  26.393 +		 */
  26.394 +		list_for_each(lst, tvecs[i+1]->vec+tvecs[i+1]->index) {
  26.395 +			tmp = list_entry(lst, struct timer_list, list);
  26.396 +			if (time_before(tmp->expires, nte->expires))
  26.397 +				nte = tmp;
  26.398 +		}
  26.399 +	}
  26.400 +	return nte;
  26.401 +}
  26.402 +#endif
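
next_timer_event() only reports the earliest pending timer; turning that into a one-shot wakeup is left to the caller (the S/390 idle code, or the XenoLinux idle path it mentions). A hedged sketch of that use, under the CONFIG_NO_IDLE_HZ assumption and with a hypothetical helper name; timerlist_lock must be held, as the comment above requires:

	/* Sketch: how many jiffies may this CPU sleep without missing a timer?
	 * Returns ~0UL when no timer is pending at all. */
	static unsigned long ticks_until_next_event(void)
	{
		struct timer_list *t = next_timer_event();

		if (t == NULL)
			return ~0UL;
		if (time_before_eq(t->expires, jiffies))
			return 0;			/* already due */
		return t->expires - jiffies;
	}
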
  26.403 +
  26.404 +spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;
  26.405 +
  26.406 +void tqueue_bh(void)
  26.407 +{
  26.408 +	run_task_queue(&tq_timer);
  26.409 +}
  26.410 +
  26.411 +void immediate_bh(void)
  26.412 +{
  26.413 +	run_task_queue(&tq_immediate);
  26.414 +}
  26.415 +
  26.416 +/*
  26.417 + * this routine handles the overflow of the microsecond field
  26.418 + *
  26.419 + * The tricky bits of code to handle the accurate clock support
  26.420 + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
  26.421 + * They were originally developed for SUN and DEC kernels.
  26.422 + * All the kudos should go to Dave for this stuff.
  26.423 + *
  26.424 + */
  26.425 +static void second_overflow(void)
  26.426 +{
  26.427 +    long ltemp;
  26.428 +
  26.429 +    /* Bump the maxerror field */
  26.430 +    time_maxerror += time_tolerance >> SHIFT_USEC;
  26.431 +    if ( time_maxerror > NTP_PHASE_LIMIT ) {
  26.432 +	time_maxerror = NTP_PHASE_LIMIT;
  26.433 +	time_status |= STA_UNSYNC;
  26.434 +    }
  26.435 +
  26.436 +    /*
  26.437 +     * Leap second processing. If in leap-insert state at
  26.438 +     * the end of the day, the system clock is set back one
  26.439 +     * second; if in leap-delete state, the system clock is
  26.440 +     * set ahead one second. The microtime() routine or
   26.441 + * external clock driver will ensure that reported time
  26.442 +     * is always monotonic. The ugly divides should be
  26.443 +     * replaced.
  26.444 +     */
  26.445 +    switch (time_state) {
  26.446 +
  26.447 +    case TIME_OK:
  26.448 +	if (time_status & STA_INS)
  26.449 +	    time_state = TIME_INS;
  26.450 +	else if (time_status & STA_DEL)
  26.451 +	    time_state = TIME_DEL;
  26.452 +	break;
  26.453 +
  26.454 +    case TIME_INS:
  26.455 +	if (xtime.tv_sec % 86400 == 0) {
  26.456 +	    xtime.tv_sec--;
  26.457 +	    time_state = TIME_OOP;
  26.458 +	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
  26.459 +	}
  26.460 +	break;
  26.461 +
  26.462 +    case TIME_DEL:
  26.463 +	if ((xtime.tv_sec + 1) % 86400 == 0) {
  26.464 +	    xtime.tv_sec++;
  26.465 +	    time_state = TIME_WAIT;
  26.466 +	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
  26.467 +	}
  26.468 +	break;
  26.469 +
  26.470 +    case TIME_OOP:
  26.471 +	time_state = TIME_WAIT;
  26.472 +	break;
  26.473 +
  26.474 +    case TIME_WAIT:
  26.475 +	if (!(time_status & (STA_INS | STA_DEL)))
  26.476 +	    time_state = TIME_OK;
  26.477 +    }
  26.478 +
  26.479 +    /*
  26.480 +     * Compute the phase adjustment for the next second. In
  26.481 +     * PLL mode, the offset is reduced by a fixed factor
  26.482 +     * times the time constant. In FLL mode the offset is
  26.483 +     * used directly. In either mode, the maximum phase
  26.484 +     * adjustment for each second is clamped so as to spread
  26.485 +     * the adjustment over not more than the number of
  26.486 +     * seconds between updates.
  26.487 +     */
  26.488 +    if (time_offset < 0) {
  26.489 +	ltemp = -time_offset;
  26.490 +	if (!(time_status & STA_FLL))
  26.491 +	    ltemp >>= SHIFT_KG + time_constant;
  26.492 +	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
  26.493 +	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
  26.494 +	time_offset += ltemp;
  26.495 +	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
  26.496 +    } else {
  26.497 +	ltemp = time_offset;
  26.498 +	if (!(time_status & STA_FLL))
  26.499 +	    ltemp >>= SHIFT_KG + time_constant;
  26.500 +	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
  26.501 +	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
  26.502 +	time_offset -= ltemp;
  26.503 +	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
  26.504 +    }
  26.505 +
  26.506 +    /*
  26.507 +     * Compute the frequency estimate and additional phase
  26.508 +     * adjustment due to frequency error for the next
  26.509 +     * second. When the PPS signal is engaged, gnaw on the
  26.510 +     * watchdog counter and update the frequency computed by
  26.511 +     * the pll and the PPS signal.
  26.512 +     */
  26.513 +    pps_valid++;
  26.514 +    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
  26.515 +	pps_jitter = MAXTIME;
  26.516 +	pps_stabil = MAXFREQ;
  26.517 +	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
  26.518 +			 STA_PPSWANDER | STA_PPSERROR);
  26.519 +    }
  26.520 +    ltemp = time_freq + pps_freq;
  26.521 +    if (ltemp < 0)
  26.522 +	time_adj -= -ltemp >>
  26.523 +	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
  26.524 +    else
  26.525 +	time_adj += ltemp >>
  26.526 +	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
  26.527 +
  26.528 +#if HZ == 100
  26.529 +    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
  26.530 +     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
  26.531 +     */
  26.532 +    if (time_adj < 0)
  26.533 +	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
  26.534 +    else
  26.535 +	time_adj += (time_adj >> 2) + (time_adj >> 5);
  26.536 +#endif
  26.537 +}
  26.538 +
  26.539 +/* in the NTP reference this is called "hardclock()" */
  26.540 +static void update_wall_time_one_tick(void)
  26.541 +{
  26.542 +	if ( (time_adjust_step = time_adjust) != 0 ) {
  26.543 +	    /* We are doing an adjtime thing. 
  26.544 +	     *
  26.545 +	     * Prepare time_adjust_step to be within bounds.
  26.546 +	     * Note that a positive time_adjust means we want the clock
  26.547 +	     * to run faster.
  26.548 +	     *
  26.549 +	     * Limit the amount of the step to be in the range
  26.550 +	     * -tickadj .. +tickadj
  26.551 +	     */
  26.552 +	     if (time_adjust > tickadj)
  26.553 +		time_adjust_step = tickadj;
  26.554 +	     else if (time_adjust < -tickadj)
  26.555 +		time_adjust_step = -tickadj;
  26.556 +	     
  26.557 +	    /* Reduce by this step the amount of time left  */
  26.558 +	    time_adjust -= time_adjust_step;
  26.559 +	}
  26.560 +	xtime.tv_usec += tick + time_adjust_step;
  26.561 +	/*
   26.562 +	 * Advance the phase; once a whole microsecond has accumulated,
   26.563 +	 * fold it into the tick.
  26.564 +	 */
  26.565 +	time_phase += time_adj;
  26.566 +	if (time_phase <= -FINEUSEC) {
  26.567 +		long ltemp = -time_phase >> SHIFT_SCALE;
  26.568 +		time_phase += ltemp << SHIFT_SCALE;
  26.569 +		xtime.tv_usec -= ltemp;
  26.570 +	}
  26.571 +	else if (time_phase >= FINEUSEC) {
  26.572 +		long ltemp = time_phase >> SHIFT_SCALE;
  26.573 +		time_phase -= ltemp << SHIFT_SCALE;
  26.574 +		xtime.tv_usec += ltemp;
  26.575 +	}
  26.576 +}
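
time_adj is a per-tick adjustment kept in fixed point (scaled by SHIFT_SCALE, assumed here to be the usual 22 bits from <linux/timex.h>), so sub-microsecond corrections accumulate in time_phase and are only folded into xtime.tv_usec once a whole microsecond is available. A toy, self-contained illustration of that accumulate-and-carry step (not the kernel's exact negative-value handling):

	#define DEMO_SHIFT 22	/* assumed SHIFT_SCALE; 1 usec == 1 << DEMO_SHIFT */

	/* Add one tick's scaled adjustment to *phase and return the number of
	 * whole microseconds that should be applied to the clock this tick. */
	static long apply_phase(long *phase, long adj_scaled)
	{
		long whole;

		*phase += adj_scaled;
		whole = *phase >> DEMO_SHIFT;	/* carry out whole microseconds */
		*phase -= whole << DEMO_SHIFT;
		return whole;
	}
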
  26.577 +
  26.578 +/*
   26.579 + * Using a loop looks inefficient, but "ticks" is
   26.580 + * usually just one (we shouldn't be losing ticks;
   26.581 + * we're doing it this way mainly for interrupt
   26.582 + * latency reasons, not because we expect to
   26.583 + * have lots of lost timer ticks).
  26.584 + */
  26.585 +static void update_wall_time(unsigned long ticks)
  26.586 +{
  26.587 +	do {
  26.588 +		ticks--;
  26.589 +		update_wall_time_one_tick();
  26.590 +	} while (ticks);
  26.591 +
  26.592 +	while (xtime.tv_usec >= 1000000) {
  26.593 +	    xtime.tv_usec -= 1000000;
  26.594 +	    xtime.tv_sec++;
  26.595 +	    second_overflow();
  26.596 +	}
  26.597 +}
  26.598 +
  26.599 +static inline void do_process_times(struct task_struct *p,
  26.600 +	unsigned long user, unsigned long system)
  26.601 +{
  26.602 +	unsigned long psecs;
  26.603 +
  26.604 +	psecs = (p->times.tms_utime += user);
  26.605 +	psecs += (p->times.tms_stime += system);
  26.606 +	if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
  26.607 +		/* Send SIGXCPU every second.. */
  26.608 +		if (!(psecs % HZ))
  26.609 +			send_sig(SIGXCPU, p, 1);
  26.610 +		/* and SIGKILL when we go over max.. */
  26.611 +		if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
  26.612 +			send_sig(SIGKILL, p, 1);
  26.613 +	}
  26.614 +}
  26.615 +
  26.616 +static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
  26.617 +{
  26.618 +	unsigned long it_virt = p->it_virt_value;
  26.619 +
  26.620 +	if (it_virt) {
  26.621 +		it_virt -= ticks;
  26.622 +		if (!it_virt) {
  26.623 +			it_virt = p->it_virt_incr;
  26.624 +			send_sig(SIGVTALRM, p, 1);
  26.625 +		}
  26.626 +		p->it_virt_value = it_virt;
  26.627 +	}
  26.628 +}
  26.629 +
  26.630 +static inline void do_it_prof(struct task_struct *p)
  26.631 +{
  26.632 +	unsigned long it_prof = p->it_prof_value;
  26.633 +
  26.634 +	if (it_prof) {
  26.635 +		if (--it_prof == 0) {
  26.636 +			it_prof = p->it_prof_incr;
  26.637 +			send_sig(SIGPROF, p, 1);
  26.638 +		}
  26.639 +		p->it_prof_value = it_prof;
  26.640 +	}
  26.641 +}
  26.642 +
  26.643 +void update_one_process(struct task_struct *p, unsigned long user,
  26.644 +			unsigned long system, int cpu)
  26.645 +{
  26.646 +	p->per_cpu_utime[cpu] += user;
  26.647 +	p->per_cpu_stime[cpu] += system;
  26.648 +	do_process_times(p, user, system);
  26.649 +	do_it_virt(p, user);
  26.650 +	do_it_prof(p);
  26.651 +}	
  26.652 +
  26.653 +/*
  26.654 + * Called from the timer interrupt handler to charge one tick to the current 
  26.655 + * process.  user_tick is 1 if the tick is user time, 0 for system.
  26.656 + */
  26.657 +void update_process_times(int user_tick)
  26.658 +{
  26.659 +	struct task_struct *p = current;
  26.660 +	int cpu = smp_processor_id(), system = user_tick ^ 1;
  26.661 +
  26.662 +	update_one_process(p, user_tick, system, cpu);
  26.663 +	if (p->pid) {
  26.664 +		if (--p->counter <= 0) {
  26.665 +			p->counter = 0;
  26.666 +			/*
  26.667 +			 * SCHED_FIFO is priority preemption, so this is 
  26.668 +			 * not the place to decide whether to reschedule a
  26.669 +			 * SCHED_FIFO task or not - Bhavesh Davda
  26.670 +			 */
  26.671 +			if (p->policy != SCHED_FIFO) {
  26.672 +				p->need_resched = 1;
  26.673 +			}
  26.674 +		}
  26.675 +		if (p->nice > 0)
  26.676 +			kstat.per_cpu_nice[cpu] += user_tick;
  26.677 +		else
  26.678 +			kstat.per_cpu_user[cpu] += user_tick;
  26.679 +		kstat.per_cpu_system[cpu] += system;
  26.680 +	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
  26.681 +		kstat.per_cpu_system[cpu] += system;
  26.682 +}
  26.683 +
  26.684 +/*
   26.685 + * Called from the timer interrupt handler to charge a batch of user
   26.686 + * and system ticks to the current process.
  26.687 + */
  26.688 +void update_process_times_us(int user_ticks, int system_ticks)
  26.689 +{
  26.690 +	struct task_struct *p = current;
  26.691 +	int cpu = smp_processor_id();
  26.692 +
  26.693 +	update_one_process(p, user_ticks, system_ticks, cpu);
  26.694 +	if (p->pid) {
  26.695 +		p->counter -= user_ticks + system_ticks;
  26.696 +		if (p->counter <= 0) {
  26.697 +			p->counter = 0;
  26.698 +			p->need_resched = 1;
  26.699 +		}
  26.700 +		if (p->nice > 0)
  26.701 +			kstat.per_cpu_nice[cpu] += user_ticks;
  26.702 +		else
  26.703 +			kstat.per_cpu_user[cpu] += user_ticks;
  26.704 +		kstat.per_cpu_system[cpu] += system_ticks;
  26.705 +	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
  26.706 +		kstat.per_cpu_system[cpu] += system_ticks;
  26.707 +}
  26.708 +
  26.709 +/*
  26.710 + * Nr of active tasks - counted in fixed-point numbers
  26.711 + */
  26.712 +static unsigned long count_active_tasks(void)
  26.713 +{
  26.714 +	struct task_struct *p;
  26.715 +	unsigned long nr = 0;
  26.716 +
  26.717 +	read_lock(&tasklist_lock);
  26.718 +	for_each_task(p) {
  26.719 +		if ((p->state == TASK_RUNNING ||
  26.720 +		     (p->state & TASK_UNINTERRUPTIBLE)))
  26.721 +			nr += FIXED_1;
  26.722 +	}
  26.723 +	read_unlock(&tasklist_lock);
  26.724 +	return nr;
  26.725 +}
  26.726 +
  26.727 +/*
   26.728 + * Hmm.. Changed this, as the GNU make sources (load.c) seem to
  26.729 + * imply that avenrun[] is the standard name for this kind of thing.
  26.730 + * Nothing else seems to be standardized: the fractional size etc
  26.731 + * all seem to differ on different machines.
  26.732 + */
  26.733 +unsigned long avenrun[3];
  26.734 +
  26.735 +static inline void calc_load(unsigned long ticks)
  26.736 +{
  26.737 +	unsigned long active_tasks; /* fixed-point */
  26.738 +	static int count = LOAD_FREQ;
  26.739 +
  26.740 +	count -= ticks;
  26.741 +	while (count < 0) {
  26.742 +		count += LOAD_FREQ;
  26.743 +		active_tasks = count_active_tasks();
  26.744 +		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
  26.745 +		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
  26.746 +		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
  26.747 +	}
  26.748 +}
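
calc_load() maintains the familiar 1/5/15-minute load averages as fixed-point, exponentially decayed sums; FIXED_1, EXP_1/EXP_5/EXP_15 and CALC_LOAD come from <linux/sched.h>. A self-contained sketch of one such update, using the conventional 11-bit fixed point and decay constant as assumptions rather than values quoted from this tree:

	#include <stdio.h>

	#define DEMO_FSHIFT  11				/* assumed FSHIFT */
	#define DEMO_FIXED_1 (1UL << DEMO_FSHIFT)	/* 1.0 in fixed point */
	#define DEMO_EXP_1   1884UL			/* assumed ~exp(-5s/1min) */

	/* One CALC_LOAD step: load = load*e + n*(1 - e), all in fixed point. */
	static unsigned long demo_calc_load(unsigned long load, unsigned long exp,
					    unsigned long n_fixed)
	{
		load = load * exp + n_fixed * (DEMO_FIXED_1 - exp);
		return load >> DEMO_FSHIFT;
	}

	int main(void)
	{
		unsigned long avenrun_1 = 0;
		int i;

		/* 24 samples at 5s intervals with 2 runnable tasks: the 1-minute
		 * average climbs toward 2.0 (about 1.7 after these two minutes). */
		for (i = 0; i < 24; i++)
			avenrun_1 = demo_calc_load(avenrun_1, DEMO_EXP_1,
						   2 * DEMO_FIXED_1);
		printf("load ~ %lu.%02lu\n", avenrun_1 >> DEMO_FSHIFT,
		       (avenrun_1 & (DEMO_FIXED_1 - 1)) * 100 / DEMO_FIXED_1);
		return 0;
	}
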
  26.749 +
  26.750 +/* jiffies at the most recent update of wall time */
  26.751 +unsigned long wall_jiffies;
  26.752 +
  26.753 +/*
   26.754 + * This lock protects us from races on SMP while playing with xtime. -arca
  26.755 + */
  26.756 +rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
  26.757 +
  26.758 +static inline void update_times(void)
  26.759 +{
  26.760 +	unsigned long ticks;
  26.761 +
  26.762 +	/*
  26.763 +	 * update_times() is run from the raw timer_bh handler so we
  26.764 +	 * just know that the irqs are locally enabled and so we don't
  26.765 +	 * need to save/restore the flags of the local CPU here. -arca
  26.766 +	 */
  26.767 +	write_lock_irq(&xtime_lock);
  26.768 +	vxtime_lock();
  26.769 +
  26.770 +	ticks = jiffies - wall_jiffies;
  26.771 +	if (ticks) {
  26.772 +		wall_jiffies += ticks;
  26.773 +		update_wall_time(ticks);
  26.774 +	}
  26.775 +	vxtime_unlock();
  26.776 +	write_unlock_irq(&xtime_lock);
  26.777 +	calc_load(ticks);
  26.778 +}
  26.779 +
  26.780 +void timer_bh(void)
  26.781 +{
  26.782 +	update_times();
  26.783 +	run_timer_list();
  26.784 +}
  26.785 +
  26.786 +void do_timer(struct pt_regs *regs)
  26.787 +{
  26.788 +	(*(unsigned long *)&jiffies)++;
  26.789 +#ifndef CONFIG_SMP
  26.790 +	/* SMP process accounting uses the local APIC timer */
  26.791 +
  26.792 +	update_process_times(user_mode(regs));
  26.793 +#endif
  26.794 +	mark_bh(TIMER_BH);
  26.795 +	if (TQ_ACTIVE(tq_timer))
  26.796 +		mark_bh(TQUEUE_BH);
  26.797 +}
  26.798 +
  26.799 +void do_timer_ticks(int ticks)
  26.800 +{
  26.801 +	(*(unsigned long *)&jiffies) += ticks;
  26.802 +	mark_bh(TIMER_BH);
  26.803 +	if (TQ_ACTIVE(tq_timer))
  26.804 +		mark_bh(TQUEUE_BH);
  26.805 +}
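
do_timer_ticks() is the XenoLinux addition: unlike do_timer(), it can credit several jiffies at once, which matters when the hypervisor has descheduled the domain and its ticks arrive in a batch. A hedged sketch of how a paravirtual timer interrupt handler might use the pair (hypothetical names; the real handler lives elsewhere in the XenoLinux arch code):

	/* Sketch: catch up on wall-clock time missed while the domain was not
	 * running, then account the tick that triggered this interrupt. */
	static void demo_timer_interrupt(struct pt_regs *regs,
					 unsigned long now_in_jiffies)
	{
		long lost = (long)(now_in_jiffies - jiffies);

		if (lost > 1)
			do_timer_ticks(lost - 1);	/* credit the missed ticks */
		do_timer(regs);				/* the current tick */
	}
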
  26.806 +
  26.807 +#if !defined(__alpha__) && !defined(__ia64__)
  26.808 +
  26.809 +/*
  26.810 + * For backwards compatibility?  This can be done in libc so Alpha
  26.811 + * and all newer ports shouldn't need it.
  26.812 + */
  26.813 +asmlinkage unsigned long sys_alarm(unsigned int seconds)
  26.814 +{
  26.815 +	struct itimerval it_new, it_old;
  26.816 +	unsigned int oldalarm;
  26.817 +
  26.818 +	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
  26.819 +	it_new.it_value.tv_sec = seconds;
  26.820 +	it_new.it_value.tv_usec = 0;
  26.821 +	do_setitimer(ITIMER_REAL, &it_new, &it_old);
  26.822 +	oldalarm = it_old.it_value.tv_sec;
  26.823 +	/* ehhh.. We can't return 0 if we have an alarm pending.. */
  26.824 +	/* And we'd better return too much than too little anyway */
  26.825 +	if (it_old.it_value.tv_usec)
  26.826 +		oldalarm++;
  26.827 +	return oldalarm;
  26.828 +}
  26.829 +
  26.830 +#endif
  26.831 +
  26.832 +#ifndef __alpha__
  26.833 +
  26.834 +/*
  26.835 + * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
  26.836 + * should be moved into arch/i386 instead?
  26.837 + */
  26.838 +
  26.839 +/**
  26.840 + * sys_getpid - return the thread group id of the current process
  26.841 + *
  26.842 + * Note, despite the name, this returns the tgid not the pid.  The tgid and
  26.843 + * the pid are identical unless CLONE_THREAD was specified on clone() in
  26.844 + * which case the tgid is the same in all threads of the same group.
  26.845 + *
  26.846 + * This is SMP safe as current->tgid does not change.
  26.847 + */
  26.848 +asmlinkage long sys_getpid(void)
  26.849 +{
  26.850 +	return current->tgid;
  26.851 +}
  26.852 +
  26.853 +/*
  26.854 + * This is not strictly SMP safe: p_opptr could change
  26.855 + * from under us. However, rather than getting any lock
  26.856 + * we can use an optimistic algorithm: get the parent
  26.857 + * pid, and go back and check that the parent is still
  26.858 + * the same. If it has changed (which is extremely unlikely
  26.859 + * indeed), we just try again..
  26.860 + *
  26.861 + * NOTE! This depends on the fact that even if we _do_
  26.862 + * get an old value of "parent", we can happily dereference
  26.863 + * the pointer: we just can't necessarily trust the result
  26.864 + * until we know that the parent pointer is valid.
  26.865 + *
  26.866 + * The "mb()" macro is a memory barrier - a synchronizing
  26.867 + * event. It also makes sure that gcc doesn't optimize
  26.868 + * away the necessary memory references.. The barrier doesn't
  26.869 + * have to have all that strong semantics: on x86 we don't
  26.870 + * really require a synchronizing instruction, for example.
  26.871 + * The barrier is more important for code generation than
  26.872 + * for any real memory ordering semantics (even if there is
  26.873 + * a small window for a race, using the old pointer is
  26.874 + * harmless for a while).
  26.875 + */
  26.876 +asmlinkage long sys_getppid(void)
  26.877 +{
  26.878 +	int pid;
  26.879 +	struct task_struct * me = current;
  26.880 +	struct task_struct * parent;
  26.881 +
  26.882 +	parent = me->p_opptr;
  26.883 +	for (;;) {
  26.884 +		pid = parent->pid;
  26.885 +#if CONFIG_SMP
  26.886 +{
  26.887 +		struct task_struct *old = parent;
  26.888 +		mb();
  26.889 +		parent = me->p_opptr;
  26.890 +		if (old != parent)
  26.891 +			continue;
  26.892 +}
  26.893 +#endif
  26.894 +		break;
  26.895 +	}
  26.896 +	return pid;
  26.897 +}
  26.898 +
  26.899 +asmlinkage long sys_getuid(void)
  26.900 +{
  26.901 +	/* Only we change this so SMP safe */
  26.902 +	return current->uid;
  26.903 +}
  26.904 +
  26.905 +asmlinkage long sys_geteuid(void)
  26.906 +{
  26.907 +	/* Only we change this so SMP safe */
  26.908 +	return current->euid;
  26.909 +}
  26.910 +
  26.911 +asmlinkage long sys_getgid(void)
  26.912 +{
  26.913 +	/* Only we change this so SMP safe */
  26.914 +	return current->gid;
  26.915 +}
  26.916 +
  26.917 +asmlinkage long sys_getegid(void)
  26.918 +{
  26.919 +	/* Only we change this so SMP safe */
  26.920 +	return  current->egid;
  26.921 +}
  26.922 +
  26.923 +#endif
  26.924 +
  26.925 +/* Thread ID - the internal kernel "pid" */
  26.926 +asmlinkage long sys_gettid(void)
  26.927 +{
  26.928 +	return current->pid;
  26.929 +}
  26.930 +
  26.931 +asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
  26.932 +{
  26.933 +	struct timespec t;
  26.934 +	unsigned long expire;
  26.935 +
  26.936 +	if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
  26.937 +		return -EFAULT;
  26.938 +
  26.939 +	if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
  26.940 +		return -EINVAL;
  26.941 +
  26.942 +
  26.943 +	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
  26.944 +	    current->policy != SCHED_OTHER)
  26.945 +	{
  26.946 +		/*
  26.947 +		 * Short delay requests up to 2 ms will be handled with
  26.948 +		 * high precision by a busy wait for all real-time processes.
  26.949 +		 *
   26.950 +		 * It's important on SMP not to do this while holding locks.
  26.951 +		 */
  26.952 +		udelay((t.tv_nsec + 999) / 1000);
  26.953 +		return 0;
  26.954 +	}
  26.955 +
  26.956 +	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
  26.957 +
  26.958 +	current->state = TASK_INTERRUPTIBLE;
  26.959 +	expire = schedule_timeout(expire);
  26.960 +
  26.961 +	if (expire) {
  26.962 +		if (rmtp) {
  26.963 +			jiffies_to_timespec(expire, &t);
  26.964 +			if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
  26.965 +				return -EFAULT;
  26.966 +		}
  26.967 +		return -EINTR;
  26.968 +	}
  26.969 +	return 0;
  26.970 +}
  26.971 +