xenbits.xensource.com Git - xenclient/kernel.git/commitdiff
* Wed Jul 23 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.10.el5] kernel-2.6.18-92.1.6.el5.patch

author    t_jeang <devnull@localhost>
          Tue, 6 Jan 2009 12:05:54 +0000 (12:05 +0000)
committer t_jeang <devnull@localhost>
          Tue, 6 Jan 2009 12:05:54 +0000 (12:05 +0000)
- [ia64] softlock: prevent endless warnings in kdump (Neil Horman ) [456117 453200]

* Wed Jul 16 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.9.el5]

- [misc] signaling msgrcv() should not pass back error (Jiri Pirko ) [455278 452533]
- [ia64] properly unregister legacy interrupts (Prarit Bhargava ) [450337 445886]

* Mon Jul 14 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.8.el5]

- [net] randomize udp port allocation (Eugene Teo ) [454571 454572]
- [tty] add NULL pointer checks (Aristeu Rozanski ) [453425 453154] {CVE-2008-2812}
- [net] sctp: make sure sctp_addr does not overflow (David S. Miller ) [452482 452483] {CVE-2008-2826}
- [sys] sys_setrlimit: prevent setting RLIMIT_CPU to 0 (Neil Horman ) [437121 437122] {CVE-2008-1294}
- [net] sit: exploitable remote memory leak (Jiri Pirko ) [446038 446039] {CVE-2008-2136}
- [misc] ttyS1 lost interrupt, stops transmitting v2 (Brian Maly ) [455256 451157]
- [misc] ttyS1 loses interrupt and stops transmitting (Simon McGrath ) [443071 440121]

* Thu Jul 10 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.7.el5]

- [x86_64]: extend MCE banks support for Dunnington, Nehalem (Prarit Bhargava ) [451941 446673]
- [nfs] address nfs rewrite performance regression in RHEL5 (Eric Sandeen ) [448685 436004]
- [mm] Make mmap() with PROT_WRITE imply PROT_READ on RHEL5 (Larry Woodman ) [450758 448978]
- [i386]: Add check for supported_cpus in powernow_k8 driver (Prarit Bhargava ) [450866 443853]
- [i386]: Add check for dmi_data in powernow_k8 driver (Prarit Bhargava ) [450866 443853]
- [net] fix recv return zero (Thomas Graf ) [452231 435657]
- [misc] kernel crashes on futex (Anton Arapov ) [450336 435178]
- [net] Fixing bonding rtnl_lock screwups (Fabio Olive Leite ) [451939 450219]

33 files changed:
Makefile
arch/i386/mm/fault.c
arch/ia64/kernel/acpi.c
arch/ia64/mm/fault.c
arch/powerpc/mm/fault.c
arch/ppc/mm/fault.c
arch/x86_64/kernel/mce.c
arch/x86_64/mm/fault.c
buildconfigs/Rules.mk
configs/kernel-2.6.18-i686-PAE.config
configs/kernel-2.6.18-i686-debug.config
configs/kernel-2.6.18-i686-xen.config
configs/kernel-2.6.18-i686.config
drivers/net/bonding/bond_sysfs.c
drivers/net/irda/irtty-sir.c
drivers/net/ppp_async.c
drivers/net/ppp_synctty.c
drivers/net/slip.c
drivers/serial/8250.c
include/linux/sched.h
include/net/udp.h
kernel/exit.c
kernel/futex.c
kernel/futex_compat.c
kernel/signal.c
kernel/softlockup.c
kernel/sys.c
mm/filemap.c
net/ipv4/udp.c
net/ipv6/sit.c
net/ipv6/udp.c
net/sctp/socket.c
net/unix/af_unix.c

index c37030623d0563d08149f022f5db7fade9c4cf8e..8f4a08cccbdccf99c28855619adcc7e4a36815e6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 18
-EXTRAVERSION = -92.1.6.el5
+EXTRAVERSION = -92.1.10.el5
 RHEL_MAJOR = 5
 RHEL_MINOR = 2
 NAME=Avast! A bilge rat!
index f7279468323a62cef22984f3e81707eaa6a93b4b..45914b5785ef39d83d626980336b5d2477bb5967 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -449,7 +449,7 @@ good_area:
                case 1:         /* read, present */
                        goto bad_area;
                case 0:         /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                                goto bad_area;
        }
 
index c9e3aae66bc0cc7ccf03712fbde8f3bca8e3de8e..97c2fbd37f213f6967570a5ca3641a5a64bcf11f 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -621,6 +621,9 @@ void acpi_unregister_gsi(u32 gsi)
        if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
                return;
 
+       if (has_8259 && gsi < 16)
+               return;
+
        iosapic_unregister_intr(gsi);
 }
 
index 42bc87c7f0bd64a8270016ab3fce4dfe3a973eda..643000426420203022f38f1f2496bd91bbd3a2e4 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -152,9 +152,11 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 #              error File is out of sync with <linux/mm.h>.  Please update.
 #      endif
 
+       if (((isr >> IA64_ISR_R_BIT) & 1UL) && (!(vma->vm_flags & (VM_READ | VM_WRITE))))
+               goto bad_area;
+
        mask = (  (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
-               | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)
-               | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT));
+               | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
 
        if ((vma->vm_flags & mask) != mask)
                goto bad_area;
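
The fault-handler hunks in this patch (i386 and ia64 above; powerpc, ppc and x86_64 below) all make the same change: a read fault on a vma that has only VM_WRITE set is no longer rejected, so an mmap() with PROT_WRITE alone behaves as if PROT_READ were implied. A minimal userspace sketch of the behaviour, as a hypothetical test program rather than anything in the patch:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* PROT_WRITE only: the vma gets VM_WRITE but not VM_READ */
	volatile char *p = mmap(NULL, 4096, PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	char c = p[0];		/* read fault: rejected before this change */
	p[0] = c + 1;		/* write fault: always allowed */
	printf("read ok\n");
	munmap((void *)p, 4096);
	return 0;
}
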
index 78a0d59903ee69d2b8ac96de20e18c17fd5f9206..77953f41d75406c200085e0bca8151857df9742f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -333,7 +333,7 @@ good_area:
                /* protection fault */
                if (error_code & 0x08000000)
                        goto bad_area;
-               if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        goto bad_area;
        }
 
index 5cdfb71fcb078ca50f56059a907e0104ab452194..bc776beb3136f28bced9d3a4ab08d4efc039e5f8 100644
--- a/arch/ppc/mm/fault.c
+++ b/arch/ppc/mm/fault.c
@@ -239,7 +239,7 @@ good_area:
                /* protection fault */
                if (error_code & 0x08000000)
                        goto bad_area;
-               if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        goto bad_area;
        }
 
index bbea88801d883cc8928c6a9c91fe5bf5efbf4c9f..d370a4ea97e3a5dbc9668c62cc35d8bbf49bd333 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -27,7 +27,7 @@
 #include <asm/smp.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
+#define NR_SYSFS_BANKS 6
 
 atomic_t mce_entry;
 
@@ -37,7 +37,7 @@ static int mce_dont_init;
    3: never panic or exit (for testing only) */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
 static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
@@ -191,7 +191,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
        barrier();
 
        for (i = 0; i < banks; i++) {
-               if (!bank[i])
+               if (i < NR_SYSFS_BANKS && !bank[i])
                        continue;
                
                m.misc = 0; 
@@ -354,9 +354,10 @@ static void mce_init(void *dummy)
 
        rdmsrl(MSR_IA32_MCG_CAP, cap);
        banks = cap & 0xff;
-       if (banks > NR_BANKS) { 
-               printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
-               banks = NR_BANKS; 
+       if (banks > MCE_EXTENDED_BANK) { 
+               printk(KERN_INFO "MCE: warning: using only %d banks\n",
+                      MCE_EXTENDED_BANK);
+               banks = MCE_EXTENDED_BANK; 
        }
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -372,7 +373,7 @@ static void mce_init(void *dummy)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
        for (i = 0; i < banks; i++) {
-               wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+               wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }       
 }
@@ -606,13 +607,16 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
        }                                                                          \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
+/* TBD should generate these dynamically based on number of available banks.
+ * Have only 6 control banks in /sysfs until then.
+ */
 ACCESSOR(bank0ctl,bank[0],mce_restart())
 ACCESSOR(bank1ctl,bank[1],mce_restart())
 ACCESSOR(bank2ctl,bank[2],mce_restart())
 ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
+static struct sysdev_attribute * bank_attributes[NR_SYSFS_BANKS] = {
        &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
        &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
 ACCESSOR(tolerant,tolerant,)
@@ -632,7 +636,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
        err = sysdev_register(&per_cpu(device_mce,cpu));
 
        if (!err) {
-               for (i = 0; i < banks; i++)
+               for (i = 0; i < NR_SYSFS_BANKS; i++)
                        sysdev_create_file(&per_cpu(device_mce,cpu),
                                bank_attributes[i]);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
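
The driver derives the machine-check bank count from the low byte of IA32_MCG_CAP; the hunks above stop clamping that count to the six banks that have sysfs accessors, so Dunnington and Nehalem parts with more banks get them all initialised. A hedged userspace sketch of the same decoding, assuming the msr driver is loaded and /dev/cpu/0/msr is readable (availability and privileges vary):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_MCG_CAP 0x179

int main(void)
{
	uint64_t cap;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &cap, sizeof(cap), MSR_IA32_MCG_CAP) != sizeof(cap))
		return 1;
	/* bits 7:0 of MCG_CAP = number of MC banks, as in mce_init() */
	printf("MC banks: %u\n", (unsigned)(cap & 0xff));
	return 0;
}
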
index 82542a2b1d0008b2cd19424e8d933381ff6f66c9..390160b3e04a98e328aab2788e6c86ec61e3de73 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -477,7 +477,7 @@ good_area:
                case PF_PROT:           /* read, present */
                        goto bad_area;
                case 0:                 /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                                goto bad_area;
        }
 
index 879cb142e534fd96e1362f92292d7c836aa372ba..0ce24ae5ac83663c4a6e59919fa3f1487ba1535b 100644
--- a/buildconfigs/Rules.mk
+++ b/buildconfigs/Rules.mk
@@ -2,7 +2,7 @@ XEN_TARGET_ARCH     = x86_32
 XEN_TARGET_X86_PAE ?= y
 
 LINUX_SERIES = 2.6
-LINUX_VER    = 2.6.18-92.1.6.el5
+LINUX_VER    = 2.6.18-92.1.10.el5
 
 EXTRAVERSION ?= xen
 
index e119cdcd5ebc523457bd36a21d824aa01be86de6..f2e5c7adac689cd72501033c78cf3edf5a1ea296 100644
--- a/configs/kernel-2.6.18-i686-PAE.config
+++ b/configs/kernel-2.6.18-i686-PAE.config
@@ -2,7 +2,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Tue Jul  1 11:03:51 2008
+# Thu Aug  7 09:42:27 2008
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
index a8c5d219a8cff26d50d25facaf078ead0bd816aa..d8307face3f3f04d105610b5f20a983a31631b85 100644
--- a/configs/kernel-2.6.18-i686-debug.config
+++ b/configs/kernel-2.6.18-i686-debug.config
@@ -2,7 +2,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Tue Jul  1 11:03:51 2008
+# Thu Aug  7 09:42:27 2008
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
index e6de9e82f1e9c19dc5be6f5ab5ed03afb7c48184..6b43869c29bc7805cf0f181c43c44145a6dc5127 100644
--- a/configs/kernel-2.6.18-i686-xen.config
+++ b/configs/kernel-2.6.18-i686-xen.config
@@ -2,7 +2,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Tue Jul  1 11:03:51 2008
+# Thu Aug  7 09:42:27 2008
 #
 CONFIG_X86_32=y
 CONFIG_LOCKDEP_SUPPORT=y
index 065bed08d4b70a9e76d91fcf95e231031e12b1fb..6d342520508fa8d16340c76c56dda2241011b4a5 100644
--- a/configs/kernel-2.6.18-i686.config
+++ b/configs/kernel-2.6.18-i686.config
@@ -2,7 +2,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Tue Jul  1 11:03:51 2008
+# Thu Aug  7 09:42:27 2008
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
index dbd9b964da4111936d98bb2de01184883d50e6fe..b1f861f80a1aadce3059d66fce7ca6f2c886f5a9 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -147,29 +147,29 @@ static ssize_t bonding_store_bonds(struct class *cls, const char *buffer, size_t
                                                ": Unable remove bond %s due to open references.\n",
                                                ifname);
                                        res = -EPERM;
-                                       goto out;
+                                       goto out_unlock;
                                }
                                printk(KERN_INFO DRV_NAME
                                        ": %s is being deleted...\n",
                                        bond->dev->name);
                                bond_destroy(bond);
-                               up_write(&bonding_rwsem);
-                               rtnl_unlock();
-                               goto out;
+                               goto out_unlock;
                        }
 
                printk(KERN_ERR DRV_NAME
                        ": unable to delete non-existent bond %s\n", ifname);
                res = -ENODEV;
-               up_write(&bonding_rwsem);
-               rtnl_unlock();
-               goto out;
+               goto out_unlock;
        }
 
 err_no_cmd:
        printk(KERN_ERR DRV_NAME
                ": no command found in bonding_masters. Use +ifname or -ifname.\n");
-       res = -EPERM;
+       return -EPERM;
+
+out_unlock:
+       up_write(&bonding_rwsem);
+       rtnl_unlock();
 
        /* Always return either count or an error.  If you return 0, you'll
         * get called forever, which is bad.
@@ -254,8 +254,8 @@ static ssize_t bonding_store_slaves(struct class_device *cd, const char *buffer,
                printk(KERN_ERR DRV_NAME
                       ": %s: Unable to update slaves because interface is down.\n",
                       bond->dev->name);
-               ret = -EPERM;
-               goto out;
+               /* early return before rtnl_lock() */
+               return -EPERM;
        }
 
        /* Note:  We can't hold bond->lock here, as bond_create grabs it. */
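
The rewritten error paths above replace three copies of the up_write()/rtnl_unlock() pair with one labelled exit, which is what the changelog calls fixing the "rtnl_lock screwups". A generic sketch of the pattern, with pthread mutexes standing in for rtnl_lock and bonding_rwsem and purely illustrative names:

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* rtnl_lock() stand-in */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* bonding_rwsem stand-in */

static int delete_bond(int exists)
{
	int res = 0;

	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);

	if (!exists) {
		res = -ENODEV;
		goto out_unlock;	/* no per-branch unlock pairs to forget */
	}
	/* ... destroy the bond ... */

out_unlock:
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
	return res;
}

int main(void)
{
	return delete_bond(0) == -ENODEV ? 0 : 1;
}
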
index 6a98b7ae4975d9066e840707e8d095e183075b46..3c414dd8e7bac465c3008433278c8c21ce331582 100644
--- a/drivers/net/irda/irtty-sir.c
+++ b/drivers/net/irda/irtty-sir.c
@@ -64,7 +64,9 @@ static int irtty_chars_in_buffer(struct sir_dev *dev)
        IRDA_ASSERT(priv != NULL, return -1;);
        IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return -1;);
 
-       return priv->tty->driver->chars_in_buffer(priv->tty);
+       if (priv->tty->driver->chars_in_buffer)
+               return priv->tty->driver->chars_in_buffer(priv->tty);
+       return 0;
 }
 
 /* Wait (sleep) until underlaying hardware finished transmission
index 23659fd7c3a6bc120c5149eeb4a832e28a8c75b3..30bb05b05a7edeb2b02c54a683fbd0b0871afbe6 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -158,6 +158,9 @@ ppp_asynctty_open(struct tty_struct *tty)
        struct asyncppp *ap;
        int err;
 
+       if (!tty->driver->write)
+               return -EOPNOTSUPP;
+
        err = -ENOMEM;
        ap = kmalloc(sizeof(*ap), GFP_KERNEL);
        if (ap == 0)
index 33255fe8031ef43292f1419065218786b5396b74..c49d6c9ecb49986647600b220c05c82e1fc07f56 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -207,6 +207,9 @@ ppp_sync_open(struct tty_struct *tty)
        struct syncppp *ap;
        int err;
 
+       if (!tty->driver->write)
+               return -EOPNOTSUPP;
+
        ap = kmalloc(sizeof(*ap), GFP_KERNEL);
        err = -ENOMEM;
        if (ap == 0)
index 1588cb7f6c835ecd2bed3c9204fe7230cfe9a09e..13025d42b444dd231554bb471a523e1448366bbc 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -463,9 +463,14 @@ static void sl_tx_timeout(struct net_device *dev)
                        /* 20 sec timeout not reached */
                        goto out;
                }
-               printk(KERN_WARNING "%s: transmit timed out, %s?\n", dev->name,
-                      (sl->tty->driver->chars_in_buffer(sl->tty) || sl->xleft) ?
-                      "bad line quality" : "driver error");
+               {
+                       int cib = 0;
+                       if (sl->tty->driver->chars_in_buffer)
+                               cib = sl->tty->driver->chars_in_buffer(sl->tty);
+                       printk(KERN_WARNING "%s: transmit timed out, %s?\n",
+                               dev->name, (cib || sl->xleft) ?
+                                      "bad line quality" : "driver error");
+               }
                sl->xleft = 0;
                sl->tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP);
                sl_unlock(sl);
@@ -836,6 +841,8 @@ static int slip_open(struct tty_struct *tty)
 
        if(!capable(CAP_NET_ADMIN))
                return -EPERM;
+       if (!tty->driver->write)
+               return -EOPNOTSUPP;
                
        /* RTnetlink lock is misused here to serialize concurrent
           opens of slip channels. There are better ways, but it is
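
The irtty, ppp_async, ppp_synctty and slip hunks above are all instances of the CVE-2008-2812 fix: tty driver methods such as write and chars_in_buffer are optional, so a line discipline must test the pointer before calling through it. A compact sketch of the pattern with simplified stand-in types (not the real kernel structures):

#include <stddef.h>

struct tty;				/* opaque stand-in */
struct tty_ops {
	int (*chars_in_buffer)(struct tty *t);	/* optional: may be NULL */
	int (*write)(struct tty *t, const char *buf, int n);
};

static int safe_chars_in_buffer(const struct tty_ops *ops, struct tty *t)
{
	if (ops->chars_in_buffer)
		return ops->chars_in_buffer(t);
	return 0;			/* missing hook: treat as empty */
}

static int ldisc_can_attach(const struct tty_ops *ops)
{
	return ops->write != NULL;	/* else refuse the open, cf. -EOPNOTSUPP above */
}

int main(void)
{
	struct tty_ops ops = { 0 };	/* driver implementing neither hook */

	return (safe_chars_in_buffer(&ops, NULL) == 0 &&
		!ldisc_can_attach(&ops)) ? 0 : 1;
}
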
index f7c9f2338b63f294ee26491580ab8dcc7e6d8507..96eec5cc613b2cdb90cc14d0d6b2e079665ce639 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1762,7 +1762,11 @@ static int serial8250_startup(struct uart_port *port)
         */
        serial_outp(up, UART_LCR, UART_LCR_WLEN8);
 
-       spin_lock_irqsave(&up->port.lock, flags);
+       if (is_real_interrupt(up->port.irq)) {
+               spin_lock_irqsave(&irq_lists[up->port.irq].lock, flags);
+               spin_lock(&up->port.lock);
+       } else
+               spin_lock_irqsave(&up->port.lock, flags);
        if (up->port.flags & UPF_FOURPORT) {
                if (!is_real_interrupt(up->port.irq))
                        up->port.mctrl |= TIOCM_OUT1;
@@ -1794,7 +1798,11 @@ static int serial8250_startup(struct uart_port *port)
                up->bugs &= ~UART_BUG_TXEN;
        }
 
-       spin_unlock_irqrestore(&up->port.lock, flags);
+       if (is_real_interrupt(up->port.irq)) {
+               spin_unlock(&up->port.lock);
+               spin_unlock_irqrestore(&irq_lists[up->port.irq].lock, flags);
+       } else
+               spin_unlock_irqrestore(&up->port.lock, flags);
 
        /*
         * Finally, enable interrupts.  Note: Modem status interrupts
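
The two hunks above make serial8250_startup() take the per-IRQ list lock before the port lock whenever the port has a real interrupt, matching the nesting order used by the shared-IRQ handler, so the ttyS1 lost-interrupt workaround cannot deadlock against it. An illustrative pthread sketch of that ordering rule (the mutexes are stand-ins, not the serial core's types):

#include <pthread.h>

static pthread_mutex_t irq_list_lock = PTHREAD_MUTEX_INITIALIZER; /* outer */
static pthread_mutex_t port_lock     = PTHREAD_MUTEX_INITIALIZER; /* inner */

static void startup_critical_section(int has_real_irq)
{
	if (has_real_irq)
		pthread_mutex_lock(&irq_list_lock);	/* always outer first */
	pthread_mutex_lock(&port_lock);

	/* ... the UART_BUG_TXEN detection runs here ... */

	pthread_mutex_unlock(&port_lock);
	if (has_real_irq)
		pthread_mutex_unlock(&irq_list_lock);
}

int main(void)
{
	startup_critical_section(1);
	startup_critical_section(0);
	return 0;
}
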
index 4bf2f475286b3a7f5e8475e72a84037b0de475dd..c7502a678f3b2ed1f956e7179fa0101181687738 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1077,6 +1077,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_STARTING    0x00000002      /* being created */
 #define PF_EXITING     0x00000004      /* getting shut down */
 #define PF_DEAD                0x00000008      /* Dead */
+#define PF_EXITPIDONE  0x00000010      /* pi exit done on shut down */
 #define PF_FORKNOEXEC  0x00000040      /* forked but didn't exec */
 #define PF_SUPERPRIV   0x00000100      /* used super-user privileges */
 #define PF_DUMPCORE    0x00000200      /* dumped core */
index dd5e3b67496395fb01115d88a004601ab073d3b5..97fe4b5abdd59a7fdba67428adc7885e136b6e3e 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -37,8 +37,6 @@
 extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 extern rwlock_t udp_hash_lock;
 
-extern int udp_port_rover;
-
 static inline int udp_lport_inuse(u16 num)
 {
        struct sock *sk;
index 45a156295b3f097be8da144c2e1a3ec90715cced..24d2b68e4cacc353d63d24d0367568d1c5a08020 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -827,13 +827,29 @@ fastcall NORET_TYPE void do_exit(long code)
        if (unlikely(tsk->flags & PF_EXITING)) {
                printk(KERN_ALERT
                        "Fixing recursive fault but reboot is needed!\n");
+               /*
+                * We can do this unlocked here. The futex code uses
+                * this flag just to verify whether the pi state
+                * cleanup has been done or not. In the worst case it
+                * loops once more. We pretend that the cleanup was
+                * done as there is no way to return. Either the
+                * OWNER_DIED bit is set by now or we push the blocked
+                * task into the wait for ever nirwana as well.
+                */
+               tsk->flags |= PF_EXITPIDONE;
                if (tsk->io_context)
                        exit_io_context();
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule();
        }
 
+       /*
+        * tsk->flags are checked in the futex code to protect against
+        * an exiting task cleaning up the robust pi futexes.
+        */
+       spin_lock_irq(&tsk->pi_lock);
        tsk->flags |= PF_EXITING;
+       spin_unlock_irq(&tsk->pi_lock);
 
        ptrace_exit(tsk);
 
@@ -851,7 +867,7 @@ fastcall NORET_TYPE void do_exit(long code)
        }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
-               hrtimer_cancel(&tsk->signal->real_timer);
+               hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk->signal);
        }
 
@@ -913,6 +929,12 @@ fastcall NORET_TYPE void do_exit(long code)
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held(tsk);
+       /*
+        * We can do this unlocked here. The futex code uses this flag
+        * just to verify whether the pi state cleanup has been done
+        * or not. In the worst case it loops once more.
+        */
+       tsk->flags |= PF_EXITPIDONE;
 
        if (tsk->io_context)
                exit_io_context();
index 9d260e838cffdca6f951d6625626ab9d62f19d81..b7b11976825ff9499bad65468935443e2815855c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -391,18 +391,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 
        read_lock(&tasklist_lock);
        p = find_task_by_pid(pid);
-       if (!p)
-               goto out_unlock;
-       if ((current->euid != p->euid) && (current->euid != p->uid)) {
-               p = NULL;
-               goto out_unlock;
-       }
-       if (p->exit_state != 0) {
-               p = NULL;
-               goto out_unlock;
-       }
-       get_task_struct(p);
-out_unlock:
+
+       if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+               p = ERR_PTR(-ESRCH);
+       else
+               get_task_struct(p);
+
        read_unlock(&tasklist_lock);
 
        return p;
@@ -468,7 +462,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        struct futex_q *this, *next;
        struct list_head *head;
        struct task_struct *p;
-       pid_t pid;
+       pid_t pid = uval & FUTEX_TID_MASK;
 
        head = &hb->chain;
 
@@ -486,6 +480,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
                                return -EINVAL;
 
                        WARN_ON(!atomic_read(&pi_state->refcount));
+                       WARN_ON(pid && pi_state->owner &&
+                               pi_state->owner->pid != pid);
 
                        atomic_inc(&pi_state->refcount);
                        me->pi_state = pi_state;
@@ -496,15 +492,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
        /*
         * We are the first waiter - try to look up the real owner and attach
-        * the new pi_state to it, but bail out when the owner died bit is set
-        * and TID = 0:
+        * the new pi_state to it, but bail out when TID = 0
         */
-       pid = uval & FUTEX_TID_MASK;
-       if (!pid && (uval & FUTEX_OWNER_DIED))
+       if (!pid)
                return -ESRCH;
        p = futex_find_get_task(pid);
-       if (!p)
-               return -ESRCH;
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       /*
+        * We need to look at the task state flags to figure out,
+        * whether the task is exiting. To protect against the do_exit
+        * change of the task flags, we do this protected by
+        * p->pi_lock:
+        */
+       spin_lock_irq(&p->pi_lock);
+       if (unlikely(p->flags & PF_EXITING)) {
+               /*
+                * The task is on the way out. When PF_EXITPIDONE is
+                * set, we know that the task has finished the
+                * cleanup:
+                */
+               int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+               spin_unlock_irq(&p->pi_lock);
+               put_task_struct(p);
+               return ret;
+       }
 
        pi_state = alloc_pi_state();
 
@@ -517,7 +531,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        /* Store the key for possible exit cleanups: */
        pi_state->key = me->key;
 
-       spin_lock_irq(&p->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        pi_state->owner = p;
@@ -566,6 +579,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        if (!pi_state)
                return -EINVAL;
 
+       spin_lock(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
        /*
@@ -583,15 +597,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         * preserve the owner died bit.)
         */
        if (!(uval & FUTEX_OWNER_DIED)) {
+               int ret = 0;
                newval = FUTEX_WAITERS | new_owner->pid;
 
                inc_preempt_count();
                curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
                dec_preempt_count();
                if (curval == -EFAULT)
-                       return -EFAULT;
-               if (curval != uval)
-                       return -EINVAL;
+                       ret = -EFAULT;
+               else if (curval != uval)
+                       ret = -EINVAL;
+               if (ret) {
+                       spin_unlock(&pi_state->pi_mutex.wait_lock);
+                       return ret;
+               }
        }
 
        spin_lock_irq(&pi_state->owner->pi_lock);
@@ -605,6 +624,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        pi_state->owner = new_owner;
        spin_unlock_irq(&new_owner->pi_lock);
 
+       spin_unlock(&pi_state->pi_mutex.wait_lock);
        rt_mutex_unlock(&pi_state->pi_mutex);
 
        return 0;
@@ -1001,6 +1021,60 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
        drop_key_refs(&q->key);
 }
 
+/*
+ * Fixup the pi_state owner with the new owner.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 *uaddr, struct futex_q *q,
+                               struct task_struct *newowner)
+{
+       u32 newtid = newowner->pid | FUTEX_WAITERS;
+       struct futex_pi_state *pi_state = q->pi_state;
+       u32 uval, curval, newval;
+       int ret;
+
+       /* Owner died? */
+       if (pi_state->owner != NULL) {
+               spin_lock_irq(&pi_state->owner->pi_lock);
+               WARN_ON(list_empty(&pi_state->list));
+               list_del_init(&pi_state->list);
+               spin_unlock_irq(&pi_state->owner->pi_lock);
+       } else
+               newtid |= FUTEX_OWNER_DIED;
+
+       pi_state->owner = newowner;
+
+       spin_lock_irq(&newowner->pi_lock);
+       WARN_ON(!list_empty(&pi_state->list));
+       list_add(&pi_state->list, &newowner->pi_state_list);
+       spin_unlock_irq(&newowner->pi_lock);
+
+       /*
+        * We own it, so we have to replace the pending owner
+        * TID. This must be atomic as we have preserve the
+        * owner died bit here.
+        */
+       ret = get_futex_value_locked(&uval, uaddr);
+
+       while (!ret) {
+               newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+               inc_preempt_count();
+               curval = futex_atomic_cmpxchg_inatomic(uaddr,
+                                                       uval, newval);
+               dec_preempt_count();
+
+               if (curval == -EFAULT)
+                       ret = -EFAULT;
+               if (curval == uval)
+                       break;
+               uval = curval;
+       }
+       return ret;
+}
+
 static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 {
        struct task_struct *curr = current;
@@ -1128,7 +1202,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        struct futex_hash_bucket *hb;
        u32 uval, newval, curval;
        struct futex_q q;
-       int ret, attempt = 0;
+       int ret, lock_taken, ownerdied = 0, attempt = 0;
 
        if (refill_pi_state_cache())
                return -ENOMEM;
@@ -1148,9 +1222,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        if (unlikely(ret != 0))
                goto out_release_sem;
 
+ retry_unlocked:
        hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
+       ret = lock_taken = 0;
+
        /*
         * To avoid races, we attempt to take the lock here again
         * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1165,24 +1242,44 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        if (unlikely(curval == -EFAULT))
                goto uaddr_faulted;
 
-       /* We own the lock already */
+       /*
+        * Detect deadlocks. In case of REQUEUE_PI this is a valid
+        * situation and we return success to user space.
+        */
        if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
-               if (!detect && 0)
-                       force_sig(SIGKILL, current);
                ret = -EDEADLK;
                goto out_unlock_release_sem;
        }
 
        /*
-        * Surprise - we got the lock. Just return
-        * to userspace:
+        * Surprise - we got the lock. Just return to userspace:
         */
        if (unlikely(!curval))
                goto out_unlock_release_sem;
 
        uval = curval;
+
+       /*
+        * Set the WAITERS flag, so the owner will know it has someone
+        * to wake at next unlock
+        */
        newval = uval | FUTEX_WAITERS;
 
+       /*
+        * There are two cases, where a futex might have no owner (the
+        * owner TID is 0): OWNER_DIED or REQUEUE. We take over the
+        * futex in this case. We also do an unconditional take over,
+        * when the owner of the futex died.
+        *
+        * This is safe as we are protected by the hash bucket lock !
+        */
+       if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+               /* Keep the OWNER_DIED bit */
+               newval = (curval & ~FUTEX_TID_MASK) | current->pid;
+               ownerdied = 0;
+               lock_taken = 1;
+       }
+
        inc_preempt_count();
        curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
        dec_preempt_count();
@@ -1192,6 +1289,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        if (unlikely(curval != uval))
                goto retry_locked;
 
+       /*
+        * We took the lock due to requeue or owner died take over.
+        */
+       if (unlikely(lock_taken))
+               goto out_unlock_release_sem;
+
        /*
         * We dont have the lock. Look up the PI state (or create it if
         * we are the first waiter):
@@ -1199,34 +1302,39 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        ret = lookup_pi_state(uval, hb, &q);
 
        if (unlikely(ret)) {
-               /*
-                * There were no waiters and the owner task lookup
-                * failed. When the OWNER_DIED bit is set, then we
-                * know that this is a robust futex and we actually
-                * take the lock. This is safe as we are protected by
-                * the hash bucket lock. We also set the waiters bit
-                * unconditionally here, to simplify glibc handling of
-                * multiple tasks racing to acquire the lock and
-                * cleanup the problems which were left by the dead
-                * owner.
-                */
-               if (curval & FUTEX_OWNER_DIED) {
-                       uval = newval;
-                       newval = current->pid |
-                               FUTEX_OWNER_DIED | FUTEX_WAITERS;
+               switch (ret) {
 
-                       inc_preempt_count();
-                       curval = futex_atomic_cmpxchg_inatomic(uaddr,
-                                                              uval, newval);
-                       dec_preempt_count();
+               case -EAGAIN:
+                       /*
+                        * Task is exiting and we just wait for the
+                        * exit to complete.
+                        */
+                       queue_unlock(&q, hb);
+                       up_read(&curr->mm->mmap_sem);
+                       cond_resched();
+                       goto retry;
 
-                       if (unlikely(curval == -EFAULT))
+               case -ESRCH:
+                       /*
+                        * No owner found for this futex. Check if the
+                        * OWNER_DIED bit is set to figure out whether
+                        * this is a robust futex or not.
+                        */
+                       if (get_futex_value_locked(&curval, uaddr))
                                goto uaddr_faulted;
-                       if (unlikely(curval != uval))
+
+                       /*
+                        * We simply start over in case of a robust
+                        * futex. The code above will take the futex
+                        * and return happy.
+                        */
+                       if (curval & FUTEX_OWNER_DIED) {
+                               ownerdied = 1;
                                goto retry_locked;
-                       ret = 0;
+                       }
+               default:
+                       goto out_unlock_release_sem;
                }
-               goto out_unlock_release_sem;
        }
 
        /*
@@ -1255,65 +1363,63 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
        down_read(&curr->mm->mmap_sem);
        spin_lock(q.lock_ptr);
 
-       /*
-        * Got the lock. We might not be the anticipated owner if we
-        * did a lock-steal - fix up the PI-state in that case.
-        */
-       if (!ret && q.pi_state->owner != curr) {
-               u32 newtid = current->pid | FUTEX_WAITERS;
-
-               /* Owner died? */
-               if (q.pi_state->owner != NULL) {
-                       spin_lock_irq(&q.pi_state->owner->pi_lock);
-                       WARN_ON(list_empty(&q.pi_state->list));
-                       list_del_init(&q.pi_state->list);
-                       spin_unlock_irq(&q.pi_state->owner->pi_lock);
-               } else
-                       newtid |= FUTEX_OWNER_DIED;
-
-               q.pi_state->owner = current;
-
-               spin_lock_irq(&current->pi_lock);
-               WARN_ON(!list_empty(&q.pi_state->list));
-               list_add(&q.pi_state->list, &current->pi_state_list);
-               spin_unlock_irq(&current->pi_lock);
-
-               /* Unqueue and drop the lock */
-               unqueue_me_pi(&q, hb);
-               up_read(&curr->mm->mmap_sem);
+       if (!ret) {
                /*
-                * We own it, so we have to replace the pending owner
-                * TID. This must be atomic as we have preserve the
-                * owner died bit here.
+                * Got the lock. We might not be the anticipated owner 
+                * if we did a lock-steal - fix up the PI-state in
+                * that case:
                 */
-               ret = get_user(uval, uaddr);
-               while (!ret) {
-                       newval = (uval & FUTEX_OWNER_DIED) | newtid;
-                       curval = futex_atomic_cmpxchg_inatomic(uaddr,
-                                                              uval, newval);
-                       if (curval == -EFAULT)
-                               ret = -EFAULT;
-                       if (curval == uval)
-                               break;
-                       uval = curval;
-               }
+               if (q.pi_state->owner != curr)
+                       ret = fixup_pi_state_owner(uaddr, &q, curr);
        } else {
                /*
                 * Catch the rare case, where the lock was released
-                * when we were on the way back before we locked
-                * the hash bucket.
+                * when we were on the way back before we locked the
+                * hash bucket.
                 */
-               if (ret && q.pi_state->owner == curr) {
+               if (q.pi_state->owner == curr) {
                        if (rt_mutex_trylock(&q.pi_state->pi_mutex))
                                ret = 0;
+                       else {
+                               /*
+                                * pi_state is incorrect, some other
+                                * task did a lock steal and we
+                                * returned due to timeout or signal
+                                * without taking the rt_mutex. Too
+                                * late. We can access the
+                                * rt_mutex_owner without locking, as
+                                * the other task is now blocked on
+                                * the hash bucket lock. Fix the state
+                                * up.
+                                */
+                               struct task_struct *owner;
+                               int res;
+
+                               owner = rt_mutex_owner(&q.pi_state->pi_mutex);
+                               res = fixup_pi_state_owner(uaddr, &q, owner);
+
+                               /* propagate -EFAULT, if the fixup failed */
+                               if (res)
+                                       ret = res;
+                       }
+               } else {
+                       /*
+                        * Paranoia check. If we did not take the lock
+                        * in the trylock above, then we should not be
+                        * the owner of the rtmutex, neither the real
+                        * nor the pending one:
+                        */
+                       if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+                               printk(KERN_ERR "futex_lock_pi: ret = %d "
+                                      "pi-mutex: %p pi-state %p\n", ret,
+                                      q.pi_state->pi_mutex.owner,
+                                      q.pi_state->owner);
                }
-               /* Unqueue and drop the lock */
-               unqueue_me_pi(&q, hb);
-               up_read(&curr->mm->mmap_sem);
        }
-
-       if (!detect && ret == -EDEADLK && 0)
-               force_sig(SIGKILL, current);
+       
+       /* Unqueue and drop the lock */
+       unqueue_me_pi(&q, hb);
+       up_read(&curr->mm->mmap_sem);
 
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
@@ -1330,16 +1436,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
         * non-atomically.  Therefore, if get_user below is not
         * enough, we need to handle the fault ourselves, while
         * still holding the mmap_sem.
+        *
+        * ... and hb->lock.  :-)  --ANK
         */
+       queue_unlock(&q, hb);
+
        if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-                       ret = -EFAULT;
-                       goto out_unlock_release_sem;
-               }
-               goto retry_locked;
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
+               if (ret)
+                       goto out_release_sem;
+               goto retry_unlocked;
        }
 
-       queue_unlock(&q, hb);
        up_read(&curr->mm->mmap_sem);
 
        ret = get_user(uval, uaddr);
@@ -1381,9 +1489,9 @@ retry:
                goto out;
 
        hb = hash_futex(&key);
+retry_unlocked:
        spin_lock(&hb->lock);
 
-retry_locked:
        /*
         * To avoid races, try to do the TID -> 0 atomic transition
         * again. If it succeeds then we can return without waking
@@ -1445,16 +1553,19 @@ pi_faulted:
         * non-atomically.  Therefore, if get_user below is not
         * enough, we need to handle the fault ourselves, while
         * still holding the mmap_sem.
+        *
+        * ... and hb->lock. --ANK
         */
+       spin_unlock(&hb->lock);
+
        if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-                       ret = -EFAULT;
-                       goto out_unlock;
-               }
-               goto retry_locked;
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
+               if (ret)
+                       goto out;
+               uval = 0;
+               goto retry_unlocked;
        }
 
-       spin_unlock(&hb->lock);
        up_read(&current->mm->mmap_sem);
 
        ret = get_user(uval, uaddr);
@@ -1716,9 +1827,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
 void exit_robust_list(struct task_struct *curr)
 {
        struct robust_list_head __user *head = curr->robust_list;
-       struct robust_list __user *entry, *pending;
-       unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+       struct robust_list __user *entry, *next_entry, *pending;
+       unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
        unsigned long futex_offset;
+       int rc;
 
        /*
         * Fetch the list head (which was registered earlier, via
@@ -1741,7 +1853,13 @@ void exit_robust_list(struct task_struct *curr)
        if (pending)
                handle_futex_death((void *)pending + futex_offset, curr, pip);
 
+       next_entry = NULL;      /* avoid warning with gcc */
        while (entry != &head->list) {
+               /*
+                * Fetch the next entry in the list before calling
+                * handle_futex_death:
+                */
+               rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
                /*
                 * A pending lock might already be on the list, so
                 * don't process it twice:
@@ -1750,11 +1868,10 @@ void exit_robust_list(struct task_struct *curr)
                        if (handle_futex_death((void *)entry + futex_offset,
                                                curr, pi))
                                return;
-               /*
-                * Fetch the next entry in the list:
-                */
-               if (fetch_robust_entry(&entry, &entry->next, &pi))
+               if (rc)
                        return;
+               entry = next_entry;
+               pi = next_pi;
                /*
                 * Avoid excessively long or circular lists:
                 */
@@ -1763,6 +1880,10 @@ void exit_robust_list(struct task_struct *curr)
 
                cond_resched();
        }
+
+       if (pending)
+               handle_futex_death((void __user *)pending + futex_offset,
+                                  curr, pip);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
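
The heart of the new fixup_pi_state_owner() above is a compare-and-swap retry loop that installs the new owner's TID in the futex word while preserving the OWNER_DIED bit. A standalone sketch of that loop, with the GCC builtin __sync_val_compare_and_swap standing in for futex_atomic_cmpxchg_inatomic():

#include <stdint.h>

#define FUTEX_OWNER_DIED 0x40000000

static void set_owner_tid(uint32_t *uaddr, uint32_t newtid)
{
	uint32_t uval = *uaddr;

	for (;;) {
		uint32_t newval = (uval & FUTEX_OWNER_DIED) | newtid;
		uint32_t curval = __sync_val_compare_and_swap(uaddr, uval, newval);

		if (curval == uval)
			return;		/* swap hit: new owner installed */
		uval = curval;		/* raced with another update: retry */
	}
}

int main(void)
{
	uint32_t word = FUTEX_OWNER_DIED | 1234;

	set_owner_tid(&word, 5678);
	return word == (FUTEX_OWNER_DIED | 5678) ? 0 : 1;
}
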
index c5cca3f65cb776f2757e4bd5eeb167a4f84f6ffd..a31f13d4381a262d6d373f41228e24184c5fceba 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -29,6 +29,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
        return 0;
 }
 
+static void __user *futex_uaddr(struct robust_list *entry,
+                               compat_long_t futex_offset)
+{
+       compat_uptr_t base = ptr_to_compat(entry);
+       void __user *uaddr = compat_ptr(base + futex_offset);
+
+       return uaddr;
+}
+
 /*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
@@ -38,10 +47,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 void compat_exit_robust_list(struct task_struct *curr)
 {
        struct compat_robust_list_head __user *head = curr->compat_robust_list;
-       struct robust_list __user *entry, *pending;
-       unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-       compat_uptr_t uentry, upending;
+       struct robust_list __user *entry, *next_entry, *pending;
+       unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+       compat_uptr_t uentry, next_uentry, upending;
        compat_long_t futex_offset;
+       int rc;
 
        /*
         * Fetch the list head (which was registered earlier, via
@@ -61,25 +71,30 @@ void compat_exit_robust_list(struct task_struct *curr)
        if (fetch_robust_entry(&upending, &pending,
                               &head->list_op_pending, &pip))
                return;
-       if (upending)
-               handle_futex_death((void *)pending + futex_offset, curr, pip);
 
-       while (compat_ptr(uentry) != &head->list) {
+       next_entry = NULL;      /* avoid warning with gcc */
+       while (entry != (struct robust_list __user *) &head->list) {
+               /*
+                * Fetch the next entry in the list before calling
+                * handle_futex_death:
+                */
+               rc = fetch_robust_entry(&next_uentry, &next_entry,
+                       (compat_uptr_t __user *)&entry->next, &next_pi);
                /*
                 * A pending lock might already be on the list, so
                 * dont process it twice:
                 */
-               if (entry != pending)
-                       if (handle_futex_death((void *)entry + futex_offset,
-                                               curr, pi))
-                               return;
+               if (entry != pending) {
+                       void *uaddr = futex_uaddr(entry, futex_offset);
 
-               /*
-                * Fetch the next entry in the list:
-                */
-               if (fetch_robust_entry(&uentry, &entry,
-                                      (compat_uptr_t *)&entry->next, &pi))
+                       if (handle_futex_death(uaddr, curr, pi))
+                               return;
+               }
+               if (rc)
                        return;
+               uentry = next_uentry;
+               entry = next_entry;
+               pi = next_pi;
                /*
                 * Avoid excessively long or circular lists:
                 */
@@ -88,6 +103,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 
                cond_resched();
        }
+       if (pending) {
+               void *uaddr = futex_uaddr(pending, futex_offset);
+
+               handle_futex_death(uaddr, curr, pip);
+       }
 }
 
 asmlinkage long
index 201917b0598081e2399e8f6230a3d57a01c23863..53fb8cff430fa7180bf557ca64b7e049f7bdac47 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -100,7 +100,11 @@ static int recalc_sigpending_tsk(struct task_struct *t)
                set_tsk_thread_flag(t, TIF_SIGPENDING);
                return 1;
        }
-       clear_tsk_thread_flag(t, TIF_SIGPENDING);
+       /*
+        * We must never clear the flag in another thread, or in current
+        * when it's possible the current syscall is returning -ERESTART*.
+        * So we don't clear it here, and only callers who know they should do.
+        */
        return 0;
 }
 
@@ -116,7 +120,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 
 void recalc_sigpending(void)
 {
-       recalc_sigpending_tsk(current);
+       if (!recalc_sigpending_tsk(current))
+               clear_thread_flag(TIF_SIGPENDING);
+
 }
 
 /* Given the mask, find the first available signal that should be serviced. */
@@ -327,7 +333,6 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                        sig = 0;
                                
        }
-       recalc_sigpending();
 
        return sig;
 }
@@ -344,7 +349,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
        if (!signr)
                signr = __dequeue_signal(&tsk->signal->shared_pending,
                                         mask, info);
-       if (signr && unlikely(sig_kernel_stop(signr))) {
+       recalc_sigpending();
+       if (signr && unlikely(sig_kernel_stop(signr))) {
                /*
                 * Set a marker that we have dequeued a stop signal.  Our
                 * caller might release the siglock and then the pending
index c9aad28384be3b1dd47482daf757a66335e30508..8248555c5d2403f032efe91126c0636cd6d54eeb 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -89,6 +89,12 @@ void softlockup_tick(struct pt_regs *regs)
                return;
        }
 
+       /* do not print during early bootup: */
+       if (unlikely(system_state != SYSTEM_RUNNING)) {
+               touch_softlockup_watchdog();
+               return;
+       }
+
        now = jiffies;
 
        /* Wake up the high-prio watchdog task every second: */
index 21bf335d3601ae60fa564e44d6e242580989832b..7695fc56afce2a0532d2274c263e61e73821411c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1808,6 +1808,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
        if (retval)
                return retval;
 
+       if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+               /*
+                * The caller is asking for an immediate RLIMIT_CPU
+                * expiry.  But we use the zero value to mean "it was
+                * never set".  So let's cheat and make it one second
+                * instead
+                */
+               new_rlim.rlim_cur = 1;
+       }
+
        task_lock(current->group_leader);
        *old_rlim = new_rlim;
        task_unlock(current->group_leader);
@@ -1829,15 +1839,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
                unsigned long rlim_cur = new_rlim.rlim_cur;
                cputime_t cputime;
 
-               if (rlim_cur == 0) {
-                       /*
-                        * The caller is asking for an immediate RLIMIT_CPU
-                        * expiry.  But we use the zero value to mean "it was
-                        * never set".  So let's cheat and make it one second
-                        * instead
-                        */
-                       rlim_cur = 1;
-               }
                cputime = secs_to_cputime(rlim_cur);
                read_lock(&tasklist_lock);
                spin_lock_irq(&current->sighand->siglock);
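
Hoisting the zero-to-one-second fixup in front of the assignment to *old_rlim is the CVE-2008-1294 fix: the stored soft limit can no longer be zero, which the accounting code treats as "never set", so a task can no longer clear its own CPU limit this way. From userspace the change looks like this (hypothetical demo, not part of the patch):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_CPU, &rl) != 0)
		return 1;
	rl.rlim_cur = 0;	/* "expire immediately"; now stored as 1 second */
	if (setrlimit(RLIMIT_CPU, &rl) != 0) {
		perror("setrlimit");
		return 1;
	}
	for (;;)
		;	/* with the fix: SIGXCPU after ~1s of CPU time;
			 * before it, the zero limit silently never fired */
}
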
index 9e3585e24d21e3380b278eb7253e61a6d14d3b13..6605ba75ec9adbea35d5d85ba95cbd4b1b242ac2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2140,21 +2140,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                /* Limit the size of the copy to the caller's write size */
                bytes = min(bytes, count);
 
-               /*
-                * Limit the size of the copy to that of the current segment,
-                * because fault_in_pages_readable() doesn't know how to walk
-                * segments.
+               /* We only need to worry about prefaulting when writes are from
+                * user-space.  NFSd uses vfs_writev with several non-aligned
+                * segments in the vector, and limiting to one segment a time is
+                * a noticeable performance cost for re-write
                 */
-               bytes = min(bytes, cur_iov->iov_len - iov_base);
-
-               /*
-                * Bring in the user page that we will copy from _first_.
-                * Otherwise there's a nasty deadlock on copying from the
-                * same page as we're writing to, without it being marked
-                * up-to-date.
-                */
-               fault_in_pages_readable(buf, bytes);
+               if (!segment_eq(get_fs(), KERNEL_DS)) {
+                       /*
+                        * Limit the size of the copy to that of the current
+                        * segment, because fault_in_pages_readable() doesn't
+                        * know how to walk segments.
+                        */
+                       bytes = min(bytes, cur_iov->iov_len - iov_base);
 
+                       /*
+                        * Bring in the user page that we will copy from
+                        * _first_.  Otherwise there's a nasty deadlock on
+                        * copying from the same page as we're writing to,
+                        * without it being marked up-to-date.
+                        */
+                       fault_in_pages_readable(buf, bytes);
+               }
                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
                if (!page) {
                        status = -ENOMEM;
index 1a3466f7dbcd96c839b8781b4e789ebf67afc9a4..a791d3bb92e084d8cdf56bc21bdfd012c304212d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -131,9 +131,6 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
-/* Shared by v4/v6 udp. */
-int udp_port_rover;
-
 static int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
        struct hlist_node *node;
@@ -141,47 +138,51 @@ static int udp_v4_get_port(struct sock *sk, unsigned short snum)
        struct inet_sock *inet = inet_sk(sk);
 
        write_lock_bh(&udp_hash_lock);
-       if (snum == 0) {
-               int best_size_so_far, best, result, i, low, high;
+       if (!snum) {
+               int i, low, high, remaining;
+               unsigned rover, best, best_size_so_far;
 
                inet_get_local_port_range(&low, &high);
+               remaining = (high - low) + 1;
+
+               best_size_so_far = UINT_MAX;
+               best = rover = net_random() % remaining + low;
 
-               if (udp_port_rover > high ||
-                   udp_port_rover < low)
-                       udp_port_rover = low;
-               best_size_so_far = 32767;
-               best = result = udp_port_rover;
-               for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
+               /* 1st pass: look for empty (or shortest) hash chain */
+               for (i = 0; i < UDP_HTABLE_SIZE; i++) {
                        struct hlist_head *list;
-                       int size;
+                       int size = 0;
 
-                       list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
-                       if (hlist_empty(list)) {
-                               if (result > high)
-                                       result = low + ((result - low) &
-                                                (UDP_HTABLE_SIZE - 1));
+                       list = &udp_hash[rover & (UDP_HTABLE_SIZE - 1)];
+                       if (hlist_empty(list))
                                goto gotit;
-                       }
-                       size = 0;
+
                        sk_for_each(sk2, node, list)
                                if (++size >= best_size_so_far)
                                        goto next;
                        best_size_so_far = size;
-                       best = result;
-               next:;
+                       best = rover;
+               next:
+                       /* fold back if end of range */
+                       if (++rover > high)
+                               rover = low + ((rover - low)
+                                           & (UDP_HTABLE_SIZE - 1));
                }
-               result = best;
-               for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
-                       if (result > high)
-                               result = low + ((result - low) &
-                                          (UDP_HTABLE_SIZE - 1));
-                       if (!udp_lport_inuse(result))
-                               break;
+               /* 2nd pass: find hole in shortest hash chain */
+               rover = best;
+               for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
+                       if (!udp_lport_inuse(rover))
+                               goto gotit;
+                       rover += UDP_HTABLE_SIZE;
+                       if (rover > high)
+                               rover = low + ((rover - low)
+                                           & (UDP_HTABLE_SIZE - 1));
                }
-               if (i >= (1 << 16) / UDP_HTABLE_SIZE)
-                       goto fail;
+               /* All ports in use! */
+               goto fail;
+
 gotit:
-               udp_port_rover = snum = result;
+               snum = rover;
        } else {
                sk_for_each(sk2, node,
                            &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
@@ -1648,7 +1649,6 @@ EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
 EXPORT_SYMBOL(udp_hash_lock);
 EXPORT_SYMBOL(udp_ioctl);
-EXPORT_SYMBOL(udp_port_rover);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(udp_sendmsg);
 EXPORT_SYMBOL(udp_poll);
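
The rewritten allocator above drops the global udp_port_rover (predictable, shared by every socket) and starts each search at a random point in the local port range before doing the same two-pass shortest-chain scan. A sketch of just the randomised starting point, with random() modelling net_random() and the range values as illustrative defaults:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static unsigned pick_start(int low, int high)
{
	int remaining = (high - low) + 1;	/* as in the hunk above */

	return (unsigned)(random() % remaining) + low;
}

int main(void)
{
	srandom(time(NULL));
	printf("first candidate port: %u\n", pick_start(32768, 61000));
	return 0;
}
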
index 836eecd7e62bfaa100a616d5aeda26ecc652ada9..cb81464d5488a96f08cacb177a074aa6683dde8f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -396,9 +396,9 @@ static int ipip6_rcv(struct sk_buff *skb)
        }
 
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-       kfree_skb(skb);
        read_unlock(&ipip6_lock);
 out:
+       kfree_skb(skb);
        return 0;
 }
 
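Moving kfree_skb() behind the out: label is the CVE-2008-2136 fix: the early "goto out" paths in ipip6_rcv() used to return without freeing the skb, giving a remotely triggerable memory leak. A generic sketch of the ownership rule, with malloc/free standing in for skb handling:

#include <stdlib.h>

static void consume(void *pkt) { free(pkt); }	/* netif_rx() stand-in: takes ownership */

static int rcv(void *pkt, int tunnel_found)
{
	if (tunnel_found) {
		consume(pkt);	/* delivered: ownership passed on */
		return 0;
	}
	/* ... ICMP error generation would go here ... */
	free(pkt);	/* common exit now frees the rejected packet; this
			 * free was missing on the no-tunnel path before */
	return 0;
}

int main(void)
{
	return rcv(malloc(64), 0);
}
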
index 69dc6616dffa02d3252e9f76f84f60515685f3f9..1a5d72b4d266008843cbeeaab1d1f362b61c88b2 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -70,45 +70,51 @@ static int udp_v6_get_port(struct sock *sk, unsigned short snum)
        struct hlist_node *node;
 
        write_lock_bh(&udp_hash_lock);
-       if (snum == 0) {
-               int best_size_so_far, best, result, i, low, high;
+       if (!snum) {
+               int i, low, high, remaining;
+               unsigned rover, best, best_size_so_far;
 
                inet_get_local_port_range(&low, &high);
-               if (udp_port_rover > high || udp_port_rover < low)
-                       udp_port_rover = low;
-               best_size_so_far = 32767;
-               best = result = udp_port_rover;
-               for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
-                       int size;
+               remaining = (high - low) + 1;
+
+               best_size_so_far = UINT_MAX;
+               best = rover = net_random() % remaining + low;
+
+               /* 1st pass: look for empty (or shortest) hash chain */
+               for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+                       int size = 0;
                        struct hlist_head *list;
 
-                       list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
-                       if (hlist_empty(list)) {
-                               if (result > high)
-                                       result = low + ((result - low) &
-                                                (UDP_HTABLE_SIZE - 1));
+                       list = &udp_hash[rover & (UDP_HTABLE_SIZE - 1)];
+                       if (hlist_empty(list))
                                goto gotit;
-                       }
-                       size = 0;
+
                        sk_for_each(sk2, node, list)
                                if (++size >= best_size_so_far)
                                        goto next;
                        best_size_so_far = size;
-                       best = result;
-               next:;
+                       best = rover;
+               next:
+                       /* fold back if end of range */
+                       if (++rover > high)
+                               rover = low + ((rover - low)
+                                           & (UDP_HTABLE_SIZE - 1));
                }
-               result = best;
-               for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
-                       if (result > high)
-                               result = low + ((result - low) &
-                                          (UDP_HTABLE_SIZE - 1));
-                       if (!udp_lport_inuse(result))
-                               break;
+               /* 2nd pass: find hole in shortest hash chain */
+               rover = best;
+               for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
+                       if (!udp_lport_inuse(rover))
+                               goto gotit;
+                       rover += UDP_HTABLE_SIZE;
+                       if (rover > high)
+                               rover = low + ((rover - low)
+                                           & (UDP_HTABLE_SIZE - 1));
                }
-               if (i >= (1 << 16) / UDP_HTABLE_SIZE)
-                       goto fail;
+               /* All ports in use! */
+               goto fail;
+
 gotit:
-               udp_port_rover = snum = result;
+               snum = rover;
        } else {
                sk_for_each(sk2, node,
                            &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
index 9adc06f582b97259da40baa435f11261f4251dec..cdddff500b0f2b9ac071df1923fd845913fcd23a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4024,7 +4024,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
        if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs_old)))
                return -EFAULT;
 
-       if (getaddrs.addr_num <= 0) return -EINVAL;
+       if (getaddrs.addr_num <= 0 ||
+           getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr)))
+               return -EINVAL;
        /*
         *  For UDP-style sockets, id specifies the association to query.
         *  If the id field is set to the value '0' then the locally bound
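
The added bound is the CVE-2008-2826 fix: addr_num comes from userspace and is later multiplied by sizeof(union sctp_addr), so it must be checked against INT_MAX divided by the element size before the product can overflow. A sketch of the guard with a stand-in type (the real union sctp_addr layout is not reproduced here):

#include <errno.h>
#include <limits.h>

union addr_stub { char pad[28]; };	/* stand-in for union sctp_addr */

static int validate_addr_num(int addr_num)
{
	if (addr_num <= 0 ||
	    addr_num >= (int)(INT_MAX / sizeof(union addr_stub)))
		return -EINVAL;		/* product would overflow int */
	return 0;
}

int main(void)
{
	return validate_addr_num(INT_MAX / 4) == -EINVAL ? 0 : 1;
}
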
index de6ec519272e5225f85bce0b909a5241a6371a6f..acde97ec3813d82f4238a4b16aa551683716d204 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1711,20 +1711,22 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                int chunk;
                struct sk_buff *skb;
 
+               unix_state_rlock(sk);
                skb = skb_dequeue(&sk->sk_receive_queue);
                if (skb==NULL)
                {
                        if (copied >= target)
-                               break;
+                               goto unlock;
 
                        /*
                         *      POSIX 1003.1g mandates this order.
                         */
                         
                        if ((err = sock_error(sk)) != 0)
-                               break;
+                               goto unlock;
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
-                               break;
+                               goto unlock;
+                       unix_state_runlock(sk);
                        err = -EAGAIN;
                        if (!timeo)
                                break;
@@ -1738,7 +1740,11 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                        }
                        mutex_lock(&u->readlock);
                        continue;
+unlock:
+                       unix_state_runlock(sk);
+                       break;
                }
+               unix_state_runlock(sk);
 
                if (check_creds) {
                        /* Never glue messages from different writers */
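
The af_unix hunks above put the queue dequeue and the error/shutdown tests under one unix_state_rlock() section, so a shutdown cannot slip in between the empty-queue check and the decision to sleep. A small pthread sketch of the invariant being enforced (mutex and fields are stand-ins for the socket state lock and sk fields):

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static void *queue_head;		/* sk_receive_queue stand-in */
static int shutdown_flag;		/* RCV_SHUTDOWN stand-in */

static int recv_step(void **out)
{
	int err = 0;

	pthread_mutex_lock(&state_lock);
	*out = queue_head;		/* skb_dequeue() under the lock */
	queue_head = NULL;
	if (*out == NULL && shutdown_flag)
		err = -1;		/* shutdown observed atomically */
	pthread_mutex_unlock(&state_lock);
	return err;			/* 0 with *out == NULL: wait and retry */
}

int main(void)
{
	void *skb;

	return recv_step(&skb) == 0 ? 0 : 1;
}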