From: t_jeang Date: Tue, 6 Jan 2009 12:05:54 +0000 (+0000) Subject: * Wed Jul 23 2008 Jiri Pirko [2.6.18-92.1.10.el5] X-Git-Tag: kernel-2.6.18-92.1.6.el5.patch X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=bb6fb51c51f75923e64ffc98f69008f37fee7136;p=xenclient%2Fkernel.git * Wed Jul 23 2008 Jiri Pirko [2.6.18-92.1.10.el5] - [ia64] softlock: prevent endless warnings in kdump (Neil Horman ) [456117 453200] * Wed Jul 16 2008 Jiri Pirko [2.6.18-92.1.9.el5] - [misc] signaling msgrvc() should not pass back error (Jiri Pirko ) [455278 452533] - [ia64] properly unregister legacy interrupts (Prarit Bhargava ) [450337 445886] * Mon Jul 14 2008 Jiri Pirko [2.6.18-92.1.8.el5] - [net] randomize udp port allocation (Eugene Teo ) [454571 454572] - [tty] add NULL pointer checks (Aristeu Rozanski ) [453425 453154] {CVE-2008-2812} - [net] sctp: make sure sctp_addr does not overflow (David S. Miller ) [452482 452483] {CVE-2008-2826} - [sys] sys_setrlimit: prevent setting RLIMIT_CPU to 0 (Neil Horman ) [437121 437122] {CVE-2008-1294} - [net] sit: exploitable remote memory leak (Jiri Pirko ) [446038 446039] {CVE-2008-2136} - [misc] ttyS1 lost interrupt, stops transmitting v2 (Brian Maly ) [455256 451157] - [misc] ttyS1 loses interrupt and stops transmitting (Simon McGrath ) [443071 440121] * Thu Jul 10 2008 Jiri Pirko [2.6.18-92.1.7.el5] - [x86_64]: extend MCE banks support for Dunnington, Nehalem (Prarit Bhargava ) [451941 446673] - [nfs] address nfs rewrite performance regression in RHEL5 (Eric Sandeen ) [448685 436004] - [mm] Make mmap() with PROT_WRITE on RHEL5 (Larry Woodman ) [450758 448978] - [i386]: Add check for supported_cpus in powernow_k8 driver (Prarit Bhargava ) [450866 443853] - [i386]: Add check for dmi_data in powernow_k8 driver (Prarit Bhargava ) [450866 443853] - [net] fix recv return zero (Thomas Graf ) [452231 435657] - [misc] kernel crashes on futex (Anton Arapov ) [450336 435178] - [net] Fixing bonding rtnl_lock screwups (Fabio Olive Leite ) 
[451939 450219] --- diff --git a/Makefile b/Makefile index c3703062..8f4a08cc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 18 -EXTRAVERSION = -92.1.6.el5 +EXTRAVERSION = -92.1.10.el5 RHEL_MAJOR = 5 RHEL_MINOR = 2 NAME=Avast! A bilge rat! diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index f7279468..45914b57 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -449,7 +449,7 @@ good_area: case 1: /* read, present */ goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index c9e3aae6..97c2fbd3 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -621,6 +621,9 @@ void acpi_unregister_gsi(u32 gsi) if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) return; + if (has_8259 && gsi < 16) + return; + iosapic_unregister_intr(gsi); } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 42bc87c7..64300042 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -152,9 +152,11 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re # error File is out of sync with <linux/mm.h>. Please update.
# endif + if (((isr >> IA64_ISR_R_BIT) & 1UL) && (!(vma->vm_flags & (VM_READ | VM_WRITE)))) + goto bad_area; + mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) - | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT) - | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT)); + | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); if ((vma->vm_flags & mask) != mask) goto bad_area; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 78a0d599..77953f41 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -333,7 +333,7 @@ good_area: /* protection fault */ if (error_code & 0x08000000) goto bad_area; - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c index 5cdfb71f..bc776beb 100644 --- a/arch/ppc/mm/fault.c +++ b/arch/ppc/mm/fault.c @@ -239,7 +239,7 @@ good_area: /* protection fault */ if (error_code & 0x08000000) goto bad_area; - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bbea8880..d370a4ea 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -27,7 +27,7 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_BANKS 6 +#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -37,7 +37,7 @@ static int mce_dont_init; 3: never panic or exit (for testing only) */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... 
NR_SYSFS_BANKS-1] = ~0UL }; static unsigned long console_logged; static int notify_user; static int rip_msr; @@ -191,7 +191,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (i < NR_SYSFS_BANKS && !bank[i]) continue; m.misc = 0; @@ -354,9 +354,10 @@ static void mce_init(void *dummy) rdmsrl(MSR_IA32_MCG_CAP, cap); banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; + if (banks > MCE_EXTENDED_BANK) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", + MCE_EXTENDED_BANK); + banks = MCE_EXTENDED_BANK; } /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) @@ -372,7 +373,7 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -606,13 +607,16 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); +/* TBD should generate these dynamically based on number of available banks. + * Have only 6 control banks in /sysfs until then.
+ */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static struct sysdev_attribute * bank_attributes[NR_BANKS] = { +static struct sysdev_attribute * bank_attributes[NR_SYSFS_BANKS] = { &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; ACCESSOR(tolerant,tolerant,) @@ -632,7 +636,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) err = sysdev_register(&per_cpu(device_mce,cpu)); if (!err) { - for (i = 0; i < banks; i++) + for (i = 0; i < NR_SYSFS_BANKS; i++) sysdev_create_file(&per_cpu(device_mce,cpu), bank_attributes[i]); sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 82542a2b..390160b3 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -477,7 +477,7 @@ good_area: case PF_PROT: /* read, present */ goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } diff --git a/buildconfigs/Rules.mk b/buildconfigs/Rules.mk index 879cb142..0ce24ae5 100644 --- a/buildconfigs/Rules.mk +++ b/buildconfigs/Rules.mk @@ -2,7 +2,7 @@ XEN_TARGET_ARCH = x86_32 XEN_TARGET_X86_PAE ?= y LINUX_SERIES = 2.6 -LINUX_VER = 2.6.18-92.1.6.el5 +LINUX_VER = 2.6.18-92.1.10.el5 EXTRAVERSION ?= xen diff --git a/configs/kernel-2.6.18-i686-PAE.config b/configs/kernel-2.6.18-i686-PAE.config index e119cdcd..f2e5c7ad 100644 --- a/configs/kernel-2.6.18-i686-PAE.config +++ b/configs/kernel-2.6.18-i686-PAE.config @@ -2,7 +2,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Tue Jul 1 11:03:51 2008 +# Thu Aug 7 09:42:27 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y diff --git 
a/configs/kernel-2.6.18-i686-debug.config b/configs/kernel-2.6.18-i686-debug.config index a8c5d219..d8307fac 100644 --- a/configs/kernel-2.6.18-i686-debug.config +++ b/configs/kernel-2.6.18-i686-debug.config @@ -2,7 +2,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Tue Jul 1 11:03:51 2008 +# Thu Aug 7 09:42:27 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y diff --git a/configs/kernel-2.6.18-i686-xen.config b/configs/kernel-2.6.18-i686-xen.config index e6de9e82..6b43869c 100644 --- a/configs/kernel-2.6.18-i686-xen.config +++ b/configs/kernel-2.6.18-i686-xen.config @@ -2,7 +2,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Tue Jul 1 11:03:51 2008 +# Thu Aug 7 09:42:27 2008 # CONFIG_X86_32=y CONFIG_LOCKDEP_SUPPORT=y diff --git a/configs/kernel-2.6.18-i686.config b/configs/kernel-2.6.18-i686.config index 065bed08..6d342520 100644 --- a/configs/kernel-2.6.18-i686.config +++ b/configs/kernel-2.6.18-i686.config @@ -2,7 +2,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Tue Jul 1 11:03:51 2008 +# Thu Aug 7 09:42:27 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index dbd9b964..b1f861f8 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -147,29 +147,29 @@ static ssize_t bonding_store_bonds(struct class *cls, const char *buffer, size_t ": Unable remove bond %s due to open references.\n", ifname); res = -EPERM; - goto out; + goto out_unlock; } printk(KERN_INFO DRV_NAME ": %s is being deleted...\n", bond->dev->name); bond_destroy(bond); - up_write(&bonding_rwsem); - rtnl_unlock(); - goto out; + goto out_unlock; } printk(KERN_ERR DRV_NAME ": unable to delete non-existent bond %s\n", ifname); res = -ENODEV; - up_write(&bonding_rwsem); - rtnl_unlock(); - goto out; + goto out_unlock; } err_no_cmd: printk(KERN_ERR DRV_NAME ": 
no command found in bonding_masters. Use +ifname or -ifname.\n"); - res = -EPERM; + return -EPERM; + +out_unlock: + up_write(&bonding_rwsem); + rtnl_unlock(); /* Always return either count or an error. If you return 0, you'll * get called forever, which is bad. @@ -254,8 +254,8 @@ static ssize_t bonding_store_slaves(struct class_device *cd, const char *buffer, printk(KERN_ERR DRV_NAME ": %s: Unable to update slaves because interface is down.\n", bond->dev->name); - ret = -EPERM; - goto out; + /* early return before rtnl_lock() */ + return -EPERM; } /* Note: We can't hold bond->lock here, as bond_create grabs it. */ diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index 6a98b7ae..3c414dd8 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -64,7 +64,9 @@ static int irtty_chars_in_buffer(struct sir_dev *dev) IRDA_ASSERT(priv != NULL, return -1;); IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return -1;); - return priv->tty->driver->chars_in_buffer(priv->tty); + if (priv->tty->driver->chars_in_buffer) + return priv->tty->driver->chars_in_buffer(priv->tty); + return 0; } /* Wait (sleep) until underlaying hardware finished transmission diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index 23659fd7..30bb05b0 100644 --- a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -158,6 +158,9 @@ ppp_asynctty_open(struct tty_struct *tty) struct asyncppp *ap; int err; + if (!tty->driver->write) + return -EOPNOTSUPP; + err = -ENOMEM; ap = kmalloc(sizeof(*ap), GFP_KERNEL); if (ap == 0) diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c index 33255fe8..c49d6c9e 100644 --- a/drivers/net/ppp_synctty.c +++ b/drivers/net/ppp_synctty.c @@ -207,6 +207,9 @@ ppp_sync_open(struct tty_struct *tty) struct syncppp *ap; int err; + if (!tty->driver->write) + return -EOPNOTSUPP; + ap = kmalloc(sizeof(*ap), GFP_KERNEL); err = -ENOMEM; if (ap == 0) diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 
1588cb7f..13025d42 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -463,9 +463,14 @@ static void sl_tx_timeout(struct net_device *dev) /* 20 sec timeout not reached */ goto out; } - printk(KERN_WARNING "%s: transmit timed out, %s?\n", dev->name, - (sl->tty->driver->chars_in_buffer(sl->tty) || sl->xleft) ? - "bad line quality" : "driver error"); + { + int cib = 0; + if (sl->tty->driver->chars_in_buffer) + cib = sl->tty->driver->chars_in_buffer(sl->tty); + printk(KERN_WARNING "%s: transmit timed out, %s?\n", + dev->name, (cib || sl->xleft) ? + "bad line quality" : "driver error"); + } sl->xleft = 0; sl->tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP); sl_unlock(sl); @@ -836,6 +841,8 @@ static int slip_open(struct tty_struct *tty) if(!capable(CAP_NET_ADMIN)) return -EPERM; + if (!tty->driver->write) + return -EOPNOTSUPP; /* RTnetlink lock is misused here to serialize concurrent opens of slip channels. There are better ways, but it is diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index f7c9f233..96eec5cc 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -1762,7 +1762,11 @@ static int serial8250_startup(struct uart_port *port) */ serial_outp(up, UART_LCR, UART_LCR_WLEN8); - spin_lock_irqsave(&up->port.lock, flags); + if (is_real_interrupt(up->port.irq)) { + spin_lock_irqsave(&irq_lists[up->port.irq].lock, flags); + spin_lock(&up->port.lock); + } else + spin_lock_irqsave(&up->port.lock, flags); if (up->port.flags & UPF_FOURPORT) { if (!is_real_interrupt(up->port.irq)) up->port.mctrl |= TIOCM_OUT1; @@ -1794,7 +1798,11 @@ static int serial8250_startup(struct uart_port *port) up->bugs &= ~UART_BUG_TXEN; } - spin_unlock_irqrestore(&up->port.lock, flags); + if (is_real_interrupt(up->port.irq)) { + spin_unlock(&up->port.lock); + spin_unlock_irqrestore(&irq_lists[up->port.irq].lock, flags); + } else + spin_unlock_irqrestore(&up->port.lock, flags); /* * Finally, enable interrupts. 
Note: Modem status interrupts diff --git a/include/linux/sched.h b/include/linux/sched.h index 4bf2f475..c7502a67 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1077,6 +1077,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_DEAD 0x00000008 /* Dead */ +#define PF_EXITPIDONE 0x00000010 /* pi exit done on shut down */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/include/net/udp.h b/include/net/udp.h index dd5e3b67..97fe4b5a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -37,8 +37,6 @@ extern struct hlist_head udp_hash[UDP_HTABLE_SIZE]; extern rwlock_t udp_hash_lock; -extern int udp_port_rover; - static inline int udp_lport_inuse(u16 num) { struct sock *sk; diff --git a/kernel/exit.c b/kernel/exit.c index 45a15629..24d2b68e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -827,13 +827,29 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->flags & PF_EXITING)) { printk(KERN_ALERT "Fixing recursive fault but reboot is needed!\n"); + /* + * We can do this unlocked here. The futex code uses + * this flag just to verify whether the pi state + * cleanup has been done or not. In the worst case it + * loops once more. We pretend that the cleanup was + * done as there is no way to return. Either the + * OWNER_DIED bit is set by now or we push the blocked + * task into the wait for ever nirwana as well. + */ + tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } + /* + * tsk->flags are checked in the futex code to protect against + * an exiting task cleaning up the robust pi futexes. 
+ */ + spin_lock_irq(&tsk->pi_lock); tsk->flags |= PF_EXITING; + spin_unlock_irq(&tsk->pi_lock); ptrace_exit(tsk); @@ -851,7 +867,7 @@ fastcall NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - hrtimer_cancel(&tsk->signal->real_timer); + hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } @@ -913,6 +929,12 @@ fastcall NORET_TYPE void do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(tsk); + /* + * We can do this unlocked here. The futex code uses this flag + * just to verify whether the pi state cleanup has been done + * or not. In the worst case it loops once more. + */ + tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(); diff --git a/kernel/futex.c b/kernel/futex.c index 9d260e83..b7b11976 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -391,18 +391,12 @@ static struct task_struct * futex_find_get_task(pid_t pid) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - if (!p) - goto out_unlock; - if ((current->euid != p->euid) && (current->euid != p->uid)) { - p = NULL; - goto out_unlock; - } - if (p->exit_state != 0) { - p = NULL; - goto out_unlock; - } - get_task_struct(p); -out_unlock: + + if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) + p = ERR_PTR(-ESRCH); + else + get_task_struct(p); + read_unlock(&tasklist_lock); return p; @@ -468,7 +462,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) struct futex_q *this, *next; struct list_head *head; struct task_struct *p; - pid_t pid; + pid_t pid = uval & FUTEX_TID_MASK; head = &hb->chain; @@ -486,6 +480,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(pid && pi_state->owner && + pi_state->owner->pid != pid); atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -496,15 +492,33 @@ lookup_pi_state(u32 uval, struct 
futex_hash_bucket *hb, struct futex_q *me) /* * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when the owner died bit is set - * and TID = 0: + * the new pi_state to it, but bail out when TID = 0 */ - pid = uval & FUTEX_TID_MASK; - if (!pid && (uval & FUTEX_OWNER_DIED)) + if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (!p) - return -ESRCH; + if (IS_ERR(p)) + return PTR_ERR(p); + + /* + * We need to look at the task state flags to figure out, + * whether the task is exiting. To protect against the do_exit + * change of the task flags, we do this protected by + * p->pi_lock: + */ + spin_lock_irq(&p->pi_lock); + if (unlikely(p->flags & PF_EXITING)) { + /* + * The task is on the way out. When PF_EXITPIDONE is + * set, we know that the task has finished the + * cleanup: + */ + int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; + + spin_unlock_irq(&p->pi_lock); + put_task_struct(p); + return ret; + } pi_state = alloc_pi_state(); @@ -517,7 +531,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) /* Store the key for possible exit cleanups: */ pi_state->key = me->key; - spin_lock_irq(&p->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; @@ -566,6 +579,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -583,15 +597,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * preserve the owner died bit.) 
*/ if (!(uval & FUTEX_OWNER_DIED)) { + int ret = 0; newval = FUTEX_WAITERS | new_owner->pid; inc_preempt_count(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); dec_preempt_count(); if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + ret = -EFAULT; + else if (curval != uval) + ret = -EINVAL; + if (ret) { + spin_unlock(&pi_state->pi_mutex.wait_lock); + return ret; + } } spin_lock_irq(&pi_state->owner->pi_lock); @@ -605,6 +624,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) pi_state->owner = new_owner; spin_unlock_irq(&new_owner->pi_lock); + spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1001,6 +1021,60 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) drop_key_refs(&q->key); } +/* + * Fixup the pi_state owner with the new owner. + * + * The cur->mm semaphore must be held, it is released at return of this + * function. + */ +static int fixup_pi_state_owner(u32 *uaddr, struct futex_q *q, + struct task_struct *newowner) +{ + u32 newtid = newowner->pid | FUTEX_WAITERS; + struct futex_pi_state *pi_state = q->pi_state; + u32 uval, curval, newval; + int ret; + + /* Owner died? */ + if (pi_state->owner != NULL) { + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + pi_state->owner = newowner; + + spin_lock_irq(&newowner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &newowner->pi_state_list); + spin_unlock_irq(&newowner->pi_lock); + + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. 
+ */ + ret = get_futex_value_locked(&uval, uaddr); + + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + return ret; +} + static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) { struct task_struct *curr = current; @@ -1128,7 +1202,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, attempt = 0; + int ret, lock_taken, ownerdied = 0, attempt = 0; if (refill_pi_state_cache()) return -ENOMEM; @@ -1148,9 +1222,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely(ret != 0)) goto out_release_sem; + retry_unlocked: hb = queue_lock(&q, -1, NULL); retry_locked: + ret = lock_taken = 0; + /* * To avoid races, we attempt to take the lock here again * (by doing a 0 -> TID atomic cmpxchg), while holding all @@ -1165,24 +1242,44 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely(curval == -EFAULT)) goto uaddr_faulted; - /* We own the lock already */ + /* + * Detect deadlocks. In case of REQUEUE_PI this is a valid + * situation and we return success to user space. + */ if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { - if (!detect && 0) - force_sig(SIGKILL, current); ret = -EDEADLK; goto out_unlock_release_sem; } /* - * Surprise - we got the lock. Just return - * to userspace: + * Surprise - we got the lock. Just return to userspace: */ if (unlikely(!curval)) goto out_unlock_release_sem; uval = curval; + + /* + * Set the WAITERS flag, so the owner will know it has someone + * to wake at next unlock + */ newval = uval | FUTEX_WAITERS; + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED or REQUEUE. 
We take over the + * futex in this case. We also do an unconditional take over, + * when the owner of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | current->pid; + ownerdied = 0; + lock_taken = 1; + } + inc_preempt_count(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); dec_preempt_count(); @@ -1192,6 +1289,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (unlikely(curval != uval)) goto retry_locked; + /* + * We took the lock due to requeue or owner died take over. + */ + if (unlikely(lock_taken)) + goto out_unlock_release_sem; + /* * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): @@ -1199,34 +1302,39 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, ret = lookup_pi_state(uval, hb, &q); if (unlikely(ret)) { - /* - * There were no waiters and the owner task lookup - * failed. When the OWNER_DIED bit is set, then we - * know that this is a robust futex and we actually - * take the lock. This is safe as we are protected by - * the hash bucket lock. We also set the waiters bit - * unconditionally here, to simplify glibc handling of - * multiple tasks racing to acquire the lock and - * cleanup the problems which were left by the dead - * owner. - */ - if (curval & FUTEX_OWNER_DIED) { - uval = newval; - newval = current->pid | - FUTEX_OWNER_DIED | FUTEX_WAITERS; + switch (ret) { - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - dec_preempt_count(); + case -EAGAIN: + /* + * Task is exiting and we just wait for the + * exit to complete. + */ + queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + cond_resched(); + goto retry; - if (unlikely(curval == -EFAULT)) + case -ESRCH: + /* + * No owner found for this futex. 
Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) goto uaddr_faulted; - if (unlikely(curval != uval)) + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; goto retry_locked; - ret = 0; + } + default: + goto out_unlock_release_sem; } - goto out_unlock_release_sem; } /* @@ -1255,65 +1363,63 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, down_read(&curr->mm->mmap_sem); spin_lock(q.lock_ptr); - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case. - */ - if (!ret && q.pi_state->owner != curr) { - u32 newtid = current->pid | FUTEX_WAITERS; - - /* Owner died? */ - if (q.pi_state->owner != NULL) { - spin_lock_irq(&q.pi_state->owner->pi_lock); - WARN_ON(list_empty(&q.pi_state->list)); - list_del_init(&q.pi_state->list); - spin_unlock_irq(&q.pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; - - q.pi_state->owner = current; - - spin_lock_irq(&current->pi_lock); - WARN_ON(!list_empty(&q.pi_state->list)); - list_add(&q.pi_state->list, &current->pi_state_list); - spin_unlock_irq(&current->pi_lock); - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); + if (!ret) { /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * Got the lock.
We might not be the anticipated owner + * if we did a lock-steal - fix up the PI-state in + * that case: */ - ret = get_user(uval, uaddr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } + if (q.pi_state->owner != curr) + ret = fixup_pi_state_owner(uaddr, &q, curr); } else { /* * Catch the rare case, where the lock was released - * when we were on the way back before we locked - * the hash bucket. + * when we were on the way back before we locked the + * hash bucket. */ - if (ret && q.pi_state->owner == curr) { + if (q.pi_state->owner == curr) { if (rt_mutex_trylock(&q.pi_state->pi_mutex)) ret = 0; + else { + /* + * pi_state is incorrect, some other + * task did a lock steal and we + * returned due to timeout or signal + * without taking the rt_mutex. Too + * late. We can access the + * rt_mutex_owner without locking, as + * the other task is now blocked on + * the hash bucket lock. Fix the state + * up. + */ + struct task_struct *owner; + int res; + + owner = rt_mutex_owner(&q.pi_state->pi_mutex); + res = fixup_pi_state_owner(uaddr, &q, owner); + + /* propagate -EFAULT, if the fixup failed */ + if (res) + ret = res; + } + } else { + /* + * Paranoia check. If we did not take the lock + * in the trylock above, then we should not be + * the owner of the rtmutex, neither the real + * nor the pending one: + */ + if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) + printk(KERN_ERR "futex_lock_pi: ret = %d " + "pi-mutex: %p pi-state %p\n", ret, + q.pi_state->pi_mutex.owner, + q.pi_state->owner); } - /* Unqueue and drop the lock */ - unqueue_me_pi(&q, hb); - up_read(&curr->mm->mmap_sem); } - - if (!detect && ret == -EDEADLK && 0) - force_sig(SIGKILL, current); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); return ret != -EINTR ? 
ret : -ERESTARTNOINTR; @@ -1330,16 +1436,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while * still holding the mmap_sem. + * + * ... and hb->lock. :-) --ANK */ + queue_unlock(&q, hb); + if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; - goto out_unlock_release_sem; - } - goto retry_locked; + ret = futex_handle_fault((unsigned long)uaddr, attempt); + if (ret) + goto out_release_sem; + goto retry_unlocked; } - queue_unlock(&q, hb); up_read(&curr->mm->mmap_sem); ret = get_user(uval, uaddr); @@ -1381,9 +1489,9 @@ retry: goto out; hb = hash_futex(&key); +retry_unlocked: spin_lock(&hb->lock); -retry_locked: /* * To avoid races, try to do the TID -> 0 atomic transition * again. If it succeeds then we can return without waking @@ -1445,16 +1553,19 @@ pi_faulted: * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while * still holding the mmap_sem. + * + * ... and hb->lock. 
--ANK */ + spin_unlock(&hb->lock); + if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) { - ret = -EFAULT; - goto out_unlock; - } - goto retry_locked; + ret = futex_handle_fault((unsigned long)uaddr, attempt); + if (ret) + goto out; + uval = 0; + goto retry_unlocked; } - spin_unlock(&hb->lock); up_read(&current->mm->mmap_sem); ret = get_user(uval, uaddr); @@ -1716,9 +1827,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; unsigned long futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -1741,7 +1853,13 @@ void exit_robust_list(struct task_struct *curr) if (pending) handle_futex_death((void *)pending + futex_offset, curr, pip); + next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: @@ -1750,11 +1868,10 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void *)entry + futex_offset, curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&entry, &entry->next, &pi)) + if (rc) return; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -1763,6 +1880,10 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, diff --git a/kernel/futex_compat.c
b/kernel/futex_compat.c index c5cca3f6..a31f13d4 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -29,6 +29,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, return 0; } +static void __user *futex_uaddr(struct robust_list *entry, + compat_long_t futex_offset) +{ + compat_uptr_t base = ptr_to_compat(entry); + void __user *uaddr = compat_ptr(base + futex_offset); + + return uaddr; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -38,10 +47,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - compat_uptr_t uentry, upending; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -61,25 +71,30 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (upending) - handle_futex_death((void *)pending + futex_offset, curr, pip); - while (compat_ptr(uentry) != &head->list) { + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ - if (entry != pending) - if (handle_futex_death((void *)entry + futex_offset, - curr, pi)) - return; + if (entry != pending) { + void 
*uaddr = futex_uaddr(entry, futex_offset); - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t *)&entry->next, &pi)) + if (handle_futex_death(uaddr, curr, pi)) + return; + } + if (rc) return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -88,6 +103,11 @@ void compat_exit_robust_list(struct task_struct *curr) cond_resched(); } + if (pending) { + void *uaddr = futex_uaddr(pending, futex_offset); + + handle_futex_death(uaddr, curr, pip); + } } asmlinkage long diff --git a/kernel/signal.c b/kernel/signal.c index 201917b0..53fb8cff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -100,7 +100,11 @@ static int recalc_sigpending_tsk(struct task_struct *t) set_tsk_thread_flag(t, TIF_SIGPENDING); return 1; } - clear_tsk_thread_flag(t, TIF_SIGPENDING); + /* + * We must never clear the flag in another thread, or in current + * when it's possible the current syscall is returning -ERESTART*. + * So we don't clear it here, and only callers who know they should do. + */ return 0; } @@ -116,7 +120,9 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - recalc_sigpending_tsk(current); + if (!recalc_sigpending_tsk(current)) + clear_thread_flag(TIF_SIGPENDING); + } /* Given the mask, find the first available signal that should be serviced. */ @@ -327,7 +333,6 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, sig = 0; } - recalc_sigpending(); return sig; } @@ -344,7 +349,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); - if (signr && unlikely(sig_kernel_stop(signr))) { + recalc_sigpending(); + if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. 
Our * caller might release the siglock and then the pending diff --git a/kernel/softlockup.c b/kernel/softlockup.c index c9aad283..8248555c 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -89,6 +89,12 @@ void softlockup_tick(struct pt_regs *regs) return; } + /* do not print during early bootup: */ + if (unlikely(system_state != SYSTEM_RUNNING)) { + touch_softlockup_watchdog(); + return; + } + now = jiffies; /* Wake up the high-prio watchdog task every second: */ diff --git a/kernel/sys.c b/kernel/sys.c index 21bf335d..7695fc56 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1808,6 +1808,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (retval) return retval; + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + task_lock(current->group_leader); *old_rlim = new_rlim; task_unlock(current->group_leader); @@ -1829,15 +1839,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) unsigned long rlim_cur = new_rlim.rlim_cur; cputime_t cputime; - if (rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". 
So let's cheat and make it one second - * instead - */ - rlim_cur = 1; - } cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(&current->sighand->siglock); diff --git a/mm/filemap.c b/mm/filemap.c index 9e3585e2..6605ba75 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2140,21 +2140,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, /* Limit the size of the copy to the caller's write size */ bytes = min(bytes, count); - /* - * Limit the size of the copy to that of the current segment, - * because fault_in_pages_readable() doesn't know how to walk - * segments. + /* We only need to worry about prefaulting when writes are from + * user-space. NFSd uses vfs_writev with several non-aligned + * segments in the vector, and limiting to one segment a time is + * a noticeable performance for re-write */ - bytes = min(bytes, cur_iov->iov_len - iov_base); - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - fault_in_pages_readable(buf, bytes); + if (!segment_eq(get_fs(), KERNEL_DS)) { + /* + * Limit the size of the copy to that of the current + * segment, because fault_in_pages_readable() doesn't + * know how to walk segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); + /* + * Bring in the user page that we will copy from + * _first_. Otherwise there's a nasty deadlock on + * copying from the same page as we're writing to, + * without it being marked up-to-date. 
+ */ + fault_in_pages_readable(buf, bytes); + } page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { status = -ENOMEM; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a3466f7..a791d3bb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -131,9 +131,6 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); -/* Shared by v4/v6 udp. */ -int udp_port_rover; - static int udp_v4_get_port(struct sock *sk, unsigned short snum) { struct hlist_node *node; @@ -141,47 +138,51 @@ static int udp_v4_get_port(struct sock *sk, unsigned short snum) struct inet_sock *inet = inet_sk(sk); write_lock_bh(&udp_hash_lock); - if (snum == 0) { - int best_size_so_far, best, result, i, low, high; + if (!snum) { + int i, low, high, remaining; + unsigned rover, best, best_size_so_far; inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + + best_size_so_far = UINT_MAX; + best = rover = net_random() % remaining + low; - if (udp_port_rover > high || - udp_port_rover < low) - udp_port_rover = low; - best_size_so_far = 32767; - best = result = udp_port_rover; - for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + /* 1st pass: look for empty (or shortest) hash chain */ + for (i = 0; i < UDP_HTABLE_SIZE; i++) { struct hlist_head *list; - int size; + int size = 0; - list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; - if (hlist_empty(list)) { - if (result > high) - result = low + ((result - low) & - (UDP_HTABLE_SIZE - 1)); + list = &udp_hash[rover & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(list)) goto gotit; - } - size = 0; + sk_for_each(sk2, node, list) if (++size >= best_size_so_far) goto next; best_size_so_far = size; - best = result; - next:; + best = rover; + next: + /* fold back if end of range */ + if (++rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); } - result = best; - for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { - if (result > high) - 
result = low + ((result - low) & - (UDP_HTABLE_SIZE - 1)); - if (!udp_lport_inuse(result)) - break; + /* 2nd pass: find hole in shortest hash chain */ + rover = best; + for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { + if (!udp_lport_inuse(rover)) + goto gotit; + rover += UDP_HTABLE_SIZE; + if (rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); } - if (i >= (1 << 16) / UDP_HTABLE_SIZE) - goto fail; + /* All ports in use! */ + goto fail; + gotit: - udp_port_rover = snum = result; + snum = rover; } else { sk_for_each(sk2, node, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { @@ -1648,7 +1649,6 @@ EXPORT_SYMBOL(udp_disconnect); EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(udp_hash_lock); EXPORT_SYMBOL(udp_ioctl); -EXPORT_SYMBOL(udp_port_rover); EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(udp_sendmsg); EXPORT_SYMBOL(udp_poll); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 836eecd7..cb81464d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -396,9 +396,9 @@ static int ipip6_rcv(struct sk_buff *skb) } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - kfree_skb(skb); read_unlock(&ipip6_lock); out: + kfree_skb(skb); return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 69dc6616..1a5d72b4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -70,45 +70,51 @@ static int udp_v6_get_port(struct sock *sk, unsigned short snum) struct hlist_node *node; write_lock_bh(&udp_hash_lock); - if (snum == 0) { - int best_size_so_far, best, result, i, low, high; + if (!snum) { + int i, low, high, remaining; + unsigned rover, best, best_size_so_far; inet_get_local_port_range(&low, &high); - if (udp_port_rover > high || udp_port_rover < low) - udp_port_rover = low; - best_size_so_far = 32767; - best = result = udp_port_rover; - for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { - int size; + remaining = (high - low) + 1; + + best_size_so_far = UINT_MAX; + best = rover = net_random() % remaining + low; + + /* 1st pass: look for empty (or shortest) hash chain */ 
+ for (i = 0; i < UDP_HTABLE_SIZE; i++) { + int size = 0; struct hlist_head *list; - list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; - if (hlist_empty(list)) { - if (result > high) - result = low + ((result - low) & - (UDP_HTABLE_SIZE - 1)); + list = &udp_hash[rover & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(list)) goto gotit; - } - size = 0; + sk_for_each(sk2, node, list) if (++size >= best_size_so_far) goto next; best_size_so_far = size; - best = result; - next:; + best = rover; + next: + /* fold back if end of range */ + if (++rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); } - result = best; - for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { - if (result > high) - result = low + ((result - low) & - (UDP_HTABLE_SIZE - 1)); - if (!udp_lport_inuse(result)) - break; + /* 2nd pass: find hole in shortest hash chain */ + rover = best; + for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { + if (!udp_lport_inuse(rover)) + goto gotit; + rover += UDP_HTABLE_SIZE; + if (rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); } - if (i >= (1 << 16) / UDP_HTABLE_SIZE) - goto fail; + /* All ports in use! */ + goto fail; + gotit: - udp_port_rover = snum = result; + snum = rover; } else { sk_for_each(sk2, node, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9adc06f5..cdddff50 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4024,7 +4024,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs_old))) return -EFAULT; - if (getaddrs.addr_num <= 0) return -EINVAL; + if (getaddrs.addr_num <= 0 || + getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr))) + return -EINVAL; /* * For UDP-style sockets, id specifies the association to query. 
* If the id field is set to the value '0' then the locally bound diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index de6ec519..acde97ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1711,20 +1711,22 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, int chunk; struct sk_buff *skb; + unix_state_rlock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb==NULL) { if (copied >= target) - break; + goto unlock; /* * POSIX 1003.1g mandates this order. */ if ((err = sock_error(sk)) != 0) - break; + goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) - break; + goto unlock; + unix_state_runlock(sk); err = -EAGAIN; if (!timeo) break; @@ -1738,7 +1740,11 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } mutex_lock(&u->readlock); continue; +unlock: + unix_state_runlock(sk); + break; } + unix_state_runlock(sk); if (check_creds) { /* Never glue messages from different writers */