- [ia64] softlockup: prevent endless warnings in kdump (Neil Horman ) [456117 453200]
* Wed Jul 16 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.9.el5]
- [misc] signaling msgrcv() should not pass back error (Jiri Pirko ) [455278 452533]
- [ia64] properly unregister legacy interrupts (Prarit Bhargava ) [450337 445886]
* Mon Jul 14 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.8.el5]
- [net] randomize udp port allocation (Eugene Teo ) [454571 454572]
- [tty] add NULL pointer checks (Aristeu Rozanski ) [453425 453154] {CVE-2008-2812}
- [net] sctp: make sure sctp_addr does not overflow (David S. Miller ) [452482 452483] {CVE-2008-2826}
- [sys] sys_setrlimit: prevent setting RLIMIT_CPU to 0 (Neil Horman ) [437121 437122] {CVE-2008-1294}
- [net] sit: exploitable remote memory leak (Jiri Pirko ) [446038 446039] {CVE-2008-2136}
- [misc] ttyS1 lost interrupt, stops transmitting v2 (Brian Maly ) [455256 451157]
- [misc] ttyS1 loses interrupt and stops transmitting (Simon McGrath ) [443071 440121]
* Thu Jul 10 2008 Jiri Pirko <jpirko@redhat.com> [2.6.18-92.1.7.el5]
- [x86_64]: extend MCE banks support for Dunnington, Nehalem (Prarit Bhargava ) [451941 446673]
- [nfs] address nfs rewrite performance regression in RHEL5 (Eric Sandeen ) [448685 436004]
- [mm] Make mmap() with PROT_WRITE imply PROT_READ on RHEL5 (Larry Woodman ) [450758 448978]
- [i386]: Add check for supported_cpus in powernow_k8 driver (Prarit Bhargava ) [450866 443853]
- [i386]: Add check for dmi_data in powernow_k8 driver (Prarit Bhargava ) [450866 443853]
- [net] fix recv return zero (Thomas Graf ) [452231 435657]
- [misc] kernel crashes on futex (Anton Arapov ) [450336 435178]
- [net] Fixing bonding rtnl_lock screwups (Fabio Olive Leite ) [451939 450219]
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 18
-EXTRAVERSION = -92.1.6.el5
+EXTRAVERSION = -92.1.10.el5
RHEL_MAJOR = 5
RHEL_MINOR = 2
NAME=Avast! A bilge rat!
case 1: /* read, present */
goto bad_area;
case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
return;
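+	/*
+	 * Legacy ISA IRQs (GSI < 16) behind an i8259 are never registered
+	 * with the IOSAPIC in the first place, so there is nothing to
+	 * unregister for them here.
+	 */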
+ if (has_8259 && gsi < 16)
+ return;
+
iosapic_unregister_intr(gsi);
}
# error File is out of sync with <linux/mm.h>. Please update.
# endif
+ if (((isr >> IA64_ISR_R_BIT) & 1UL) && (!(vma->vm_flags & (VM_READ | VM_WRITE))))
+ goto bad_area;
+
mask = ( (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
- | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)
- | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT));
+ | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
if ((vma->vm_flags & mask) != mask)
goto bad_area;
/* protection fault */
if (error_code & 0x08000000)
goto bad_area;
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
/* protection fault */
if (error_code & 0x08000000)
goto bad_area;
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
#include <asm/smp.h>
#define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
+#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
barrier();
for (i = 0; i < banks; i++) {
- if (!bank[i])
+ if (i < NR_SYSFS_BANKS && !bank[i])
continue;
m.misc = 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
banks = cap & 0xff;
- if (banks > NR_BANKS) {
- printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
- banks = NR_BANKS;
+ if (banks > MCE_EXTENDED_BANK) {
+ printk(KERN_INFO "MCE: warning: using only %d banks\n",
+ MCE_EXTENDED_BANK);
+ banks = MCE_EXTENDED_BANK;
}
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+/* TBD should generate these dynamically based on number of available banks.
+ * Have only 6 control banks in /sysfs until then.
+ */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
-static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
+static struct sysdev_attribute * bank_attributes[NR_SYSFS_BANKS] = {
&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
ACCESSOR(tolerant,tolerant,)
err = sysdev_register(&per_cpu(device_mce,cpu));
if (!err) {
- for (i = 0; i < banks; i++)
+ for (i = 0; i < NR_SYSFS_BANKS; i++)
sysdev_create_file(&per_cpu(device_mce,cpu),
bank_attributes[i]);
sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
case PF_PROT: /* read, present */
goto bad_area;
case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
XEN_TARGET_X86_PAE ?= y
LINUX_SERIES = 2.6
-LINUX_VER = 2.6.18-92.1.6.el5
+LINUX_VER = 2.6.18-92.1.10.el5
EXTRAVERSION ?= xen
#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.18-prep
-# Tue Jul 1 11:03:51 2008
+# Thu Aug 7 09:42:27 2008
#
CONFIG_X86_32=y
CONFIG_GENERIC_TIME=y
#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.18-prep
-# Tue Jul 1 11:03:51 2008
+# Thu Aug 7 09:42:27 2008
#
CONFIG_X86_32=y
CONFIG_GENERIC_TIME=y
#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.18-prep
-# Tue Jul 1 11:03:51 2008
+# Thu Aug 7 09:42:27 2008
#
CONFIG_X86_32=y
CONFIG_LOCKDEP_SUPPORT=y
#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.18-prep
-# Tue Jul 1 11:03:51 2008
+# Thu Aug 7 09:42:27 2008
#
CONFIG_X86_32=y
CONFIG_GENERIC_TIME=y
": Unable remove bond %s due to open references.\n",
ifname);
res = -EPERM;
- goto out;
+ goto out_unlock;
}
printk(KERN_INFO DRV_NAME
": %s is being deleted...\n",
bond->dev->name);
bond_destroy(bond);
- up_write(&bonding_rwsem);
- rtnl_unlock();
- goto out;
+ goto out_unlock;
}
printk(KERN_ERR DRV_NAME
": unable to delete non-existent bond %s\n", ifname);
res = -ENODEV;
- up_write(&bonding_rwsem);
- rtnl_unlock();
- goto out;
+ goto out_unlock;
}
err_no_cmd:
printk(KERN_ERR DRV_NAME
": no command found in bonding_masters. Use +ifname or -ifname.\n");
- res = -EPERM;
+ return -EPERM;
+
+out_unlock:
+ up_write(&bonding_rwsem);
+ rtnl_unlock();
/* Always return either count or an error. If you return 0, you'll
* get called forever, which is bad.
printk(KERN_ERR DRV_NAME
": %s: Unable to update slaves because interface is down.\n",
bond->dev->name);
- ret = -EPERM;
- goto out;
+ /* early return before rtnl_lock() */
+ return -EPERM;
}
/* Note: We can't hold bond->lock here, as bond_create grabs it. */
IRDA_ASSERT(priv != NULL, return -1;);
IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return -1;);
- return priv->tty->driver->chars_in_buffer(priv->tty);
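+	/*
+	 * Not every tty driver implements chars_in_buffer(); treat a
+	 * missing hook as an empty transmit buffer rather than
+	 * dereferencing a NULL pointer (CVE-2008-2812).
+	 */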
+ if (priv->tty->driver->chars_in_buffer)
+ return priv->tty->driver->chars_in_buffer(priv->tty);
+ return 0;
}
/* Wait (sleep) until underlying hardware finished transmission
struct asyncppp *ap;
int err;
+ if (!tty->driver->write)
+ return -EOPNOTSUPP;
+
err = -ENOMEM;
ap = kmalloc(sizeof(*ap), GFP_KERNEL);
if (ap == 0)
struct syncppp *ap;
int err;
+ if (!tty->driver->write)
+ return -EOPNOTSUPP;
+
ap = kmalloc(sizeof(*ap), GFP_KERNEL);
err = -ENOMEM;
if (ap == 0)
/* 20 sec timeout not reached */
goto out;
}
- printk(KERN_WARNING "%s: transmit timed out, %s?\n", dev->name,
- (sl->tty->driver->chars_in_buffer(sl->tty) || sl->xleft) ?
- "bad line quality" : "driver error");
+ {
+ int cib = 0;
+ if (sl->tty->driver->chars_in_buffer)
+ cib = sl->tty->driver->chars_in_buffer(sl->tty);
+ printk(KERN_WARNING "%s: transmit timed out, %s?\n",
+ dev->name, (cib || sl->xleft) ?
+ "bad line quality" : "driver error");
+ }
sl->xleft = 0;
sl->tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP);
sl_unlock(sl);
if(!capable(CAP_NET_ADMIN))
return -EPERM;
+ if (!tty->driver->write)
+ return -EOPNOTSUPP;
/* RTnetlink lock is misused here to serialize concurrent
opens of slip channels. There are better ways, but it is
*/
serial_outp(up, UART_LCR, UART_LCR_WLEN8);
- spin_lock_irqsave(&up->port.lock, flags);
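+	/*
+	 * When the port has a real (possibly shared) IRQ, take the per-IRQ
+	 * list lock before the port lock so the startup-time TX interrupt
+	 * probing below cannot run concurrently with serial8250_interrupt()
+	 * servicing the same IRQ.
+	 */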
+ if (is_real_interrupt(up->port.irq)) {
+ spin_lock_irqsave(&irq_lists[up->port.irq].lock, flags);
+ spin_lock(&up->port.lock);
+ } else
+ spin_lock_irqsave(&up->port.lock, flags);
if (up->port.flags & UPF_FOURPORT) {
if (!is_real_interrupt(up->port.irq))
up->port.mctrl |= TIOCM_OUT1;
up->bugs &= ~UART_BUG_TXEN;
}
- spin_unlock_irqrestore(&up->port.lock, flags);
+ if (is_real_interrupt(up->port.irq)) {
+ spin_unlock(&up->port.lock);
+ spin_unlock_irqrestore(&irq_lists[up->port.irq].lock, flags);
+ } else
+ spin_unlock_irqrestore(&up->port.lock, flags);
/*
* Finally, enable interrupts. Note: Modem status interrupts
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_DEAD 0x00000008 /* Dead */
+#define PF_EXITPIDONE 0x00000010 /* pi exit done on shut down */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
extern rwlock_t udp_hash_lock;
-extern int udp_port_rover;
-
static inline int udp_lport_inuse(u16 num)
{
struct sock *sk;
if (unlikely(tsk->flags & PF_EXITING)) {
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
+ /*
+ * We can do this unlocked here. The futex code uses
+ * this flag just to verify whether the pi state
+ * cleanup has been done or not. In the worst case it
+ * loops once more. We pretend that the cleanup was
+ * done as there is no way to return. Either the
+ * OWNER_DIED bit is set by now or we push the blocked
+		 * task into the wait-forever nirvana as well.
+ */
+ tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context();
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
+ /*
+ * tsk->flags are checked in the futex code to protect against
+ * an exiting task cleaning up the robust pi futexes.
+ */
+ spin_lock_irq(&tsk->pi_lock);
tsk->flags |= PF_EXITING;
+ spin_unlock_irq(&tsk->pi_lock);
ptrace_exit(tsk);
}
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
- hrtimer_cancel(&tsk->signal->real_timer);
+ hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
}
* Make sure we are holding no locks:
*/
debug_check_no_locks_held(tsk);
+ /*
+ * We can do this unlocked here. The futex code uses this flag
+ * just to verify whether the pi state cleanup has been done
+ * or not. In the worst case it loops once more.
+ */
+ tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context();
read_lock(&tasklist_lock);
p = find_task_by_pid(pid);
- if (!p)
- goto out_unlock;
- if ((current->euid != p->euid) && (current->euid != p->uid)) {
- p = NULL;
- goto out_unlock;
- }
- if (p->exit_state != 0) {
- p = NULL;
- goto out_unlock;
- }
- get_task_struct(p);
-out_unlock:
+
+ if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+ p = ERR_PTR(-ESRCH);
+ else
+ get_task_struct(p);
+
read_unlock(&tasklist_lock);
return p;
struct futex_q *this, *next;
struct list_head *head;
struct task_struct *p;
- pid_t pid;
+ pid_t pid = uval & FUTEX_TID_MASK;
head = &hb->chain;
return -EINVAL;
WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(pid && pi_state->owner &&
+ pi_state->owner->pid != pid);
atomic_inc(&pi_state->refcount);
me->pi_state = pi_state;
/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when the owner died bit is set
- * and TID = 0:
+ * the new pi_state to it, but bail out when TID = 0
*/
- pid = uval & FUTEX_TID_MASK;
- if (!pid && (uval & FUTEX_OWNER_DIED))
+ if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
- if (!p)
- return -ESRCH;
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ /*
+	 * We need to look at the task state flags to figure out
+ * whether the task is exiting. To protect against the do_exit
+ * change of the task flags, we do this protected by
+ * p->pi_lock:
+ */
+ spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->flags & PF_EXITING)) {
+ /*
+ * The task is on the way out. When PF_EXITPIDONE is
+ * set, we know that the task has finished the
+ * cleanup:
+ */
+ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+ spin_unlock_irq(&p->pi_lock);
+ put_task_struct(p);
+ return ret;
+ }
pi_state = alloc_pi_state();
/* Store the key for possible exit cleanups: */
pi_state->key = me->key;
- spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
pi_state->owner = p;
if (!pi_state)
return -EINVAL;
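+	/*
+	 * Hold the rt_mutex wait_lock so the next owner returned by
+	 * rt_mutex_next_owner() stays stable while the user space value
+	 * and the pi_state owner are updated below.
+	 */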
+ spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
* preserve the owner died bit.)
*/
if (!(uval & FUTEX_OWNER_DIED)) {
+ int ret = 0;
newval = FUTEX_WAITERS | new_owner->pid;
inc_preempt_count();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
dec_preempt_count();
if (curval == -EFAULT)
- return -EFAULT;
- if (curval != uval)
- return -EINVAL;
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
+ }
}
spin_lock_irq(&pi_state->owner->pi_lock);
pi_state->owner = new_owner;
spin_unlock_irq(&new_owner->pi_lock);
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
rt_mutex_unlock(&pi_state->pi_mutex);
return 0;
drop_key_refs(&q->key);
}
+/*
+ * Fixup the pi_state owner with the new owner.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 *uaddr, struct futex_q *q,
+ struct task_struct *newowner)
+{
+ u32 newtid = newowner->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u32 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = newowner;
+
+ spin_lock_irq(&newowner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &newowner->pi_state_list);
+ spin_unlock_irq(&newowner->pi_lock);
+
+ /*
+ * We own it, so we have to replace the pending owner
+	 * TID. This must be atomic as we have to preserve the
+ * owner died bit here.
+ */
+ ret = get_futex_value_locked(&uval, uaddr);
+
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ dec_preempt_count();
+
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
{
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_taken, ownerdied = 0, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
if (unlikely(ret != 0))
goto out_release_sem;
+ retry_unlocked:
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ ret = lock_taken = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
- /* We own the lock already */
+ /*
+ * Detect deadlocks. In case of REQUEUE_PI this is a valid
+ * situation and we return success to user space.
+ */
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
- if (!detect && 0)
- force_sig(SIGKILL, current);
ret = -EDEADLK;
goto out_unlock_release_sem;
}
/*
- * Surprise - we got the lock. Just return
- * to userspace:
+ * Surprise - we got the lock. Just return to userspace:
*/
if (unlikely(!curval))
goto out_unlock_release_sem;
uval = curval;
+
+ /*
+ * Set the WAITERS flag, so the owner will know it has someone
+ * to wake at next unlock
+ */
newval = uval | FUTEX_WAITERS;
+ /*
+	 * There are two cases where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED or REQUEUE. We take over the
+	 * futex in this case. We also do an unconditional takeover
+	 * when the owner of the futex died.
+ *
+ * This is safe as we are protected by the hash bucket lock !
+ */
+ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+ /* Keep the OWNER_DIED bit */
+ newval = (curval & ~FUTEX_TID_MASK) | current->pid;
+ ownerdied = 0;
+ lock_taken = 1;
+ }
+
inc_preempt_count();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
dec_preempt_count();
if (unlikely(curval != uval))
goto retry_locked;
+ /*
+	 * We took the lock due to requeue or owner-died takeover.
+ */
+ if (unlikely(lock_taken))
+ goto out_unlock_release_sem;
+
/*
	 * We don't have the lock. Look up the PI state (or create it if
* we are the first waiter):
ret = lookup_pi_state(uval, hb, &q);
if (unlikely(ret)) {
- /*
- * There were no waiters and the owner task lookup
- * failed. When the OWNER_DIED bit is set, then we
- * know that this is a robust futex and we actually
- * take the lock. This is safe as we are protected by
- * the hash bucket lock. We also set the waiters bit
- * unconditionally here, to simplify glibc handling of
- * multiple tasks racing to acquire the lock and
- * cleanup the problems which were left by the dead
- * owner.
- */
- if (curval & FUTEX_OWNER_DIED) {
- uval = newval;
- newval = current->pid |
- FUTEX_OWNER_DIED | FUTEX_WAITERS;
+ switch (ret) {
- inc_preempt_count();
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- dec_preempt_count();
+ case -EAGAIN:
+ /*
+ * Task is exiting and we just wait for the
+ * exit to complete.
+ */
+ queue_unlock(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ cond_resched();
+ goto retry;
- if (unlikely(curval == -EFAULT))
+ case -ESRCH:
+ /*
+ * No owner found for this futex. Check if the
+ * OWNER_DIED bit is set to figure out whether
+ * this is a robust futex or not.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
goto uaddr_faulted;
- if (unlikely(curval != uval))
+
+ /*
+ * We simply start over in case of a robust
+ * futex. The code above will take the futex
+ * and return happy.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ ownerdied = 1;
goto retry_locked;
- ret = 0;
+ }
+ default:
+ goto out_unlock_release_sem;
}
- goto out_unlock_release_sem;
}
/*
down_read(&curr->mm->mmap_sem);
spin_lock(q.lock_ptr);
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case.
- */
- if (!ret && q.pi_state->owner != curr) {
- u32 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
-		spin_lock_irq(&current->pi_lock);
-		WARN_ON(!list_empty(&q.pi_state->list));
-		list_add(&q.pi_state->list, &current->pi_state_list);
-		spin_unlock_irq(&current->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ if (!ret) {
/*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
+ * Got the lock. We might not be the anticipated owner
+ * if we did a lock-steal - fix up the PI-state in
+ * that case:
*/
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
+ if (q.pi_state->owner != curr)
+ ret = fixup_pi_state_owner(uaddr, &q, curr);
} else {
/*
* Catch the rare case, where the lock was released
- * when we were on the way back before we locked
- * the hash bucket.
+ * when we were on the way back before we locked the
+ * hash bucket.
*/
- if (ret && q.pi_state->owner == curr) {
+ if (q.pi_state->owner == curr) {
if (rt_mutex_trylock(&q.pi_state->pi_mutex))
ret = 0;
+ else {
+ /*
+ * pi_state is incorrect, some other
+ * task did a lock steal and we
+ * returned due to timeout or signal
+ * without taking the rt_mutex. Too
+ * late. We can access the
+ * rt_mutex_owner without locking, as
+ * the other task is now blocked on
+ * the hash bucket lock. Fix the state
+ * up.
+ */
+ struct task_struct *owner;
+ int res;
+
+ owner = rt_mutex_owner(&q.pi_state->pi_mutex);
+ res = fixup_pi_state_owner(uaddr, &q, owner);
+
+ /* propagate -EFAULT, if the fixup failed */
+ if (res)
+ ret = res;
+ }
+ } else {
+ /*
+ * Paranoia check. If we did not take the lock
+ * in the trylock above, then we should not be
+ * the owner of the rtmutex, neither the real
+ * nor the pending one:
+ */
+ if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+ printk(KERN_ERR "futex_lock_pi: ret = %d "
+ "pi-mutex: %p pi-state %p\n", ret,
+ q.pi_state->pi_mutex.owner,
+ q.pi_state->owner);
}
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
}
-
- if (!detect && ret == -EDEADLK && 0)
- force_sig(SIGKILL, current);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);
return ret != -EINTR ? ret : -ERESTARTNOINTR;
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
+ *
+ * ... and hb->lock. :-) --ANK
*/
+ queue_unlock(&q, hb);
+
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
- goto out_unlock_release_sem;
- }
- goto retry_locked;
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out_release_sem;
+ goto retry_unlocked;
}
- queue_unlock(&q, hb);
up_read(&curr->mm->mmap_sem);
ret = get_user(uval, uaddr);
goto out;
hb = hash_futex(&key);
+retry_unlocked:
spin_lock(&hb->lock);
-retry_locked:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
+ *
+ * ... and hb->lock. --ANK
*/
+ spin_unlock(&hb->lock);
+
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
- goto out_unlock;
- }
- goto retry_locked;
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out;
+ uval = 0;
+ goto retry_unlocked;
}
- spin_unlock(&hb->lock);
	up_read(&current->mm->mmap_sem);
ret = get_user(uval, uaddr);
void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
- struct robust_list __user *entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
unsigned long futex_offset;
+ int rc;
/*
* Fetch the list head (which was registered earlier, via
if (pending)
handle_futex_death((void *)pending + futex_offset, curr, pip);
+ next_entry = NULL; /* avoid warning with gcc */
while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
/*
* A pending lock might already be on the list, so
* don't process it twice:
if (handle_futex_death((void *)entry + futex_offset,
curr, pi))
return;
- /*
- * Fetch the next entry in the list:
- */
- if (fetch_robust_entry(&entry, &entry->next, &pi))
+ if (rc)
return;
+ entry = next_entry;
+ pi = next_pi;
/*
* Avoid excessively long or circular lists:
*/
cond_resched();
}
+
+ if (pending)
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
}
long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
return 0;
}
+static void __user *futex_uaddr(struct robust_list *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- compat_uptr_t uentry, upending;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+ compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
+ int rc;
/*
* Fetch the list head (which was registered earlier, via
if (fetch_robust_entry(&upending, &pending,
&head->list_op_pending, &pip))
return;
- if (upending)
- handle_futex_death((void *)pending + futex_offset, curr, pip);
- while (compat_ptr(uentry) != &head->list) {
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
/*
* A pending lock might already be on the list, so
		 * don't process it twice:
*/
- if (entry != pending)
- if (handle_futex_death((void *)entry + futex_offset,
- curr, pi))
- return;
+ if (entry != pending) {
+ void *uaddr = futex_uaddr(entry, futex_offset);
- /*
- * Fetch the next entry in the list:
- */
- if (fetch_robust_entry(&uentry, &entry,
- (compat_uptr_t *)&entry->next, &pi))
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
/*
* Avoid excessively long or circular lists:
*/
cond_resched();
}
+ if (pending) {
+ void *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
}
asmlinkage long
set_tsk_thread_flag(t, TIF_SIGPENDING);
return 1;
}
- clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ /*
+ * We must never clear the flag in another thread, or in current
+ * when it's possible the current syscall is returning -ERESTART*.
+	 * So we don't clear it here; only callers that know they should clear it do so.
+ */
return 0;
}
void recalc_sigpending(void)
{
- recalc_sigpending_tsk(current);
+ if (!recalc_sigpending_tsk(current))
+ clear_thread_flag(TIF_SIGPENDING);
+
}
/* Given the mask, find the first available signal that should be serviced. */
sig = 0;
}
- recalc_sigpending();
return sig;
}
if (!signr)
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
- if (signr && unlikely(sig_kernel_stop(signr))) {
+ recalc_sigpending();
+ if (signr && unlikely(sig_kernel_stop(signr))) {
/*
* Set a marker that we have dequeued a stop signal. Our
* caller might release the siglock and then the pending
return;
}
+ /* do not print during early bootup: */
+ if (unlikely(system_state != SYSTEM_RUNNING)) {
+ touch_softlockup_watchdog();
+ return;
+ }
+
now = jiffies;
/* Wake up the high-prio watchdog task every second: */
if (retval)
return retval;
+ if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim.rlim_cur = 1;
+ }
+
task_lock(current->group_leader);
*old_rlim = new_rlim;
task_unlock(current->group_leader);
unsigned long rlim_cur = new_rlim.rlim_cur;
cputime_t cputime;
- if (rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- rlim_cur = 1;
- }
cputime = secs_to_cputime(rlim_cur);
read_lock(&tasklist_lock);
	spin_lock_irq(&current->sighand->siglock);
/* Limit the size of the copy to the caller's write size */
bytes = min(bytes, count);
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
+ /* We only need to worry about prefaulting when writes are from
+ * user-space. NFSd uses vfs_writev with several non-aligned
+	 * segments in the vector, and limiting to one segment at a time
+	 * is a noticeable performance hit for re-writes.
*/
- bytes = min(bytes, cur_iov->iov_len - iov_base);
-
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- fault_in_pages_readable(buf, bytes);
+ if (!segment_eq(get_fs(), KERNEL_DS)) {
+ /*
+ * Limit the size of the copy to that of the current
+ * segment, because fault_in_pages_readable() doesn't
+ * know how to walk segments.
+ */
+ bytes = min(bytes, cur_iov->iov_len - iov_base);
+ /*
+ * Bring in the user page that we will copy from
+ * _first_. Otherwise there's a nasty deadlock on
+ * copying from the same page as we're writing to,
+ * without it being marked up-to-date.
+ */
+ fault_in_pages_readable(buf, bytes);
+ }
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = -ENOMEM;
atomic_t udp_memory_allocated;
EXPORT_SYMBOL(udp_memory_allocated);
-/* Shared by v4/v6 udp. */
-int udp_port_rover;
-
static int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
struct hlist_node *node;
struct inet_sock *inet = inet_sk(sk);
write_lock_bh(&udp_hash_lock);
- if (snum == 0) {
- int best_size_so_far, best, result, i, low, high;
+ if (!snum) {
+ int i, low, high, remaining;
+ unsigned rover, best, best_size_so_far;
inet_get_local_port_range(&low, &high);
+ remaining = (high - low) + 1;
+
+ best_size_so_far = UINT_MAX;
+ best = rover = net_random() % remaining + low;
- if (udp_port_rover > high ||
- udp_port_rover < low)
- udp_port_rover = low;
- best_size_so_far = 32767;
- best = result = udp_port_rover;
- for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
+ /* 1st pass: look for empty (or shortest) hash chain */
+ for (i = 0; i < UDP_HTABLE_SIZE; i++) {
struct hlist_head *list;
- int size;
+ int size = 0;
- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
- if (hlist_empty(list)) {
- if (result > high)
- result = low + ((result - low) &
- (UDP_HTABLE_SIZE - 1));
+ list = &udp_hash[rover & (UDP_HTABLE_SIZE - 1)];
+ if (hlist_empty(list))
goto gotit;
- }
- size = 0;
+
sk_for_each(sk2, node, list)
if (++size >= best_size_so_far)
goto next;
best_size_so_far = size;
- best = result;
- next:;
+ best = rover;
+ next:
+ /* fold back if end of range */
+ if (++rover > high)
+ rover = low + ((rover - low)
+ & (UDP_HTABLE_SIZE - 1));
}
- result = best;
- for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
- if (result > high)
- result = low + ((result - low) &
- (UDP_HTABLE_SIZE - 1));
- if (!udp_lport_inuse(result))
- break;
+ /* 2nd pass: find hole in shortest hash chain */
+ rover = best;
+ for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
+ if (!udp_lport_inuse(rover))
+ goto gotit;
+ rover += UDP_HTABLE_SIZE;
+ if (rover > high)
+ rover = low + ((rover - low)
+ & (UDP_HTABLE_SIZE - 1));
}
- if (i >= (1 << 16) / UDP_HTABLE_SIZE)
- goto fail;
+ /* All ports in use! */
+ goto fail;
+
gotit:
- udp_port_rover = snum = result;
+ snum = rover;
} else {
sk_for_each(sk2, node,
&udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
EXPORT_SYMBOL(udp_hash);
EXPORT_SYMBOL(udp_hash_lock);
EXPORT_SYMBOL(udp_ioctl);
-EXPORT_SYMBOL(udp_port_rover);
EXPORT_SYMBOL(udp_prot);
EXPORT_SYMBOL(udp_sendmsg);
EXPORT_SYMBOL(udp_poll);
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- kfree_skb(skb);
read_unlock(&ipip6_lock);
out:
+ kfree_skb(skb);
return 0;
}
struct hlist_node *node;
write_lock_bh(&udp_hash_lock);
- if (snum == 0) {
- int best_size_so_far, best, result, i, low, high;
+ if (!snum) {
+ int i, low, high, remaining;
+ unsigned rover, best, best_size_so_far;
inet_get_local_port_range(&low, &high);
- if (udp_port_rover > high || udp_port_rover < low)
- udp_port_rover = low;
- best_size_so_far = 32767;
- best = result = udp_port_rover;
- for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
- int size;
+ remaining = (high - low) + 1;
+
+ best_size_so_far = UINT_MAX;
+ best = rover = net_random() % remaining + low;
+
+ /* 1st pass: look for empty (or shortest) hash chain */
+ for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+ int size = 0;
struct hlist_head *list;
- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
- if (hlist_empty(list)) {
- if (result > high)
- result = low + ((result - low) &
- (UDP_HTABLE_SIZE - 1));
+ list = &udp_hash[rover & (UDP_HTABLE_SIZE - 1)];
+ if (hlist_empty(list))
goto gotit;
- }
- size = 0;
+
sk_for_each(sk2, node, list)
if (++size >= best_size_so_far)
goto next;
best_size_so_far = size;
- best = result;
- next:;
+ best = rover;
+ next:
+ /* fold back if end of range */
+ if (++rover > high)
+ rover = low + ((rover - low)
+ & (UDP_HTABLE_SIZE - 1));
}
- result = best;
- for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
- if (result > high)
- result = low + ((result - low) &
- (UDP_HTABLE_SIZE - 1));
- if (!udp_lport_inuse(result))
- break;
+ /* 2nd pass: find hole in shortest hash chain */
+ rover = best;
+ for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
+ if (!udp_lport_inuse(rover))
+ goto gotit;
+ rover += UDP_HTABLE_SIZE;
+ if (rover > high)
+ rover = low + ((rover - low)
+ & (UDP_HTABLE_SIZE - 1));
}
- if (i >= (1 << 16) / UDP_HTABLE_SIZE)
- goto fail;
+ /* All ports in use! */
+ goto fail;
+
gotit:
- udp_port_rover = snum = result;
+ snum = rover;
} else {
sk_for_each(sk2, node,
&udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs_old)))
return -EFAULT;
- if (getaddrs.addr_num <= 0) return -EINVAL;
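+	/*
+	 * Also reject counts large enough to overflow the
+	 * addr_num * sizeof(union sctp_addr) size computation
+	 * (CVE-2008-2826).
+	 */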
+ if (getaddrs.addr_num <= 0 ||
+ getaddrs.addr_num >= (INT_MAX / sizeof(union sctp_addr)))
+ return -EINVAL;
/*
* For UDP-style sockets, id specifies the association to query.
* If the id field is set to the value '0' then the locally bound
int chunk;
struct sk_buff *skb;
+ unix_state_rlock(sk);
skb = skb_dequeue(&sk->sk_receive_queue);
if (skb==NULL)
{
if (copied >= target)
- break;
+ goto unlock;
/*
* POSIX 1003.1g mandates this order.
*/
if ((err = sock_error(sk)) != 0)
- break;
+ goto unlock;
if (sk->sk_shutdown & RCV_SHUTDOWN)
- break;
+ goto unlock;
+ unix_state_runlock(sk);
err = -EAGAIN;
if (!timeo)
break;
}
mutex_lock(&u->readlock);
continue;
+unlock:
+ unix_state_runlock(sk);
+ break;
}
+ unix_state_runlock(sk);
if (check_creds) {
/* Never glue messages from different writers */