#include <linux/bitops.h>
#include <linux/hrtimer.h>
#include <linux/uaccess.h>
+ #include <linux/intel-iommu.h>
#include <asm/pgtable.h>
#include <asm/gcc_intrin.h>
#include <asm/cacheflush.h>
#include <asm/div64.h>
#include <asm/tlb.h>
+ #include <asm/elf.h>
#include "misc.h"
#include "vti.h"
#include "iodev.h"
#include "ioapic.h"
#include "lapic.h"
+ #include "irq.h"
static unsigned long kvm_vmm_base;
static unsigned long kvm_vsa_base;
{ NULL }
};
-
- struct fdesc{
- unsigned long ip;
- unsigned long gp;
- };
-
static void kvm_flush_icache(unsigned long start, unsigned long len)
{
int l;
switch (ext) {
case KVM_CAP_IRQCHIP:
case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_MP_STATE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
+ case KVM_CAP_IOMMU:
+ r = intel_iommu_found();
+ break;
default:
r = 0;
}
*/
kvm_build_io_pmt(kvm);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
}
struct kvm *kvm_arch_create_vm(void)
struct hrtimer *p_ht = &vcpu->arch.hlt_timer;
if (hrtimer_cancel(p_ht))
- hrtimer_start(p_ht, p_ht->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(p_ht, HRTIMER_MODE_ABS);
}
static enum hrtimer_restart hlt_timer_fn(struct hrtimer *data)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_iommu_unmap_guest(kvm);
+ #ifdef KVM_CAP_DEVICE_ASSIGNMENT
+ kvm_free_all_assigned_devices(kvm);
+ #endif
kfree(kvm->arch.vioapic);
kvm_release_vm_pages(kvm);
kvm_free_physmem(kvm);
int user_alloc)
{
unsigned long i;
- struct page *page;
+ unsigned long pfn;
int npages = mem->memory_size >> PAGE_SHIFT;
struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
unsigned long base_gfn = memslot->base_gfn;
for (i = 0; i < npages; i++) {
- page = gfn_to_page(kvm, base_gfn + i);
- kvm_set_pmt_entry(kvm, base_gfn + i,
- page_to_pfn(page) << PAGE_SHIFT,
- _PAGE_AR_RWX|_PAGE_MA_WB);
- memslot->rmap[i] = (unsigned long)page;
+ pfn = gfn_to_pfn(kvm, base_gfn + i);
+ if (!kvm_is_mmio_pfn(pfn)) {
+ kvm_set_pmt_entry(kvm, base_gfn + i,
+ pfn << PAGE_SHIFT,
+ _PAGE_AR_RWX | _PAGE_MA_WB);
+ memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
+ } else {
+ kvm_set_pmt_entry(kvm, base_gfn + i,
+ GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
+ _PAGE_MA_UC);
+ memslot->rmap[i] = 0;
+ }
}
return 0;
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- return -EINVAL;
+ vcpu_load(vcpu);
+ mp_state->mp_state = vcpu->arch.mp_state;
+ vcpu_put(vcpu);
+ return 0;
+ }
+
+ static int vcpu_reset(struct kvm_vcpu *vcpu)
+ {
+ int r;
+ long psr;
+ local_irq_save(psr);
+ r = kvm_insert_vmm_mapping(vcpu);
+ if (r)
+ goto fail;
+
+ vcpu->arch.launched = 0;
+ kvm_arch_vcpu_uninit(vcpu);
+ r = kvm_arch_vcpu_init(vcpu);
+ if (r)
+ goto fail;
+
+ kvm_purge_vmm_mapping(vcpu);
+ r = 0;
+ fail:
+ local_irq_restore(psr);
+ return r;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- return -EINVAL;
+ int r = 0;
+
+ vcpu_load(vcpu);
+ vcpu->arch.mp_state = mp_state->mp_state;
+ if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
+ r = vcpu_reset(vcpu);
+ vcpu_put(vcpu);
+ return r;
}
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
- if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
- }
- pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
- pt->scheduled = ktime_to_ns(pt->timer.expires);
+ hrtimer_add_expires_ns(&pt->timer, pt->period);
- pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer));
++ pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
+ if (pt->period)
- ps->channels[0].count_load_time = pt->timer.expires;
++ ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
return (pt->period == 0 ? 0 : 1);
}
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+ if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
-
return 0;
}
+ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+ {
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ spin_lock(&ps->inject_lock);
+ if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ atomic_inc(&ps->pit_timer.pending);
+ ps->irq_ack = 1;
+ spin_unlock(&ps->inject_lock);
+ }
+
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps;
timer = &pit->pit_state.pit_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_kpit_timer *pt)
hrtimer_cancel(&pt->timer);
}
- static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
+ struct kvm_kpit_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
pt->period = (is_period == 0) ? 0 : interval;
pt->timer.function = pit_timer_fn;
atomic_set(&pt->pending, 0);
+ ps->irq_ack = 1;
hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(&ps->pit_timer, val, 0);
+ create_pit_timer(ps, val, 0);
break;
case 2:
case 3:
- create_pit_timer(&ps->pit_timer, val, 1);
+ create_pit_timer(ps, val, 1);
break;
default:
destroy_pit_timer(&ps->pit_timer);
mutex_unlock(&pit->pit_state.lock);
atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.inject_pending = 1;
+ pit->pit_state.irq_ack = 1;
}
struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
/* Initialize PIO device */
pit->dev.read = pit_ioport_read;
pit_state->pit = pit;
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
static void __inject_pit_timer_intr(struct kvm *kvm)
{
mutex_lock(&kvm->lock);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+ kvm_set_irq(kvm, 0, 1);
+ kvm_set_irq(kvm, 0, 0);
mutex_unlock(&kvm->lock);
}
struct kvm_kpit_state *ps;
if (vcpu && pit) {
+ int inject = 0;
ps = &pit->pit_state;
- /* Try to inject pending interrupts when:
- * 1. Pending exists
- * 2. Last interrupt was accepted or waited for too long time*/
- if (atomic_read(&ps->pit_timer.pending) &&
- (ps->inject_pending ||
- (jiffies - ps->last_injected_time
- >= KVM_MAX_PIT_INTR_INTERVAL))) {
- ps->inject_pending = 0;
- __inject_pit_timer_intr(kvm);
- ps->last_injected_time = jiffies;
- }
- }
- }
-
- void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
- {
- struct kvm_arch *arch = &vcpu->kvm->arch;
- struct kvm_kpit_state *ps;
-
- if (vcpu && arch->vpit) {
- ps = &arch->vpit->pit_state;
- if (atomic_read(&ps->pit_timer.pending) &&
- (((arch->vpic->pics[0].imr & 1) == 0 &&
- arch->vpic->pics[0].irq_base == vec) ||
- (arch->vioapic->redirtbl[0].fields.vector == vec &&
- arch->vioapic->redirtbl[0].fields.mask != 1))) {
- ps->inject_pending = 1;
- atomic_dec(&ps->pit_timer.pending);
- ps->channels[0].count_load_time = ktime_get();
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
}
+ spin_unlock(&ps->inject_lock);
+ if (inject)
+ __inject_pit_timer_intr(kvm);
}
}
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
+ #include "kvm_cache_regs.h"
#include "irq.h"
#define PRId64 "d"
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- kvm_vcpu_kick(vcpu);
- else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
- }
+ kvm_vcpu_kick(vcpu);
result = (orig_irr == 0);
break;
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
- printk(KERN_DEBUG
- "Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
}
-
break;
case APIC_DM_STARTUP:
- printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
+ kvm_vcpu_kick(vcpu);
}
break;
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
-
+ int trigger_mode;
/*
* Not every write EOI will has corresponding ISR,
* one example is when Kernel check timer on setup_IO_APIC
apic_update_ppr(apic);
if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
static void apic_send_ipi(struct kvm_lapic *apic)
struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
- kvm_x86_ops->cache_regs(vcpu);
- run->tpr_access.rip = vcpu->arch.rip;
+ run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
* Refer SDM 8.4.1
*/
if (len != 4 || alignment) {
- if (printk_ratelimit())
- printk(KERN_ERR "apic write: bad size=%d %lx\n",
- len, (long)address);
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n",
+ len, (long)address);
return;
}
if(!atomic_inc_and_test(&apic->timer.pending))
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
- if (waitqueue_active(q)) {
- apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ if (waitqueue_active(q))
wake_up_interruptible(q);
- }
+
if (apic_lvtt_period(apic)) {
result = 1;
- apic->timer.dev.expires = ktime_add_ns(
- apic->timer.dev.expires,
- apic->timer.period);
+ hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
}
return result;
}
timer = &apic->timer.dev;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
hr_time = ktime_set(0, poll_timeout);
if (!hrtimer_is_queued(&ap_poll_timer) ||
- !hrtimer_forward(&ap_poll_timer, ap_poll_timer.expires, hr_time)) {
- ap_poll_timer.expires = hr_time;
- hrtimer_start(&ap_poll_timer, hr_time, HRTIMER_MODE_ABS);
+ !hrtimer_forward(&ap_poll_timer, hrtimer_get_expires(&ap_poll_timer), hr_time)) {
+ hrtimer_set_expires(&ap_poll_timer, hr_time);
+ hrtimer_start_expires(&ap_poll_timer, HRTIMER_MODE_ABS);
}
return count;
}
ap_dev->device.bus = &ap_bus_type;
ap_dev->device.parent = ap_root_device;
- snprintf(ap_dev->device.bus_id, BUS_ID_SIZE, "card%02x",
- AP_QID_DEVICE(ap_dev->qid));
+ dev_set_name(&ap_dev->device, "card%02x",
+ AP_QID_DEVICE(ap_dev->qid));
ap_dev->device.release = ap_device_release;
rc = device_register(&ap_dev->device);
if (rc) {
return compat_sys_futimesat(AT_FDCWD, filename, t);
}
+ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+ {
+ compat_ino_t ino = stat->ino;
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ int err;
+
+ SET_UID(uid, stat->uid);
+ SET_GID(gid, stat->gid);
+
+ if ((u64) stat->size > MAX_NON_LFS ||
+ !old_valid_dev(stat->dev) ||
+ !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
+ if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+ return -EOVERFLOW;
+
+ if (clear_user(ubuf, sizeof(*ubuf)))
+ return -EFAULT;
+
+ err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
+ err |= __put_user(ino, &ubuf->st_ino);
+ err |= __put_user(stat->mode, &ubuf->st_mode);
+ err |= __put_user(stat->nlink, &ubuf->st_nlink);
+ err |= __put_user(uid, &ubuf->st_uid);
+ err |= __put_user(gid, &ubuf->st_gid);
+ err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
+ err |= __put_user(stat->size, &ubuf->st_size);
+ err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
+ err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
+ err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
+ err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
+ err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
+ err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
+ err |= __put_user(stat->blksize, &ubuf->st_blksize);
+ err |= __put_user(stat->blocks, &ubuf->st_blocks);
+ return err;
+ }
+
asmlinkage long compat_sys_newstat(char __user * filename,
struct compat_stat __user *statbuf)
{
if (!p)
break;
argv++;
- if(++i > max)
+ if (i++ >= max)
return -E2BIG;
}
}
#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+ int timeval, int ret)
+{
+ struct timespec ts;
+
+ if (!p)
+ return ret;
+
+ if (current->personality & STICKY_TIMEOUTS)
+ goto sticky;
+
+ /* No update for zero timeout */
+ if (!end_time->tv_sec && !end_time->tv_nsec)
+ return ret;
+
+ ktime_get_ts(&ts);
+ ts = timespec_sub(*end_time, ts);
+ if (ts.tv_sec < 0)
+ ts.tv_sec = ts.tv_nsec = 0;
+
+ if (timeval) {
+ struct compat_timeval rtv;
+
+ rtv.tv_sec = ts.tv_sec;
+ rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+ if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ return ret;
+ } else {
+ struct compat_timespec rts;
+
+ rts.tv_sec = ts.tv_sec;
+ rts.tv_nsec = ts.tv_nsec;
+
+ if (!copy_to_user(p, &rts, sizeof(rts)))
+ return ret;
+ }
+ /*
+ * If an application puts its timeval in read-only memory, we
+ * don't want the Linux-specific update to the timeval to
+ * cause a fault after the select has completed
+ * successfully. However, because we're not updating the
+ * timeval, we can't restart the system call.
+ */
+
+sticky:
+ if (ret == -ERESTARTNOHAND)
+ ret = -EINTR;
+ return ret;
+}
+
/*
* Ooo, nasty. We need here to frob 32-bit unsigned longs to
* 64-bit unsigned longs.
((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
int compat_core_sys_select(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
+ compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ struct timespec *end_time)
{
fd_set_bits fds;
void *bits;
zero_fd_set(n, fds.res_out);
zero_fd_set(n, fds.res_ex);
- ret = do_select(n, &fds, timeout);
+ ret = do_select(n, &fds, end_time);
if (ret < 0)
goto out;
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
struct compat_timeval __user *tvp)
{
- s64 timeout = -1;
+ struct timespec end_time, *to = NULL;
struct compat_timeval tv;
int ret;
if (copy_from_user(&tv, tvp, sizeof(tv)))
return -EFAULT;
- if (tv.tv_sec < 0 || tv.tv_usec < 0)
+ to = &end_time;
+ if (poll_select_set_timeout(to, tv.tv_sec,
+ tv.tv_usec * NSEC_PER_USEC))
return -EINVAL;
-
- /* Cast to u64 to make GCC stop complaining */
- if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
- timeout = -1; /* infinite */
- else {
- timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
- timeout += tv.tv_sec * HZ;
- }
}
- ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
- if (tvp) {
- struct compat_timeval rtv;
-
- if (current->personality & STICKY_TIMEOUTS)
- goto sticky;
- rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
- rtv.tv_sec = timeout;
- if (compat_timeval_compare(&rtv, &tv) >= 0)
- rtv = tv;
- if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
- /*
- * If an application puts its timeval in read-only
- * memory, we don't want the Linux-specific update to
- * the timeval to cause a fault after the select has
- * completed successfully. However, because we're not
- * updating the timeval, we can't restart the system
- * call.
- */
- if (ret == -ERESTARTNOHAND)
- ret = -EINTR;
- }
- }
+ ret = compat_core_sys_select(n, inp, outp, exp, to);
+ ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
return ret;
}
{
compat_sigset_t ss32;
sigset_t ksigmask, sigsaved;
- s64 timeout = MAX_SCHEDULE_TIMEOUT;
struct compat_timespec ts;
+ struct timespec end_time, *to = NULL;
int ret;
if (tsp) {
if (copy_from_user(&ts, tsp, sizeof(ts)))
return -EFAULT;
- if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
}
- do {
- if (tsp) {
- if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
- timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
- timeout += ts.tv_sec * (unsigned long)HZ;
- ts.tv_sec = 0;
- ts.tv_nsec = 0;
- } else {
- ts.tv_sec -= MAX_SELECT_SECONDS;
- timeout = MAX_SELECT_SECONDS * HZ;
- }
- }
-
- ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
- } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
-
- if (tsp) {
- struct compat_timespec rts;
-
- if (current->personality & STICKY_TIMEOUTS)
- goto sticky;
-
- rts.tv_sec = timeout / HZ;
- rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
- if (rts.tv_nsec >= NSEC_PER_SEC) {
- rts.tv_sec++;
- rts.tv_nsec -= NSEC_PER_SEC;
- }
- if (compat_timespec_compare(&rts, &ts) >= 0)
- rts = ts;
- if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
- /*
- * If an application puts its timeval in read-only
- * memory, we don't want the Linux-specific update to
- * the timeval to cause a fault after the select has
- * completed successfully. However, because we're not
- * updating the timeval, we can't restart the system
- * call.
- */
- if (ret == -ERESTARTNOHAND)
- ret = -EINTR;
- }
- }
+ ret = compat_core_sys_select(n, inp, outp, exp, to);
+ ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
if (ret == -ERESTARTNOHAND) {
/*
compat_sigset_t ss32;
sigset_t ksigmask, sigsaved;
struct compat_timespec ts;
- s64 timeout = -1;
+ struct timespec end_time, *to = NULL;
int ret;
if (tsp) {
if (copy_from_user(&ts, tsp, sizeof(ts)))
return -EFAULT;
- /* We assume that ts.tv_sec is always lower than
- the number of seconds that can be expressed in
- an s64. Otherwise the compiler bitches at us */
- timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
- timeout += ts.tv_sec * HZ;
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
}
if (sigmask) {
sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
}
- ret = do_sys_poll(ufds, nfds, &timeout);
+ ret = do_sys_poll(ufds, nfds, to);
/* We can restart this syscall, usually */
if (ret == -EINTR) {
} else if (sigmask)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- if (tsp && timeout >= 0) {
- struct compat_timespec rts;
-
- if (current->personality & STICKY_TIMEOUTS)
- goto sticky;
- /* Yes, we know it's actually an s64, but it's also positive. */
- rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
- 1000;
- rts.tv_sec = timeout;
- if (compat_timespec_compare(&rts, &ts) >= 0)
- rts = ts;
- if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
- /*
- * If an application puts its timeval in read-only
- * memory, we don't want the Linux-specific update to
- * the timeval to cause a fault after the select has
- * completed successfully. However, because we're not
- * updating the timeval, we can't restart the system
- * call.
- */
- if (ret == -ERESTARTNOHAND && timeout >= 0)
- ret = -EINTR;
- }
- }
+ ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
return ret;
}
#include <linux/init.h>
#include <linux/list.h>
#include <linux/wait.h>
+#include <linux/percpu.h>
+
struct hrtimer_clock_base;
struct hrtimer_cpu_base;
* HRTIMER_CB_IRQSAFE: Callback may run in hardirq context
* HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and
* does not restart the timer
- * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in hardirq context
- * Special mode for tick emultation
+ * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context
+ * Special mode for tick emulation and
+ * scheduler timer. Such timers are per
+ * cpu and not allowed to be migrated on
+ * cpu unplug.
+ * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context
+ * with timer->base lock unlocked
+ * used for timers which call wakeup to
+ * avoid lock order problems with rq->lock
*/
enum hrtimer_cb_mode {
HRTIMER_CB_SOFTIRQ,
HRTIMER_CB_IRQSAFE,
HRTIMER_CB_IRQSAFE_NO_RESTART,
- HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
+ HRTIMER_CB_IRQSAFE_PERCPU,
+ HRTIMER_CB_IRQSAFE_UNLOCKED,
};
/*
* 0x02 callback function running
* 0x04 callback pending (high resolution mode)
*
- * Special case:
+ * Special cases:
* 0x03 callback function running and enqueued
* (was requeued on another CPU)
+ * 0x09 timer was migrated on CPU hotunplug
* The "callback function running and enqueued" status is only possible on
* SMP. It happens for example when a posix timer expired and the callback
* queued a signal. Between dropping the lock which protects the posix timer
#define HRTIMER_STATE_ENQUEUED 0x01
#define HRTIMER_STATE_CALLBACK 0x02
#define HRTIMER_STATE_PENDING 0x04
+ #define HRTIMER_STATE_MIGRATE 0x08
/**
* struct hrtimer - the basic hrtimer structure
*/
struct hrtimer {
struct rb_node node;
- ktime_t expires;
+ ktime_t _expires;
+ ktime_t _softexpires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
#endif
};
+static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+{
+ timer->_expires = time;
+ timer->_softexpires = time;
+}
+
+static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
+{
+ timer->_softexpires = time;
+ timer->_expires = ktime_add_safe(time, delta);
+}
+
+static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+{
+ timer->_softexpires = time;
+ timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+}
+
+static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
+{
+ timer->_expires.tv64 = tv64;
+ timer->_softexpires.tv64 = tv64;
+}
+
+static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
+{
+ timer->_expires = ktime_add_safe(timer->_expires, time);
+ timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
+}
+
+static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+{
+ timer->_expires = ktime_add_ns(timer->_expires, ns);
+ timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
+}
+
+static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
+{
+ return timer->_expires;
+}
+
+static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
+{
+ return timer->_softexpires;
+}
+
+static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
+{
+ return timer->_expires.tv64;
+}
+static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
+{
+ return timer->_softexpires.tv64;
+}
+
+static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
+{
+ return ktime_to_ns(timer->_expires);
+}
+
+static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
+{
+ return ktime_sub(timer->_expires, timer->base->get_time());
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;
return timer->base->cpu_base->hres_active;
}
+extern void hrtimer_peek_ahead_timers(void);
+
/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
* is expired in the next softirq when the clock was advanced.
*/
static inline void clock_was_set(void) { }
+static inline void hrtimer_peek_ahead_timers(void) { }
static inline void hres_timers_resume(void) { }
extern ktime_t ktime_get(void);
extern ktime_t ktime_get_real(void);
+
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+
+
/* Exported timer functions: */
/* Initialize timers: */
/* Basic timer operations: */
extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
const enum hrtimer_mode mode);
+extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long range_ns, const enum hrtimer_mode mode);
extern int hrtimer_cancel(struct hrtimer *timer);
extern int hrtimer_try_to_cancel(struct hrtimer *timer);
+static inline int hrtimer_start_expires(struct hrtimer *timer,
+ enum hrtimer_mode mode)
+{
+ unsigned long delta;
+ ktime_t soft, hard;
+ soft = hrtimer_get_softexpires(timer);
+ hard = hrtimer_get_expires(timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ return hrtimer_start_range_ns(timer, soft, delta, mode);
+}
+
static inline int hrtimer_restart(struct hrtimer *timer)
{
- return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
/* Query timers: */
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
struct task_struct *tsk);
+extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode);
+extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
+
/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);
extern void hrtimer_run_pending(void);
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
- #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ #if USE_SPLIT_PTLOCKS
/*
* The mm counters are not protected by its page_table_lock,
* so must be incremented atomically.
#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
- #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+ #else /* !USE_SPLIT_PTLOCKS */
/*
* The mm counters are protected by its page_table_lock,
* so can be incremented directly.
#define inc_mm_counter(mm, member) (mm)->_##member++
#define dec_mm_counter(mm, member) (mm)->_##member--
- #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+ #endif /* !USE_SPLIT_PTLOCKS */
#define get_mm_rss(mm) \
(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
* - everyone except group_exit_task is stopped during signal delivery
* of fatal signals, group_exit_task processes the signal.
*/
- struct task_struct *group_exit_task;
int notify_count;
+ struct task_struct *group_exit_task;
/* thread group stop support, overloads group_exit_code too */
int group_stop_count;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
+ #ifdef CONFIG_SCHED_DEBUG
+ char *name;
+ #endif
};
extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
void (*yield_task) (struct rq *rq);
int (*select_task_rq)(struct task_struct *p, int sync);
- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
struct task_struct * (*pick_next_task) (struct rq *rq);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
struct sched_rt_entity {
struct list_head run_list;
- unsigned int time_slice;
unsigned long timeout;
+ unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
+ /*
+ * time slack values; these are used to round up poll() and
+ * select() etc timeout values. These are in nanoseconds.
+ */
+ unsigned long timer_slack_ns;
+ unsigned long default_timer_slack_ns;
};
/*
#ifdef __KERNEL__
+ extern struct timezone sys_tz;
+
/* Parameters used to convert the timespec values: */
#define MSEC_PER_SEC 1000L
#define USEC_PER_MSEC 1000L
#define NSEC_PER_SEC 1000000000L
#define FSEC_PER_SEC 1000000000000000L
+#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
+
static inline int timespec_equal(const struct timespec *a,
const struct timespec *b)
{
const unsigned int min, const unsigned int sec);
extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+extern struct timespec timespec_add_safe(const struct timespec lhs,
+ const struct timespec rhs);
/*
* sub = lhs - rhs, in normalized form
sig->leader = 0; /* session leadership doesn't inherit */
sig->tty_old_pgrp = NULL;
+ sig->tty = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
void __cleanup_signal(struct signal_struct *sig)
{
exit_thread_group_keys(sig);
+ tty_kref_put(sig->tty);
kmem_cache_free(signal_cachep, sig);
}
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
+ p->default_timer_slack_ns = current->timer_slack_ns;
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
- p->signal->tty = current->signal->tty;
+ tty_kref_put(p->signal->tty);
+ p->signal->tty = tty_kref_get(current->signal->tty);
set_task_pgrp(p, task_pgrp_nr(current));
set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
if (!base->first)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
- expires = ktime_sub(timer->expires, base->offset);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
}
struct hrtimer_clock_base *base)
{
ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
- ktime_t expires = ktime_sub(timer->expires, base->offset);
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
int res;
- WARN_ON_ONCE(timer->expires.tv64 < 0);
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
* When the callback is running, we do not reprogram the clock event
*/
BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
return 1;
- case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+ case HRTIMER_CB_IRQSAFE_PERCPU:
+ case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
* This is solely for the sched tick emulation with
* dynamic tick support to ensure that we do not
* restart the tick right on the edge and end up with
* the tick timer in the softirq ! The calling site
- * takes care of this.
+ * takes care of this. Also used for hrtimer sleeper !
*/
debug_hrtimer_deactivate(timer);
return 1;
u64 orun = 1;
ktime_t delta;
- delta = ktime_sub(now, timer->expires);
+ delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta.tv64 < 0)
return 0;
s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
- timer->expires = ktime_add_ns(timer->expires, incr * orun);
- if (timer->expires.tv64 > now.tv64)
+ hrtimer_add_expires_ns(timer, incr * orun);
+ if (hrtimer_get_expires_tv64(timer) > now.tv64)
return orun;
/*
* This (and the ktime_add() below) is the
*/
orun++;
}
- timer->expires = ktime_add_safe(timer->expires, interval);
+ hrtimer_add_expires(timer, interval);
return orun;
}
* We dont care about collisions. Nodes with
* the same expiry time stay together.
*/
- if (timer->expires.tv64 < entry->expires.tv64) {
+ if (hrtimer_get_expires_tv64(timer) <
+ hrtimer_get_expires_tv64(entry)) {
link = &(*link)->rb_left;
} else {
link = &(*link)->rb_right;
}
/**
- * hrtimer_start - (re)start an relative timer on the current CPU
+ * hrtimer_start_range_ns - (re)start an relative timer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
+ * @delta_ns: "slack" range for the timer
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
*
* Returns:
* 1 when the timer was active
*/
int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+ const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
#endif
}
- timer->expires = tim;
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
timer_stats_hrtimer_set_start_info(timer);
return ret;
}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an relative timer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+ return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
EXPORT_SYMBOL_GPL(hrtimer_start);
+
/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
ktime_t rem;
base = lock_hrtimer_base(timer, &flags);
- rem = ktime_sub(timer->expires, base->get_time());
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
continue;
timer = rb_entry(base->first, struct hrtimer, node);
- delta.tv64 = timer->expires.tv64;
+ delta.tv64 = hrtimer_get_expires_tv64(timer);
delta = ktime_sub(delta, base->get_time());
if (delta.tv64 < mindelta.tv64)
mindelta.tv64 = delta.tv64;
timer_stats_account_hrtimer(timer);
fn = timer->function;
- if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
/*
* Used for scheduler timers, avoid lock inversion with
* rq->lock and tasklist_lock.
timer = rb_entry(node, struct hrtimer, node);
- if (basenow.tv64 < timer->expires.tv64) {
+ /*
+ * The immediate goal for using the softexpires is
+ * minimizing wakeups, not running timers at the
+ * earliest interrupt after their soft expiration.
+ * This allows us to avoid using a Priority Search
+ * Tree, which can answer a stabbing querry for
+ * overlapping intervals and instead use the simple
+ * BST we already have.
+ * We don't add extra wakeups by delaying timers that
+ * are right-of a not yet expired timer, because that
+ * timer will have to trigger a wakeup anyway.
+ */
+
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
ktime_t expires;
- expires = ktime_sub(timer->expires,
+ expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
raise_softirq(HRTIMER_SOFTIRQ);
}
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+ unsigned long flags;
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ local_irq_save(flags);
+ td = &__get_cpu_var(tick_cpu_device);
+ if (!td)
+ goto out;
+ dev = td->evtdev;
+ if (!dev)
+ goto out;
+ hrtimer_interrupt(dev);
+out:
+ local_irq_restore(flags);
+}
+
static void run_hrtimer_softirq(struct softirq_action *h)
{
run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
struct hrtimer *timer;
timer = rb_entry(node, struct hrtimer, node);
- if (base->softirq_time.tv64 <= timer->expires.tv64)
+ if (base->softirq_time.tv64 <=
+ hrtimer_get_expires_tv64(timer))
break;
if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
- sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
#endif
}
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start(&t->timer, t->timer.expires, mode);
+ hrtimer_start_expires(&t->timer, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;
struct timespec rmt;
ktime_t rem;
- rem = ktime_sub(timer->expires, timer->base->get_time());
+ rem = hrtimer_expires_remaining(timer);
if (rem.tv64 <= 0)
return 0;
rmt = ktime_to_timespec(rem);
hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
HRTIMER_MODE_ABS);
- t.timer.expires.tv64 = restart->nanosleep.expires;
+ hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
if (do_nanosleep(&t, HRTIMER_MODE_ABS))
goto out;
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
+ unsigned long slack;
+
+ slack = current->timer_slack_ns;
+ if (rt_task(current))
+ slack = 0;
hrtimer_init_on_stack(&t.timer, clockid, mode);
- t.timer.expires = timespec_to_ktime(*rqtp);
+ hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
if (do_nanosleep(&t, mode))
goto out;
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.index = t.timer.base->index;
restart->nanosleep.rmtp = rmtp;
- restart->nanosleep.expires = t.timer.expires.tv64;
+ restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
ret = -ERESTART_RESTARTBLOCK;
out:
#ifdef CONFIG_HOTPLUG_CPU
- static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
- struct hrtimer_clock_base *new_base)
+ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+ struct hrtimer_clock_base *new_base, int dcpu)
{
struct hrtimer *timer;
struct rb_node *node;
+ int raise = 0;
while ((node = rb_first(&old_base->active))) {
timer = rb_entry(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_hrtimer_deactivate(timer);
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+ /*
+ * Should not happen. Per CPU timers should be
+ * canceled _before_ the migration code is called
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+ __remove_hrtimer(timer, old_base,
+ HRTIMER_STATE_INACTIVE, 0);
+ WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+ timer, timer->function, dcpu);
+ continue;
+ }
+
+ /*
+ * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * timer could be seen as !active and just vanish away
+ * under us on another CPU
+ */
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
timer->base = new_base;
/*
* Enqueue the timer. Allow reprogramming of the event device
*/
enqueue_hrtimer(timer, new_base, 1);
+
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ /*
+ * Happens with high res enabled when the timer was
+ * already expired and the callback mode is
+ * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+ * enqueue code does not move them to the soft irq
+ * pending list for performance/latency reasons, but
+ * in the migration state, we need to do that
+ * otherwise we end up with a stale timer.
+ */
+ if (timer->state == HRTIMER_STATE_MIGRATE) {
+ timer->state = HRTIMER_STATE_PENDING;
+ list_add_tail(&timer->cb_entry,
+ &new_base->cpu_base->cb_pending);
+ raise = 1;
+ }
+ #endif
+ /* Clear the migration state bit */
+ timer->state &= ~HRTIMER_STATE_MIGRATE;
+ }
+ return raise;
+ }
+
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+ struct hrtimer_cpu_base *new_base)
+ {
+ struct hrtimer *timer;
+ int raise = 0;
+
+ while (!list_empty(&old_base->cb_pending)) {
+ timer = list_entry(old_base->cb_pending.next,
+ struct hrtimer, cb_entry);
+
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+ timer->base = &new_base->clock_base[timer->base->index];
+ list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+ raise = 1;
}
+ return raise;
+ }
+ #else
+ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+ struct hrtimer_cpu_base *new_base)
+ {
+ return 0;
}
+ #endif
static void migrate_hrtimers(int cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
+ int i, raise = 0;
BUG_ON(cpu_online(cpu));
old_base = &per_cpu(hrtimer_bases, cpu);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
+ if (migrate_hrtimer_list(&old_base->clock_base[i],
+ &new_base->clock_base[i], cpu))
+ raise = 1;
}
+ if (migrate_hrtimer_pending(old_base, new_base))
+ raise = 1;
+
spin_unlock(&old_base->lock);
spin_unlock(&new_base->lock);
local_irq_enable();
put_cpu_var(hrtimer_bases);
+
+ if (raise)
+ hrtimer_raise_softirq();
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif
}
+/**
+ * schedule_hrtimeout_range - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode)
+{
+ struct hrtimer_sleeper t;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && !expires->tv64) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "inifinte"
+ */
+ if (!expires) {
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ return -EINTR;
+ }
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, mode);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t.task ? 0 : -EINTR;
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range(expires, 0, mode);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
kmem_cache_free(posix_timers_cache, tmr);
- tmr = NULL;
+ return NULL;
}
memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
return tmr;
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(timer->expires, now);
+ remaining = ktime_sub(hrtimer_get_expires(timer), now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
- timer->expires = timespec_to_ktime(new_setting->it_value);
+ hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
/* Convert interval */
timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
if (mode == HRTIMER_MODE_REL) {
- timer->expires =
- ktime_add_safe(timer->expires,
- timer->base->get_time());
+ hrtimer_add_expires(timer, timer->base->get_time());
}
return 0;
}
- hrtimer_start(timer, timer->expires, mode);
+ hrtimer_start_expires(timer, mode);
return 0;
}
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+ }
+
+ static inline int rt_bandwidth_enabled(void)
+ {
+ return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
- if (rt_b->rt_runtime == RUNTIME_INF)
+ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start(&rt_b->rt_period_timer,
- rt_b->rt_period_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
- #else /* !CONFIG_FAIR_GROUP_SCHED */
+ #else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
- #endif /* CONFIG_FAIR_GROUP_SCHED */
+ #endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
- rq->curr->sched_class->check_preempt_curr(rq, p);
+ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
static inline int cpu_of(struct rq *rq)
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
- timer->expires = time;
+ hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
return NOTIFY_DONE;
}
- static void init_hrtick(void)
+ static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
- static void init_hrtick(void)
+ static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
- #else
+ #else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
- #endif
+ #endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
update_load_sub(&rq->load, load);
}
- #ifdef CONFIG_SMP
- static unsigned long source_load(int cpu, int type);
- static unsigned long target_load(int cpu, int type);
- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->nr_running)
- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
- return rq->avg_load_per_task;
- }
-
- #ifdef CONFIG_FAIR_GROUP_SCHED
-
- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+ typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
- static void
- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
+ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
- (*down)(parent, cpu, sd);
+ ret = (*down)(parent, data);
+ if (ret)
+ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
- (*up)(parent, cpu, sd);
+ ret = (*up)(parent, data);
+ if (ret)
+ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
+ out_unlock:
rcu_read_unlock();
+
+ return ret;
}
+ static int tg_nop(struct task_group *tg, void *data)
+ {
+ return 0;
+ }
+ #endif
+
+ #ifdef CONFIG_SMP
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->nr_running)
+ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+ return rq->avg_load_per_task;
+ }
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
- static void
- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
+ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
__update_group_shares_cpu(tg, i, shares, rq_weight);
spin_unlock_irqrestore(&rq->lock, flags);
}
+
+ return 0;
}
/*
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
- static void
- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
+ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
}
tg->cfs_rq[cpu]->h_load = load;
- }
- static void
- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
- {
+ return 0;
}
static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
spin_lock(&rq->lock);
}
- static void update_h_load(int cpu)
+ static void update_h_load(long cpu)
{
- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
- if (!match_state || p->state == match_state) {
- ncsw = p->nivcsw + p->nvcsw;
- if (unlikely(!ncsw))
- ncsw = 1;
- }
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
/*
trace_mark(kernel_sched_wakeup,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
trace_mark(kernel_sched_wakeup_new,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- check_preempt_curr(this_rq, p);
+ check_preempt_curr(this_rq, p, 0);
}
/*
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+ /**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
void complete(struct completion *x)
{
unsigned long flags;
}
EXPORT_SYMBOL(complete);
+ /**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
void complete_all(struct completion *x)
{
unsigned long flags;
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
- if ((state == TASK_INTERRUPTIBLE &&
- signal_pending(current)) ||
- (state == TASK_KILLABLE &&
- fatal_signal_pending(current))) {
+ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
return timeout;
}
+ /**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
+ /**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
}
EXPORT_SYMBOL(wait_for_completion_timeout);
+ /**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
+ /**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+ /**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p);
+ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(12);
+ struct ctl_table *table = sd_alloc_ctl_entry(13);
if (table == NULL)
return NULL;
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
- /* &table[11] is terminator */
+ set_table_entry(&table[11], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[12] is terminator */
return table;
}
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
+ #ifdef CONFIG_SCHED_DEBUG
+ # define SD_INIT_NAME(sd, type) sd->name = #type
+ #else
+ # define SD_INIT_NAME(sd, type) do { } while (0)
+ #endif
+
#define SD_INIT(sd, type) sd_init_##type(sd)
+
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
memset(sd, 0, sizeof(*sd)); \
*sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
+ SD_INIT_NAME(sd, type); \
}
SD_INIT_FUNC(CPU)
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
- int i, j;
+ int i, j, n;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
- if (doms_new == NULL)
- ndoms_new = 0;
+ n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < ndoms_new; j++) {
+ for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
if (doms_new == NULL) {
ndoms_cur = 0;
- ndoms_new = 1;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL;
int arch_reinit_sched_domains(void)
{
get_online_cpus();
+
+ /* Destroy domains first to force the rebuild */
+ partition_sched_domains(0, NULL, NULL);
+
rebuild_sched_domains();
put_online_cpus();
+
return 0;
}
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- partition_sched_domains(0, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK;
default:
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
- if ((in_atomic() || irqs_disabled()) &&
- system_state == SYSTEM_RUNNING && !oops_in_progress) {
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
- printk(KERN_ERR "BUG: sleeping function called from invalid"
- " context at %s:%d\n", file, line);
- printk("in_atomic():%d, irqs_disabled():%d\n",
- in_atomic(), irqs_disabled());
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- dump_stack();
- }
+ if ((!in_atomic() && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 16;
+ return 1ULL << 20;
- return div64_u64(runtime << 16, period);
+ return div64_u64(runtime << 20, period);
}
- #ifdef CONFIG_CGROUP_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ /* Must be called with tasklist_lock held */
+ static inline int tg_has_rt_tasks(struct task_group *tg)
{
- struct task_group *tgi, *parent = tg->parent;
- unsigned long total = 0;
+ struct task_struct *g, *p;
- if (!parent) {
- if (global_rt_period() < period)
- return 0;
+ do_each_thread(g, p) {
+ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ return 1;
+ } while_each_thread(g, p);
- return to_ratio(period, runtime) <
- to_ratio(global_rt_period(), global_rt_runtime());
- }
+ return 0;
+ }
- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
- return 0;
+ struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+ };
- rcu_read_lock();
- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
- if (tgi == tg)
- continue;
+ static int tg_schedulable(struct task_group *tg, void *data)
+ {
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
}
- rcu_read_unlock();
- return total + to_ratio(period, runtime) <=
- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
- parent->rt_bandwidth.rt_runtime);
- }
- #elif defined CONFIG_USER_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
- struct task_group *tgi;
- unsigned long total = 0;
- unsigned long global_ratio =
- to_ratio(global_rt_period(), global_rt_runtime());
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
- rcu_read_lock();
- list_for_each_entry_rcu(tgi, &task_groups, list) {
- if (tgi == tg)
- continue;
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
}
- rcu_read_unlock();
- return total + to_ratio(period, runtime) < global_ratio;
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
}
- #endif
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
+ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
- struct task_struct *g, *p;
- do_each_thread(g, p) {
- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
- return 1;
- } while_each_thread(g, p);
- return 0;
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
- err = -EBUSY;
- goto unlock;
- }
- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
- err = -EINVAL;
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
goto unlock;
- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
static int sched_rt_global_constraints(void)
{
- struct task_group *tg = &root_task_group;
- u64 rt_runtime, rt_period;
+ u64 runtime, period;
int ret = 0;
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = tg->rt_bandwidth.rt_runtime;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ runtime = global_rt_runtime();
+ period = global_rt_period();
+
+ /*
+ * Sanity check on the sysctl variables.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
mutex_lock(&rt_constraints_mutex);
- if (!__rt_schedulable(tg, rt_period, rt_runtime))
- ret = -EINVAL;
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
unsigned long flags;
int i;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
- init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- /* Bind the cgroup to task_group object we just created */
- tg->css.cgroup = cgrp;
-
return &tg->css;
}
group_leader->signal->leader = 1;
__set_special_pids(sid);
- spin_lock(&group_leader->sighand->siglock);
- group_leader->signal->tty = NULL;
- spin_unlock(&group_leader->sighand->siglock);
+ proc_clear_tty(group_leader);
err = session;
out:
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- memcpy(utsname()->nodename, tmp, len);
- utsname()->nodename[len] = 0;
+ struct new_utsname *u = utsname();
+
+ memcpy(u->nodename, tmp, len);
+ memset(u->nodename + len, 0, sizeof(u->nodename) - len);
errno = 0;
}
up_write(&uts_sem);
asmlinkage long sys_gethostname(char __user *name, int len)
{
int i, errno;
+ struct new_utsname *u;
if (len < 0)
return -EINVAL;
down_read(&uts_sem);
- i = 1 + strlen(utsname()->nodename);
+ u = utsname();
+ i = 1 + strlen(u->nodename);
if (i > len)
i = len;
errno = 0;
- if (copy_to_user(name, utsname()->nodename, i))
+ if (copy_to_user(name, u->nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- memcpy(utsname()->domainname, tmp, len);
- utsname()->domainname[len] = 0;
+ struct new_utsname *u = utsname();
+
+ memcpy(u->domainname, tmp, len);
+ memset(u->domainname + len, 0, sizeof(u->domainname) - len);
errno = 0;
}
up_write(&uts_sem);
return -EINVAL;
if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
return -EFAULT;
- if (new_rlim.rlim_cur > new_rlim.rlim_max)
- return -EINVAL;
old_rlim = current->signal->rlim + resource;
if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
- return -EPERM;
+
+ if (resource == RLIMIT_NOFILE) {
+ if (new_rlim.rlim_max == RLIM_INFINITY)
+ new_rlim.rlim_max = sysctl_nr_open;
+ if (new_rlim.rlim_cur == RLIM_INFINITY)
+ new_rlim.rlim_cur = sysctl_nr_open;
+ if (new_rlim.rlim_max > sysctl_nr_open)
+ return -EPERM;
+ }
+
+ if (new_rlim.rlim_cur > new_rlim.rlim_max)
+ return -EINVAL;
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
+ current->default_timer_slack_ns;
+ else
+ current->timer_slack_ns = arg2;
+ break;
default:
error = -EINVAL;
break;
time_state = TIME_OOP;
printk(KERN_NOTICE "Clock: "
"inserting leap second 23:59:60 UTC\n");
- leap_timer.expires = ktime_add_ns(leap_timer.expires,
- NSEC_PER_SEC);
+ hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
res = HRTIMER_RESTART;
break;
case TIME_DEL:
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
fail = update_persistent_clock(now);
- next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+ next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
if (next.tv_nsec <= 0)
next.tv_nsec += NSEC_PER_SEC;
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/tick.h>
+ #include <linux/module.h>
#include <asm/irq_regs.h>
incr * ticks);
}
do_timer(++ticks);
+
+ /* Keep the tick_next_period variable up to date */
+ tick_next_period = ktime_add(last_jiffies_update, tick_period);
}
write_sequnlock(&xtime_lock);
}
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- *last_update_time = ktime_to_us(ts->idle_lastupdate);
+ if (!tick_nohz_enabled)
+ return -1;
+
+ if (ts->idle_active)
+ *last_update_time = ktime_to_us(ts->idle_lastupdate);
+ else
+ *last_update_time = ktime_to_us(ktime_get());
+
return ktime_to_us(ts->idle_sleeptime);
}
+ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = -1;
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
- if (rcu_needs_cpu(cpu))
+ if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
delta_jiffies = 1;
/*
* Do not stop the tick, if we are only one off
goto out;
}
- ts->idle_tick = ts->sched_timer.expires;
+ ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
rcu_enter_nohz();
* invoked.
*/
if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = -1;
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->idle_sleeps++;
ts->tick_stopped = 0;
ts->idle_exittime = now;
hrtimer_cancel(&ts->sched_timer);
- ts->sched_timer.expires = ts->idle_tick;
+ hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
while (1) {
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, tick_period);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer,
- ts->sched_timer.expires,
+ hrtimer_start_expires(&ts->sched_timer,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
} else {
- if (!tick_program_event(ts->sched_timer.expires, 0))
+ if (!tick_program_event(
+ hrtimer_get_expires(&ts->sched_timer), 0))
break;
}
/* Update jiffies and reread time */
static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
hrtimer_forward(&ts->sched_timer, now, tick_period);
- return tick_program_event(ts->sched_timer.expires, 0);
+ return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
}
/*
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
- if (unlikely(tick_do_timer_cpu == -1))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
tick_do_timer_cpu = cpu;
/* Check, if the jiffies need an update */
next = tick_init_jiffy_update();
for (;;) {
- ts->sched_timer.expires = next;
+ hrtimer_set_expires(&ts->sched_timer, next);
if (!tick_program_event(next, 0))
break;
next = ktime_add(next, tick_period);
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
- if (unlikely(tick_do_timer_cpu == -1))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
tick_do_timer_cpu = cpu;
#endif
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
- ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
/* Get the next period (per cpu) */
- ts->sched_timer.expires = tick_init_jiffy_update();
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
offset = ktime_to_ns(tick_period) >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
- ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;