Login | Register For Free | Help
Search for: (Advanced)

Mailing List Archive: Linux: Kernel

[PATCH 3/5] x86/pvclock: add vsyscall implementation

 

 

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded


jeremy.fitzhardinge at citrix

Oct 5, 2009, 5:50 PM

Post #1 of 50 (794 views)
Permalink
[PATCH 3/5] x86/pvclock: add vsyscall implementation

This patch allows the pvclock mechanism to be used in usermode. To
do this, we map an extra page into usermode containing an array of
pvclock_vcpu_time_info structures which give the information required
to compute a global system clock from the tsc. With this, we can
implement pvclock_clocksource_vread().

One complication is that usermode is subject to two levels of scheduling:
kernel scheduling of tasks onto vcpus, and hypervisor scheduling of
vcpus onto pcpus. In either case the underlying pcpu changed, and with
it, the correct set of parameters to compute tsc->system clock. To
address this we install a preempt notifier on sched_out to increment
that vcpu's version number. Usermode can then check the version number
is unchanged while computing the time and retry if it has (the only
difference from the kernel's version of the algorithm is that the vcpu
may have changed, so we may need to switch pvclock_vcpu_time_info
structures.

To use this feature, hypervisor-specific code is required
to call pvclock_init_vsyscall(), and if successful:
- cause the pvclock_vcpu_time_info structure at
pvclock_get_vsyscall_time_info(cpu) to be updated appropriately for
each vcpu.
- use pvclock_clocksource_vread as the implementation of clocksource
.vread.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge [at] citrix>
Cc: Keir Fraser <keir.fraser [at] eu>
Cc: Avi Kivity <avi [at] redhat>
Cc: Glauber Costa <gcosta [at] redhat>
Cc: Zach Brown <zach.brown [at] oracle>
Cc: Chris Mason <chris.mason [at] oracle>
Cc: Dan Magenheimer <dan.magenheimer [at] oracle>
---
arch/x86/Kconfig | 4 +
arch/x86/include/asm/fixmap.h | 3 +
arch/x86/include/asm/pvclock.h | 6 ++
arch/x86/include/asm/vsyscall.h | 3 +
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/pvclock.c | 152 ++++++++++++++++++++++++++++++++++++---
6 files changed, 160 insertions(+), 10 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5d..93346ff 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -518,6 +518,10 @@ config PARAVIRT_CLOCK
bool
default n

+config PARAVIRT_CLOCK_VSYSCALL
+ bool
+ depends on PARAVIRT_CLOCK && PREEMPT_NOTIFIERS
+
endif

config PARAVIRT_DEBUG
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 7b2d71d..ff3cffa 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -80,6 +80,9 @@ enum fixed_addresses {
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
VSYSCALL_HPET,
#endif
+#ifdef CONFIG_PARAVIRT_CLOCK_VSYSCALL
+ FIX_PVCLOCK_TIME_INFO,
+#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd..d2402b3 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -3,6 +3,7 @@

#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
+#include <asm/vsyscall.h>

/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
@@ -11,4 +12,9 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts);

+int __init pvclock_init_vsyscall(void);
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
+
+cycle_t __vsyscall_fn pvclock_clocksource_vread(void);
+
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d2..df5fb43 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,9 @@ enum vsyscall_num {
extern int __vgetcpu_mode;
extern volatile unsigned long __jiffies;

+struct getcpu_cache;
+extern long vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache);
+
/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 430d5b2..97d2e88 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,10 +24,12 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
+CFLAGS_pvclock.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
GCOV_PROFILE_tsc.o := n
GCOV_PROFILE_paravirt.o := n
+GCOV_PROFILE_pvclock.o := n

obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5ecce7f..14de7f3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,7 +17,9 @@

#include <linux/kernel.h>
#include <linux/percpu.h>
+
#include <asm/pvclock.h>
+#include <asm/vsyscall.h>

/*
* These are perodically updated
@@ -71,9 +73,10 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
return product;
}

-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+static __always_inline
+u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
- u64 delta = native_read_tsc() - shadow->tsc_timestamp;
+ u64 delta = __native_read_tsc() - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

@@ -81,8 +84,9 @@ static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
* Reads a consistent set of time-base values from hypervisor,
* into a shadow data area.
*/
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
- struct pvclock_vcpu_time_info *src)
+static __always_inline
+unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+ const struct pvclock_vcpu_time_info *src)
{
do {
dst->version = src->version;
@@ -109,18 +113,31 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
return pv_tsc_khz;
}

-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+static __always_inline
+unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+ cycle_t *cycles)
{
struct pvclock_shadow_time shadow;
unsigned version;
cycle_t ret, offset;

+ version = pvclock_get_time_values(&shadow, src);
+ rdtsc_barrier();
+ offset = pvclock_get_nsec_offset(&shadow);
+ ret = shadow.system_timestamp + offset;
+ rdtsc_barrier();
+
+ *cycles = ret;
+ return version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ cycle_t ret;
+
do {
- version = pvclock_get_time_values(&shadow, src);
- rdtsc_barrier();
- offset = pvclock_get_nsec_offset(&shadow);
- ret = shadow.system_timestamp + offset;
- rdtsc_barrier();
+ version = __pvclock_read_cycles(src, &ret);
} while (version != src->version);

return ret;
@@ -151,3 +168,118 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,

set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+
+#ifdef CONFIG_PARAVIRT_CLOCK_VSYSCALL
+
+static struct pvclock_vcpu_time_info *pvclock_vsyscall_time_info;
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+ if (pvclock_vsyscall_time_info == NULL)
+ return NULL;
+
+ return &pvclock_vsyscall_time_info[cpu];
+}
+
+static void vti_inc_version(struct pvclock_vcpu_time_info *pvti)
+{
+ /*
+ * This increments the version in an interrupt-atomic way.
+ * We're not concerned about global bus (inter-cpu) atomicity,
+ * but we just need to make sure the update can't be
+ * interrupted by the hypervisor preempting us.
+ */
+#ifdef CONFIG_X86
+ asm("add $2, %0\n" : "+m" (pvti->version));
+#else
+#error FIXME
+#endif
+}
+
+/*
+ * Increment version when switching away from a task so that it can
+ * tell if it has switched vcpus (hypervisor's update of the version
+ * will tell it if it switches pcpus).
+ */
+static void pvclock_vsyscall_sched_out(struct preempt_notifier *pn,
+ struct task_struct *next)
+{
+ int cpu = smp_processor_id();
+ struct pvclock_vcpu_time_info *pvti;
+
+ pvti = pvclock_get_vsyscall_time_info(cpu);
+ if (pvti)
+ vti_inc_version(pvti);
+}
+
+/* Don't care about scheduling in */
+static void pvclock_vsyscall_sched_in(struct preempt_notifier *notifier, int cpu)
+{
+}
+
+static struct preempt_notifier pvclock_vsyscall_notifier;
+static struct preempt_ops pvclock_vsyscall_preempt_ops = {
+ .sched_in = pvclock_vsyscall_sched_in,
+ .sched_out = pvclock_vsyscall_sched_out,
+};
+
+cycle_t __vsyscall_fn pvclock_clocksource_vread(void)
+{
+ const struct pvclock_vcpu_time_info *pvti_base;
+ const struct pvclock_vcpu_time_info *pvti;
+ cycle_t ret;
+ u32 version;
+
+ pvti_base = (struct pvclock_vcpu_time_info *)fix_to_virt(FIX_PVCLOCK_TIME_INFO);
+
+ /*
+ * When looping to get a consistent (time-info, tsc) pair, we
+ * also need to deal with the possibility we can switch vcpus,
+ * so make sure we always re-fetch time-info for the current vcpu.
+ */
+ do {
+ unsigned cpu;
+
+ vgetcpu(&cpu, NULL, NULL);
+ pvti = &pvti_base[cpu];
+
+ version = __pvclock_read_cycles(pvti, &ret);
+ } while (unlikely(pvti->version != version));
+
+ return ret;
+}
+
+/*
+ * Initialize the generic pvclock vsyscall state. This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+int __init pvclock_init_vsyscall(void)
+{
+ int cpu;
+
+ /* Just one page for now */
+ if (nr_cpu_ids * sizeof(struct vcpu_time_info) > PAGE_SIZE) {
+ printk(KERN_WARNING "pvclock_vsyscall: too many CPUs to fit time_info into a single page\n");
+ return -ENOSPC;
+ }
+
+ pvclock_vsyscall_time_info =
+ (struct pvclock_vcpu_time_info *)get_zeroed_page(GFP_KERNEL);
+ if (pvclock_vsyscall_time_info == NULL)
+ return -ENOMEM;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ pvclock_vsyscall_time_info[cpu].version = ~0;
+
+ __set_fixmap(FIX_PVCLOCK_TIME_INFO, __pa(pvclock_vsyscall_time_info),
+ PAGE_KERNEL_VSYSCALL);
+
+ preempt_notifier_init(&pvclock_vsyscall_notifier,
+ &pvclock_vsyscall_preempt_ops);
+ preempt_notifier_register(&pvclock_vsyscall_notifier);
+
+ return 0;
+}
+
+#endif /* CONFIG_PARAVIRT_CLOCK_VSYSCALL */
--
1.6.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 6, 2009, 2:04 AM

Post #2 of 50 (767 views)
Permalink
Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/06/2009 02:50 AM, Jeremy Fitzhardinge wrote:
> This patch allows the pvclock mechanism to be used in usermode. To
> do this, we map an extra page into usermode containing an array of
> pvclock_vcpu_time_info structures which give the information required
> to compute a global system clock from the tsc. With this, we can
> implement pvclock_clocksource_vread().
>
> One complication is that usermode is subject to two levels of scheduling:
> kernel scheduling of tasks onto vcpus, and hypervisor scheduling of
> vcpus onto pcpus. In either case the underlying pcpu changed, and with
> it, the correct set of parameters to compute tsc->system clock. To
> address this we install a preempt notifier on sched_out to increment
> that vcpu's version number. Usermode can then check the version number
> is unchanged while computing the time and retry if it has (the only
> difference from the kernel's version of the algorithm is that the vcpu
> may have changed, so we may need to switch pvclock_vcpu_time_info
> structures.
>
> To use this feature, hypervisor-specific code is required
> to call pvclock_init_vsyscall(), and if successful:
> - cause the pvclock_vcpu_time_info structure at
> pvclock_get_vsyscall_time_info(cpu) to be updated appropriately for
> each vcpu.
> - use pvclock_clocksource_vread as the implementation of clocksource
> .vread.
>
> +
> +cycle_t __vsyscall_fn pvclock_clocksource_vread(void)
> +{
> + const struct pvclock_vcpu_time_info *pvti_base;
> + const struct pvclock_vcpu_time_info *pvti;
> + cycle_t ret;
> + u32 version;
> +
> + pvti_base = (struct pvclock_vcpu_time_info *)fix_to_virt(FIX_PVCLOCK_TIME_INFO);
> +
> + /*
> + * When looping to get a consistent (time-info, tsc) pair, we
> + * also need to deal with the possibility we can switch vcpus,
> + * so make sure we always re-fetch time-info for the current vcpu.
> + */
> + do {
> + unsigned cpu;
> +
> + vgetcpu(&cpu, NULL, NULL);
> + pvti =&pvti_base[cpu];
> +
> + version = __pvclock_read_cycles(pvti,&ret);
> + } while (unlikely(pvti->version != version));
> +
> + return ret;
> +}
>

Instead of using vgetcpu() and rdtsc() independently, you can use rdtscp
to read both atomically. This removes the need for the preempt notifier.

> +
> +/*
> + * Initialize the generic pvclock vsyscall state. This will allocate
> + * a/some page(s) for the per-vcpu pvclock information, set up a
> + * fixmap mapping for the page(s)
> + */
> +int __init pvclock_init_vsyscall(void)
> +{
> + int cpu;
> +
> + /* Just one page for now */
> + if (nr_cpu_ids * sizeof(struct vcpu_time_info)> PAGE_SIZE) {
> + printk(KERN_WARNING "pvclock_vsyscall: too many CPUs to fit time_info into a single page\n");
> + return -ENOSPC;
> + }
> +
> + pvclock_vsyscall_time_info =
> + (struct pvclock_vcpu_time_info *)get_zeroed_page(GFP_KERNEL);
> + if (pvclock_vsyscall_time_info == NULL)
> + return -ENOMEM;
> +
>

Need to align the vcpu_time_infos on a cacheline boundary.

> + for (cpu = 0; cpu< nr_cpu_ids; cpu++)
> + pvclock_vsyscall_time_info[cpu].version = ~0;
> +
> + __set_fixmap(FIX_PVCLOCK_TIME_INFO, __pa(pvclock_vsyscall_time_info),
> + PAGE_KERNEL_VSYSCALL);
> +
> + preempt_notifier_init(&pvclock_vsyscall_notifier,
> + &pvclock_vsyscall_preempt_ops);
> + preempt_notifier_register(&pvclock_vsyscall_notifier);
> +
>

preempt notifiers are per-thread, not global, and will upset the cycle
counters. I'd drop them and use rdtscp instead (and give up if the
processor doesn't support it).

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


dan.magenheimer at oracle

Oct 6, 2009, 7:19 AM

Post #3 of 50 (768 views)
Permalink
RE: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

> From: Jeremy Fitzhardinge [mailto:jeremy.fitzhardinge [at] citrix]
> With this in place, I can do a gettimeofday in about 100ns on a 2.4GHz
> Q6600. I'm sure this could be tuned a bit more, but it is
> already much better than a syscall.

To evaluate the goodness of this, we really need a full
set of measurements for:

a) cost of rdtsc (and rdtscp if different)
b) cost of vsyscall+pvclock
c) cost of rdtsc emulated
d) cost of a hypercall that returns "hypervisor system time"

On a E6850 (3Ghz but let's use cycles), I measured;

a == 72 cycles
c == 1080 cycles
d == 780 cycles

It may be partly apples and oranges, but it looks
like a good guess for b on my machine is

b == 240 cycles

Not bad, but is there any additional context switch
cost to support it?

> From: Avi Kivity [mailto:avi [at] redhat]
> Instead of using vgetcpu() and rdtsc() independently, you can
> use rdtscp
> to read both atomically. This removes the need for the
> preempt notifier.

Xen does not currently expose rdtscp and so does not emulate
(or context switch) TSC_AUX. Context switching TSC_AUX
is certainly possible, but will likely be expensive.
If the primary reason for vsyscall+pvclock is to maximize
performance for gettimeofday/clock_gettime, this cost
would need to be added to the mix.

> preempt notifiers are per-thread, not global, and will upset
> the cycle
> counters. I'd drop them and use rdtscp instead (and give up if the
> processor doesn't support it).

Even if rdtscp is used, in the Intel processor lineup
only the very latest (Nehalem) supports rdtscp, so
"give up" doesn't seem like a very good option, at least
in the near future.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 6, 2009, 8:11 AM

Post #4 of 50 (763 views)
Permalink
Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/06/2009 04:19 PM, Dan Magenheimer wrote:
>> From: Jeremy Fitzhardinge [mailto:jeremy.fitzhardinge [at] citrix]
>> With this in place, I can do a gettimeofday in about 100ns on a 2.4GHz
>> Q6600. I'm sure this could be tuned a bit more, but it is
>> already much better than a syscall.
>>
> To evaluate the goodness of this, we really need a full
> set of measurements for:
>
> a) cost of rdtsc (and rdtscp if different)
> b) cost of vsyscall+pvclock
> c) cost of rdtsc emulated
> d) cost of a hypercall that returns "hypervisor system time"
>
> On a E6850 (3Ghz but let's use cycles), I measured;
>
> a == 72 cycles
> c == 1080 cycles
> d == 780 cycles
>
> It may be partly apples and oranges, but it looks
> like a good guess for b on my machine is
>
> b == 240 cycles
>

Two rdtscps should suffice (and I think they are much faster on modern
machines).

> Not bad, but is there any additional context switch
> cost to support it?
>

rdtscp requires an additional msr read/write on heavyweight host context
switches. Should be negligible compared to the savings.

>> From: Avi Kivity [mailto:avi [at] redhat]
>> Instead of using vgetcpu() and rdtsc() independently, you can
>> use rdtscp
>> to read both atomically. This removes the need for the
>> preempt notifier.
>>
> Xen does not currently expose rdtscp and so does not emulate
> (or context switch) TSC_AUX. Context switching TSC_AUX
> is certainly possible, but will likely be expensive.
> If the primary reason for vsyscall+pvclock is to maximize
> performance for gettimeofday/clock_gettime, this cost
> would need to be added to the mix.
>

It will cost ~100 cycles on heavyweight host context switch
(guest-to-guest).

>> preempt notifiers are per-thread, not global, and will upset
>> the cycle
>> counters. I'd drop them and use rdtscp instead (and give up if the
>> processor doesn't support it).
>>
> Even if rdtscp is used, in the Intel processor lineup
> only the very latest (Nehalem) supports rdtscp, so
> "give up" doesn't seem like a very good option, at least
> in the near future.
>

Why not? we still fall back to the guest kernel. By the time guest
kernels with rdtscp support are in the field, these machines will be
quiet old.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 6, 2009, 11:46 AM

Post #5 of 50 (762 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/06/09 02:04, Avi Kivity wrote:
> On 10/06/2009 02:50 AM, Jeremy Fitzhardinge wrote:
>> This patch allows the pvclock mechanism to be used in usermode. To
>> do this, we map an extra page into usermode containing an array of
>> pvclock_vcpu_time_info structures which give the information required
>> to compute a global system clock from the tsc. With this, we can
>> implement pvclock_clocksource_vread().
>>
>> One complication is that usermode is subject to two levels of
>> scheduling:
>> kernel scheduling of tasks onto vcpus, and hypervisor scheduling of
>> vcpus onto pcpus. In either case the underlying pcpu changed, and with
>> it, the correct set of parameters to compute tsc->system clock. To
>> address this we install a preempt notifier on sched_out to increment
>> that vcpu's version number. Usermode can then check the version number
>> is unchanged while computing the time and retry if it has (the only
>> difference from the kernel's version of the algorithm is that the vcpu
>> may have changed, so we may need to switch pvclock_vcpu_time_info
>> structures.
>>
>> To use this feature, hypervisor-specific code is required
>> to call pvclock_init_vsyscall(), and if successful:
>> - cause the pvclock_vcpu_time_info structure at
>> pvclock_get_vsyscall_time_info(cpu) to be updated appropriately for
>> each vcpu.
>> - use pvclock_clocksource_vread as the implementation of clocksource
>> .vread.
>>
>> +
>> +cycle_t __vsyscall_fn pvclock_clocksource_vread(void)
>> +{
>> + const struct pvclock_vcpu_time_info *pvti_base;
>> + const struct pvclock_vcpu_time_info *pvti;
>> + cycle_t ret;
>> + u32 version;
>> +
>> + pvti_base = (struct pvclock_vcpu_time_info
>> *)fix_to_virt(FIX_PVCLOCK_TIME_INFO);
>> +
>> + /*
>> + * When looping to get a consistent (time-info, tsc) pair, we
>> + * also need to deal with the possibility we can switch vcpus,
>> + * so make sure we always re-fetch time-info for the current vcpu.
>> + */
>> + do {
>> + unsigned cpu;
>> +
>> + vgetcpu(&cpu, NULL, NULL);
>> + pvti =&pvti_base[cpu];
>> +
>> + version = __pvclock_read_cycles(pvti,&ret);
>> + } while (unlikely(pvti->version != version));
>> +
>> + return ret;
>> +}
>>
>
> Instead of using vgetcpu() and rdtsc() independently, you can use
> rdtscp to read both atomically. This removes the need for the preempt
> notifier.

rdtscp first appeared on Intel with Nehalem, so we need to support older
Intel chips.

You could use rdscp to get (tsc,cpu) atomically, but that's not
sufficient to be able to get a consistent snapshot of (tsc, time_info)
because it doesn't give you the pvclock_vcpu_time_info version number.
If TSC_AUX contained that too, it might be possible. Alternatively you
could compare the tsc with pvclock.tsc_timestamp, but unfortunately the
ABI doesn't specify that tsc_timestamp is updated in any particular
order compared to the rest of the fields, so you still can't use that to
get a consistent snapshot (we can revise the ABI, of course).

So either way it doesn't avoid the need to iterate. vgetcpu will use
rdtscp if available, but I agree it is unfortunate we need to do a
redundant rdtsc in that case.

Avoiding the preempt notifier would be nice. Definitely worth a
followup-patch.

>> +
>> +/*
>> + * Initialize the generic pvclock vsyscall state. This will allocate
>> + * a/some page(s) for the per-vcpu pvclock information, set up a
>> + * fixmap mapping for the page(s)
>> + */
>> +int __init pvclock_init_vsyscall(void)
>> +{
>> + int cpu;
>> +
>> + /* Just one page for now */
>> + if (nr_cpu_ids * sizeof(struct vcpu_time_info)> PAGE_SIZE) {
>> + printk(KERN_WARNING "pvclock_vsyscall: too many CPUs to fit
>> time_info into a single page\n");
>> + return -ENOSPC;
>> + }
>> +
>> + pvclock_vsyscall_time_info =
>> + (struct pvclock_vcpu_time_info *)get_zeroed_page(GFP_KERNEL);
>> + if (pvclock_vsyscall_time_info == NULL)
>> + return -ENOMEM;
>> +
>>
>
> Need to align the vcpu_time_infos on a cacheline boundary.

OK.

>> + for (cpu = 0; cpu< nr_cpu_ids; cpu++)
>> + pvclock_vsyscall_time_info[cpu].version = ~0;
>> +
>> + __set_fixmap(FIX_PVCLOCK_TIME_INFO,
>> __pa(pvclock_vsyscall_time_info),
>> + PAGE_KERNEL_VSYSCALL);
>> +
>> + preempt_notifier_init(&pvclock_vsyscall_notifier,
>> + &pvclock_vsyscall_preempt_ops);
>> + preempt_notifier_register(&pvclock_vsyscall_notifier);
>> +
>>
>
> preempt notifiers are per-thread, not global, and will upset the cycle
> counters.

Ah, so I need to register it on every new thread? That's a bit awkward.

This is intended to satisfy the cycle-counters who want to do
gettimeofday a million times a second, where I guess the tradeoff of
avoiding a pile of syscalls is worth a bit of context-switch overhead.

> I'd drop them and use rdtscp instead (and give up if the processor
> doesn't support it).
>

None of my test machines have rdtscp, so that won't do ;)

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 7, 2009, 3:25 AM

Post #6 of 50 (758 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/06/2009 08:46 PM, Jeremy Fitzhardinge wrote:
>
>> Instead of using vgetcpu() and rdtsc() independently, you can use
>> rdtscp to read both atomically. This removes the need for the preempt
>> notifier.
>>
> rdtscp first appeared on Intel with Nehalem, so we need to support older
> Intel chips.
>

We can support them by falling back to the kernel. I'm a bit worried
about the kernel playing with the hypervisor's version field. It's
better to introduce yet a new version for the kernel, and check both.

> You could use rdscp to get (tsc,cpu) atomically, but that's not
> sufficient to be able to get a consistent snapshot of (tsc, time_info)
> because it doesn't give you the pvclock_vcpu_time_info version number.
> If TSC_AUX contained that too, it might be possible. Alternatively you
> could compare the tsc with pvclock.tsc_timestamp, but unfortunately the
> ABI doesn't specify that tsc_timestamp is updated in any particular
> order compared to the rest of the fields, so you still can't use that to
> get a consistent snapshot (we can revise the ABI, of course).
>
> So either way it doesn't avoid the need to iterate. vgetcpu will use
> rdtscp if available, but I agree it is unfortunate we need to do a
> redundant rdtsc in that case.
>
>

def try_pvclock_vtime():
tsc, p0 = rdtscp()
v0 = pvclock[p0].version
tsc, p = rdtscp()
t = pvclock_time(pvclock[p], tsc)
if p != p0 or pvclock[p].version != v0:
raise Exception("Processor or timebased change under our feet")
return t

def pvclock_time():
while True:
try:
return try_pvlock_time()
except:
pass

So, two rdtscps and two compares.

>>> + for (cpu = 0; cpu< nr_cpu_ids; cpu++)
>>> + pvclock_vsyscall_time_info[cpu].version = ~0;
>>> +
>>> + __set_fixmap(FIX_PVCLOCK_TIME_INFO,
>>> __pa(pvclock_vsyscall_time_info),
>>> + PAGE_KERNEL_VSYSCALL);
>>> +
>>> + preempt_notifier_init(&pvclock_vsyscall_notifier,
>>> +&pvclock_vsyscall_preempt_ops);
>>> + preempt_notifier_register(&pvclock_vsyscall_notifier);
>>> +
>>>
>> preempt notifiers are per-thread, not global, and will upset the cycle
>> counters.
>>
> Ah, so I need to register it on every new thread? That's a bit awkward.
>

It's used to manage processor registers, much like the fpu. If a thread
uses a register that's not saved and restored by the normal context
switch code, it can register a preempt notifier to do that instead.

> This is intended to satisfy the cycle-counters who want to do
> gettimeofday a million times a second, where I guess the tradeoff of
> avoiding a pile of syscalls is worth a bit of context-switch overhead.
>

It's sufficient to increment a version counter on thread migration, no
need to do it on context switch.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 7, 2009, 12:29 PM

Post #7 of 50 (759 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/09 03:25, Avi Kivity wrote:
> On 10/06/2009 08:46 PM, Jeremy Fitzhardinge wrote:
>>
>>> Instead of using vgetcpu() and rdtsc() independently, you can use
>>> rdtscp to read both atomically. This removes the need for the preempt
>>> notifier.
>>>
>> rdtscp first appeared on Intel with Nehalem, so we need to support older
>> Intel chips.
>>
>
> We can support them by falling back to the kernel.

Yes, but its easy enough to support them with the fast-path.

> I'm a bit worried about the kernel playing with the hypervisor's
> version field.

For Xen I explicitly made it not a problem by adding the notion of a
secondary pvclock_vcpu_time_info structure which is updated by copying,
aside from the version number which is updated as-is.

As far as I can tell it isn't a problem for KVM either. The guest
version number is atomic with respect to preemption by the hypervisor so
there's no scope for racing. (The ABI already guarantees that the
pvclock structures are never updated cross-cpu.)

It ultimately doesn't matter what the version number is so long as it
changes when the parameters are updated, and version numbers can't be
reused within a window where things get confused.

> It's better to introduce yet a new version for the kernel, and check
> both.

Two version numbers are awkward to read atomically at least on 32-bit.
And I don't think its necessary.

> def try_pvclock_vtime():
> tsc, p0 = rdtscp()
> v0 = pvclock[p0].version
> tsc, p = rdtscp()
> t = pvclock_time(pvclock[p], tsc)
> if p != p0 or pvclock[p].version != v0:
> raise Exception("Processor or timebased change under our feet")
> return t
>
> def pvclock_time():
> while True:
> try:
> return try_pvlock_time()
> except:
> pass
>
> So, two rdtscps and two compares.

Yep, that would work.

> It's sufficient to increment a version counter on thread migration, no
> need to do it on context switch.
>

That's true; switch_out is a pessimistic approximation of that. But is
there a convenient hook to test for migration?

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 7, 2009, 1:09 PM

Post #8 of 50 (759 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/2009 09:29 PM, Jeremy Fitzhardinge wrote:
>
>> I'm a bit worried about the kernel playing with the hypervisor's
>> version field.
>>
> For Xen I explicitly made it not a problem by adding the notion of a
> secondary pvclock_vcpu_time_info structure which is updated by copying,
> aside from the version number which is updated as-is.
>

When do you copy?

I'd rather have a single copy for guest and host.

> As far as I can tell it isn't a problem for KVM either. The guest
> version number is atomic with respect to preemption by the hypervisor so
> there's no scope for racing. (The ABI already guarantees that the
> pvclock structures are never updated cross-cpu.)
>
> It ultimately doesn't matter what the version number is so long as it
> changes when the parameters are updated, and version numbers can't be
> reused within a window where things get confused.
>

If the hypervisor does a pvclock->version = somethingelse->version++
then the guest may get confused. But I understand you have a
guest-private ->version?

>> It's better to introduce yet a new version for the kernel, and check
>> both.
>>
> Two version numbers are awkward to read atomically at least on 32-bit.
> And I don't think its necessary.
>

No need to read them atomically.

cpu1 = vgetcpu()
hver1 = pvclock[cpu1].hver
kver1 = pvclock[cpu1].kver
tsc = rdtsc
/* multipication magic with pvclock[cpu1]*/
cpu2 = vgetcpu()
hver2 = pvclock[cpu2].hver
kver2 = pvclock[cpu2].kver
valid = cpu1 == cpu2 && hver1 == hver2 && kver1 == kver2

>> It's sufficient to increment a version counter on thread migration, no
>> need to do it on context switch.
>>
>>
> That's true; switch_out is a pessimistic approximation of that. But is
> there a convenient hook to test for migration?
>

I'd guess not but it's probably easy to add one in the migration thread.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


dan.magenheimer at oracle

Oct 7, 2009, 1:48 PM

Post #9 of 50 (757 views)
Permalink
RE: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

> We can support them by falling back to the kernel. I'm a bit worried
> about the kernel playing with the hypervisor's version field. It's
> better to introduce yet a new version for the kernel, and check both.

On Nehalem, apps that need timestamp information at a high
frequency will likely use rdtsc/rdtscp directly.

I very much support Jeremy's efforts to make vsyscall+pvclock
work fast on processors other than the very newest ones.

Dan

> -----Original Message-----
> From: Avi Kivity [mailto:avi [at] redhat]
> Sent: Wednesday, October 07, 2009 4:26 AM
> To: Jeremy Fitzhardinge
> Cc: Jeremy Fitzhardinge; Dan Magenheimer; Xen-devel; Kurt Hackel; the
> arch/x86 maintainers; Linux Kernel Mailing List; Glauber de Oliveira
> Costa; Keir Fraser; Zach Brown; Chris Mason
> Subject: Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall
> implementation
>
>
> On 10/06/2009 08:46 PM, Jeremy Fitzhardinge wrote:
> >
> >> Instead of using vgetcpu() and rdtsc() independently, you can use
> >> rdtscp to read both atomically. This removes the need for
> the preempt
> >> notifier.
> >>
> > rdtscp first appeared on Intel with Nehalem, so we need to
> support older
> > Intel chips.
> >
>
> We can support them by falling back to the kernel. I'm a bit worried
> about the kernel playing with the hypervisor's version field. It's
> better to introduce yet a new version for the kernel, and check both.
>
> > You could use rdscp to get (tsc,cpu) atomically, but that's not
> > sufficient to be able to get a consistent snapshot of (tsc,
> time_info)
> > because it doesn't give you the pvclock_vcpu_time_info
> version number.
> > If TSC_AUX contained that too, it might be possible.
> Alternatively you
> > could compare the tsc with pvclock.tsc_timestamp, but
> unfortunately the
> > ABI doesn't specify that tsc_timestamp is updated in any particular
> > order compared to the rest of the fields, so you still
> can't use that to
> > get a consistent snapshot (we can revise the ABI, of course).
> >
> > So either way it doesn't avoid the need to iterate.
> vgetcpu will use
> > rdtscp if available, but I agree it is unfortunate we need to do a
> > redundant rdtsc in that case.
> >
> >
>
> def try_pvclock_vtime():
> tsc, p0 = rdtscp()
> v0 = pvclock[p0].version
> tsc, p = rdtscp()
> t = pvclock_time(pvclock[p], tsc)
> if p != p0 or pvclock[p].version != v0:
> raise Exception("Processor or timebased change under our feet")
> return t
>
> def pvclock_time():
> while True:
> try:
> return try_pvlock_time()
> except:
> pass
>
> So, two rdtscps and two compares.
>
> >>> + for (cpu = 0; cpu< nr_cpu_ids; cpu++)
> >>> + pvclock_vsyscall_time_info[cpu].version = ~0;
> >>> +
> >>> + __set_fixmap(FIX_PVCLOCK_TIME_INFO,
> >>> __pa(pvclock_vsyscall_time_info),
> >>> + PAGE_KERNEL_VSYSCALL);
> >>> +
> >>> + preempt_notifier_init(&pvclock_vsyscall_notifier,
> >>> +&pvclock_vsyscall_preempt_ops);
> >>> + preempt_notifier_register(&pvclock_vsyscall_notifier);
> >>> +
> >>>
> >> preempt notifiers are per-thread, not global, and will
> upset the cycle
> >> counters.
> >>
> > Ah, so I need to register it on every new thread? That's a
> bit awkward.
> >
>
> It's used to manage processor registers, much like the fpu.
> If a thread
> uses a register that's not saved and restored by the normal context
> switch code, it can register a preempt notifier to do that instead.
>
> > This is intended to satisfy the cycle-counters who want to do
> > gettimeofday a million times a second, where I guess the tradeoff of
> > avoiding a pile of syscalls is worth a bit of
> context-switch overhead.
> >
>
> It's sufficient to increment a version counter on thread
> migration, no
> need to do it on context switch.
>
> --
> Do not meddle in the internals of kernels, for they are
> subtle and quick to panic.
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 7, 2009, 2:08 PM

Post #10 of 50 (758 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/2009 10:48 PM, Dan Magenheimer wrote:
>> We can support them by falling back to the kernel. I'm a bit worried
>> about the kernel playing with the hypervisor's version field. It's
>> better to introduce yet a new version for the kernel, and check both.
>>
> On Nehalem, apps that need timestamp information at a high
> frequency will likely use rdtsc/rdtscp directly.
>
>

Then they will get incorrect timing once they are live migrated.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 7, 2009, 2:19 PM

Post #11 of 50 (761 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/09 13:09, Avi Kivity wrote:
> On 10/07/2009 09:29 PM, Jeremy Fitzhardinge wrote:
>>
>>> I'm a bit worried about the kernel playing with the hypervisor's
>>> version field.
>>>
>> For Xen I explicitly made it not a problem by adding the notion of a
>> secondary pvclock_vcpu_time_info structure which is updated by copying,
>> aside from the version number which is updated as-is.
>>
>
> When do you copy?
>
> I'd rather have a single copy for guest and host.

When Xen updates the parameters normally. The interface never really
needed to share the memory between hypervisor and guest, and I think
avoiding it is a bit more robust.

But for KVM, you already use the MSR to place the pvclock_vcpu_time_info
structure, so you could just place it in the page and use the same
memory for kernel and usermode.

> If the hypervisor does a pvclock->version = somethingelse->version++
> then the guest may get confused. But I understand you have a
> guest-private ->version?

The guest should never get confused by the version being changed by the
hypervisor. It's already part of the ABI. Or did you mean something else?

I'm not sure what you mean by "guest-private version"; the versions are
always guest-private: te version is part of the pvclock structure,
which is per-vcpu, which is private to each guest. The guest nevern
maintains a separate long-term copy of the structure, only a transient
snapshot while computing time from the tsc (that's the current pvclock.c
code).

> No need to read them atomically.
>
> cpu1 = vgetcpu()
> hver1 = pvclock[cpu1].hver
> kver1 = pvclock[cpu1].kver
> tsc = rdtsc
> /* multipication magic with pvclock[cpu1]*/
> cpu2 = vgetcpu()
> hver2 = pvclock[cpu2].hver
> kver2 = pvclock[cpu2].kver
> valid = cpu1 == cpu2 && hver1 == hver2 && kver1 == kver2

I don't think that's necessary, but I can certainly live with it if it
makes you happier.

>>> It's sufficient to increment a version counter on thread migration, no
>>> need to do it on context switch.
>>>
>>>
>> That's true; switch_out is a pessimistic approximation of that. But is
>> there a convenient hook to test for migration?
>>
>
> I'd guess not but it's probably easy to add one in the migration thread.

Looking at that now.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 7, 2009, 2:37 PM

Post #12 of 50 (757 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/2009 11:19 PM, Jeremy Fitzhardinge wrote:
>
>> When do you copy?
>>
>> I'd rather have a single copy for guest and host.
>>
> When Xen updates the parameters normally. The interface never really
> needed to share the memory between hypervisor and guest, and I think
> avoiding it is a bit more robust.
>
> But for KVM, you already use the MSR to place the pvclock_vcpu_time_info
> structure, so you could just place it in the page and use the same
> memory for kernel and usermode.
>

Yes.

>> If the hypervisor does a pvclock->version = somethingelse->version++
>> then the guest may get confused. But I understand you have a
>> guest-private ->version?
>>
> The guest should never get confused by the version being changed by the
> hypervisor. It's already part of the ABI. Or did you mean something else?
>

If the guest does a RMW on the version, but the host does not (copying
it from somewhere else), then the guest RMW can be lost.

Looking at the code, that's what kvm does:

vcpu->hv_clock.version += 2;

shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);

memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));

so a guest-side ++version can be lost.

> I'm not sure what you mean by "guest-private version"; the versions are
> always guest-private: te version is part of the pvclock structure,
> which is per-vcpu, which is private to each guest. The guest nevern
> maintains a separate long-term copy of the structure, only a transient
> snapshot while computing time from the tsc (that's the current pvclock.c
> code).
>

Same for kvm. I'm not worried about cross-guest corruption, just the
guest and host working together to confuse the guest.

>> No need to read them atomically.
>>
>> cpu1 = vgetcpu()
>> hver1 = pvclock[cpu1].hver
>> kver1 = pvclock[cpu1].kver
>> tsc = rdtsc
>> /* multipication magic with pvclock[cpu1]*/
>> cpu2 = vgetcpu()
>> hver2 = pvclock[cpu2].hver
>> kver2 = pvclock[cpu2].kver
>> valid = cpu1 == cpu2&& hver1 == hver2&& kver1 == kver2
>>
> I don't think that's necessary, but I can certainly live with it if it
> makes you happier.
>

I think the version issue requires it.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 7, 2009, 2:51 PM

Post #13 of 50 (761 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/09 14:37, Avi Kivity wrote:
> If the guest does a RMW on the version, but the host does not (copying
> it from somewhere else), then the guest RMW can be lost.
>
> Looking at the code, that's what kvm does:
>
> vcpu->hv_clock.version += 2;
>
> shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
>
> memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
> sizeof(vcpu->hv_clock));
>
> so a guest-side ++version can be lost.

I see, yes. The Xen code explicitly reads back the guest version and
increments that (I realize now that's what you meant by guest-private
version). If you were to have a second version number it would have to
be separated as well to avoid being overwritten by the hypervisor.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 7, 2009, 2:53 PM

Post #14 of 50 (760 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/2009 11:51 PM, Jeremy Fitzhardinge wrote:
> On 10/07/09 14:37, Avi Kivity wrote:
>
>> If the guest does a RMW on the version, but the host does not (copying
>> it from somewhere else), then the guest RMW can be lost.
>>
>> Looking at the code, that's what kvm does:
>>
>> vcpu->hv_clock.version += 2;
>>
>> shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
>>
>> memcpy(shared_kaddr + vcpu->time_offset,&vcpu->hv_clock,
>> sizeof(vcpu->hv_clock));
>>
>> so a guest-side ++version can be lost.
>>
> I see, yes. The Xen code explicitly reads back the guest version and
> increments that (I realize now that's what you meant by guest-private
> version). If you were to have a second version number it would have to
> be separated as well to avoid being overwritten by the hypervisor.
>

Yes. We have the space since a cacheline is 64 bytes (minimum) vs 32
bytes of pvclock data.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


dan.magenheimer at oracle

Oct 7, 2009, 3:36 PM

Post #15 of 50 (757 views)
Permalink
RE: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

> Then they will get incorrect timing once they are live migrated.

I've posted a proposed (OS-independent) solution for that and
am (slowly) in the process of implementing it.

> -----Original Message-----
> From: Avi Kivity [mailto:avi [at] redhat]
> Sent: Wednesday, October 07, 2009 3:08 PM
> To: Dan Magenheimer
> Cc: Jeremy Fitzhardinge; Jeremy Fitzhardinge; Xen-devel; Kurt Hackel;
> the arch/x86 maintainers; Linux Kernel Mailing List; Glauber
> de Oliveira
> Costa; Keir Fraser; Zach Brown; Chris Mason
> Subject: Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall
> implementation
>
>
> On 10/07/2009 10:48 PM, Dan Magenheimer wrote:
> >> We can support them by falling back to the kernel. I'm a
> bit worried
> >> about the kernel playing with the hypervisor's version field. It's
> >> better to introduce yet a new version for the kernel, and
> check both.
> >>
> > On Nehalem, apps that need timestamp information at a high
> > frequency will likely use rdtsc/rdtscp directly.
> >
> >
>
> Then they will get incorrect timing once they are live migrated.
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 9, 2009, 5:24 PM

Post #16 of 50 (752 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/07/09 03:25, Avi Kivity wrote:
> def try_pvclock_vtime():
> tsc, p0 = rdtscp()
> v0 = pvclock[p0].version
> tsc, p = rdtscp()
> t = pvclock_time(pvclock[p], tsc)
> if p != p0 or pvclock[p].version != v0:
> raise Exception("Processor or timebased change under our feet")
> return t

This doesn't quite work.

If we end up migrating some time after the first rdtscp, then the
accesses to pvclock[] will be cross-cpu. Since we don't made any strong
SMP memory ordering guarantees on updating the structure, the snapshot
isn't guaranteed to be consistent even if we re-check the version at the
end.

So to use rdtscp we need to either redefine the update of
pvclock_vcpu_time_info to be SMP-safe, or keep the additional migration
check.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 10, 2009, 11:10 AM

Post #17 of 50 (750 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/10/2009 02:24 AM, Jeremy Fitzhardinge wrote:
> On 10/07/09 03:25, Avi Kivity wrote:
>
>> def try_pvclock_vtime():
>> tsc, p0 = rdtscp()
>> v0 = pvclock[p0].version
>> tsc, p = rdtscp()
>> t = pvclock_time(pvclock[p], tsc)
>> if p != p0 or pvclock[p].version != v0:
>> raise Exception("Processor or timebased change under our feet")
>> return t
>>
> This doesn't quite work.
>
> If we end up migrating some time after the first rdtscp, then the
> accesses to pvclock[] will be cross-cpu. Since we don't made any strong
> SMP memory ordering guarantees on updating the structure, the snapshot
> isn't guaranteed to be consistent even if we re-check the version at the
> end.
>

We only hit this if we have a double migration, otherwise we see p != p0.

Most likely all existing implementations do have a write barrier on the
guest entry path, so if we add a read barrier between the two compares,
that ensures we're reading from the same cpu again.

> So to use rdtscp we need to either redefine the update of
> pvclock_vcpu_time_info to be SMP-safe, or keep the additional migration
> check.
>

I think we can update the ABI after verifying all implementations do
have a write barrier.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 12, 2009, 11:20 AM

Post #18 of 50 (747 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/10/09 11:10, Avi Kivity wrote:
> On 10/10/2009 02:24 AM, Jeremy Fitzhardinge wrote:
>> On 10/07/09 03:25, Avi Kivity wrote:
>>
>>> def try_pvclock_vtime():
>>> tsc, p0 = rdtscp()
>>> v0 = pvclock[p0].version
>>> tsc, p = rdtscp()
>>> t = pvclock_time(pvclock[p], tsc)
>>> if p != p0 or pvclock[p].version != v0:
>>> raise Exception("Processor or timebased change under our feet")
>>> return t
>>>
>> This doesn't quite work.
>>
>> If we end up migrating some time after the first rdtscp, then the
>> accesses to pvclock[] will be cross-cpu. Since we don't made any strong
>> SMP memory ordering guarantees on updating the structure, the snapshot
>> isn't guaranteed to be consistent even if we re-check the version at the
>> end.
>>
>
> We only hit this if we have a double migration, otherwise we see p != p0.
>
> Most likely all existing implementations do have a write barrier on
> the guest entry path, so if we add a read barrier between the two
> compares, that ensures we're reading from the same cpu again.

There's a second problem: If the time_info gets updated between the
first rdtscp and the first version fetch, then we won't have a
consistent tsc,time_info pair. You could check if tsc_timestamp is >
tsc, but that won't necessarily work on save/restore/migrate.

>> So to use rdtscp we need to either redefine the update of
>> pvclock_vcpu_time_info to be SMP-safe, or keep the additional migration
>> check.
>>
>
> I think we can update the ABI after verifying all implementations do
> have a write barrier.
>

I suppose that works if you assume that:

1. every task->vcpu migration is associated with a hv/guest context
switch, and
2. every hv/guest context switch is a write barrier

I guess 2 is a given, but I can at least imagine cases where 1 might not
be true. Maybe. It all seems very subtle.

And I don't really see a gain. You avoid maintaining a second version
number, but at the cost of two rdtscps. In my measurements, the whole
vsyscall takes around 100ns to run, and a single rdtsc takes about 30,
so 30% of total. Unlike rdtsc, rdtscp is documented as being ordered in
the instruction stream, and so will take at least as long; two of them
will completely blow the vsyscall execution time.

(By contrast, lsl only takes around 10ns, which suggests it should be
used preferentially in vgetcpu anyway.)

AMD CPUs have traditionally been much better than Intel at these kinds
of things, so maybe rdtscp makes sense there. Or maybe Nehalem is much
better than my Core2 Q6600.

J

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 12, 2009, 11:29 AM

Post #19 of 50 (745 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/12/2009 08:20 PM, Jeremy Fitzhardinge wrote:
> On 10/10/09 11:10, Avi Kivity wrote:
>
>> On 10/10/2009 02:24 AM, Jeremy Fitzhardinge wrote:
>>
>>> On 10/07/09 03:25, Avi Kivity wrote:
>>>
>>>
>>>> def try_pvclock_vtime():
>>>> tsc, p0 = rdtscp()
>>>> v0 = pvclock[p0].version
>>>> tsc, p = rdtscp()
>>>> t = pvclock_time(pvclock[p], tsc)
>>>> if p != p0 or pvclock[p].version != v0:
>>>> raise Exception("Processor or timebased change under our feet")
>>>> return t
>>>>
> There's a second problem: If the time_info gets updated between the
> first rdtscp and the first version fetch, then we won't have a
> consistent tsc,time_info pair. You could check if tsc_timestamp is>
> tsc, but that won't necessarily work on save/restore/migrate.
>

Good catch. Doesn't that invalidate rdtscp based vgettimeofday on
non-virt as well (assuming p == cpu)?

> I suppose that works if you assume that:
>
> 1. every task->vcpu migration is associated with a hv/guest context
> switch, and
> 2. every hv/guest context switch is a write barrier
>
> I guess 2 is a given, but I can at least imagine cases where 1 might not
> be true. Maybe. It all seems very subtle.
>

What is 1 exactly? task switching to another vcpu? that doesn't incur
hypervisor involvement. vcpu moving to another cpu? That does.

> And I don't really see a gain. You avoid maintaining a second version
> number, but at the cost of two rdtscps. In my measurements, the whole
> vsyscall takes around 100ns to run, and a single rdtsc takes about 30,
> so 30% of total. Unlike rdtsc, rdtscp is documented as being ordered in
> the instruction stream, and so will take at least as long; two of them
> will completely blow the vsyscall execution time.
>

I agree, let's stick with the rdtscpless implementation.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 12, 2009, 12:13 PM

Post #20 of 50 (749 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/12/09 11:29, Avi Kivity wrote:
> Good catch. Doesn't that invalidate rdtscp based vgettimeofday on
> non-virt as well (assuming p == cpu)?

The tsc clocksource assumes the tsc is (mostly?) synced; it doesn't use
rdtscp or make any attempt at per-cpu corrections.

>> I suppose that works if you assume that:
>>
>> 1. every task->vcpu migration is associated with a hv/guest context
>> switch, and
>> 2. every hv/guest context switch is a write barrier
>>
>> I guess 2 is a given, but I can at least imagine cases where 1 might not
>> be true. Maybe. It all seems very subtle.
>>
>
> What is 1 exactly? task switching to another vcpu? that doesn't
> incur hypervisor involvement. vcpu moving to another cpu? That does.
Aie... OK. So no barrier is required for a task double migration on
vcpus, because it ends up on the same pcpu and the ordering is local; if
there's a vcpu migration to a new pcpu in there too, then we always
expect a barrier.

>> And I don't really see a gain. You avoid maintaining a second version
>> number, but at the cost of two rdtscps. In my measurements, the whole
>> vsyscall takes around 100ns to run, and a single rdtsc takes about 30,
>> so 30% of total. Unlike rdtsc, rdtscp is documented as being ordered in
>> the instruction stream, and so will take at least as long; two of them
>> will completely blow the vsyscall execution time.
>>
>
> I agree, let's stick with the rdtscpless implementation.

OK, I'll use PeterZ's hint to try and find a more complete set of
migration points.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 12, 2009, 11:39 PM

Post #21 of 50 (745 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/12/2009 09:13 PM, Jeremy Fitzhardinge wrote:
> On 10/12/09 11:29, Avi Kivity wrote:
>
>> Good catch. Doesn't that invalidate rdtscp based vgettimeofday on
>> non-virt as well (assuming p == cpu)?
>>
> The tsc clocksource assumes the tsc is (mostly?) synced; it doesn't use
> rdtscp or make any attempt at per-cpu corrections.
>

So it's broken or disabled when that assumption is wrong? We could
easily fix that now. Might even reuse the pvclock structures.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 13, 2009, 1:00 PM

Post #22 of 50 (732 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/12/09 23:39, Avi Kivity wrote:
> On 10/12/2009 09:13 PM, Jeremy Fitzhardinge wrote:
>> On 10/12/09 11:29, Avi Kivity wrote:
>>
>>> Good catch. Doesn't that invalidate rdtscp based vgettimeofday on
>>> non-virt as well (assuming p == cpu)?
>>>
>> The tsc clocksource assumes the tsc is (mostly?) synced; it doesn't use
>> rdtscp or make any attempt at per-cpu corrections.
>>
>
> So it's broken or disabled when that assumption is wrong? We could
> easily fix that now. Might even reuse the pvclock structures.

Well, the kernel internally makes more or less the same assumption; the
vsyscall clocksource is the same as the kernel's internal one. I think
idea is that it just drops back to something like hpet if the tsc
doesn't have very simple SMP characteristics.

If the kernel could characterize the per-cpu properties of the tsc more
accurately, then it could use the pvclock mechanism on native.

J

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


avi at redhat

Oct 14, 2009, 5:32 AM

Post #23 of 50 (722 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/14/2009 05:00 AM, Jeremy Fitzhardinge wrote:
>
>> So it's broken or disabled when that assumption is wrong? We could
>> easily fix that now. Might even reuse the pvclock structures.
>>
> Well, the kernel internally makes more or less the same assumption; the
> vsyscall clocksource is the same as the kernel's internal one. I think
> idea is that it just drops back to something like hpet if the tsc
> doesn't have very simple SMP characteristics.
>
> If the kernel could characterize the per-cpu properties of the tsc more
> accurately, then it could use the pvclock mechanism on native.
>

It does - that's how kvm implements pvclock on the host side. See
kvmclock_cpufreq_notifier() in arch/x86/kvm/x86.c.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


jeremy at goop

Oct 15, 2009, 12:17 PM

Post #24 of 50 (715 views)
Permalink
Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

On 10/14/09 05:32, Avi Kivity wrote:
> On 10/14/2009 05:00 AM, Jeremy Fitzhardinge wrote:
>>
>>> So it's broken or disabled when that assumption is wrong? We could
>>> easily fix that now. Might even reuse the pvclock structures.
>>>
>> Well, the kernel internally makes more or less the same assumption; the
>> vsyscall clocksource is the same as the kernel's internal one. I think
>> idea is that it just drops back to something like hpet if the tsc
>> doesn't have very simple SMP characteristics.
>>
>> If the kernel could characterize the per-cpu properties of the tsc more
>> accurately, then it could use the pvclock mechanism on native.
>>
>
> It does - that's how kvm implements pvclock on the host side. See
> kvmclock_cpufreq_notifier() in arch/x86/kvm/x86.c.

The tsc clocksource currently seems a fair bit more picky though; it
doesn't attempt to sync tscs or work out their relative offsets or
rates. Which seems a bit defeatist.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/


dan.magenheimer at oracle

Oct 27, 2009, 10:29 AM

Post #25 of 50 (647 views)
Permalink
RE: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall implementation [In reply to]

Is there any way for an application to conclusively determine
programmatically if the "fast vsyscall" pvclock is functional
vs the much much slower gettimeofday/clock_gettime equivalents?

If not, might it be possible to implement some (sysfs?)
way to determine this, that would also be backwards compatible
to existing OS's that don't have pvclock+vsyscall supported?

Thanks,
Dan

> -----Original Message-----
> From: Avi Kivity [mailto:avi [at] redhat]
> Sent: Wednesday, October 14, 2009 6:33 AM
> To: Jeremy Fitzhardinge
> Cc: Dan Magenheimer; Jeremy Fitzhardinge; Kurt Hackel; the arch/x86
> maintainers; Linux Kernel Mailing List; Glauber de Oliveira Costa;
> Xen-devel; Keir Fraser; Zach Brown; Chris Mason; Ingo Molnar
> Subject: Re: [Xen-devel] Re: [PATCH 3/5] x86/pvclock: add vsyscall
> implementation
>
>
> On 10/14/2009 05:00 AM, Jeremy Fitzhardinge wrote:
> >
> >> So it's broken or disabled when that assumption is wrong? We could
> >> easily fix that now. Might even reuse the pvclock structures.
> >>
> > Well, the kernel internally makes more or less the same
> assumption; the
> > vsyscall clocksource is the same as the kernel's internal
> one. I think
> > idea is that it just drops back to something like hpet if the tsc
> > doesn't have very simple SMP characteristics.
> >
> > If the kernel could characterize the per-cpu properties of
> the tsc more
> > accurately, then it could use the pvclock mechanism on native.
> >
>
> It does - that's how kvm implements pvclock on the host side. See
> kvmclock_cpufreq_notifier() in arch/x86/kvm/x86.c.
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo [at] vger
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

First page Previous page 1 2 Next page Last page  View All Linux kernel RSS feed   Index | Next | Previous | View Threaded
 
 


Interested in having your list archived? Contact Gossamer Threads
 
  Web Applications & Managed Hosting Powered by Gossamer Threads Inc.