/* -*- linux-c -*- * time-estimation with minimal dependency on xtime * Copyright (C) 2006 Intel Corporation. * Copyright (C) 2010-2018 Red Hat Inc. * * This file is part of systemtap, and is free software. You can * redistribute it and/or modify it under the terms of the GNU General * Public License (GPL); either version 2, or (at your option) any * later version. */ #if defined (__i386__) || defined (__x86_64__) #include #endif #if defined (STAPCONF_TSC_KHZ) && \ !(defined (__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21)) // x86_64 didn't need a specific header until 2.6.21. Otherwise: #include #endif #ifdef STAPCONF_KTIME_GET_REAL #include #endif #include /* The interval at which the __stp_time_timer_callback routine runs, which resynchronizes our per-cpu base_ns/base_cycles values. A lower rate (higher interval) is sufficient if we get cpufreq-change notifications. */ #ifndef STP_TIME_SYNC_INTERVAL_NOCPUFREQ #define STP_TIME_SYNC_INTERVAL_NOCPUFREQ ((HZ+9)/10) /* ten times per second */ #endif #ifndef STP_TIME_SYNC_INTERVAL_CPUFREQ #define STP_TIME_SYNC_INTERVAL_CPUFREQ (HZ*10) /* once per ten seconds */ #endif static int __stp_cpufreq_notifier_registered = 0; #ifndef STP_TIME_SYNC_INTERVAL #define STP_TIME_SYNC_INTERVAL (__stp_cpufreq_notifier_registered ? \ STP_TIME_SYNC_INTERVAL_CPUFREQ : \ STP_TIME_SYNC_INTERVAL_NOCPUFREQ) #endif #ifndef NSEC_PER_MSEC #define NSEC_PER_MSEC 1000000L #endif typedef struct __stp_time_t { /* * A write lock is taken by __stp_time_timer_callback() and * __stp_time_cpufreq_callback(). Neither writer is in interrupt context, * and both disable interrupts before taking the lock, so there should be * no opportunity for deadlock. * * A read lock is taken by _stp_gettimeofday_us(). There is the potential * for this to occur at any time, so there is a slim chance that this will * happen while the write lock is held, and it will be impossible to get a * read lock. However, we can limit how long we try to get the lock to * avoid a deadlock. * * Note that seqlock is chosen so that readers don't block writers. It's * also important that readers can attempt a lock from _any_ context (e.g., * NMI), and some kernels don't have read_trylock. */ seqlock_t lock; /* These provide a reference time to correlate cycles to real time */ int64_t base_ns; cycles_t base_cycles; /* The frequency in kHz of this CPU's time stamp counter, for interpolating * cycle counts from the base time. */ unsigned int freq; /* Callback used to schedule updates of the base time */ struct timer_list timer; } stp_time_t; static void *stp_time = NULL; /* Try to estimate the number of CPU cycles in a millisecond - i.e. kHz. This * relies heavily on the accuracy of udelay. By calling udelay twice, we * attempt to account for overhead in the call. * * NB: interrupts should be disabled when calling this. * * FIXME: This not very accurate on Xen kernels! */ static unsigned int __stp_get_freq(void) { // If we can get the actual frequency of HW counter, we use it. #if defined (__ia64__) return local_cpu_data->itc_freq / 1000; #elif defined (__s390__) || defined (__s390x__) || defined (__arm__) // We don't need to find the cpu freq on s390 as the // TOD clock is always a fix freq. (see: POO pg 4-36.) return 0; #elif (defined (__i386__) || defined (__x86_64__)) && defined(STAPCONF_TSC_KHZ) return tsc_khz; #elif (defined (__i386__) || defined (__x86_64__)) && defined(STAPCONF_CPU_KHZ) return cpu_khz; #else /* __i386__ || __x86_64__ || __aarch64__ */ // If we don't know the actual frequency, we estimate it. cycles_t beg, mid, end; beg = get_cycles(); barrier(); udelay(1); barrier(); mid = get_cycles(); barrier(); udelay(1001); barrier(); end = get_cycles(); barrier(); return (beg - 2*mid + end); #endif } static void __stp_ktime_get_real_ts(struct timespec *ts) { #ifdef STAPCONF_KTIME_GET_REAL ktime_get_real_ts(ts); #else /* STAPCONF_KTIME_GET_REAL */ struct timeval tv; do_gettimeofday(&tv); ts->tv_sec = tv.tv_sec; ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; #endif } /* Update this cpu's base_ns/base_cycles values. May be called from initialization or various other callback mechanisms. */ static stp_time_t* __stp_time_local_update(void) { unsigned long flags; stp_time_t *time; struct timespec ts; int64_t ns; cycles_t cycles; // PR20600: We used to disable interrupts here with // local_irq_save(), then call write_seqlock_irqsave() further // down when we write to 'time'. However, this caused intermittent // hangs. Only disabling interrupts once here seems to fix the // problem, at the cost of holding time->lock a bit longer than we // originally did. time = per_cpu_ptr(stp_time, smp_processor_id()); write_seqlock_irqsave(&time->lock, flags); __stp_ktime_get_real_ts(&ts); cycles = get_cycles(); ns = (NSEC_PER_SEC * (int64_t)ts.tv_sec) + ts.tv_nsec; time->base_ns = ns; time->base_cycles = cycles; write_sequnlock_irqrestore(&time->lock, flags); return time; } /* The cross-smp call. */ static void __stp_time_smp_callback(void *val) { (void) val; (void) __stp_time_local_update(); } /* The timer callback is in a softIRQ -- interrupts enabled. */ static void __stp_time_timer_callback(stp_timer_callback_parameter_t unused) { stp_time_t *time =__stp_time_local_update(); (void) unused; /* PR6481: make sure IRQs are enabled before resetting the timer (IRQs are disabled and then reenabled in __stp_time_local_update()). XXX: The worst that can probably happen is that we get two consecutive timer resets. */ if (likely(atomic_read(session_state()) != STAP_SESSION_STOPPED)) mod_timer(&time->timer, jiffies + STP_TIME_SYNC_INTERVAL); #ifdef DEBUG_TIME _stp_warn("cpu%d %p khz=%d base=%lld cycles=%lld\n", smp_processor_id(), (void*)time, time->freq, (long long)time->base_ns, (long long)time->base_cycles); #endif } /* This is called as an IPI, with interrupts disabled. */ static void __stp_init_time(void *info) { struct timespec ts; // XXX: The following code replaces the call to __stp_time_local_update(): #if 1 int64_t ns; cycles_t cycles; #endif stp_time_t *time = per_cpu_ptr(stp_time, smp_processor_id()); #if 1 // Note -- this is the same calculation as in __stp_time_local_update: __stp_ktime_get_real_ts(&ts); cycles = get_cycles(); ns = (NSEC_PER_SEC * (int64_t)ts.tv_sec) + ts.tv_nsec; time->base_ns = ns; time->base_cycles = cycles; #endif seqlock_init(&time->lock); time->freq = __stp_get_freq(); // XXX: The following is definitely not legal on kernel-rt, // since seqlock cannot be used when interrupts are disabled. // // In addition, this is also not recommended to do in a function // being called via on_each_cpu() / smp_call_function() // -- see comment in Linux kernel/smp.c //__stp_time_local_update(); timer_setup(&time->timer, __stp_time_timer_callback, 0); time->timer.expires = jiffies + STP_TIME_SYNC_INTERVAL; #ifndef STAPCONF_ADD_TIMER_ON add_timer(&time->timer); #endif } #ifdef CONFIG_CPU_FREQ /* The cpufreq callback is not in interrupt context -- interrupts enabled */ static int __stp_time_cpufreq_callback(struct notifier_block *self, unsigned long state, void *vfreqs) { unsigned long flags; struct cpufreq_freqs *freqs; int freq_khz; stp_time_t *time; struct timespec ts; int64_t ns; cycles_t cycles; int reset_timer_p = 0; switch (state) { case CPUFREQ_POSTCHANGE: #ifdef CPUFREQ_RESUMECHANGE case CPUFREQ_RESUMECHANGE: #endif freqs = (struct cpufreq_freqs *)vfreqs; freq_khz = freqs->new; time = per_cpu_ptr(stp_time, freqs->cpu); write_seqlock_irqsave(&time->lock, flags); if (time->freq != freq_khz) { time->freq = freq_khz; // NB: freqs->cpu may not equal smp_processor_id(), // so we can't update the subject processor's // base_ns/base_cycles values just now. reset_timer_p = 1; } write_sequnlock_irqrestore(&time->lock, flags); if (reset_timer_p) { #ifdef DEBUG_TIME _stp_warn ("cpu%d %p freq->%d\n", freqs->cpu, (void*)time, freqs->new); #endif #if defined(STAPCONF_SMPCALL_5ARGS) || defined(STAPCONF_SMPCALL_4ARGS) (void) smp_call_function_single (freqs->cpu, &__stp_time_smp_callback, 0, #ifdef STAPCONF_SMPCALL_5ARGS 1, /* nonatomic */ #endif 0); /* not wait */ #else /* RHEL4ish: cannot direct to a single cpu ... so broadcast to them all */ (void) smp_call_function (&__stp_time_smp_callback, NULL, 0, 0); #endif } break; } return NOTIFY_OK; } static struct notifier_block __stp_time_notifier = { .notifier_call = __stp_time_cpufreq_callback, }; static int __stp_constant_freq(void) { #ifdef STAPCONF_CONSTANT_TSC // If the CPU has constant tsc, we don't need to use cpufreq. return boot_cpu_has(X86_FEATURE_CONSTANT_TSC); #elif defined (__ia64__) || defined (__s390__) || defined (__s390x__) || defined (__arm__) // these architectures have constant time counter. return 1; #else return 0; #endif } #endif /* CONFIG_CPU_FREQ */ /* This function is called during module unloading. */ static void _stp_kill_time(void) { if (stp_time) { int cpu; for_each_online_cpu(cpu) { stp_time_t *time = per_cpu_ptr(stp_time, cpu); del_timer_sync(&time->timer); } #ifdef CONFIG_CPU_FREQ if (!__stp_constant_freq() && __stp_cpufreq_notifier_registered) { cpufreq_unregister_notifier(&__stp_time_notifier, CPUFREQ_TRANSITION_NOTIFIER); } #endif _stp_free_percpu(stp_time); stp_time = NULL; } } /* This function is called during module loading. */ static int _stp_init_time(void) { int cpu, ret = 0; might_sleep(); _stp_kill_time(); stp_time = _stp_alloc_percpu(sizeof(stp_time_t)); if (unlikely(stp_time == 0)) return -1; #ifdef STAPCONF_ONEACHCPU_RETRY ret = on_each_cpu(__stp_init_time, NULL, 0, 1); #else ret = on_each_cpu(__stp_init_time, NULL, 1); #endif #ifdef STAPCONF_ADD_TIMER_ON for_each_online_cpu(cpu) { stp_time_t *time = per_cpu_ptr(stp_time, cpu); add_timer_on(&time->timer, cpu); } #endif #ifdef CONFIG_CPU_FREQ if (!ret && !__stp_constant_freq()) { if (!cpufreq_register_notifier(&__stp_time_notifier, CPUFREQ_TRANSITION_NOTIFIER)) { __stp_cpufreq_notifier_registered = 1; for_each_online_cpu(cpu) { unsigned long flags; int freq_khz = cpufreq_get(cpu); // may block if (freq_khz > 0) { stp_time_t *time = per_cpu_ptr(stp_time, cpu); write_seqlock_irqsave(&time->lock, flags); time->freq = freq_khz; write_sequnlock_irqrestore(&time->lock, flags); } } } } #endif if (ret) _stp_kill_time(); return ret; } #ifndef STP_TIMELOCKDELAY #define STP_TIMELOCKDELAY 100 /* ns */ #endif #ifndef STP_TIMELOCKTRIES #define STP_TIMELOCKTRIES 10 /* total 1 us */ #endif static int64_t _stp_gettimeofday_ns(void) { int64_t base; cycles_t last; uint64_t delta; unsigned int freq; unsigned int seq; stp_time_t *time; int i = 0; if (!stp_time) return -1; preempt_disable(); /* XXX: why? Isn't this is only run from probe handlers? */ time = per_cpu_ptr(stp_time, smp_processor_id()); seq = read_seqbegin(&time->lock); base = time->base_ns; last = time->base_cycles; freq = time->freq; while (unlikely(read_seqretry(&time->lock, seq))) { if (/* very */ unlikely(++i >= STP_TIMELOCKTRIES)) { preempt_enable_no_resched(); _stp_warn ("_stp_gettimofday_ns seqlock timeout; see STP_TIMELOCK*"); return 0; } ndelay(STP_TIMELOCKDELAY); seq = read_seqbegin(&time->lock); base = time->base_ns; last = time->base_cycles; freq = time->freq; } delta = get_cycles() - last; preempt_enable_no_resched(); #if defined (__s390__) || defined (__s390x__) // The TOD clock on the s390 (read by get_cycles() ) // is converted to a nano-second value using the following: // (get_cycles() * 125) >> 7; delta = (delta * 125) >> 7; #elif defined (__arm__) /* arm always returns 0 for get_cycles() */ /* so this is just a fake value until we get a real fix. */ delta = 1000; #else /* __s390__ || __s390x__ */ // Verify units: // (D cycles) * (1E6 ns/ms) / (F cycles/ms [kHz]) = ns delta *= NSEC_PER_MSEC; if (freq == 0) return 0; do_div(delta, freq); #endif return base + delta; }