diff -urN linux-2.5.9/arch/i386/Config.help linux/arch/i386/Config.help
--- linux-2.5.9/arch/i386/Config.help	Mon Apr 22 18:27:58 2002
+++ linux/arch/i386/Config.help	Mon Apr 22 20:12:03 2002
@@ -35,6 +35,16 @@
   Say Y here if you are building a kernel for a desktop, embedded
   or real-time system.  Say N if you are unsure.
 
+Collect preemption latency times
+CONFIG_PREEMPT_TIMES
+  This option will instruct the preemption code to measure in-kernel
+  latencies due to various locking primitives.  The results for the 20
+  worst latency times are accessible via /proc/latencytimes.  They are
+  reset on read.
+
+  Say Y here if you are testing kernel preemption and trying to determine
+  where high-latency problems lie.  Say N if you are unsure.
+
 CONFIG_X86
   This is Linux's home port.  Linux was originally native to the Intel
   386, and runs on all the later x86 processors including the Intel
diff -urN linux-2.5.9/arch/i386/config.in linux/arch/i386/config.in
--- linux-2.5.9/arch/i386/config.in	Mon Apr 22 18:28:39 2002
+++ linux/arch/i386/config.in	Mon Apr 22 20:12:03 2002
@@ -185,6 +185,7 @@
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
 bool 'Symmetric multi-processing support' CONFIG_SMP
 bool 'Preemptible Kernel' CONFIG_PREEMPT
+dep_bool 'Collect preemption latency times' CONFIG_PREEMPT_TIMES $CONFIG_PREEMPT
 if [ "$CONFIG_SMP" != "y" ]; then
    bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC
    dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC
diff -urN linux-2.5.9/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux-2.5.9/arch/i386/kernel/irq.c	Mon Apr 22 18:27:53 2002
+++ linux/arch/i386/kernel/irq.c	Mon Apr 22 20:12:03 2002
@@ -557,6 +557,11 @@
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
+#ifdef CONFIG_PREEMPT_TIMES
+extern void latency_cause(int,int);
+#else
+#define latency_cause(a, b)
+#endif
 asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 {
 	/*
@@ -575,9 +580,12 @@
 	struct irqaction * action;
 	unsigned int status;
 
+	preempt_lock_start(-99);
+
 	kstat.irqs[cpu][irq]++;
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
+	latency_cause(-99,~irq);
 	/*
 	   REPLAY is when Linux resends an IRQ that was dropped earlier
 	   WAITING is used by probe to mark irqs that are being tested
@@ -634,6 +642,8 @@
 	desc->handler->end(irq);
 	spin_unlock(&desc->lock);
 
+	preempt_lock_stop();
+
 	if (softirq_pending(cpu))
 		do_softirq();
 	return 1;
diff -urN linux-2.5.9/include/asm-i386/preem_latency.h linux/include/asm-i386/preem_latency.h
--- linux-2.5.9/include/asm-i386/preem_latency.h	Wed Dec 31 19:00:00 1969
+++ linux/include/asm-i386/preem_latency.h	Mon Apr 22 20:12:03 2002
@@ -0,0 +1,25 @@
+/*
+ * include/asm-i386/preem_latency.h
+ * timing support for preempt-stats patch
+ */
+
+#ifndef _ASM_PREEM_LATENCY_H
+#define _ASM_PREEM_LATENCY_H
+
+#include <asm/msr.h>
+#include <asm/timex.h>		/* To be sure we have cpu_khz declaration */
+#include <asm/div64.h>		/* do_div */
+
+/*
+ * rdtsc is defined in asm/msr.h as
+ * __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
+ */
+
+#define readclock(low)		rdtscl(low)
+#define readclock_init()
+#define clock_to_usecs(clocks)	({ \
+	unsigned long long quot = (unsigned long long) (clocks) * 10; \
+	do_div(quot, (unsigned long)(cpu_khz / 100)); \
+	quot; })
+
+#endif /* _ASM_PREEM_LATENCY_H */
diff -urN linux-2.5.9/include/asm-i386/smplock.h linux/include/asm-i386/smplock.h
--- linux-2.5.9/include/asm-i386/smplock.h	Mon Apr 22 18:28:34 2002
+++ linux/include/asm-i386/smplock.h	Mon Apr 22 20:12:03 2002
@@ -9,6 +9,16 @@
 #include <asm/current.h>
 
 extern spinlock_t kernel_flag;
+#ifdef CONFIG_PREEMPT_TIMES
+#define lock_bkl() do { \
+	current_thread_info()->preempt_count++; \
+	preempt_lock_start(3); \
+	_raw_spin_lock(&kernel_flag); \
+} while (0)
+#else
+#define lock_bkl() spin_lock(&kernel_flag)
+#endif
+#define unlock_bkl() spin_unlock(&kernel_flag)
 
 #ifdef CONFIG_SMP
 #define kernel_locked()		spin_is_locked(&kernel_flag)
@@ -46,7 +56,18 @@
 		spin_lock(&kernel_flag); \
 } while (0)
 
-
+#ifdef CONFIG_PREEMPT_TIMES
+#define lock_kernel() do { \
+	if (current->lock_depth == -1) \
+		lock_bkl(); \
+	++current->lock_depth; \
+} while (0)
+
+#define unlock_kernel() do { \
+	if (--current->lock_depth < 0) \
+		unlock_bkl(); \
+} while (0)
+#else
 /*
  * Getting the big kernel lock.
  *
@@ -93,3 +114,4 @@
 		 "=m" (current->lock_depth));
 #endif
 }
+#endif /* CONFIG_PREEMPT_TIMES */
diff -urN linux-2.5.9/include/linux/spinlock.h linux/include/linux/spinlock.h
--- linux-2.5.9/include/linux/spinlock.h	Mon Apr 22 18:29:12 2002
+++ linux/include/linux/spinlock.h	Mon Apr 22 20:12:03 2002
@@ -159,22 +159,55 @@
 
 asmlinkage void preempt_schedule(void);
 
+#ifdef CONFIG_PREEMPT_TIMES
+extern void latency_start(const char *fname,unsigned lineno,int cause);
+extern void latency_check(const char *fname,unsigned lineno, int cause);
+extern void latency_end(const char *fname,unsigned lineno);
+#define preempt_lock_start(c) \
+do { \
+	if ((preempt_get_count() & ~PREEMPT_ACTIVE) == 1) \
+		latency_start(__BASE_FILE__, __LINE__, c); \
+} while (0)
+#define preempt_lock_force_start() \
+do { \
+	latency_end(__FILE__, __LINE__); \
+	latency_start(__FILE__, __LINE__, -99); \
+} while (0)
+#define preempt_lock_stop() \
+do { \
+	if ((preempt_get_count() & ~PREEMPT_ACTIVE) == 1) \
+		latency_end(__BASE_FILE__, __LINE__); \
+} while (0)
+#define preempt_lock_force_stop() \
+do { \
+	latency_end(__FILE__, __LINE__); \
+} while (0)
+#else
+#define preempt_lock_start(c)		do {} while (0)
+#define preempt_lock_force_start()	do {} while (0)
+#define preempt_lock_stop()		do {} while (0)
+#define preempt_lock_force_stop()	do {} while (0)
+#endif /* CONFIG_PREEMPT_TIMES */
+
 #define preempt_get_count() (current_thread_info()->preempt_count)
 
 #define preempt_disable() \
 do { \
 	++current_thread_info()->preempt_count; \
+	preempt_lock_start(1); \
 	barrier(); \
 } while (0)
 
 #define preempt_enable_no_resched() \
 do { \
+	preempt_lock_stop(); \
 	--current_thread_info()->preempt_count; \
 	barrier(); \
 } while (0)
 
 #define preempt_enable() \
 do { \
+	preempt_lock_stop(); \
 	--current_thread_info()->preempt_count; \
 	barrier(); \
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
@@ -218,6 +251,9 @@
 #define write_lock(lock)	_raw_write_lock(lock)
 #define write_unlock(lock)	_raw_write_unlock(lock)
 #define write_trylock(lock)	_raw_write_trylock(lock)
+
+#define preempt_lock_start(c)	do {} while (0)
+#define preempt_lock_stop()	do {} while (0)
 #endif
 
 /* "lock on reference count zero" */
diff -urN linux-2.5.9/kernel/Makefile linux/kernel/Makefile
--- linux-2.5.9/kernel/Makefile	Mon Apr 22 18:28:14 2002
+++ linux/kernel/Makefile	Mon Apr 22 20:12:03 2002
@@ -21,6 +21,7 @@
 obj-$(CONFIG_MODULES) += ksyms.o
 obj-$(CONFIG_PM) += pm.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_PREEMPT_TIMES) += preem_latency.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -urN linux-2.5.9/kernel/ksyms.c linux/kernel/ksyms.c
--- linux-2.5.9/kernel/ksyms.c	Mon Apr 22 18:27:44 2002
+++ linux/kernel/ksyms.c	Mon Apr 22 20:12:03 2002
@@ -468,6 +468,10 @@
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(preempt_schedule);
 #endif
+#ifdef CONFIG_PREEMPT_TIMES
+EXPORT_SYMBOL(latency_end);
+EXPORT_SYMBOL(latency_start);
+#endif
 EXPORT_SYMBOL(schedule_timeout);
 EXPORT_SYMBOL(sys_sched_yield);
 EXPORT_SYMBOL(set_user_nice);
diff -urN linux-2.5.9/kernel/preem_latency.c linux/kernel/preem_latency.c
--- linux-2.5.9/kernel/preem_latency.c	Wed Dec 31 19:00:00 1969
+++ linux/kernel/preem_latency.c	Mon Apr 22 20:12:03 2002
@@ -0,0 +1,431 @@
+/*
+ * Preemption Latency Measurement Tool
+ * Copyright (C) 2001 MontaVista Software Inc.
+ *
+ * Measures the duration of locks held in the kernel.
+ * Useful for pinpointing long-held locks, during which
+ * the kernel cannot be preempted.
+ *
+ *  * 20011220 Robert M. Love
+ *	- add SH arch support
+ *	- tidy things to better support new arches
+ *	- add include/asm/preem_latency.h
+ *	- now synced with 2.4.17
+ *
+ *  * 20011115 Robert M. Love
+ *	- resync with 2.4.15-pre4 and the latest preempt
+ *	  kernel patch
+ *
+ *  * 20010923 Robert M. Love
+ *	- remove causes "lowlat" and "check"
+ *	- update for revised preemption patch
+ *	- remove defines for preempt_lock_check and window
+ *
+ *  * 20010919 Robert M. Love
+ *	whitespace cleanup, remove unneeded code, remove
+ *	ifdefs around latency_cause, etc
+ *
+ *  * 20010918 Robert M. Love
+ *	update for 2.4.10-pre11 and 2.4.9-ac13
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+/*
+ * we need a method of timing. On i386 this is easy
+ * with the TSC. On other arches we can use their
+ * existing timing systems or hack something together.
+ *
+ * The defines are:
+ *	readclock(x)		put timing value in x (unsigned int)
+ *	readclock_init()	initialize timing
+ *
+ * readclock must be a monotonic incremental counter.
+ */
+#include <asm/preem_latency.h>
+
+#define NUM_LOG_ENTRY 20
+#define NUM_COUNT 2
+#define COMMAND_LENGTH 16
+#define NUM_INTERRUPT 16
+
+const char * syscallStartFileName = "system_call";
+const char * entryCheckFileName = "ret_with_reschedule";
+const char * errorCodeStart = "error_code start";
+const char * deviceNotAvailableStart = "device_not_availablestart";
+
+extern unsigned long cpu_khz;	/* number of rdtsc ticks per second/1000 */
+
+extern void latency_check(const char *, unsigned, int);
+extern void latency_end(const char *, unsigned);
+
+unsigned theSyscallNumber;
+
+#define SfS 0	/* waiting for a preempt off start1 call */
+#define SfC 1	/* waiting for a preempt off end call */
+#define SfM 2	/* waiting for either of the above */
+
+struct pointId {
+	const char * FileName;
+	unsigned Address;
+	unsigned FileLine;
+	unsigned ProcId;
+	char ProcName[COMMAND_LENGTH];
+};
+
+/* worst recorded latencies */
+struct log_entry {
+	unsigned latency;
+	int cause;
+	int mask;
+	unsigned intrCount[NUM_INTERRUPT+1];
+	struct pointId start;
+	struct pointId end;
+};
+
+static unsigned logFlag = 0;	/* 0 - no logging; 1 - logging */
+static unsigned panicFlag = 0;	/* set to 1 if something is really wrong */
+
+struct LatencyData {
+	int breakCount;		/* count interrupt and iret */
+	const char * testName;	/* the test name */
+	unsigned syncFlag;	/* for synchro between start and end */
+	unsigned numIntrs;	/* total number of interrupts in range */
+	unsigned numCount;	/* layout */
+	unsigned numEntry;
+	int least;		/* lowest latency we care about */
+	unsigned count[NUM_COUNT];
+	struct log_entry entry[NUM_LOG_ENTRY];	/* worst recorded latencies */
+	int theCause;
+	int mask;
+	struct pointId latencyStart;
+	unsigned latencyStartCount;
+	unsigned startFlag;	/* to debug between start() and start1() */
+};
+
+struct LatencyData latencyData[NR_CPUS] = { {0} };
+
+static char *cause_name[] = {
+	"Unknown",
+	"spin_lock",
+	"reacqBKL",
+	"BKL"
+};
+
+#define CAUSE_MAX (sizeof(cause_name)/sizeof(cause_name[0]) - 1)
+
+unsigned numIntr = NUM_INTERRUPT;
+unsigned intrCount[NUM_INTERRUPT+1];
+
+#define CLEAR(x) do { \
+		int zz; \
+		for (zz = 0; zz <= NUM_INTERRUPT; zz++) \
+			x[zz] = 0; \
+	} while (0)
+
+#define COPY(x, y) do { \
+		int zz; \
+		for (zz = 0; zz <= NUM_INTERRUPT; zz++) \
+			x[zz] = y[zz]; \
+	} while (0)
+
+/* strategy :
+ *
+ * Four points are identified
+ *	S : the starting point right after system call, user -> kernel
+ *	S1: alternative starting points
+ *	    . faults (including device_not_available, which does go through
+ *	      the error_code: label)
+ *	    . interrupts (all go through do_IRQ)
+ *	C : need_resched is checked and schedule() is invoked if the flag is set
+ *	B : before RESTORE_ALL and if control is returning to user land
+ *
+ * For S points :
+ *	it MUST follow a break point
+ *	it initializes timing values for the starting point
+ *
+ * For S1 points :
+ *	if it follows a breakpoint, treat it as an S point
+ *	otherwise, ignore it.
+ *
+ * For C points :
+ *	it must follow a C point or an S point
+ *	calculate the interval and reset the starting point
+ *
+ * For B points :
+ *	it must follow a C point or an S point
+ *	calculate the interval and invalidate the starting point
+ */
+
+static void reset_latencyData(void)
+{
+	int i;
+	int cpu;
+
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		latencyData[cpu].numCount = NUM_COUNT;
+		latencyData[cpu].numEntry = NUM_LOG_ENTRY;
+		latencyData[cpu].startFlag = 0;
+		latencyData[cpu].breakCount = 0;
+		latencyData[cpu].syncFlag = SfS;
+		latencyData[cpu].numIntrs = 0;
+		latencyData[cpu].least = 0;
+		for (i = 0; i < latencyData[cpu].numCount;
+				latencyData[cpu].count[i++] = 0);
+	}
+	return;
+}
+
+asmlinkage void latency_start(const char *fname, unsigned lineno, int cause)
+{
+	struct LatencyData *l = &latencyData[smp_processor_id()];
+
+	/* if we are not logging or we have an error, do nothing */
+	if ((logFlag == 0) || (panicFlag != 0))
+		return;
+
+	if (l->syncFlag != SfC) {
+		readclock(l->latencyStartCount);
+		l->startFlag = 1;
+		l->mask = 0;
+		l->theCause = cause;
+		l->syncFlag = SfC;
+		l->latencyStart.Address = (int)__builtin_return_address(0);
+		l->latencyStart.FileName = fname;
+		l->latencyStart.FileLine = lineno;
+		l->latencyStart.ProcId = current->pid;
+		memcpy(l->latencyStart.ProcName, current->comm, 16);
+		CLEAR(intrCount);
+	}
+}
+
+void latency_cause(int was, int tobe)
+{
+	struct LatencyData *l = &latencyData[smp_processor_id()];
+
+	if (was == -100) {
+		l->mask |= tobe;
+		return;
+	}
+
+	if (l->theCause == was)
+		l->theCause = tobe;
+}
+
+void latency_logentry(unsigned diff, const char *fname,
+	unsigned lineno, unsigned Address, int mask)
+{
+	struct LatencyData *l = &latencyData[smp_processor_id()];
+	unsigned lowest = 0xffffffff;
+	unsigned lowestIndex = 0;
+	unsigned sameIndex = 0xffffffff;
+	int i;
+
+	if (diff < l->least)
+		return;
+
+	/* check if we need to log this event */
+	for (i = 0; i < NUM_LOG_ENTRY; i++) {
+		if (lowest > l->entry[i].latency) {
+			lowest = l->entry[i].latency;
+			lowestIndex = i;
+		}
+		/* If the call addresses match, it is the same path */
+		if ((Address == l->entry[i].end.Address) &&
+		    (l->latencyStart.Address == l->entry[i].start.Address)) {
+			sameIndex = i;
+			break;
+		}
+	}
+
+	if (sameIndex == 0xffffffff) {
+		i = lowestIndex;
+		/*
+		 * we would like "least" to be the smallest latency in the table
+		 * but we are pressed for time so we settle for it being what
+		 * may have been the lowest, as this new entry may replace
+		 * the found lowest with a higher value, in which case we don't
+		 * have the true lowest.  Oh well, we get closer as time
+		 * passes.  The real reason for this is to speed things up,
+		 * so we don't worry too much if this isn't exact.
+		 */
+		l->least = l->entry[i].latency;
+	} else {
+		i = sameIndex;
+	}
+
+	if (diff > l->entry[i].latency) {
+		l->entry[i].latency = diff;
+		l->entry[i].cause = l->theCause;
+		l->entry[i].mask = l->mask;
+
+		l->entry[i].end.FileName = fname;
+		l->entry[i].end.FileLine = lineno;
+		l->entry[i].end.Address = Address;
+		l->entry[i].end.ProcId = current->pid;
+		memcpy(l->entry[i].end.ProcName,
+			current->comm, COMMAND_LENGTH);
+
+		l->entry[i].start.FileName = l->latencyStart.FileName;
+		l->entry[i].start.FileLine = l->latencyStart.FileLine;
+		l->entry[i].start.Address = l->latencyStart.Address;
+		l->entry[i].start.ProcId = l->latencyStart.ProcId;
+		memcpy(l->entry[i].start.ProcName,
+			l->latencyStart.ProcName, COMMAND_LENGTH);
+	}
+
+	l->numIntrs++;
+}
+
+/* Called at end of preemption time */
+asmlinkage void latency_end(const char *fname, unsigned lineno)
+{
+	struct LatencyData *l = &latencyData[smp_processor_id()];
+	unsigned endCount;
+
+	/* if we are not logging or we have an error, do nothing */
+	if ((logFlag == 0) || (panicFlag != 0)) {
+		return;
+	}
+
+	/* read entry again */
+	readclock(endCount);
+
+	if (l->syncFlag == SfS) {
+		l->count[SfS]++;
+		return;
+	}
+
+	latency_logentry(endCount - l->latencyStartCount, fname, lineno,
+		(int)__builtin_return_address(0), l->mask);
+	l->syncFlag = SfS;
+	return;
+}
+
+/* latency_check is for the end of one period and the start of another */
+asmlinkage void latency_check(const char *fname, unsigned lineno, int cause)
+{
+	struct LatencyData *l = &latencyData[smp_processor_id()];
+	unsigned endCount;
+
+	/* if we are not logging or we have an error, do nothing */
+	if ((logFlag == 0) || (panicFlag != 0)) {
+		return;
+	}
+
+	/* read entry again */
+	readclock(endCount);
+
+	if (l->syncFlag != SfS) {
+		latency_logentry(endCount - l->latencyStartCount, fname,
+			lineno, (int)__builtin_return_address(0), l->mask);
+	}
+
+	/* re-set the starting point */
+	l->syncFlag = SfM;
+	l->mask = 0;
+	l->theCause = cause;
+	l->latencyStart.Address = (int)__builtin_return_address(0);
+	l->latencyStart.FileName = fname;
+	l->latencyStart.FileLine = lineno;
+	l->latencyStart.ProcId = current->pid;
+	memcpy(l->latencyStart.ProcName, current->comm, 16);
+	CLEAR(intrCount);
+	readclock(l->latencyStartCount);
+}
+
+/* Note the following code to read the /proc file is not SMP-safe. */
+
+static int g_read_completed = 0;
+static int g_cpu = 0;
+
+static int latencytimes_read_proc(char *page_buffer, char **my_first_byte,
+	off_t virtual_start, int length, int *eof, void *data)
+{
+	int my_buffer_offset = 0;
+	char * const my_base = page_buffer;
+	int i, j, max;
+	struct LatencyData *l;
+
+	if (virtual_start == 0) {
+		/* Just been opened */
+		logFlag = 0;	/* stop logging */
+		g_read_completed = 0;
+		g_cpu = 0;
+	} else if (g_read_completed == NUM_LOG_ENTRY) {
+		if (++g_cpu >= smp_num_cpus) {
+			*eof = 1;
+			reset_latencyData();
+			logFlag = 1;	/* start logging */
+			return 0;
+		}
+		g_read_completed = 0;
+	}
+
+	l = &latencyData[g_cpu];
+
+	if (g_read_completed == 0) {
+		my_buffer_offset += sprintf(my_base + my_buffer_offset,
+			"cpu %d worst %d latency times of %d measured in this period.\n"
+			"  usec      cause     mask   start line/file      address   end line/file\n",
+			g_cpu, NUM_LOG_ENTRY, l->numIntrs);
+	}
+
+	g_read_completed++;
+
+	for (j = max = i = 0; j < NUM_LOG_ENTRY; j++) {
+		if (l->entry[j].latency > max) {
+			max = l->entry[j].latency;
+			i = j;
+		}
+	}
+
+	my_buffer_offset += sprintf(my_base + my_buffer_offset,
+		"%6d %10s %8x %5d/%-15s %8x %5d/%s\n",
+		(int)clock_to_usecs(l->entry[i].latency),
+		(l->entry[i].cause == -99) ? "unknown" :
+		(l->entry[i].cause < 0) ?
+			irq_desc[~l->entry[i].cause].action->name :
+		(l->entry[i].cause > CAUSE_MAX) ? "Unknown" :
+			cause_name[l->entry[i].cause],
+		l->entry[i].mask,
+		l->entry[i].start.FileLine,
+		l->entry[i].start.FileName == (char *)0 ?
+			"entry.S" : l->entry[i].start.FileName,
+		l->entry[i].start.Address,
+		/* l->entry[i].start.ProcId,
+		   l->entry[i].start.ProcName, */
+		l->entry[i].end.FileLine,
+		l->entry[i].end.FileName == (char *)0 ?
+			"entry.S" : l->entry[i].end.FileName /* ,
+		l->entry[i].end.ProcId,
+		l->entry[i].end.ProcName */);
+
+	/* Clearing these two effectively clears the entry */
+	l->entry[i].latency = 0;
+	l->entry[i].end.Address = 0;
+
+	*my_first_byte = page_buffer;
+	return my_buffer_offset;
+}
+
+int __init latencytimes_init(void)
+{
+	readclock_init();
+#ifdef CONFIG_PROC_FS
+	create_proc_read_entry("latencytimes", 0, 0, latencytimes_read_proc, 0);
+#endif
+
+	reset_latencyData();
+	return 0;
+}
+
+__initcall(latencytimes_init);
diff -urN linux-2.5.9/kernel/sched.c linux/kernel/sched.c
--- linux-2.5.9/kernel/sched.c	Mon Apr 22 18:28:20 2002
+++ linux/kernel/sched.c	Mon Apr 22 20:12:03 2002
@@ -755,6 +755,11 @@
 #if CONFIG_DEBUG_HIGHMEM
 	check_highmem_ptes();
 #endif
+
+#ifdef CONFIG_PREEMPT_TIMES
+	if (preempt_get_count())
+		preempt_lock_force_stop();
+#endif
 need_resched:
 	preempt_disable();
 	prev = current;
@@ -837,6 +842,16 @@
 	preempt_enable_no_resched();
 	if (test_thread_flag(TIF_NEED_RESCHED))
 		goto need_resched;
+
+#ifdef CONFIG_PREEMPT_TIMES
+	if (preempt_get_count()) {
+		if (current->pid)
+			preempt_lock_force_start();
+		else
+			preempt_lock_force_stop();
+	}
+#endif
+
 	return;
 }
 
@@ -852,7 +867,9 @@
 		return;
 
 	ti->preempt_count = PREEMPT_ACTIVE;
+	preempt_lock_start(2);
 	schedule();
+	preempt_lock_stop();
 	ti->preempt_count = 0;
 	barrier();
 }
diff -urN linux-2.5.9/kernel/softirq.c linux/kernel/softirq.c
--- linux-2.5.9/kernel/softirq.c	Mon Apr 22 18:28:19 2002
+++ linux/kernel/softirq.c	Mon Apr 22 20:12:03 2002
@@ -44,6 +44,12 @@
 
 static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
 
+#ifdef CONFIG_PREEMPT_TIMES
+extern void latency_cause(int,int);
+#else
+#define latency_cause(a, b)
+#endif
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -67,6 +73,7 @@
 
 	if (in_interrupt())
 		return;
+	preempt_lock_start(-99);
 
 	local_irq_save(flags);
 
@@ -84,6 +91,9 @@
 
 		local_irq_enable();
 
 		h = softirq_vec;
+
+		latency_cause(-100, pending);
+
 		do {
 			if (pending & 1)
@@ -106,6 +116,7 @@
 	}
 
 	local_irq_restore(flags);
+	preempt_lock_stop();
 }
 
 /*