diff -urN linux-2.4.17-rc1-virgin/CREDITS linux-2.4.17-rc1-wli3/CREDITS --- linux-2.4.17-rc1-virgin/CREDITS Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/CREDITS Fri Dec 14 02:44:44 2001 @@ -971,8 +971,8 @@ N: Nigel Gamble E: nigel@nrg.org -E: nigel@sgi.com D: Interrupt-driven printer driver +D: Preemptible kernel S: 120 Alley Way S: Mountain View, California 94040 S: USA diff -urN linux-2.4.17-rc1-virgin/Changelog-wli linux-2.4.17-rc1-wli3/Changelog-wli --- linux-2.4.17-rc1-virgin/Changelog-wli Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/Changelog-wli Mon Dec 17 00:03:54 2001 @@ -0,0 +1,36 @@ +Changelog for 2.4.17-rc1-wli3 +---------------------------------------------------------------------- +(1) in FNV change shift/add to multiply (William Irwin) +(2) inode hash function like Lever pagecache (William Irwin) +(3) attribution on comment in pagecache hash function (William Irwin) +(4) lock breaking patch, minus vmscan.c (Robert Love) +(5) back out conditional_schedule in wait_for_buffers (William Irwin) +(6) reverted to Lever dcache but shifting D_HASHBITS (William Irwin) +(7) shifting for high-order bits in UID hash (William Irwin) +(8) shifting for high-order bits in PID hash (William Irwin) +(9) removed comment about inode.c quadratic hashing (William Irwin) + +Changelog for 2.4.17-rc1-wli2 +---------------------------------------------------------------------- +(1) switch dcache to Mersenne hash (William Irwin) +(2) convert partial_name_hash() to FNV (William Irwin) +(3) back off HZ from 600 to 256 (William Irwin) + +Changelog for 2.4.17-rc1-wli1 +---------------------------------------------------------------------- +(1) reverse-mapping VM (Rik van Riel) +(2) preemptive kernel (Robert Love) +(3) realtime scheduler that scans less (George Anziger) +(4) page cache hash function (Chuck Lever) +(5) pidhash hash function (William Irwin) +(6) dentry cache hash function (Chuck Lever) +(7) check for priority == 0 in shrink_dcache_memory() (William Irwin) +(8) buffer cache hash function (Chuck Lever) +(9) uid hash function (William Irwin) +(10) inode hash function restored to Lever paper form (Chuck Lever) +(11) removal of statm_pgd_range() (William Irwin) +(12) elevator read starvation prevention (Andrew Morton) + +revert before distribution: +(1) bootmem rewrite +(2) timeslice change (HZ in asm-i386/param.h) diff -urN linux-2.4.17-rc1-virgin/Documentation/Configure.help linux-2.4.17-rc1-wli3/Documentation/Configure.help --- linux-2.4.17-rc1-virgin/Documentation/Configure.help Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/Documentation/Configure.help Sun Dec 16 17:58:10 2001 @@ -266,6 +266,31 @@ If you have a system with several CPUs, you do not need to say Y here: the local APIC will be used automatically. +Preemptible Kernel +CONFIG_PREEMPT + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load due to other, lower priority, processes. + + Say Y here if you are building a kernel for a desktop system, embedded + system or real-time system. Say N if you are building a kernel for a + system where throughput is more important than interactive response, + such as a server system. Say N if you are unsure. + +Break Selected Locks +CONFIG_LOCK_BREAK + This option will break certain locks in high-latency regions + throughout the kernel. 
It is intended for use in conjunction with + the preemptible kernel (CONFIG_PREEMPT). Since in-kernel preemption + cannot occur while locks are held, temporarily releasing and then + reacquiring long-held locks will further improve system response. + + Say Y if you are compiling for a system with strict latency + requirements such as an embedded, real-time, or audio processing + system. Say N otherwise. + Kernel math emulation CONFIG_MATH_EMULATION Linux can emulate a math coprocessor (used for floating point @@ -289,6 +314,28 @@ If you are not sure, say Y; apart from resulting in a 66 KB bigger kernel, it won't hurt. + +Real Time Scheduler +CONFIG_RTSCHED + + This option replaces the standard linux scheduler with a real time + scheduler. The real time scheduler provides load independent fast + context switch times for real time tasks whereas the standard linux + scheduler slows down with increasing load (i.e. more tasks ready to + run). For non-real time tasks both schedulers' context switch times are + load dependent. The real time scheduler also provides a configure + option for real time priorities ranging from 1 to a max of 2047 while + the standard scheduler's real time priorities range from 1-99. + Real time tasks are tasks that have a scheduling policy of SCHED_FIFO + or SCHED_RR. Scheduling policy is set by the sched_setscheduler(2) + system call and is inherited through fork and thread creation. + +Maximum Priority? +CONFIG_MAX_PRI + This option lets you set the number of priorities available to real time + tasks. Priorities 1 through maximum priority are real time tasks. The + default here is 127. The system will quietly change anything less than + 99 to 99 and anything greater than 2047 to 2047. Timer and CPU usage LEDs CONFIG_LEDS diff -urN linux-2.4.17-rc1-virgin/Documentation/preempt-locking.txt linux-2.4.17-rc1-wli3/Documentation/preempt-locking.txt --- linux-2.4.17-rc1-virgin/Documentation/preempt-locking.txt Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/Documentation/preempt-locking.txt Fri Dec 14 02:44:44 2001 @@ -0,0 +1,94 @@ + Proper Locking Under a Preemptible Kernel: + Keeping Kernel Code Preempt-Safe + Robert Love + Last Updated: 21 Oct 2001 + + +INTRODUCTION + + +A preemptible kernel creates new locking issues. The issues are the same as +those under SMP: concurrency and reentrancy. Thankfully, the Linux preemptible +kernel model leverages existing SMP locking mechanisms. Thus, the kernel +requires explicit additional locking for very few additional situations. + +This document is for all kernel hackers. Developing code in the kernel +requires protecting these situations. As you will see, these situations would +normally require a lock, were they not per-CPU. + + +RULE #1: Per-CPU data structures need explicit protection + + +Two similar problems arise. An example code snippet: + + struct this_needs_locking tux[NR_CPUS]; + tux[smp_processor_id()] = some_value; + /* task is preempted here... */ + something = tux[smp_processor_id()]; + +First, since the data is per-CPU, it may not have explicit SMP locking, but +require it otherwise. Second, when a preempted task is finally rescheduled, +the previous value of smp_processor_id may not equal the current. You must +protect these situations by disabling preemption around them. + + +RULE #2: CPU state must be protected. + + +Under preemption, the state of the CPU must be protected. This is arch- +dependent, but includes CPU structures and state not preserved over a context +switch.
For example, on x86, entering and exiting FPU mode is now a critical +section that must occur while preemption is disabled. Think what would happen +if the kernel is executing a floating-point instruction and is then preempted. +Remember, the kernel does not save FPU state except for user tasks. Therefore, +upon preemption, the FPU registers will be sold to the lowest bidder. Thus, +preemption must be disabled around such regions. + +Note, some FPU functions are already explicitly preempt safe. For example, +kernel_fpu_begin and kernel_fpu_end will disable and enable preemption. +However, math_state_restore must be called with preemption disabled. + + +SOLUTION + + +Data protection under preemption is achieved by disabling preemption for the +duration of the critical region. + +preempt_enable() decrement the preempt counter +preempt_disable() increment the preempt counter +preempt_enable_no_resched() decrement, but do not immediately preempt + +The functions are nestable. In other words, you can call preempt_disable +n-times in a code path, and preemption will not be reenabled until the n-th +call to preempt_enable. The preempt statements define to nothing if +preemption is not enabled. + +Note that you do not need to explicitly prevent preemption if you are holding +any locks or interrupts are disabled, since preemption is implicitly disabled +in those cases. + +Example: + + cpucache_t *cc; /* this is per-CPU */ + preempt_disable(); + cc = cc_data(searchp); + if (cc && cc->avail) { + __free_block(searchp, cc_entry(cc), cc->avail); + cc->avail = 0; + } + preempt_enable(); + return 0; + +Notice how the preemption statements must encompass every reference of the +critical variables. Another example: + + int buf[NR_CPUS]; + set_cpu_val(buf); + if (buf[smp_processor_id()] == -1) printk(KERN_INFO "wee!\n"); + spin_lock(&buf_lock); + /* ... */ + +This code is not preempt-safe, but see how easily we can fix it by simply +moving the spin_lock up two lines. diff -urN linux-2.4.17-rc1-virgin/Documentation/rtsched.txt linux-2.4.17-rc1-wli3/Documentation/rtsched.txt --- linux-2.4.17-rc1-virgin/Documentation/rtsched.txt Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/Documentation/rtsched.txt Fri Dec 14 04:38:23 2001 @@ -0,0 +1,28 @@ + + Real Time Scheduler for Linux + ============================= + +The Real Time scheduler patch gives you an option to choose to build a +kernel with MontaVista's real time scheduler in it. If you don't choose +to enable the real time scheduler the kernel will be built the same as +if you had not installed the patch. + +If you enable the real time scheduler, you may also choose a max +priority for real time tasks. The available range is 99 to 2047. +Values outside this range are quietly moved to fall in the range. + +In order to enable the real time scheduler you must use one of the +kernel configure tools to turn it on. The question appears in the +processor options section of the configuration. + +Currently the scheduler is supported on all UP and SMP machines. + +Warning: The Real Time scheduler does not honor the "allowed_cpus" +member of the task_struct, thus it will not honor any attempt to define +cpu affinity. The latest preemption patch uses cpu affinity to prevent +cpu switching during preemption. This will not work with this scheduler +and may cause failures in kernels using preemption. In addition TUX +is known to use cpu affinity. It is believed that TUX will run without +cpu affinity, but may have degraded performance.
It is also known that +some soft irq tasks may use cpu affinity to improve performance. These +tasks will still work, however, the affinity will not happen. diff -urN linux-2.4.17-rc1-virgin/MAINTAINERS linux-2.4.17-rc1-wli3/MAINTAINERS --- linux-2.4.17-rc1-virgin/MAINTAINERS Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/MAINTAINERS Fri Dec 14 02:44:44 2001 @@ -1242,6 +1242,14 @@ M: mostrows@styx.uwaterloo.ca S: Maintained +PREEMPTIBLE KERNEL +P: Robert M. Love +M: rml@tech9.net +L: linux-kernel@vger.kernel.org +L: kpreempt-tech@lists.sourceforge.net +W: http://tech9.net/rml/linux +S: Maintained + PROMISE DC4030 CACHING DISK CONTROLLER DRIVER P: Peter Denison M: promise@pnd-pc.demon.co.uk diff -urN linux-2.4.17-rc1-virgin/Makefile linux-2.4.17-rc1-wli3/Makefile --- linux-2.4.17-rc1-virgin/Makefile Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/Makefile Sun Dec 16 22:41:12 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 17 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc1-wli3 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN linux-2.4.17-rc1-virgin/arch/alpha/config.in linux-2.4.17-rc1-wli3/arch/alpha/config.in --- linux-2.4.17-rc1-virgin/arch/alpha/config.in Tue Nov 20 15:49:31 2001 +++ linux-2.4.17-rc1-wli3/arch/alpha/config.in Fri Dec 14 04:38:23 2001 @@ -216,6 +216,10 @@ then bool 'Symmetric multi-processing support' CONFIG_SMP fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi if [ "$CONFIG_SMP" = "y" ]; then define_bool CONFIG_HAVE_DEC_LOCK y diff -urN linux-2.4.17-rc1-virgin/arch/arm/config.in linux-2.4.17-rc1-wli3/arch/arm/config.in --- linux-2.4.17-rc1-virgin/arch/arm/config.in Fri Nov 9 13:58:02 2001 +++ linux-2.4.17-rc1-wli3/arch/arm/config.in Fri Dec 14 04:38:23 2001 @@ -329,6 +329,10 @@ else define_bool CONFIG_DISCONTIGMEM n fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu @@ -437,6 +441,7 @@ if [ "$CONFIG_CPU_32" = "y" -a "$CONFIG_ARCH_EBSA110" != "y" ]; then bool 'Kernel-mode alignment trap handler' CONFIG_ALIGNMENT_TRAP fi +dep_bool 'Preemptible Kernel (experimental)' CONFIG_PREEMPT $CONFIG_CPU_32 $CONFIG_EXPERIMENTAL endmenu source drivers/parport/Config.in diff -urN linux-2.4.17-rc1-virgin/arch/arm/kernel/entry-armv.S linux-2.4.17-rc1-wli3/arch/arm/kernel/entry-armv.S --- linux-2.4.17-rc1-virgin/arch/arm/kernel/entry-armv.S Thu Oct 25 13:53:45 2001 +++ linux-2.4.17-rc1-wli3/arch/arm/kernel/entry-armv.S Fri Dec 14 02:44:44 2001 @@ -672,6 +672,12 @@ add r4, sp, #S_SP mov r6, lr stmia r4, {r5, r6, r7, r8, r9} @ save sp_SVC, lr_SVC, pc, cpsr, old_ro +#ifdef CONFIG_PREEMPT + get_current_task r9 + ldr r8, [r9, #TSK_PREEMPT] + add r8, r8, #1 + str r8, [r9, #TSK_PREEMPT] +#endif 1: get_irqnr_and_base r0, r6, r5, lr movne r1, sp @ @@ -679,6 +685,25 @@ @ adrsvc ne, lr, 1b bne do_IRQ +#ifdef CONFIG_PREEMPT +2: ldr r8, [r9, #TSK_PREEMPT] + subs r8, r8, #1 + bne 3f + ldr r7, [r9, #TSK_NEED_RESCHED] + teq r7, #0 + beq 3f + ldr r6, .LCirqstat + ldr r0, [r6, #IRQSTAT_BH_COUNT] + teq r0, #0 + bne 3f + mov r0, #MODE_SVC + msr cpsr_c, r0 @ enable interrupts + bl SYMBOL_NAME(preempt_schedule) + mov r0, #I_BIT | MODE_SVC + msr cpsr_c, r0 @ disable interrupts + b 2b +3: str r8, [r9, #TSK_PREEMPT] +#endif ldr r0, [sp, #S_PSR] @ irqs are already disabled msr spsr, r0 ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr @@ -736,6 +761,9 @@ .LCprocfns: .word SYMBOL_NAME(processor) #endif .LCfp: .word SYMBOL_NAME(fp_enter) +#ifdef CONFIG_PREEMPT +.LCirqstat: .word SYMBOL_NAME(irq_stat) +#endif irq_prio_table @@ -775,6 +803,12 @@ stmdb r8, {sp, lr}^ alignment_trap r4, r7, __temp_irq zero_fp + get_current_task tsk +#ifdef CONFIG_PREEMPT + ldr r0, [tsk, #TSK_PREEMPT] + add r0, r0, #1 + str r0, [tsk, #TSK_PREEMPT] +#endif 1: get_irqnr_and_base r0, r6, r5, lr movne r1, sp adrsvc ne, lr, 1b @@ -782,8 +816,12 @@ @ routine called with r0 = irq number, r1 = struct pt_regs * @ bne do_IRQ +#ifdef CONFIG_PREEMPT + ldr r0, [tsk, #TSK_PREEMPT] + sub r0, r0, #1 + str r0, [tsk, #TSK_PREEMPT] +#endif mov why, #0 - get_current_task tsk b ret_to_user .align 5 diff -urN linux-2.4.17-rc1-virgin/arch/arm/tools/getconstants.c linux-2.4.17-rc1-wli3/arch/arm/tools/getconstants.c --- linux-2.4.17-rc1-virgin/arch/arm/tools/getconstants.c Thu Oct 11 09:04:57 2001 +++ linux-2.4.17-rc1-wli3/arch/arm/tools/getconstants.c Fri Dec 14 02:44:44 2001 @@ -13,6 +13,7 @@ #include #include +#include /* * Make sure that the compiler and target are compatible. @@ -38,6 +39,11 @@ DEFN("TSS_SAVE", OFF_TSK(thread.save)); DEFN("TSS_FPESAVE", OFF_TSK(thread.fpstate.soft.save)); + +#ifdef CONFIG_PREEMPT +DEFN("TSK_PREEMPT", OFF_TSK(preempt_count)); +DEFN("IRQSTAT_BH_COUNT", (unsigned long)&(((irq_cpustat_t *)0)->__local_bh_count)); +#endif #ifdef CONFIG_CPU_32 DEFN("TSS_DOMAIN", OFF_TSK(thread.domain)); diff -urN linux-2.4.17-rc1-virgin/arch/cris/config.in linux-2.4.17-rc1-wli3/arch/cris/config.in --- linux-2.4.17-rc1-virgin/arch/cris/config.in Mon Oct 15 13:42:14 2001 +++ linux-2.4.17-rc1-wli3/arch/cris/config.in Fri Dec 14 04:38:23 2001 @@ -11,6 +11,10 @@ mainmenu_option next_comment comment 'Code maturity level options' bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/i386/config.in linux-2.4.17-rc1-wli3/arch/i386/config.in --- linux-2.4.17-rc1-virgin/arch/i386/config.in Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/config.in Sun Dec 16 17:58:10 2001 @@ -176,6 +176,10 @@ bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Preemptible Kernel' CONFIG_PREEMPT +if [ "$CONFIG_PREEMPT" = "y" ]; then + bool 'Break selected locks' CONFIG_LOCK_BREAK +fi if [ "$CONFIG_SMP" != "y" ]; then bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC @@ -188,10 +192,17 @@ else bool 'Multiquad NUMA system' CONFIG_MULTIQUAD fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi -if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then - define_bool CONFIG_HAVE_DEC_LOCK y +if [ "$CONFIG_SMP" = "y" -o "$CONFIG_PREEMPT" = "y" ]; then + if [ "$CONFIG_X86_CMPXCHG" = "y" ]; then + define_bool CONFIG_HAVE_DEC_LOCK y + fi fi + endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/i386/kernel/entry.S linux-2.4.17-rc1-wli3/arch/i386/kernel/entry.S --- linux-2.4.17-rc1-virgin/arch/i386/kernel/entry.S Fri Nov 2 17:18:49 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/kernel/entry.S Fri Dec 14 02:44:44 2001 @@ -71,7 +71,7 @@ * these are offsets into the task-struct. */ state = 0 -flags = 4 +preempt_count = 4 sigpending = 8 addr_limit = 12 exec_domain = 16 @@ -79,8 +79,28 @@ tsk_ptrace = 24 processor = 52 + /* These are offsets into the irq_stat structure + * There is one per cpu and it is aligned to 32 + * byte boundry (we put that here as a shift count) + */ +irq_array_shift = CONFIG_X86_L1_CACHE_SHIFT + +irq_stat_local_irq_count = 4 +irq_stat_local_bh_count = 8 + ENOSYS = 38 +#ifdef CONFIG_SMP +#define GET_CPU_INDX movl processor(%ebx),%eax; \ + shll $irq_array_shift,%eax +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx); \ + GET_CPU_INDX +#define CPU_INDX (,%eax) +#else +#define GET_CPU_INDX +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx) +#define CPU_INDX +#endif #define SAVE_ALL \ cld; \ @@ -247,12 +267,30 @@ ALIGN ENTRY(ret_from_intr) GET_CURRENT(%ebx) +#ifdef CONFIG_PREEMPT + cli + decl preempt_count(%ebx) +#endif ret_from_exception: movl EFLAGS(%esp),%eax # mix EFLAGS and CS movb CS(%esp),%al testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor? 
jne ret_from_sys_call +#ifdef CONFIG_PREEMPT + cmpl $0,preempt_count(%ebx) + jnz restore_all + cmpl $0,need_resched(%ebx) + jz restore_all + movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx + addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx + jnz restore_all + incl preempt_count(%ebx) + sti + call SYMBOL_NAME(preempt_schedule) + jmp ret_from_intr +#else jmp restore_all +#endif ALIGN reschedule: @@ -289,6 +327,9 @@ GET_CURRENT(%ebx) call *%edi addl $8,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(coprocessor_error) @@ -308,12 +349,18 @@ movl %cr0,%eax testl $0x4,%eax # EM (math emulation bit) jne device_not_available_emulate +#ifdef CONFIG_PREEMPT + cli +#endif call SYMBOL_NAME(math_state_restore) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP call SYMBOL_NAME(math_emulate) addl $4,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(debug) diff -urN linux-2.4.17-rc1-virgin/arch/i386/kernel/i387.c linux-2.4.17-rc1-wli3/arch/i386/kernel/i387.c --- linux-2.4.17-rc1-virgin/arch/i386/kernel/i387.c Fri Feb 23 10:09:08 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/kernel/i387.c Fri Dec 14 02:44:44 2001 @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -65,6 +66,8 @@ { struct task_struct *tsk = current; + preempt_disable(); + if (tsk->flags & PF_USEDFPU) { __save_init_fpu(tsk); return; diff -urN linux-2.4.17-rc1-virgin/arch/i386/kernel/traps.c linux-2.4.17-rc1-wli3/arch/i386/kernel/traps.c --- linux-2.4.17-rc1-virgin/arch/i386/kernel/traps.c Sun Sep 30 12:26:08 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/kernel/traps.c Fri Dec 14 02:44:44 2001 @@ -697,6 +697,11 @@ */ asmlinkage void math_state_restore(struct pt_regs regs) { + /* + * CONFIG_PREEMPT + * Must be called with preemption disabled + */ + __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ if (current->used_math) { diff -urN linux-2.4.17-rc1-virgin/arch/i386/lib/dec_and_lock.c linux-2.4.17-rc1-wli3/arch/i386/lib/dec_and_lock.c --- linux-2.4.17-rc1-virgin/arch/i386/lib/dec_and_lock.c Fri Jul 7 18:20:16 2000 +++ linux-2.4.17-rc1-wli3/arch/i386/lib/dec_and_lock.c Fri Dec 14 02:44:44 2001 @@ -8,6 +8,7 @@ */ #include +#include #include int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) diff -urN linux-2.4.17-rc1-virgin/arch/ia64/config.in linux-2.4.17-rc1-wli3/arch/ia64/config.in --- linux-2.4.17-rc1-virgin/arch/ia64/config.in Fri Nov 9 14:26:17 2001 +++ linux-2.4.17-rc1-wli3/arch/ia64/config.in Fri Dec 14 04:38:23 2001 @@ -94,6 +94,10 @@ define_bool CONFIG_KCORE_ELF y # On IA-64, we always want an ELF /proc/kcore. bool 'SMP support' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi tristate 'Support running of Linux/x86 binaries' CONFIG_IA32_SUPPORT bool 'Performance monitor support' CONFIG_PERFMON tristate '/proc/pal support' CONFIG_IA64_PALINFO diff -urN linux-2.4.17-rc1-virgin/arch/m68k/config.in linux-2.4.17-rc1-wli3/arch/m68k/config.in --- linux-2.4.17-rc1-virgin/arch/m68k/config.in Mon Jun 11 19:15:27 2001 +++ linux-2.4.17-rc1-wli3/arch/m68k/config.in Fri Dec 14 04:38:23 2001 @@ -84,6 +84,10 @@ bool 'Use write-through caching for 68060 supervisor accesses' CONFIG_060_WRITETHROUGH fi fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/mips/config.in linux-2.4.17-rc1-wli3/arch/mips/config.in --- linux-2.4.17-rc1-virgin/arch/mips/config.in Mon Oct 15 13:41:34 2001 +++ linux-2.4.17-rc1-wli3/arch/mips/config.in Fri Dec 14 04:38:23 2001 @@ -275,6 +275,10 @@ fi fi fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/mips64/config.in linux-2.4.17-rc1-wli3/arch/mips64/config.in --- linux-2.4.17-rc1-virgin/arch/mips64/config.in Sun Sep 9 10:43:02 2001 +++ linux-2.4.17-rc1-wli3/arch/mips64/config.in Fri Dec 14 04:38:23 2001 @@ -25,6 +25,10 @@ bool ' Multi-Processing support' CONFIG_SMP #bool ' IP27 XXL' CONFIG_SGI_SN0_XXL fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y diff -urN linux-2.4.17-rc1-virgin/arch/parisc/config.in linux-2.4.17-rc1-wli3/arch/parisc/config.in --- linux-2.4.17-rc1-virgin/arch/parisc/config.in Tue Apr 17 17:19:25 2001 +++ linux-2.4.17-rc1-wli3/arch/parisc/config.in Fri Dec 14 04:38:23 2001 @@ -45,6 +45,10 @@ # # if [ "$CONFIG_PCI_EPIC" = "y" ]; then... # +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu diff -urN linux-2.4.17-rc1-virgin/arch/ppc/config.in linux-2.4.17-rc1-wli3/arch/ppc/config.in --- linux-2.4.17-rc1-virgin/arch/ppc/config.in Fri Nov 16 10:10:08 2001 +++ linux-2.4.17-rc1-wli3/arch/ppc/config.in Fri Dec 14 04:38:23 2001 @@ -108,6 +108,10 @@ if [ "$CONFIG_SMP" = "y" ]; then bool ' Distribute interrupts on all CPUs by default' CONFIG_IRQ_ALL_CPUS fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi if [ "$CONFIG_6xx" = "y" -a "$CONFIG_8260" = "n" ];then bool 'AltiVec Support' CONFIG_ALTIVEC diff -urN linux-2.4.17-rc1-virgin/arch/s390/config.in linux-2.4.17-rc1-wli3/arch/s390/config.in --- linux-2.4.17-rc1-virgin/arch/s390/config.in Fri Nov 9 13:58:02 2001 +++ linux-2.4.17-rc1-wli3/arch/s390/config.in Fri Dec 14 04:38:23 2001 @@ -32,6 +32,10 @@ comment 'Processor type and features' bool 'Symmetric multi-processing support' CONFIG_SMP bool 'IEEE FPU emulation' CONFIG_MATHEMU +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/s390x/config.in linux-2.4.17-rc1-wli3/arch/s390x/config.in --- linux-2.4.17-rc1-virgin/arch/s390x/config.in Thu Oct 11 09:04:57 2001 +++ linux-2.4.17-rc1-wli3/arch/s390x/config.in Fri Dec 14 04:38:23 2001 @@ -26,6 +26,10 @@ if [ "$CONFIG_S390_SUPPORT" = "y" ]; then tristate 'Kernel support for 31 bit ELF binaries' CONFIG_BINFMT_ELF32 fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/sh/config.in linux-2.4.17-rc1-wli3/arch/sh/config.in --- linux-2.4.17-rc1-virgin/arch/sh/config.in Mon Oct 15 13:36:48 2001 +++ linux-2.4.17-rc1-wli3/arch/sh/config.in Fri Dec 14 04:38:23 2001 @@ -22,6 +22,10 @@ bool ' Set version information on all module symbols' CONFIG_MODVERSIONS bool ' Kernel module loader' CONFIG_KMOD fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment @@ -124,6 +128,8 @@ hex 'Physical memory start address' CONFIG_MEMORY_START 08000000 hex 'Physical memory size' CONFIG_MEMORY_SIZE 00400000 fi +# Preemptible kernel feature +bool 'Preemptible Kernel' CONFIG_PREEMPT endmenu if [ "$CONFIG_SH_HP690" = "y" ]; then diff -urN linux-2.4.17-rc1-virgin/arch/sh/kernel/entry.S linux-2.4.17-rc1-wli3/arch/sh/kernel/entry.S --- linux-2.4.17-rc1-virgin/arch/sh/kernel/entry.S Mon Oct 8 10:39:18 2001 +++ linux-2.4.17-rc1-wli3/arch/sh/kernel/entry.S Fri Dec 14 02:44:44 2001 @@ -60,10 +60,18 @@ /* * These are offsets into the task-struct. */ -flags = 4 +preempt_count = 4 sigpending = 8 need_resched = 20 tsk_ptrace = 24 +flags = 84 + +/* + * And these offsets are into irq_stat. + * (Find irq_cpustat_t in asm-sh/hardirq.h) + */ +local_irq_count = 8 +local_bh_count = 12 PT_TRACESYS = 0x00000002 PF_USEDFPU = 0x00100000 @@ -143,7 +151,7 @@ mov.l __INV_IMASK, r11; \ stc sr, r10; \ and r11, r10; \ - stc k_g_imask, r11; \ + stc k_g_imask, r11; \ or r11, r10; \ ldc r10, sr @@ -304,8 +312,8 @@ mov.l @(tsk_ptrace,r0), r0 ! Is current PTRACE_SYSCALL'd? mov #PT_TRACESYS, r1 tst r1, r0 - bt ret_from_syscall - bra syscall_ret_trace + bf syscall_ret_trace + bra ret_from_syscall nop .align 2 @@ -505,8 +513,6 @@ .long syscall_ret_trace __syscall_ret: .long syscall_ret -__INV_IMASK: - .long 0xffffff0f ! ~(IMASK) .align 2 @@ -518,7 +524,84 @@ .align 2 1: .long SYMBOL_NAME(schedule) +#ifdef CONFIG_PREEMPT + ! + ! Returning from interrupt during kernel mode: check if + ! preempt_schedule should be called. If need_resched flag + ! is set, preempt_count is zero, and we're not currently + ! in an interrupt handler (local irq or bottom half) then + ! call preempt_schedule. + ! + ! Increment preempt_count to prevent a nested interrupt + ! from reentering preempt_schedule, then decrement after + ! and drop through to regular interrupt return which will + ! jump back and check again in case such an interrupt did + ! come in (and didn't preempt due to preempt_count). + ! + ! NOTE: because we just checked that preempt_count was + ! zero before getting to the call, can't we use immediate + ! values (1 and 0) rather than inc/dec? Also, rather than + ! drop through to ret_from_irq, we already know this thread + ! is kernel mode, can't we go direct to ret_from_kirq? In + ! fact, with proper interrupt nesting and so forth could + ! the loop simply be on the need_resched w/o checking the + ! other stuff again? Optimize later... + ! + .align 2 +ret_from_kirq: + ! Nonzero preempt_count prevents scheduling + stc k_current, r1 + mov.l @(preempt_count,r1), r0 + cmp/eq #0, r0 + bf restore_all + ! Zero need_resched prevents scheduling + mov.l @(need_resched,r1), r0 + cmp/eq #0, r0 + bt restore_all + ! If in_interrupt(), don't schedule + mov.l __irq_stat, r1 + mov.l @(local_irq_count,r1), r0 + mov.l @(local_bh_count,r1), r1 + or r1, r0 + cmp/eq #0, r0 + bf restore_all + ! Allow scheduling using preempt_schedule + ! 
Adjust preempt_count and SR as needed. + stc k_current, r1 + mov.l @(preempt_count,r1), r0 ! Could replace this ... + add #1, r0 ! ... and this w/mov #1? + mov.l r0, @(preempt_count,r1) + STI() + mov.l __preempt_schedule, r0 + jsr @r0 + nop + /* CLI */ + stc sr, r0 + or #0xf0, r0 + ldc r0, sr + ! + stc k_current, r1 + mov.l @(preempt_count,r1), r0 ! Could replace this ... + add #-1, r0 ! ... and this w/mov #0? + mov.l r0, @(preempt_count,r1) + ! Maybe should bra ret_from_kirq, or loop over need_resched? + ! For now, fall through to ret_from_irq again... +#endif /* CONFIG_PREEMPT */ + ret_from_irq: + mov #OFF_SR, r0 + mov.l @(r0,r15), r0 ! get status register + shll r0 + shll r0 ! kernel space? +#ifndef CONFIG_PREEMPT + bt restore_all ! Yes, it's from kernel, go back soon +#else /* CONFIG_PREEMPT */ + bt ret_from_kirq ! From kernel: maybe preempt_schedule +#endif /* CONFIG_PREEMPT */ + ! + bra ret_from_syscall + nop + ret_from_exception: mov #OFF_SR, r0 mov.l @(r0,r15), r0 ! get status register @@ -564,6 +647,13 @@ .long SYMBOL_NAME(do_signal) __irq_stat: .long SYMBOL_NAME(irq_stat) +#ifdef CONFIG_PREEMPT +__preempt_schedule: + .long SYMBOL_NAME(preempt_schedule) +#endif /* CONFIG_PREEMPT */ +__INV_IMASK: + .long 0xffffff0f ! ~(IMASK) + .align 2 restore_all: @@ -679,7 +769,7 @@ __fpu_prepare_fd: .long SYMBOL_NAME(fpu_prepare_fd) __init_task_flags: - .long SYMBOL_NAME(init_task_union)+4 + .long SYMBOL_NAME(init_task_union)+flags __PF_USEDFPU: .long PF_USEDFPU #endif diff -urN linux-2.4.17-rc1-virgin/arch/sh/kernel/irq.c linux-2.4.17-rc1-wli3/arch/sh/kernel/irq.c --- linux-2.4.17-rc1-virgin/arch/sh/kernel/irq.c Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/arch/sh/kernel/irq.c Fri Dec 14 02:44:44 2001 @@ -229,6 +229,14 @@ struct irqaction * action; unsigned int status; + /* + * At this point we're now about to actually call handlers, + * and interrupts might get reenabled during them... bump + * preempt_count to prevent any preemption while the handler + * called here is pending... + */ + preempt_disable(); + /* Get IRQ number */ asm volatile("stc r2_bank, %0\n\t" "shlr2 %0\n\t" @@ -298,8 +306,17 @@ desc->handler->end(irq); spin_unlock(&desc->lock); + if (softirq_pending(cpu)) do_softirq(); + + /* + * We're done with the handlers, interrupts should be + * currently disabled; decrement preempt_count now so + * as we return preemption may be allowed... + */ + preempt_enable_no_resched(); + return 1; } diff -urN linux-2.4.17-rc1-virgin/arch/sparc/config.in linux-2.4.17-rc1-wli3/arch/sparc/config.in --- linux-2.4.17-rc1-virgin/arch/sparc/config.in Mon Jun 11 19:15:27 2001 +++ linux-2.4.17-rc1-wli3/arch/sparc/config.in Fri Dec 14 04:38:23 2001 @@ -28,6 +28,10 @@ define_bool CONFIG_VT_CONSOLE y bool 'Symmetric multi-processing support (does not work on sun4/sun4c)' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi # Identify this as a Sparc32 build define_bool CONFIG_SPARC32 y diff -urN linux-2.4.17-rc1-virgin/arch/sparc64/config.in linux-2.4.17-rc1-wli3/arch/sparc64/config.in --- linux-2.4.17-rc1-virgin/arch/sparc64/config.in Fri Dec 14 06:04:01 2001 +++ linux-2.4.17-rc1-wli3/arch/sparc64/config.in Fri Dec 14 04:38:23 2001 @@ -27,6 +27,10 @@ define_bool CONFIG_VT_CONSOLE y bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi # Identify this as a Sparc64 build define_bool CONFIG_SPARC64 y diff -urN linux-2.4.17-rc1-virgin/drivers/block/elevator.c linux-2.4.17-rc1-wli3/drivers/block/elevator.c --- linux-2.4.17-rc1-virgin/drivers/block/elevator.c Thu Jul 19 20:59:41 2001 +++ linux-2.4.17-rc1-wli3/drivers/block/elevator.c Sat Dec 15 14:54:07 2001 @@ -74,11 +74,10 @@ return 0; } - int elevator_linus_merge(request_queue_t *q, struct request **req, struct list_head * head, struct buffer_head *bh, int rw, - int max_sectors) + int max_sectors, int max_bomb_segments) { struct list_head *entry = &q->queue_head; unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE; @@ -116,6 +115,56 @@ } } + /* + * If we failed to merge a read anywhere in the request + * queue, we really don't want to place it at the end + * of the list, behind lots of writes. So place it near + * the front. + * + * We don't want to place it in front of _all_ writes: that + * would create lots of seeking, and isn't tunable. + * We try to avoid promoting this read in front of existing + * reads. + * + * max_bomb_sectors becomes the maximum number of write + * requests which we allow to remain in place in front of + * a newly introduced read. We weight things a little bit, + * so large writes are more expensive than small ones, but it's + * requests which count, not sectors. + */ + if (max_bomb_segments && rw == READ && ret == ELEVATOR_NO_MERGE) { + int cur_latency = 0; + struct request * const cur_request = *req; + + entry = head->next; + while (entry != &q->queue_head) { + struct request *__rq; + + if (entry == &q->queue_head) + BUG(); + if (entry == q->queue_head.next && + q->head_active && !q->plugged) + BUG(); + __rq = blkdev_entry_to_request(entry); + + if (__rq == cur_request) { + /* + * This is where the old algorithm placed it. + * There's no point pushing it further back, + * so leave it here, in sorted order. 
+ */ + break; + } + if (__rq->cmd == WRITE) { + cur_latency += 1 + __rq->nr_sectors / 64; + if (cur_latency >= max_bomb_segments) { + *req = __rq; + break; + } + } + entry = entry->next; + } + } return ret; } @@ -144,7 +193,7 @@ int elevator_noop_merge(request_queue_t *q, struct request **req, struct list_head * head, struct buffer_head *bh, int rw, - int max_sectors) + int max_sectors, int max_bomb_segments) { struct list_head *entry; unsigned int count = bh->b_size >> 9; @@ -188,7 +237,7 @@ output.queue_ID = elevator->queue_ID; output.read_latency = elevator->read_latency; output.write_latency = elevator->write_latency; - output.max_bomb_segments = 0; + output.max_bomb_segments = elevator->max_bomb_segments; if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t))) return -EFAULT; @@ -207,9 +256,12 @@ return -EINVAL; if (input.write_latency < 0) return -EINVAL; + if (input.max_bomb_segments < 0) + return -EINVAL; elevator->read_latency = input.read_latency; elevator->write_latency = input.write_latency; + elevator->max_bomb_segments = input.max_bomb_segments; return 0; } diff -urN linux-2.4.17-rc1-virgin/drivers/block/ll_rw_blk.c linux-2.4.17-rc1-wli3/drivers/block/ll_rw_blk.c --- linux-2.4.17-rc1-virgin/drivers/block/ll_rw_blk.c Mon Oct 29 12:11:17 2001 +++ linux-2.4.17-rc1-wli3/drivers/block/ll_rw_blk.c Sat Dec 15 14:54:07 2001 @@ -690,7 +690,8 @@ } else if (q->head_active && !q->plugged) head = head->next; - el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); + el_ret = elevator->elevator_merge_fn(q, &req, head, bh, + rw, max_sectors, elevator->max_bomb_segments); switch (el_ret) { case ELEVATOR_BACK_MERGE: diff -urN linux-2.4.17-rc1-virgin/drivers/char/mem.c linux-2.4.17-rc1-wli3/drivers/char/mem.c --- linux-2.4.17-rc1-virgin/drivers/char/mem.c Fri Dec 14 06:04:02 2001 +++ linux-2.4.17-rc1-wli3/drivers/char/mem.c Sun Dec 16 17:58:10 2001 @@ -272,8 +272,6 @@ return virtr + read; } -extern long vwrite(char *buf, char *addr, unsigned long count); - /* * This function writes to the *virtual* memory as seen by the kernel. 
*/ @@ -281,46 +279,12 @@ size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t wrote = 0; - ssize_t virtr = 0; - char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - wrote = count; - if (count > (unsigned long) high_memory - p) - wrote = (unsigned long) high_memory - p; - - wrote = do_write_mem(file, (void*)p, p, buf, wrote, ppos); - - p += wrote; - buf += wrote; - count -= wrote; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - if (len && copy_from_user(kbuf, buf, len)) { - free_page((unsigned long)kbuf); - return -EFAULT; - } - len = vwrite(kbuf, (char *)p, len); - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - *ppos = p; - return virtr + wrote; + if (p >= (unsigned long) high_memory) + return 0; + if (count > (unsigned long) high_memory - p) + count = (unsigned long) high_memory - p; + return do_write_mem(file, (void*)p, p, buf, count, ppos); } #if !defined(__mc68000__) @@ -400,7 +364,7 @@ if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, ZPR_NORMAL); zeromap_page_range(addr, count, PAGE_COPY); size -= count; diff -urN linux-2.4.17-rc1-virgin/drivers/char/mem.c~ linux-2.4.17-rc1-wli3/drivers/char/mem.c~ --- linux-2.4.17-rc1-virgin/drivers/char/mem.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/drivers/char/mem.c~ Fri Dec 14 03:53:03 2001 @@ -0,0 +1,642 @@ +/* + * linux/drivers/char/mem.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Added devfs support. + * Jan-11-1998, C. Scott Ananian + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_I2C +extern int i2c_init_all(void); +#endif +#ifdef CONFIG_FB +extern void fbmem_init(void); +#endif +#ifdef CONFIG_PROM_CONSOLE +extern void prom_con_init(void); +#endif +#ifdef CONFIG_MDA_CONSOLE +extern void mda_console_init(void); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) +extern void tapechar_init(void); +#endif + +static ssize_t do_write_mem(struct file * file, void *p, unsigned long realp, + const char * buf, size_t count, loff_t *ppos) +{ + ssize_t written; + + written = 0; +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (realp < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE-realp; + if (sz > count) sz = count; + /* Hmm. Do something? */ + buf+=sz; + p+=sz; + count-=sz; + written+=sz; + } +#endif + if (copy_from_user(p, buf, count)) + return -EFAULT; + written += count; + *ppos += written; + return written; +} + + +/* + * This funcion reads the *physical* memory. The f_pos points directly to the + * memory location. + */ +static ssize_t read_mem(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + unsigned long end_mem; + ssize_t read; + + end_mem = __pa(high_memory); + if (p >= end_mem) + return 0; + if (count > end_mem - p) + count = end_mem - p; + read = 0; +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. 
*/ + if (p < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE-p; + if (sz > count) + sz = count; + if (sz > 0) { + if (clear_user(buf, sz)) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + } +#endif + if (copy_to_user(buf, __va(p), count)) + return -EFAULT; + read += count; + *ppos += read; + return read; +} + +static ssize_t write_mem(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + unsigned long end_mem; + + end_mem = __pa(high_memory); + if (p >= end_mem) + return 0; + if (count > end_mem - p) + count = end_mem - p; + return do_write_mem(file, __va(p), p, buf, count, ppos); +} + +#ifndef pgprot_noncached + +/* + * This should probably be per-architecture in + */ +static inline pgprot_t pgprot_noncached(pgprot_t _prot) +{ + unsigned long prot = pgprot_val(_prot); + +#if defined(__i386__) || defined(__x86_64__) + /* On PPro and successors, PCD alone doesn't always mean + uncached because of interactions with the MTRRs. PCD | PWT + means definitely uncached. */ + if (boot_cpu_data.x86 > 3) + prot |= _PAGE_PCD | _PAGE_PWT; +#elif defined(__powerpc__) + prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; +#elif defined(__mc68000__) +#ifdef SUN3_PAGE_NOCACHE + if (MMU_IS_SUN3) + prot |= SUN3_PAGE_NOCACHE; + else +#endif + if (MMU_IS_851 || MMU_IS_030) + prot |= _PAGE_NOCACHE030; + /* Use no-cache mode, serialized */ + else if (MMU_IS_040 || MMU_IS_060) + prot = (prot & _CACHEMASK040) | _PAGE_NOCACHE_S; +#endif + + return __pgprot(prot); +} + +#endif /* !pgprot_noncached */ + +/* + * Architectures vary in how they handle caching for addresses + * outside of main memory. + */ +static inline int noncached_address(unsigned long addr) +{ +#if defined(__i386__) + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting PCD or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + return !( test_bit(X86_FEATURE_MTRR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, &boot_cpu_data.x86_capability) ) + && addr >= __pa(high_memory); +#else + return addr >= __pa(high_memory); +#endif +} + +static int mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Accessing memory above the top the kernel knows about or + * through a file pointer that was marked O_SYNC will be + * done non-cached. + */ + if (noncached_address(offset) || (file->f_flags & O_SYNC)) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* Don't try to swap out physical pages.. */ + vma->vm_flags |= VM_RESERVED; + + /* + * Don't dump addresses that are not real memory to a core file. + */ + if (offset >= __pa(high_memory) || (file->f_flags & O_SYNC)) + vma->vm_flags |= VM_IO; + + if (remap_page_range(vma->vm_start, offset, vma->vm_end-vma->vm_start, + vma->vm_page_prot)) + return -EAGAIN; + return 0; +} + +/* + * This function reads the *virtual* memory as seen by the kernel. 
+ */ +static ssize_t read_kmem(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read = 0; + ssize_t virtr = 0; + char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ + + if (p < (unsigned long) high_memory) { + read = count; + if (count > (unsigned long) high_memory - p) + read = (unsigned long) high_memory - p; + +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (p < PAGE_SIZE && read > 0) { + size_t tmp = PAGE_SIZE - p; + if (tmp > read) tmp = read; + if (clear_user(buf, tmp)) + return -EFAULT; + buf += tmp; + p += tmp; + read -= tmp; + count -= tmp; + } +#endif + if (copy_to_user(buf, (char *)p, read)) + return -EFAULT; + p += read; + buf += read; + count -= read; + } + + if (count > 0) { + kbuf = (char *)__get_free_page(GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + while (count > 0) { + int len = count; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + len = vread(kbuf, (char *)p, len); + if (!len) + break; + if (copy_to_user(buf, kbuf, len)) { + free_page((unsigned long)kbuf); + return -EFAULT; + } + count -= len; + buf += len; + virtr += len; + p += len; + } + free_page((unsigned long)kbuf); + } + *ppos = p; + return virtr + read; +} + +/* + * This function writes to the *virtual* memory as seen by the kernel. + */ +static ssize_t write_kmem(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + + if (p >= (unsigned long) high_memory) + return 0; + if (count > (unsigned long) high_memory - p) + count = (unsigned long) high_memory - p; + return do_write_mem(file, (void*)p, p, buf, count, ppos); +} + +#if !defined(__mc68000__) +static ssize_t read_port(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long i = *ppos; + char *tmp = buf; + + if (verify_area(VERIFY_WRITE,buf,count)) + return -EFAULT; + while (count-- > 0 && i < 65536) { + if (__put_user(inb(i),tmp) < 0) + return -EFAULT; + i++; + tmp++; + } + *ppos = i; + return tmp-buf; +} + +static ssize_t write_port(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long i = *ppos; + const char * tmp = buf; + + if (verify_area(VERIFY_READ,buf,count)) + return -EFAULT; + while (count-- > 0 && i < 65536) { + char c; + if (__get_user(c, tmp)) + return -EFAULT; + outb(c,i); + i++; + tmp++; + } + *ppos = i; + return tmp-buf; +} +#endif + +static ssize_t read_null(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t write_null(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return count; +} + +/* + * For fun, we are using the MMU for this. + */ +static inline size_t read_zero_pagealigned(char * buf, size_t size) +{ + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long addr=(unsigned long)buf; + + mm = current->mm; + /* Oops, this was forgotten before. -ben */ + down_read(&mm->mmap_sem); + + /* For private mappings, just map in zero pages. */ + for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + unsigned long count; + + if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) + goto out_up; + if (vma->vm_flags & VM_SHARED) + break; + count = vma->vm_end - addr; + if (count > size) + count = size; + + zap_page_range(mm, addr, count); + zeromap_page_range(addr, count, PAGE_COPY); + + size -= count; + buf += count; + addr += count; + if (size == 0) + goto out_up; + } + + up_read(&mm->mmap_sem); + + /* The shared case is hard. 
Let's do the conventional zeroing. */ + do { + unsigned long unwritten = clear_user(buf, PAGE_SIZE); + if (unwritten) + return size + unwritten - PAGE_SIZE; + if (current->need_resched) + schedule(); + buf += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size); + + return size; +out_up: + up_read(&mm->mmap_sem); + return size; +} + +static ssize_t read_zero(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long left, unwritten, written = 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + left = count; + + /* do we want to be clever? Arbitrary cut-off */ + if (count >= PAGE_SIZE*4) { + unsigned long partial; + + /* How much left of the page? */ + partial = (PAGE_SIZE-1) & -(unsigned long) buf; + unwritten = clear_user(buf, partial); + written = partial - unwritten; + if (unwritten) + goto out; + left -= partial; + buf += partial; + unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); + written += (left & PAGE_MASK) - unwritten; + if (unwritten) + goto out; + buf += left & PAGE_MASK; + left &= ~PAGE_MASK; + } + unwritten = clear_user(buf, left); + written += left - unwritten; +out: + return written ? written : -EFAULT; +} + +static int mmap_zero(struct file * file, struct vm_area_struct * vma) +{ + if (vma->vm_flags & VM_SHARED) + return shmem_zero_setup(vma); + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -EAGAIN; + return 0; +} + +static ssize_t write_full(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return -ENOSPC; +} + +/* + * Special lseek() function for /dev/null and /dev/zero. Most notably, you + * can fopen() both devices with "a" now. This was previously impossible. + * -- SRB. + */ + +static loff_t null_lseek(struct file * file, loff_t offset, int orig) +{ + return file->f_pos = 0; +} + +/* + * The memory devices use the full 32/64 bits of the offset, and so we cannot + * check against negative addresses: they are ok. The return value is weird, + * though, in that case (0). + * + * also note that seeking relative to the "end of file" isn't supported: + * it has no meaning, so it returns -EINVAL. + */ +static loff_t memory_lseek(struct file * file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + return file->f_pos; + case 1: + file->f_pos += offset; + return file->f_pos; + default: + return -EINVAL; + } +} + +static int open_port(struct inode * inode, struct file * filp) +{ + return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + +#define mmap_kmem mmap_mem +#define zero_lseek null_lseek +#define full_lseek null_lseek +#define write_zero write_null +#define read_full read_zero +#define open_mem open_port +#define open_kmem open_mem + +static struct file_operations mem_fops = { + llseek: memory_lseek, + read: read_mem, + write: write_mem, + mmap: mmap_mem, + open: open_mem, +}; + +static struct file_operations kmem_fops = { + llseek: memory_lseek, + read: read_kmem, + write: write_kmem, + mmap: mmap_kmem, + open: open_kmem, +}; + +static struct file_operations null_fops = { + llseek: null_lseek, + read: read_null, + write: write_null, +}; + +#if !defined(__mc68000__) +static struct file_operations port_fops = { + llseek: memory_lseek, + read: read_port, + write: write_port, + open: open_port, +}; +#endif + +static struct file_operations zero_fops = { + llseek: zero_lseek, + read: read_zero, + write: write_zero, + mmap: mmap_zero, +}; + +static struct file_operations full_fops = { + llseek: full_lseek, + read: read_full, + write: write_full, +}; + +static int memory_open(struct inode * inode, struct file * filp) +{ + switch (MINOR(inode->i_rdev)) { + case 1: + filp->f_op = &mem_fops; + break; + case 2: + filp->f_op = &kmem_fops; + break; + case 3: + filp->f_op = &null_fops; + break; +#if !defined(__mc68000__) + case 4: + filp->f_op = &port_fops; + break; +#endif + case 5: + filp->f_op = &zero_fops; + break; + case 7: + filp->f_op = &full_fops; + break; + case 8: + filp->f_op = &random_fops; + break; + case 9: + filp->f_op = &urandom_fops; + break; + default: + return -ENXIO; + } + if (filp->f_op && filp->f_op->open) + return filp->f_op->open(inode,filp); + return 0; +} + +void __init memory_devfs_register (void) +{ + /* These are never unregistered */ + static const struct { + unsigned short minor; + char *name; + umode_t mode; + struct file_operations *fops; + } list[] = { /* list of minor devices */ + {1, "mem", S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops}, + {2, "kmem", S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops}, + {3, "null", S_IRUGO | S_IWUGO, &null_fops}, + {4, "port", S_IRUSR | S_IWUSR | S_IRGRP, &port_fops}, + {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, + {7, "full", S_IRUGO | S_IWUGO, &full_fops}, + {8, "random", S_IRUGO | S_IWUSR, &random_fops}, + {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} + }; + int i; + + for (i=0; i<(sizeof(list)/sizeof(*list)); i++) + devfs_register (NULL, list[i].name, DEVFS_FL_NONE, + MEM_MAJOR, list[i].minor, + list[i].mode | S_IFCHR, + list[i].fops, NULL); +} + +static struct file_operations memory_fops = { + open: memory_open, /* just a selector for the real open */ +}; + +int __init chr_dev_init(void) +{ + if (devfs_register_chrdev(MEM_MAJOR,"mem",&memory_fops)) + printk("unable to get major %d for memory devs\n", MEM_MAJOR); + memory_devfs_register(); + rand_initialize(); +#ifdef CONFIG_I2C + i2c_init_all(); +#endif +#if defined (CONFIG_FB) + fbmem_init(); +#endif +#if defined (CONFIG_PROM_CONSOLE) + prom_con_init(); +#endif +#if defined (CONFIG_MDA_CONSOLE) + mda_console_init(); +#endif + tty_init(); +#ifdef CONFIG_M68K_PRINTER + lp_m68k_init(); +#endif + misc_init(); +#if CONFIG_QIC02_TAPE + qic02_tape_init(); +#endif +#ifdef CONFIG_FTAPE + ftape_init(); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) + tapechar_init(); +#endif + return 0; +} + +__initcall(chr_dev_init); diff -urN linux-2.4.17-rc1-virgin/drivers/char/tty_io.c linux-2.4.17-rc1-wli3/drivers/char/tty_io.c --- linux-2.4.17-rc1-virgin/drivers/char/tty_io.c Fri Dec 14 06:04:02 
2001 +++ linux-2.4.17-rc1-wli3/drivers/char/tty_io.c Sun Dec 16 17:58:10 2001 @@ -722,6 +722,7 @@ ret = -ERESTARTSYS; if (signal_pending(current)) break; + debug_lock_break(551); if (current->need_resched) schedule(); } diff -urN linux-2.4.17-rc1-virgin/drivers/ieee1394/csr.c linux-2.4.17-rc1-wli3/drivers/ieee1394/csr.c --- linux-2.4.17-rc1-virgin/drivers/ieee1394/csr.c Thu Jul 19 17:48:15 2001 +++ linux-2.4.17-rc1-wli3/drivers/ieee1394/csr.c Fri Dec 14 02:44:44 2001 @@ -10,6 +10,7 @@ */ #include +#include #include "ieee1394_types.h" #include "hosts.h" diff -urN linux-2.4.17-rc1-virgin/fs/adfs/map.c linux-2.4.17-rc1-wli3/fs/adfs/map.c --- linux-2.4.17-rc1-virgin/fs/adfs/map.c Thu Oct 25 13:53:53 2001 +++ linux-2.4.17-rc1-wli3/fs/adfs/map.c Fri Dec 14 02:44:44 2001 @@ -12,6 +12,7 @@ #include #include #include +#include #include "adfs.h" diff -urN linux-2.4.17-rc1-virgin/fs/binfmt_elf.c linux-2.4.17-rc1-wli3/fs/binfmt_elf.c --- linux-2.4.17-rc1-virgin/fs/binfmt_elf.c Fri Dec 14 06:04:11 2001 +++ linux-2.4.17-rc1-wli3/fs/binfmt_elf.c Fri Dec 14 03:53:30 2001 @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -1032,25 +1032,6 @@ elf_fpregset_t fpu; /* NT_PRFPREG */ struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ - /* first copy the parameters from user space */ - memset(&psinfo, 0, sizeof(psinfo)); - { - int i, len; - - len = current->mm->arg_end - current->mm->arg_start; - if (len >= ELF_PRARGSZ) - len = ELF_PRARGSZ-1; - copy_from_user(&psinfo.pr_psargs, - (const char *)current->mm->arg_start, len); - for(i = 0; i < len; i++) - if (psinfo.pr_psargs[i] == 0) - psinfo.pr_psargs[i] = ' '; - psinfo.pr_psargs[len] = 0; - - } - - /* now stop all vm operations */ - down_write(¤t->mm->mmap_sem); segs = current->mm->map_count; #ifdef DEBUG @@ -1092,6 +1073,7 @@ * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. 
*/ + memset(&psinfo, 0, sizeof(psinfo)); memset(&prstatus, 0, sizeof(prstatus)); notes[0].name = "CORE"; @@ -1147,6 +1129,23 @@ psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); + { + int i, len; + + set_fs(fs); + + len = current->mm->arg_end - current->mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ-1; + copy_from_user(&psinfo.pr_psargs, + (const char *)current->mm->arg_start, len); + for(i = 0; i < len; i++) + if (psinfo.pr_psargs[i] == 0) + psinfo.pr_psargs[i] = ' '; + psinfo.pr_psargs[len] = 0; + + set_fs(KERNEL_DS); + } strncpy(psinfo.pr_fname, current->comm, sizeof(psinfo.pr_fname)); notes[2].name = "CORE"; @@ -1218,6 +1217,8 @@ if (!writenote(¬es[i], file)) goto end_coredump; + set_fs(fs); + DUMP_SEEK(dataoff); for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { @@ -1231,24 +1232,22 @@ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { - struct page* page; - struct vm_area_struct *vma; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgd)) + goto nextpage_coredump; + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto nextpage_coredump; + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) { +nextpage_coredump: DUMP_SEEK (file->f_pos + PAGE_SIZE); } else { - if (page == ZERO_PAGE(addr)) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); - } else { - void *kaddr; - flush_cache_page(vma, addr); - kaddr = kmap(page); - DUMP_WRITE(kaddr, PAGE_SIZE); - flush_page_to_ram(page); - kunmap(page); - } - put_page(page); + DUMP_WRITE((void*)addr, PAGE_SIZE); } } } @@ -1261,7 +1260,6 @@ end_coredump: set_fs(fs); - up_write(¤t->mm->mmap_sem); return has_dumped; } #endif /* USE_ELF_CORE_DUMP */ diff -urN linux-2.4.17-rc1-virgin/fs/buffer.c linux-2.4.17-rc1-wli3/fs/buffer.c --- linux-2.4.17-rc1-virgin/fs/buffer.c Fri Dec 14 06:04:11 2001 +++ linux-2.4.17-rc1-wli3/fs/buffer.c Sun Dec 16 22:28:34 2001 @@ -254,7 +254,6 @@ while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; - if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -262,7 +261,13 @@ } if (dev && bh->b_dev != dev) continue; - +#if 0 + if (conditional_schedule_needed()) { + debug_lock_break(1); + spin_unlock(&lru_list_lock); + return -EAGAIN; + } +#endif get_bh(bh); spin_unlock(&lru_list_lock); wait_on_buffer (bh); @@ -459,13 +464,19 @@ return err; } -/* After several hours of tedious analysis, the following hash - * function won. Do not mess with it... -DaveM +/* + * The shift/add buffer cache hash function from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 6 describes the behavior of various buffer cache hashes. + * + * The lack of an attempt to mix the bits of dev in this hash + * function appears disturbing to me, but I don't have the + * resources to investigate the value of attempting to do so. + * -- wli */ -#define _hashfn(dev,block) \ - ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ - (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ - ((block) << (bh_hash_shift - 12)))) +#define _hashfn(dev, block) \ + ( (block << 7) - block + (block >> 10) + (block >> 18) ) + #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] static inline void __insert_into_hash_list(struct buffer_head *bh) @@ -672,6 +683,13 @@ /* Not hashed? 
*/ if (!bh->b_pprev) continue; + if (conditional_schedule_needed()) { + debug_lock_break(2); /* bkl is held too */ + get_bh(bh); + break_spin_lock_and_resched(&lru_list_lock); + put_bh(bh); + slept = 1; + } if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -719,11 +737,9 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages(GFP_NOFS); run_task_queue(&tq_disk); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); @@ -823,6 +839,8 @@ struct buffer_head *bh; struct inode tmp; int err = 0, err2; + + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_buffers); @@ -844,6 +862,12 @@ spin_lock(&lru_list_lock); } } + /* haven't hit this code path ... */ + debug_lock_break(551); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + break_spin_lock(&lru_list_lock); + } } while (!list_empty(&tmp.i_dirty_buffers)) { @@ -873,6 +897,7 @@ struct inode tmp; int err = 0, err2; + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); spin_lock(&lru_list_lock); @@ -904,9 +929,14 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + debug_lock_break(1); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + conditional_schedule(); + } spin_lock(&lru_list_lock); } - + spin_unlock(&lru_list_lock); err2 = osync_inode_data_buffers(inode); @@ -933,6 +963,8 @@ struct list_head *list; int err = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&lru_list_lock); repeat: @@ -940,6 +972,17 @@ for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + /* untested code path ... */ + debug_lock_break(551); + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + if (conditional_schedule_needed()) { + break_spin_lock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); diff -urN linux-2.4.17-rc1-virgin/fs/buffer.c~ linux-2.4.17-rc1-wli3/fs/buffer.c~ --- linux-2.4.17-rc1-virgin/fs/buffer.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/fs/buffer.c~ Sat Dec 15 08:36:14 2001 @@ -0,0 +1,2869 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'buffer.c' implements the buffer-cache functions. Race-conditions have + * been avoided by NEVER letting an interrupt change a buffer (except for the + * data, of course), but instead letting the caller do it. + */ + +/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */ + +/* Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + */ + +/* Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. -DaveM + */ + +/* Added 32k buffer block sizes - these are required older ARM systems. + * - RMK + */ + +/* Thread it... 
-DaveM */ + +/* async buffer flushing, 1999 Andrea Arcangeli */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) +#define NR_RESERVED (10*MAX_BUF_PER_PAGE) +#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this + number of unused buffer heads */ + +/* Anti-deadlock ordering: + * lru_list_lock > hash_table_lock > unused_list_lock + */ + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers) + +/* + * Hash table gook.. + */ +static unsigned int bh_hash_mask; +static unsigned int bh_hash_shift; +static struct buffer_head **hash_table; +static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; + +static struct buffer_head *lru_list[NR_LIST]; +static spinlock_t lru_list_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static int nr_buffers_type[NR_LIST]; +static unsigned long size_buffers_type[NR_LIST]; + +static struct buffer_head * unused_list; +static int nr_unused_buffer_heads; +static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; +static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); + +static int grow_buffers(kdev_t dev, unsigned long block, int size); +static void __refile_buffer(struct buffer_head *); + +/* This is used by some architectures to estimate available memory. */ +atomic_t buffermem_pages = ATOMIC_INIT(0); + +/* Here is the parameter block for the bdflush process. If you add or + * remove any of the parameters, make sure to update kernel/sysctl.c + * and the documentation at linux/Documentation/sysctl/vm.txt. + */ + +#define N_PARAM 9 + +/* The dummy values in this structure are left in there for compatibility + * with old programs that play with the /proc entries. + */ +union bdflush_param { + struct { + int nfract; /* Percentage of buffer cache dirty to + activate bdflush */ + int dummy1; /* old "ndirty" */ + int dummy2; /* old "nrefill" */ + int dummy3; /* unused */ + int interval; /* jiffies delay between kupdate flushes */ + int age_buffer; /* Time for normal buffer to age before we flush it */ + int nfract_sync;/* Percentage of buffer cache dirty to + activate bdflush synchronously */ + int dummy4; /* unused */ + int dummy5; /* unused */ + } b_un; + unsigned int data[N_PARAM]; +} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}}; + +/* These are the min and max parameter values that we will allow to be assigned */ +int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; +int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0}; + +void unlock_buffer(struct buffer_head *bh) +{ + clear_bit(BH_Wait_IO, &bh->b_state); + clear_bit(BH_launder, &bh->b_state); + clear_bit(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + if (waitqueue_active(&bh->b_wait)) + wake_up(&bh->b_wait); +} + +/* + * Rewrote the wait-routines to use the "new" wait-queue functionality, + * and getting rid of the cli-sti pairs. The wait-queue routines still + * need cli-sti, but now it's just a couple of 386 instructions or so. + * + * Note that the real wait_on_buffer() is an inline function that checks + * if 'b_wait' is set before calling this, so that the queues aren't set + * up unnecessarily. 
+ */ +void __wait_on_buffer(struct buffer_head * bh) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + get_bh(bh); + add_wait_queue(&bh->b_wait, &wait); + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!buffer_locked(bh)) + break; + schedule(); + } while (buffer_locked(bh)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&bh->b_wait, &wait); + put_bh(bh); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + put_bh(bh); +} + +/* + * The buffers have been marked clean and locked. Just submit the dang + * things.. + */ +static void write_locked_buffers(struct buffer_head **array, unsigned int count) +{ + do { + struct buffer_head * bh = *array++; + bh->b_end_io = end_buffer_io_sync; + submit_bh(WRITE, bh); + } while (--count); +} + +/* + * Write some buffers from the head of the dirty queue. + * + * This must be called with the LRU lock held, and will + * return without it! + */ +#define NRSYNC (32) +static int write_some_buffers(kdev_t dev) +{ + struct buffer_head *next; + struct buffer_head *array[NRSYNC]; + unsigned int count; + int nr; + + next = lru_list[BUF_DIRTY]; + nr = nr_buffers_type[BUF_DIRTY]; + count = 0; + while (next && --nr >= 0) { + struct buffer_head * bh = next; + next = bh->b_next_free; + + if (dev && bh->b_dev != dev) + continue; + if (test_and_set_bit(BH_Lock, &bh->b_state)) + continue; + if (atomic_set_buffer_clean(bh)) { + __refile_buffer(bh); + get_bh(bh); + array[count++] = bh; + if (count < NRSYNC) + continue; + + spin_unlock(&lru_list_lock); + write_locked_buffers(array, count); + return -EAGAIN; + } + unlock_buffer(bh); + __refile_buffer(bh); + } + spin_unlock(&lru_list_lock); + + if (count) + write_locked_buffers(array, count); + return 0; +} + +/* + * Write out all buffers on the dirty list. + */ +static void write_unlocked_buffers(kdev_t dev) +{ + do { + spin_lock(&lru_list_lock); + } while (write_some_buffers(dev)); + run_task_queue(&tq_disk); +} + +/* + * Wait for a buffer on the proper list. + * + * This must be called with the LRU lock held, and + * will return with it released. + */ +static int wait_for_buffers(kdev_t dev, int index, int refile) +{ + struct buffer_head * next; + int nr; + + next = lru_list[index]; + nr = nr_buffers_type[index]; + while (next && --nr >= 0) { + struct buffer_head *bh = next; + next = bh->b_next_free; + + if (!buffer_locked(bh)) { + if (refile) + __refile_buffer(bh); + continue; + } + if (dev && bh->b_dev != dev) + continue; + + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer (bh); + put_bh(bh); + return -EAGAIN; + } + spin_unlock(&lru_list_lock); + return 0; +} + +static inline void wait_for_some_buffers(kdev_t dev) +{ + spin_lock(&lru_list_lock); + wait_for_buffers(dev, BUF_LOCKED, 1); +} + +static int wait_for_locked_buffers(kdev_t dev, int index, int refile) +{ + do { + spin_lock(&lru_list_lock); + } while (wait_for_buffers(dev, index, refile)); + return 0; +} + +/* Call sync_buffers with wait!=0 to ensure that the call does not + * return until all buffer writes have completed. Sync() may return + * before the writes have finished; fsync() may not. + */ + +/* Godamity-damn. Some buffers (bitmaps for filesystems) + * spontaneously dirty themselves without ever brelse being called. 
+ * We will ultimately want to put these in a separate list, but for + * now we search all of the lists for dirty buffers. + */ +int sync_buffers(kdev_t dev, int wait) +{ + int err = 0; + + /* One pass for no-wait, three for wait: + * 0) write out all dirty, unlocked buffers; + * 1) wait for all dirty locked buffers; + * 2) write out all dirty, unlocked buffers; + * 2) wait for completion by waiting for all buffers to unlock. + */ + write_unlocked_buffers(dev); + if (wait) { + err = wait_for_locked_buffers(dev, BUF_DIRTY, 0); + write_unlocked_buffers(dev); + err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1); + } + return err; +} + +int fsync_super(struct super_block *sb) +{ + kdev_t dev = sb->s_dev; + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes_sb(sb); + DQUOT_SYNC(dev); + lock_super(sb); + if (sb->s_dirt && sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +int fsync_no_super(kdev_t dev) +{ + sync_buffers(dev, 0); + return sync_buffers(dev, 1); +} + +int fsync_dev(kdev_t dev) +{ + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes(dev); + DQUOT_SYNC(dev); + sync_supers(dev); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +/* + * There's no real reason to pretend we should + * ever do anything differently + */ +void sync_dev(kdev_t dev) +{ + fsync_dev(dev); +} + +asmlinkage long sys_sync(void) +{ + fsync_dev(0); + return 0; +} + +/* + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + kdev_t dev; + int ret; + + lock_kernel(); + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + dev = inode->i_dev; + ret = sync_buffers(dev, 1); + unlock_kernel(); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + /* We need to protect against concurrent writers.. */ + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 0); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 1); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +/* + * The shift/add buffer cache hash function from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 6 describes the behavior of various buffer cache hashes. 
+ * + * The lack of an attempt to mix the bits of dev in this hash + * function appears disturbing to me, but I don't have the + * resources to investigate the value of attempting to do so. + * -- wli + */ +#define _hashfn(dev, block) \ + ( (block << 7) - block + (block >> 10) + (block >> 18) ) + +#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] + +static inline void __insert_into_hash_list(struct buffer_head *bh) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head *next = *head; + + *head = bh; + bh->b_pprev = head; + bh->b_next = next; + if (next != NULL) + next->b_pprev = &bh->b_next; +} + +static __inline__ void __hash_unlink(struct buffer_head *bh) +{ + struct buffer_head **pprev = bh->b_pprev; + if (pprev) { + struct buffer_head *next = bh->b_next; + if (next) + next->b_pprev = pprev; + *pprev = next; + bh->b_pprev = NULL; + } +} + +static void __insert_into_lru_list(struct buffer_head * bh, int blist) +{ + struct buffer_head **bhp = &lru_list[blist]; + + if (bh->b_prev_free || bh->b_next_free) BUG(); + + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + nr_buffers_type[blist]++; + size_buffers_type[blist] += bh->b_size; +} + +static void __remove_from_lru_list(struct buffer_head * bh) +{ + struct buffer_head *next = bh->b_next_free; + if (next) { + struct buffer_head *prev = bh->b_prev_free; + int blist = bh->b_list; + + prev->b_next_free = next; + next->b_prev_free = prev; + if (lru_list[blist] == bh) { + if (next == bh) + next = NULL; + lru_list[blist] = next; + } + bh->b_next_free = NULL; + bh->b_prev_free = NULL; + nr_buffers_type[blist]--; + size_buffers_type[blist] -= bh->b_size; + } +} + +/* must be called with both the hash_table_lock and the lru_list_lock + held */ +static void __remove_from_queues(struct buffer_head *bh) +{ + __hash_unlink(bh); + __remove_from_lru_list(bh); +} + +static void remove_from_queues(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + __remove_from_queues(bh); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} + +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh, **p = &hash(dev, block); + + read_lock(&hash_table_lock); + + for (;;) { + bh = *p; + if (!bh) + break; + p = &bh->b_next; + if (bh->b_blocknr != block) + continue; + if (bh->b_size != size) + continue; + if (bh->b_dev != dev) + continue; + get_bh(bh); + break; + } + + read_unlock(&hash_table_lock); + return bh; +} + +void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); +} + +void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers); + spin_unlock(&lru_list_lock); +} + +/* The caller must have the lru_list lock before calling the + remove_inode_queue functions. 
*/ +static void __remove_inode_queue(struct buffer_head *bh) +{ + bh->b_inode = NULL; + list_del(&bh->b_inode_buffers); +} + +static inline void remove_inode_queue(struct buffer_head *bh) +{ + if (bh->b_inode) + __remove_inode_queue(bh); +} + +int inode_has_buffers(struct inode *inode) +{ + int ret; + + spin_lock(&lru_list_lock); + ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers); + spin_unlock(&lru_list_lock); + + return ret; +} + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash + dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to + be preserved. These buffers are simply skipped. + + We also skip buffers which are still in use. For example this can + happen if a userspace program is reading the block device. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completion) and + then an invalidate_buffers call that doesn't trash dirty buffers. + + For handling cache coherency with the blkdev pagecache the 'update' case + is been introduced. It is needed to re-read from disk any pinned + buffer. NOTE: re-reading from disk is destructive so we can do it only + when we assume nobody is changing the buffercache under our I/O and when + we think the disk contains more recent information than the buffercache. + The update == 1 pass marks the buffers we need to update, the update == 2 + pass does the actual I/O. */ +void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) +{ + int i, nlist, slept; + struct buffer_head * bh, * bh_next; + kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */ + + retry: + slept = 0; + spin_lock(&lru_list_lock); + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { + bh_next = bh->b_next_free; + + /* Another device? */ + if (bh->b_dev != dev) + continue; + /* Not hashed? 
*/ + if (!bh->b_pprev) + continue; + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + slept = 1; + spin_lock(&lru_list_lock); + put_bh(bh); + } + + write_lock(&hash_table_lock); + /* All buffers in the lru lists are mapped */ + if (!buffer_mapped(bh)) + BUG(); + if (buffer_dirty(bh)) + printk("invalidate: dirty buffer\n"); + if (!atomic_read(&bh->b_count)) { + if (destroy_dirty_buffers || !buffer_dirty(bh)) { + remove_inode_queue(bh); + } + } else + printk("invalidate: busy buffer\n"); + + write_unlock(&hash_table_lock); + if (slept) + goto out; + } + } +out: + spin_unlock(&lru_list_lock); + if (slept) + goto retry; + + /* Get rid of the page cache */ + invalidate_inode_pages(bdev->bd_inode); +} + +void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) +{ + struct block_device *bdev = bdget(dev); + if (bdev) { + invalidate_bdev(bdev, destroy_dirty_buffers); + bdput(bdev); + } +} + +static void free_more_memory(void) +{ + balance_dirty(); + wakeup_bdflush(); + try_to_free_pages(GFP_NOFS); + run_task_queue(&tq_disk); + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); +} + +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_list = BUF_CLEAN; + bh->b_end_io = handler; + bh->b_private = private; +} + +static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +{ + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + + mark_buffer_uptodate(bh, uptodate); + + /* This is a temporary buffer used for page I/O. */ + page = bh->b_page; + + if (!uptodate) + SetPageError(page); + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + * + * Async buffer_heads are here only as labels for IO, and get + * thrown away once the IO for this page is complete. IO is + * deemed complete once all buffers have been visited + * (b_count==0) and are now unlocked. We must make sure that + * only the _last_ buffer that decrements its count is the one + * that unlock the page.. + */ + spin_lock_irqsave(&page_uptodate_lock, flags); + mark_buffer_async(bh, 0); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async(tmp) && buffer_locked(tmp)) + goto still_busy; + tmp = tmp->b_this_page; + } + + /* OK, the async IO on this page is complete. */ + spin_unlock_irqrestore(&page_uptodate_lock, flags); + + /* + * if none of the buffers had errors then we can set the + * page uptodate: + */ + if (!PageError(page)) + SetPageUptodate(page); + + UnlockPage(page); + + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + +inline void set_buffer_async_io(struct buffer_head *bh) { + bh->b_end_io = end_buffer_io_async ; + mark_buffer_async(bh, 1); +} + +/* + * Synchronise all the inode's dirty buffers to the disk. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. 
+ * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ + +int fsync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_buffers)) { + bh = BH_ENTRY(inode->i_dirty_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_buffers.prev); + remove_inode_queue(bh); + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_buffers(inode); + + if (err) + return err; + else + return err2; +} + +int fsync_inode_data_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_data_buffers)) { + bh = BH_ENTRY(inode->i_dirty_data_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_data_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev); + remove_inode_queue(bh); + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_data_buffers(inode); + + if (err) + return err; + else + return err2; +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. 
+ */ + +int osync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + +int osync_inode_data_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_data_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + struct list_head * entry; + + spin_lock(&lru_list_lock); + while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) + remove_inode_queue(BH_ENTRY(entry)); + while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) + remove_inode_queue(BH_ENTRY(entry)); + spin_unlock(&lru_list_lock); +} + + +/* + * Ok, this is getblk, and it isn't very clear, again to hinder + * race-conditions. Most of the code is seldom used, (ie repeating), + * so it should be much more efficient than it looks. + * + * The algorithm is changed: hopefully better, and an elusive bug removed. + * + * 14.02.92: changed it to sync dirty buffers a bit: better performance + * when the filesystem starts to get full of dirty blocks (I hope). + */ +struct buffer_head * getblk(kdev_t dev, int block, int size) +{ + for (;;) { + struct buffer_head * bh; + + bh = get_hash_table(dev, block, size); + if (bh) + return bh; + + if (!grow_buffers(dev, block, size)) + free_more_memory(); + } +} + +/* -1 -> no need to flush + 0 -> async flush + 1 -> sync flush (wait for I/O completion) */ +static int balance_dirty_state(void) +{ + unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; + + dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; + dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT; + tot = nr_free_buffer_pages(); + + dirty *= 100; + soft_dirty_limit = tot * bdf_prm.b_un.nfract; + hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync; + + /* First, check for the "real" dirty limit. */ + if (dirty > soft_dirty_limit) { + if (dirty > hard_dirty_limit) + return 1; + return 0; + } + + return -1; +} + +/* + * if a new dirty buffer is created we need to balance bdflush. + * + * in the future we might want to make bdflush aware of different + * pressures on different devices - thus the (currently unused) + * 'dev' parameter. 
+ */ +void balance_dirty(void) +{ + int state = balance_dirty_state(); + + if (state < 0) + return; + + /* If we're getting into imbalance, start write-out */ + spin_lock(&lru_list_lock); + write_some_buffers(NODEV); + + /* + * And if we're _really_ out of balance, wait for + * some of the dirty/locked buffers ourselves and + * start bdflush. + * This will throttle heavy writers. + */ + if (state > 0) { + wait_for_some_buffers(NODEV); + wakeup_bdflush(); + } +} + +inline void __mark_dirty(struct buffer_head *bh) +{ + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; + refile_buffer(bh); +} + +/* atomic version, the user must call balance_dirty() by hand + as soon as it become possible to block */ +void __mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) + __mark_dirty(bh); +} + +void mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + balance_dirty(); + } +} + +void set_buffer_flushtime(struct buffer_head *bh) +{ + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; +} +EXPORT_SYMBOL(set_buffer_flushtime); + +/* + * A buffer may need to be moved from one buffer list to another + * (e.g. in case it is not shared any more). Handle this. + */ +static void __refile_buffer(struct buffer_head *bh) +{ + int dispose = BUF_CLEAN; + if (buffer_locked(bh)) + dispose = BUF_LOCKED; + if (buffer_dirty(bh)) + dispose = BUF_DIRTY; + if (dispose != bh->b_list) { + __remove_from_lru_list(bh); + bh->b_list = dispose; + if (dispose == BUF_CLEAN) + remove_inode_queue(bh); + __insert_into_lru_list(bh, dispose); + } +} + +void refile_buffer(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + __refile_buffer(bh); + spin_unlock(&lru_list_lock); +} + +/* + * Release a buffer head + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. + */ +void __bforget(struct buffer_head * buf) +{ + mark_buffer_clean(buf); + __brelse(buf); +} + +/** + * bread() - reads a specified block and returns the bh + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that + * contains it. It returns NULL if the block was unreadable. + */ +struct buffer_head * bread(kdev_t dev, int block, int size) +{ + struct buffer_head * bh; + + bh = getblk(dev, block, size); + touch_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* + * Note: the caller should wake up the buffer_wait list if needed. + */ +static void __put_unused_buffer_head(struct buffer_head * bh) +{ + if (bh->b_inode) + BUG(); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { + kmem_cache_free(bh_cachep, bh); + } else { + bh->b_dev = B_FREE; + bh->b_blocknr = -1; + bh->b_this_page = NULL; + + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + unused_list = bh; + } +} + +void put_unused_buffer_head(struct buffer_head *bh) +{ + spin_lock(&unused_list_lock); + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); +} +EXPORT_SYMBOL(put_unused_buffer_head); + +/* + * Reserve NR_RESERVED buffer heads for async IO requests to avoid + * no-buffer-head deadlock. Return NULL on failure; waiting for + * buffer heads is now handled in create_buffers(). 
+ */ +struct buffer_head * get_unused_buffer_head(int async) +{ + struct buffer_head * bh; + + spin_lock(&unused_list_lock); + if (nr_unused_buffer_heads > NR_RESERVED) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + + /* This is critical. We can't call out to the FS + * to get more buffer heads, because the FS may need + * more buffer-heads itself. Thus SLAB_NOFS. + */ + if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) { + bh->b_blocknr = -1; + bh->b_this_page = NULL; + return bh; + } + + /* + * If we need an async buffer, use the reserved buffer heads. + */ + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + } + + return NULL; +} +EXPORT_SYMBOL(get_unused_buffer_head); + +void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) +{ + bh->b_page = page; + if (offset >= PAGE_SIZE) + BUG(); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * The async flag is used to differentiate async IO (paging, swapping) + * from ordinary buffer allocations, and only async requests are allowed + * to sleep waiting for buffer heads. + */ +static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = get_unused_buffer_head(async); + if (!bh) + goto no_grow; + + bh->b_dev = NODEV; + bh->b_this_page = head; + head = bh; + + bh->b_state = 0; + bh->b_next_free = NULL; + bh->b_pprev = NULL; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + set_bh_page(bh, page, offset); + + bh->b_list = BUF_CLEAN; + bh->b_end_io = NULL; + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + spin_lock(&unused_list_lock); + do { + bh = head; + head = head->b_this_page; + __put_unused_buffer_head(bh); + } while (head); + spin_unlock(&unused_list_lock); + + /* Wake up any waiters ... */ + wake_up(&buffer_wait); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!async) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + run_task_queue(&tq_disk); + + free_more_memory(); + goto try_again; +} + +/* + * Called when truncating a buffer on a page completely. 
+ */ +static void discard_buffer(struct buffer_head * bh) +{ + if (buffer_mapped(bh)) { + mark_buffer_clean(bh); + lock_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + remove_from_queues(bh); + unlock_buffer(bh); + } +} + +/** + * try_to_release_page - release old fs-specific metadata on a page + * + */ + +int try_to_release_page(struct page * page, int gfp_mask) +{ + if (!PageLocked(page)) + BUG(); + + if (!page->mapping) + goto try_to_free; + if (!page->mapping->a_ops->releasepage) + goto try_to_free; + if (page->mapping->a_ops->releasepage(page, gfp_mask)) + goto try_to_free; + /* + * We couldn't release buffer metadata; don't even bother trying + * to release buffers. + */ + return 0; +try_to_free: + return try_to_free_buffers(page, gfp_mask); +} + +/* + * We don't have to release all buffers here, but + * we have to be sure that no dirty buffer is left + * and no IO is going on (no buffer is locked), because + * we have truncated the file and are going to free the + * blocks on-disk.. + */ +int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + if (!PageLocked(page)) + BUG(); + if (!page->buffers) + return 1; + + head = page->buffers; + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully flushed? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * subtle. We release buffer-heads only if this is + * the 'final' flushpage. We have invalidated the get_block + * cached value unconditionally, so real IO is not + * possible anymore. + * + * If the free doesn't work out, the buffers can be + * left around - they just turn into anonymous buffers + * instead. + */ + if (!offset) { + if (!try_to_release_page(page, 0)) + return 0; + } + + return 1; +} + +void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize) +{ + struct buffer_head *bh, *head, *tail; + + /* FIXME: create_buffers should fail if there's no enough memory */ + head = create_buffers(page, blocksize, 1); + if (page->buffers) + BUG(); + + bh = head; + do { + bh->b_dev = dev; + bh->b_blocknr = 0; + bh->b_end_io = NULL; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + */ + +static void unmap_underlying_metadata(struct buffer_head * bh) +{ + struct buffer_head *old_bh; + + old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + if (old_bh) { + mark_buffer_clean(old_bh); + wait_on_buffer(old_bh); + clear_bit(BH_Req, &old_bh->b_state); + __brelse(old_bh); + } +} + +/* + * NOTE! 
All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * block_write_full_page() is SMP threaded - the kernel lock is not held. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) +{ + int err, i; + unsigned long block; + struct buffer_head *bh, *head; + + if (!PageLocked(page)) + BUG(); + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits); + head = page->buffers; + + block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + bh = head; + i = 0; + + /* Stage 1: make sure we have all the buffers mapped! */ + do { + /* + * If the buffer isn't up-to-date, we can't be sure + * that the buffer has been initialized with the proper + * block number information etc.. + * + * Leave it to the low-level FS to make all those + * decisions (block #0 may actually be a valid block) + */ + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) + unmap_underlying_metadata(bh); + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* Stage 2: lock the buffers, mark them clean */ + do { + lock_buffer(bh); + set_buffer_async_io(bh); + set_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 3: submit the IO */ + do { + struct buffer_head *next = bh->b_this_page; + submit_bh(WRITE, bh); + bh = next; + } while (bh != head); + + /* Done - end_buffer_io_async will unlock */ + SetPageUptodate(page); + return 0; + +out: + ClearPageUptodate(page); + UnlockPage(page); + return err; +} + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + unsigned long block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + char *kaddr = kmap(page); + + blocksize = 1 << inode->i_blkbits; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + bbits = inode->i_blkbits; + block = page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + if (!bh) + BUG(); + block_end = block_start+blocksize; + if (block_end <= from) + continue; + if (block_start >= to) + break; + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh); + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (block_end > to) + memset(kaddr+to, 0, block_end-to); + if (block_start < from) + memset(kaddr+block_start, 0, from-block_start); + if (block_end > to || block_start < from) + flush_dcache_page(page); + continue; + } + } + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (!buffer_uptodate(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. 
+ */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + err = -EIO; + if (!buffer_uptodate(*wait_bh)) + goto out; + } + return 0; +out: + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0, need_balance_dirty = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page->buffers, block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_bit(BH_Uptodate, &bh->b_state); + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + buffer_insert_inode_data_queue(bh, inode); + need_balance_dirty = 1; + } + } + } + + if (need_balance_dirty) + balance_dirty(); + /* + * is this a partial write that happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' wether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * mark_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, blocks; + int nr, i; + + if (!PageLocked(page)) + PAGE_BUG(page); + blocksize = 1 << inode->i_blkbits; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + blocks = PAGE_CACHE_SIZE >> inode->i_blkbits; + iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + if (iblock < lblock) { + if (get_block(inode, iblock, bh, 0)) + continue; + } + if (!buffer_mapped(bh)) { + memset(kmap(page) + i*blocksize, 0, blocksize); + flush_dcache_page(page); + kunmap(page); + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + /* get_block() might have updated the buffer synchronously */ + if (buffer_uptodate(bh)) + continue; + } + + arr[nr] = bh; + nr++; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (!nr) { + /* + * all buffers are uptodate - we can set the page + * uptodate as well. + */ + SetPageUptodate(page); + UnlockPage(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + lock_buffer(bh); + set_buffer_async_io(bh); + } + + /* Stage 3: start the IO */ + for (i = 0; i < nr; i++) + submit_bh(READ, arr[i]); + + return 0; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. 
+ */ + +int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + unsigned long pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = 1 << inode->i_blkbits; + char *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + UnlockPage(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = page_address(new_page); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + kaddr = page_address(page); + if (zerofrom < offset) { + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + kunmap(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) { + ClearPageUptodate(page); + kunmap(page); + } + return err; +} + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + kunmap(page); + return 0; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + kunmap(page); + if (pos > inode->i_size) { + inode->i_size = pos; + mark_inode_dirty(inode); + } + return 0; +} + +int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? 
Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + __mark_buffer_dirty(bh); + err = 0; + +unlock: + UnlockPage(page); + page_cache_release(page); +out: + return err; +} + +int block_write_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int err; + + /* easy case */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block); + + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) { + UnlockPage(page); + return -EIO; + } + + /* Sigh... will have to work, then... */ + err = __block_prepare_write(inode, page, 0, offset, get_block); + if (!err) { + memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + __block_commit_write(inode,page,0,offset); +done: + kunmap(page); + UnlockPage(page); + return err; + } + ClearPageUptodate(page); + goto done; +} + +int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) +{ + int i, nr_blocks, retval; + unsigned long * blocks = iobuf->blocks; + + nr_blocks = iobuf->length / blocksize; + /* build the blocklist */ + for (i = 0; i < nr_blocks; i++, blocknr++) { + struct buffer_head bh; + + bh.b_state = 0; + bh.b_dev = inode->i_dev; + bh.b_size = blocksize; + + retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1); + if (retval) + goto out; + + if (rw == READ) { + if (buffer_new(&bh)) + BUG(); + if (!buffer_mapped(&bh)) { + /* there was an hole in the filesystem */ + blocks[i] = -1UL; + continue; + } + } else { + if (buffer_new(&bh)) + unmap_underlying_metadata(&bh); + if (!buffer_mapped(&bh)) + BUG(); + } + blocks[i] = bh.b_blocknr; + } + + retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize); + + out: + return retval; +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. 
+ */ + +static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) +{ + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + kiobuf = bh->b_private; + unlock_buffer(bh); + end_kio_request(kiobuf, uptodate); +} + +/* + * For brw_kiovec: submit a set of buffer_head temporary IOs and wait + * for them to complete. Clean up the buffer_heads afterwards. + */ + +static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) +{ + int iosize, err; + int i; + struct buffer_head *tmp; + + iosize = 0; + err = 0; + + for (i = nr; --i >= 0; ) { + iosize += size; + tmp = bh[i]; + if (buffer_locked(tmp)) { + wait_on_buffer(tmp); + } + + if (!buffer_uptodate(tmp)) { + /* We are traversing bh'es in reverse order so + clearing iosize on error calculates the + amount of IO before the first error. */ + iosize = 0; + err = -EIO; + } + } + + if (iosize) + return iosize; + return err; +} + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, unsigned long b[], int size) +{ + int err; + int length; + int transferred; + int i; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp, **bhs = NULL; + + if (!nr) + return 0; + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (size-1)) || + (iobuf->length & (size-1))) + return -EINVAL; + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + bufind = bhind = transferred = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + if (!bhs) + bhs = iobuf->bh; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + if (!map) { + err = -EFAULT; + goto finished; + } + + while (length > 0) { + blocknr = b[bufind++]; + if (blocknr == -1UL) { + if (rw == READ) { + /* there was an hole in the filesystem */ + memset(kmap(map) + offset, 0, size); + flush_dcache_page(map); + kunmap(map); + + transferred += size; + goto skip_block; + } else + BUG(); + } + tmp = bhs[bhind++]; + + tmp->b_size = size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf, iobuf); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } else + set_bit(BH_Uptodate, &tmp->b_state); + + atomic_inc(&iobuf->io_count); + submit_bh(rw, tmp); + /* + * Wait for IO if we have got too much + */ + if (bhind >= KIO_MAX_SECTORS) { + kiobuf_wait_for_io(iobuf); /* wake-one */ + err = wait_kio(rw, bhind, bhs, size); + if (err >= 0) + transferred += err; + else + goto finished; + bhind = 0; + } + + skip_block: + length -= size; + offset += size; + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* Is there any IO still left to submit? */ + if (bhind) { + kiobuf_wait_for_io(iobuf); /* wake-one */ + err = wait_kio(rw, bhind, bhs, size); + if (err >= 0) + transferred += err; + else + goto finished; + } + + finished: + if (transferred) + return transferred; + return err; +} + +/* + * Start I/O on a page. + * This function expects the page to be locked and may return + * before I/O is complete. You then have to check page->locked, + * page->uptodate, and maybe wait on page->wait. + * + * brw_page() is SMP-safe, although it's being called with the + * kernel lock held - but the code is ready. + * + * FIXME: we need a swapper_inode->get_block function to remove + * some of the bmap kludges and interface ugliness here. 
+ */ +int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) +{ + struct buffer_head *head, *bh; + + if (!PageLocked(page)) + panic("brw_page: page not locked for I/O"); + + if (!page->buffers) + create_empty_buffers(page, dev, size); + head = bh = page->buffers; + + /* Stage 1: lock all the buffers */ + do { + lock_buffer(bh); + bh->b_blocknr = *(b++); + set_bit(BH_Mapped, &bh->b_state); + set_buffer_async_io(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 2: start the IO */ + do { + struct buffer_head *next = bh->b_this_page; + submit_bh(rw, bh); + bh = next; + } while (bh != head); + return 0; +} + +int block_symlink(struct inode *inode, const char *symname, int len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + int err = -ENOMEM; + char *kaddr; + + if (!page) + goto fail; + err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); + if (err) + goto fail_map; + kaddr = page_address(page); + memcpy(kaddr, symname, len-1); + mapping->a_ops->commit_write(NULL, page, 0, len-1); + /* + * Notice that we are _not_ going to block here - end of page is + * unmapped, so this will only try to map the rest of page, see + * that it is unmapped (typically even will not look into inode - + * ->i_size will be enough for everything) and zero it out. + * OTOH it's obviously correct and should make the page up-to-date. + */ + err = mapping->a_ops->readpage(NULL, page); + wait_on_page(page); + page_cache_release(page); + if (err < 0) + goto fail; + mark_inode_dirty(inode); + return 0; +fail_map: + UnlockPage(page); + page_cache_release(page); +fail: + return err; +} + +static inline void link_dev_buffers(struct page * page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} + +/* + * Create the page-cache page that contains the requested block + */ +static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size) +{ + struct page * page; + struct buffer_head *bh; + + page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS); + if (IS_ERR(page)) + return NULL; + + if (!PageLocked(page)) + BUG(); + + bh = page->buffers; + if (bh) { + if (bh->b_size == size) + return page; + if (!try_to_free_buffers(page, GFP_NOFS)) + goto failed; + } + + bh = create_buffers(page, size, 0); + if (!bh) + goto failed; + link_dev_buffers(page, bh); + return page; + +failed: + UnlockPage(page); + page_cache_release(page); + return NULL; +} + +static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size) +{ + struct buffer_head *head = page->buffers; + struct buffer_head *bh = head; + unsigned int uptodate; + + uptodate = 1 << BH_Mapped; + if (Page_Uptodate(page)) + uptodate |= 1 << BH_Uptodate; + + write_lock(&hash_table_lock); + do { + if (!(bh->b_state & (1 << BH_Mapped))) { + init_buffer(bh, NULL, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; + bh->b_state = uptodate; + } + + /* Insert the buffer into the hash lists if necessary */ + if (!bh->b_pprev) + __insert_into_hash_list(bh); + + block++; + bh = bh->b_this_page; + } while (bh != head); + write_unlock(&hash_table_lock); +} + +/* + * Try to increase the number of buffers available: the size argument + * is used to determine what kind of buffers we want. 
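brw_page() and link_dev_buffers() above both depend on the buffers of a page forming a circular singly linked ring through b_this_page: link_dev_buffers() walks an open chain to its tail and points the tail back at the head, and the do/while loops then visit each buffer exactly once. A stripped-down user-space sketch of that ring (hypothetical struct buf, not the kernel's buffer_head):

  #include <stdio.h>

  struct buf {
          int blocknr;
          struct buf *next;               /* plays the role of b_this_page */
  };

  /* Close an open, NULL-terminated chain into a ring, as link_dev_buffers()
   * does for the buffers attached to a page. */
  static void close_ring(struct buf *head)
  {
          struct buf *tail = head;

          while (tail->next)
                  tail = tail->next;
          tail->next = head;
  }

  int main(void)
  {
          struct buf b[4];
          struct buf *cur;
          int i;

          for (i = 0; i < 4; i++) {
                  b[i].blocknr = 100 + i;
                  b[i].next = (i < 3) ? &b[i + 1] : NULL;
          }
          close_ring(&b[0]);

          cur = &b[0];                    /* walk the ring once, brw_page() style */
          do {
                  printf("block %d\n", cur->blocknr);
                  cur = cur->next;
          } while (cur != &b[0]);
          return 0;
  }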
+ */
+static int grow_buffers(kdev_t dev, unsigned long block, int size)
+{
+        struct page * page;
+        struct block_device *bdev;
+        unsigned long index;
+        int sizebits;
+
+        /* Size must be multiple of hard sectorsize */
+        if (size & (get_hardsect_size(dev)-1))
+                BUG();
+        /* Size must be within 512 bytes and PAGE_SIZE */
+        if (size < 512 || size > PAGE_SIZE)
+                BUG();
+
+        sizebits = -1;
+        do {
+                sizebits++;
+        } while ((size << sizebits) < PAGE_SIZE);
+
+        index = block >> sizebits;
+        block = index << sizebits;
+
+        bdev = bdget(kdev_t_to_nr(dev));
+        if (!bdev) {
+                printk("No block device for %s\n", kdevname(dev));
+                BUG();
+        }
+
+        /* Create a page with the proper size buffers.. */
+        page = grow_dev_page(bdev, index, size);
+
+        /* This is "wrong" - talk to Al Viro */
+        atomic_dec(&bdev->bd_count);
+        if (!page)
+                return 0;
+
+        /* Hash in the buffers on the hash list */
+        hash_page_buffers(page, dev, block, size);
+        UnlockPage(page);
+        page_cache_release(page);
+
+        /* We hashed up this page, so increment buffermem */
+        atomic_inc(&buffermem_pages);
+        return 1;
+}
+
+static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask)
+{
+        struct buffer_head * bh = head;
+        int tryagain = 0;
+
+        do {
+                if (!buffer_dirty(bh) && !buffer_locked(bh))
+                        continue;
+
+                /* Don't start IO first time around.. */
+                if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
+                        continue;
+
+                /* Second time through we start actively writing out.. */
+                if (test_and_set_bit(BH_Lock, &bh->b_state)) {
+                        if (!test_bit(BH_launder, &bh->b_state))
+                                continue;
+                        wait_on_buffer(bh);
+                        tryagain = 1;
+                        continue;
+                }
+
+                if (!atomic_set_buffer_clean(bh)) {
+                        unlock_buffer(bh);
+                        continue;
+                }
+
+                __mark_buffer_clean(bh);
+                get_bh(bh);
+                set_bit(BH_launder, &bh->b_state);
+                bh->b_end_io = end_buffer_io_sync;
+                submit_bh(WRITE, bh);
+                tryagain = 0;
+        } while ((bh = bh->b_this_page) != head);
+
+        return tryagain;
+}
+
+/*
+ * Can the buffer be thrown out?
+ */
+#define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+/*
+ * try_to_free_buffers() checks if all the buffers on this particular page
+ * are unused, and free's the page if so.
+ *
+ * Wake up bdflush() if this fails - if we're running low on memory due
+ * to dirty buffers, we need to flush them out as quickly as possible.
+ *
+ * NOTE: There are quite a number of ways that threads of control can
+ * obtain a reference to a buffer head within a page. So we must
+ * lock out all of these paths to cleanly toss the page.
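grow_buffers() above turns a block number into a page cache index with pure shift arithmetic: sizebits ends up as log2(PAGE_SIZE / size), index = block >> sizebits selects the page, and block = index << sizebits rounds the block number down to the first block backed by that page. A quick user-space replay of the calculation (4096-byte pages assumed):

  #include <stdio.h>

  #define PAGE_SIZE 4096UL

  int main(void)
  {
          unsigned long block = 1027;     /* arbitrary example block     */
          unsigned long index;
          int size = 1024;                /* 1K buffers, four per page   */
          int sizebits = -1;

          do {
                  sizebits++;
          } while ((size << sizebits) < PAGE_SIZE);

          index = block >> sizebits;      /* page that holds the block   */
          block = index << sizebits;      /* first block of that page    */

          /* prints: sizebits=2 index=256 first_block=1024 */
          printf("sizebits=%d index=%lu first_block=%lu\n",
                 sizebits, index, block);
          return 0;
  }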
+ */ +int try_to_free_buffers(struct page * page, unsigned int gfp_mask) +{ + struct buffer_head * tmp, * bh = page->buffers; + +cleaned_buffers_try_again: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + tmp = bh; + do { + if (buffer_busy(tmp)) + goto busy_buffer_page; + tmp = tmp->b_this_page; + } while (tmp != bh); + + spin_lock(&unused_list_lock); + tmp = bh; + + /* if this buffer was hashed, this page counts as buffermem */ + if (bh->b_pprev) + atomic_dec(&buffermem_pages); + do { + struct buffer_head * p = tmp; + tmp = tmp->b_this_page; + + if (p->b_dev == B_FREE) BUG(); + + remove_inode_queue(p); + __remove_from_queues(p); + __put_unused_buffer_head(p); + } while (tmp != bh); + spin_unlock(&unused_list_lock); + + /* Wake up anyone waiting for buffer heads */ + wake_up(&buffer_wait); + + /* And free the page */ + page->buffers = NULL; + page_cache_release(page); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return 1; + +busy_buffer_page: + /* Uhhuh, start writeback so that we don't end up with all dirty pages */ + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + if (gfp_mask & __GFP_IO) { + if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { + if (sync_page_buffers(bh, gfp_mask)) { + /* no IO or waiting next time */ + gfp_mask = 0; + goto cleaned_buffers_try_again; + } + } + } + if (balance_dirty_state() >= 0) + wakeup_bdflush(); + return 0; +} +EXPORT_SYMBOL(try_to_free_buffers); + +/* ================== Debugging =================== */ + +void show_buffers(void) +{ +#ifdef CONFIG_SMP + struct buffer_head * bh; + int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; + int nlist; + static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", }; +#endif + + printk("Buffer memory: %6dkB\n", + atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); + +#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */ + if (!spin_trylock(&lru_list_lock)) + return; + for(nlist = 0; nlist < NR_LIST; nlist++) { + found = locked = dirty = used = lastused = 0; + bh = lru_list[nlist]; + if(!bh) continue; + + do { + found++; + if (buffer_locked(bh)) + locked++; + if (buffer_dirty(bh)) + dirty++; + if (atomic_read(&bh->b_count)) + used++, lastused = found; + bh = bh->b_next_free; + } while (bh != lru_list[nlist]); + { + int tmp = nr_buffers_type[nlist]; + if (found != tmp) + printk("%9s: BUG -> found %d, reported %d\n", + buf_types[nlist], found, tmp); + } + printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " + "%d locked, %d dirty\n", + buf_types[nlist], found, size_buffers_type[nlist]>>10, + used, lastused, locked, dirty); + } + spin_unlock(&lru_list_lock); +#endif +} + +/* ===================== Init ======================= */ + +/* + * allocate the hash table and init the free list + * Use gfp() for the hash table to decrease TLB misses, use + * SLAB cache for buffer heads. + */ +void __init buffer_init(unsigned long mempages) +{ + int order, i; + unsigned int nr_hash; + + /* The buffer cache hash table is less important these days, + * trim it a bit. 
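Stepping back to try_to_free_buffers() above: the busy_buffer_page path retries at most once. After sync_page_buffers() has queued writeback, the code jumps back to cleaned_buffers_try_again with gfp_mask forced to 0, so the second attempt can neither start more I/O nor block. A minimal model of that control flow (the predicates here are stand-ins, not kernel calls):

  #include <stdio.h>

  #define GFP_IO 0x01                     /* stand-in for __GFP_IO */

  static int all_buffers_free(int pass)
  {
          return pass > 0;                /* pretend writeback freed them */
  }

  static int try_to_free(unsigned int gfp_mask)
  {
          int pass = 0;

  try_again:
          if (all_buffers_free(pass))
                  return 1;               /* page can be released              */
          if (gfp_mask & GFP_IO) {
                  pass++;                 /* start writeback once...           */
                  gfp_mask = 0;           /* ...then retry without I/O or waits */
                  goto try_again;
          }
          return 0;                       /* still busy, give up               */
  }

  int main(void)
  {
          printf("%d\n", try_to_free(GFP_IO));    /* 1, freed after one retry */
          printf("%d\n", try_to_free(0));         /* 0, not allowed to do I/O */
          return 0;
  }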
+ */ + mempages >>= 14; + + mempages *= sizeof(struct buffer_head *); + + for (order = 0; (1 << order) < mempages; order++) + ; + + /* try to allocate something until we get it or we're asking + for something that is really too small */ + + do { + unsigned long tmp; + + nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *); + bh_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + bh_hash_shift = 0; + while((tmp >>= 1UL) != 0UL) + bh_hash_shift++; + + hash_table = (struct buffer_head **) + __get_free_pages(GFP_ATOMIC, order); + } while (hash_table == NULL && --order > 0); + printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!hash_table) + panic("Failed to allocate buffer hash table\n"); + + /* Setup hash chains. */ + for(i = 0; i < nr_hash; i++) + hash_table[i] = NULL; + + /* Setup lru lists. */ + for(i = 0; i < NR_LIST; i++) + lru_list[i] = NULL; + +} + + +/* ====================== bdflush support =================== */ + +/* This is a simple kernel daemon, whose job it is to provide a dynamic + * response to dirty buffers. Once this process is activated, we write back + * a limited number of buffers to the disks and then go back to sleep again. + */ + +DECLARE_WAIT_QUEUE_HEAD(bdflush_wait); + +void wakeup_bdflush(void) +{ + wake_up_interruptible(&bdflush_wait); +} + +/* + * Here we attempt to write back old buffers. We also try to flush inodes + * and supers as well, since this function is essentially "update", and + * otherwise there would be no way of ensuring that these quantities ever + * get written back. Ideally, we would have a timestamp on the inodes + * and superblocks so that we could write back only the old ones as well + */ + +static int sync_old_buffers(void) +{ + lock_kernel(); + sync_unlocked_inodes(); + sync_supers(0); + unlock_kernel(); + + for (;;) { + struct buffer_head *bh; + + spin_lock(&lru_list_lock); + bh = lru_list[BUF_DIRTY]; + if (!bh || time_before(jiffies, bh->b_flushtime)) + break; + if (write_some_buffers(NODEV)) + continue; + return 0; + } + spin_unlock(&lru_list_lock); + return 0; +} + +int block_sync_page(struct page *page) +{ + run_task_queue(&tq_disk); + return 0; +} + +/* This is the interface to bdflush. As we get more sophisticated, we can + * pass tuning parameters to this "process", to adjust how it behaves. + * We would want to verify each parameter, however, to make sure that it + * is reasonable. */ + +asmlinkage long sys_bdflush(int func, long data) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (func == 1) { + /* do_exit directly and let kupdate to do its work alone. */ + do_exit(0); +#if 0 /* left here as it's the only example of lazy-mm-stuff used from + a syscall that doesn't care about the current mm context. */ + int error; + struct mm_struct *user_mm; + + /* + * bdflush will spend all of it's time in kernel-space, + * without touching user-space, so we can switch it into + * 'lazy TLB mode' to reduce the cost of context-switches + * to and from bdflush. 
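The sizing arithmetic in buffer_init() above works out to roughly one hash bucket per four pages of RAM on a 32-bit machine with 4 KB pages, power-of-two rounding aside: mempages >> 14 scaled by the pointer size is treated as the page order to allocate, and nr_hash is however many bucket pointers fit in those pages. A user-space replay of the calculation for a 128 MB machine (4 KB pages and 4-byte pointers assumed):

  #include <stdio.h>

  #define PAGE_SIZE 4096UL
  #define PTR_SIZE  4UL           /* sizeof(struct buffer_head *) on 32-bit */

  int main(void)
  {
          unsigned long mempages = 32768; /* 128 MB of 4 KB pages */
          unsigned long nr_hash, tmp;
          int order, shift = 0;

          mempages >>= 14;
          mempages *= PTR_SIZE;
          for (order = 0; (1UL << order) < mempages; order++)
                  ;

          nr_hash = (PAGE_SIZE << order) / PTR_SIZE;
          tmp = nr_hash;
          while ((tmp >>= 1) != 0)
                  shift++;

          /* prints: order=3 nr_hash=8192 mask=0x1fff shift=13 */
          printf("order=%d nr_hash=%lu mask=%#lx shift=%d\n",
                 order, nr_hash, nr_hash - 1, shift);
          return 0;
  }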
+ */ + user_mm = start_lazy_tlb(); + error = sync_old_buffers(); + end_lazy_tlb(user_mm); + return error; +#endif + } + + /* Basically func 1 means read param 1, 2 means write param 1, etc */ + if (func >= 2) { + int i = (func-2) >> 1; + if (i >= 0 && i < N_PARAM) { + if ((func & 1) == 0) + return put_user(bdf_prm.data[i], (int*)data); + + if (data >= bdflush_min[i] && data <= bdflush_max[i]) { + bdf_prm.data[i] = data; + return 0; + } + } + return -EINVAL; + } + + /* Having func 0 used to launch the actual bdflush and then never + * return (unless explicitly killed). We return zero here to + * remain semi-compatible with present update(8) programs. + */ + return 0; +} + +/* + * This is the actual bdflush daemon itself. It used to be started from + * the syscall above, but now we launch it ourselves internally with + * kernel_thread(...) directly after the first thread in init/main.c + */ +int bdflush(void *startup) +{ + struct task_struct *tsk = current; + + /* + * We have a bare-bones task_struct, and really should fill + * in a few more things so "top" and /proc/2/{exe,root,cwd} + * display semi-sane things. Not real crucial though... + */ + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "bdflush"); + + /* avoid getting signals */ + spin_lock_irq(&tsk->sigmask_lock); + flush_signals(tsk); + sigfillset(&tsk->blocked); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + complete((struct completion *)startup); + + for (;;) { + CHECK_EMERGENCY_SYNC + + spin_lock(&lru_list_lock); + if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) { + wait_for_some_buffers(NODEV); + interruptible_sleep_on(&bdflush_wait); + } + } +} + +/* + * This is the kernel update daemon. It was used to live in userspace + * but since it's need to run safely we want it unkillable by mistake. + * You don't need to change your userspace configuration since + * the userspace `update` will do_exit(0) at the first sys_bdflush(). 
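The func argument of sys_bdflush() above packs both a parameter index and a direction: for func >= 2, i = (func - 2) >> 1 selects the tuning parameter and the low bit picks the operation, even values reading the parameter back with put_user() and odd values writing it after the bdflush_min/bdflush_max range check. A small user-space decoder of the first few values:

  #include <stdio.h>

  int main(void)
  {
          int func;

          /* func 2,3 -> param 0; 4,5 -> param 1; 6,7 -> param 2; ... */
          for (func = 2; func < 8; func++) {
                  int i = (func - 2) >> 1;
                  const char *op = (func & 1) ? "write" : "read";

                  printf("func %2d -> %s bdf_prm.data[%d]\n", func, op, i);
          }
          return 0;
  }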
+ */ +int kupdate(void *startup) +{ + struct task_struct * tsk = current; + int interval; + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kupdated"); + + /* sigstop and sigcont will stop and wakeup kupdate */ + spin_lock_irq(&tsk->sigmask_lock); + sigfillset(&tsk->blocked); + siginitsetinv(¤t->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP)); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + complete((struct completion *)startup); + + for (;;) { + wait_for_some_buffers(NODEV); + + /* update interval */ + interval = bdf_prm.b_un.interval; + if (interval) { + tsk->state = TASK_INTERRUPTIBLE; + schedule_timeout(interval); + } else { + stop_kupdate: + tsk->state = TASK_STOPPED; + schedule(); /* wait for SIGCONT */ + } + /* check for sigstop */ + if (signal_pending(tsk)) { + int stopped = 0; + spin_lock_irq(&tsk->sigmask_lock); + if (sigismember(&tsk->pending.signal, SIGSTOP)) { + sigdelset(&tsk->pending.signal, SIGSTOP); + stopped = 1; + } + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + if (stopped) + goto stop_kupdate; + } +#ifdef DEBUG + printk(KERN_DEBUG "kupdate() activated...\n"); +#endif + sync_old_buffers(); + } +} + +static int __init bdflush_init(void) +{ + static struct completion startup __initdata = COMPLETION_INITIALIZER(startup); + + kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + wait_for_completion(&startup); + kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + wait_for_completion(&startup); + return 0; +} + +module_init(bdflush_init) + diff -urN linux-2.4.17-rc1-virgin/fs/dcache.c linux-2.4.17-rc1-wli3/fs/dcache.c --- linux-2.4.17-rc1-virgin/fs/dcache.c Fri Dec 14 06:04:11 2001 +++ linux-2.4.17-rc1-wli3/fs/dcache.c Sun Dec 16 23:49:37 2001 @@ -320,11 +320,24 @@ void prune_dcache(int count) { + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); + +redo: for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_LOCK_COUNT(100)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&dcache_lock); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -480,6 +493,8 @@ struct list_head *next; int found = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); repeat: next = this_parent->d_subdirs.next; @@ -493,6 +508,12 @@ list_add(&dentry->d_lru, dentry_unused.prev); found++; } + if (TEST_LOCK_COUNT(500) && found > 10) { + debug_lock_break(1); + if (conditional_schedule_needed()) + goto out; + RESET_LOCK_COUNT(); + } /* * Descend a level if the d_subdirs list is non-empty. */ @@ -517,6 +538,7 @@ #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -546,6 +568,11 @@ * 0 - very urgent: shrink everything * ... * 6 - base-level: try to shrink a bit. + * + * Chuck Lever's dcache hash function relies on the aggressive + * shrinking where dentry_stat.nr_used is divided by priority. + * I added in a check for a priority of 0 to avoid division by 0. + * -- wli */ int shrink_dcache_memory(int priority, unsigned int gfp_mask) { @@ -565,6 +592,9 @@ if (!(gfp_mask & __GFP_FS)) return 0; + if(!priority) + BUG(); + count = dentry_stat.nr_unused / priority; prune_dcache(count); @@ -683,10 +713,45 @@ return res; } +/* + * The mult + shift 11 hash function from Chuck Lever's paper + * This apparently requires help from shrink_dcache_memory() + * and so that is added. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 8 describes the hash function. 
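The prune_dcache() and select_parent() hunks above, like the debug_lock_break()/conditional_schedule() calls sprinkled through the ext3, jbd and reiserfs hunks further down, all follow one shape: do a bounded amount of work under a lock, then briefly drop the lock so a pending reschedule can happen, and carry on. DEFINE_LOCK_COUNT(), TEST_LOCK_COUNT() and break_spin_lock() are macros introduced by the lock-break patch itself; the sketch below only models their assumed behaviour with pthreads and is not the kernel implementation:

  #include <pthread.h>
  #include <sched.h>
  #include <stdio.h>

  static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Drop the lock briefly so waiters (or the scheduler) can get in, then
   * take it back: the moral equivalent of break_spin_lock(). */
  static void break_lock(pthread_mutex_t *lock)
  {
          pthread_mutex_unlock(lock);
          sched_yield();
          pthread_mutex_lock(lock);
  }

  static void prune(int nr_items)
  {
          int done = 0, since_break = 0;

          pthread_mutex_lock(&list_lock);
          while (done < nr_items) {
                  /* ... prune one item off the list here ... */
                  done++;
                  if (++since_break >= 100) {     /* TEST_LOCK_COUNT(100) */
                          since_break = 0;        /* RESET_LOCK_COUNT()   */
                          break_lock(&list_lock);
                  }
          }
          pthread_mutex_unlock(&list_lock);
          printf("pruned %d items\n", done);
  }

  int main(void)
  {
          prune(1234);
          return 0;
  }

The kernel versions additionally restart their list walk (the goto redo and goto free_unused labels) after retaking the lock, since the list may have changed while it was dropped.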
+ */ static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash) { - hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash ^ (hash >> D_HASHBITS); + hash += (unsigned long) parent; + + /* + * The integer multiply Lever hash function appears to be too + * expensive even with hardware multiply support. Here we + * enter the realm of voodoo. + * + * The multiplicative hash function was this: + * hash *= 2654435761UL; + * hash >>= 11; + * The hard constant 11 is disturbing, and perhaps + * has some bearing on why this did not work well. + * + * The hash function used here is the Mersenne prime + * multiplicative hash function described in Lever's + * paper, which uses a shift/add implementation afforded + * by bit pattern properties of Mersenne primes. + * -- wli + * + * Added in more special sauce to use the upper D_HASHBITS + * of the computed hash key (which is voodoo). + * -- wli + * + * Reverted to the Lever hash function. + * -- wli + */ + + /* hash = (hash << 7) - hash + (hash >> 10) + (hash >> 18); */ + hash *= 2654435761UL; + hash >>= BITS_PER_LONG - D_HASHBITS; return dentry_hashtable + (hash & D_HASHMASK); } diff -urN linux-2.4.17-rc1-virgin/fs/exec.c linux-2.4.17-rc1-wli3/fs/exec.c --- linux-2.4.17-rc1-virgin/fs/exec.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/exec.c Fri Dec 14 02:44:44 2001 @@ -35,6 +35,7 @@ #include #include #include +#include #define __NO_VERSION__ #include @@ -279,6 +280,7 @@ flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + page_add_rmap(page, pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); @@ -420,8 +422,8 @@ active_mm = current->active_mm; current->mm = mm; current->active_mm = mm; - task_unlock(current); activate_mm(active_mm, mm); + task_unlock(current); mm_release(); if (old_mm) { if (active_mm != old_mm) BUG(); diff -urN linux-2.4.17-rc1-virgin/fs/ext3/inode.c linux-2.4.17-rc1-wli3/fs/ext3/inode.c --- linux-2.4.17-rc1-virgin/fs/ext3/inode.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/ext3/inode.c Sun Dec 16 17:58:10 2001 @@ -1654,6 +1654,8 @@ } for (p = first; p < last; p++) { + debug_lock_break(1); /* bkl is held */ + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ @@ -1718,6 +1720,8 @@ /* Go read the buffer for the next level down */ bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + debug_lock_break(1); + conditional_schedule(); /* * A read failure? 
Report error and clear slot diff -urN linux-2.4.17-rc1-virgin/fs/ext3/namei.c linux-2.4.17-rc1-wli3/fs/ext3/namei.c --- linux-2.4.17-rc1-virgin/fs/ext3/namei.c Fri Nov 9 14:25:04 2001 +++ linux-2.4.17-rc1-wli3/fs/ext3/namei.c Sun Dec 16 17:58:10 2001 @@ -157,6 +157,8 @@ if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); diff -urN linux-2.4.17-rc1-virgin/fs/fat/cache.c linux-2.4.17-rc1-wli3/fs/fat/cache.c --- linux-2.4.17-rc1-virgin/fs/fat/cache.c Fri Oct 12 13:48:42 2001 +++ linux-2.4.17-rc1-wli3/fs/fat/cache.c Fri Dec 14 02:44:44 2001 @@ -14,6 +14,7 @@ #include #include #include +#include #if 0 # define PRINTK(x) printk x diff -urN linux-2.4.17-rc1-virgin/fs/inode.c linux-2.4.17-rc1-wli3/fs/inode.c --- linux-2.4.17-rc1-virgin/fs/inode.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/inode.c Sun Dec 16 23:57:18 2001 @@ -567,6 +567,12 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + debug_lock_break(2); /* bkl is also held */ + atomic_inc(&inode->i_count); + break_spin_lock_and_resched(&inode_lock); + atomic_dec(&inode->i_count); + if (inode->i_sb != sb) continue; invalidate_inode_buffers(inode); @@ -668,8 +674,11 @@ int count; struct inode * inode; + DEFINE_LOCK_COUNT(); + spin_lock(&inode_lock); +free_unused: count = 0; entry = inode_unused.prev; while (entry != &inode_unused) @@ -692,6 +701,14 @@ count++; if (!--goal) break; + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&inode_lock); + goto free_unused; + } + } } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); @@ -899,14 +916,23 @@ return inode; } +/* + * The properties have changed from Lever's paper. This is + * the multiplicative page cache hash function from Chuck Lever's paper, + * adapted to the inode hash table. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * iput() appears to be showing up in profiles, So I put what appears to + * be a theoretically sounder hash function here. + * -- wli + */ static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) { - unsigned long tmp = i_ino + ((unsigned long) sb / L1_CACHE_BYTES); - tmp = tmp + (tmp >> I_HASHBITS); - return tmp & I_HASHMASK; -} + unsigned long hashval = i_ino + (unsigned long) sb; -/* Yeah, I know about quadratic hash. Maybe, later. 
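Both the d_hash() change earlier in the patch and the inode hash() that follows rely on the same trick: multiply the key by a constant close to 2^32 divided by the golden ratio (Knuth's multiplicative hashing constant) and keep the top table-index bits, which are the best-mixed ones, rather than the bottom bits. A user-space comparison of that against the Mersenne-prime shift/add variant the dcache comment mentions (14 table bits picked arbitrarily for the demo):

  #include <stdint.h>
  #include <stdio.h>

  #define HASH_BITS 14                            /* arbitrary demo table size */
  #define HASH_MASK ((1u << HASH_BITS) - 1)

  /* Lever/Knuth multiplicative hash: keep the top HASH_BITS bits. */
  static unsigned int mult_hash(uint32_t key)
  {
          return (uint32_t)(key * 2654435761u) >> (32 - HASH_BITS);
  }

  /* The Mersenne-prime shift/add variant from the dcache comment:
   * (key << 7) - key is key * (2^7 - 1), plus two fold-down terms. */
  static unsigned int mersenne_hash(uint32_t key)
  {
          uint32_t h = (key << 7) - key + (key >> 10) + (key >> 18);

          return h & HASH_MASK;
  }

  int main(void)
  {
          uint32_t keys[] = { 0x1000, 0x1008, 0x1010, 0x1018 };
          int i;

          /* nearby keys, differing only in a few low bits, still spread out */
          for (i = 0; i < 4; i++)
                  printf("key %#x  mult %5u  mersenne %5u\n",
                         keys[i], mult_hash(keys[i]), mersenne_hash(keys[i]));
          return 0;
  }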
*/ + hashval = (hashval * 2654435761UL) >> (BITS_PER_LONG - I_HASHBITS); + + return hashval & I_HASHMASK; +} /** * iunique - get a unique inode number diff -urN linux-2.4.17-rc1-virgin/fs/jbd/commit.c linux-2.4.17-rc1-wli3/fs/jbd/commit.c --- linux-2.4.17-rc1-virgin/fs/jbd/commit.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/jbd/commit.c Sun Dec 16 17:58:10 2001 @@ -212,6 +212,9 @@ __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + debug_lock_break(2); + if (conditional_schedule_needed()) + break; } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -235,8 +238,7 @@ journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -272,6 +274,14 @@ */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + if (conditional_schedule_needed()) { + debug_lock_break(551); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; + } if (buffer_locked(bh)) { spin_unlock(&journal_datalist_lock); unlock_journal(journal); diff -urN linux-2.4.17-rc1-virgin/fs/proc/array.c linux-2.4.17-rc1-wli3/fs/proc/array.c --- linux-2.4.17-rc1-virgin/fs/proc/array.c Thu Oct 11 09:00:01 2001 +++ linux-2.4.17-rc1-wli3/fs/proc/array.c Fri Dec 14 06:05:17 2001 @@ -392,82 +392,11 @@ mmput(mm); return res; } - -static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t page = *pte; - struct page *ptpage; - - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - ++*total; - if (!pte_present(page)) - continue; - ptpage = pte_page(page); - if ((!VALID_PAGE(ptpage)) || PageReserved(ptpage)) - continue; - ++*pages; - if (pte_dirty(page)) - ++*dirty; - if (page_count(pte_page(page)) > 1) - ++*shared; - } while (address < end); -} - -static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*pgd)) - return; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - statm_pte_range(pmd, address, end - address, pages, shared, dirty, total); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end, - int * pages, int * shared, int * dirty, int * total) -{ - while (address < end) { - statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgd++; - } -} +/* + * This thing is slow so I've ripped out the page table scanning. + * The VMA scanning is slow enough. 
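The rewritten proc_pid_statm() that follows derives every figure from the VMA list alone: resident is simply mm->rss, size is mm->total_vm, and the text/library/data split is decided purely by each VMA's flags and end address rather than by walking page tables. A user-space rendition of that classification over a made-up VMA list (the VM_* values and TASK_UNMAPPED_BASE below are illustrative stand-ins, not the real architecture constants):

  #include <stdio.h>

  #define PAGE_SHIFT              12
  #define VM_SHARED               0x01            /* illustrative flag bits */
  #define VM_EXECUTABLE           0x02
  #define TASK_UNMAPPED_BASE      0x40000000UL    /* typical i386 value, assumed */

  struct vma {
          unsigned long start, end, flags;
  };

  int main(void)
  {
          struct vma vmas[] = {
                  { 0x08048000UL, 0x08060000UL, VM_EXECUTABLE },  /* program text */
                  { 0x40000000UL, 0x40100000UL, VM_EXECUTABLE },  /* shared lib   */
                  { 0x40100000UL, 0x40200000UL, VM_SHARED },      /* shared map   */
                  { 0xbfffd000UL, 0xc0000000UL, 0 },              /* stack        */
          };
          int i, trs = 0, lrs = 0, drs = 0, share = 0;

          for (i = 0; i < 4; i++) {
                  int pages = (vmas[i].end - vmas[i].start) >> PAGE_SHIFT;

                  if (vmas[i].flags & VM_SHARED)
                          share += pages;
                  if (vmas[i].flags & VM_EXECUTABLE) {
                          if (vmas[i].end > TASK_UNMAPPED_BASE)
                                  lrs += pages;   /* library */
                          else
                                  trs += pages;   /* text    */
                  } else
                          drs += pages;           /* stack and data */
          }
          printf("text=%d lib=%d data=%d shared=%d pages\n", trs, lrs, drs, share);
          return 0;
  }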
+ */ int proc_pid_statm(struct task_struct *task, char * buffer) { struct mm_struct *mm; @@ -482,23 +411,24 @@ struct vm_area_struct * vma; down_read(&mm->mmap_sem); vma = mm->mmap; + resident = mm->rss; + size = mm->total_vm; while (vma) { - pgd_t *pgd = pgd_offset(mm, vma->vm_start); - int pages = 0, shared = 0, dirty = 0, total = 0; + int pages, total; + + total = vma->vm_end - vma->vm_start; + pages = total >> PAGE_SHIFT; + + if (vma->vm_flags & VM_SHARED) + share += pages; - statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total); - resident += pages; - share += shared; - dt += dirty; - size += total; - if (vma->vm_flags & VM_EXECUTABLE) - trs += pages; /* text */ - else if (vma->vm_flags & VM_GROWSDOWN) - drs += pages; /* stack */ - else if (vma->vm_end > 0x60000000) - lrs += pages; /* library */ - else - drs += pages; + if (vma->vm_flags & VM_EXECUTABLE) { + if(vma->vm_end > TASK_UNMAPPED_BASE) + lrs += pages; /* library */ + else + trs += pages; /* text */ + } else + drs += pages; /* stack and data */ vma = vma->vm_next; } up_read(&mm->mmap_sem); diff -urN linux-2.4.17-rc1-virgin/fs/proc/proc_misc.c linux-2.4.17-rc1-wli3/fs/proc/proc_misc.c --- linux-2.4.17-rc1-virgin/fs/proc/proc_misc.c Tue Nov 20 21:29:09 2001 +++ linux-2.4.17-rc1-wli3/fs/proc/proc_misc.c Fri Dec 14 02:44:20 2001 @@ -164,7 +164,8 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inactive: %8u kB\n" + "Inact_dirty: %8u kB\n" + "Inact_clean: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -178,7 +179,8 @@ K(pg_size - swapper_space.nrpages), K(swapper_space.nrpages), K(nr_active_pages), - K(nr_inactive_pages), + K(nr_inactive_dirty_pages), + K(nr_inactive_clean_pages), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/bitmap.c linux-2.4.17-rc1-wli3/fs/reiserfs/bitmap.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/bitmap.c Fri Dec 14 06:04:14 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/bitmap.c Sun Dec 16 17:58:10 2001 @@ -410,19 +410,23 @@ amount_needed++ ; continue ; } - - reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; + RFALSE( is_reusable (s, search_start, 0) == 0, + "vs-4140: bad block number found"); - RFALSE( buffer_locked (SB_AP_BITMAP (s)[i]) || - is_reusable (s, search_start, 0) == 0, - "vs-4140: bitmap block is locked or bad block number found"); + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + /* if this block has been allocated while we slept, it is + ** impossible to find any more contiguous blocks for ourselves. + ** If we are doing preallocation, give up now and return. 
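Back in the fs/inode.c hunk above, invalidate_list() pins each inode with atomic_inc(&inode->i_count) before break_spin_lock_and_resched() drops inode_lock, and only drops the reference after the lock is retaken; the extra reference is what keeps the inode from disappearing while the lock is down. A pthreads model of that pin-then-drop-the-lock idiom (a plain counter and a mutex standing in for the kernel primitives):

  #include <pthread.h>
  #include <stdio.h>

  struct object {
          int refcount;
          int data;
  };

  static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

  /* While our extra reference is held, nobody else may free the object,
   * so dropping the list lock for a reschedule is safe. */
  static void visit(struct object *obj)
  {
          pthread_mutex_lock(&list_lock);

          obj->refcount++;                        /* atomic_inc(&inode->i_count)   */
          pthread_mutex_unlock(&list_lock);       /* break_spin_lock_and_resched() */
          /* ... a preemption or other work happens here ... */
          pthread_mutex_lock(&list_lock);
          obj->refcount--;                        /* atomic_dec(&inode->i_count)   */

          printf("data=%d refcount=%d\n", obj->data, obj->refcount);
          pthread_mutex_unlock(&list_lock);
  }

  int main(void)
  {
          struct object obj = { 1, 42 };

          visit(&obj);
          return 0;
  }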
+ */ + if (for_prealloc) + goto free_and_return; amount_needed++ ; continue ; } diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/buffer2.c linux-2.4.17-rc1-wli3/fs/reiserfs/buffer2.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/buffer2.c Fri Dec 14 06:04:14 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/buffer2.c Sun Dec 16 17:58:10 2001 @@ -55,6 +55,8 @@ PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); result = bread (super -> s_dev, n_block, n_size); + debug_lock_break(1); + conditional_schedule(); PROC_INFO_INC( super, breads ); PROC_EXP( if( kstat.context_swtch != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/journal.c linux-2.4.17-rc1-wli3/fs/reiserfs/journal.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/journal.c Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/journal.c Sun Dec 16 17:58:10 2001 @@ -574,6 +574,8 @@ /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + debug_lock_break(1); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -704,6 +706,8 @@ mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + debug_lock_break(1); + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -833,6 +837,8 @@ set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2092,6 +2098,8 @@ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2232,6 +2240,8 @@ } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2683,6 +2693,8 @@ RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + debug_lock_break(1); + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -2856,6 +2868,8 @@ /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + debug_lock_break(1); + conditional_schedule(); /* getblk can sleep, so... 
*/ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/stree.c linux-2.4.17-rc1-wli3/fs/reiserfs/stree.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/stree.c Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/stree.c Sun Dec 16 17:58:10 2001 @@ -648,9 +648,8 @@ stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -662,7 +661,10 @@ #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + + debug_lock_break(1); + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -674,6 +676,8 @@ /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1099,6 +1103,9 @@ for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + debug_lock_break(1); + conditional_schedule(); if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; diff -urN linux-2.4.17-rc1-virgin/include/asm-alpha/bootmem.h linux-2.4.17-rc1-wli3/include/asm-alpha/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-alpha/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-alpha/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,12 @@ +/* + * include/asm-alpha/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Alpha has some NUMA systems, but it's uncertain to me what + * an appropriate value of NR_SEGMENTS should be. + * + * For the moment, the generic single-page definition is here, + * but those who run on Alpha may need to increase the value + * at least until the page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/bootmem.h linux-2.4.17-rc1-wli3/include/asm-arm/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-arm/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-arm/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,9 @@ +/* + * include/asm-arm/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * ARM appeared to have little trouble with a single-page-sized + * segment pool, so the generic NR_SEGMENTS is okay for now. + * This will go away once page stealing is in place. 
+ */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/dma.h linux-2.4.17-rc1-wli3/include/asm-arm/dma.h --- linux-2.4.17-rc1-virgin/include/asm-arm/dma.h Sun Aug 12 11:14:00 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/dma.h Fri Dec 14 02:44:44 2001 @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/hardirq.h linux-2.4.17-rc1-wli3/include/asm-arm/hardirq.h --- linux-2.4.17-rc1-virgin/include/asm-arm/hardirq.h Thu Oct 11 09:04:57 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/hardirq.h Fri Dec 14 02:44:44 2001 @@ -34,6 +34,7 @@ #define irq_exit(cpu,irq) (local_irq_count(cpu)--) #define synchronize_irq() do { } while (0) +#define release_irqlock(cpu) do { } while (0) #else #error SMP not supported diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/mmu_context.h linux-2.4.17-rc1-wli3/include/asm-arm/mmu_context.h --- linux-2.4.17-rc1-virgin/include/asm-arm/mmu_context.h Mon Sep 18 15:15:24 2000 +++ linux-2.4.17-rc1-wli3/include/asm-arm/mmu_context.h Fri Dec 14 02:44:44 2001 @@ -42,6 +42,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disable() == 0) + BUG(); +#endif if (prev != next) { cpu_switch_mm(next->pgd, tsk); clear_bit(cpu, &prev->cpu_vm_mask); diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/pgalloc.h linux-2.4.17-rc1-wli3/include/asm-arm/pgalloc.h --- linux-2.4.17-rc1-virgin/include/asm-arm/pgalloc.h Sun Aug 12 11:14:00 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/pgalloc.h Fri Dec 14 02:44:44 2001 @@ -57,40 +57,48 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)__pgd_next(ret); ret[1] = ret[2]; clean_dcache_entry(ret + 1); pgtable_cache_size--; } + preempt_enable(); return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); __pgd_next(pgd) = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) { unsigned long *ret; + preempt_disable(); if((ret = pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)__pte_next(ret); ret[0] = 0; clean_dcache_entry(ret); pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void free_pte_fast(pte_t *pte) { + preempt_disable(); __pte_next(pte) = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } #else /* CONFIG_NO_PGT_CACHE */ diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/smplock.h linux-2.4.17-rc1-wli3/include/asm-arm/smplock.h --- linux-2.4.17-rc1-virgin/include/asm-arm/smplock.h Sun Aug 12 11:14:00 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/smplock.h Fri Dec 14 02:44:44 2001 @@ -3,12 +3,17 @@ * * Default SMP lock implementation */ +#include #include #include extern spinlock_t kernel_flag; +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_is_disable() +#else #define kernel_locked() spin_is_locked(&kernel_flag) +#endif /* * Release global kernel lock and global interrupt lock @@ -40,8 +45,14 @@ */ static inline void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else if (!++current->lock_depth) spin_lock(&kernel_flag); +#endif } static inline void unlock_kernel(void) diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/softirq.h 
linux-2.4.17-rc1-wli3/include/asm-arm/softirq.h --- linux-2.4.17-rc1-virgin/include/asm-arm/softirq.h Sat Sep 8 12:02:31 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/softirq.h Fri Dec 14 02:44:44 2001 @@ -5,20 +5,22 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) #define in_softirq() (local_bh_count(smp_processor_id()) != 0) -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ if (!--*ptr && ptr[-2]) \ __asm__("bl%? __do_softirq": : : "lr");/* out of line */\ } while (0) + +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) #endif /* __ASM_SOFTIRQ_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-cris/bootmem.h linux-2.4.17-rc1-wli3/include/asm-cris/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-cris/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-cris/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-cris/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Cris hasn't been tested with this yet, so + * port maintainers may want to increase the value + * of NR_SEGMENTS if this becomes a problem. + * This will go away once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-generic/bootmem.h linux-2.4.17-rc1-wli3/include/asm-generic/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-generic/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-generic/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,25 @@ +#ifndef _ASM_BOOTMEM_H +#define _ASM_BOOTMEM_H + +/* + * include/asm-generic/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * NR_SEGMENTS is the number of line segment tree nodes held + * in the per-node segment pools. + * + * For the moment, this is a fixed size, because dynamically + * determining the number of segments per node would require + * a change of interface. On 32-bit machines with 4KB pages + * this is 170 distinct fragments of memory per page. + * + * So long as the arena for the tree nodes is statically + * allocated, this must be an arch-specific #define + * This can be eliminated entirely only by a change of + * interface. Page stealing is simple, but unsafe until + * after the absolutely necessary reservations are done. + */ + +#define NR_SEGMENTS (PAGE_SIZE/sizeof(segment_buf_t)) + +#endif /* _ASM_BOOTMEM_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-generic/rmap.h linux-2.4.17-rc1-wli3/include/asm-generic/rmap.h --- linux-2.4.17-rc1-virgin/include/asm-generic/rmap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-generic/rmap.h Fri Dec 14 02:44:20 2001 @@ -0,0 +1,51 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. 
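The helpers that follow reconstruct, from nothing but a pte pointer, which mm and which virtual address that pte maps: the page table page's struct page carries the mm in page->mapping and the aligned base address in page->index, and the pte's byte offset within its page scales up to the address offset. A user-space model of that last step, ptep_to_address(), assuming an i386-style layout where PTRS_PER_PTE * sizeof(pte_t) == PAGE_SIZE:

  #include <stdio.h>

  #define PAGE_SIZE       4096UL
  #define PAGE_MASK       (~(PAGE_SIZE - 1))
  #define PTE_SIZE        4UL                     /* sizeof(pte_t), assumed   */
  #define PTRS_PER_PTE    (PAGE_SIZE / PTE_SIZE)  /* 1024 ptes per table page */

  /* page_index plays the role of page->index: the base virtual address of
   * the 4 MB region covered by this page table page. */
  static unsigned long ptep_to_address(unsigned long page_index,
                                       unsigned long ptep)
  {
          unsigned long low_bits = (ptep & ~PAGE_MASK) * PTRS_PER_PTE;

          return page_index + low_bits;
  }

  int main(void)
  {
          /* the third pte in its table page sits at byte offset 8...         */
          unsigned long ptep = 0xc0123008UL;

          /* ...so it maps the third page of the region: base + 2 * PAGE_SIZE */
          printf("%#lx\n", ptep_to_address(0x40000000UL, ptep)); /* 0x40002000 */
          return 0;
  }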
+ * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#endif /* _GENERIC_RMAP_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/bootmem.h linux-2.4.17-rc1-wli3/include/asm-i386/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-i386/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-i386/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,9 @@ +/* + * include/asm-i386/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * i386 has been well-tested with this value of NR_SEGMENTS. + * There are some i386 architectures with highly-fragmented + * memory that may need to alter it. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/hardirq.h linux-2.4.17-rc1-wli3/include/asm-i386/hardirq.h --- linux-2.4.17-rc1-virgin/include/asm-i386/hardirq.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/hardirq.h Sun Dec 16 18:05:01 2001 @@ -36,6 +36,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #include diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/highmem.h linux-2.4.17-rc1-wli3/include/asm-i386/highmem.h --- linux-2.4.17-rc1-virgin/include/asm-i386/highmem.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/highmem.h Sun Dec 16 18:05:01 2001 @@ -88,6 +88,7 @@ enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); if (page < highmem_start_page) return page_address(page); @@ -109,8 +110,10 @@ unsigned long vaddr = (unsigned long) kvaddr; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) // FIXME + if (vaddr < FIXADDR_START) { // FIXME + preempt_enable(); return; + } if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) BUG(); @@ -122,6 +125,8 @@ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); #endif + + preempt_enable(); } #endif /* __KERNEL__ */ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/hw_irq.h linux-2.4.17-rc1-wli3/include/asm-i386/hw_irq.h --- linux-2.4.17-rc1-virgin/include/asm-i386/hw_irq.h Thu Nov 22 11:46:18 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/hw_irq.h Sun Dec 16 18:04:59 2001 @@ -95,6 +95,18 @@ #define __STR(x) #x #define STR(x) __STR(x) +#define GET_CURRENT \ + "movl %esp, %ebx\n\t" \ + "andl $-8192, %ebx\n\t" + +#ifdef CONFIG_PREEMPT +#define BUMP_LOCK_COUNT \ + GET_CURRENT \ + "incl 4(%ebx)\n\t" +#else +#define BUMP_LOCK_COUNT +#endif + #define SAVE_ALL \ "cld\n\t" \ "pushl %es\n\t" \ @@ -108,14 +120,11 @@ "pushl %ebx\n\t" \ 
"movl $" STR(__KERNEL_DS) ",%edx\n\t" \ "movl %edx,%ds\n\t" \ - "movl %edx,%es\n\t" + "movl %edx,%es\n\t" \ + BUMP_LOCK_COUNT #define IRQ_NAME2(nr) nr##_interrupt(void) #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -#define GET_CURRENT \ - "movl %esp, %ebx\n\t" \ - "andl $-8192, %ebx\n\t" /* * SMP has a few special interrupts for IPI messages diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/i387.h linux-2.4.17-rc1-wli3/include/asm-i386/i387.h --- linux-2.4.17-rc1-virgin/include/asm-i386/i387.h Thu Nov 22 11:48:58 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/i387.h Sun Dec 16 18:16:02 2001 @@ -12,6 +12,7 @@ #define __ASM_I386_I387_H #include +#include #include #include #include @@ -24,7 +25,7 @@ extern void restore_fpu( struct task_struct *tsk ); extern void kernel_fpu_begin(void); -#define kernel_fpu_end() stts() +#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0) #define unlazy_fpu( tsk ) do { \ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/mmu_context.h linux-2.4.17-rc1-wli3/include/asm-i386/mmu_context.h --- linux-2.4.17-rc1-virgin/include/asm-i386/mmu_context.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/mmu_context.h Sun Dec 16 18:05:01 2001 @@ -27,6 +27,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif if (prev != next) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/param.h linux-2.4.17-rc1-wli3/include/asm-i386/param.h --- linux-2.4.17-rc1-virgin/include/asm-i386/param.h Fri Oct 27 11:04:43 2000 +++ linux-2.4.17-rc1-wli3/include/asm-i386/param.h Sun Dec 16 01:24:48 2001 @@ -2,7 +2,8 @@ #define _ASMi386_PARAM_H #ifndef HZ -#define HZ 100 +/* #define HZ 100 */ +#define HZ 256 #endif #define EXEC_PAGESIZE 4096 diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/pgalloc.h linux-2.4.17-rc1-wli3/include/asm-i386/pgalloc.h --- linux-2.4.17-rc1-virgin/include/asm-i386/pgalloc.h Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/pgalloc.h Sun Dec 16 18:05:01 2001 @@ -75,20 +75,26 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)(*ret); ret[0] = 0; pgtable_cache_size--; - } else + preempt_enable(); + } else { + preempt_enable(); ret = (unsigned long *)get_pgd_slow(); + } return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); *(unsigned long *)pgd = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline void free_pgd_slow(pgd_t *pgd) @@ -119,19 +125,23 @@ { unsigned long *ret; + preempt_disable(); if ((ret = (unsigned long *)pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)(*ret); ret[0] = ret[1]; pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void pte_free_fast(pte_t *pte) { + preempt_disable(); *(unsigned long *)pte = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } static __inline__ void pte_free_slow(pte_t *pte) diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/processor.h linux-2.4.17-rc1-wli3/include/asm-i386/processor.h --- linux-2.4.17-rc1-virgin/include/asm-i386/processor.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/processor.h Sun Dec 16 18:04:59 2001 @@ -502,7 +502,10 @@ { __asm__ 
__volatile__ ("prefetchw (%0)" : : "r"(x)); } -#define spin_lock_prefetch(x) prefetchw(x) +#define spin_lock_prefetch(x) do { \ + prefetchw(x); \ + preempt_prefetch(¤t->preempt_count); \ +} while(0) #endif diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/rmap.h linux-2.4.17-rc1-wli3/include/asm-i386/rmap.h --- linux-2.4.17-rc1-virgin/include/asm-i386/rmap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-i386/rmap.h Fri Dec 14 02:44:20 2001 @@ -0,0 +1,7 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/smplock.h linux-2.4.17-rc1-wli3/include/asm-i386/smplock.h --- linux-2.4.17-rc1-virgin/include/asm-i386/smplock.h Thu Nov 22 11:46:20 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/smplock.h Sun Dec 16 18:16:02 2001 @@ -10,7 +10,15 @@ extern spinlock_t kernel_flag; +#ifdef CONFIG_SMP #define kernel_locked() spin_is_locked(&kernel_flag) +#else +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_is_disabled() +#else +#define kernel_locked() 1 +#endif +#endif /* * Release global kernel lock and global interrupt lock @@ -42,6 +50,11 @@ */ static __inline__ void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else #if 1 if (!++current->lock_depth) spin_lock(&kernel_flag); @@ -53,6 +66,7 @@ "\n9:" :"=m" (__dummy_lock(&kernel_flag)), "=m" (current->lock_depth)); +#endif #endif } diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/softirq.h linux-2.4.17-rc1-wli3/include/asm-i386/softirq.h --- linux-2.4.17-rc1-virgin/include/asm-i386/softirq.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/softirq.h Sun Dec 16 18:05:01 2001 @@ -5,9 +5,9 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) @@ -22,7 +22,7 @@ * If you change the offsets in irq_stat then you have to * update this code as well. 
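Both smplock.h variants above count BKL recursion in current->lock_depth, with -1 meaning not held, but the CONFIG_PREEMPT version takes the spinlock before bumping the counter: if the depth became non-negative first and the task were preempted at that instant, the context switch path, which unlocks kernel_flag whenever lock_depth >= 0, would release a lock that was never taken. A stand-alone model of the depth counting (a pthread mutex stands in for the kernel_flag spinlock, and a single global stands in for the per-task current->lock_depth):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t kernel_flag = PTHREAD_MUTEX_INITIALIZER;
  static int lock_depth = -1;             /* -1: BKL not held by this task */

  static void lock_kernel(void)
  {
          if (lock_depth == -1)           /* outermost entry takes the lock  */
                  pthread_mutex_lock(&kernel_flag);
          ++lock_depth;                   /* ...only then does depth go >= 0 */
  }

  static void unlock_kernel(void)
  {
          if (--lock_depth < 0)           /* innermost exit drops the lock   */
                  pthread_mutex_unlock(&kernel_flag);
  }

  int main(void)
  {
          lock_kernel();
          lock_kernel();                  /* nested callers just deepen the count */
          printf("depth=%d\n", lock_depth);       /* prints depth=1  */
          unlock_kernel();
          unlock_kernel();
          printf("depth=%d\n", lock_depth);       /* prints depth=-1 */
          return 0;
  }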
*/ -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ \ @@ -44,5 +44,7 @@ : "r" (ptr), "i" (do_softirq) \ /* no registers clobbered */ ); \ } while (0) + +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) #endif /* __ASM_SOFTIRQ_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/spinlock.h linux-2.4.17-rc1-wli3/include/asm-i386/spinlock.h --- linux-2.4.17-rc1-virgin/include/asm-i386/spinlock.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/spinlock.h Sun Dec 16 18:04:59 2001 @@ -77,7 +77,7 @@ :"=m" (lock->lock) : : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { #if SPINLOCK_DEBUG if (lock->magic != SPINLOCK_MAGIC) @@ -97,7 +97,7 @@ :"=q" (oldval), "=m" (lock->lock) \ :"0" (oldval) : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { char oldval = 1; #if SPINLOCK_DEBUG @@ -113,7 +113,7 @@ #endif -static inline int spin_trylock(spinlock_t *lock) +static inline int _raw_spin_trylock(spinlock_t *lock) { char oldval; __asm__ __volatile__( @@ -123,7 +123,7 @@ return oldval > 0; } -static inline void spin_lock(spinlock_t *lock) +static inline void _raw_spin_lock(spinlock_t *lock) { #if SPINLOCK_DEBUG __label__ here; @@ -179,7 +179,7 @@ */ /* the spinlock helpers are in arch/i386/kernel/semaphore.c */ -static inline void read_lock(rwlock_t *rw) +static inline void _raw_read_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -188,7 +188,7 @@ __build_read_lock(rw, "__read_lock_failed"); } -static inline void write_lock(rwlock_t *rw) +static inline void _raw_write_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -197,10 +197,10 @@ __build_write_lock(rw, "__write_lock_failed"); } -#define read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") -#define write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") +#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") +#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") -static inline int write_trylock(rwlock_t *lock) +static inline int _raw_write_trylock(rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) diff -urN linux-2.4.17-rc1-virgin/include/asm-ia64/bootmem.h linux-2.4.17-rc1-wli3/include/asm-ia64/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-ia64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-ia64/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,19 @@ +#ifndef _ASM_BOOTMEM_H +#define _ASM_BOOTMEM_H + +/* + * include/asm-ia64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * ACPI on IA64 is one of the heaviest memory-reserving subsystems + * of any architecture. This leads to enough fragmentation to exhaust + * the segment pool with the default NR_SEGMENTS several times over. + * This value has been tested on Intel Lion systems, but the author + * is well-aware of systems requiring still higher values. + * + * This will go away entirely once page stealing is in place. 
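The _raw_ renames in the spinlock.h hunk above are what lets a preemptible kernel wrap every lock operation in preemption bookkeeping: the architecture keeps supplying only the raw primitives, and generic wrappers disable preemption before taking a lock and re-enable it after releasing one. A user-space model of that layering (the counters and the one-word lock are stand-ins, not the patch's actual wrapper definitions):

  #include <stdio.h>

  static int raw_lock;                    /* 0 = free, 1 = held           */
  static int preempt_count;               /* > 0 means preemption is off  */

  static void _raw_spin_lock(int *l)   { *l = 1; }
  static void _raw_spin_unlock(int *l) { *l = 0; }

  static void preempt_disable(void)
  {
          preempt_count++;
  }

  static void preempt_enable(void)
  {
          --preempt_count;
          /* in the kernel, a pending reschedule would be taken here once
           * the count drops back to zero */
  }

  static void spin_lock(int *l)
  {
          preempt_disable();              /* no preemption while spinning or holding */
          _raw_spin_lock(l);
  }

  static void spin_unlock(int *l)
  {
          _raw_spin_unlock(l);
          preempt_enable();               /* a preemption point */
  }

  int main(void)
  {
          spin_lock(&raw_lock);
          printf("held=%d preempt_count=%d\n", raw_lock, preempt_count);
          spin_unlock(&raw_lock);
          printf("held=%d preempt_count=%d\n", raw_lock, preempt_count);
          return 0;
  }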
+ */ + +#define NR_SEGMENTS ((8*PAGE_SIZE)/sizeof(segment_buf_t)) + +#endif /* _ASM_BOOTMEM_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-m68k/bootmem.h linux-2.4.17-rc1-wli3/include/asm-m68k/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-m68k/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-m68k/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-m68k/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * m68k should in all likelihood be happy with this value of + * NR_SEGMENTS, though testing has been obstructed + * by issues unrelated to bootmem. + * NR_SEGMENTS will go away entirely once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-mips/bootmem.h linux-2.4.17-rc1-wli3/include/asm-mips/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-mips/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-mips/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-mips/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * This value of NR_SEGMENTS has been tested on a DecStation 5000/200 + * and it was happy with it. That does not rule out a possible need to + * increase the value on systems I've not tested. + * NR_SEGMENTS will go away once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-mips64/bootmem.h linux-2.4.17-rc1-wli3/include/asm-mips64/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-mips64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-mips64/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-mips64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * mips64 includes some very large memory machines with very fragmented + * memory. There are also likely to be patch conflicts as the discontig + * patch touches bootmem. This value is almost certainly wrong. + * Fortunately, NR_SEGMENTS will go away soon. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-parisc/bootmem.h linux-2.4.17-rc1-wli3/include/asm-parisc/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-parisc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-parisc/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-parisc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * PA-RISC memory maps have relatively few contiguous + * ranges of available memory, and so the generic NR_SEGMENTS + * will suffice until NR_SEGMENTS is eliminated. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-ppc/bootmem.h linux-2.4.17-rc1-wli3/include/asm-ppc/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-ppc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-ppc/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-ppc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * According to sources, 32-bit PPC has relatively few fragments + * of available memory, and so the generic NR_SEGMENTS should + * suffice until NR_SEGMENTS is eliminated. 
+ */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-s390/bootmem.h linux-2.4.17-rc1-wli3/include/asm-s390/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-s390/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-s390/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-s390/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * S390 will probably not need to change NR_SEGMENTS, + * as setup.c tracks memory fragments on its own and + * insists on less than 16. + * NR_SEGMENTS will go away once page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-s390x/bootmem.h linux-2.4.17-rc1-wli3/include/asm-s390x/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-s390x/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-s390x/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-s390x/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * S390x is unlikely to need to change NR_SEGMENTS, as it tracks ranges + * itself in setup.c and uses less than 16. + * NR_SEGMENTS will go away once page stealing is in place in the + * bootmem allocator. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/bootmem.h linux-2.4.17-rc1-wli3/include/asm-sh/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-sh/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-sh/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,8 @@ +/* + * include/asm-sh/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Super-H has not been tested, so NR_SEGMENTS may need to change. + * NR_SEGMENTS will be eliminated once page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/hardirq.h linux-2.4.17-rc1-wli3/include/asm-sh/hardirq.h --- linux-2.4.17-rc1-virgin/include/asm-sh/hardirq.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/hardirq.h Fri Dec 14 02:44:44 2001 @@ -34,6 +34,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #error Super-H SMP is not available diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/mmu_context.h linux-2.4.17-rc1-wli3/include/asm-sh/mmu_context.h --- linux-2.4.17-rc1-virgin/include/asm-sh/mmu_context.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/mmu_context.h Fri Dec 14 02:44:44 2001 @@ -166,6 +166,10 @@ struct mm_struct *next, struct task_struct *tsk, unsigned int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif if (prev != next) { unsigned long __pgdir = (unsigned long)next->pgd; diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/smplock.h linux-2.4.17-rc1-wli3/include/asm-sh/smplock.h --- linux-2.4.17-rc1-virgin/include/asm-sh/smplock.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/smplock.h Fri Dec 14 02:44:44 2001 @@ -9,15 +9,88 @@ #include -#ifndef CONFIG_SMP - +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) +/* + * Should never happen, since linux/smp_lock.h catches this case; + * but in case this file is included directly with neither SMP nor + * PREEMPT configuration, provide the same dummies as linux/smp_lock.h + */ #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) -#define release_kernel_lock(task, cpu, depth) ((depth) = 1) -#define reacquire_kernel_lock(task, cpu, depth) do { } while(0) +#define release_kernel_lock(task, cpu) do { } while(0) +#define reacquire_kernel_lock(task) do { } while(0) +#define kernel_locked() 1 + +#else /* CONFIG_SMP || CONFIG_PREEMPT */ + 
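+/*
+ * A rough sketch of how the lock_depth-based BKL below is intended
+ * to behave under CONFIG_PREEMPT (lock_depth starts out at -1, so
+ * only the outermost lock_kernel()/unlock_kernel() pair touches
+ * kernel_flag; the depths shown are purely illustrative):
+ *
+ *	lock_kernel();		lock_depth -1 -> 0, takes kernel_flag
+ *	lock_kernel();		lock_depth  0 -> 1, no spinlock work
+ *	unlock_kernel();	lock_depth  1 -> 0, kernel_flag still held
+ *	unlock_kernel();	lock_depth  0 -> -1, drops kernel_flag
+ */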
+#if CONFIG_SMP +#error "We do not support SMP on SH yet" +#endif +/* + * Default SMP lock implementation (i.e. the i386 version) + */ + +#include +#include + +extern spinlock_t kernel_flag; +#define lock_bkl() spin_lock(&kernel_flag) +#define unlock_bkl() spin_unlock(&kernel_flag) +#ifdef CONFIG_SMP +#define kernel_locked() spin_is_locked(&kernel_flag) +#elif CONFIG_PREEMPT +#define kernel_locked() preempt_is_disabled() +#else /* neither */ +#define kernel_locked() 1 +#endif + +/* + * Release global kernel lock and global interrupt lock + */ +#define release_kernel_lock(task, cpu) \ +do { \ + if (task->lock_depth >= 0) \ + spin_unlock(&kernel_flag); \ + release_irqlock(cpu); \ + __sti(); \ +} while (0) + +/* + * Re-acquire the kernel lock + */ +#define reacquire_kernel_lock(task) \ +do { \ + if (task->lock_depth >= 0) \ + spin_lock(&kernel_flag); \ +} while (0) + +/* + * Getting the big kernel lock. + * + * This cannot happen asynchronously, + * so we only need to worry about other + * CPU's. + */ +static __inline__ void lock_kernel(void) +{ +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; #else -#error "We do not support SMP on SH" -#endif /* CONFIG_SMP */ + if (!++current->lock_depth) + spin_lock(&kernel_flag); +#endif +} + +static __inline__ void unlock_kernel(void) +{ + if (current->lock_depth < 0) + BUG(); + if (--current->lock_depth < 0) + spin_unlock(&kernel_flag); +} +#endif /* CONFIG_SMP || CONFIG_PREEMPT */ #endif /* __ASM_SH_SMPLOCK_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/softirq.h linux-2.4.17-rc1-wli3/include/asm-sh/softirq.h --- linux-2.4.17-rc1-virgin/include/asm-sh/softirq.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/softirq.h Fri Dec 14 02:44:44 2001 @@ -6,6 +6,7 @@ #define local_bh_disable() \ do { \ + preempt_disable(); \ local_bh_count(smp_processor_id())++; \ barrier(); \ } while (0) @@ -14,6 +15,7 @@ do { \ barrier(); \ local_bh_count(smp_processor_id())--; \ + preempt_enable(); \ } while (0) #define local_bh_enable() \ @@ -22,6 +24,7 @@ if (!--local_bh_count(smp_processor_id()) \ && softirq_pending(smp_processor_id())) { \ do_softirq(); \ + preempt_enable(); \ } \ } while (0) diff -urN linux-2.4.17-rc1-virgin/include/asm-sparc/bootmem.h linux-2.4.17-rc1-wli3/include/asm-sparc/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-sparc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-sparc/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-sparc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 32-bit SPARC generally doesn't feature discontiguous + * memory, so this value of NR_SEGMENTS likely to be good. + * NR_SEGMENTS will be eliminated once page stealing in + * the bootmem allocator is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-sparc64/bootmem.h linux-2.4.17-rc1-wli3/include/asm-sparc64/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-sparc64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-sparc64/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-sparc64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 64-bit SPARC may need a larger NR_SEGMENTS than this + * but it's not clear what a better value would be. + * NR_SEGMENTS will be eliminated once page stealing + * in the bootmem allocator is in place. 
+ */ +#include diff -urN linux-2.4.17-rc1-virgin/include/linux/bootmem.h linux-2.4.17-rc1-wli3/include/linux/bootmem.h --- linux-2.4.17-rc1-virgin/include/linux/bootmem.h Thu Nov 22 11:47:23 2001 +++ linux-2.4.17-rc1-wli3/include/linux/bootmem.h Sun Dec 16 18:16:02 2001 @@ -1,5 +1,6 @@ /* * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Segment tree-based memory reservation system, William Irwin, IBM, Oct 2001 */ #ifndef _LINUX_BOOTMEM_H #define _LINUX_BOOTMEM_H @@ -9,6 +10,8 @@ #include #include #include +#include +#include /* * simple boot-time physical memory area allocator. @@ -25,8 +28,8 @@ unsigned long node_boot_start; unsigned long node_low_pfn; void *node_bootmem_map; - unsigned long last_offset; - unsigned long last_pos; + segment_tree_root_t segment_tree; + segment_buf_t *free_segments; } bootmem_data_t; extern unsigned long __init bootmem_bootmap_pages (unsigned long); diff -urN linux-2.4.17-rc1-virgin/include/linux/brlock.h linux-2.4.17-rc1-wli3/include/linux/brlock.h --- linux-2.4.17-rc1-virgin/include/linux/brlock.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/brlock.h Sun Dec 16 18:10:53 2001 @@ -171,11 +171,11 @@ } #else -# define br_read_lock(idx) ((void)(idx)) -# define br_read_unlock(idx) ((void)(idx)) -# define br_write_lock(idx) ((void)(idx)) -# define br_write_unlock(idx) ((void)(idx)) -#endif +# define br_read_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_read_unlock(idx) ({ (void)(idx); preempt_enable(); }) +# define br_write_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_write_unlock(idx) ({ (void)(idx); preempt_enable(); }) +#endif /* CONFIG_SMP */ /* * Now enumerate all of the possible sw/hw IRQ protected diff -urN linux-2.4.17-rc1-virgin/include/linux/dcache.h linux-2.4.17-rc1-wli3/include/linux/dcache.h --- linux-2.4.17-rc1-virgin/include/linux/dcache.h Thu Nov 22 11:46:18 2001 +++ linux-2.4.17-rc1-wli3/include/linux/dcache.h Sun Dec 16 18:05:51 2001 @@ -36,17 +36,58 @@ }; extern struct dentry_stat_t dentry_stat; -/* Name hashing routines. Initial hash value */ -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ -#define init_name_hash() 0 +/* + * Fowler, Noll, & Vo hash function + * -- wli + */ + +/* + * Initial hash value for Fowler, Noll, & Vo hash function. + * FreeBSD appears to use 33554467UL decimal / 0x2000023UL hex. + * Sources I see elsewhere (Noll's webpage) describe using an offset + * basis of 2166136261UL decimal / 0x811C9DC5UL hex. + * -- wli + */ +#define init_name_hash() 0x811C9DC5UL -/* partial hash update function. Assume roughly 4 bits per character */ -static __inline__ unsigned long partial_name_hash(unsigned long c, unsigned long prevhash) +/* + * This is a multiplicative hash function using the prime 16777619 + * The Fowler, Noll, and Vo hash function is rated the best in + * string hashing benchmarks published on gcc-patches and NetBSD + * mailing lists. + * -- wli + */ +static __inline__ unsigned long partial_name_hash(unsigned long c, + unsigned long prevhash) { - return (prevhash + (c << 4) + (c >> 4)) * 11; + /* + * A multiplicative definition would be: + * --wli + */ + return (prevhash * 0x01000193UL) ^ c; + + /* + * If I were to get overcomplicated, I would decode things + * for each bit of 0x01000193UL and then expand to the shift + * and add operations explicitly in order to avoid reliance on + * the compiler for this. + * The register pressure generated by this may not be a win + * on i386 vs. actual multiplication, but results remain + * to be seen. 
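+	 *
+	 * For reference, a caller builds a complete FNV-1 name hash out
+	 * of these helpers roughly as follows ("name" and "len" stand
+	 * for any counted string; this is only a sketch of typical use):
+	 *
+	 *	unsigned long hash = init_name_hash();
+	 *	while (len--)
+	 *		hash = partial_name_hash(*name++, hash);
+	 *	hash = end_name_hash(hash);
+	 *
+	 * The explicit shift-and-add expansion mentioned above would be: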
+ * + * prevhash += (prevhash << 24) + * + (prevhash << 8) + * + (prevhash << 7) + * + (prevhash << 4) + * + (prevhash << 1); + * return prevhash ^ c; + */ } -/* Finally: cut down the number of bits to a int value (and try to avoid losing bits) */ +/* + * Finally: cut down the number of bits to a int value (and try to + * avoid losing bits) + */ static __inline__ unsigned long end_name_hash(unsigned long hash) { return (unsigned int) hash; @@ -126,31 +167,6 @@ extern spinlock_t dcache_lock; -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent - * dentry hashes, so that it won't be found through - * a VFS lookup any more. Note that this is different - * from deleting the dentry - d_delete will try to - * mark the dentry negative if possible, giving a - * successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. - * - * d_drop() is used mainly for stuff that wants - * to invalidate a dentry for some reason (NFS - * timeouts or autofs deletes). - */ - -static __inline__ void d_drop(struct dentry * dentry) -{ - spin_lock(&dcache_lock); - list_del(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_hash); - spin_unlock(&dcache_lock); -} - static __inline__ int dname_external(struct dentry *d) { return d->d_name.name != d->d_iname; @@ -275,3 +291,34 @@ #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ + +#if !defined(__LINUX_DCACHE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define __LINUX_DCACHE_H_INLINES + +#ifdef __KERNEL__ +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent + * dentry hashes, so that it won't be found through + * a VFS lookup any more. Note that this is different + * from deleting the dentry - d_delete will try to + * mark the dentry negative if possible, giving a + * successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants + * to invalidate a dentry for some reason (NFS + * timeouts or autofs deletes). 
+ */ + +static __inline__ void d_drop(struct dentry * dentry) +{ + spin_lock(&dcache_lock); + list_del(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_hash); + spin_unlock(&dcache_lock); +} +#endif +#endif diff -urN linux-2.4.17-rc1-virgin/include/linux/elevator.h linux-2.4.17-rc1-wli3/include/linux/elevator.h --- linux-2.4.17-rc1-virgin/include/linux/elevator.h Thu Feb 15 16:58:34 2001 +++ linux-2.4.17-rc1-wli3/include/linux/elevator.h Sat Dec 15 14:54:07 2001 @@ -5,8 +5,9 @@ struct list_head *, struct list_head *, int); -typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *, - struct buffer_head *, int, int); +typedef int (elevator_merge_fn)(request_queue_t *, struct request **, + struct list_head *, struct buffer_head *bh, + int rw, int max_sectors, int max_bomb_segments); typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int); @@ -16,6 +17,7 @@ { int read_latency; int write_latency; + int max_bomb_segments; elevator_merge_fn *elevator_merge_fn; elevator_merge_cleanup_fn *elevator_merge_cleanup_fn; @@ -24,13 +26,13 @@ unsigned int queue_ID; }; -int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_noop_merge_req(struct request *, struct request *); - -int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_linus_merge_req(struct request *, struct request *); +elevator_merge_fn elevator_noop_merge; +elevator_merge_cleanup_fn elevator_noop_merge_cleanup; +elevator_merge_req_fn elevator_noop_merge_req; + +elevator_merge_fn elevator_linus_merge; +elevator_merge_cleanup_fn elevator_linus_merge_cleanup; +elevator_merge_req_fn elevator_linus_merge_req; typedef struct blkelv_ioctl_arg_s { int queue_ID; @@ -54,22 +56,6 @@ #define ELEVATOR_FRONT_MERGE 1 #define ELEVATOR_BACK_MERGE 2 -/* - * This is used in the elevator algorithm. We don't prioritise reads - * over writes any more --- although reads are more time-critical than - * writes, by treating them equally we increase filesystem throughput. - * This turns out to give better overall performance. 
-- sct - */ -#define IN_ORDER(s1,s2) \ - ((((s1)->rq_dev == (s2)->rq_dev && \ - (s1)->sector < (s2)->sector)) || \ - (s1)->rq_dev < (s2)->rq_dev) - -#define BHRQ_IN_ORDER(bh, rq) \ - ((((bh)->b_rdev == (rq)->rq_dev && \ - (bh)->b_rsector < (rq)->sector)) || \ - (bh)->b_rdev < (rq)->rq_dev) - static inline int elevator_request_latency(elevator_t * elevator, int rw) { int latency; @@ -85,7 +71,7 @@ ((elevator_t) { \ 0, /* read_latency */ \ 0, /* write_latency */ \ - \ + 0, /* max_bomb_segments */ \ elevator_noop_merge, /* elevator_merge_fn */ \ elevator_noop_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_noop_merge_req, /* elevator_merge_req_fn */ \ @@ -95,7 +81,7 @@ ((elevator_t) { \ 8192, /* read passovers */ \ 16384, /* write passovers */ \ - \ + 0, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ diff -urN linux-2.4.17-rc1-virgin/include/linux/fs.h linux-2.4.17-rc1-wli3/include/linux/fs.h --- linux-2.4.17-rc1-virgin/include/linux/fs.h Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/include/linux/fs.h Sun Dec 16 18:06:18 2001 @@ -283,7 +283,7 @@ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); -#define touch_buffer(bh) mark_page_accessed(bh->b_page) +#define touch_buffer(bh) touch_page(bh->b_page) #include diff -urN linux-2.4.17-rc1-virgin/include/linux/fs_struct.h linux-2.4.17-rc1-wli3/include/linux/fs_struct.h --- linux-2.4.17-rc1-virgin/include/linux/fs_struct.h Fri Jul 13 15:10:44 2001 +++ linux-2.4.17-rc1-wli3/include/linux/fs_struct.h Fri Dec 14 02:44:44 2001 @@ -20,6 +20,15 @@ extern void exit_fs(struct task_struct *); extern void set_fs_altroot(void); +struct fs_struct *copy_fs_struct(struct fs_struct *old); +void put_fs_struct(struct fs_struct *fs); + +#endif +#endif + +#if !defined(_LINUX_FS_STRUCT_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_FS_STRUCT_H_INLINES +#ifdef __KERNEL__ /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. Requires the big lock held. @@ -65,9 +74,5 @@ mntput(old_pwdmnt); } } - -struct fs_struct *copy_fs_struct(struct fs_struct *old); -void put_fs_struct(struct fs_struct *fs); - #endif #endif diff -urN linux-2.4.17-rc1-virgin/include/linux/highmem.h linux-2.4.17-rc1-wli3/include/linux/highmem.h --- linux-2.4.17-rc1-virgin/include/linux/highmem.h Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/include/linux/highmem.h Sun Dec 16 18:05:01 2001 @@ -93,4 +93,15 @@ kunmap_atomic(vto, KM_USER1); } +static inline void copy_highpage(struct page *to, struct page *from) +{ + char *vfrom, *vto; + + vfrom = kmap(from); + vto = kmap(to); + copy_page(vto, vfrom); + kunmap(from); + kunmap(to); +} + #endif /* _LINUX_HIGHMEM_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/lock_break.h linux-2.4.17-rc1-wli3/include/linux/lock_break.h --- linux-2.4.17-rc1-virgin/include/linux/lock_break.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/lock_break.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,84 @@ +/* + * include/linux/lock_break.h - lock breaking routines + * + * since in-kernel preemption can not occur while a lock is held, + * we can just drop and reacquire long-held locks when they are + * in a natural quiescent state to further lower system latency. 
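+ *
+ * typical use in a long lock-held loop looks roughly like the sketch
+ * below (some_lock, more_work() and do_work() are only placeholders,
+ * not names from this patch):
+ *
+ *	spin_lock(&some_lock);
+ *	while (more_work()) {
+ *		do_work();
+ *		if (conditional_schedule_needed())
+ *			break_spin_lock_and_resched(&some_lock);
+ *	}
+ *	spin_unlock(&some_lock);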
+ * + * (C) 2001 Robert Love + * + */ + +#ifndef _LINUX_LOCK_BREAK_H +#define _LINUX_LOCK_BREAK_H + +#include + +/* + * setting this to 1 will instruct debug_lock_break to + * note when the expected lock count does not equal the + * actual count. if the lock count is higher than expected, + * we aren't dropping enough locks. if it is 0, we are + * wasting our time since the system is already preemptible. + */ +#ifndef DEBUG_LOCK_BREAK +#define DEBUG_LOCK_BREAK 0 +#endif + +#ifdef CONFIG_LOCK_BREAK + +#define conditional_schedule_needed() (unlikely(current->need_resched)) + +/* + * setting the task's state to TASK_RUNNING is nothing but paranoia, + * in the case where a task is delinquent in properly putting itself + * to sleep. we should test without it. + */ +#define unconditional_schedule() do { \ + __set_current_state(TASK_RUNNING); \ + schedule(); \ +} while(0) + +#define conditional_schedule() do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ +} while(0) + +#define break_spin_lock(n) do { \ + spin_unlock(n); \ + spin_lock(n); \ +} while(0) + +#define break_spin_lock_and_resched(n) do { \ + spin_unlock(n); \ + conditional_schedule(); \ + spin_lock(n); \ +} while(0) + +#if DEBUG_LOCK_BREAK +#define debug_lock_break(n) do { \ + if (current->preempt_count != n) \ + printk(KERN_ERR "lock_break: %s:%d: count was %d not %d\n", \ + __FILE__, __LINE__, current->preempt_count, n); \ +} while(0) +#else +#define debug_lock_break(n) +#endif + +#define DEFINE_LOCK_COUNT() int _lock_break_count = 0 +#define TEST_LOCK_COUNT(n) (++_lock_break_count > (n)) +#define RESET_LOCK_COUNT() _lock_break_count = 0 + +#else +#define unconditional_schedule() +#define conditional_schedule() +#define conditional_schedule_needed() 0 +#define break_spin_lock(n) +#define break_spin_lock_and_resched(n) +#define debug_lock_break(n) +#define DEFINE_LOCK_COUNT() +#define TEST_LOCK_COUNT(n) 0 +#define RESET_LOCK_COUNT() +#endif + +#endif /* _LINUX_LOCK_BREAK_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/mm.h linux-2.4.17-rc1-wli3/include/linux/mm.h --- linux-2.4.17-rc1-virgin/include/linux/mm.h Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/include/linux/mm.h Sun Dec 16 18:16:02 2001 @@ -19,7 +19,7 @@ extern int page_cluster; /* The inactive_clean lists are per zone. */ extern struct list_head active_list; -extern struct list_head inactive_list; +extern struct list_head inactive_dirty_list; #include #include @@ -121,6 +121,9 @@ */ extern pgprot_t protection_map[16]; +#define ZPR_MAX_BYTES 256*PAGE_SIZE +#define ZPR_NORMAL 0 /* perform zap_page_range request in one walk */ +#define ZPR_PARTITION 1 /* partition into a series of smaller operations */ /* * These are the virtual MM functions - opening of an area, closing and @@ -133,6 +136,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -159,6 +165,8 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ + unsigned long age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */ wait_queue_head_t wait; /* Page locked? Stand in line... */ struct page **pprev_hash; /* Complement to *next_hash. 
*/ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -275,9 +283,9 @@ #define PG_referenced 2 #define PG_uptodate 3 #define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 +#define PG_inactive_clean 5 +#define PG_active 6 +#define PG_inactive_dirty 7 #define PG_slab 8 #define PG_skip 10 #define PG_highmem 11 @@ -325,10 +333,16 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -339,6 +353,23 @@ #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) +#define PageLRU(pp) \ + (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp)) + +/* + * Called whenever the VM references a page. We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't particularly care about the inactive dirty ones because + * we're never sure if those are freeable anyway. + */ +static inline void touch_page(struct page * page) +{ + if (PageInactiveClean(page)) + activate_page(page); + else + SetPageReferenced(page); +} + /* * Error return values for the *_nopage functions */ @@ -404,7 +435,7 @@ extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); diff -urN linux-2.4.17-rc1-virgin/include/linux/mm.h~ linux-2.4.17-rc1-wli3/include/linux/mm.h~ --- linux-2.4.17-rc1-virgin/include/linux/mm.h~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/mm.h~ Sun Dec 16 03:08:39 2001 @@ -0,0 +1,627 @@ +#ifndef _LINUX_MM_H +#define _LINUX_MM_H + +#include +#include + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +extern unsigned long max_mapnr; +extern unsigned long num_physpages; +extern void * high_memory; +extern int page_cluster; +/* The inactive_clean lists are per zone. 
*/ +extern struct list_head active_list; +extern struct list_head inactive_dirty_list; + +#include +#include +#include + +/* + * Linux kernel virtual memory manager primitives. + * The idea being to have a "virtual" mm in the same way + * we have a virtual fs - giving a cleaner interface to the + * mm details, and allowing different kinds of memory mappings + * (from shared memory to executable loading to arbitrary + * mmap() functions). + */ + +/* + * This struct defines a memory VMM memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory + * space that has a special rule for the page-fault handlers (ie a shared + * library, the executable area etc). + */ +struct vm_area_struct { + struct mm_struct * vm_mm; /* The address space we belong to. */ + unsigned long vm_start; /* Our start address within vm_mm. */ + unsigned long vm_end; /* The first byte after our end address + within vm_mm. */ + + /* linked list of VM areas per task, sorted by address */ + struct vm_area_struct *vm_next; + + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + unsigned long vm_flags; /* Flags, listed below. */ + + rb_node_t vm_rb; + + /* + * For areas with an address space and backing store, + * one of the address_space->i_mmap{,shared} lists, + * for shm areas, the list of attaches, otherwise unused. + */ + struct vm_area_struct *vm_next_share; + struct vm_area_struct **vm_pprev_share; + + /* Function pointers to deal with this struct. */ + struct vm_operations_struct * vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ + unsigned long vm_raend; /* XXX: put full readahead info here. */ + void * vm_private_data; /* was vm_pte (shared mem) */ +}; + +/* + * vm_flags.. + */ +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_GROWSUP 0x00000200 +#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_EXECUTABLE 0x00001000 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ + +#define VM_STACK_FLAGS 0x00000177 + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK +#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) + +/* read ahead limits */ +extern int vm_min_readahead; +extern int vm_max_readahead; + +/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. 
+ */ +extern pgprot_t protection_map[16]; + + +/* + * These are the virtual MM functions - opening of an area, closing and + * unmapping it (needed to keep files on disk up-to-date etc), pointer + * to the functions called when a no-page or a wp-page exception occurs. + */ +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); +}; + +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + +/* + * Each physical page in the system has a struct page associated with + * it to keep track of whatever it is we are using the page for at the + * moment. Note that we have no way to track which tasks are using + * a page. + * + * Try to keep the most commonly accessed fields in single cache lines + * here (16 bytes or greater). This ordering should be particularly + * beneficial on 32-bit processors. + * + * The first line is data used in page cache lookup, the second line + * is used for linear searches (eg. clock algorithm scans). + * + * TODO: make this structure smaller, it could be as small as 32 bytes. + */ +typedef struct page { + struct list_head list; /* ->mapping has some page lists. */ + struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long index; /* Our offset within mapping. */ + struct page *next_hash; /* Next page sharing our hash bucket in + the pagecache hash table. */ + atomic_t count; /* Usage count, see below. */ + unsigned long flags; /* atomic flags, some possibly + updated asynchronously */ + struct list_head lru; /* Pageout list, eg. active_list; + protected by pagemap_lru_lock !! */ + unsigned long age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */ + wait_queue_head_t wait; /* Page locked? Stand in line... */ + struct page **pprev_hash; /* Complement to *next_hash. */ + struct buffer_head * buffers; /* Buffer maps us to a disk block. */ + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ + struct zone_struct *zone; /* Memory zone we are in. */ +} mem_map_t; + +/* + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - disk mapping (page->buffers) + * - page mapped in a task's page tables, each mapping + * is counted separately + * + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. + */ +#define get_page(p) atomic_inc(&(p)->count) +#define put_page(p) __free_page(p) +#define put_page_testzero(p) atomic_dec_and_test(&(p)->count) +#define page_count(p) atomic_read(&(p)->count) +#define set_page_count(p,v) atomic_set(&(p)->count, v) + +/* + * Various page->flags bits: + * + * PG_reserved is set for special pages, which can never be swapped + * out. Some of them might not even exist (eg empty_bad_page)... + * + * Multiple processes may "see" the same page. E.g. for untouched + * mappings of /dev/null, all processes see the same page full of + * zeroes, and text pages of executables and shared libraries have + * only one copy in memory, at most, normally. + * + * For the non-reserved pages, page->count denotes a reference count. + * page->count == 0 means the page is free. + * page->count == 1 means the page is used for exactly one purpose + * (e.g. a private data page of one process). 
+ * + * A page may be used for kmalloc() or anyone else who does a + * __get_free_page(). In this case the page->count is at least 1, and + * all other fields are unused but should be 0 or NULL. The + * management of this page is the responsibility of the one who uses + * it. + * + * The other pages (we may call them "process pages") are completely + * managed by the Linux memory manager: I/O, buffers, swapping etc. + * The following discussion applies only to them. + * + * A page may belong to an inode's memory mapping. In this case, + * page->mapping is the pointer to the inode, and page->index is the + * file offset of the page, in units of PAGE_CACHE_SIZE. + * + * A page may have buffers allocated to it. In this case, + * page->buffers is a circular list of these buffer heads. Else, + * page->buffers == NULL. + * + * For pages belonging to inodes, the page->count is the number of + * attaches, plus 1 if buffers are allocated to the page, plus one + * for the page cache itself. + * + * All pages belonging to an inode are in these doubly linked lists: + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; + * using the page->list list_head. These fields are also used for + * freelist managemet (when page->count==0). + * + * There is also a hash table mapping (mapping,index) to the page + * in memory if present. The lists for this hash table use the fields + * page->next_hash and page->pprev_hash. + * + * All process pages can do I/O: + * - inode pages may need to be read from disk, + * - inode pages which have been modified and are MAP_SHARED may need + * to be written to disk, + * - private pages which have been modified may need to be swapped out + * to swap space and (later) to be read back into memory. + * During disk I/O, PG_locked is used. This bit is set before I/O + * and reset when I/O completes. page->wait is a wait queue of all + * tasks waiting for the I/O on this page to complete. + * PG_uptodate tells whether the page's contents is valid. + * When a read completes, the page becomes uptodate, unless a disk I/O + * error happened. + * + * For choosing which pages to swap out, inode pages carry a + * PG_referenced bit, which is set any time the system accesses + * that page through the (mapping,index) hash table. This referenced + * bit, together with the referenced bit in the page tables, is used + * to manipulate page->age and move the page across the active, + * inactive_dirty and inactive_clean lists. + * + * Note that the referenced bit, the page->lru list_head and the + * active, inactive_dirty and inactive_clean lists are protected by + * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit! + * + * PG_skip is used on sparc/sparc64 architectures to "skip" certain + * parts of the address space. + * + * PG_error is set to indicate that an I/O error occurred on this page. + * + * PG_arch_1 is an architecture specific page state bit. The generic + * code guarantees that this bit is cleared for a page when it first + * is entered into the page cache. + * + * PG_highmem pages are not permanently mapped into the kernel virtual + * address space, they need to be kmapped separately for doing IO on + * the pages. The struct page (these bits with information) are always + * mapped into kernel address space... + */ +#define PG_locked 0 /* Page is locked. Don't touch. 
*/ +#define PG_error 1 +#define PG_referenced 2 +#define PG_uptodate 3 +#define PG_dirty 4 +#define PG_inactive_clean 5 +#define PG_active 6 +#define PG_inactive_dirty 7 +#define PG_slab 8 +#define PG_skip 10 +#define PG_highmem 11 +#define PG_checked 12 /* kill me in 2.5.. */ +#define PG_arch_1 13 +#define PG_reserved 14 +#define PG_launder 15 /* written out by VM pressure.. */ + +/* Make it prettier to test the above... */ +#define UnlockPage(page) unlock_page(page) +#define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) +#define SetPageUptodate(page) set_bit(PG_uptodate, &(page)->flags) +#define ClearPageUptodate(page) clear_bit(PG_uptodate, &(page)->flags) +#define PageDirty(page) test_bit(PG_dirty, &(page)->flags) +#define SetPageDirty(page) set_bit(PG_dirty, &(page)->flags) +#define ClearPageDirty(page) clear_bit(PG_dirty, &(page)->flags) +#define PageLocked(page) test_bit(PG_locked, &(page)->flags) +#define LockPage(page) set_bit(PG_locked, &(page)->flags) +#define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags) +#define PageChecked(page) test_bit(PG_checked, &(page)->flags) +#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) +#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) +#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) + +extern void FASTCALL(set_page_dirty(struct page *)); + +/* + * The first mb is necessary to safely close the critical section opened by the + * TryLockPage(), the second mb is necessary to enforce ordering between + * the clear_bit and the read of the waitqueue (to avoid SMP races with a + * parallel wait_on_page). + */ +#define PageError(page) test_bit(PG_error, &(page)->flags) +#define SetPageError(page) set_bit(PG_error, &(page)->flags) +#define ClearPageError(page) clear_bit(PG_error, &(page)->flags) +#define PageReferenced(page) test_bit(PG_referenced, &(page)->flags) +#define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags) +#define ClearPageReferenced(page) clear_bit(PG_referenced, &(page)->flags) +#define PageTestandClearReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags) +#define PageSlab(page) test_bit(PG_slab, &(page)->flags) +#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) +#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) +#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) + +#define PageActive(page) test_bit(PG_active, &(page)->flags) +#define SetPageActive(page) set_bit(PG_active, &(page)->flags) +#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) + +#ifdef CONFIG_HIGHMEM +#define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) +#else +#define PageHighMem(page) 0 /* needed to optimize away at compile time */ +#endif + +#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) +#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) 
+ +#define PageLRU(pp) \ + (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp)) + +/* + * Called whenever the VM references a page. We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't particularly care about the inactive dirty ones because + * we're never sure if those are freeable anyway. + */ +static inline void touch_page(struct page * page) +{ + if (PageInactiveClean(page)) + activate_page(page); + else + SetPageReferenced(page); +} + +/* + * Error return values for the *_nopage functions + */ +#define NOPAGE_SIGBUS (NULL) +#define NOPAGE_OOM ((struct page *) (-1)) + +/* The array of struct pages */ +extern mem_map_t * mem_map; + +/* + * There is only one page-allocator function, and two main namespaces to + * it. The alloc_page*() variants return 'struct page *' and as such + * can allocate highmem pages, the *get*page*() variants return + * virtual kernel addresses to the allocated page(s). + */ +extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); +extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)); +extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); + +static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + /* + * Gets optimized away by the compiler. + */ + if (order >= MAX_ORDER) + return NULL; + return _alloc_pages(gfp_mask, order); +} + +#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) + +extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); + +#define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask),0) + +#define __get_dma_pages(gfp_mask, order) \ + __get_free_pages((gfp_mask) | GFP_DMA,(order)) + +/* + * The old interface name will be removed in 2.5: + */ +#define get_free_page get_zeroed_page + +/* + * There is only one 'core' page-freeing function. 
+ */ +extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); +extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); + +#define __free_page(page) __free_pages((page), 0) +#define free_page(addr) free_pages((addr),0) + +extern void show_free_areas(void); +extern void show_free_areas_node(pg_data_t *pgdat); + +extern void clear_page_tables(struct mm_struct *, unsigned long, int); + +extern int fail_writepage(struct page *); +struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused); +struct file *shmem_file_setup(char * name, loff_t size); +extern void shmem_lock(struct file * file, int lock); +extern int shmem_zero_setup(struct vm_area_struct *); + +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); +extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); +extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); + +extern int vmtruncate(struct inode * inode, loff_t offset); +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); +extern int make_pages_present(unsigned long addr, unsigned long end); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len); +extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len); +extern int ptrace_attach(struct task_struct *tsk); +extern int ptrace_detach(struct task_struct *, unsigned int); +extern void ptrace_disable(struct task_struct *); +extern int ptrace_check_attach(struct task_struct *task, int kill); + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + +/* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc() that does all + * of this out-of-line. 
+ */ +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (pgd_none(*pgd)) + return __pmd_alloc(mm, pgd, address); + return pmd_offset(pgd, address); +} + +extern int pgt_cache_water[2]; +extern int check_pgt_cache(void); + +extern void free_area_init(unsigned long * zones_size); +extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long * zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size); +extern void mem_init(void); +extern void show_mem(void); +extern void si_meminfo(struct sysinfo * val); +extern void swapin_readahead(swp_entry_t); + +extern struct address_space swapper_space; +#define PageSwapCache(page) ((page)->mapping == &swapper_space) + +static inline int is_page_cache_freeable(struct page * page) +{ + return page_count(page) - !!page->buffers == 1; +} + +extern int can_share_swap_page(struct page *); +extern int remove_exclusive_swap_page(struct page *); + +extern void __free_pte(pte_t); + +/* mmap.c */ +extern void lock_vma_mappings(struct vm_area_struct *); +extern void unlock_vma_mappings(struct vm_area_struct *); +extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void __insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void build_mmap_rb(struct mm_struct *); +extern void exit_mmap(struct mm_struct *); + +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff); + +static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret = -EINVAL; + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +out: + return ret; +} + +extern int do_munmap(struct mm_struct *, unsigned long, size_t); + +extern unsigned long do_brk(unsigned long, unsigned long); + +static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags) +{ + if (!vma->vm_file && vma->vm_flags == vm_flags) + return 1; + else + return 0; +} + +struct zone_t; +/* filemap.c */ +extern void remove_inode_page(struct page *); +extern unsigned long page_unuse(struct page *); +extern void truncate_inode_pages(struct address_space *, loff_t); + +/* generic vm_area_ops exported for stackable file systems */ +extern int filemap_sync(struct vm_area_struct *, unsigned long, size_t, unsigned int); +extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); + +/* + * GFP bitmasks.. + */ +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */ +#define __GFP_DMA 0x01 +#define __GFP_HIGHMEM 0x02 + +/* Action modifiers - doesn't change the zoning */ +#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ +#define __GFP_HIGH 0x20 /* Should access emergency pools? */ +#define __GFP_IO 0x40 /* Can start low memory physical IO? */ +#define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */ +#define __GFP_FS 0x100 /* Can call down to low-level FS? 
*/ + +#define GFP_NOHIGHIO (__GFP_HIGH | __GFP_WAIT | __GFP_IO) +#define GFP_NOIO (__GFP_HIGH | __GFP_WAIT) +#define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO) +#define GFP_ATOMIC (__GFP_HIGH) +#define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM) +#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) + +/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some + platforms, used as appropriate on others */ + +#define GFP_DMA __GFP_DMA + +/* vma is the first one with address < vma->vm_end, + * and even address < vma->vm_start. Have to extend vma. */ +static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) +{ + unsigned long grow; + + /* + * vma->vm_start/vm_end cannot change under us because the caller is required + * to hold the mmap_sem in write mode. We need to get the spinlock only + * before relocating the vma range ourself. + */ + address &= PAGE_MASK; + spin_lock(&vma->vm_mm->page_table_lock); + grow = (vma->vm_start - address) >> PAGE_SHIFT; + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + vma->vm_start = address; + vma->vm_pgoff -= grow; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev); + +/* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. */ +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct * vma = find_vma(mm,start_addr); + + if (vma && end_addr <= vma->vm_start) + vma = NULL; + return vma; +} + +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); + +#endif /* __KERNEL__ */ + +#endif diff -urN linux-2.4.17-rc1-virgin/include/linux/mmzone.h linux-2.4.17-rc1-wli3/include/linux/mmzone.h --- linux-2.4.17-rc1-virgin/include/linux/mmzone.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/mmzone.h Sun Dec 16 18:05:00 2001 @@ -39,12 +39,15 @@ */ spinlock_t lock; unsigned long free_pages; + unsigned long inactive_clean_pages; + unsigned long inactive_dirty_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; /* * free areas of different sizes */ + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -112,9 +115,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; - -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. 
They are here as diff -urN linux-2.4.17-rc1-virgin/include/linux/pagemap.h linux-2.4.17-rc1-wli3/include/linux/pagemap.h --- linux-2.4.17-rc1-virgin/include/linux/pagemap.h Thu Nov 22 11:46:44 2001 +++ linux-2.4.17-rc1-wli3/include/linux/pagemap.h Sun Dec 16 18:16:02 2001 @@ -51,21 +51,17 @@ extern void page_cache_init(unsigned long); /* - * We use a power-of-two hash table to avoid a modulus, - * and get a reasonable hash by knowing roughly how the - * inode pointer and indexes are distributed (ie, we - * roughly know which bits are "significant") - * - * For the time being it will work for struct address_space too (most of - * them sitting inside the inodes). We might want to change it later. + * The multiplicative page cache hash from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 3 describes the behavior of the different page cache hash + * functions. This could be painful without integer multiplies, so + * perhaps for wider portability conditional definitions would win. + * -- wli */ -static inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index) +static inline unsigned long _page_hashfn (struct address_space *mapping, unsigned long index) { -#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1))) -#define s(x) ((x)+((x)>>PAGE_HASH_BITS)) - return s(i+index) & (PAGE_HASH_SIZE-1); -#undef i -#undef s + return ((((unsigned long) mapping + index) * 2654435761UL) >> + (32 - PAGE_HASH_BITS)) & (PAGE_HASH_SIZE - 1); } #define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index)) diff -urN linux-2.4.17-rc1-virgin/include/linux/sched.h linux-2.4.17-rc1-wli3/include/linux/sched.h --- linux-2.4.17-rc1-virgin/include/linux/sched.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/sched.h Mon Dec 17 00:12:14 2001 @@ -26,6 +26,7 @@ #include #include #include +#include struct exec_domain; @@ -88,6 +89,7 @@ #define TASK_UNINTERRUPTIBLE 2 #define TASK_ZOMBIE 4 #define TASK_STOPPED 8 +#define PREEMPT_ACTIVE 0x40000000 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -115,6 +117,21 @@ #define SCHED_OTHER 0 #define SCHED_FIFO 1 #define SCHED_RR 2 +#ifdef CONFIG_RTSCHED +#ifdef CONFIG_MAX_PRI +#if CONFIG_MAX_PRI < 99 +#define MAX_PRI 99 +#elif CONFIG_MAX_PRI > 2047 +#define MAX_PRI 2047 +#else +#define MAX_PRI CONFIG_MAX_PRI +#endif +#else +#define MAX_PRI 127 +#endif +#else +#define MAX_PRI 99 +#endif /* * This is an additional bit set when we want to @@ -154,6 +171,9 @@ #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); +#ifdef CONFIG_PREEMPT +asmlinkage void preempt_schedule(void); +#endif extern int schedule_task(struct tq_struct *task); extern void flush_scheduled_tasks(void); @@ -199,7 +219,9 @@ } /* Maximum number of active map areas.. 
This is a random (large) number */ -#define MAX_MAP_COUNT (65536) +#define DEFAULT_MAX_MAP_COUNT (65536) + +extern int max_map_count; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ @@ -283,7 +305,17 @@ * offsets of these are hardcoded elsewhere - touch with care */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - unsigned long flags; /* per process flags, defined below */ + /* + * We want the preempt_count in this cache line, but we + * a) don't want to mess up the offsets in asm code, and + * b) the alignment of the next line below, + * so we move "flags" down + * + * Also note we don't make preempt_count volatile, but we do + * need to make sure it is never hiding in a register when + * we have an interrupt, so we need to use barrier() + */ + int preempt_count; /* 0=> preemptable, < 0 => BUG */ int sigpending; mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead @@ -319,12 +351,14 @@ * that's just fine.) */ struct list_head run_list; +#ifdef CONFIG_RTSCHED + int counter_recalc; +#endif unsigned long sleep_time; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + unsigned long flags; /* task state */ struct linux_binfmt *binfmt; @@ -401,6 +435,10 @@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; +#ifdef CONFIG_RTSCHED + int effprio; /* effective real time priority */ + void (*newprio)(struct task_struct*, int); +#endif /* Thread group tracking */ u32 parent_exec_id; @@ -517,11 +555,22 @@ extern struct mm_struct init_mm; extern struct task_struct *init_tasks[NR_CPUS]; +/* + * A pid hash function using a prime near golden + * ratio to the machine word size (32 bits). The + * results of this are unknown. + * + * Added shift to extract high-order bits of computed + * hash function. + * -- wli + */ + /* PID hashing. (shouldnt this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) +#define PIDHASH_BITS 10 extern struct task_struct *pidhash[PIDHASH_SZ]; - -#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) +#define pid_hashfn(x) \ + (((2654435761UL*(x)) >> (BITS_PER_LONG-PIDHASH_BITS)) & (PIDHASH_SZ-1)) static inline void hash_pid(struct task_struct *p) { @@ -874,10 +923,16 @@ static inline void del_from_runqueue(struct task_struct * p) { +#ifdef CONFIG_RTSCHED +extern void __del_from_runqueue(struct task_struct * p); + + __del_from_runqueue(p); +#else nr_running--; p->sleep_time = jiffies; list_del(&p->run_list); p->run_list.next = NULL; +#endif } static inline int task_on_runqueue(struct task_struct *p) @@ -925,6 +980,11 @@ mntput(rootmnt); return res; } + +#define _TASK_STRUCT_DEFINED +#include +#include +#include #endif /* __KERNEL__ */ diff -urN linux-2.4.17-rc1-virgin/include/linux/segment_tree.h linux-2.4.17-rc1-wli3/include/linux/segment_tree.h --- linux-2.4.17-rc1-virgin/include/linux/segment_tree.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/segment_tree.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,362 @@ +/* + * linux/include/linux/segment_tree.h + * + * Copyright (C) Oct 2001 William Irwin, IBM + * + * Implementation of segment trees augmented with length information. + * + * In this context, "segment" refers to "line segment". In particular, + * I am storing closed intervals of numbers in this tree. One very + * important invariant maintained is that all the intervals in the + * tree are disjoint. 
This fact is actually used to help with efficient + * search, because since they are all disjoint, they are ordered + * according to any representative, in particular, the starting and + * ending points. + * + * The separate tree on length is used to help with searches for + * intervals of at least a particular length, and does not have + * any special properties otherwise. + */ + +#ifndef _SEGMENT_TREE_H +#define _SEGMENT_TREE_H + +#include +#include + +typedef struct segment_tree_node { + treap_node_t start; + treap_node_t length; +} segment_tree_node_t; + +typedef union segment_buf { + segment_tree_node_t segment; + union segment_buf *next; +} segment_buf_t; + +typedef struct segment_tree_root { + treap_node_t *start_tree; + treap_node_t *length_tree; +} segment_tree_root_t; + +#define segment_length(node) ((node)->length.value) +#define segment_start(node) ((node)->start.value) +#define segment_end(node) ((node)->start.value + (node)->length.value - 1) + +#define segment_above_point(node, point) \ + (segment_end(node) > (point)) + +#define segment_below_point(node, point) \ + (segment_start(node) < (point)) + +#define segment_contains_point(node, point) \ + (segment_start(node) <= (point) && segment_end(node) >= (point)) + +#define segment_above(node1, node2) \ + (segment_start(node1) > segment_end(node2)) + +#define segment_below(node1, node2) \ + (segment_end(node1) < segment_start(node2)) + +#define segment_disjoint(node1, node2) \ + (segment_above(node1, node2) || segment_below(node1, node2)) + +#define segment_intersect(node1, node2) \ + (segment_start(node1) <= segment_end(node2) \ + && segment_start(node2) <= segment_end(node1)) + +#define segment_contains(node1, node2) \ + (segment_start(node1) <= segment_start(node2) \ + && segment_end(node1) >= segment_end(node2)) + +#define segment_set_endpoints(node, start, end) \ + do { \ + segment_length(node) = (end) - (start) + 1; \ + segment_start(node) = (start); \ + } while(0) + +#define segment_unite(node1, node2) \ + segment_set_endpoints(node1, \ + min(segment_start(node1),segment_start(node2)), \ + max(segment_end(node1), segment_end(node2))) + +#define segment_union(seg_union, node1, node2) \ + segment_set_endpoints(seg_union, \ + min(segment_start(node1),segment_start(node2)), \ + max(segment_end(node1), segment_end(node2))) + +#define segment_intersection(intersect, node1, node2) \ + segment_set_endpoints(intersect, \ + max(segment_start(node1), segment_start(node2)), \ + min(segment_end(node1), segment_end(node2))) + +#define segment_set_start(node, start) \ + segment_set_endpoints(node, start, segment_end(node)) + +#define segment_set_end(node, end) \ + segment_set_endpoints(node, segment_start(node), end) + +#define start_segment_treap(node) \ + treap_entry((node), segment_tree_node_t, start) +#define length_segment_treap(node) \ + treap_entry((node), segment_tree_node_t, length) + +#define start_treap(node) segment_start(start_segment_treap(node)) +#define end_treap(node) segment_end(start_segment_treap(node)) + +static inline unsigned segment_tree_contains_point(segment_tree_node_t *root, + unsigned long point) +{ + treap_node_t *node; + + if(!root) + return 0; + + node = &root->start; + while(node) { + if(segment_contains_point(start_segment_treap(node), point)) + return 1; + else if(segment_below_point(start_segment_treap(node), point)) + node = node->right; + else if(segment_above_point(start_segment_treap(node), point)) + node = node->left; + else + BUG(); + } + return 0; +} + +static inline unsigned 
segment_tree_intersects(segment_tree_node_t *root, + segment_tree_node_t *segment) +{ + treap_node_t *node; + + if(!root) + return 0; + + node = &root->start; + while(node) { + if(segment_intersect(start_segment_treap(node), segment)) + return 1; + else if(segment_below(start_segment_treap(node), segment)) + node = node->right; + else if(segment_above(start_segment_treap(node), segment)) + node = node->left; + else + BUG(); + } + return 0; +} + +/* + * There are five cases here. + * (1) the segments are disjoint + * (2) the entire segment is removed + * (3) something from the beginning of the segment is removed + * (4) something from the end of the segment is removed + * (5) the segment is split into two fragments + */ +static inline void segment_complement( segment_tree_node_t **segment, + segment_tree_node_t *to_remove, + segment_tree_node_t **fragment) +{ + + if(segment_disjoint(*segment, to_remove)) { + + *fragment = NULL; + + } else if(segment_contains(to_remove, *segment)) { + + *segment = *fragment = NULL; + + } else if(segment_start(*segment) >= segment_start(to_remove)) { + unsigned long start, end; + *fragment = NULL; + start = segment_end(to_remove) + 1; + end = segment_end(*segment); + segment_set_endpoints(*segment, start, end); + + } else if(segment_end(*segment) <= segment_end(to_remove)) { + unsigned long start, end; + *fragment = NULL; + start = segment_start(*segment); + end = segment_start(to_remove) - 1; + segment_set_endpoints(*segment, start, end); + + } else { + unsigned long start_seg, end_seg, start_frag, end_frag; + + start_seg = segment_start(*segment); + end_seg = segment_start(to_remove) - 1; + + start_frag = segment_end(to_remove) + 1; + end_frag = segment_end(*segment); + + segment_set_endpoints(*segment, start_seg, end_seg); + segment_set_endpoints(*fragment, start_frag, end_frag); + + } +} + +/* + * Efficiently determining all possible line segments which intersect + * with another line segment requires splitting the start treap according + * to the endpoints. This is a derived key so it unfortunately may not be + * shared with the generic treap implementation. 
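+ *
+ * As a worked example of the intended use (see segment_all_intersect()
+ * below), suppose the tree holds the disjoint segments [0,9], [20,29]
+ * and [40,49] and we query for everything intersecting [25,44].  The
+ * end split first peels off the nodes whose end lies below 25 (here
+ * [0,9]); a second split then removes the nodes whose start lies above
+ * 44 (here none).  What remains, [20,29] and [40,49], is exactly the
+ * set of stored segments overlapping [25,44], and the two outer pieces
+ * are joined back together to form the residual tree.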
+ */ +static inline void segment_end_split(treap_root_t root, unsigned long end, + treap_root_t less, treap_root_t more) +{ + treap_root_t tree = root; + treap_node_t sentinel; + + sentinel.value = end; + sentinel.priority = ULONG_MAX; + sentinel.left = sentinel.right = sentinel.parent = NULL; + + while(1) { + if(!*root) { + *root = &sentinel; + goto finish; + } else if(end > end_treap(*root) && !(*root)->right) { + (*root)->right = &sentinel; + sentinel.parent = *root; + root = &(*root)->right; + goto upward; + } else if(end <= end_treap(*root) && !(*root)->left) { + (*root)->left = &sentinel; + sentinel.parent = *root; + root = &(*root)->left; + goto upward; + } else if(end > end_treap(*root)) + root = &(*root)->right; + else /* end <= end_treap(*root) */ + root = &(*root)->left; + } + +upward: + + while(1) { + if((*root)->left && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + goto finish; + else if(!(*root)->parent->parent) + root = tree; + else if((*root)->parent->parent->left == (*root)->parent) + root = &(*root)->parent->parent->left; + else if((*root)->parent->parent->right == (*root)->parent) + root = &(*root)->parent->parent->right; + } + +finish: + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + if(*more) (*more)->parent = NULL; + + *root = NULL; +} + +#define segment_length_link(node) \ + treap_node_link(&start_segment_treap(node)->length) + +#define segment_start_link(node) \ + treap_node_link(&start_segment_treap(node)->start) + +#define segment_delete(node) \ + do { \ + treap_root_delete(segment_start_link(node)); \ + treap_root_delete(segment_length_link(node)); \ + } while(0) + +static inline void segment_all_intersect(treap_root_t root, + unsigned long start, + unsigned long end, + treap_root_t intersect) +{ + treap_node_t *less_end, *more_end, *more_start, *less_start; + less_start = more_start = NULL; + + if(start) { + less_end = more_end = NULL; + segment_end_split(root, start, &less_end, &more_end); + treap_split(&more_end, end + 1, &less_start, &more_start); + *root = NULL; + treap_join(root, &less_end, &more_start); + } else { + treap_split(root, end + 1, &less_start, &more_start); + *root = more_start; + } + *intersect = less_start; +} + +#if 0 +/* + * If for some reason there is a reason to visualize the trees, + * the following routines may be useful examples as to how they + * may be rendered using dot from AT&T's graphviz. + */ + +extern void early_printk(const char *fmt, ...); + +static void print_ptr_graph(treap_root_t root) { + if(!*root) + return; + else if(!(*root)->marker) { + segment_tree_node_t *seg = start_segment_treap(*root); + (*root)->marker = 1UL; + early_printk("x%p [label=\"%p, start=%lu,\\nlength=%lu\"];\n", + *root, *root, segment_start(seg), segment_length(seg)); + if((*root)->parent) + early_printk("x%p -> x%p [label=\"parent\"];\n", + *root, (*root)->parent); + if((*root)->left) + early_printk("x%p -> x%p [label=\"left\"];\n", + *root, (*root)->left); + if((*root)->right) + early_printk("x%p -> x%p [label=\"right\"];\n", + *root, (*root)->right); + + print_ptr_graph(&(*root)->parent); + print_ptr_graph(&(*root)->left); + print_ptr_graph(&(*root)->right); + (*root)->marker = 0UL; + } + /* + * This is no good for cycle detection since we also traverse + * the parent links. It's -very- cyclic with those. 
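+ *
+ * To actually render the output, collect the node and edge statements
+ * printed above inside a "digraph G { ... }" wrapper and feed the file
+ * to dot (for example "dot -Tps tree.dot"), assuming graphviz is
+ * installed.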
+ */ +} +static void print_length_graph(treap_root_t root) { + if(!*root) + return; + else if(!(*root)->marker) { + segment_tree_node_t *seg = length_segment_treap(*root); + (*root)->marker = 1UL; + early_printk("x%p [label=\"%p: start=%lu,\\nlength=%lu\"];\n", + *root, *root, segment_start(seg), segment_length(seg)); + if((*root)->parent) + early_printk("x%p -> x%p [label=\"parent\"];\n", + *root, (*root)->parent); + if((*root)->left) + early_printk("x%p -> x%p [label=\"left\"];\n", + *root, (*root)->left); + if((*root)->right) + early_printk("x%p -> x%p [label=\"right\"];\n", + *root, (*root)->right); + + print_length_graph(&(*root)->parent); + print_length_graph(&(*root)->left); + print_length_graph(&(*root)->right); + (*root)->marker = 0UL; + } +} +#endif + +#endif /* _SEGMENT_TREE_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/smp.h linux-2.4.17-rc1-wli3/include/linux/smp.h --- linux-2.4.17-rc1-virgin/include/linux/smp.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/smp.h Sun Dec 16 18:05:00 2001 @@ -81,7 +81,9 @@ #define smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_threads_ready 1 +#ifndef CONFIG_PREEMPT #define kernel_lock() +#endif #define cpu_logical_map(cpu) 0 #define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) diff -urN linux-2.4.17-rc1-virgin/include/linux/smp_lock.h linux-2.4.17-rc1-wli3/include/linux/smp_lock.h --- linux-2.4.17-rc1-virgin/include/linux/smp_lock.h Thu Nov 22 11:46:27 2001 +++ linux-2.4.17-rc1-wli3/include/linux/smp_lock.h Sun Dec 16 18:16:02 2001 @@ -3,7 +3,7 @@ #include -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) diff -urN linux-2.4.17-rc1-virgin/include/linux/spinlock.h linux-2.4.17-rc1-wli3/include/linux/spinlock.h --- linux-2.4.17-rc1-virgin/include/linux/spinlock.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/spinlock.h Sun Dec 16 18:05:00 2001 @@ -2,6 +2,7 @@ #define __LINUX_SPINLOCK_H #include +#include /* * These are the generic versions of the spinlocks and read-write @@ -45,8 +46,10 @@ #if (DEBUG_SPINLOCKS < 1) +#ifndef CONFIG_PREEMPT #define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic) #define ATOMIC_DEC_AND_LOCK +#endif /* * Your basic spinlocks, allowing only a single CPU anywhere @@ -62,11 +65,11 @@ #endif #define spin_lock_init(lock) do { } while(0) -#define spin_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_spin_lock(lock) (void)(lock) /* Not "unused variable". */ #define spin_is_locked(lock) (0) -#define spin_trylock(lock) ({1; }) +#define _raw_spin_trylock(lock) ({1; }) #define spin_unlock_wait(lock) do { } while(0) -#define spin_unlock(lock) do { } while(0) +#define _raw_spin_unlock(lock) do { } while(0) #elif (DEBUG_SPINLOCKS < 2) @@ -125,12 +128,76 @@ #endif #define rwlock_init(lock) do { } while(0) -#define read_lock(lock) (void)(lock) /* Not "unused variable". */ -#define read_unlock(lock) do { } while(0) -#define write_lock(lock) (void)(lock) /* Not "unused variable". */ -#define write_unlock(lock) do { } while(0) +#define _raw_read_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_read_unlock(lock) do { } while(0) +#define _raw_write_lock(lock) (void)(lock) /* Not "unused variable". 
*/ +#define _raw_write_unlock(lock) do { } while(0) #endif /* !SMP */ + +#ifdef CONFIG_PREEMPT + +#define preempt_is_disabled() (current->preempt_count) +#define preempt_prefetch(a) prefetchw(a) + +#define preempt_disable() \ +do { \ + ++current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched() \ +do { \ + --current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable() \ +do { \ + --current->preempt_count; \ + barrier(); \ + if (unlikely((current->preempt_count == 0) && current->need_resched)) \ + preempt_schedule(); \ +} while (0) + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while(0) +#define spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _raw_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define read_lock(lock) ({preempt_disable(); _raw_read_lock(lock);}) +#define read_unlock(lock) ({_raw_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _raw_write_lock(lock);}) +#define write_unlock(lock) ({_raw_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable(); _raw_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) + +#else + +#define preempt_is_disabled() do { } while (0) +#define preempt_disable() do { } while (0) +#define preempt_enable_no_resched() +#define preempt_enable() do { } while (0) +#define preempt_prefetch(a) + +#define spin_lock(lock) _raw_spin_lock(lock) +#define spin_trylock(lock) _raw_spin_trylock(lock) +#define spin_unlock(lock) _raw_spin_unlock(lock) + +#define read_lock(lock) _raw_read_lock(lock) +#define read_unlock(lock) _raw_read_unlock(lock) +#define write_lock(lock) _raw_write_lock(lock) +#define write_unlock(lock) _raw_write_unlock(lock) +#define write_trylock(lock) _raw_write_trylock(lock) +#endif /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urN linux-2.4.17-rc1-virgin/include/linux/swap.h linux-2.4.17-rc1-wli3/include/linux/swap.h --- linux-2.4.17-rc1-virgin/include/linux/swap.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/swap.h Sun Dec 16 18:05:00 2001 @@ -86,8 +86,8 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; -extern int nr_inactive_pages; -extern atomic_t nr_async_pages; +extern int nr_inactive_dirty_pages; +extern int nr_inactive_clean_pages; extern atomic_t page_cache_size; extern atomic_t buffermem_pages; extern spinlock_t pagecache_lock; @@ -100,18 +100,42 @@ struct zone_t; +/* linux/mm/rmap.c */ +extern int FASTCALL(page_referenced(struct page *)); +extern void FASTCALL(page_add_rmap(struct page *, pte_t *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); + +/* try_to_unmap return values */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ +extern int total_swap_pages; extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * 
FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern void wakeup_kswapd(void); +extern int free_shortage(void); +extern int total_free_shortage(void); +extern int inactive_shortage(void); +extern int total_inactive_shortage(void); +extern unsigned int zone_free_shortage(zone_t *zone); +extern unsigned int zone_inactive_shortage(zone_t *zone); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -125,6 +149,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); @@ -158,7 +183,14 @@ extern spinlock_t pagemap_lru_lock; -extern void FASTCALL(mark_page_accessed(struct page *)); +/* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 /* * List add/del helper macros. These must be called @@ -166,39 +198,60 @@ */ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ - BUG(); \ if (PageActive(page)) \ BUG(); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ + BUG(); \ } while (0) -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ -} while (0) +#define add_page_to_active_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageActive(page); \ + list_add(&(page)->lru, &active_list); \ + nr_active_pages++; \ +} + +#define add_page_to_inactive_dirty_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactiveDirty(page); \ + list_add(&(page)->lru, &inactive_dirty_list); \ + nr_inactive_dirty_pages++; \ + page->zone->inactive_dirty_pages++; \ +} + +#define add_page_to_inactive_clean_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactiveClean(page); \ + list_add(&(page)->lru, &page->zone->inactive_clean_list); \ + page->zone->inactive_clean_pages++; \ + nr_inactive_clean_pages++; \ +} + +#define del_page_from_active_list(page) { \ + list_del(&(page)->lru); \ + ClearPageActive(page); \ + nr_active_pages--; \ + DEBUG_LRU_PAGE(page); \ +} + +#define del_page_from_inactive_dirty_list(page) { \ + list_del(&(page)->lru); \ + ClearPageInactiveDirty(page); \ + nr_inactive_dirty_pages--; \ + page->zone->inactive_dirty_pages--; \ + DEBUG_LRU_PAGE(page); \ +} + +#define del_page_from_inactive_clean_list(page) { \ + list_del(&(page)->lru); \ + ClearPageInactiveClean(page); \ + page->zone->inactive_clean_pages--; \ + nr_inactive_clean_pages--; \ + DEBUG_LRU_PAGE(page); \ +} -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - nr_inactive_pages--; \ -} while (0) extern spinlock_t swaplock; diff -urN linux-2.4.17-rc1-virgin/include/linux/swapctl.h linux-2.4.17-rc1-wli3/include/linux/swapctl.h --- linux-2.4.17-rc1-virgin/include/linux/swapctl.h Mon Sep 17 
16:15:02 2001 +++ linux-2.4.17-rc1-wli3/include/linux/swapctl.h Fri Dec 14 02:44:20 2001 @@ -10,4 +10,13 @@ typedef pager_daemon_v1 pager_daemon_t; extern pager_daemon_t pager_daemon; +typedef struct freepages_v1 +{ + unsigned int min; + unsigned int low; + unsigned int high; +} freepages_v1; +typedef freepages_v1 freepages_t; +extern freepages_t freepages; + #endif /* _LINUX_SWAPCTL_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/sysctl.h linux-2.4.17-rc1-wli3/include/linux/sysctl.h --- linux-2.4.17-rc1-virgin/include/linux/sysctl.h Mon Nov 26 05:29:17 2001 +++ linux-2.4.17-rc1-wli3/include/linux/sysctl.h Sun Dec 16 18:05:00 2001 @@ -140,6 +140,7 @@ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_MAX_MAP_COUNT=11, /* int: Maximum number of active map areas */ VM_MIN_READAHEAD=12, /* Min file readahead */ VM_MAX_READAHEAD=13 /* Max file readahead */ }; diff -urN linux-2.4.17-rc1-virgin/include/linux/tqueue.h linux-2.4.17-rc1-wli3/include/linux/tqueue.h --- linux-2.4.17-rc1-virgin/include/linux/tqueue.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/tqueue.h Sun Dec 16 18:05:00 2001 @@ -94,6 +94,22 @@ extern spinlock_t tqueue_lock; /* + * Call all "bottom halfs" on a given list. + */ + +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + __run_task_queue(list); +} + +#endif /* _LINUX_TQUEUE_H */ + +#if !defined(_LINUX_TQUEUE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_TQUEUE_H_INLINES +/* * Queue a task on a tq. Return non-zero if it was successfully * added. */ @@ -109,17 +125,4 @@ } return ret; } - -/* - * Call all "bottom halfs" on a given list. - */ - -extern void __run_task_queue(task_queue *list); - -static inline void run_task_queue(task_queue *list) -{ - if (TQ_ACTIVE(*list)) - __run_task_queue(list); -} - -#endif /* _LINUX_TQUEUE_H */ +#endif diff -urN linux-2.4.17-rc1-virgin/include/linux/treap.h linux-2.4.17-rc1-wli3/include/linux/treap.h --- linux-2.4.17-rc1-virgin/include/linux/treap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/treap.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,300 @@ +/* + * linux/include/linux/treap.h + * + * Copyright (C) 2001 William Irwin, IBM + * + * Simple treap implementation, following Aragon and Seidel. + * + * Treaps are a simple binary search tree structure, with a twist that + * radically simplifies their management. That is that they keep both + * the search key and a randomly generated priority. They are then both + * heap-ordered according to the priority and binary search tree ordered + * according to the search keys. They are specifically designed for, and + * also reputed to be effective at range tree and segment tree structures + * according to both Knuth and dynamic sets according to the + * Blelloch/Reid-Miller paper. + * + * The rotations themselves are simple, and they are done less often + * than for some kinds of trees, where splay trees where specifically + * mentioned by Knuth. The decision process as to when to perform a + * rotation is simplified by the heap structure. Rotations are done in + * two instances: when rotating a node down to a leaf position before + * deletion, and in restoring the heap ordering after an insertion. + * + * Treaps also support fast splitting and joining operations, which + * make them convenient for interval searches. 
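+ *
+ * As a small illustration of the two orderings, suppose keys 5, 2 and 8
+ * are inserted with (randomly drawn) priorities 7, 3 and 9.  Key 8 ends
+ * up at the root because it holds the largest priority, key 5 becomes
+ * its left child and key 2 the left child of 5; an in-order walk still
+ * visits 2, 5, 8.  Because the priorities are random, the expected
+ * depth stays logarithmic in the number of nodes.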
+ * + * One important fact to observe is that when joining, all of the + * members of the left tree must be less than all the members of + * the right tree, or otherwise the search tree ordering breaks. + */ + +#ifndef _TREAP_H +#define _TREAP_H + +#include + +typedef struct treap_node { + unsigned long priority; + unsigned long value; + struct treap_node *left, *right, *parent; + unsigned long marker; +} treap_node_t; + +typedef treap_node_t **treap_root_t; + +#define TREAP_INIT(root) \ + do { \ + *root = NULL; \ + } while(0) + +#define treap_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +#define treap_node_link(node) \ + ((!(node) || !(node)->parent) ? NULL : \ + ((node) == (node)->parent->left) ? &(node)->parent->left \ + : &(node)->parent->right) + +#define treap_find_parent_and_remove_child(tmp, parent) \ + do { \ + parent = tmp->parent; \ + if(parent && parent->left == tmp) \ + parent->left = NULL; \ + else if(parent && parent->right == tmp) \ + parent->right = NULL; \ + else if(parent) \ + BUG(); \ + } while(0) + + +#define treap_find_leftmost_leaf(node) \ + do { \ + if(!node) \ + break; \ + while(1) { \ + if(node->left) \ + node = node->left; \ + else if(node->right) \ + node = node->right; \ + else \ + break; \ + } \ + } while(0) + +/* + * The diagram according to which the assignments in rotation are done: + * + * T T + * | | + * y <- left x + * / \ / \ + * x C right -> A y + * / \ / \ + * A B B C + * + * Some of these assignments are not necessary, as the edges do + * not change. In these cases the assignments are retained as comments. + */ + +static inline void treap_rotate_left(treap_root_t root) +{ + treap_node_t *x, *y, *B, *T; + /* treap_node_t *A, *C; */ + + if(*root) { + x = *root; + T = x->parent; + y = x->right; + if(y) { + if(T && T->left == x) T->left = y; + if(T && T->right == x) T->right = y; + + y->parent = T; + *root = y; + + /* A = x->left; */ + + B = y->left; + + /* C = y->right; */ + + y->left = x; + x->parent = y; + + /* + x->left = A; + if(A) A->parent = x; + */ + + x->right = B; + if(B) B->parent = x; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +static inline void treap_rotate_right(treap_root_t root) +{ + treap_node_t *x, *y, *B, *T; + /* treap_node_t *A, *C; */ + + if(*root) { + y = *root; + T = y->parent; + x = y->left; + if(x) { + if(T && T->left == y) T->left = x; + if(T && T->right == y) T->right = x; + + x->parent = T; + *root = x; + + /* A = x->left; */ + + B = x->right; + + /* C = y->right; */ + + x->right = y; + y->parent = x; + + /* + x->left = A; + if(A) A->parent = x; + */ + + y->left = B; + if(B) B->parent = y; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +static inline treap_node_t *treap_root_delete(treap_root_t root) +{ + struct treap_node *tmp; + + while(1) { + + if(!root || !*root) return NULL; + else if(!(*root)->left && !(*root)->right) { + tmp = *root; + *root = tmp->parent = NULL; + return tmp; + } else if(!(*root)->left) { + treap_rotate_left(root); + root = &(*root)->left; + } else if(!(*root)->right) { + treap_rotate_right(root); + root = &(*root)->right; + } else if((*root)->left->priority > (*root)->right->priority) { + treap_rotate_right(root); + root = &(*root)->right; + } else { + treap_rotate_left(root); + root = &(*root)->left; + } + } +} + +static inline void treap_insert(treap_root_t root, treap_node_t *node) +{ + treap_root_t tree = root; + node->left = node->right = node->parent = NULL; + + while(1) { + if(!*root) { + *root = node; + 
return; + } else if(node->value <= (*root)->value && !(*root)->left) { + (*root)->left = node; + node->parent = *root; + root = &(*root)->left; + break; + } else if(node->value > (*root)->value && !(*root)->right) { + (*root)->right = node; + node->parent = *root; + root = &(*root)->right; + break; + } else if(node->value <= (*root)->value) { + root = &(*root)->left; + } else { /* node->value > (*root)->value */ + root = &(*root)->right; + } + } + while(1) { + if(!*root) return; + else if((*root)->left + && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + return; + else if(!(*root)->parent->parent) + root = tree; + else if((*root)->parent == (*root)->parent->parent->left) + root = &(*root)->parent->parent->left; + else if((*root)->parent == (*root)->parent->parent->right) + root = &(*root)->parent->parent->right; + + } +} + +static inline treap_node_t *treap_delete(treap_root_t root, unsigned long k) +{ + while(1) { + if(!*root) return NULL; + else if(k < (*root)->value) root = &(*root)->left; + else if(k > (*root)->value) root = &(*root)->right; + else return treap_root_delete(root); + } +} + +static inline void treap_split(treap_root_t root, unsigned long k, + treap_root_t less, treap_root_t more) +{ + treap_node_t sentinel; + + sentinel.value = k; + sentinel.priority = ULONG_MAX; + sentinel.parent = sentinel.left = sentinel.right = NULL; + + treap_insert(root, &sentinel); + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + if(*more) (*more)->parent = NULL; + + *root = NULL; +} + +static inline void treap_join(treap_root_t root, + treap_root_t left, treap_root_t right) +{ + treap_node_t sentinel; + sentinel.priority = 0UL; + sentinel.left = *left; + sentinel.right = *right; + sentinel.parent = NULL; + + if(*left) (*left)->parent = &sentinel; + if(*right) (*right)->parent = &sentinel; + + *root = &sentinel; + treap_root_delete(root); +} + +#endif /* _TREAP_H */ diff -urN linux-2.4.17-rc1-virgin/kernel/exit.c linux-2.4.17-rc1-wli3/kernel/exit.c --- linux-2.4.17-rc1-virgin/kernel/exit.c Wed Nov 21 14:42:27 2001 +++ linux-2.4.17-rc1-wli3/kernel/exit.c Sun Dec 16 17:58:10 2001 @@ -190,6 +190,8 @@ } i++; set >>= 1; + debug_lock_break(1); + conditional_schedule(); } } } @@ -273,6 +275,10 @@ struct mm_struct * start_lazy_tlb(void) { struct mm_struct *mm = current->mm; +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif current->mm = NULL; /* active_mm is still 'mm' */ atomic_inc(&mm->mm_count); @@ -284,6 +290,10 @@ { struct mm_struct *active_mm = current->active_mm; +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif current->mm = mm; if (mm != active_mm) { current->active_mm = mm; @@ -307,8 +317,8 @@ /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; - task_unlock(tsk); enter_lazy_tlb(mm, current, smp_processor_id()); + task_unlock(tsk); mmput(mm); } } diff -urN linux-2.4.17-rc1-virgin/kernel/fork.c linux-2.4.17-rc1-wli3/kernel/fork.c --- linux-2.4.17-rc1-virgin/kernel/fork.c Wed Nov 21 10:18:42 2001 +++ linux-2.4.17-rc1-wli3/kernel/fork.c Fri Dec 14 04:38:23 2001 @@ -260,9 +260,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; 
spin_unlock(&mmlist_lock); @@ -604,6 +601,12 @@ if (p->binfmt && p->binfmt->module) __MOD_INC_USE_COUNT(p->binfmt->module); +#ifdef CONFIG_PREEMPT + /* Since we are keeping the context switch off state as part + * of the context, make sure we start with it off. + */ + p->preempt_count = 1; +#endif p->did_exec = 0; p->swappable = 0; p->state = TASK_UNINTERRUPTIBLE; @@ -649,8 +652,6 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) @@ -682,10 +683,20 @@ * more scheduling fairness. This is only important in the first * timeslice, on the long run the scheduling behaviour is unchanged. */ + /* + * SCHED_FIFO tasks don't count down and have a negative counter. + * Don't change these, least they all end up at -1. + */ +#ifdef CONFIG_RTSCHED + if (p->policy != SCHED_FIFO) +#endif + { + p->counter = (current->counter + 1) >> 1; current->counter >>= 1; if (!current->counter) current->need_resched = 1; + } /* * Ok, add it to the run-queues and make it diff -urN linux-2.4.17-rc1-virgin/kernel/ksyms.c linux-2.4.17-rc1-wli3/kernel/ksyms.c --- linux-2.4.17-rc1-virgin/kernel/ksyms.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/ksyms.c Fri Dec 14 02:44:44 2001 @@ -436,6 +436,9 @@ EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(preempt_schedule); +#endif EXPORT_SYMBOL(schedule_timeout); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); diff -urN linux-2.4.17-rc1-virgin/kernel/ptrace.c linux-2.4.17-rc1-wli3/kernel/ptrace.c --- linux-2.4.17-rc1-virgin/kernel/ptrace.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/ptrace.c Fri Dec 14 04:06:29 2001 @@ -121,17 +121,119 @@ } /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space, one page at a time. */ +static int access_one_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + pgd_t * pgdir; + pmd_t * pgmiddle; + pte_t * pgtable; + char *maddr; + struct page *page; + +repeat: + spin_lock(&mm->page_table_lock); + pgdir = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgdir)) + goto fault_in_page; + if (pgd_bad(*pgdir)) + goto bad_pgd; + pgmiddle = pmd_offset(pgdir, addr); + if (pmd_none(*pgmiddle)) + goto fault_in_page; + if (pmd_bad(*pgmiddle)) + goto bad_pmd; + pgtable = pte_offset(pgmiddle, addr); + if (!pte_present(*pgtable)) + goto fault_in_page; + if (write && (!pte_write(*pgtable) || !pte_dirty(*pgtable))) + goto fault_in_page; + page = pte_page(*pgtable); + + /* ZERO_PAGE is special: reads from it are ok even though it's marked reserved */ + if (page != ZERO_PAGE(addr) || write) { + if ((!VALID_PAGE(page)) || PageReserved(page)) { + spin_unlock(&mm->page_table_lock); + return 0; + } + } + get_page(page); + spin_unlock(&mm->page_table_lock); + flush_cache_page(vma, addr); + + if (write) { + maddr = kmap(page); + memcpy(maddr + (addr & ~PAGE_MASK), buf, len); + flush_page_to_ram(page); + flush_icache_page(vma, page); + kunmap(page); + } else { + maddr = kmap(page); + memcpy(buf, maddr + (addr & ~PAGE_MASK), len); + flush_page_to_ram(page); + kunmap(page); + } + put_page(page); + return len; + +fault_in_page: + spin_unlock(&mm->page_table_lock); + /* -1: out of memory. 
0 - unmapped page */ + if (handle_mm_fault(mm, vma, addr, write) > 0) + goto repeat; + return 0; + +bad_pgd: + spin_unlock(&mm->page_table_lock); + pgd_ERROR(*pgdir); + return 0; + +bad_pmd: + spin_unlock(&mm->page_table_lock); + pmd_ERROR(*pgmiddle); + return 0; +} + +static int access_mm(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + int copied = 0; + + for (;;) { + unsigned long offset = addr & ~PAGE_MASK; + int this_len = PAGE_SIZE - offset; + int retval; + + if (this_len > len) + this_len = len; + retval = access_one_page(mm, vma, addr, buf, this_len, write); + copied += retval; + if (retval != this_len) + break; + + len -= retval; + if (!len) + break; + + addr += retval; + buf += retval; + + if (addr < vma->vm_end) + continue; + if (!vma->vm_next) + break; + if (vma->vm_next->vm_start != vma->vm_end) + break; + + vma = vma->vm_next; + } + return copied; +} int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) { + int copied; struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; + struct vm_area_struct * vma; /* Worry about races with exit() */ task_lock(tsk); @@ -143,41 +245,14 @@ return 0; down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(current, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; + vma = find_extend_vma(mm, addr); + copied = 0; + if (vma) + copied = access_mm(mm, vma, addr, buf, len, write); - flush_cache_page(vma, addr); - - maddr = kmap(page); - if (write) { - memcpy(maddr + offset, buf, bytes); - flush_page_to_ram(page); - flush_icache_page(vma, page); - } else { - memcpy(buf, maddr + offset, bytes); - flush_page_to_ram(page); - } - kunmap(page); - put_page(page); - len -= bytes; - buf += bytes; - } up_read(&mm->mmap_sem); mmput(mm); - - return buf - old_buf; + return copied; } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len) diff -urN linux-2.4.17-rc1-virgin/kernel/rtsched.h linux-2.4.17-rc1-wli3/kernel/rtsched.h --- linux-2.4.17-rc1-virgin/kernel/rtsched.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/kernel/rtsched.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,1218 @@ +/* + * linux/kernel/rtsched.h + * + * NOTE: This is a .h file that is mostly source, not the usual convention. + * It is coded this way to allow the depend rules to correctly set + * up the make file dependencies. This is an alternate scheduler + * that replaces the core scheduler in sched.c. It does not, however, + * replace most of the static support functions that call schedule. + * By making this an include file for sched.c, all of those functions + * are retained without the need for duplicate code and its attendant + * support issues. At the same time, keeping it a seperate file allows + * diff and patch to work most cleanly and correctly. + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001 MontaVista Software Inc. + * + * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 2000-03-15 Added the Real Time run queue support by George Anzinger + * 2000-8-29 Added code to do lazy recalculation of counters + * by George Anzinger + */ + +/* + * 'sched.c' is the main kernel file. 
It contains scheduling primitives + * (sleep_on, wakeup, schedule etc) as well as a number of simple system + * call functions (type getpid()), which just extract a field from + * current-task + */ + +#ifndef preempt_disable +#define preempt_disable() +#define preempt_enable() +#define preempt_is_disabled() 0 +#define preempt_enable_no_resched() +#endif + +/* + * scheduler variables + */ +#define VERSION_DATE "<20011203.1609.50>" +/* + * We align per-CPU scheduling data on cacheline boundaries, + * to prevent cacheline ping-pong. + */ +static union { + struct schedule_data { + struct task_struct * curr; + cycles_t last_schedule; + struct list_head schedule_data_list; + int cpu,effprio; + } schedule_data; + char __pad [SMP_CACHE_BYTES]; +} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0,{0,0},0,0}}}; + +#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr +static void newprio_ready_q(struct task_struct * tptr,int newprio); +#ifdef CONFIG_SMP +static void newprio_executing(struct task_struct *tptr,int newprio); +static struct list_head hed_cpu_prio __cacheline_aligned = + LIST_HEAD_INIT(hed_cpu_prio); +#endif +/* + * task_on_rq tests for task actually in the ready queue. + * task_on_runque tests for task either on ready queue or being executed + * (by virtue of our seting a running tasks run_list.next to 1) + */ +#define task_on_rq(p) ((unsigned)p->run_list.next > 1) + +static struct list_head rq[MAX_PRI+1] ____cacheline_aligned; + +static struct ready_queue { + int recalc; /* # of counter recalculations on SCHED_OTHER */ + int ticks; /* # of ticks for all in SCHED_OTHER ready Q */ +} runq ____cacheline_aligned; + +/* set the bit map up with guard bits below. This will result in + * priority -1 if there are no tasks in the ready queue which will + * happen as we are not putting the idle tasks in the ready queue. + */ +static struct { + int guard; + int rq_bit_ary[(MAX_PRI/32) +1]; +}rq_bits = {-1,{0,0,0,0}}; +#define rq_bit_map rq_bits.rq_bit_ary + +static int high_prio=0; + +#define Rdy_Q_Hed(pri) &rq[pri] + +#define PREEMPTION_THRESHOLD 1 + +#define NOT_RT 0 /* Use priority zero for non-RT processes */ +#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule + +struct kernel_stat kstat; + +#ifdef CONFIG_SMP + +/* + * At the moment, we will ignor cpus_allowed, primarily because if it were + * used, we would have a conflict in the runq.ticks count (i.e. since we + * are not scheduleing some tasks, the count would not reflect what is + * is really on the list). Oh, and also, nowhere is there code in the + * kernel to set cpus_allowed to anything but -1. In the long run, we + * would like to try seperate lists for each cpu, at which point + * cpus_allowed could be used to direct the task to the proper list. + + * Well, darn, now there is code that messes with cpus_allowed. We will change + * sometime soon.... + */ + +#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) +#define can_schedule(p,cpu) \ + ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) + +#else + +#define idle_task(cpu) (&init_task) +#define can_schedule(p,cpu) (1) + +#endif + +void scheduling_functions_start_here(void) { } + +/* + * This is the function that decides how desirable a process is.. + * You can weigh different processes against each other depending + * on what CPU they've run on lately etc to try to handle cache + * and TLB miss penalties. 
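+ *
+ * As a rough illustration on UP (the SMP same-processor bonus aside):
+ * a SCHED_OTHER task with, say, counter 6 and nice 0 scores 6 + 20 = 26,
+ * or 27 if it also shares the current mm, while a task whose counter
+ * has run down to 0 scores 0, which is the signal that the counters
+ * need to be recalculated.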
+ * + * Return values: + * -1000: never select this + * 0: out of time, recalculate counters (but it might still be + * selected) + * +ve: "goodness" value (the larger, the better) + */ + +static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +{ + int weight; + + /* + * goodness is NEVER called for Realtime processes! + * Realtime process, select the first one on the + * runqueue (taking priorities within processes + * into account). + + */ + /* + * Give the process a first-approximation goodness value + * according to the number of clock-ticks it has left. + * + * Don't do any other calculations if the time slice is + * over or if this is an idle task. + */ + weight = p->counter; + if (weight <= 0) + goto out; + +#ifdef CONFIG_SMP + /* Give a largish advantage to the same processor... */ + /* (this is equivalent to penalizing other processors) */ + if (p->processor == this_cpu) + weight += PROC_CHANGE_PENALTY; +#endif + + /* .. and a slight advantage to the current MM */ + if (p->mm == this_mm || !p->mm) + weight += 1; + weight += 20 - p->nice; + +out: + return weight; +} + +/* + * the 'goodness value' of replacing a process on a given CPU. + * positive value means 'replace', zero or negative means 'dont'. + */ +static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +{ + return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); +} + +/* + * This is ugly, but reschedule_idle() is very timing-critical. + * We are called with the runqueue spinlock held and we must + * not claim the tasklist_lock. + */ +static FASTCALL(void reschedule_idle(struct task_struct * p)); + +static void reschedule_idle(struct task_struct * p) +{ +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(), target_cpu; + struct task_struct *target_tsk; + struct list_head *cptr; + struct schedule_data *sch; + int best_cpu; + + /* + * shortcut if the woken up task's last CPU is + * idle now. + */ + best_cpu = p->processor; + target_tsk = idle_task(best_cpu); + if (cpu_curr(best_cpu) == target_tsk) + goto preempt_now; + /* + * For real time, the choice is simple. We just check + * if the most available processor is working on a lower + * priority task. If so we bounce it, if not, there is + * nothing more important than what we are doing. + * Note that this will pick up any idle cpu(s) we may + * have as they will have effprio of -1. + */ + cptr = hed_cpu_prio.prev; + sch = list_entry(cptr, + struct schedule_data, + schedule_data_list); + target_tsk = sch->curr; + if (p->effprio > sch->effprio){ + goto preempt_now; + } + /* + * If all cpus are doing real time and we failed + * above, then there is no help for this task. + */ + if ( sch->effprio ) + goto out_no_target; + /* + * Non-real time contender and one or more processors + * doing non-real time things. + + * So we have a non-real time task contending among + * other non-real time tasks on one or more processors + * We know we have no idle cpus. + */ + /* + * No CPU is idle, but maybe this process has enough priority + * to preempt it's preferred CPU. 
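+ *
+ * For illustration: only a non-real-time current (effprio == 0) is
+ * considered here, and any positive goodness difference is enough on
+ * the woken task's preferred CPU, so a task scoring, say, 26 displaces
+ * a current task scoring 25 or less.  The other CPUs scanned below are
+ * only preempted when the difference exceeds PREEMPTION_THRESHOLD.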
+ */ + target_tsk = cpu_curr(best_cpu); + if (target_tsk->effprio == 0 && + preemption_goodness(target_tsk, p, best_cpu) > 0) + goto preempt_now; + + for (; cptr != &hed_cpu_prio; cptr = cptr->prev ){ + sch =list_entry(cptr, + struct schedule_data, + schedule_data_list); + if (sch->effprio != 0) + break; + if (sch->cpu != best_cpu){ + target_tsk = sch->curr; + if ( preemption_goodness(target_tsk, p, sch->cpu) > + PREEMPTION_THRESHOLD) + goto preempt_now; + } + + } + +out_no_target: + return; + +preempt_now: + target_cpu = target_tsk->processor; + target_tsk->need_resched = 1; + /* + * the APIC stuff can go outside of the lock because + * it uses no task information, only CPU#. + */ + if ((target_cpu != this_cpu) + && (target_tsk != idle_task(target_cpu))) + smp_send_reschedule(target_cpu); + return; +#else /* UP */ + struct task_struct *tsk; + + tsk = cpu_curr(0); + if ((high_prio > tsk->effprio) || + (!tsk->effprio && preemption_goodness(tsk, p, 0) > + PREEMPTION_THRESHOLD)){ + tsk->need_resched = 1; + } +#endif +} + +/* + * This routine maintains the list of smp processors. This is + * a by directional list maintained in priority order. The above + * code used this list to find a processor to use for a new task. + * The search will be backward thru the list as we want to take + * the lowest prioity cpu first. We put equal prioities such that + * the new one will be ahead of the old, so the new should stay + * around a bit longer. + */ + +#ifdef CONFIG_SMP +static inline void re_queue_cpu(struct task_struct *next, + struct schedule_data *sch) +{ + struct list_head *cpuptr; + list_del(&sch->schedule_data_list); + sch->effprio = next->effprio; + cpuptr = hed_cpu_prio.next; + while (cpuptr != &hed_cpu_prio && + sch->effprio < list_entry(cpuptr, + struct schedule_data, + schedule_data_list)->effprio + ) + cpuptr = cpuptr->next; + list_add_tail(&sch->schedule_data_list,cpuptr); + next->newprio = &newprio_executing; +} +#else +#define re_queue_cpu(a,b) +#endif +/* + * Careful! + * + * This has to add the process to the _beginning_ of the + * run-queue, not the end. See the comment about "This is + * subtle" in the scheduler proper.. + * + * For real time tasks we do this a bit differently. We + * keep a priority list of ready tasks. We remove tasks + * from this list when they are running so a running real + * time task will not be in either the ready list or the run + * queue. Also, in the name of speed and real time, only + * priority is important so we spend a few bytes on the queue. + * We have a doubly linked list for each priority. This makes + * Insert and removal very fast. We also keep a bit map of + * the priority queues where a bit says if the queue is empty + * or not. We also keep loose track of the highest priority + * queue that is currently occupied. This high_prio mark + * is updated when a higher priority task enters the ready + * queue and only goes down when we look for a task in the + * ready queue at high_prio and find none. Then, and only + * then, we examine the bit map to find the true high_prio. 
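+ *
+ * For illustration, with MAX_PRI == 127 a priority p occupies bit
+ * (31 - (p & 0x1f)) of rq_bit_map[p >> 5], so priority 99 sits in bit
+ * 28 of word 3.  Storing the bits flipped this way means the lowest
+ * set bit of the highest non-zero word, which is what ffz(~word)
+ * returns, is the highest occupied priority, so the scan in
+ * get_next_task() recovers high_prio in a single pass over at most
+ * four words plus the guard.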
+ */ + +#define BF 31 /* bit flip constant */ +#define set_rq_bit(bit) set_bit(BF-((bit)&0x1f),&rq_bit_map[(bit) >> 5]) +#define clear_rq_bit(bit) clear_bit(BF-((bit)&0x1f),&rq_bit_map[(bit) >> 5]) + +static inline void _del_from_runqueue(struct task_struct * p) +{ + nr_running--; + list_del( &p->run_list ); + if (list_empty(Rdy_Q_Hed(p->effprio))){ + clear_rq_bit(p->effprio); + } + /* p->run_list.next = NULL; !=0 prevents requeue */ + p->run_list.next = NULL; + p->newprio = NULL; + if( !p->effprio) runq.ticks -= p->counter; +} +/* Exported for main.c, also used in init code here */ +void __del_from_runqueue(struct task_struct * p) +{ + _del_from_runqueue(p); +} +static inline struct task_struct * get_next_task(struct task_struct * prev, + int this_cpu) +{ + struct list_head *next, *rqptr; + struct task_struct *it=0; + int *i,c,oldcounter; + + repeat_schedule: + rqptr = Rdy_Q_Hed(high_prio); + next = rqptr->next; + if (unlikely( next == rqptr)){ + for (i=&rq_bit_map[MAX_PRI/32],high_prio=BF+((MAX_PRI/32)*32); + (*i == 0);high_prio -=32,i--); + high_prio -= ffz(~*i); + if (unlikely(high_prio < 0)){ + /* + * No tasks to run, return this cpu's idle task + * It is not in the ready queue, so no need to remove it. + * But first make sure its priority keeps it out of + * the way. + */ + high_prio = 0; + it = idle_task(this_cpu); + it->effprio = -1; + return it; + } + goto repeat_schedule; + } + /* + * If there is only one task on the list, it is a no brainer. + * But really, this also prevents us from looping on recalulation + * if the one and only task is trying to yield. These sort of + * loops are NOT_FUN. Note: we use likely() to tilt toward + * real-time tasks, even thou they are, usually unlikely. We + * are, after all, a real time scheduler. + */ + if ( likely(high_prio || next->next == rqptr)){ + it = list_entry(next, struct task_struct, run_list); + back_from_figure_non_rt_next: + _del_from_runqueue(it); + return it; + } + /* + * Here we set up a SCHED_OTHER yield. Note that for other policies + * yield is handled else where. This means we can use == and = + * instead of & and &= to test and clear the flag. If the prev + * task has all the runq.ticks, then we just do the recaculation + * version and let the winner take all (yield fails). Otherwise + * we fource the counter to zero for the loop and put it back + * after we found some other task. We must remember to update + * runq.ticks during all this. Also, we don't give it all back + * if the yielder has more than the next guy. + */ + oldcounter = 0; + if ( unlikely(prev->policy == (SCHED_YIELD | SCHED_OTHER)) ){ + if ( unlikely(prev->counter == runq.ticks)) { + prev->policy = SCHED_OTHER; + runq.ticks = 0; + }else{ + oldcounter = prev->counter; + prev->counter = 0; + } + } + c = -1000; + if (likely(runq.ticks > 0)) { + do { + int weight; + struct task_struct *p = + list_entry(next, struct task_struct, run_list); + /* if (can_schedule(p, this_cpu))*/ { + weight = goodness(p, this_cpu, prev->active_mm); + if (weight > c) + c = weight, it = p; + } + next = next->next; + } while (next != rqptr); + /* + * if we get out of sync with the runq.ticks counter + * force it to 0 and catch it next time around. Note we + * catch a negative counter on entry. + */ + if ( unlikely(c <= 0 )){ + runq.ticks = 0; + } + }else{ +#ifdef CONFIG_SMP + /* + * Here we update the tasks that are current on other + * processors + */ + struct list_head *wkptr, + *cptr=&aligned_data[(this_cpu)]. + schedule_data. 
+ schedule_data_list; + + runq.ticks = 0; + list_for_each ( wkptr, &hed_cpu_prio) { + struct task_struct *p; + if (cptr == wkptr ) continue; + p = list_entry(wkptr, + struct schedule_data, + schedule_data_list)->curr; + if ( p->effprio == 0){ + p->counter = (p->counter >> 1) + + NICE_TO_TICKS(p->nice); + p->counter_recalc++; + } + } +#else + runq.ticks = 0; +#endif + runq.recalc++; + do { + int weight; + struct task_struct *p = + list_entry(next, struct task_struct, run_list); + runq.ticks += + p->counter = NICE_TO_TICKS(p->nice); + p->counter_recalc++; + /* if (can_schedule(p, this_cpu)) */ + { + weight = goodness(p, this_cpu, prev->active_mm); + if (weight > c) + c = weight, it = p; + } + next = next->next; + } while (next != rqptr); + } + /* Undo the stuff we did for SCHED_YIELD. We know we did something + * if oldcounter != 0. + */ + if (unlikely(oldcounter)){ + + prev->counter = (it->counter < oldcounter) ? + it->counter : + oldcounter; + runq.ticks += prev->counter-oldcounter; + prev->policy &= ~SCHED_YIELD; + } + goto back_from_figure_non_rt_next; + +} +/* Add to the head of the run queue */ +static inline void add_to_runqueue(struct task_struct * p,int cpu) +{ + struct list_head *next; + int prio; + /* idle tasks, don't get put in the list */ + if (unlikely(p == idle_task(cpu))) return; + prio = p->effprio; + next = Rdy_Q_Hed(prio); + if (list_empty(next)) { /* an empty queue */ + set_rq_bit(prio); + if (high_prio < prio) { + high_prio = prio; + } + } + list_add(&p->run_list,next); + p->newprio = newprio_ready_q; + if ( likely(!p->effprio )) { + int diff,c; + if ((diff = runq.recalc - p->counter_recalc) != 0) { + p->counter_recalc = runq.recalc; + c = NICE_TO_TICKS(p->nice) << 1; + p->counter = diff > 8 ? c - 1 : /* max priority */ + c + ((p->counter - c) >> diff); + } + runq.ticks += p->counter; + } + nr_running++; +} + +/* + * This function is only called from schedule() so it need not worry + * about updating the counter as it should never be out of date. + * If you change this, remember to do the update. + */ +static inline void add_last_runqueue(struct task_struct * p) +{ + struct list_head *next = Rdy_Q_Hed(p->effprio); + + if (list_empty(next)) { /* empty list, set the bit */ + set_rq_bit(p->effprio); + if (p->effprio > high_prio){ + high_prio = p->effprio; + } + } + list_add_tail(&p->run_list,next); + p->newprio = newprio_ready_q; + if ( !p->effprio ) runq.ticks += p->counter; + nr_running++; +} + + +static inline void move_first_runqueue(struct task_struct * p) +{ + list_del(&p->run_list); + list_add_tail(&p->run_list, Rdy_Q_Hed(p->effprio)); +} +/* + * When we have a task in some queue by priority, we need + * to provide a way to change that priority. Depending on the + * queue we must do different things. We handle this by putting + * a function address in the task_struct (newprio()). + * + * First a front end routine to take care of the case were the task + * is not in any priority queues. We take the runqueue_lock + * here, so the caller must not. Since we may be called + * recursively, protect against a dead lock. + */ +static struct task_struct *newprio_inuse; +static int newprio_inuse_count; + +void set_newprio(struct task_struct * tptr, int newprio) +{ + if ( newprio_inuse != current){ + spin_lock_irq(&runqueue_lock); + newprio_inuse = current; + } + newprio_inuse_count++; + if (! tptr->newprio ) { + tptr->effprio = newprio; + }else if ( tptr->effprio != newprio) { + tptr->newprio(tptr,newprio); + } + if ( ! 
--newprio_inuse_count ){ + spin_unlock_irq(&runqueue_lock); + newprio_inuse = 0; + } +} + + +/* + * Here are the routines we use for the ready queue and an executing + * process. Note that the executing process may fall out of favor + * as a result of the change. We do the right thing. Note that newprio + * is not cleared so we test here to see if the task is still running. + */ + +static void newprio_ready_q(struct task_struct * tptr,int newprio) +{ + _del_from_runqueue(tptr); + tptr->effprio = newprio; + add_to_runqueue(tptr,0); + reschedule_idle(tptr); +} +#ifdef CONFIG_SMP +static void newprio_executing(struct task_struct *tptr,int newprio) +{ + int cpu; + struct schedule_data *sched_data; + if(!newprio || newprio < tptr->effprio){ + tptr->need_resched = 1; + } + cpu = tptr->processor; + sched_data = & aligned_data[cpu].schedule_data; + tptr->effprio = newprio; + if( sched_data->curr != tptr) return; /* if not expected, out of here */ + re_queue_cpu(tptr,sched_data); + if ((cpu != smp_processor_id()) && tptr->need_resched) + smp_send_reschedule(cpu); +} +#endif + + + +/* + * Wake up a process. Put it on the ready-queue if it's not + * already there. The "current" process is not on the + * ready-queue (it makes it much easier to figure out if we + * need to preempt, esp. the real time case). It is possible + * to wake the current process. This happens when it is waken + * before schedule has had a chance to put it properly to + * sleep. If schedule did not turn on ints in the middle of + * things this would all be ok, however, it does so we have the + * possibility of being in that window. + * The "current" process is never on the + * run-queue (except when the actual re-schedule is in + * progress), and as such you're allowed to do the simpler + * "current->state = TASK_RUNNING" to mark yourself runnable + * without the overhead of this. + */ +static inline int try_to_wake_up(struct task_struct * p, int synchronous) +{ + unsigned long flags; + int success = 0; + + /* + * We want the common case fall through straight, thus the goto. + */ + spin_lock_irqsave(&runqueue_lock, flags); + p->state = TASK_RUNNING; + if ( task_on_runqueue(p) ) + goto out; + add_to_runqueue(p,0); + if (!synchronous /*|| !(p->cpus_allowed & (1 << smp_processor_id())*/) + reschedule_idle(p); + success = 1; +out: + spin_unlock_irqrestore(&runqueue_lock, flags); + return success; +} + +inline int wake_up_process(struct task_struct * p) +{ + return try_to_wake_up(p, 0); +} +/* + * schedule_tail() is getting called from the fork return path. This + * cleans up all remaining scheduler things, without impacting the + * common case. + */ +static inline void __schedule_tail(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + + /* + * fast path falls through. We have to clear cpus_runnable before + * checking prev->state to avoid a wakeup race. Protect against + * the task exiting early. + */ + task_lock(prev); + task_release_cpu(prev); + mb(); + if (task_on_rq(prev)) + goto needs_resched; + +out_unlock: + task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ + return; + + /* + * Slow path - we 'push' the previous process and + * reschedule_idle() will attempt to find a new + * processor for it. (but it might preempt the + * current process as well.) We must take the runqueue + * lock and re-check prev->state to be correct. It might + * still happen that this process has a preemption + * 'in progress' already - but this is not a problem and + * might happen in other circumstances as well. 
+ */ +needs_resched: + { + unsigned long flags; + + /* + * Avoid taking the runqueue lock in cases where + * no preemption-check is necessery: + * Note: Idle task is NEVER on the ready queue so + * no need to check if prev was idle. + */ + + spin_lock_irqsave(&runqueue_lock, flags); + if (task_on_rq(prev) /* && !task_has_cpu(prev)*/ ) + reschedule_idle(prev); + spin_unlock_irqrestore(&runqueue_lock, flags); + goto out_unlock; + } +#define smp_label_a _smp_label_a: +#define smp_label_b _smp_label_b: +#else + prev->policy &= ~SCHED_YIELD; +#define smp_label_a +#define smp_label_b +#endif /* CONFIG_SMP */ +} + +asmlinkage void schedule_tail(struct task_struct *prev) +{ + __schedule_tail(prev); + preempt_enable(); +} + +/* + * 'schedule()' is the scheduler function. It's a very simple and nice + * scheduler: it's not perfect, but certainly works for most things. + * + * The goto is "interesting". + * + * NOTE!! Task 0 is the 'idle' task, which gets called when no other + * tasks can run. It can not be killed, and it cannot sleep. The 'state' + * information in task[0] is never used. + */ +asmlinkage void schedule(void) +{ + struct schedule_data * sched_data; + struct task_struct *prev, *next; + int this_cpu; + + spin_lock_prefetch(&runqueue_lock); + try_try_again: + + preempt_disable(); + + if (unlikely(!current->active_mm)) BUG(); + prev = current; + this_cpu = prev->processor; + + if (unlikely(in_interrupt())) { + printk("Scheduling in interrupt\n"); + BUG(); + } + + release_kernel_lock(prev, this_cpu); + + /* + * 'sched_data' is protected by the fact that we can run + * only one process per CPU. + */ + sched_data = & aligned_data[this_cpu].schedule_data; + + spin_lock_irq(&runqueue_lock); + +#ifdef CONFIG_PREEMPT + /* + * Note that this is an '&' NOT an '&&'... + */ + if (preempt_is_disabled() & PREEMPT_ACTIVE) goto sw_TASK_RUNNING; +#endif + if (prev->state == TASK_INTERRUPTIBLE) { + //case TASK_INTERRUPTIBLE: + if (likely( ! signal_pending(prev))) { + goto sw_default; + } + prev->state = TASK_RUNNING; + } + + if (prev->state != TASK_RUNNING) { + goto sw_default; + } + //case TASK_RUNNING: +#ifdef CONFIG_PREEMPT + sw_TASK_RUNNING: +#endif + /* + * move an exhausted RR process to be last.. + * Do the same for Yields + */ + if (!prev->counter && (prev->policy & SCHED_RR)) + goto move_rr_last; + if (prev->policy & SCHED_YIELD) + goto move_yield_last; + /* + * There is a case where current is already + * in the ready que. That is where it was + * on the way out, but the wait already + * expired, so wake_up_process has already + * done it. In this case, we don't!! + */ + if (!task_on_rq(prev)) + add_to_runqueue(prev,this_cpu); + goto move_rr_back; + //default: + sw_default: + prev->sleep_time = jiffies; + prev->run_list.next = 0; + + move_rr_back: + prev->need_resched = 0; + smp_label_a + next = get_next_task(prev, this_cpu); + smp_label_b + next->run_list.next = (struct list_head *)1; + sched_data->curr = next; + re_queue_cpu(next,sched_data); + spin_unlock_irq(&runqueue_lock); + + if (unlikely(prev == next)) { + goto same_process; + } + +#ifdef CONFIG_SMP + /* + * maintain the per-process 'last schedule' value. + * (this has to be recalculated even if we reschedule to + * the same process) Currently this is only used on SMP, + * and it's approximate, so we do not have to maintain + * it while holding the runqueue spinlock. 
+ */ + sched_data->last_schedule = get_cycles(); + + /* + * We drop the scheduler lock early (it's a global spinlock), + * thus we have to lock the previous process from getting + * rescheduled during switch_to() (since we are still on his stack). + * + * Here is how we do it. The cpus_runnable flag will be held until + * the task is truly available. On the other hand, this task + * is put in the ready queue during the above runqueue_lock so + * it may be picked up by another cpu. Suppose that cpu is this + * one. Now the prior cpu left the task in the ready queue and + * we have just pluck it from there. No conflict so far, but if + * cpus_runnable is not clear, the other cpu is still in the switch code. + * There are no locks there SAVE THIS ONE!!! Oh woe is me! + * At the same time, under these conditions, i.e. a task is + * coming out of the ready queue before we actually switch, it + * would be good to not switch cpus. So lets define a "wanted" + * bit in the cpus_runnable member. Oops, it is now a cpu bit mask + * so, since only a few folks look at it, we will fudge it a bit. + * Choose an addition that is more than on bit away from a single bit + * + + * We will spin here waiting for cpus_runnable to go to zero. Until + * this happens, we must not change the processor value as + * interrupt code depends on this being right for "current". + */ +#define WANTED 10 +#define TAKEN 20 + { + unsigned long cur_cpus_runnable = next->cpus_runnable; + + atomic_add(WANTED,(atomic_t *)&next->cpus_runnable); + /* + * It is either "WANTED+cur_cpus_runnable" which means we + * need to wait or is: + * A. The old cpu_id + WANTED or + * B. WANTED - 1 which means it cleared (or was clear). + * C. TAKEN + cur_cpus_runnable + */ + while ((cur_cpus_runnable != ~0UL) && + (volatile int)next->cpus_runnable == + WANTED + cur_cpus_runnable) { + unsigned long my_cpu = 1 << this_cpu; + + barrier(); + /* + * OK, so while we wait, lets look in on prev and see + * if he is wanted. + */ + if ( (volatile int)prev->cpus_runnable != my_cpu) { + /* + * Another cpu wants the task we have yet to + * switch away from. Lets steal it back. + * Once WANTED is set on prev, we can clear it + * either here or in schedule_tail. The other + * cpu can clear it by coming here where it will + * be known by him as next... + + * Here, we set it to (TAKEN+my_cpu), in + * schedule_tail it is set to my_cpu + */ + spin_lock_irq(&runqueue_lock); + if ( (volatile int)prev->cpus_runnable != my_cpu) { + spin_unlock_irq(&runqueue_lock); + continue; + } + /* + * Three possibilities on the state of next: + * 0.) cpus_runnable has gone to ~0UL. Means the + * prior cpu has finished and is not + * interested. So put back in ready queue. + * 5.) Other cpu noticed our interest and stoled + * it back (cpus_runnable will be + * TAKEN + his flag). Do nothing. + * 3.) No change, put back in the ready queue + * Note, case 3 presents a bit of a race on our + * clearing the WANTED bit. So, we subtract and + * if the result is negative, set it to zero. + */ + if ( (volatile int)next->cpus_runnable != + cur_cpus_runnable + TAKEN) { + atomic_add(-WANTED, + (atomic_t *)&next->cpus_runnable); + if ((volatile int)next->cpus_runnable < 0) { + next->cpus_runnable = ~0UL; + } + add_to_runqueue(next,this_cpu); + } + /* + * So much for "next". Now lets take prev. + * Setting cpus_runnable to TAKEN+old will pop the + * waiter out of the wait loop. + * We then wait for him to clear TAKEN to + * complete the handshake. 
We hand shake here + * to keep the other cpu from seeing some later + * state that may be wrong. + */ + prev->cpus_runnable = TAKEN + my_cpu; + next = prev; + spin_unlock_irq(&runqueue_lock); + while ((volatile int)prev->cpus_runnable == + TAKEN + my_cpu) { + barrier(); + } + spin_lock_irq(&runqueue_lock); + goto _smp_label_b; + } + } + /* + * if we poped out of the while because cpus_runnable has TAKEN + * set it means the prior owner stoled back the task. Time to + * rescan the ready queue (after clearing the TAKEN bit to + * complete the handshake). The other possibilities are: + * cpus_runnable = WANTED -1 ( was clear when we started) + * cpus_runnable = -1 (was his, but the other cpu finished, + * seting -1) + */ + if ((volatile int)next->cpus_runnable == + TAKEN + cur_cpus_runnable){ + atomic_add(-TAKEN,(atomic_t *)&next->cpus_runnable); + spin_lock_irq(&runqueue_lock); + goto _smp_label_a; + } + } + /* + * Gosh wasn't that fun! + */ + task_set_cpu(next,this_cpu); +#endif /* CONFIG_SMP */ + + /* + * An interesting problem here. Since we turned on interrupts, + * we could now have a need schedule flag set in prev. Actually + * this can only happen on interrupt and then only be meaningful + * if it is done by a wakeup() call to reschedule_idle(). This + * is covered as that code will set the need_resched flag in the + * task found by cpu_curr() which comes from the cpu structs + * which we have already updated. + + * The remaining problems come from left over timeouts against + * prev, but he was the target and he is gone now... unless + * we did not really switch. So in the switch path we will + * clear the need_resched flag, not in the no switch path. + */ + + kstat.context_swtch++; + /* + * there are 3 processes which are affected by a context switch: + * + * prev == .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). + * This might sound slightly confusing but makes tons of sense. + */ + prepare_to_switch(); + { + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + if (!mm) { + if (next->active_mm) BUG(); + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, this_cpu); + } else { + if (next->active_mm != mm) BUG(); + switch_mm(oldmm, mm, next, this_cpu); + } + + if (!prev->mm) { + prev->active_mm = NULL; + mmdrop(oldmm); + } + } + + /* + * This just switches the register state and the + * stack. + */ + switch_to(prev, next, prev); + __schedule_tail(prev); + prev->need_resched = 0; + +same_process: + reacquire_kernel_lock(current); + preempt_enable_no_resched(); + if ( ! current->need_resched) + return; + + /* The task managed to get its need_resched flag set already! + */ + goto try_try_again; + + + move_rr_last: + prev->counter = NICE_TO_TICKS(prev->nice); + + move_yield_last: + if (prev->effprio) /* non-real time tasks get cleared later */ + prev->policy &= ~SCHED_YIELD; + add_last_runqueue(prev); + goto move_rr_back; + +} +static inline struct task_struct *find_process_by_pid(pid_t pid); + +static int setscheduler(pid_t pid, int policy, + struct sched_param *param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + retval = -EINVAL; + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. 
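+ * "Safe" here means respecting the lock order declared in
+ * sched.c: tasklist_lock is the outer lock and runqueue_lock the
+ * inner one, so tasklist_lock is taken first below and
+ * runqueue_lock is dropped first on every exit path.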
+ */ + read_lock_irq(&tasklist_lock); + spin_lock(&runqueue_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; + + if (policy < 0) + policy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_OTHER) + goto out_unlock; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are 1..MAX_PRI, valid + * priority for SCHED_OTHER is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_PRI) + goto out_unlock; + if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = 0; + p->policy = policy; + if ( policy == SCHED_FIFO) { + p->counter = -100; /* we don't count down neg couters */ + }else{ + p->counter = NICE_TO_TICKS(p->nice); + } + + p->rt_priority = lp.sched_priority; + + spin_unlock_irq(&runqueue_lock); + set_newprio(p,lp.sched_priority); + goto out_readunlock; + +out_unlock: + spin_unlock_irq(&runqueue_lock); + out_readunlock: + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} +asmlinkage long sys_sched_yield(void) +{ + /* + * Trick. sched_yield() first checks to see if it will be REALLY + * lonly in the ready queue. It just returns if it is the only + * game in town. The multilple ready queues really help here. + * (This test does not have + * to be atomic.) In threaded applications this optimization + * gets triggered quite often. + */ + if ( ! list_empty(Rdy_Q_Hed(current->effprio))){ + /* + * I think this is safe as only the current task can + * here and only the current task will be clearing this bit + */ + current->policy |= SCHED_YIELD; + schedule(); + } + return 0; +} +/* Seems to be the first place we hear about a given cpu as it comes up. + * A new (including the first) cpu is reporting for duty. Since he is + * already running we must patch him into the processor queue. + * We get here the first time the processor enters the idle code and also + * one more time for the boot cpu so... be careful to not redo what is + * already done. Also note that the fork that created the task put it + * in the ready queue, so we need to take it out, except the initial cpus + * task was not created by a fork. No matter, the removal code works even + * then. + * We give the idle task prioity -1 to keep it out of the way of tasks + * that have real work to do. + */ +extern unsigned long wait_init_idle; + +void __init init_idle(void) +{ + struct schedule_data * sched_data; + int cpu=smp_processor_id(); + sched_data = &aligned_data[cpu].schedule_data; + + if (task_on_rq(current)) { + del_from_runqueue(current); + } + sched_data->curr = current; + sched_data->last_schedule = get_cycles(); + current->effprio = current->rt_priority = 0; + sched_data->effprio = -1; /* idle flag */ + sched_data->cpu = cpu; + clear_bit(current->processor, &wait_init_idle); +#ifdef CONFIG_SMP + if ( ! sched_data->schedule_data_list.next ) { + list_add_tail(&sched_data->schedule_data_list,&hed_cpu_prio); + } +#endif +} + +extern void init_timervecs (void); + +void __init sched_init(void) +{ + /* + * We have to do a little magic to get the first + * process right in SMP mode. 
+ */ + int cpu = smp_processor_id(); + int nr; + int i; + + init_task.processor = cpu; + /* Init the ready queue */ + for (i=0;i<=MAX_PRI ;i++){ + INIT_LIST_HEAD(Rdy_Q_Hed(i)); + } + + + for(nr = 0; nr < PIDHASH_SZ; nr++) + pidhash[nr] = NULL; + printk("rtsched version " VERSION_DATE "\n"); + + init_timervecs(); + + init_bh(TIMER_BH, timer_bh); + init_bh(TQUEUE_BH, tqueue_bh); + init_bh(IMMEDIATE_BH, immediate_bh); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current, cpu); +} diff -urN linux-2.4.17-rc1-virgin/kernel/sched.c linux-2.4.17-rc1-wli3/kernel/sched.c --- linux-2.4.17-rc1-virgin/kernel/sched.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/sched.c Fri Dec 14 04:38:23 2001 @@ -92,6 +92,10 @@ spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ +#ifdef CONFIG_RTSCHED +extern struct task_struct *child_reaper; +#include "rtsched.h" +#else static LIST_HEAD(runqueue_head); /* @@ -373,6 +377,7 @@ { return try_to_wake_up(p, 0); } +#endif /* ifdef CONFIG_RTSCHED */ static void process_timeout(unsigned long __data) { @@ -458,7 +463,7 @@ out: return timeout < 0 ? 0 : timeout; } - +#ifndef CONFIG_RTSCHED /* * schedule_tail() is getting called from the fork return path. This * cleans up all remaining scheduler things, without impacting the @@ -491,7 +496,7 @@ task_lock(prev); task_release_cpu(prev); mb(); - if (prev->state == TASK_RUNNING) + if (task_on_runqueue(prev)) goto needs_resched; out_unlock: @@ -521,7 +526,7 @@ goto out_unlock; spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) + if (task_on_runqueue(prev) && !task_has_cpu(prev)) reschedule_idle(prev); spin_unlock_irqrestore(&runqueue_lock, flags); goto out_unlock; @@ -534,6 +539,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) { __schedule_tail(prev); + preempt_enable(); } /* @@ -556,6 +562,8 @@ spin_lock_prefetch(&runqueue_lock); + preempt_disable(); + if (!current->active_mm) BUG(); need_resched_back: prev = current; @@ -583,6 +591,9 @@ move_last_runqueue(prev); } +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() & PREEMPT_ACTIVE) goto treat_like_run; +#endif switch (prev->state) { case TASK_INTERRUPTIBLE: if (signal_pending(prev)) { @@ -593,6 +604,9 @@ del_from_runqueue(prev); case TASK_RUNNING:; } +#ifdef CONFIG_PREEMPT + treat_like_run: +#endif prev->need_resched = 0; /* @@ -701,8 +715,10 @@ reacquire_kernel_lock(current); if (current->need_resched) goto need_resched_back; + preempt_enable_no_resched(); return; } +#endif /* ifndef CONFIG_RTSCHED */ /* * The core wakeup function. 
Non-exclusive wakeups (nr_exclusive == 0) just wake everything @@ -897,7 +913,7 @@ tsk = find_task_by_pid(pid); return tsk; } - +#ifndef CONFIG_RTSCHED static int setscheduler(pid_t pid, int policy, struct sched_param *param) { @@ -967,6 +983,7 @@ out_nounlock: return retval; } +#endif /* ifndef CONFIG_RTSCHED */ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) @@ -979,6 +996,34 @@ return setscheduler(pid, -1, param); } +#ifdef CONFIG_PREEMPT + +#ifdef CONFIG_SMP +#define lock_to_this_cpu() \ + unsigned long old_cpus_allowed = current->cpus_allowed; \ + current->cpus_allowed = 1UL << smp_processor_id() +#define restore_cpus_allowed() current->cpus_allowed = old_cpus_allowed +#else +#define lock_to_this_cpu() +#define restore_cpus_allowed() +#endif /* !CONFIG_SMP */ + +asmlinkage void preempt_schedule(void) +{ + while (current->need_resched) { + /* it would be ideal not to lock tasks to their cpu here, + * but only around the data that needs such locking */ + lock_to_this_cpu(); + current->preempt_count += PREEMPT_ACTIVE + 1; + barrier(); + schedule(); + current->preempt_count -= PREEMPT_ACTIVE + 1; + barrier(); + restore_cpus_allowed(); + } +} +#endif /* CONFIG_PREEMPT */ + asmlinkage long sys_sched_getscheduler(pid_t pid) { struct task_struct *p; @@ -1030,6 +1075,7 @@ return retval; } +#ifndef CONFIG_RTSCHED asmlinkage long sys_sched_yield(void) { /* @@ -1070,7 +1116,7 @@ } return 0; } - +#endif /* ifndef CONFIG_RTSCHED */ asmlinkage long sys_sched_get_priority_max(int policy) { int ret = -EINVAL; @@ -1078,7 +1124,7 @@ switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = 99; + ret = MAX_PRI; break; case SCHED_OTHER: ret = 0; @@ -1297,6 +1343,7 @@ atomic_inc(¤t->files->count); } +#ifndef CONFIG_RTSCHED extern unsigned long wait_init_idle; void __init init_idle(void) @@ -1342,3 +1389,4 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } +#endif /* ifndef CONFIG_RTSCHED */ diff -urN linux-2.4.17-rc1-virgin/kernel/sysctl.c linux-2.4.17-rc1-wli3/kernel/sysctl.c --- linux-2.4.17-rc1-virgin/kernel/sysctl.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/sysctl.c Fri Dec 14 02:44:20 2001 @@ -260,6 +260,8 @@ }; static ctl_table vm_table[] = { + {VM_FREEPG, "freepages", + &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, @@ -271,6 +273,8 @@ &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAX_MAP_COUNT, "max_map_count", + &max_map_count, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MIN_READAHEAD, "min-readahead", &vm_min_readahead,sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MAX_READAHEAD, "max-readahead", diff -urN linux-2.4.17-rc1-virgin/kernel/timer.c linux-2.4.17-rc1-wli3/kernel/timer.c --- linux-2.4.17-rc1-virgin/kernel/timer.c Mon Oct 8 10:41:41 2001 +++ linux-2.4.17-rc1-wli3/kernel/timer.c Fri Dec 14 04:38:23 2001 @@ -583,7 +583,15 @@ update_one_process(p, user_tick, system, cpu); if (p->pid) { +#ifdef CONFIG_RTSCHED + /* SCHED_FIFO and the idle(s) have counters set to -100, + * so we won't count them, seems like a good idea for + * both schedulers, but, being pure... 
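+ * With the counter at -100 the new test short-circuits on
+ * p->counter >= 0, so a SCHED_FIFO task (setscheduler() sets its
+ * counter to -100) or an idle task is never decremented here and
+ * never has need_resched set by this path.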
+ */ + if (p->counter >= 0 && --p->counter <= 0) { +#else if (--p->counter <= 0) { +#endif p->counter = 0; p->need_resched = 1; } diff -urN linux-2.4.17-rc1-virgin/kernel/user.c linux-2.4.17-rc1-wli3/kernel/user.c --- linux-2.4.17-rc1-virgin/kernel/user.c Tue Nov 28 22:43:39 2000 +++ linux-2.4.17-rc1-wli3/kernel/user.c Sun Dec 16 23:52:26 2001 @@ -19,7 +19,14 @@ #define UIDHASH_BITS 8 #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) ^ uid) & UIDHASH_MASK) + +/* + * hash function borrowed from Chuck Lever's paper + * The effects of this replacement have not been measured. + * -- wli + */ +#define __uidhashfn(uid) \ + (((2654435761UL*(uid)) >> (BITS_PER_LONG-UIDHASH_BITS)) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn(uid)) static kmem_cache_t *uid_cachep; diff -urN linux-2.4.17-rc1-virgin/lib/dec_and_lock.c linux-2.4.17-rc1-wli3/lib/dec_and_lock.c --- linux-2.4.17-rc1-virgin/lib/dec_and_lock.c Wed Oct 3 09:11:26 2001 +++ linux-2.4.17-rc1-wli3/lib/dec_and_lock.c Fri Dec 14 02:44:44 2001 @@ -1,5 +1,6 @@ #include #include +#include #include /* diff -urN linux-2.4.17-rc1-virgin/mm/Makefile linux-2.4.17-rc1-wli3/mm/Makefile --- linux-2.4.17-rc1-virgin/mm/Makefile Wed Oct 24 15:21:18 2001 +++ linux-2.4.17-rc1-wli3/mm/Makefile Fri Dec 14 02:44:20 2001 @@ -14,7 +14,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -urN linux-2.4.17-rc1-virgin/mm/TODO linux-2.4.17-rc1-wli3/mm/TODO --- linux-2.4.17-rc1-virgin/mm/TODO Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/TODO Fri Dec 14 02:44:20 2001 @@ -0,0 +1,31 @@ + VM TODO list + +Forever valid TODO entries: + - keep up with the official kernel + - port over bugfixes + - minimise the diff by keeping code in sync, where possible + +Easy short-term features: + - reclaim swap space from refill_inactive() + - simplify SMP locking + - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with + one single function using a for_each_pte() macro + for_each_pte(ptep, mm, start_address, end_address) + - stronger drop behind / unused object dropping, all the way + to the far end of the inactive list + - per-zone active/inactive list (wli) + - fix page_launder() to not eat horrible amounts of CPU or flush + all pages to disk at once + - better VM balancing, clean vs. dirty ratio + +Long-term features: + - extensive VM statistics + - IO clustering for page_launder() and sync_old_buffers() + - readahead on per-VMA level (+ drop behind?) 
+ - more graceful degradation when the load gets high + - reducing readahead + - unfair pageout so not all apps fall over + - memory objects, using pagecache and tmpfs for storage so + the memory object itself doesn't introduce any new overhead + - using the memory objects, removing page table copying from fork() + - load control able to deal with really extreme loads, swapping diff -urN linux-2.4.17-rc1-virgin/mm/bootmem.c linux-2.4.17-rc1-wli3/mm/bootmem.c --- linux-2.4.17-rc1-virgin/mm/bootmem.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/bootmem.c Fri Dec 14 03:21:15 2001 @@ -3,8 +3,9 @@ * * Copyright (C) 1999 Ingo Molnar * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Segment tree memory reservation system, William Irwin, IBM, Oct 2001 * - * simple boot-time physical memory area allocator and + * Simple boot-time physical memory area allocator and * free memory collector. It's used to deal with reserved * system memory and memory holes as well. */ @@ -17,40 +18,192 @@ #include #include #include -#include +#include /* - * Access to this subsystem has to be serialized externally. (this is - * true for the boot process anyway) + * Design notes: + * + * This design was arrived at by considering four principal concerns, + * beyond properly representing discontiguous memory machines: + * + * (1) Machines on which the physical address space is highly fragmented. + * (2) Machines where nodes' memory fragments may be interleaved. + * (3) Machines whose physical address space layouts are irregular. + * (4) Machines requiring heavy boot-time memory reservation activity. + * + * These design concerns led to an implementation which represented + * available physical memory explicitly in terms of intervals to save + * space and also one utilizing an efficient search structure. These + * design concerns may not be universally important; however, small + * benefits should be seen even on low-memory machines, or machines + * without significant boot-time memory reservation activity. + * + * Concern (3) is perhaps the principal concern. In this situation, + * there is very little prior knowledge of memory range to node + * mappings, so perhaps a large portion of the work the bootmem + * allocator is intended to do must be done "up front" when bitmaps + * associated with memory ranges are used to represent availability + * information. While it is possible to use bitmaps for that purpose, + * it is my belief that the reduced space overhead of the segment + * trees and the obliviousness of their storage management with + * respect to the address ranges they represent is advantageous. + * + * In order to motivate how (2) is addressed, the notion of + * "residency" is useful. When a memory range is associated with + * a node, only a certain portion of it is actually available. + * the ratio of available memory to the size of the memory range + * being tracked, sizeof(available memory)/sizeof(memory in map), + * is what I call the residency of the range. When the map of the + * available memory requires a contiguous range of memory that is + * a larger proportion of the range of memory being tracked than + * the residency of that range, then the algorithm can no longer + * properly function. + * So to address that, a representation has been chosen which does + * not grow with the size of the range of memory being represented. + * The residency requirements of the bitmap-based representation + * are 1/(8*sizeof(page)) on byte addressed machines. 
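+ * (With 4 KiB pages that is one contiguous byte of bitmap for
+ * every 32 KiB of memory tracked, i.e. 32 KiB of map to describe
+ * a 1 GiB range.)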
But the range + * set representation has no specific residency requirements. + * Segment pools need not be drawn from a contiguous range of memory + * larger than the combined size of a header for tracking all the + * segment pools and the size of a single range structure. Dynamic + * addition of segment pools is not implemented here yet. + */ + +/* + * Access to this subsystem has to be serialized externally. (This is + * true for the boot process anyway.) + */ + +/* + * Alignment has to be a power of 2 value. + * These macros abstract out common address calculations for alignments. + */ +#define RND_DN(x,n) ((x) & ~((n)-1)) +#define RND_UP(x,n) RND_DN((x) + (n) - 1, n) +#define DIV_DN(x,n) ((x) / (n)) +#define DIV_UP(x,n) DIV_DN((x) + ((n) - 1), n) + +/* + * The highest and lowest page frame numbers on the system. + * These refer to physical addresses backed by memory regardless + * of runtime availability. */ unsigned long max_low_pfn; unsigned long min_low_pfn; -/* return the number of _pages_ that will be allocated for the boot bitmap */ -unsigned long __init bootmem_bootmap_pages (unsigned long pages) +/* + * This is a poor choice of random seeds for deterministic + * behavior during debugging. Oddly enough it does not seem + * to damage the structure of the trees. + */ +static unsigned long __initdata random_seed = 1UL; + +/* + * Park-Miller random number generator, using Schrage's + * technique for overflow handling. + */ +static unsigned long __init rand(void) { - unsigned long mapsize; + unsigned long a = 16807; + unsigned long q = 12773; + unsigned long r = 2386; + unsigned long k; + + k = random_seed / q; + random_seed = a*(random_seed - k*q) - r*k; + return random_seed; +} - mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; +/* + * Initialize the segment pool, which occupies node_bootmem_map. + * This is the memory from which the tree nodes tracking available + * memory are allocated. + */ +static void __init segment_pool_init(bootmem_data_t *bdata) +{ + unsigned k; + segment_buf_t *segment_pool = (segment_buf_t *)bdata->node_bootmem_map; - return mapsize; + for(k = 0; k < NR_SEGMENTS - 1; ++k) + segment_pool[k].next = &segment_pool[k+1]; + segment_pool[NR_SEGMENTS-1].next = NULL; + bdata->free_segments = segment_pool; +} + +/* + * Allocates a tree node from a node's segment pool, initializing the + * whole of the memory block to zeroes. + */ +static segment_tree_node_t * __init segment_alloc(bootmem_data_t *bdata) +{ + segment_tree_node_t *tmp = (segment_tree_node_t *)bdata->free_segments; + + if(!bdata->free_segments) + return NULL; + + bdata->free_segments = bdata->free_segments->next; + memset(tmp, 0, sizeof(segment_tree_node_t)); + return tmp; +} + +/* + * Convenience operation to insert a tree node into both + * of the segment trees associated with a node. The randomized + * priorities are used here. + */ +static void __init segment_insert(segment_tree_root_t *root, + segment_tree_node_t *node) +{ + node->start.priority = rand(); + node->length.priority = rand(); + treap_insert(&root->start_tree, &node->start); + treap_insert(&root->length_tree, &node->length); +} + +/* + * Returns a segment tree node to the node-local pool of available + * tree nodes. 
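+ * The pool is a simple LIFO free list threaded through the
+ * NR_SEGMENTS buffers that segment_pool_init() carved out of
+ * node_bootmem_map: segment_free() pushes onto the head and
+ * segment_alloc() pops from it, both in O(1).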
+ */ +static void __init segment_free(bootmem_data_t *bdata, + segment_tree_node_t *node) +{ + segment_buf_t *tmp; + + if(!node) + return; + + tmp = (segment_buf_t *)node; + tmp->next = bdata->free_segments; + bdata->free_segments = tmp; +} + +/* + * Return the number of _pages_ that will be allocated for the bootmem + * segment pool. Its sole purpose is to warn callers of the bootmem + * interface in advance of its size, so that a suitably large range of + * physical memory may be found to hold it. + */ +unsigned long __init bootmem_bootmap_pages (unsigned long pages) +{ + return DIV_UP(NR_SEGMENTS*sizeof(segment_buf_t),PAGE_SIZE); } /* * Called once to set up the allocator itself. + * Its responsibilities are manipulate the bootmem_data_t within + * a node, initializing its address range and node-local segment + * pool fields. It is supposed to calculate the amount of memory + * required for the node_bootmem_map, but this is not possible + * without a change of interface. */ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; - unsigned long mapsize = ((end - start)+7)/8; pgdat->node_next = pgdat_list; pgdat_list = pgdat; - mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; @@ -59,300 +212,701 @@ * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ - memset(bdata->node_bootmem_map, 0xff, mapsize); + bdata->segment_tree.start_tree = NULL; + bdata->segment_tree.length_tree = NULL; + segment_pool_init(bdata); - return mapsize; + return RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE); } /* - * Marks a particular physical memory range as unallocatable. Usable RAM - * might be used for boot-time allocations - or it might get added - * to the free page pool later on. + * reserve_bootmem_core marks a particular segment of physical + * memory as unavailable. Available memory might be used for boot-time + * allocations, or it might be made available again later on. + * + * Its behavior is to mark the specified range of physical memory + * as unavailable, irrespective of alignment constraints (in contrast + * to prior incarnations, which page-aligned the starting and ending + * addresses of the unavailable interval of memory). */ -static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +static void __init reserve_bootmem_core(bootmem_data_t *bdata, + unsigned long addr, unsigned long size) { - unsigned long i; + unsigned long start; + unsigned long end; + segment_tree_node_t split_segment, segment; + segment_tree_node_t reserved_left, reserved_right; + segment_tree_node_t *multiple_left, *multiple_right; + treap_node_t *tmp, *parent, *intersect; + /* - * round up, partially reserved pages are considered - * fully reserved. + * Round up, partially reserved pages are considered fully reserved. 
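+ * (The page rounding now happens later: free_all_bootmem_core()
+ * only hands back whole page frames lying entirely inside an
+ * available segment, so a partially reserved page stays reserved.
+ * Here the reservation itself is tracked byte for byte as the
+ * closed interval [addr, addr + size - 1], e.g. addr 0x1000 with
+ * size 0x2000 reserves [0x1000, 0x2fff].)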
*/ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; - unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + start = addr; + end = start + size - 1; - if (!size) BUG(); + segment_set_endpoints(&segment, start, end); - if (sidx < 0) - BUG(); - if (eidx < 0) - BUG(); - if (sidx >= eidx) - BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) - BUG(); - for (i = sidx; i < eidx; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + segment_all_intersect(&bdata->segment_tree.start_tree, + start, end, &intersect); + + /* + * If the set of intersecting intervals is empty, report + * the entire interval as multiply-reserved. Then the + * condition of the loop ensures a proper exit will follow. + */ + if(!intersect) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved (!intersect)\n", + segment_start(&segment), + segment_end(&segment)); + + /* + * For error-checking, this must be called only for a single + * node per reservation. The next step in strict error checking + * would be to track the fragments of the interval to reserve + * that do not lie within any available interval and then report + * them as multiply-reserved. + * + * Unfortunately, error checking that way appears to require + * unbounded allocations in order to maintain the set of multiply + * reserved intervals, so it is not entirely robust. + * + * For the moment, a cruder form of error checking is done: + * if the available interval does not contain the interval + * to be reserved, then the complement of the reserved + * interval with respect to the available interval is reported + * as multiply reserved. This may multiply report multiply + * reserved ranges, but it is still less verbose than the + * mechanism used in the bitmap-based allocator. + */ + + /* + * Destructive post-order traversal of the set of + * intersecting intervals. 
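+ * The pattern, used again further down, is: walk to the leftmost
+ * leaf, detach it from its parent with
+ * treap_find_parent_and_remove_child(), deal with that segment,
+ * then continue from the parent. Each node in the intersection
+ * set is handled exactly once and the set is consumed as it is
+ * walked; the nodes themselves are either re-inserted into the
+ * per-node trees or handed back to the segment pool below.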
+ */ + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while(tmp) { + segment_tree_node_t *fragment = &split_segment; + segment_tree_node_t *avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + multiple_left = &reserved_left; + multiple_right = &reserved_right; + + if(!segment_contains(avail, &segment)) { + segment_set_endpoints(multiple_left, + segment_start(&segment), + segment_end(&segment)); + segment_complement(&multiple_left, avail, + &multiple_right); + if(multiple_left) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (left)\n", + segment_start(multiple_left), + segment_end(multiple_left)); + if(multiple_right) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (right)\n", + segment_start(multiple_right), + segment_end(multiple_right)); + } + + if(!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); + + segment_complement(&avail, &segment, &fragment); + + if(!avail) + segment_free(bdata, start_segment_treap(tmp)); + else + segment_insert(&bdata->segment_tree, avail); + + if(fragment) { + + avail = segment_alloc(bdata); + + if(!avail) + BUG(); + + segment_set_endpoints(avail, segment_start(fragment), + segment_end(fragment)); + segment_insert(&bdata->segment_tree, avail); + } + + tmp = parent; + treap_find_leftmost_leaf(tmp); + } } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +/* + * free_bootmem_core marks a particular segment of the physical + * address space as available. Its semantics are to make the range + * of addresses available, irrespective of alignment constraints. + */ +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) { - unsigned long i; - unsigned long start; + unsigned long start, end; + segment_tree_node_t segment, *avail, intersection, freed; + treap_node_t *tmp, *parent, *intersect = NULL; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + segment_set_endpoints(&freed, start, end); + + segment_all_intersect(&bdata->segment_tree.start_tree, + start ? start - 1 : start, end + 1, &intersect); + /* - * round down end of usable mem, partially free pages are - * considered reserved. + * Error checking here is simple: + * If the available segment and the segment being freed truly + * intersect, their intersection should be reported as multiply + * made available. */ - unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; - - if (!size) BUG(); - if (end > bdata->node_low_pfn) - BUG(); /* - * Round up the beginning of the address. + * Destructive post-order traversal of the set of intervals + * intersecting with the freed interval expanded by one. This + * provides for merging of available intervals, as all the + * adjacent intervals are united with newly available interval. 
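+ * For example, freeing [0x2000, 0x2fff] while [0x1000, 0x1fff]
+ * and [0x3000, 0x3fff] are already available makes the search
+ * interval [0x1fff, 0x3000] pick up both neighbours, so all
+ * three segments are replaced by the single available segment
+ * [0x1000, 0x3fff].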
*/ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while(tmp) { + + avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + if(segment_intersect(&freed, avail)) { + segment_intersection(&intersection, &freed, avail); + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply made available\n", + segment_start(&intersection), + segment_end(&intersection)); + } - for (i = sidx; i < eidx; i++) { - if (!test_and_clear_bit(i, bdata->node_bootmem_map)) - BUG(); + segment_unite(&segment, avail); + + if(!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); + + segment_free(bdata, avail); + + tmp = parent; + treap_find_leftmost_leaf(tmp); } + + avail = segment_alloc(bdata); + if(!avail) + BUG(); + + segment_set_endpoints(avail, segment_start(&segment), + segment_end(&segment)); + + segment_insert(&bdata->segment_tree, avail); } /* - * We 'merge' subsequent allocations to save space. We might 'lose' - * some fraction of a page if allocations cannot be satisfied due to - * size constraints on boxes where there is physical RAM space - * fragmentation - in these cases * (mostly large memory boxes) this - * is not a problem. + * The terms are borrowed from linear programming. + * A feasible line segment is one which contains a subinterval + * aligned on the appropriate boundary of sufficient length. + * + * The objective function is the magnitude of the least residue + * of the smallest aligned address within the subinterval minus the goal + * mod the largest page frame number. A conditional is used instead of + * of remainder so as to avoid the overhead of division. * - * On low memory boxes we get it right in 100% of the cases. + * The idea here is to iterate over the feasible set and minimize + * the objective function (by exhaustive search). The search space + * is "thinned" prior to the iteration by using the heuristic that + * the interval must be at least of the length requested, though + * that is not sufficient because of alignment constraints. */ +#define FEASIBLE(seg, len, align) \ +( \ + (segment_end(seg) >= RND_UP(segment_start(seg), align)) \ + && \ + ((segment_end(seg) - RND_UP(segment_start(seg), align)) > (len))\ +) + +#define STARTS_BELOW(seg,goal,align,len) \ + (RND_UP(segment_start(seg), align) <= (goal)) + +#define ENDS_ABOVE(seg, goal, align, len) \ + ((segment_end(seg) > (goal)) && ((segment_end(seg) - (goal)) > (len))) + +#define GOAL_WITHIN(seg,goal,align,len) \ + (STARTS_BELOW(seg,goal,align,len) && ENDS_ABOVE(seg,goal,align,len)) + +#define GOAL_ABOVE(seg, goal, align) \ + ((goal) > segment_end(seg)) + +#define DISTANCE_BELOW(seg, goal, align) \ + (segment_start(seg) - (goal)) + +#define DISTANCE_ABOVE(seg, goal, align) \ + (((ULONG_MAX - (goal)) + 1) + segment_start(seg)) + +#define OBJECTIVE(seg, goal, align, len) \ +( GOAL_WITHIN(seg,goal,align,len) \ + ? 0UL \ + : ( \ + GOAL_ABOVE(seg, goal, align) \ + ? DISTANCE_ABOVE(seg, goal, align) \ + : DISTANCE_BELOW(seg, goal, align) \ + ) \ +) + +#define UNVISITED 0 +#define LEFT_SEARCHED 1 +#define RIGHT_SEARCHED 2 +#define VISITED 3 + /* - * alignment has to be a power of 2 value. 
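+ * A concrete reading of the OBJECTIVE macro above: it measures
+ * the unsigned wraparound distance from the goal up to
+ * segment_start(seg), and is 0 when the goal itself lies inside
+ * the segment with room for the request after it. E.g. with a
+ * goal of 16MB, a feasible segment starting at 20MB scores 4MB,
+ * while one ending below 16MB scores nearly the whole address
+ * space, so segments at or above the goal normally win and,
+ * among those, the lowest start wins.
+ *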
+ * __alloc_bootmem_core attempts to satisfy reservation requests + * of a certain size with alignment constraints, so that the beginning + * of the allocated line segment is as near as possible to the goal + * in the following sense: + * + * The beginning of the allocated line segment is either the lowest + * possible address above the goal, or the lowest possible address + * overall. This actually has a simple notion of distance, namely + * (goal - start) % (MAX_ADDR + 1). The OBJECTIVE macros measures + * this distance, albeit with some arithmetic complications. + * + * The algorithm proceeds as follows: + * (1) Divide the set of available intervals into those which are + * long enough and those which are not long enough, ignoring + * alignment constraints. + * (2) Perform depth-first search over the tree of supposedly + * long enough intervals for the best possible interval. + * + * The FEASIBLE macro is used to determine whether it is truly + * possible to place an aligned interval of sufficient length + * within the interval, and it is needed because the true length + * of the interval is not sufficient to determine that, and + * because it is not truly possible to subdivide the set of available + * intervals according to this criterion with pure tree operations. + * + * As address ranges are the granularity of available interval tracking, + * this should provide optimal merging behavior. */ + static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal) { - unsigned long i, start = 0; + unsigned long length; + segment_tree_node_t left_half, right_half, reserved, *left, *right; + segment_tree_node_t *optimum, *node; + treap_node_t *tmp, *infeasible, *feasible; void *ret; - unsigned long offset, remaining_size; - unsigned long areasize, preferred, incr; - unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >> - PAGE_SHIFT); - if (!size) BUG(); + feasible = infeasible = NULL; - if (align & (align-1)) + if(!align) + align = 1; + + length = size; + if(!length) BUG(); - offset = 0; - if (align && - (bdata->node_boot_start & (align - 1UL)) != 0) - offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; - - /* - * We try to allocate bootmem pages above 'goal' - * first, then we try to allocate lower pages. - */ - if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { - preferred = goal - bdata->node_boot_start; - } else - preferred = 0; - - preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; - preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - incr = align >> PAGE_SHIFT ? 
: 1; - -restart_scan: - for (i = preferred; i < eidx; i += incr) { - unsigned long j; - if (test_bit(i, bdata->node_bootmem_map)) + treap_split(&bdata->segment_tree.length_tree, length, &infeasible, + &feasible); + optimum = NULL; + + tmp = feasible; + while(tmp) { + + if(tmp->marker == UNVISITED) { + if(tmp->left) { + tmp->marker = LEFT_SEARCHED; + tmp = tmp->left; + continue; + } else if(tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if(tmp->marker == LEFT_SEARCHED) { + if(tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if(tmp->marker == RIGHT_SEARCHED) + tmp->marker = VISITED; + else if(tmp->marker == VISITED) { + tmp->marker = UNVISITED; + tmp = tmp->parent; continue; - for (j = i + 1; j < i + areasize; ++j) { - if (j >= eidx) - goto fail_block; - if (test_bit (j, bdata->node_bootmem_map)) - goto fail_block; - } - start = i; - goto found; - fail_block:; + } else + BUG(); + + if(!tmp) + break; + + node = length_segment_treap(tmp); + + if(!optimum && FEASIBLE(node, length, align)) + + optimum = node; + + else if(FEASIBLE(node, length, align) + && (OBJECTIVE(node, goal, align, length) + < OBJECTIVE(optimum, goal, align, length))) + + optimum = node; + } - if (preferred) { - preferred = offset; - goto restart_scan; + + /* + * Restore the set of available intervals keyed by length, + * taking into account the need to remove the optimum from + * the set if it has been determined. + */ + if(!optimum) { + treap_join(&bdata->segment_tree.length_tree, &feasible, + &infeasible); + return NULL; } - return NULL; -found: - if (start >= eidx) - BUG(); + + if(!treap_root_delete(treap_node_link(&optimum->start))) + treap_root_delete(&bdata->segment_tree.start_tree); + + if(!treap_root_delete(treap_node_link(&optimum->length))) + treap_root_delete(&feasible); + + treap_join(&bdata->segment_tree.length_tree, &infeasible, &feasible); /* - * Is the next page of the previous allocation-end the start - * of this allocation's buffer? If yes then we can 'merge' - * the previous partial page with this allocation. - */ - if (align <= PAGE_SIZE - && bdata->last_offset && bdata->last_pos+1 == start) { - offset = (bdata->last_offset+align-1) & ~(align-1); - if (offset > PAGE_SIZE) + * Now the iteration has converged to the optimal feasible interval. + * Within that interval we must now choose a subinterval + * satisfying the alignment constraints and do the appropriate + * splitting of the interval from which it was drawn. 
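+ * For example, reserving 0x1000 bytes at goal 0x180000 out of an
+ * optimum segment [0x100000, 0x1fffff] takes [0x180000, 0x180fff]
+ * and puts the two leftovers [0x100000, 0x17ffff] and
+ * [0x181000, 0x1fffff] back into the trees; if the goal cannot be
+ * honoured, the reservation slides to the lowest suitably aligned
+ * address in the segment instead.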
+ */ + + segment_set_endpoints(&reserved, goal, goal + length - 1); + + if(!segment_contains_point(optimum, goal) + || !segment_contains(optimum, &reserved)) + + segment_set_endpoints(&reserved, + RND_UP(segment_start(optimum), align), + RND_UP(segment_start(optimum),align)+length-1); + + segment_set_endpoints(&left_half, segment_start(optimum), + segment_end(optimum)); + + left = &left_half; + right = &right_half; + segment_complement(&left, &reserved, &right); + + if(!left && !right) + segment_free(bdata, optimum); + + if(left) { + segment_set_endpoints(optimum, segment_start(left), + segment_end(left)); + segment_insert(&bdata->segment_tree, optimum); + } + + if(right) { + segment_tree_node_t *segment = segment_alloc(bdata); + if(!segment) BUG(); - remaining_size = PAGE_SIZE-offset; - if (size < remaining_size) { - areasize = 0; - // last_pos unchanged - bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - } else { - remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - bdata->last_pos = start+areasize-1; - bdata->last_offset = remaining_size; - } - bdata->last_offset &= ~PAGE_MASK; - } else { - bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + segment_set_endpoints(segment, segment_start(right), + segment_end(right)); + segment_insert(&bdata->segment_tree, segment); } + /* - * Reserve the area now: + * Convert the physical address to a kernel virtual address, + * zero out the memory within the interval, and return it. */ - for (i = start; i < start+areasize; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - BUG(); + ret = (void *)(phys_to_virt(segment_start(&reserved))); memset(ret, 0, size); + return ret; } +/* + * free_all_bootmem_core's responsibilities are to initialize the + * node_mem_map array of struct page with the availability information + * regarding physical memory, and to make available the memory the + * bootmem allocator itself used for tracking available physical memory. + * Here the prior behavior with respect to page alignment is emulated + * by reducing the granularity of the address ranges to page frames, + * using the conservative approximation of the largest page-aligned + * interval lying within the interval seen to be available, or making + * no memory available if the interval is smaller than a page in length. 
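+ * For instance, with 4 KiB pages an available segment
+ * [0x1800, 0x57ff] is trimmed to page frames 2 through 4
+ * (bytes 0x2000 to 0x4fff): the partial pages at either end are
+ * quietly left reserved, and a segment shorter than one page
+ * contributes nothing.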
+ */ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { - struct page *page = pgdat->node_mem_map; - bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; + unsigned long total = 0UL, mapstart, start, end; + unsigned long node_start = pgdat->bdata->node_boot_start >> PAGE_SHIFT; + struct page *page; + treap_node_t *parent, *tmp; - if (!bdata->node_bootmem_map) BUG(); + mapstart = virt_to_phys(pgdat->bdata->node_bootmem_map); - count = 0; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); - for (i = 0; i < idx; i++, page++) { - if (!test_bit(i, bdata->node_bootmem_map)) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - } - total += count; +#ifdef DEBUG_BOOTMEM + + printk("Available physical memory:\n"); + +#endif /* DEBUG_BOOTMEM */ + + free_bootmem_core(pgdat->bdata, mapstart, + RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE)); /* - * Now free the allocator bitmap itself, it's not - * needed anymore: + * Destructive post-order traversal of the length tree. + * The tree is never used again, so no attempt is made + * to restore it to working order. */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + tmp = pgdat->bdata->segment_tree.length_tree; + treap_find_leftmost_leaf(tmp); + while(tmp) { + segment_tree_node_t *segment = length_segment_treap(tmp); + + /* + * This calculation differs from that in prior + * incarnations in this subsystem, so I describe it + * in passing detail here. + * + ******************************************************* + * + * We have start so that start is the least pfn with + * + * PAGE_SIZE * start >= segment_start(segment) + * + * so after division and ceiling: + * + * start = DIV_UP(segment_start(segment), PAGE_SIZE) + * + ******************************************************* + * + * Now the last pfn is the greatest pfn such that + * + * PAGE_SIZE * last + PAGE_SIZE - 1 <= segment_end(segment) + * + * -or- + * + * PAGE_SIZE * (last + 1) <= segment_end(segment) + 1 + * + * giving us after division and flooring: + * + * last + 1 = DIV_DN(segment_end(segment) + 1, PAGE_SIZE) + * + * or using end as a -strict- upper bound (i.e. end > pfn), + * we have + * + * end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE) + * + */ + + start = DIV_UP(segment_start(segment), PAGE_SIZE); + end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE); + +#ifdef DEBUG_BOOTMEM + + if(start < end) + printk("available segment: [%lu,%lu]\n", + start * PAGE_SIZE, + end * PAGE_SIZE - 1); + +#endif /* DEBUG_BOOTMEM */ + + for( page = pgdat->node_mem_map + (start - node_start); + page < pgdat->node_mem_map + (end - node_start); + ++page) { + + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + + /* + * In most calculations in this file, closed intervals + * are considered. In this instance, a half-open interval + * is being considered, and so the usual end - start + 1 + * calculation does not apply. 
+ */ + if(start < end) + total += end - start; + + treap_find_parent_and_remove_child(tmp, parent); + tmp = parent; + treap_find_leftmost_leaf(tmp); } - total += count; - bdata->node_bootmem_map = NULL; return total; } -unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +/* + * Wrappers around the core routines so that they operate on the + * per-node memory structures (pg_data_t *pgdat). + */ +unsigned long __init init_bootmem_node (pg_data_t *pgdat, + unsigned long freepfn, + unsigned long startpfn, + unsigned long endpfn) { - return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); + return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { reserve_bootmem_core(pgdat->bdata, physaddr, size); } -void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if(ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { - return(free_bootmem_core(pgdat->bdata, physaddr, size)); + free_bootmem_core(pgdat->bdata, physaddr, size); } unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) { - return(free_all_bootmem_core(pgdat)); + return free_all_bootmem_core(pgdat); } +/* + * Non-node-aware wrappers for the core routines. The per-node + * structures are hidden by using the global variable contig_page_data. + */ unsigned long __init init_bootmem (unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return(init_bootmem_core(&contig_page_data, start, 0, pages)); + return init_bootmem_core(&contig_page_data, start, 0, pages); } -void __init reserve_bootmem (unsigned long addr, unsigned long size) +/* + * In multinode configurations it is not desirable to make memory + * available without information about the node assignment of the + * memory range, so even though reserve_bootmem() may operate + * without node information this cannot. + * + * This apparent inconsistency in the interface actually makes + * some sense, as when presented with irregular node to memory range + * assignments in firmware tables, the original request to make memory + * available will be aware of its node assignment. But an outstanding + * issue is that a non-node-aware memory reservation request (via + * alloc_bootmem()) will not know to which node to return the memory. + * + * Resolving that issue would involve tracking dynamic allocations + * separately from assertions regarding the presence of physical + * memory, which is feasible given a change of interface, or perhaps a + * separate tree in each node for memory reserved by dynamic allocations. + */ +void __init free_bootmem (unsigned long addr, unsigned long size) { - reserve_bootmem_core(contig_page_data.bdata, addr, size); + free_bootmem_core(contig_page_data.bdata, addr, size); } -void __init free_bootmem (unsigned long addr, unsigned long size) +/* + * reserve_bootmem operates without node information, yet is node + * aware. 
In situations where it may not be clear to where a given + * physical memory range is assigned this performs the task of + * searching the nodes on behalf of the caller. + */ +void __init reserve_bootmem (unsigned long addr, unsigned long size) { - return(free_bootmem_core(contig_page_data.bdata, addr, size)); + unsigned long start, end; + unsigned in_any_node = 0; + segment_tree_node_t segment, *tree; + pg_data_t *pgdat = pgdat_list; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + + /* + * For error checking, this must determine the node(s) within + * which an interval to be reserved lies. Otherwise, once the + * error checking is in place, the memory will be reported as + * multiply-reserved on those nodes not containing the memory. + */ + while(pgdat) { + unsigned in_node; + + tree = start_segment_treap(pgdat->bdata->segment_tree.start_tree); + in_node = segment_tree_intersects(tree, &segment); + in_any_node |= in_node; + + if(in_node) + reserve_bootmem_node(pgdat, addr, size); + + pgdat = pgdat->node_next; + } + if(!in_any_node) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved\n", + segment_start(&segment), + segment_end(&segment)); } +/* + * free_all_bootmem is now a convenience function, and iterates over + * all the nodes, performing free_all_bootmem_core. + */ unsigned long __init free_all_bootmem (void) { - return(free_all_bootmem_core(&contig_page_data)); + pg_data_t *pgdat = pgdat_list; + unsigned long total = 0UL; + + while(pgdat) { + total += free_all_bootmem_core(pgdat); + pgdat = pgdat->node_next; + } + + return total; } -void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +/* + * __alloc_bootmem performs a search over all nodes in order to satisfy + * an allocation request, for when it is unimportant from which node + * the memory used to satisfy an allocation is drawn. + */ +void * __init __alloc_bootmem (unsigned long size, unsigned long align, + unsigned long goal) { pg_data_t *pgdat = pgdat_list; void *ptr; while (pgdat) { - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal))) - return(ptr); - pgdat = pgdat->node_next; - } - /* - * Whoops, we cannot satisfy the allocation request. - */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); -void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) -{ - void *ptr; + if(ptr) + return ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); - if (ptr) - return (ptr); + pgdat = pgdat->node_next; + } - /* - * Whoops, we cannot satisfy the allocation request. - */ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } - diff -urN linux-2.4.17-rc1-virgin/mm/filemap.c linux-2.4.17-rc1-wli3/mm/filemap.c --- linux-2.4.17-rc1-virgin/mm/filemap.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/filemap.c Sun Dec 16 17:58:10 2001 @@ -53,7 +53,7 @@ EXPORT_SYMBOL(vm_min_readahead); -spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock * with the pagecache_lock held. 
@@ -63,7 +63,7 @@ * pagemap_lru_lock -> * pagecache_lock */ -spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) @@ -234,7 +234,7 @@ static void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) lru_cache_del(page); /* @@ -296,6 +296,7 @@ page_cache_release(page); + /* we hit this with lock depth of 1 or 2 */ if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -406,6 +407,8 @@ } page_cache_release(page); + + debug_lock_break(551); if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -454,6 +457,11 @@ return page; } +static struct page * __find_page(struct address_space * mapping, unsigned long index) +{ + return __find_page_nolock(mapping, index, *page_hash(mapping,index)); +} + /* * By the time this is called, the page is locked and * we don't have to worry about any races any more. @@ -594,12 +602,16 @@ list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - page_cache_get(page); spin_unlock(&pagecache_lock); + /* BKL is held ... */ + debug_lock_break(1); + conditional_schedule(); + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -607,7 +619,7 @@ writepage(page); } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -623,14 +635,28 @@ */ void filemap_fdatawait(struct address_space * mapping) { + DEFINE_LOCK_COUNT(); + spin_lock(&pagecache_lock); +restart: while (!list_empty(&mapping->locked_pages)) { struct page *page = list_entry(mapping->locked_pages.next, struct page, list); list_del(&page->list); list_add(&page->list, &mapping->clean_pages); - + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(2); + if (conditional_schedule_needed()) { + page_cache_get(page); + break_spin_lock_and_resched(&pagecache_lock); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -894,6 +920,7 @@ * the hash-list needs a held write-lock. */ repeat: + break_spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -970,7 +997,53 @@ /* - * Same as grab_cache_page, but do not wait if the page is unavailable. + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. + * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. 
Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + while (--index >= start) { + page = __find_page(mapping, index); + if (!page || !PageActive(page)) + break; + deactivate_page_nolock(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +/* Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -1240,6 +1313,12 @@ if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1248,25 +1327,6 @@ return; } -/* - * Mark a page as having seen activity. - * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. - */ -void mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page)) { - activate_page(page); - ClearPageReferenced(page); - return; - } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); -} /* * This is a generic file read routine, and uses the @@ -1375,7 +1435,7 @@ * beginning or we just did an lseek. */ if (!offset || !filp->f_reada) - mark_page_accessed(page); + touch_page(page); /* * Ok, we have the page, and it's up-to-date, so @@ -1492,8 +1552,8 @@ ssize_t retval; int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; struct kiobuf * iobuf; - struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; - struct inode * inode = mapping->host; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; new_iobuf = 0; iobuf = filp->f_iobuf; @@ -1774,7 +1834,7 @@ nr = max; /* And limit it to a sane percentage of the inactive list.. */ - max = nr_inactive_pages / 2; + max = nr_inactive_clean_pages / 2; if (nr > max) nr = max; @@ -1919,7 +1979,7 @@ * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. 
*/ - mark_page_accessed(page); + touch_page(page); flush_page_to_ram(page); return page; @@ -2055,6 +2115,8 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2085,6 +2147,9 @@ address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2343,7 +2408,7 @@ int error = 0; /* This caps the number of vma's this process can own */ - if (vma->vm_mm->map_count > MAX_MAP_COUNT) + if (vma->vm_mm->map_count > max_map_count) return -ENOMEM; if (start == vma->vm_start) { @@ -2443,7 +2508,7 @@ if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, ZPR_PARTITION); return 0; } @@ -2773,7 +2838,7 @@ page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) goto out; - mark_page_accessed(page); + touch_page(page); if (Page_Uptodate(page)) goto out; @@ -2970,6 +3035,7 @@ unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, @@ -2978,8 +3044,10 @@ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -3023,8 +3091,11 @@ unlock: kunmap(page); /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + touch_page(page); page_cache_release(page); if (status < 0) diff -urN linux-2.4.17-rc1-virgin/mm/filemap.c~ linux-2.4.17-rc1-wli3/mm/filemap.c~ --- linux-2.4.17-rc1-virgin/mm/filemap.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/filemap.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,3144 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli + */ + +atomic_t page_cache_size = ATOMIC_INIT(0); +unsigned int page_hash_bits; +struct page **page_hash_table; + +int vm_max_readahead = 31; +int vm_min_readahead = 3; +EXPORT_SYMBOL(vm_max_readahead); +EXPORT_SYMBOL(vm_min_readahead); + + +spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +/* + * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * with the pagecache_lock held. 
+ * + * Ordering: + * swap_lock -> + * pagemap_lru_lock -> + * pagecache_lock + */ +spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +#define CLUSTER_PAGES (1 << page_cluster) +#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) + +static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p)); +static void add_page_to_hash_queue(struct page * page, struct page **p) +{ + struct page *next = *p; + + *p = page; + page->next_hash = next; + page->pprev_hash = p; + if (next) + next->pprev_hash = &page->next_hash; + if (page->buffers) + PAGE_BUG(page); + atomic_inc(&page_cache_size); +} + +static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) +{ + struct list_head *head = &mapping->clean_pages; + + mapping->nrpages++; + list_add(&page->list, head); + page->mapping = mapping; +} + +static inline void remove_page_from_inode_queue(struct page * page) +{ + struct address_space * mapping = page->mapping; + + mapping->nrpages--; + list_del(&page->list); + page->mapping = NULL; +} + +static inline void remove_page_from_hash_queue(struct page * page) +{ + struct page *next = page->next_hash; + struct page **pprev = page->pprev_hash; + + if (next) + next->pprev_hash = pprev; + *pprev = next; + page->pprev_hash = NULL; + atomic_dec(&page_cache_size); +} + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. + */ +void __remove_inode_page(struct page *page) +{ + if (PageDirty(page)) BUG(); + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); +} + +void remove_inode_page(struct page *page) +{ + if (!PageLocked(page)) + PAGE_BUG(page); + + spin_lock(&pagecache_lock); + __remove_inode_page(page); + spin_unlock(&pagecache_lock); +} + +static inline int sync_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + return mapping->a_ops->sync_page(page); + return 0; +} + +/* + * Add a page to the dirty page list. + */ +void set_page_dirty(struct page *page) +{ + if (!test_and_set_bit(PG_dirty, &page->flags)) { + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock(&pagecache_lock); + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + spin_unlock(&pagecache_lock); + + if (mapping->host) + mark_inode_dirty_pages(mapping->host); + } + } +} + +/** + * invalidate_inode_pages - Invalidate all the unlocked pages of one inode + * @inode: the inode which pages we want to invalidate + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + */ + +void invalidate_inode_pages(struct inode * inode) +{ + struct list_head *head, *curr; + struct page * page; + + head = &inode->i_mapping->clean_pages; + + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + curr = head->next; + + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + + /* We cannot invalidate something in dirty.. 
*/ + if (PageDirty(page)) + continue; + + /* ..or locked */ + if (TryLockPage(page)) + continue; + + if (page->buffers && !try_to_free_buffers(page, 0)) + goto unlock; + + if (page_count(page) != 1) + goto unlock; + + __lru_cache_del(page); + __remove_inode_page(page); + UnlockPage(page); + page_cache_release(page); + continue; +unlock: + UnlockPage(page); + continue; + } + + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +static int do_flushpage(struct page *page, unsigned long offset) +{ + int (*flushpage) (struct page *, unsigned long); + flushpage = page->mapping->a_ops->flushpage; + if (flushpage) + return (*flushpage)(page, offset); + return block_flushpage(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (page->buffers) + do_flushpage(page, partial); +} + +static void truncate_complete_page(struct page *page) +{ + /* Leave it on the LRU if it gets converted into anonymous buffers */ + if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) + lru_cache_del(page); + + /* + * We remove the page from the page cache _after_ we have + * destroyed all buffer-cache references to it. Otherwise some + * other process might think this inode page is not in the + * page cache and creates a buffer-cache alias to it causing + * all sorts of fun problems ... + */ + ClearPageDirty(page); + ClearPageUptodate(page); + remove_inode_page(page); + page_cache_release(page); +} + +static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); +static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + unsigned long offset; + + page = list_entry(curr, struct page, list); + offset = page->index; + + /* Is one of the pages to truncate? */ + if ((offset >= start) || (*partial && (offset + 1) == start)) { + int failed; + + page_cache_get(page); + failed = TryLockPage(page); + + list_del(head); + if (!failed) + /* Restart after this page */ + list_add_tail(head, curr); + else + /* Restart on this page */ + list_add(head, curr); + + spin_unlock(&pagecache_lock); + unlocked = 1; + + if (!failed) { + if (*partial && (offset + 1) == start) { + truncate_partial_page(page, *partial); + *partial = 0; + } else + truncate_complete_page(page); + + UnlockPage(page); + } else + wait_on_page(page); + + page_cache_release(page); + + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + curr = curr->prev; + } + return unlocked; +} + + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * + * Truncate the page cache at a set offset, removing the pages + * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. 
+ */ +void truncate_inode_pages(struct address_space * mapping, loff_t lstart) +{ + unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + } while (unlocked); + /* Traversed all three lists without dropping the lock */ + spin_unlock(&pagecache_lock); +} + +static inline int invalidate_this_page2(struct page * page, + struct list_head * curr, + struct list_head * head) +{ + int unlocked = 1; + + /* + * The page is locked and we hold the pagecache_lock as well + * so both page_count(page) and page->buffers stays constant here. + */ + if (page_count(page) == 1 + !!page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + truncate_complete_page(page); + } else { + if (page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + block_invalidate_page(page); + } else + unlocked = 0; + + ClearPageDirty(page); + ClearPageUptodate(page); + } + + return unlocked; +} + +static int FASTCALL(invalidate_list_pages2(struct list_head *)); +static int invalidate_list_pages2(struct list_head *head) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + page = list_entry(curr, struct page, list); + + if (!TryLockPage(page)) { + int __unlocked; + + __unlocked = invalidate_this_page2(page, curr, head); + UnlockPage(page); + unlocked |= __unlocked; + if (!__unlocked) { + curr = curr->prev; + continue; + } + } else { + /* Restart on this page */ + list_del(head); + list_add(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + unlocked = 1; + wait_on_page(page); + } + + page_cache_release(page); + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + return unlocked; +} + +/** + * invalidate_inode_pages2 - Clear all the dirty bits around if it can't + * free the pages because they're mapped. + * @mapping: the address_space which pages we want to invalidate + */ +void invalidate_inode_pages2(struct address_space * mapping) +{ + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = invalidate_list_pages2(&mapping->clean_pages); + unlocked |= invalidate_list_pages2(&mapping->dirty_pages); + unlocked |= invalidate_list_pages2(&mapping->locked_pages); + } while (unlocked); + spin_unlock(&pagecache_lock); +} + +static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) +{ + goto inside; + + for (;;) { + page = page->next_hash; +inside: + if (!page) + goto not_found; + if (page->mapping != mapping) + continue; + if (page->index == offset) + break; + } + +not_found: + return page; +} + +static struct page * __find_page(struct address_space * mapping, unsigned long index) +{ + return __find_page_nolock(mapping, index, *page_hash(mapping,index)); +} + +/* + * By the time this is called, the page is locked and + * we don't have to worry about any races any more. + * + * Start the IO.. 
+ */ +static int writeout_one_page(struct page *page) +{ + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) + continue; + + bh->b_flushtime = jiffies; + ll_rw_block(WRITE, 1, &bh); + } while ((bh = bh->b_this_page) != head); + return 0; +} + +int waitfor_one_page(struct page *page) +{ + int error = 0; + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + wait_on_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + error = -EIO; + } while ((bh = bh->b_this_page) != head); + return error; +} + +static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +{ + struct list_head *curr; + struct page *page; + int retval = 0; + + spin_lock(&pagecache_lock); + curr = head->next; + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + if (!page->buffers) + continue; + if (page->index >= end) + continue; + if (page->index < start) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + lock_page(page); + + /* The buffers could have been free'd while we waited for the page lock */ + if (page->buffers) + retval |= fn(page); + + UnlockPage(page); + spin_lock(&pagecache_lock); + curr = page->list.next; + page_cache_release(page); + } + spin_unlock(&pagecache_lock); + + return retval; +} + +/* + * Two-stage data sync: first start the IO, then go back and + * collect the information.. + */ +int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) +{ + int retval; + + /* writeout dirty buffers on pages from both clean and dirty lists */ + retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + + /* now wait for locked buffers on pages from both clean and dirty lists */ + retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + + return retval; +} + +/* + * In-memory filesystems have to fail their + * writepage function - and this has to be + * worked around in the VM layer.. + * + * We + * - mark the page dirty again (but do NOT + * add it back to the inode dirty list, as + * that would livelock in fdatasync) + * - activate the page so that the page stealer + * doesn't try to write it out over and over + * again. + */ +int fail_writepage(struct page *page) +{ + /* Only activate on memory-pressure, not fsync.. */ + if (PageLaunder(page)) { + activate_page(page); + SetPageReferenced(page); + } + + /* Set the page dirty again, unlock */ + SetPageDirty(page); + UnlockPage(page); + return 0; +} + +EXPORT_SYMBOL(fail_writepage); + +/** + * filemap_fdatasync - walk the list of dirty pages of the given address space + * and writepage() all of them. 
+ * + * @mapping: address space structure to write + * + */ +void filemap_fdatasync(struct address_space * mapping) +{ + int (*writepage)(struct page *) = mapping->a_ops->writepage; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->dirty_pages)) { + struct page *page = list_entry(mapping->dirty_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->locked_pages); + + if (!PageDirty(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + lock_page(page); + + if (PageDirty(page)) { + ClearPageDirty(page); + writepage(page); + } else + UnlockPage(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); +} + +/** + * filemap_fdatawait - walk the list of locked pages of the given address space + * and wait for all of them. + * + * @mapping: address space structure to wait for + * + */ +void filemap_fdatawait(struct address_space * mapping) +{ + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->locked_pages)) { + struct page *page = list_entry(mapping->locked_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->clean_pages); + + if (!PageLocked(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + ___wait_on_page(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); +} + +/* + * Add a page to the inode page cache. + * + * The caller must have locked the page and + * set all the page flags correctly.. + */ +void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) +{ + if (!PageLocked(page)) + BUG(); + + page->index = index; + page_cache_get(page); + spin_lock(&pagecache_lock); + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, page_hash(mapping, index)); + spin_unlock(&pagecache_lock); + + lru_cache_add(page); +} + +/* + * This adds a page to the page cache, starting out as locked, + * owned by us, but unreferenced, not uptodate and with no errors. + */ +static inline void __add_to_page_cache(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + unsigned long flags; + + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); + page->flags = flags | (1 << PG_locked); + page_cache_get(page); + page->index = offset; + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, hash); +} + +void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) +{ + spin_lock(&pagecache_lock); + __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); + spin_unlock(&pagecache_lock); + lru_cache_add(page); +} + +int add_to_page_cache_unique(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + int err; + struct page *alias; + + spin_lock(&pagecache_lock); + alias = __find_page_nolock(mapping, offset, *hash); + + err = 1; + if (!alias) { + __add_to_page_cache(page,mapping,offset,hash); + err = 0; + } + + spin_unlock(&pagecache_lock); + if (!err) + lru_cache_add(page); + return err; +} + +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. 
+ */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page **hash = page_hash(mapping, offset); + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + if (page) + return 0; + + page = page_cache_alloc(mapping); + if (!page) + return -ENOMEM; + + if (!add_to_page_cache_unique(page, mapping, offset, hash)) { + int error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first. + */ + page_cache_release(page); + return 0; +} + +/* + * Read in an entire cluster at once. A cluster is usually a 64k- + * aligned block that includes the page requested in "offset." + */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); +static int read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize) +{ + unsigned long pages = CLUSTER_PAGES; + + offset = CLUSTER_OFFSET(offset); + while ((pages-- > 0) && (offset < filesize)) { + int error = page_cache_read(file, offset); + if (error < 0) + return error; + offset ++; + } + + return 0; +} + +/* + * Wait for a page to get unlocked. + * + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. + */ +void ___wait_on_page(struct page *page) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&page->wait, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + schedule(); + } while (PageLocked(page)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + +void unlock_page(struct page *page) +{ + clear_bit(PG_launder, &(page)->flags); + smp_mb__before_clear_bit(); + if (!test_and_clear_bit(PG_locked, &(page)->flags)) + BUG(); + smp_mb__after_clear_bit(); + if (waitqueue_active(&(page)->wait)) + wake_up(&(page)->wait); +} + +/* + * Get a lock on the page, assuming we need to sleep + * to get it.. + */ +static void __lock_page(struct page *page) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue_exclusive(&page->wait, &wait); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + sync_page(page); + schedule(); + } + if (!TryLockPage(page)) + break; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + + +/* + * Get an exclusive lock on the page, optimistically + * assuming it's not locked.. + */ +void lock_page(struct page *page) +{ + if (TryLockPage(page)) + __lock_page(page); +} + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * __find_get_page(struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. 
+ */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) + page_cache_get(page); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + struct page **hash = page_hash(mapping, offset); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) { + if (TryLockPage(page)) + page = NULL; + } + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Must be called with the pagecache lock held, + * will return with it held (but it may be dropped + * during blocking operations.. + */ +static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *)); +static struct page * __find_lock_page_helper(struct address_space *mapping, + unsigned long offset, struct page *hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + page = __find_page_nolock(mapping, offset, hash); + if (page) { + page_cache_get(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + lock_page(page); + spin_lock(&pagecache_lock); + + /* Has the page been re-allocated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + UnlockPage(page); + page_cache_release(page); + goto repeat; + } + } + } + return page; +} + +/* + * Same as the above, but lock the page too, verifying that + * it's still valid once we own it. + */ +struct page * __find_lock_page (struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but create the page if required.. + */ +struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask) +{ + struct page *page; + struct page **hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page) { + struct page *newpage = alloc_page(gfp_mask); + page = ERR_PTR(-ENOMEM); + if (newpage) { + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + if (likely(!page)) { + page = newpage; + __add_to_page_cache(page, mapping, index, hash); + newpage = NULL; + } + spin_unlock(&pagecache_lock); + if (newpage == NULL) + lru_cache_add(page); + else + page_cache_release(newpage); + } + } + return page; +} + +/* + * Returns locked page at given index in given cache, creating it if needed. + */ +struct page *grab_cache_page(struct address_space *mapping, unsigned long index) +{ + return find_or_create_page(mapping, index, mapping->gfp_mask); +} + + +/* + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. 
+ * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + while (--index >= start) { + page = __find_page(mapping, index); + if (!page || !PageActive(page)) + break; + deactivate_page_nolock(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +/* Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + */ +struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page, **hash; + + hash = page_hash(mapping, index); + page = __find_get_page(mapping, index, hash); + + if ( page ) { + if ( !TryLockPage(page) ) { + /* Page found and locked */ + /* This test is overly paranoid, but what the heck... */ + if ( unlikely(page->mapping != mapping || page->index != index) ) { + /* Someone reallocated this page under us. */ + UnlockPage(page); + page_cache_release(page); + return NULL; + } else { + return page; + } + } else { + /* Page locked by someone else */ + page_cache_release(page); + return NULL; + } + } + + page = page_cache_alloc(mapping); + if ( unlikely(!page) ) + return NULL; /* Failed to allocate a page */ + + if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) { + /* Someone else grabbed the page already. */ + page_cache_release(page); + return NULL; + } + + return page; +} + +#if 0 +#define PROFILE_READAHEAD +#define DEBUG_READAHEAD +#endif + +/* + * Read-ahead profiling information + * -------------------------------- + * Every PROFILE_MAXREADCOUNT, the following information is written + * to the syslog: + * Percentage of asynchronous read-ahead. + * Average of read-ahead fields context value. + * If DEBUG_READAHEAD is defined, a snapshot of these fields is written + * to the syslog. 
+ */ + +#ifdef PROFILE_READAHEAD + +#define PROFILE_MAXREADCOUNT 1000 + +static unsigned long total_reada; +static unsigned long total_async; +static unsigned long total_ramax; +static unsigned long total_ralen; +static unsigned long total_rawin; + +static void profile_readahead(int async, struct file *filp) +{ + unsigned long flags; + + ++total_reada; + if (async) + ++total_async; + + total_ramax += filp->f_ramax; + total_ralen += filp->f_ralen; + total_rawin += filp->f_rawin; + + if (total_reada > PROFILE_MAXREADCOUNT) { + save_flags(flags); + cli(); + if (!(total_reada > PROFILE_MAXREADCOUNT)) { + restore_flags(flags); + return; + } + + printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", + total_ramax/total_reada, + total_ralen/total_reada, + total_rawin/total_reada, + (total_async*100)/total_reada); +#ifdef DEBUG_READAHEAD + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", + filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); +#endif + + total_reada = 0; + total_async = 0; + total_ramax = 0; + total_ralen = 0; + total_rawin = 0; + + restore_flags(flags); + } +} +#endif /* defined PROFILE_READAHEAD */ + +/* + * Read-ahead context: + * ------------------- + * The read ahead context fields of the "struct file" are the following: + * - f_raend : position of the first byte after the last page we tried to + * read ahead. + * - f_ramax : current read-ahead maximum size. + * - f_ralen : length of the current IO read block we tried to read-ahead. + * - f_rawin : length of the current read-ahead window. + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen + * + * Read-ahead limits: + * ------------------ + * MIN_READAHEAD : minimum read-ahead size when read-ahead. + * MAX_READAHEAD : maximum read-ahead size when read-ahead. + * + * Synchronous read-ahead benefits: + * -------------------------------- + * Using reasonable IO xfer length from peripheral devices increase system + * performances. + * Reasonable means, in this context, not too large but not too small. + * The actual maximum value is: + * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined + * and 32K if defined (4K page size assumed). + * + * Asynchronous read-ahead benefits: + * --------------------------------- + * Overlapping next read request and user process execution increase system + * performance. + * + * Read-ahead risks: + * ----------------- + * We have to guess which further data are needed by the user process. + * If these data are often not really needed, it's bad for system + * performances. + * However, we know that files are often accessed sequentially by + * application programs and it seems that it is possible to have some good + * strategy in that guessing. + * We only try to read-ahead files that seems to be read sequentially. + * + * Asynchronous read-ahead risks: + * ------------------------------ + * In order to maximize overlapping, we must start some asynchronous read + * request from the device, as soon as possible. + * We must be very careful about: + * - The number of effective pending IO read requests. + * ONE seems to be the only reasonable value. + * - The total memory pool usage for the file access stream. + * This maximum memory usage is implicitly 2 IO read chunks: + * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, + * 64k if defined (4K page size assumed). 
+ */ + +static inline int get_max_readahead(struct inode * inode) +{ + if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) + return vm_max_readahead; + return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; +} + +static void generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + struct page * page) +{ + unsigned long end_index; + unsigned long index = page->index; + unsigned long max_ahead, ahead; + unsigned long raend; + int max_readahead = get_max_readahead(inode); + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + raend = filp->f_raend; + max_ahead = 0; + +/* + * The current page is locked. + * If the current position is inside the previous read IO request, do not + * try to reread previously read ahead pages. + * Otherwise decide or not to read ahead some pages synchronously. + * If we are not going to read ahead, set the read ahead context for this + * page only. + */ + if (PageLocked(page)) { + if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { + raend = index; + if (raend < end_index) + max_ahead = filp->f_ramax; + filp->f_rawin = 0; + filp->f_ralen = 1; + if (!max_ahead) { + filp->f_raend = index + filp->f_ralen; + filp->f_rawin += filp->f_ralen; + } + } + } +/* + * The current page is not locked. + * If we were reading ahead and, + * if the current max read ahead size is not zero and, + * if the current position is inside the last read-ahead IO request, + * it is the moment to try to read ahead asynchronously. + * We will later force unplug device in order to force asynchronous read IO. + */ + else if (reada_ok && filp->f_ramax && raend >= 1 && + index <= raend && index + filp->f_ralen >= raend) { +/* + * Add ONE page to max_ahead in order to try to have about the same IO max size + * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. + * Compute the position of the last page we have tried to read in order to + * begin to read ahead just at the next page. + */ + raend -= 1; + if (raend < end_index) + max_ahead = filp->f_ramax + 1; + + if (max_ahead) { + filp->f_rawin = filp->f_ralen; + filp->f_ralen = 0; + reada_ok = 2; + } + } +/* + * Try to read ahead pages. + * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the + * scheduler, will work enough for us to avoid too bad actuals IO requests. + */ + ahead = 0; + while (ahead < max_ahead) { + ahead ++; + if ((raend + ahead) >= end_index) + break; + if (page_cache_read(filp, raend + ahead) < 0) + break; + } +/* + * If we tried to read ahead some pages, + * If we tried to read ahead asynchronously, + * Try to force unplug of the device in order to start an asynchronous + * read IO request. + * Update the read-ahead context. + * Store the length of the current read-ahead window. + * Double the current max read ahead size. + * That heuristic avoid to do some large IO for files that are not really + * accessed sequentially. + */ + if (ahead) { + filp->f_ralen += ahead; + filp->f_rawin += filp->f_ralen; + filp->f_raend = raend + ahead + 1; + + filp->f_ramax += filp->f_ramax; + + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + +#ifdef PROFILE_READAHEAD + profile_readahead((reada_ok == 2), filp); +#endif + } + + return; +} + + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. 
But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +{ + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long index, offset; + struct page *cached_page; + int reada_ok; + int error; + int max_readahead = get_max_readahead(inode); + + cached_page = NULL; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + +/* + * If the current position is outside the previous read-ahead window, + * we reset the current read-ahead context and set read ahead max to zero + * (will be set to just needed value later), + * otherwise, we assume that the file accesses are sequential enough to + * continue read-ahead. + */ + if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { + reada_ok = 0; + filp->f_raend = 0; + filp->f_ralen = 0; + filp->f_ramax = 0; + filp->f_rawin = 0; + } else { + reada_ok = 1; + } +/* + * Adjust the current value of read-ahead max. + * If the read operation stay in the first half page, force no readahead. + * Otherwise try to increase read ahead max just enough to do the read request. + * Then, at least MIN_READAHEAD if read ahead is ok, + * and at most MAX_READAHEAD in all cases. + */ + if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + filp->f_ramax = 0; + } else { + unsigned long needed; + + needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; + + if (filp->f_ramax < needed) + filp->f_ramax = needed; + + if (reada_ok && filp->f_ramax < vm_min_readahead) + filp->f_ramax = vm_min_readahead; + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + } + + for (;;) { + struct page *page, **hash; + unsigned long end_index, nr, ret; + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + if (index > end_index) + break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + nr = nr - offset; + + /* + * Try to find the data in the page cache.. + */ + hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (!page) + goto no_cached_page; +found_page: + page_cache_get(page); + spin_unlock(&pagecache_lock); + + if (!Page_Uptodate(page)) + goto page_not_up_to_date; + generic_file_readahead(reada_ok, filp, inode, page); +page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + + /* + * Mark the page accessed if we read the + * beginning or we just did an lseek. + */ + if (!offset || !filp->f_reada) + touch_page(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). 
+ */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + break; + +/* + * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. + */ +page_not_up_to_date: + generic_file_readahead(reada_ok, filp, inode, page); + + if (Page_Uptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (!error) { + if (Page_Uptodate(page)) + goto page_ok; + + /* Again, try some read-ahead while waiting for the page to finish.. */ + generic_file_readahead(reada_ok, filp, inode, page); + wait_on_page(page); + if (Page_Uptodate(page)) + goto page_ok; + error = -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + * + * We get here with the page cache lock held. + */ + if (!cached_page) { + spin_unlock(&pagecache_lock); + cached_page = page_cache_alloc(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + break; + } + + /* + * Somebody may have added the page while we + * dropped the page cache lock. Check for that. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (page) + goto found_page; + } + + /* + * Ok, add the new page to the hash-queues... + */ + page = cached_page; + __add_to_page_cache(page, mapping, index, hash); + spin_unlock(&pagecache_lock); + lru_cache_add(page); + cached_page = NULL; + + goto readpage; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + filp->f_reada = 1; + if (cached_page) + page_cache_release(cached_page); + UPDATE_ATIME(inode); +} + +static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset) +{ + ssize_t retval; + int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; + struct kiobuf * iobuf; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, &filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. + */ + retval = alloc_kiovec(1, &iobuf); + if (retval) + goto out; + new_iobuf = 1; + } + + blocksize = 1 << inode->i_blkbits; + blocksize_bits = inode->i_blkbits; + blocksize_mask = blocksize - 1; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + + retval = -EINVAL; + if ((offset & blocksize_mask) || (count & blocksize_mask)) + goto out_free; + if (!mapping->a_ops->direct_IO) + goto out_free; + + /* + * Flush to disk exlusively the _data_, metadata must remains + * completly asynchronous or performance will go to /dev/null. 
+ */ + filemap_fdatasync(mapping); + retval = fsync_inode_data_buffers(inode); + filemap_fdatawait(mapping); + if (retval < 0) + goto out_free; + + progress = retval = 0; + while (count > 0) { + iosize = count; + if (iosize > chunk_size) + iosize = chunk_size; + + retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (retval) + break; + + retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + + if (rw == READ && retval > 0) + mark_dirty_kiobuf(iobuf, retval); + + if (retval >= 0) { + count -= retval; + buf += retval; + progress += retval; + } + + unmap_kiobuf(iobuf); + + if (retval != iosize) + break; + } + + if (progress) + retval = progress; + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return retval; +} + +int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (filp->f_flags & O_DIRECT) + goto o_direct; + + retval = -EFAULT; + if (access_ok(VERIFY_WRITE, buf, count)) { + retval = 0; + + if (count) { + read_descriptor_t desc; + + desc.written = 0; + desc.count = count; + desc.buf = buf; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + } + } + out: + return retval; + + o_direct: + { + loff_t pos = *ppos, size; + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + retval = 0; + if (!count) + goto out; /* skip atime */ + size = inode->i_size; + if (pos < size) { + if (pos + count > size) + count = size - pos; + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; + } + UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } +} + +static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = (struct file *) desc->buf; + + if (size > count) + size = count; + + if (file->f_op->sendpage) { + written = file->f_op->sendpage(file, page, offset, + size, &file->f_pos, sizef_op->write(file, kaddr + offset, size, &file->f_pos); + kunmap(page); + + set_fs(old_fs); + } + if (written < 0) { + desc->error = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + ssize_t retval; + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + + /* + * Get input file, and verify that it is ok.. 
+ */ + retval = -EBADF; + in_file = fget(in_fd); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -EINVAL; + in_inode = in_file->f_dentry->d_inode; + if (!in_inode) + goto fput_in; + if (!in_inode->i_mapping->a_ops->readpage) + goto fput_in; + retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count); + if (retval) + goto fput_in; + + /* + * Get output file, and verify that it is ok.. + */ + retval = -EBADF; + out_file = fget(out_fd); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + if (!out_file->f_op || !out_file->f_op->write) + goto fput_out; + out_inode = out_file->f_dentry->d_inode; + retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count); + if (retval) + goto fput_out; + + retval = 0; + if (count) { + read_descriptor_t desc; + loff_t pos = 0, *ppos; + + retval = -EFAULT; + ppos = &in_file->f_pos; + if (offset) { + if (get_user(pos, offset)) + goto fput_out; + ppos = &pos; + } + + desc.written = 0; + desc.count = count; + desc.buf = (char *) out_file; + desc.error = 0; + do_generic_file_read(in_file, ppos, &desc, file_send_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + if (offset) + put_user(pos, offset); + } + +fput_out: + fput(out_file); +fput_in: + fput(in_file); +out: + return retval; +} + +static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + unsigned long max; + + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + /* Limit it to the size of the file.. */ + max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT; + if (index > max) + return 0; + max -= index; + if (nr > max) + nr = max; + + /* And limit it to a sane percentage of the inactive list.. */ + max = nr_inactive_clean_pages / 2; + if (nr > max) + nr = max; + + while (nr) { + page_cache_read(file, index); + index++; + nr--; + } + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT; + ret = do_readahead(file, start, len); + } + fput(file); + } + return ret; +} + +/* + * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are + * sure this is sequential access, we don't need a flexible read-ahead + * window size -- we can always use a large fixed size window. + */ +static void nopage_sequential_readahead(struct vm_area_struct * vma, + unsigned long pgoff, unsigned long filesize) +{ + unsigned long ra_window; + + ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); + ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); + + /* vm_raend is zero if we haven't read ahead in this area yet. */ + if (vma->vm_raend == 0) + vma->vm_raend = vma->vm_pgoff + ra_window; + + /* + * If we've just faulted the page half-way through our window, + * then schedule reads for the next window, and release the + * pages in the previous window. 
+ */ + if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { + unsigned long start = vma->vm_pgoff + vma->vm_raend; + unsigned long end = start + ra_window; + + if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) + end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; + if (start > end) + return; + + while ((start < end) && (start < filesize)) { + if (read_cluster_nonblocking(vma->vm_file, + start, filesize) < 0) + break; + start += CLUSTER_PAGES; + } + run_task_queue(&tq_disk); + + /* if we're far enough past the beginning of this area, + recycle pages that are in the previous window. */ + if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { + unsigned long window = ra_window << PAGE_SHIFT; + + end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); + end -= window + window; + filemap_sync(vma, end - window, window, MS_INVALIDATE); + } + + vma->vm_raend += ra_window; + } + + return; +} + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + struct page *page, **hash; + unsigned long size, pgoff, endoff; + + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if ((pgoff >= size) && (area->vm_mm == current->mm)) + return NULL; + + /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ + if (size > endoff) + size = endoff; + + /* + * Do we have something in the page cache already? + */ + hash = page_hash(mapping, pgoff); +retry_find: + page = __find_get_page(mapping, pgoff, hash); + if (!page) + goto no_cached_page; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!Page_Uptodate(page)) + goto page_not_uptodate; + +success: + /* + * Try read-ahead for sequential areas. + */ + if (VM_SequentialReadHint(area)) + nopage_sequential_readahead(area, pgoff, size); + + /* + * Found the page and have a reference on it, need to check sharing + * and possibly copy it over to another page.. + */ + touch_page(page); + flush_page_to_ram(page); + return page; + +no_cached_page: + /* + * If the requested offset is within our file, try to read a whole + * cluster of pages at once. + * + * Otherwise, we're off the end of a privately mapped file, + * so we need to map a zero page. + */ + if ((pgoff < size) && !VM_RandomReadHint(area)) + error = read_cluster_nonblocking(file, pgoff, size); + else + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. 
+ */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +/* Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. + */ +static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t pte = *ptep; + + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { + flush_tlb_page(vma, address); + set_page_dirty(page); + } + } + return 0; +} + +static inline int filemap_sync_pte_range(pmd_t * pmd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned long offset, unsigned int flags) +{ + pte_t * pte; + unsigned long end; + int error; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + pte = pte_offset(pmd, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + error = 0; + do { + error |= filemap_sync_pte(pte, vma, address + offset, flags); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + return error; +} + +static inline int filemap_sync_pmd_range(pgd_t * pgd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned int flags) +{ + pmd_t * pmd; + unsigned long offset, end; + int error; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return 0; + } + pmd = pmd_offset(pgd, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + error = 0; + do { + error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return error; +} + +int filemap_sync(struct vm_area_struct * vma, unsigned long address, + size_t size, unsigned int flags) +{ + pgd_t * dir; + unsigned long end = address + size; + int error = 0; + + /* Aquire the lock early; it may be possible to avoid dropping + * and reaquiring it repeatedly. 
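+	 *
+	 * As a rough sizing example (i386 without PAE, 4 KB pages): a
+	 * pgd-aligned 4 MB range is covered by one pgd entry, so the loop
+	 * below enters filemap_sync_pmd_range() once and sweeps 1024 ptes
+	 * while the lock is held.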
+	 */
+	spin_lock(&vma->vm_mm->page_table_lock);
+
+	dir = pgd_offset(vma->vm_mm, address);
+	flush_cache_range(vma->vm_mm, end - size, end);
+	if (address >= end)
+		BUG();
+	do {
+		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		dir++;
+	} while (address && (address < end));
+	flush_tlb_range(vma->vm_mm, end - size, end);
+
+	spin_unlock(&vma->vm_mm->page_table_lock);
+
+	return error;
+}
+
+static struct vm_operations_struct generic_file_vm_ops = {
+	nopage:		filemap_nopage,
+};
+
+/* This is used for a general mmap of a disk file */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
+		if (!mapping->a_ops->writepage)
+			return -EINVAL;
+	}
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	UPDATE_ATIME(inode);
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+/*
+ * The msync() system call.
+ */
+
+static int msync_interval(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int flags)
+{
+	struct file * file = vma->vm_file;
+	if (file && (vma->vm_flags & VM_SHARED)) {
+		int error;
+		error = filemap_sync(vma, start, end-start, flags);
+
+		if (!error && (flags & MS_SYNC)) {
+			struct inode * inode = file->f_dentry->d_inode;
+			down(&inode->i_sem);
+			filemap_fdatasync(inode->i_mapping);
+			if (file->f_op && file->f_op->fsync)
+				error = file->f_op->fsync(file, file->f_dentry, 1);
+			filemap_fdatawait(inode->i_mapping);
+			up(&inode->i_sem);
+		}
+		return error;
+	}
+	return 0;
+}
+
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error, error = -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+		goto out;
+	error = 0;
+	if (end == start)
+		goto out;
+	/*
+	 * If the interval [start,end) covers some unmapped address ranges,
+	 * just ignore them, but return -EFAULT at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	unmapped_error = 0;
+	for (;;) {
+		/* Still start < end. */
+		error = -EFAULT;
+		if (!vma)
+			goto out;
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -EFAULT;
+			start = vma->vm_start;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = msync_interval(vma, start, end, flags);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = msync_interval(vma, start, vma->vm_end, flags);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+}
+
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+	int behavior)
+{
+	VM_ClearReadHint(vma);
+	switch(behavior) {
+		case MADV_SEQUENTIAL:
+			vma->vm_flags |= VM_SEQ_READ;
+			break;
+		case MADV_RANDOM:
+			vma->vm_flags |= VM_RAND_READ;
+			break;
+		default:
+			break;
+	}
+	return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+	unsigned long end, int behavior)
+{
+	struct vm_area_struct * n;
+	struct mm_struct * mm = vma->vm_mm;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_end = end;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	if (n->vm_file)
+		get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_start = end;
+	__insert_vm_struct(mm, n);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+	unsigned long start, int behavior)
+{
+	struct vm_area_struct * n;
+	struct mm_struct * mm = vma->vm_mm;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_start = start;
+	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	if (n->vm_file)
+		get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_end = start;
+	__insert_vm_struct(mm, n);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	struct vm_area_struct * left, * right;
+	struct mm_struct * mm = vma->vm_mm;
+
+	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!left)
+		return -EAGAIN;
+	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!right) {
+		kmem_cache_free(vm_area_cachep, left);
+		return -EAGAIN;
+	}
+	*left = *vma;
+	*right = *vma;
+	left->vm_end = start;
+	right->vm_start = end;
+	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
+	if (vma->vm_file)
+		atomic_add(2, &vma->vm_file->f_count);
+
+	if (vma->vm_ops && vma->vm_ops->open) {
+		vma->vm_ops->open(left);
+		vma->vm_ops->open(right);
+	}
+	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_raend = 0;
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_start = start;
+	vma->vm_end = end;
+	setup_read_behavior(vma, behavior);
+	__insert_vm_struct(mm, left);
+	__insert_vm_struct(mm, right);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
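+ *
+ * For example (sizes are illustrative): advising MADV_RANDOM on the
+ * middle 16 KB of a 32 KB vma leaves three vmas behind; the head and
+ * tail keep the old behavior and only the middle one gets VM_RAND_READ
+ * (madvise_fixup_middle).  Touching exactly one end of the vma needs a
+ * single split (madvise_fixup_start or madvise_fixup_end), and covering
+ * the whole vma needs no split at all.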
+ */ +static long madvise_behavior(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + int error = 0; + + /* This caps the number of vma's this process can own */ + if (vma->vm_mm->map_count > max_map_count) + return -ENOMEM; + + if (start == vma->vm_start) { + if (end == vma->vm_end) { + setup_read_behavior(vma, behavior); + vma->vm_raend = 0; + } else + error = madvise_fixup_start(vma, end, behavior); + } else { + if (end == vma->vm_end) + error = madvise_fixup_end(vma, start, behavior); + else + error = madvise_fixup_middle(vma, start, end, behavior); + } + + return error; +} + +/* + * Schedule all required I/O operations, then run the disk queue + * to make sure they are started. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + long error = -EBADF; + struct file * file; + unsigned long size, rlim_rss; + + /* Doesn't work if there's no mapped file. */ + if (!vma->vm_file) + return error; + file = vma->vm_file; + size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + /* Make sure this doesn't exceed the process's max rss. */ + error = -EIO; + rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : + LONG_MAX; /* default: see resource.h */ + if ((vma->vm_mm->rss + (end - start)) > rlim_rss) + return error; + + /* round to cluster boundaries if this isn't a "random" area. */ + if (!VM_RandomReadHint(vma)) { + start = CLUSTER_OFFSET(start); + end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); + + while ((start < end) && (start < size)) { + error = read_cluster_nonblocking(file, start, size); + start += CLUSTER_PAGES; + if (error < 0) + break; + } + } else { + while ((start < end) && (start < size)) { + error = page_cache_read(file, start); + start++; + if (error < 0) + break; + } + } + + /* Don't wait for someone else to push these requests. */ + run_task_queue(&tq_disk); + + return error; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for refill_inactive to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * refill_inactive to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). 
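+ *
+ * A minimal sketch of that pattern (buf, len and log_fd are purely
+ * illustrative; userspace code, needs <sys/mman.h> and <unistd.h>):
+ *
+ *	write(log_fd, buf, len);
+ *	fsync(log_fd);
+ *	if (madvise(buf, len, MADV_DONTNEED) < 0)
+ *		perror("madvise");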
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	if (vma->vm_flags & VM_LOCKED)
+		return -EINVAL;
+
+	zap_page_range(vma->vm_mm, start, end - start);
+	return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+	unsigned long end, int behavior)
+{
+	long error = -EBADF;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		error = madvise_behavior(vma, start, end, behavior);
+		break;
+
+	case MADV_WILLNEED:
+		error = madvise_willneed(vma, start, end);
+		break;
+
+	case MADV_DONTNEED:
+		error = madvise_dontneed(vma, start, end);
+		break;
+
+	default:
+		error = -EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
+	struct page * page, ** hash = page_hash(as, pgoff);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(as, pgoff, *hash);
+	if ((page) && (Page_Uptodate(page)))
+		present = 1;
+	spin_unlock(&pagecache_lock);
+
+	return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec" */
+	remaining = (end - start),
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ *		or len has a nonpositive value
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char * vec)
+{
+	int index = 0;
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	long error = -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_CACHE_MASK)
+		goto out;
+	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+}
+
+static inline
+struct page *__read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page **hash = page_hash(mapping, index);
+	struct page *page, *cached_page = NULL;
+	int err;
+repeat:
+	page = __find_get_page(mapping, index, hash);
+	if (!page) {
+		if (!cached_page) {
+			cached_page = page_cache_alloc(mapping);
+			if (!cached_page)
+				return ERR_PTR(-ENOMEM);
+		}
+		page = cached_page;
+		if (add_to_page_cache_unique(page, mapping, index, hash))
+			goto repeat;
+		cached_page = NULL;
+		err = filler(data, page);
+		if (err < 0) {
+			page_cache_release(page);
+			page = ERR_PTR(err);
+		}
+	}
+	if (cached_page)
+		page_cache_release(cached_page);
+	return page;
+}
+
+/*
+ * Read into the page cache. If a page already exists,
+ * and Page_Uptodate() is not set, try to fill the page.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page;
+	int err;
+
+retry:
+	page = __read_cache_page(mapping, index, filler, data);
+	if (IS_ERR(page))
+		goto out;
+	touch_page(page);
+	if (Page_Uptodate(page))
+		goto out;
+
+	lock_page(page);
+	if (!page->mapping) {
+		UnlockPage(page);
+		page_cache_release(page);
+		goto retry;
+	}
+	if (Page_Uptodate(page)) {
+		UnlockPage(page);
+		goto out;
+	}
+	err = filler(data, page);
+	if (err < 0) {
+		page_cache_release(page);
+		page = ERR_PTR(err);
+	}
+ out:
+	return page;
+}
+
+static inline struct page * __grab_cache_page(struct address_space *mapping,
+				unsigned long index, struct page **cached_page)
+{
+	struct page *page, **hash = page_hash(mapping, index);
+repeat:
+	page = __find_lock_page(mapping, index, hash);
+	if (!page) {
+		if (!*cached_page) {
+			*cached_page = page_cache_alloc(mapping);
+			if (!*cached_page)
+				return NULL;
+		}
+		page = *cached_page;
+		if (add_to_page_cache_unique(page, mapping, index, hash))
+			goto repeat;
+		*cached_page = NULL;
+	}
+	return page;
+}
+
+inline void remove_suid(struct inode *inode)
+{
+	unsigned int mode;
+
+	/* set S_IGID if S_IXGRP is set, and always set S_ISUID */
+	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
+
+	/* was any of the uid bits set? */
+	mode &= inode->i_mode;
+	if (mode && !capable(CAP_FSETID)) {
+		inode->i_mode &= ~mode;
+		mark_inode_dirty(inode);
+	}
+}
+
+/*
+ * Write to a file through the page cache.
+ *
+ * We currently put everything into the page cache prior to writing it.
+ * This is not a problem when writing full pages. With partial pages,
+ * however, we first have to read the data into the cache, then
+ * dirty the page, and finally schedule it for writing.
Alternatively, we + * could write-through just the portion of data that would go into that + * page, but that would kill performance for applications that write data + * line by line, and it's prone to race conditions. + * + * Note that this routine doesn't try to keep track of dirty pages. Each + * file system has to do this all by itself, unfortunately. + * okir@monad.swb.de + */ +ssize_t +generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + loff_t pos; + struct page *page, *cached_page; + unsigned long written; + long status = 0; + int err; + unsigned bytes; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + cached_page = NULL; + + down(&inode->i_sem); + + pos = *ppos; + err = -EINVAL; + if (pos < 0) + goto out; + + err = file->f_error; + if (err) { + file->f_error = 0; + goto out; + } + + written = 0; + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) + pos = inode->i_size; + + /* + * Check whether we've reached the file size limit. + */ + err = -EFBIG; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = limit - (u32)pos; + } + } + + /* + * LFS rule + */ + if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (count > MAX_NON_LFS - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = MAX_NON_LFS - (u32)pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write + * If we have exceeded without writing data we send + * a signal and give them an EFBIG. + * + * Linus frestrict idea will clean these up nicely.. + */ + + if (!S_ISBLK(inode->i_mode)) { + if (pos >= inode->i_sb->s_maxbytes) + { + if (count || pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + err = -EFBIG; + goto out; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (pos + count > inode->i_sb->s_maxbytes) + count = inode->i_sb->s_maxbytes - pos; + } else { + if (is_read_only(inode->i_rdev)) { + err = -EPERM; + goto out; + } + if (pos >= inode->i_size) { + if (count || pos > inode->i_size) { + err = -ENOSPC; + goto out; + } + } + + if (pos + count > inode->i_size) + count = inode->i_size - pos; + } + + err = 0; + if (count == 0) + goto out; + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + + if (file->f_flags & O_DIRECT) + goto o_direct; + + do { + unsigned long index, offset; + long page_fault; + char *kaddr; + int deactivate = 1; + + /* + * Try to find the page in the cache. If it isn't there, + * allocate a free page. + */ + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) { + bytes = count; + deactivate = 0; + } + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. 
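+		 *
+		 * The classic way to trip this (a sketch; src, fd, off and n
+		 * are illustrative) is writing from an mmap of the same file
+		 * back into the page being written:
+		 *
+		 *	char *src = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+		 *	write(fd, src + off, n);
+		 *
+		 * If the source page is not yet uptodate the copy faults while
+		 * we hold the destination page locked, so both ends of the
+		 * user buffer are touched first to take that fault here,
+		 * outside the critical section.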
+ */ + { volatile unsigned char dummy; + __get_user(dummy, buf); + __get_user(dummy, buf+bytes-1); + } + + status = -ENOMEM; /* we'll assign it later anyway */ + page = __grab_cache_page(mapping, index, &cached_page); + if (!page) + break; + + /* We have exclusive IO access to the page.. */ + if (!PageLocked(page)) { + PAGE_BUG(page); + } + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); + if (status) + goto unlock; + page_fault = __copy_from_user(kaddr+offset, buf, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); + if (page_fault) + goto fail_write; + if (!status) + status = bytes; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } +unlock: + kunmap(page); + /* Mark it unlocked again and drop the page.. */ + UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + touch_page(page); + page_cache_release(page); + + if (status < 0) + break; + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* For now, when the user asks for O_SYNC, we'll actually + * provide O_DSYNC. */ + if (status >= 0) { + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); + } + +out_status: + err = written ? written : status; +out: + + up(&inode->i_sem); + return err; +fail_write: + status = -EFAULT; + goto unlock; + +o_direct: + written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); + if (written > 0) { + loff_t end = pos + written; + if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { + inode->i_size = end; + mark_inode_dirty(inode); + } + *ppos = end; + invalidate_inode_pages2(mapping); + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. 
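+	 *
+	 * (The buffered path above asks for OSYNC_METADATA | OSYNC_DATA at
+	 * the equivalent point; after direct IO the data pass would have
+	 * nothing to write back, so only OSYNC_METADATA is requested below.)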
+ */ + if (written >= 0 && file->f_flags & O_SYNC) + status = generic_osync_inode(inode, OSYNC_METADATA); + goto out_status; +} + +void __init page_cache_init(unsigned long mempages) +{ + unsigned long htable_size, order; + + htable_size = mempages; + htable_size *= sizeof(struct page *); + for(order = 0; (PAGE_SIZE << order) < htable_size; order++) + ; + + do { + unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); + + page_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + page_hash_bits++; + + page_hash_table = (struct page **) + __get_free_pages(GFP_ATOMIC, order); + } while(page_hash_table == NULL && --order > 0); + + printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", + (1 << page_hash_bits), order, (PAGE_SIZE << order)); + if (!page_hash_table) + panic("Failed to allocate page hash table\n"); + memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); +} diff -urN linux-2.4.17-rc1-virgin/mm/highmem.c linux-2.4.17-rc1-wli3/mm/highmem.c --- linux-2.4.17-rc1-virgin/mm/highmem.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/highmem.c Mon Oct 22 15:01:57 2001 @@ -32,7 +32,7 @@ */ static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; -static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static spinlock_t kmap_lock = SPIN_LOCK_UNLOCKED; pte_t * pkmap_page_table; diff -urN linux-2.4.17-rc1-virgin/mm/memory.c linux-2.4.17-rc1-wli3/mm/memory.c --- linux-2.4.17-rc1-virgin/mm/memory.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/memory.c Sun Dec 16 17:58:10 2001 @@ -46,6 +46,7 @@ #include #include +#include #include #include @@ -101,6 +102,7 @@ } pte = pte_offset(dir, 0); pmd_clear(dir); + pgtable_remove_rmap(pte); pte_free(pte); } @@ -235,9 +237,11 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || @@ -258,6 +262,7 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; @@ -313,8 +318,10 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte. */ tlb_remove_page(tlb, ptep, address + offset); } else { @@ -355,7 +362,8 @@ /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +void do_zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -397,16 +405,17 @@ spin_unlock(&mm->page_table_lock); } + /* * Do a quick page-table lookup for a single page. 
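+ * (For a write lookup this succeeds only if the pte is already present,
+ * writable and dirty; callers such as map_user_kiobuf() fall back to
+ * handle_mm_fault() and retry when NULL comes back.)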
*/ -static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) +static struct page * follow_page(unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; - pgd = pgd_offset(mm, address); + pgd = pgd_offset(current->mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; @@ -442,74 +451,21 @@ return page; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) -{ - int i = 0; - - do { - struct vm_area_struct * vma; - - vma = find_extend_vma(mm, start); - - if ( !vma || - (!force && - ((write && (!(vma->vm_flags & VM_WRITE))) || - (!write && (!(vma->vm_flags & VM_READ))) ) )) { - if (i) return i; - return -EFAULT; - } - - spin_lock(&mm->page_table_lock); - do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { - spin_unlock(&mm->page_table_lock); - switch (handle_mm_fault(mm, vma, start, write)) { - case 1: - tsk->min_flt++; - break; - case 2: - tsk->maj_flt++; - break; - case 0: - if (i) return i; - return -EFAULT; - default: - if (i) return i; - return -ENOMEM; - } - spin_lock(&mm->page_table_lock); - } - if (pages) { - pages[i] = get_page_map(map); - /* FIXME: call the correct function, - * depending on the type of the found page - */ - if (pages[i]) - page_cache_get(pages[i]); - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - } while(len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); - } while(len); - return i; -} - /* * Force in an entire range of pages from the current process's user VA, * and pin them in physical memory. */ -#define dprintk(x...) +#define dprintk(x...) int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) { - int pgcount, err; + unsigned long ptr, end; + int err; struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); /* Make sure the iobuf is not already mapped somewhere. 
*/ if (iobuf->nr_pages) @@ -518,37 +474,79 @@ mm = current->mm; dprintk ("map_user_kiobuf: begin\n"); - pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE; - /* mapping 0 bytes is not permitted */ - if (!pgcount) BUG(); - err = expand_kiobuf(iobuf, pgcount); + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); if (err) return err; + down_read(&mm->mmap_sem); + + err = -EFAULT; iobuf->locked = 0; - iobuf->offset = va & (PAGE_SIZE-1); + iobuf->offset = va & ~PAGE_MASK; iobuf->length = len; - /* Try to fault in all of the necessary pages */ - down_read(&mm->mmap_sem); - /* rw==READ means read from disk, write into memory area */ - err = get_user_pages(current, mm, va, pgcount, - (rw==READ), 0, iobuf->maplist, NULL); - up_read(&mm->mmap_sem); - if (err < 0) { - unmap_kiobuf(iobuf); - dprintk ("map_user_kiobuf: end %d\n", err); - return err; - } - iobuf->nr_pages = err; - while (pgcount--) { - /* FIXME: flush superflous for rw==READ, - * probably wrong function for rw==WRITE - */ - flush_dcache_page(iobuf->maplist[pgcount]); + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; } + + up_read(&mm->mmap_sem); dprintk ("map_user_kiobuf: end OK\n"); return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; } /* @@ -598,9 +596,6 @@ if (map) { if (iobuf->locked) UnlockPage(map); - /* FIXME: cache flush missing for rw==READ - * FIXME: call the correct reference counting function - */ page_cache_release(map); } } @@ -609,6 +604,20 @@ iobuf->locked = 0; } +void zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + + if (actions & ZPR_PARTITION && chunk > ZPR_MAX_BYTES) + chunk = ZPR_MAX_BYTES; + do_zap_page_range(mm, address, chunk); + + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. 
@@ -718,11 +727,15 @@ return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + debug_lock_break(1); + break_spin_lock(&mm->page_table_lock); + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -750,7 +763,7 @@ pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -953,7 +966,9 @@ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); lru_cache_add(new_page); /* Free the old page.. */ @@ -984,7 +999,7 @@ /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); continue; } @@ -997,7 +1012,7 @@ /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -1035,10 +1050,16 @@ do_expand: limit = current->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } inode->i_size = offset; out_truncate: @@ -1047,11 +1068,8 @@ inode->i_op->truncate(inode); unlock_kernel(); } - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); out: - return -EFBIG; + return 0; } /* @@ -1114,8 +1132,6 @@ ret = 2; } - mark_page_accessed(page); - lock_page(page); /* @@ -1145,6 +1161,7 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + page_add_rmap(page, page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1160,14 +1177,13 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. 
*/ spin_unlock(&mm->page_table_lock); @@ -1186,10 +1202,10 @@ flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); @@ -1234,11 +1250,9 @@ */ if (write_access && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) return -1; - } - copy_user_highpage(page, new_page, address); + copy_highpage(page, new_page); page_cache_release(new_page); lru_cache_add(page); new_page = page; @@ -1264,6 +1278,7 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + page_add_rmap(new_page, page_table); } else { /* One of our sibling threads was faster, back out. */ page_cache_release(new_page); @@ -1421,25 +1436,30 @@ goto out; } } + pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } out: return pte_offset(pmd, address); } +/* + * Simplistic page force-in.. + */ int make_pages_present(unsigned long addr, unsigned long end) { - int ret, len, write; + int write; + struct mm_struct *mm = current->mm; struct vm_area_struct * vma; - vma = find_vma(current->mm, addr); + vma = find_vma(mm, addr); write = (vma->vm_flags & VM_WRITE) != 0; if (addr >= end) BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - return ret == len ? 0 : -1; + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; } diff -urN linux-2.4.17-rc1-virgin/mm/memory.c~ linux-2.4.17-rc1-wli3/mm/memory.c~ --- linux-2.4.17-rc1-virgin/mm/memory.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/memory.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,1446 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. 
+ * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +unsigned long max_mapnr; +unsigned long num_physpages; +void * high_memory; +struct page *highmem_start_page; + +/* + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). + */ +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +{ + if (from == ZERO_PAGE(address)) { + clear_user_highpage(to, address); + return; + } + copy_user_highpage(to, from, address); +} + +mem_map_t * mem_map; + +/* + * Called by TLB shootdown + */ +void __free_pte(pte_t pte) +{ + struct page *page = pte_page(pte); + if ((!VALID_PAGE(page)) || PageReserved(page)) + return; + if (pte_dirty(pte)) + set_page_dirty(page); + free_page_and_swap_cache(page); +} + + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void free_one_pmd(pmd_t * dir) +{ + pte_t * pte; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, 0); + pmd_clear(dir); + pgtable_remove_rmap(pte); + pte_free(pte); +} + +static inline void free_one_pgd(pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd+j+(PREFETCH_STRIDE/16)); + free_one_pmd(pmd+j); + } + pmd_free(pmd); +} + +/* Low and high watermarks for page table cache. + The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] + */ +int pgt_cache_water[2] = { 25, 50 }; + +/* Returns the number of pages freed */ +int check_pgt_cache(void) +{ + return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); +} + + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. + */ +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) +{ + pgd_t * page_dir = mm->pgd; + + spin_lock(&mm->page_table_lock); + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); + spin_unlock(&mm->page_table_lock); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +} + +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc(). 
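+ *
+ * For example, after fork() a private writable page is left
+ * write-protected in both parent and child (the cow case below), so
+ * whichever process writes first faults and gets its own copy via
+ * do_wp_page(); a VM_SHARED page is simply mapped into the child with
+ * the pte marked clean.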
+ */ +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_alloc(dst, dst_pmd, address); + if (!dst_pte) + goto nomem; + + spin_lock(&src->page_table_lock); + do { + pte_t pte = *src_pte; + struct page *ptepage; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; + } + ptepage = pte_page(pte); + if ((!VALID_PAGE(ptepage)) || + PageReserved(ptepage)) + goto cont_copy_pte_range; + + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow) { + ptep_set_wrprotect(src_pte); + pte = *src_pte; + } + + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(ptepage); + dst->rss++; + +cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); +cont_copy_pte_range_noset: address += PAGE_SIZE; + if (address >= end) + goto out_unlock; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out_unlock: + spin_unlock(&src->page_table_lock); +out: + return 0; +nomem: + return -ENOMEM; +} + +/* + * Return indicates whether a page was freed so caller can adjust rss + */ +static inline void forget_pte(pte_t page) +{ + if (!pte_none(page)) { + printk("forget_pte: old mapping existed!\n"); + BUG(); + } +} + +static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t * ptep; + int freed = 0; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + ptep = pte_offset(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page)) { + freed ++; + page_remove_rmap(page, ptep); + } + /* This will eventually call __free_pte on the pte. 
*/ + tlb_remove_page(tlb, ptep, address + offset); + } else { + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); + } + } + + return freed; +} + +static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + int freed; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return 0; + } + pmd = pmd_offset(dir, address); + end = address + size; + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + freed = 0; + do { + freed += zap_pte_range(tlb, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return freed; +} + +/* + * remove user pages in a given range. + */ +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +{ + mmu_gather_t *tlb; + pgd_t * dir; + unsigned long start = address, end = address + size; + int freed = 0; + + dir = pgd_offset(mm, address); + + /* + * This is a long-lived spinlock. That's fine. + * There's no contention, because the page table + * lock only protects against kswapd anyway, and + * even if kswapd happened to be looking at this + * process we _want_ it to get stuck. + */ + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, address, end); + tlb = tlb_gather_mmu(mm); + + do { + freed += zap_pmd_range(tlb, dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* + * Update rss for the mm_struct (not necessarily current->mm) + * Notice that rss is an unsigned long. + */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + spin_unlock(&mm->page_table_lock); +} + + +/* + * Do a quick page-table lookup for a single page. + */ +static struct page * follow_page(unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + + pgd = pgd_offset(current->mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + + ptep = pte_offset(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + if (pte_present(pte)) { + if (!write || + (pte_write(pte) && pte_dirty(pte))) + return pte_page(pte); + } + +out: + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages in kiobufs. + */ + +static inline struct page * get_page_map(struct page *page) +{ + if (!VALID_PAGE(page)) + return 0; + return page; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ + +#define dprintk(x...) +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); + + /* Make sure the iobuf is not already mapped somewhere. 
*/ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + down_read(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = 0; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; +} + +/* + * Mark all of the pages in a kiobuf as dirty + * + * We need to be able to deal with short reads from disk: if an IO error + * occurs, the number of bytes read into memory may be less than the + * size of the kiobuf, so we have to stop marking pages dirty once the + * requested byte count has been reached. + */ + +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; + + index = iobuf->offset >> PAGE_SHIFT; + offset = iobuf->offset & ~PAGE_MASK; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + + if (!PageReserved(page)) + SetPageDirty(page); + + remaining -= (PAGE_SIZE - offset); + offset = 0; + index++; + } +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + if (map) { + if (iobuf->locked) + UnlockPage(map); + page_cache_release(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + + +/* + * Lock down all of the pages of a kiovec for IO. + * + * If any page is mapped twice in the kiovec, we return the error -EINVAL. + * + * The optional wait parameter causes the lock call to block until all + * pages can be locked if set. If wait==0, the lock operation is + * aborted if any locked pages are found and -EAGAIN is returned. 
+ */ + +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + int doublepage = 0; + int repeat = 0; + + repeat: + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (iobuf->locked) + continue; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + + if (TryLockPage(page)) { + while (j--) { + struct page *tmp = *--ppage; + if (tmp) + UnlockPage(tmp); + } + goto retry; + } + } + iobuf->locked = 1; + } + + return 0; + + retry: + + /* + * We couldn't lock one of the pages. Undo the locking so far, + * wait on the page we got to, and try again. + */ + + unlock_kiovec(nr, iovec); + if (!wait) + return -EAGAIN; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(page)) { + /* + * If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ just be a coincidence, + * but if it happens more than once, chances + * are we have a double-mapped page. + */ + if (++doublepage >= 3) + return -EINVAL; + + /* Try again... */ + wait_on_page(page); + } + + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + +/* + * Unlock all of the pages of a kiovec after IO. + */ + +int unlock_kiovec(int nr, struct kiobuf *iovec[]) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (!iobuf->locked) + continue; + iobuf->locked = 0; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + UnlockPage(page); + } + } + return 0; +} + +static inline void zeromap_pte_range(pte_t * pte, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t oldpage = ptep_get_and_clear(pte); + set_pte(pte, zero_pte); + forget_pte(oldpage); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, address, end - address, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = address; + unsigned long end = address + size; + struct mm_struct *mm = current->mm; + + dir = pgd_offset(mm, address); + flush_cache_range(mm, beg, end); + if (address >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. 
any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + struct page *page; + pte_t oldpage; + oldpage = ptep_get_and_clear(pte); + + page = virt_to_page(__va(phys_addr)); + if ((!VALID_PAGE(page)) || PageReserved(page)) + set_pte(pte, mk_pte_phys(phys_addr, prot)); + forget_pte(oldpage); + address += PAGE_SIZE; + phys_addr += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, address, end - address, address + phys_addr, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. */ +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = from; + unsigned long end = from + size; + struct mm_struct *mm = current->mm; + + phys_addr -= from; + dir = pgd_offset(mm, from); + flush_cache_range(mm, beg, end); + if (from >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (from && (from < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * Establish a new mapping: + * - flush the old one + * - update the page tables + * - inform the TLB about the new one + * + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +{ + set_pte(page_table, entry); + flush_tlb_page(vma, address); + update_mmu_cache(vma, address, entry); +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + flush_page_to_ram(new_page); + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. 
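+ *
+ * With the reverse-mapping VM in this tree, tearing down or
+ * installing a pte here must also be reflected in the owning page's
+ * pte_chain, so the copy path below pairs each pte update with the
+ * rmap bookkeeping:
+ *
+ *	page_remove_rmap(old_page, page_table);
+ *	break_cow(vma, new_page, address, page_table);
+ *	page_add_rmap(new_page, page_table);
+ *
+ * where break_cow() installs the writable, dirty pte for the copy.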
+ * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pte_t pte) +{ + struct page *old_page, *new_page; + + old_page = pte_page(pte); + if (!VALID_PAGE(old_page)) + goto bad_wp_page; + + if (!TryLockPage(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + } + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + copy_cow_page(old_page,new_page,address); + + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + if (pte_same(*page_table, pte)) { + if (PageReserved(old_page)) + ++mm->rss; + page_remove_rmap(old_page, page_table); + break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); + lru_cache_add(new_page); + + /* Free the old page.. */ + new_page = old_page; + } + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + page_cache_release(old_page); + return 1; /* Minor fault */ + +bad_wp_page: + spin_unlock(&mm->page_table_lock); + printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); + return -1; +no_mem: + page_cache_release(old_page); + return -1; +} + +static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) +{ + do { + struct mm_struct *mm = mpnt->vm_mm; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long len = end - start; + unsigned long diff; + + /* mapping wholly truncated? */ + if (mpnt->vm_pgoff >= pgoff) { + zap_page_range(mm, start, len); + continue; + } + + /* mapping wholly unaffected? */ + len = len >> PAGE_SHIFT; + diff = pgoff - mpnt->vm_pgoff; + if (diff >= len) + continue; + + /* Ok, partially affected.. */ + start += diff << PAGE_SHIFT; + len = (len - diff) << PAGE_SHIFT; + zap_page_range(mm, start, len); + } while ((mpnt = mpnt->vm_next_share) != NULL); +} + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. 
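+ *
+ * A small worked example, assuming PAGE_CACHE_SIZE == 4096:
+ * truncating to offset 10240 (two and a half pages) gives
+ *
+ *	pgoff = (10240 + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT = 3
+ *
+ * so vmtruncate_list() above zaps file pages 3 and upward from every
+ * mapping, while page 2, which holds the new partial end of file,
+ * stays mapped and is left to truncate_inode_pages() to clean up.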
+ */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + unsigned long pgoff; + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + inode->i_size = offset; + spin_lock(&mapping->i_shared_lock); + if (!mapping->i_mmap && !mapping->i_mmap_shared) + goto out_unlock; + + pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (mapping->i_mmap != NULL) + vmtruncate_list(mapping->i_mmap, pgoff); + if (mapping->i_mmap_shared != NULL) + vmtruncate_list(mapping->i_mmap_shared, pgoff); + +out_unlock: + spin_unlock(&mapping->i_shared_lock); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } + inode->i_size = offset; + +out_truncate: + if (inode->i_op && inode->i_op->truncate) { + lock_kernel(); + inode->i_op->truncate(inode); + unlock_kernel(); + } +out: + return 0; +} + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(swp_entry_t entry) +{ + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. + */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); + } + return; +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. + */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t * page_table, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = 1; + + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + int retval; + spin_lock(&mm->page_table_lock); + retval = pte_same(*page_table, orig_pte) ? -1 : 1; + spin_unlock(&mm->page_table_lock); + return retval; + } + + /* Had to read the page from swap area: Major fault */ + ret = 2; + } + + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + if (!pte_same(*page_table, orig_pte)) { + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + return 1; + } + + /* The page isn't present yet, go ahead with the fault. 
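+ *
+ * (The back-out tests above follow the shape every fault handler in
+ * this file uses once page_table_lock has to be dropped for an
+ * operation that may sleep; schematically, with back_out standing in
+ * for the handler's own cleanup path:
+ *
+ *	spin_unlock(&mm->page_table_lock);
+ *	... allocate memory or wait for I/O ...
+ *	spin_lock(&mm->page_table_lock);
+ *	if (!pte_same(*page_table, orig_pte))
+ *		goto back_out;		someone else serviced the fault
+ *
+ * do_no_page() and do_anonymous_page() use pte_none() for the same
+ * recheck because their pte started out empty.)
+ *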
*/ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + mm->rss++; + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) + pte = pte_mkdirty(pte_mkwrite(pte)); + unlock_page(page); + + flush_page_to_ram(page); + flush_icache_page(vma, page); + set_pte(page_table, pte); + page_add_rmap(page, page_table); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry; + struct page * page = ZERO_PAGE(addr); + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + /* Allocate our own private page. */ + spin_unlock(&mm->page_table_lock); + + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto no_mem; + clear_user_highpage(page, addr); + + spin_lock(&mm->page_table_lock); + if (!pte_none(*page_table)) { + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + return 1; + } + mm->rss++; + flush_page_to_ram(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + lru_cache_add(page); + } + + set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + +no_mem: + return -1; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. + */ +static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) +{ + struct page * new_page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, write_access, address); + spin_unlock(&mm->page_table_lock); + + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + + if (new_page == NULL) /* no page was available -- SIGBUS */ + return 0; + if (new_page == NOPAGE_OOM) + return -1; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page * page = alloc_page(GFP_HIGHUSER); + if (!page) + return -1; + copy_highpage(page, new_page); + page_cache_release(new_page); + lru_cache_add(page); + new_page = page; + } + + spin_lock(&mm->page_table_lock); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... 
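+ *
+ * (pte_none() is the right recheck here: the pte was empty when the
+ * fault started and kswapd only ever removes entries, so any change
+ * while the lock was dropped means a sibling thread, which also
+ * holds the mm semaphore for reading, resolved the fault first and
+ * we can simply back out.)
+ *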
*/ + if (pte_none(*page_table)) { + ++mm->rss; + flush_page_to_ram(new_page); + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = pte_mkwrite(pte_mkdirty(entry)); + set_pte(page_table, entry); + page_add_rmap(new_page, page_table); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + return 1; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + spin_unlock(&mm->page_table_lock); + return 2; /* Major fault */ +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t * pte) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. + */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte); + return do_swap_page(mm, vma, address, pte, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + establish_pte(vma, address, pte, entry); + spin_unlock(&mm->page_table_lock); + return 1; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + + current->state = TASK_RUNNING; + pgd = pgd_offset(mm, address); + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); + + if (pmd) { + pte_t * pte = pte_alloc(mm, pmd, address); + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte); + } + spin_unlock(&mm->page_table_lock); + return -1; +} + +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level page table, this ends up actually being entirely + * optimized away. + */ +pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + /* "fast" allocation can happen without dropping the lock.. 
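+ *
+ * ("fast" here means an allocation that cannot sleep, e.g. a
+ * quicklist of previously freed page tables where the architecture
+ * provides one; if it fails, the lock is dropped for the sleeping
+ * allocator and the pgd entry is re-checked afterwards, since a
+ * sibling thread may have populated it in the meantime.)
+ *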
*/ + new = pmd_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pgd_none(*pgd)) { + pmd_free(new); + goto out; + } + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset(pgd, address); +} + +/* + * Allocate the page table directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (pmd_none(*pmd)) { + pte_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pte_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pmd_none(*pmd)) { + pte_free(new); + goto out; + } + } + pgtable_add_rmap(new, mm, address); + pmd_populate(mm, pmd, new); + } +out: + return pte_offset(pmd, address); +} + +/* + * Simplistic page force-in.. + */ +int make_pages_present(unsigned long addr, unsigned long end) +{ + int write; + struct mm_struct *mm = current->mm; + struct vm_area_struct * vma; + + vma = find_vma(mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; +} diff -urN linux-2.4.17-rc1-virgin/mm/mmap.c linux-2.4.17-rc1-wli3/mm/mmap.c --- linux-2.4.17-rc1-virgin/mm/mmap.c Sun Nov 4 10:17:20 2001 +++ linux-2.4.17-rc1-wli3/mm/mmap.c Sun Dec 16 17:58:10 2001 @@ -45,6 +45,7 @@ }; int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; /* Check that a process has enough memory to allocate a * new virtual mapping. @@ -413,7 +414,7 @@ return -EINVAL; /* Too many mappings? */ - if (mm->map_count > MAX_MAP_COUNT) + if (mm->map_count > max_map_count) return -ENOMEM; /* Obtain the address to map to. we verify (or select) it and ensure @@ -569,7 +570,7 @@ fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, ZPR_NORMAL); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -919,7 +920,7 @@ /* If we'll make "hole", check the vm areas limit */ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) - && mm->map_count >= MAX_MAP_COUNT) + && mm->map_count >= max_map_count) return -ENOMEM; /* @@ -967,7 +968,7 @@ remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_PARTITION); /* * Fix the mapping, and free the old area if it wasn't reused. 
@@ -1040,7 +1041,7 @@ > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; - if (mm->map_count > MAX_MAP_COUNT) + if (mm->map_count > max_map_count) return -ENOMEM; if (!vm_enough_memory(len >> PAGE_SHIFT)) @@ -1127,7 +1128,7 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_PARTITION); if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); diff -urN linux-2.4.17-rc1-virgin/mm/mmap.c~ linux-2.4.17-rc1-wli3/mm/mmap.c~ --- linux-2.4.17-rc1-virgin/mm/mmap.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/mmap.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,1173 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * WARNING: the debugging will use recursive algorithms so never enable this + * unless you know what you are doing. + */ +#undef DEBUG_MM_RB + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; + +/* Check that a process has enough memory to allocate a + * new virtual mapping. + */ +int vm_enough_memory(long pages) +{ + /* Stupid algorithm to decide if we have enough memory: while + * simple, it hopefully works in most obvious cases.. Easy to + * fool it, but this should catch most mistakes. + */ + /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * which tries to do "TheRightThing". Instead of using half of + * (buffers+cache), use the minimum values. Allow an extra 2% + * of num_physpages for safety margin. + */ + + unsigned long free; + + /* Sometimes we want to use more memory than we have. */ + if (sysctl_overcommit_memory) + return 1; + + /* The page cache contains buffer pages these days.. */ + free = atomic_read(&page_cache_size); + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* + * The code below doesn't account for free space in the inode + * and dentry slab cache, slab cache fragmentation, inodes and + * dentries which will become freeable under VM load, etc. + * Lets just hope all these (complex) factors balance out... + */ + free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + + return free > pages; +} + +/* Remove one vm structure from the inode's i_mapping address space. 
*/ +static inline void __remove_shared_vm_struct(struct vm_area_struct *vma) +{ + struct file * file = vma->vm_file; + + if (file) { + struct inode *inode = file->f_dentry->d_inode; + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&inode->i_writecount); + if(vma->vm_next_share) + vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; + *vma->vm_pprev_share = vma->vm_next_share; + } +} + +static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +{ + lock_vma_mappings(vma); + __remove_shared_vm_struct(vma); + unlock_vma_mappings(vma); +} + +void lock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_lock(&mapping->i_shared_lock); +} + +void unlock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_unlock(&mapping->i_shared_lock); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against rlimit.. */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Check if we have enough memory.. */ + if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used + * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits + * into "VM_xxx". 
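+ *
+ * A concrete example: mmap() with PROT_READ|PROT_WRITE and
+ * MAP_PRIVATE translates as
+ *
+ *	calc_vm_flags(PROT_READ|PROT_WRITE, MAP_PRIVATE)
+ *		== VM_READ | VM_WRITE
+ *
+ * (MAP_PRIVATE itself contributes no bits); do_mmap_pgoff() then ORs
+ * in VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC and mm->def_flags.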
+ */ +static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags) +{ +#define _trans(x,bit1,bit2) \ +((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0) + + unsigned long prot_bits, flag_bits; + prot_bits = + _trans(prot, PROT_READ, VM_READ) | + _trans(prot, PROT_WRITE, VM_WRITE) | + _trans(prot, PROT_EXEC, VM_EXEC); + flag_bits = + _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) | + _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) | + _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE); + return prot_bits | flag_bits; +#undef _trans +} + +#ifdef DEBUG_MM_RB +static int browse_rb(rb_node_t * rb_node) { + int i = 0; + if (rb_node) { + i++; + i += browse_rb(rb_node->rb_left); + i += browse_rb(rb_node->rb_right); + } + return i; +} + +static void validate_mm(struct mm_struct * mm) { + int bug = 0; + int i = 0; + struct vm_area_struct * tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(mm->mm_rb.rb_node); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (bug) + BUG(); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct ** pprev, + rb_node_t *** rb_link, rb_node_t ** rb_parent) +{ + struct vm_area_struct * vma; + rb_node_t ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + return vma; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t * rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static inline void __vma_link_file(struct vm_area_struct * vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct inode * inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct vm_area_struct **head; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + head = &mapping->i_mmap; + if (vma->vm_flags & VM_SHARED) + head = &mapping->i_mmap_shared; + + /* insert vma into inode's share list */ + if((vma->vm_next_share = *head) != NULL) + (*head)->vm_pprev_share = &vma->vm_next_share; + *head = vma; + vma->vm_pprev_share = head; + } +} + +static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + 
__vma_link_file(vma); +} + +static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + __vma_link(mm, vma, prev, rb_link, rb_parent); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + + mm->map_count++; + validate_mm(mm); +} + +static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev, + rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags) +{ + spinlock_t * lock = &mm->page_table_lock; + if (!prev) { + prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + goto merge_next; + } + if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) { + struct vm_area_struct * next; + + spin_lock(lock); + prev->vm_end = end; + next = prev->vm_next; + if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) { + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + return 1; + } + spin_unlock(lock); + return 1; + } + + prev = prev->vm_next; + if (prev) { + merge_next: + if (!can_vma_merge(prev, vm_flags)) + return 0; + if (end == prev->vm_start) { + spin_lock(lock); + prev->vm_start = addr; + spin_unlock(lock); + return 1; + } + } + + return 0; +} + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned int vm_flags; + int correct_wcount = 0; + int error; + rb_node_t ** rb_link, * rb_parent; + + if (file && (!file->f_op || !file->f_op->mmap)) + return -ENODEV; + + if ((len = PAGE_ALIGN(len)) == 0) + return addr; + + if (len > TASK_SIZE) + return -EINVAL; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EINVAL; + + /* Too many mappings? */ + if (mm->map_count > max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* Make sure we don't allow writing to an append-only file.. */ + if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* make sure there are no mandatory locks on the file. 
*/ + if (locks_verify_locked(file->f_dentry->d_inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + break; + + default: + return -EINVAL; + } + } else { + vm_flags |= VM_SHARED | VM_MAYSHARE; + switch (flags & MAP_TYPE) { + default: + return -EINVAL; + case MAP_PRIVATE: + vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + /* fall through */ + case MAP_SHARED: + break; + } + } + + /* Clear old maps */ + error = -ENOMEM; +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + /* Private writable mapping? Check memory availability.. */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + /* Can we just expand an old anonymous mapping? */ + if (!file && !(vm_flags & VM_SHARED) && rb_parent) + if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags)) + goto out; + + /* Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = pgoff; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + vma->vm_raend = 0; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + } else if (flags & MAP_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + + vma_link(mm, vma, prev, rb_link, rb_parent); + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (vm_flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return error; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. 
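+ *
+ * Concretely, on a 32-bit machine -ENOMEM is (unsigned long)-12,
+ * i.e. 0xfffffff4, whose low PAGE_SHIFT bits are non-zero, so the
+ *
+ *	if (addr & ~PAGE_MASK)
+ *		return addr;
+ *
+ * test in do_mmap_pgoff() catches it. The scheme works because errno
+ * values are smaller than PAGE_SIZE and a successful return is
+ * always page aligned.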
+ */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct vm_area_struct *vma; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(current->mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); + + for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) + return -ENOMEM; + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = vma->vm_end; + } +} +#else +extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#endif + +unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + if (flags & MAP_FIXED) { + if (addr > TASK_SIZE - len) + return -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + return addr; + } + + if (file && file->f_op && file->f_op->get_unmapped_area) + return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); + + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + rb_node_t * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + if (mm) { + /* Go through the RB tree quickly. */ + struct vm_area_struct * vma; + rb_node_t * rb_node, * rb_last_right, * rb_prev; + + rb_node = mm->mm_rb.rb_node; + rb_last_right = rb_prev = NULL; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + rb_prev = rb_last_right; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else { + rb_last_right = rb_node; + rb_node = rb_node->rb_right; + } + } + if (vma) { + if (vma->vm_rb.rb_left) { + rb_prev = vma->vm_rb.rb_left; + while (rb_prev->rb_right) + rb_prev = rb_prev->rb_right; + } + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + if ((rb_prev ? 
(*pprev)->vm_next : mm->mmap) != vma) + BUG(); + return vma; + } + } + *pprev = NULL; + return NULL; +} + +struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. If possible, we reuse the existing area rather than + * allocate a new one, and the return indicates whether the old + * area was reused. + */ +static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, + struct vm_area_struct *area, unsigned long addr, size_t len, + struct vm_area_struct *extra) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + + /* Unmapping the whole area. */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_file) + fput(area->vm_file); + kmem_cache_free(vm_area_cachep, area); + return extra; + } + + /* Work out to one of the ends. */ + if (end == area->vm_end) { + /* + * here area isn't visible to the semaphore-less readers + * so we don't need to update it under the spinlock. + */ + area->vm_end = addr; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else if (addr == area->vm_start) { + area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; + /* same locking considerations of the above case */ + area->vm_start = end; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else { + /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ + /* Add end mapping -- leave beginning for below */ + mpnt = extra; + extra = NULL; + + mpnt->vm_mm = area->vm_mm; + mpnt->vm_start = end; + mpnt->vm_end = area->vm_end; + mpnt->vm_page_prot = area->vm_page_prot; + mpnt->vm_flags = area->vm_flags; + mpnt->vm_raend = 0; + mpnt->vm_ops = area->vm_ops; + mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT); + mpnt->vm_file = area->vm_file; + mpnt->vm_private_data = area->vm_private_data; + if (mpnt->vm_file) + get_file(mpnt->vm_file); + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + + /* Because mpnt->vm_file == area->vm_file this locks + * things correctly. 
+ */ + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + __insert_vm_struct(mm, mpnt); + } + + __insert_vm_struct(mm, area); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(area); + return extra; +} + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. + */ +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + unsigned long start_index, end_index; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + /* + * If the PGD bits are not consecutive in the virtual address, the + * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. + */ + start_index = pgd_index(first); + end_index = pgd_index(last); + if (end_index > start_index) { + clear_page_tables(mm, start_index, end_index - start_index); + flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); + } +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine + */ +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + mpnt = find_vma_prev(mm, addr, &prev); + if (!mpnt) + return 0; + /* we have addr < mpnt->vm_end */ + + if (mpnt->vm_start >= addr+len) + return 0; + + /* If we'll make "hole", check the vm areas limit */ + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) + && mm->map_count >= max_map_count) + return -ENOMEM; + + /* + * We may need one additional vma to fix up the mappings ... + * and this is the last chance for an easy error exit. + */ + extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!extra) + return -ENOMEM; + + npp = (prev ? &prev->vm_next : &mm->mmap); + free = NULL; + spin_lock(&mm->page_table_lock); + for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + rb_erase(&mpnt->vm_rb, &mm->mm_rb); + } + mm->mmap_cache = NULL; /* Kill the cache. */ + spin_unlock(&mm->page_table_lock); + + /* Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. 
+ * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + * In that case we have to be careful with VM_DENYWRITE. + */ + while ((mpnt = free) != NULL) { + unsigned long st, end, size; + struct file *file = NULL; + + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? mpnt->vm_end : end; + size = end - st; + + if (mpnt->vm_flags & VM_DENYWRITE && + (st != mpnt->vm_start || end != mpnt->vm_end) && + (file = mpnt->vm_file) != NULL) { + atomic_dec(&file->f_dentry->d_inode->i_writecount); + } + remove_shared_vm_struct(mpnt); + mm->map_count--; + + zap_page_range(mm, st, size); + + /* + * Fix the mapping, and free the old area if it wasn't reused. + */ + extra = unmap_fixup(mm, mpnt, st, size, extra); + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + } + validate_mm(mm); + + /* Release the extra vma struct if it wasn't used */ + if (extra) + kmem_cache_free(vm_area_cachep, extra); + + free_pgtables(mm, prev, addr, addr+len); + + return 0; +} + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + rb_node_t ** rb_link, * rb_parent; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > max_map_count) + return -ENOMEM; + + if (!vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = calc_vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE) | mm->def_flags; + + flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* Can we just expand an old anonymous mapping? */ + if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = 0; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + + vma_link(mm, vma, prev, rb_link, rb_parent); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +/* Build the RB tree corresponding to the VMA list. 
*/ +void build_mmap_rb(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + rb_node_t ** rb_link, * rb_parent; + + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + __vma_link_rb(mm, vma, rb_link, rb_parent); + rb_parent = &vma->vm_rb; + rb_link = &rb_parent->rb_right; + } +} + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct * mm) +{ + struct vm_area_struct * mpnt; + + release_segments(mm); + spin_lock(&mm->page_table_lock); + mpnt = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + mm->rss = 0; + spin_unlock(&mm->page_table_lock); + mm->total_vm = 0; + mm->locked_vm = 0; + + flush_cache_mm(mm); + while (mpnt) { + struct vm_area_struct * next = mpnt->vm_next; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long size = end - start; + + if (mpnt->vm_ops) { + if (mpnt->vm_ops->close) + mpnt->vm_ops->close(mpnt); + } + mm->map_count--; + remove_shared_vm_struct(mpnt); + zap_page_range(mm, start, size); + if (mpnt->vm_file) + fput(mpnt->vm_file); + kmem_cache_free(vm_area_cachep, mpnt); + mpnt = next; + } + flush_tlb_mm(mm); + + /* This is just debugging */ + if (mm->map_count) + BUG(); + + clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap ring. If vm_file is non-NULL + * then the i_shared_lock must be held here. + */ +void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; + validate_mm(mm); +} + +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + vma_link(mm, vma, prev, rb_link, rb_parent); + validate_mm(mm); +} diff -urN linux-2.4.17-rc1-virgin/mm/mremap.c linux-2.4.17-rc1-wli3/mm/mremap.c --- linux-2.4.17-rc1-virgin/mm/mremap.c Thu Sep 20 20:31:26 2001 +++ linux-2.4.17-rc1-wli3/mm/mremap.c Sun Dec 16 17:58:10 2001 @@ -61,8 +61,14 @@ { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if (!dst) { /* No dest? We must put it back. 
*/ @@ -70,6 +76,8 @@ error++; } set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); } return error; } @@ -118,7 +126,7 @@ flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, ZPR_NORMAL); return -1; } diff -urN linux-2.4.17-rc1-virgin/mm/mremap.c~ linux-2.4.17-rc1-wli3/mm/mremap.c~ --- linux-2.4.17-rc1-virgin/mm/mremap.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/mremap.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,360 @@ +/* + * linux/mm/remap.c + * + * (C) Copyright 1996 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +#include +#include + +extern int vm_enough_memory(long pages); + +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + goto end; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + goto end; + } + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto end; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto end; + } + + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) + pte = NULL; +end: + return pte; +} + +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pmd_t * pmd; + pte_t * pte = NULL; + + pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + if (pmd) + pte = pte_alloc(mm, pmd, addr); + return pte; +} + +static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +{ + int error = 0; + pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); + + if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); + pte = ptep_get_and_clear(src); + if (!dst) { + /* No dest? We must put it back. */ + dst = src; + error++; + } + set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); + } + return error; +} + +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +{ + int error = 0; + pte_t * src; + + spin_lock(&mm->page_table_lock); + src = get_one_pte(mm, old_addr); + if (src) + error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); + spin_unlock(&mm->page_table_lock); + return error; +} + +static int move_page_tables(struct mm_struct * mm, + unsigned long new_addr, unsigned long old_addr, unsigned long len) +{ + unsigned long offset = len; + + flush_cache_range(mm, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + while (offset) { + offset -= PAGE_SIZE; + if (move_one_page(mm, old_addr + offset, new_addr + offset)) + goto oops_we_failed; + } + flush_tlb_range(mm, old_addr, old_addr + len); + return 0; + + /* + * Ok, the move failed because we didn't have enough pages for + * the new page table tree. This is unlikely, but we have to + * take the possibility into account. 
In that case we just move + * all the pages back (this will work, because we still have + * the old page tables) + */ +oops_we_failed: + flush_cache_range(mm, new_addr, new_addr + len); + while ((offset += PAGE_SIZE) < len) + move_one_page(mm, new_addr + offset, old_addr + offset); + zap_page_range(mm, new_addr, len); + return -1; +} + +static inline unsigned long move_vma(struct vm_area_struct * vma, + unsigned long addr, unsigned long old_len, unsigned long new_len, + unsigned long new_addr) +{ + struct mm_struct * mm = vma->vm_mm; + struct vm_area_struct * new_vma, * next, * prev; + int allocated_vma; + + new_vma = NULL; + next = find_vma_prev(mm, new_addr, &prev); + if (next) { + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + if (next != prev->vm_next) + BUG(); + if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(&mm->page_table_lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + } + } else if (next->vm_start == new_addr + new_len && + can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + next->vm_start = new_addr; + spin_unlock(&mm->page_table_lock); + new_vma = next; + } + } else { + prev = find_vma(mm, new_addr-1); + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + } + } + + allocated_vma = 0; + if (!new_vma) { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new_vma) + goto out; + allocated_vma = 1; + } + + if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + if (allocated_vma) { + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_raend = 0; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + } + do_munmap(current->mm, addr, old_len); + current->mm->total_vm += new_len >> PAGE_SHIFT; + if (new_vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + make_pages_present(new_vma->vm_start, + new_vma->vm_end); + } + return new_addr; + } + if (allocated_vma) + kmem_cache_free(vm_area_cachep, new_vma); + out: + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. 
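+ *
+ * The order of preference below is: plain shrink (the tail is just
+ * unmapped), growing in place when the following vma leaves room,
+ * and only then move_vma(), which requires MREMAP_MAYMOVE. From
+ * userspace the common call is simply
+ *
+ *	new = mremap(old, old_len, new_len, MREMAP_MAYMOVE);
+ *
+ * with MREMAP_FIXED additionally letting the caller pick new_addr.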
+ */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + do_munmap(current->mm, new_addr, new_len); + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + */ + ret = addr; + if (old_len >= new_len) { + do_munmap(current->mm, addr+new_len, old_len - new_len); + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + goto out; + } + ret = -ENOMEM; + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->rlim[RLIMIT_AS].rlim_cur) + goto out; + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) + goto out; + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = addr + new_len; + spin_unlock(&vma->vm_mm->page_table_lock); + current->mm->total_vm += pages; + if (vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. 
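+ * move_vma() relocates the mapping one pte at a time through
+ * move_page_tables(); with the reverse-mapping VM, copy_one_pte()
+ * also moves each page's pte_chain entry (page_remove_rmap() on the
+ * old pte, page_add_rmap() on the new one) under mm->page_table_lock.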
+ */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + if (!(flags & MREMAP_FIXED)) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_SHARED) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; + } + ret = move_vma(vma, addr, old_len, new_len, new_addr); + } +out: + return ret; +} + +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} diff -urN linux-2.4.17-rc1-virgin/mm/page_alloc.c linux-2.4.17-rc1-wli3/mm/page_alloc.c --- linux-2.4.17-rc1-virgin/mm/page_alloc.c Mon Nov 19 16:35:40 2001 +++ linux-2.4.17-rc1-wli3/mm/page_alloc.c Fri Dec 14 02:44:20 2001 @@ -21,8 +21,9 @@ int nr_swap_pages; int nr_active_pages; -int nr_inactive_pages; -struct list_head inactive_list; +int nr_inactive_dirty_pages; +int nr_inactive_clean_pages; +struct list_head inactive_dirty_list; struct list_head active_list; pg_data_t *pgdat_list; @@ -80,16 +81,17 @@ BUG(); if (PageLocked(page)) BUG(); - if (PageLRU(page)) - BUG(); if (PageActive(page)) BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte_chain) + BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - + page->age = PAGE_AGE_START; + zone = page->zone; mask = (~0UL) << order; @@ -134,17 +136,6 @@ memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -203,10 +194,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -225,78 +213,87 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do the work ourselves, call kswapd. 
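+ *
+ * Called from the first allocation pass in __alloc_pages() when a
+ * zone has dropped below zone->pages_min; direct_reclaim is set only
+ * for order-0 allocations that are allowed to wait (__GFP_WAIT).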
+ */ +static void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +static void fixup_freespace(zone_t * zone, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(); +} + +#define PAGES_MIN 0 +#define PAGES_LOW 1 +#define PAGES_HIGH 2 - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageSwapCache(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) +{ + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - break; - } - } while ((entry = entry->next) != local_pages); + for (;;) { + zone_t *z = *(zone++); + + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of free + inactive_clean + * pages is above the watermark. + */ + switch (limit) { + default: + case PAGES_MIN: + water_mark += z->pages_min; + break; + case PAGES_LOW: + water_mark += z->pages_low; + break; + case PAGES_HIGH: + water_mark += z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; + } else if (water_mark > z->need_balance) { + /* Set kswapd's free+clean target for the zone. + * we could do this in the init code, but this way + * we support arbitrary fallback between zones. + * + * XXX: how about DISCONTIGMEM boxes ? + */ + z->need_balance = water_mark; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. 
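+ * None of the zones reached the requested watermark; the caller
+ * retries with a lower limit or falls back to kswapd and direct
+ * reclaim.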
*/ + return NULL; } /* @@ -304,100 +301,239 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We are falling back to lower-level zones if allocation + * in a higher zone fails. + */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data ... DUH! + */ zone = zonelist->zones; - classzone = *zone; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); min += z->pages_low; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Try to allocate a page from a zone with a HIGH + * amount of free + inactive_clean pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + + wakeup_kswapd(); + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low free + inactive_clean pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + if ((gfp_mask & __GFP_WAIT) && !(current->flags & (PF_MEMALLOC | PF_MEMDIE))) + try_to_free_pages(gfp_mask); + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Damn, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * Try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * When we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we try to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... 
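+ *
+ * (For reference: GFP_ATOMIC does not set __GFP_WAIT and so never
+ * enters this blocking path; GFP_KERNEL sets both __GFP_WAIT and
+ * __GFP_FS and will loop back to try_again above.)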
+ * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. + */ + if (gfp_mask & __GFP_WAIT) { + __set_current_state(TASK_RUNNING); + current->policy |= SCHED_YIELD; + schedule(); + if (!order || free_shortage()) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail in case no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... + */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * instant execution... + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages and we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int freed = 0; zone = zonelist->zones; +defragment_again: for (;;) { zone_t *z = *(zone++); if (!z) break; - - page = rmqueue(z, order); - if (page) - return page; + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - return NULL; - - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* XXX: do real defragmentation instead of calling launder ? 
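+ * For now we make a single PF_MEMALLOC-protected try_to_free_pages()
+ * pass and then retry the inactive_clean reclaim loop above once.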
*/ + if (!freed) { + freed = 1; + current->flags |= PF_MEMALLOC; + try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; - - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + +out_failed: + /* No luck.. */ +// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); + return NULL; } /* @@ -429,7 +565,8 @@ void page_cache_release(struct page *page) { if (!PageReserved(page) && put_page_testzero(page)) { - if (PageLRU(page)) + if (PageActive(page) || PageInactiveDirty(page) || + PageInactiveClean(page)) lru_cache_del(page); __free_pages_ok(page, 0); } @@ -537,10 +674,18 @@ tmpdat = tmpdat->node_next; } - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + nr_free_highpages() << (PAGE_SHIFT-10)); + + printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", + nr_active_pages, + nr_inactive_dirty_pages, + nr_inactive_clean_pages, + nr_free_pages(), + freepages.min, + freepages.low, + freepages.high); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -660,7 +805,7 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); INIT_LIST_HEAD(&active_list); - INIT_LIST_HEAD(&inactive_list); + INIT_LIST_HEAD(&inactive_dirty_list); /* * Some architectures (with lots of mem and discontinous memory @@ -709,7 +854,10 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_dirty_pages = 0; zone->need_balance = 0; + INIT_LIST_HEAD(&zone->inactive_clean_list); if (!size) continue; @@ -723,7 +871,20 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - + /* + * Add these free targets to the global free target; + * we have to be SURE that freepages.high is higher + * than SUM [zone->pages_min] for all zones, otherwise + * we may have bad bad problems. + * + * This means we cannot make the freepages array writable + * in /proc, but have to add a separate extra_free_target + * for people who require it to catch load spikes in eg. + * gigabit ethernet routing... + */ + freepages.min += mask; + freepages.low += mask*2; + freepages.high += mask*3; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; diff -urN linux-2.4.17-rc1-virgin/mm/rmap.c linux-2.4.17-rc1-wli3/mm/rmap.c --- linux-2.4.17-rc1-virgin/mm/rmap.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/rmap.c Fri Dec 14 04:21:37 2001 @@ -0,0 +1,354 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. 
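+ *
+ * Typical usage, as in copy_one_pte() (mm/mremap.c) and unuse_pte()
+ * (mm/swapfile.c) in this patch; sketch only, with the locking rules
+ * documented below:
+ *
+ *	set_pte(ptep, pte);
+ *	page_add_rmap(page, ptep);	(pte now maps page)
+ *	...
+ *	page_remove_rmap(page, ptep);	(about to unmap page)
+ *	pte = ptep_get_and_clear(ptep);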
+ */ + +/* + * Locking: + * - the page->pte_chain is protected by the pagemap_lru_lock, + * we probably want to change this to a per-page lock in the + * future + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include + +#include +#include +#include + +#ifdef DEBUG +/* #define DEBUG */ +#undef DEBUG +#endif + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * A singly linked list should be fine for most, if not all, workloads. + * On fork-after-exec the mapping we'll be removing will still be near + * the start of the list, on mixed application systems the short-lived + * processes will have their mappings near the start of the list and + * in systems with long-lived applications the relative overhead of + * exit() will be lower since the applications are long-lived. + */ +struct pte_chain { + struct pte_chain * next; + pte_t * ptep; +}; + +static struct pte_chain * pte_chain_freelist; +static inline struct pte_chain * pte_chain_alloc(void); +static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *); +static void alloc_new_pte_chains(void); + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * Caller needs to hold the pagemap_lru_lock. + */ +int FASTCALL(page_referenced(struct page *)); +int page_referenced(struct page * page) +{ + struct pte_chain * pc; + int referenced = 0; + + if (PageTestandClearReferenced(page)) + referenced++; + + /* Check all the page tables mapping this page. */ + for (pc = page->pte_chain; pc; pc = pc->next) { + if (ptep_test_and_clear_young(pc->ptep)) + referenced++; + } + + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +void FASTCALL(page_add_rmap(struct page *, pte_t *)); +void page_add_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain, * pc; + struct page * pte_page = virt_to_page(ptep); + + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!pte_page->mapping) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); +#ifdef DEBUG + for (pc = page->pte_chain; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); + } +#endif + pte_chain = pte_chain_alloc(); + + /* Hook up the pte_chain to the page. */ + pte_chain->ptep = ptep; + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + spin_unlock(&pagemap_lru_lock); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. 
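+ * Like page_add_rmap(), this is a no-op for reserved or otherwise
+ * invalid pages, which never carry pte_chains.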
+ */ +void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + + if (!page || !ptep) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); + for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, page); + goto out; + } + } +#ifdef DEBUG + /* Not found. This should NEVER happen! */ + printk("page_remove_rmap: pte_chain %p not present...\n", ptep); + printk("page_remove_rmap: only found: "); + for (pc = page->pte_chain; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + /* panic("page_remove_rmap: giving up.\n"); */ +#endif + +out: + spin_unlock(&pagemap_lru_lock); + return; + +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks. + * Locking: + * pagemap_lru_lock page_launder() + * page lock page_launder(), trylock + * mm->page_table_lock try_to_unmap_one(), trylock + */ +int FASTCALL(try_to_unmap_one(struct page *, pte_t *)); +int try_to_unmap_one(struct page * page, pte_t * ptep) +{ + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_AGAIN; + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the page table entry. */ + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry; + entry.val = page->index; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold pagemap_lru_lock + * and the page lock. Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_ERROR - an error occurred + */ +int FASTCALL(try_to_unmap(struct page *)); +int try_to_unmap(struct page * page) +{ + struct pte_chain * pc, * next_pc, * prev_pc = NULL; + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + if (!VALID_PAGE(page) || PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. 
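+ * (the pageout path is expected to have moved anonymous pages into
+ * the swap cache, e.g. via add_to_swap(), before calling us, so a
+ * NULL page->mapping here is a bug)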
*/ + if (!page->mapping) + BUG(); + + for (pc = page->pte_chain; pc; pc = next_pc) { + next_pc = pc->next; + switch (try_to_unmap_one(page, pc->ptep)) { + case SWAP_SUCCESS: + /* Free the pte_chain struct. */ + pte_chain_free(pc, prev_pc, page); + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + prev_pc = pc; + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + return SWAP_FAIL; + case SWAP_ERROR: + return SWAP_ERROR; + } + } + + return ret; +} + +/** + * pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + * @prev_pte_chain: previous pte_chain on the list (may be NULL) + * @page: page this pte_chain hangs off (may be NULL) + * + * This function unlinks pte_chain from the singly linked list it + * may be on and adds the pte_chain to the free list. May also be + * called for new pte_chain structures which aren't on any list yet. + * Caller needs to hold the pagemap_lru_list. + */ +static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page) +{ + if (prev_pte_chain) + prev_pte_chain->next = pte_chain->next; + else if (page) + page->pte_chain = pte_chain->next; + + pte_chain->ptep = NULL; + pte_chain->next = pte_chain_freelist; + pte_chain_freelist = pte_chain; +} + +/** + * pte_chain_alloc - allocate a pte_chain struct + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the pagemap_lru_lock. + */ +static inline struct pte_chain * pte_chain_alloc(void) +{ + struct pte_chain * pte_chain; + + /* Allocate new pte_chain structs as needed. */ + if (!pte_chain_freelist) + alloc_new_pte_chains(); + + /* Grab the first pte_chain from the freelist. */ + pte_chain = pte_chain_freelist; + pte_chain_freelist = pte_chain->next; + pte_chain->next = NULL; + + return pte_chain; +} + +/** + * alloc_new_pte_chains - convert a free page to pte_chain structures + * + * Grabs a free page and converts it to pte_chain structures. We really + * should pre-allocate these earlier in the pagefault path or come up + * with some other trick. + */ +static void alloc_new_pte_chains(void) +{ + struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC); + int i = PAGE_SIZE / sizeof(struct pte_chain); + + if (pte_chain) { + for (; i-- > 0; pte_chain++) + pte_chain_free(pte_chain, NULL, NULL); + } else { + /* Yeah yeah, I'll fix the pte_chain allocation ... 
*/ + panic("Fix pte_chain allocation, you lazy bastard!\n"); + } +} diff -urN linux-2.4.17-rc1-virgin/mm/shmem.c linux-2.4.17-rc1-wli3/mm/shmem.c --- linux-2.4.17-rc1-virgin/mm/shmem.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/shmem.c Wed Nov 21 09:57:57 2001 @@ -1193,7 +1193,7 @@ follow_link: shmem_follow_link, }; -static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long * blocks, unsigned long *inodes) +static int shmem_parse_options(char *options, int *mode, unsigned long * blocks, unsigned long *inodes) { char *this_char, *value, *rest; @@ -1205,7 +1205,7 @@ *value++ = 0; } else { printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", + "shmem_parse_options: No value for option '%s'\n", this_char); return 1; } @@ -1230,20 +1230,8 @@ *mode = simple_strtoul(value,&rest,8); if (*rest) goto bad_val; - } else if (!strcmp(this_char,"uid")) { - if (!uid) - continue; - *uid = simple_strtoul(value,&rest,0); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"gid")) { - if (!gid) - continue; - *gid = simple_strtoul(value,&rest,0); - if (*rest) - goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", + printk(KERN_ERR "shmem_parse_options: Bad option %s\n", this_char); return 1; } @@ -1251,7 +1239,7 @@ return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + printk(KERN_ERR "shmem_parse_options: Bad value '%s' for option '%s'\n", value, this_char); return 1; @@ -1263,7 +1251,7 @@ unsigned long max_blocks = sbinfo->max_blocks; unsigned long max_inodes = sbinfo->max_inodes; - if (shmem_parse_options (data, NULL, NULL, NULL, &max_blocks, &max_inodes)) + if (shmem_parse_options (data, NULL, &max_blocks, &max_inodes)) return -EINVAL; return shmem_set_size(sbinfo, max_blocks, max_inodes); } @@ -1280,8 +1268,6 @@ struct dentry * root; unsigned long blocks, inodes; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current->fsuid; - gid_t gid = current->fsgid; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct sysinfo si; @@ -1293,8 +1279,10 @@ blocks = inodes = si.totalram / 2; #ifdef CONFIG_TMPFS - if (shmem_parse_options (data, &mode, &uid, &gid, &blocks, &inodes)) + if (shmem_parse_options (data, &mode, &blocks, &inodes)) { + printk(KERN_ERR "tmpfs invalid option\n"); return NULL; + } #endif spin_lock_init (&sbinfo->stat_lock); @@ -1311,8 +1299,6 @@ if (!inode) return NULL; - inode->i_uid = uid; - inode->i_gid = gid; root = d_alloc_root(inode); if (!root) { iput(inode); diff -urN linux-2.4.17-rc1-virgin/mm/slab.c linux-2.4.17-rc1-wli3/mm/slab.c --- linux-2.4.17-rc1-virgin/mm/slab.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/slab.c Fri Dec 14 02:44:44 2001 @@ -49,7 +49,9 @@ * constructors and destructors are called without any locking. * Several members in kmem_cache_t and slab_t never change, they * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * they are however called with local interrupts disabled so no + * preempt_disable needed. * The non-constant members are protected with a per-cache irq spinlock. 
* * Further notes from the original documentation: @@ -109,11 +111,9 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN) + SLAB_NO_REAP | SLAB_CACHE_DMA) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN) +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | SLAB_CACHE_DMA) #endif /* @@ -651,7 +651,7 @@ flags &= ~SLAB_POISON; } #if FORCED_DEBUG - if ((size < (PAGE_SIZE>>3)) && !(flags & SLAB_MUST_HWCACHE_ALIGN)) + if (size < (PAGE_SIZE>>3)) /* * do not red zone large object, causes severe * fragmentation. @@ -1282,9 +1282,10 @@ }) #ifdef CONFIG_SMP -void* kmem_cache_alloc_batch(kmem_cache_t* cachep, cpucache_t* cc, int flags) +void* kmem_cache_alloc_batch(kmem_cache_t* cachep, int flags) { int batchcount = cachep->batchcount; + cpucache_t* cc = cc_data(cachep); spin_lock(&cachep->spinlock); while (batchcount--) { @@ -1333,7 +1334,7 @@ objp = cc_entry(cc)[--cc->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = kmem_cache_alloc_batch(cachep,cc,flags); + objp = kmem_cache_alloc_batch(cachep,flags); if (!objp) goto alloc_new_slab_nolock; } @@ -1921,13 +1922,12 @@ #endif #ifdef CONFIG_SMP { - cpucache_t *cc = cc_data(cachep); unsigned int batchcount = cachep->batchcount; unsigned int limit; - if (cc) - limit = cc->limit; - else + if (cc_data(cachep)) + limit = cc_data(cachep)->limit; + else limit = 0; len += sprintf(page+len, " : %4u %4u", limit, batchcount); diff -urN linux-2.4.17-rc1-virgin/mm/swap.c linux-2.4.17-rc1-wli3/mm/swap.c --- linux-2.4.17-rc1-virgin/mm/swap.c Tue Nov 6 22:44:20 2001 +++ linux-2.4.17-rc1-wli3/mm/swap.c Fri Dec 14 02:44:20 2001 @@ -24,6 +24,20 @@ #include /* for copy_to/from_user */ #include +/* + * We identify three levels of free memory. We never let free mem + * fall below the freepages.min except for atomic allocations. We + * start background swapping if we fall below freepages.high free + * pages, and we begin intensive swapping below freepages.low. + * + * Actual initialization is done in mm/page_alloc.c + */ +freepages_t freepages = { + 0, /* freepages.min */ + 0, /* freepages.low */ + 0 /* freepages.high */ +}; + /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -33,17 +47,59 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * @nolock - are we already holding the pagemap_lru_lock? + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void FASTCALL(deactivate_page_nolock(struct page *)); +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. + * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + if (PageActive(page)) { + page->age = 0; + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void FASTCALL(deactivate_page(struct page *)); +void deactivate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + /* * Move an inactive page to the active list. 
*/ -static inline void activate_page_nolock(struct page * page) +void FASTCALL(activate_page_nolock(struct page *)); +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); add_page_to_active_list(page); } + + /* Make sure the page gets a fair chance at staying active. */ + page->age = max((int)page->age, PAGE_AGE_START); } +void FASTCALL(activate_page(struct page *)); void activate_page(struct page * page) { spin_lock(&pagemap_lru_lock); @@ -55,11 +111,12 @@ * lru_cache_add: add a page to the page lists * @page: the page to add */ +void FASTCALL(lru_cache_add(struct page *)); void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + add_page_to_active_list(page); spin_unlock(&pagemap_lru_lock); } } @@ -71,14 +128,15 @@ * This function is for when the caller already holds * the pagemap_lru_lock. */ +void FASTCALL(__lru_cache_del(struct page *)); void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } } @@ -86,6 +144,7 @@ * lru_cache_del: remove a page from the page lists * @page: the page to remove */ +void FASTCALL(lru_cache_del(struct page *)); void lru_cache_del(struct page * page) { spin_lock(&pagemap_lru_lock); diff -urN linux-2.4.17-rc1-virgin/mm/swap_state.c linux-2.4.17-rc1-wli3/mm/swap_state.c --- linux-2.4.17-rc1-virgin/mm/swap_state.c Wed Oct 31 15:31:03 2001 +++ linux-2.4.17-rc1-wli3/mm/swap_state.c Fri Dec 14 02:44:20 2001 @@ -89,6 +89,40 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); + return 1; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache. diff -urN linux-2.4.17-rc1-virgin/mm/swapfile.c linux-2.4.17-rc1-wli3/mm/swapfile.c --- linux-2.4.17-rc1-virgin/mm/swapfile.c Sat Nov 3 17:05:25 2001 +++ linux-2.4.17-rc1-wli3/mm/swapfile.c Sun Dec 16 17:58:10 2001 @@ -374,6 +374,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); swap_free(entry); ++vma->vm_mm->rss; } @@ -696,6 +697,7 @@ * interactive performance. Interruptible check on * signal_pending() would be nice, but changes the spec? 
*/ + debug_lock_break(551); if (current->need_resched) schedule(); } @@ -1121,6 +1123,13 @@ if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + debug_lock_break(551); + swap_list_unlock(); + debug_lock_break(551); + unconditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: diff -urN linux-2.4.17-rc1-virgin/mm/swapfile.c~ linux-2.4.17-rc1-wli3/mm/swapfile.c~ --- linux-2.4.17-rc1-virgin/mm/swapfile.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/swapfile.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,1291 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include +#include +#include +#include +#include +#include /* for blk_size */ +#include +#include +#include +#include + +#include + +spinlock_t swaplock = SPIN_LOCK_UNLOCKED; +unsigned int nr_swapfiles; +int total_swap_pages; +static int swap_overflow; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +struct swap_list_t swap_list = {-1, -1}; + +struct swap_info_struct swap_info[MAX_SWAPFILES]; + +#define SWAPFILE_CLUSTER 256 + +static inline int scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset; + /* + * We try to cluster swap pages by allocating them + * sequentially in swap. Once we've allocated + * SWAPFILE_CLUSTER pages this way, however, we resort to + * first-free allocation, starting a new cluster. This + * prevents us from scattering swap pages all over the entire + * swap partition, so that we reduce overall disk seek times + * between swap pages. -- sct */ + if (si->cluster_nr) { + while (si->cluster_next <= si->highest_bit) { + offset = si->cluster_next++; + if (si->swap_map[offset]) + continue; + si->cluster_nr--; + goto got_page; + } + } + si->cluster_nr = SWAPFILE_CLUSTER; + + /* try to find an empty (even not aligned) cluster. */ + offset = si->lowest_bit; + check_next_cluster: + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) + { + int nr; + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) + if (si->swap_map[nr]) + { + offset = nr+1; + goto check_next_cluster; + } + /* We found a completly empty cluster, so start + * using it. + */ + goto got_page; + } + /* No luck, so now go finegrined as usual. 
-Andrea */ + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + si->lowest_bit = offset+1; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + if (si->lowest_bit > si->highest_bit) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + nr_swap_pages--; + si->cluster_next = offset+1; + return offset; + } + si->lowest_bit = si->max; + si->highest_bit = 0; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset; + swp_entry_t entry; + int type, wrapped = 0; + + entry.val = 0; /* Out of memory */ + swap_list_lock(); + type = swap_list.next; + if (type < 0) + goto out; + if (nr_swap_pages <= 0) + goto out; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + swap_device_lock(p); + offset = scan_swap_map(p); + swap_device_unlock(p); + if (offset) { + entry = SWP_ENTRY(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) { + swap_list.next = swap_list.head; + } else { + swap_list.next = type; + } + goto out; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else + if (type < 0) + goto out; /* out of swap space */ + } +out: + swap_list_unlock(); + return entry; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static void swap_info_put(struct swap_info_struct * p) +{ + swap_device_unlock(p); + swap_list_unlock(); +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, SWP_OFFSET(entry)); + swap_info_put(p); + } +} + +/* + * Check if we're the only user of a swap page, + * when the page is locked. + */ +static int exclusive_swap_page(struct page *page) +{ + int retval = 0; + struct swap_info_struct * p; + swp_entry_t entry; + + entry.val = page->index; + p = swap_info_get(entry); + if (p) { + /* Is the only swap cache user the cache itself? 
*/ + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) + retval = 1; + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + } + return retval; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + * + * Here "exclusive_swap_page()" does the real + * work, but we opportunistically check whether + * we need to get all the locks first.. + */ +int can_share_swap_page(struct page *page) +{ + int retval = 0; + + if (!PageLocked(page)) + BUG(); + switch (page_count(page)) { + case 3: + if (!page->buffers) + break; + /* Fallthrough */ + case 2: + if (!PageSwapCache(page)) + break; + retval = exclusive_swap_page(page); + break; + case 1: + if (PageReserved(page)) + break; + retval = 1; + } + return retval; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +int remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + + entry.val = page->index; + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + + if (retval) { + block_flushpage(page, 0); + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) + page = find_trylock_page(&swapper_space, entry.val); + swap_info_put(p); + } + if (page) { + page_cache_get(page); + /* Only cache user (+us), or swap space full? Free it! */ + if (page_count(page) == 2 || vm_swap_full()) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + UnlockPage(page); + page_cache_release(page); + } +} + +/* + * The swap entry has been read in advance, and we return 1 to indicate + * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. 
+ */ +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, swp_entry_t entry, struct page* page) +{ + pte_t pte = *dir; + + if (likely(pte_to_swp_entry(pte).val != entry.val)) + return; + if (unlikely(pte_none(pte) || pte_present(pte))) + return; + get_page(page); + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); + swap_free(entry); + ++vma->vm_mm->rss; +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + swp_entry_t entry, struct page* page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + swp_entry_t entry, struct page* page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page* page) +{ + unsigned long start = vma->vm_start, end = vma->vm_end; + + if (start >= end) + BUG(); + do { + unuse_pgd(vma, pgdir, start, end - start, entry, page); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (start && (start < end)); +} + +static void unuse_process(struct mm_struct * mm, + swp_entry_t entry, struct page* page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + unuse_vma(vma, pgd, entry, page); + } + spin_unlock(&mm->page_table_lock); + return; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static int find_next_to_unuse(struct swap_info_struct *si, int prev) +{ + int max = si->max; + int i = prev; + int count; + + /* + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). + */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. 
+ */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering (now preserved by swap_out()), + * which clusters forked address spaces together, most recent + * child immediately after parent. If we race with dup_mmap(), + * we very much want to resolve parent before child, otherwise + * we may miss some entries: using last mm would invert that. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * mmput() removes mm from mmlist before exit_mmap() and its + * zap_page_range(). That's not too bad, those entries are + * on their way out, and handled faster there than here. + * do_munmap() behaves similarly, taking the range out of mm's + * vma list before zap_page_range(). But unfortunately, when + * unmapping a part of a vma, it takes the whole out first, + * then reinserts what's left after (might even reschedule if + * open() method called) - so swap entries may be invisible + * to swapoff for a while, then reappear - but that is rare. + */ + while ((i = find_next_to_unuse(si, i))) { + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = SWP_ENTRY(type, i); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page(page); + lock_page(page); + + /* + * Remove all references to entry, without blocking. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. 
+ */ + swcount = *swap_map; + if (swcount > 1) { + flush_page_to_ram(page); + if (start_mm == &init_mm) + shmem_unuse(entry, page); + else + unuse_process(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *mm; + + spin_lock(&mmlist_lock); + while (*swap_map > 1 && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + swcount = *swap_map; + if (mm == &init_mm) { + set_start_mm = 1; + shmem_unuse(entry, page); + } else + unuse_process(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + new_start_mm = mm; + set_start_mm = 0; + } + } + atomic_inc(&new_start_mm->mm_users); + spin_unlock(&mmlist_lock); + mmput(start_mm); + start_mm = new_start_mm; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_list_lock(); + swap_device_lock(si); + nr_swap_pages++; + *swap_map = 1; + swap_device_unlock(si); + swap_list_unlock(); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_swap_out could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * Note shmem_unuse already deleted its from swap cache. + */ + swcount = *swap_map; + if ((swcount > 0) != PageSwapCache(page)) + BUG(); + if ((swcount > 1) && PageDirty(page)) { + rw_swap_page(WRITE, page); + lock_page(page); + } + if (PageSwapCache(page)) + delete_from_swap_cache(page); + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so try_to_swap_out will preserve it. + */ + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. Interruptible check on + * signal_pending() would be nice, but changes the spec? 
+ */ + if (current->need_resched) + schedule(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +asmlinkage long sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct nameidata nd; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = user_path_walk(specialfile, &nd); + if (err) + goto out; + + lock_kernel(); + prev = -1; + swap_list_lock(); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + if (p->swap_file == nd.dentry) + break; + } + prev = type; + } + err = -EINVAL; + if (type < 0) { + swap_list_unlock(); + goto out_dput; + } + + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags = SWP_USED; + swap_list_unlock(); + unlock_kernel(); + err = try_to_unuse(type); + lock_kernel(); + if (err) { + /* re-insert swap space back into swap_list */ + swap_list_lock(); + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags = SWP_WRITEOK; + swap_list_unlock(); + goto out_dput; + } + if (p->swap_device) + blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); + path_release(&nd); + + swap_list_lock(); + swap_device_lock(p); + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_vfsmnt = NULL; + p->swap_file = NULL; + p->swap_device = 0; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + swap_device_unlock(p); + swap_list_unlock(); + vfree(swap_map); + err = 0; + +out_dput: + unlock_kernel(); + path_release(&nd); +out: + return err; +} + +int get_swaparea_info(char *buf) +{ + char * page = (char *) __get_free_page(GFP_KERNEL); + struct swap_info_struct *ptr = swap_info; + int i, j, len = 0, usedswap; + + if (!page) + return -ENOMEM; + + len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if ((ptr->flags & SWP_USED) && ptr->swap_map) { + char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, + page, PAGE_SIZE); + + len += sprintf(buf + len, "%-31s ", path); + + if (!ptr->swap_device) + len += sprintf(buf + len, "file\t\t"); + else + len += sprintf(buf + len, "partition\t"); + + usedswap = 0; + for (j = 0; j < ptr->max; ++j) + switch (ptr->swap_map[j]) { + case SWAP_MAP_BAD: + case 0: + continue; + default: + usedswap++; + } + len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), + usedswap << (PAGE_SHIFT - 10), ptr->prio); + } + } + free_page((unsigned long) page); + return len; +} + +int is_swap_partition(kdev_t dev) { + struct swap_info_struct *ptr = swap_info; + int i; + + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if (ptr->flags & SWP_USED) + if (ptr->swap_device == dev) + return 1; + } + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
+ * + * The swapon system call + */ +asmlinkage long sys_swapon(const char * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + struct nameidata nd; + struct inode * swap_inode; + unsigned int type; + int i, j, prev; + int error; + static int least_priority = 0; + union swap_header *swap_header = 0; + int swap_header_version; + int nr_good_pages = 0; + unsigned long maxpages = 1; + int swapfilesize; + struct block_device *bdev = NULL; + unsigned short *swap_map; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + lock_kernel(); + swap_list_lock(); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + if (type >= MAX_SWAPFILES) { + swap_list_unlock(); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->sdev_lock = SPIN_LOCK_UNLOCKED; + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + swap_list_unlock(); + error = user_path_walk(specialfile, &nd); + if (error) + goto bad_swap_2; + + p->swap_file = nd.dentry; + p->swap_vfsmnt = nd.mnt; + swap_inode = nd.dentry->d_inode; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + kdev_t dev = swap_inode->i_rdev; + struct block_device_operations *bdops; + + p->swap_device = dev; + set_blocksize(dev, PAGE_SIZE); + + bd_acquire(swap_inode); + bdev = swap_inode->i_bdev; + bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode)); + if (bdops) bdev->bd_op = bdops; + + error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); + if (error) + goto bad_swap_2; + set_blocksize(dev, PAGE_SIZE); + error = -ENODEV; + if (!dev || (blk_size[MAJOR(dev)] && + !blk_size[MAJOR(dev)][MINOR(dev)])) + goto bad_swap; + swapfilesize = 0; + if (blk_size[MAJOR(dev)]) + swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] + >> (PAGE_SHIFT - 10); + } else if (S_ISREG(swap_inode->i_mode)) + swapfilesize = swap_inode->i_size >> PAGE_SHIFT; + else + goto bad_swap; + + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + struct swap_info_struct *q = &swap_info[i]; + if (i == type || !q->swap_file) + continue; + if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + goto bad_swap; + } + + swap_header = (void *) __get_free_page(GFP_USER); + if (!swap_header) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + + lock_page(virt_to_page(swap_header)); + rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + memset(((char *) swap_header)+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,(char *) swap_header)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + maxpages = i+1; + j++; + } + } + nr_good_pages = j; + p->swap_map = vmalloc(maxpages * sizeof(short)); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < maxpages ; i++) { + if (test_bit(i,(char 
*) swap_header))
+ p->swap_map[i] = 0;
+ else
+ p->swap_map[i] = SWAP_MAP_BAD;
+ }
+ break;
+
+ case 2:
+ /* Check the swap header's sub-version and the size of
+ the swap file and bad block lists */
+ if (swap_header->info.version != 1) {
+ printk(KERN_WARNING
+ "Unable to handle swap header version %d\n",
+ swap_header->info.version);
+ error = -EINVAL;
+ goto bad_swap;
+ }
+
+ p->lowest_bit = 1;
+ maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
+ if (maxpages > swap_header->info.last_page)
+ maxpages = swap_header->info.last_page;
+ p->highest_bit = maxpages - 1;
+
+ error = -EINVAL;
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ goto bad_swap;
+
+ /* OK, set up the swap map and apply the bad block list */
+ if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+
+ error = 0;
+ memset(p->swap_map, 0, maxpages * sizeof(short));
+ for (i=0; i<swap_header->info.nr_badpages; i++) {
+ int page = swap_header->info.badpages[i];
+ if (page <= 0 || page >= swap_header->info.last_page)
+ error = -EINVAL;
+ else
+ p->swap_map[page] = SWAP_MAP_BAD;
+ }
+ nr_good_pages = swap_header->info.last_page -
+ swap_header->info.nr_badpages -
+ 1 /* header page */;
+ if (error)
+ goto bad_swap;
+ }
+
+ if (swapfilesize && maxpages > swapfilesize) {
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ if (!nr_good_pages) {
+ printk(KERN_WARNING "Empty swap-file\n");
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ p->swap_map[0] = SWAP_MAP_BAD;
+ swap_list_lock();
+ swap_device_lock(p);
+ p->max = maxpages;
+ p->flags = SWP_WRITEOK;
+ p->pages = nr_good_pages;
+ nr_swap_pages += nr_good_pages;
+ total_swap_pages += nr_good_pages;
+ printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
+ nr_good_pages<<(PAGE_SHIFT-10), p->prio);
+
+ /* insert swap space into swap_list: */
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+ if (p->prio >= swap_info[i].prio) {
+ break;
+ }
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0) {
+ swap_list.head = swap_list.next = p - swap_info;
+ } else {
+ swap_info[prev].next = p - swap_info;
+ }
+ swap_device_unlock(p);
+ swap_list_unlock();
+ error = 0;
+ goto out;
+bad_swap:
+ if (bdev)
+ blkdev_put(bdev, BDEV_SWAP);
+bad_swap_2:
+ swap_list_lock();
+ swap_map = p->swap_map;
+ nd.mnt = p->swap_vfsmnt;
+ nd.dentry = p->swap_file;
+ p->swap_device = 0;
+ p->swap_file = NULL;
+ p->swap_vfsmnt = NULL;
+ p->swap_map = NULL;
+ p->flags = 0;
+ if (!(swap_flags & SWAP_FLAG_PREFER))
+ ++least_priority;
+ swap_list_unlock();
+ if (swap_map)
+ vfree(swap_map);
+ path_release(&nd);
+out:
+ if (swap_header)
+ free_page((long) swap_header);
+ unlock_kernel();
+ return error;
+}
+
+void si_swapinfo(struct sysinfo *val)
+{
+ unsigned int i;
+ unsigned long nr_to_be_unused = 0;
+
+ swap_list_lock();
+ for (i = 0; i < nr_swapfiles; i++) {
+ unsigned int j;
+ if (swap_info[i].flags != SWP_USED)
+ continue;
+ for (j = 0; j < swap_info[i].max; ++j) {
+ switch (swap_info[i].swap_map[j]) {
+ case 0:
+ case SWAP_MAP_BAD:
+ continue;
+ default:
+ nr_to_be_unused++;
+ }
+ }
+ }
+ val->freeswap = nr_swap_pages + nr_to_be_unused;
+ val->totalswap = total_swap_pages + nr_to_be_unused;
+ swap_list_unlock();
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
+ * "permanent", but will be reclaimed by the next swapoff.
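+ *
+ * (A typical caller pattern - roughly what copy_page_range() does
+ * when fork() meets a swap pte - looks like:
+ *
+ *	swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ *	swap_duplicate(entry);
+ *	set_pte(dst_pte, *src_pte);
+ *
+ * src_pte and dst_pte are illustrative names here, not code from
+ * this patch.)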
+ */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + + swap_device_lock(p); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + swap_device_unlock(p); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +/* + * Page lock needs to be held in all cases to prevent races with + * swap file deletion. + */ +int swap_count(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + swp_entry_t entry; + int retval = 0; + + entry.val = page->index; + if (!entry.val) + goto bad_entry; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +out: + return retval; + +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_unused: + printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val); + goto out; +} + +/* + * Prior swap_duplicate protects against swap device deletion. + */ +void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, + kdev_t *dev, struct inode **swapf) +{ + unsigned long type; + struct swap_info_struct *p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); + return; + } + + p = &swap_info[type]; + *offset = SWP_OFFSET(entry); + if (*offset >= p->max && *offset != 0) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); + return; + } + if (p->swap_map && !p->swap_map[*offset]) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); + return; + } + if (!(p->flags & SWP_USED)) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); + return; + } + + if (p->swap_device) { + *dev = p->swap_device; + } else if (p->swap_file) { + *swapf = p->swap_file->d_inode; + } else { + printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + } + return; +} + +/* + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. 
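+ * (Worked example: with page_cluster == 4, a fault on swap offset
+ * 53 gives toff = 48 and a window of 1 << 4 = 16 slots, so offsets
+ * 48-63 are offered for readahead, stopping early at a free slot,
+ * a bad slot, or the end of the swap area.)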
+ */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + int ret = 0, i = 1 << page_cluster; + unsigned long toff; + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + if (!page_cluster) /* no readahead */ + return 0; + toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; + + swap_device_lock(swapdev); + do { + /* Don't read-ahead past the end of the swap area */ + if (toff >= swapdev->max) + break; + /* Don't read in free or bad pages */ + if (!swapdev->swap_map[toff]) + break; + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + break; + toff++; + ret++; + } while (--i); + swap_device_unlock(swapdev); + return ret; +} diff -urN linux-2.4.17-rc1-virgin/mm/vmalloc.c linux-2.4.17-rc1-wli3/mm/vmalloc.c --- linux-2.4.17-rc1-virgin/mm/vmalloc.c Fri Dec 14 06:04:17 2001 +++ linux-2.4.17-rc1-wli3/mm/vmalloc.c Mon Sep 17 13:16:31 2001 @@ -6,7 +6,6 @@ * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 */ -#include #include #include #include @@ -274,43 +273,6 @@ if (count == 0) goto finished; *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} - -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; buf++; addr++; count--; diff -urN linux-2.4.17-rc1-virgin/mm/vmscan.c linux-2.4.17-rc1-wli3/mm/vmscan.c --- linux-2.4.17-rc1-virgin/mm/vmscan.c Sat Nov 17 19:18:17 2001 +++ linux-2.4.17-rc1-wli3/mm/vmscan.c Fri Dec 14 02:44:20 2001 @@ -32,349 +32,267 @@ */ #define DEF_PRIORITY (6) -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ +int vm_static_inactive_target; -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +static inline void age_page_up(struct page *page) { - pte_t pte; - swp_entry_t entry; + page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); +} - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } +static inline void age_page_down(struct page *page) +{ + page->age -= min(PAGE_AGE_DECL, (int)page->age); +} - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; +/* + * Estimate whether a zone has enough inactive or free pages.. + */ +static unsigned int zone_inactive_plenty(zone_t *zone) +{ + unsigned int inactive; - /* Don't bother replenishing zones not under pressure.. 
*/ - if (!memclass(page->zone, classzone)) + if (!zone->size) return 0; + + inactive = zone->inactive_dirty_pages; + inactive += zone->inactive_clean_pages; + inactive += zone->free_pages; - if (TryLockPage(page)) - return 0; + return (inactive > (zone->size * 2 / 5)); +} - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); +#define FREE_PLENTY_FACTOR 4 +static unsigned int zone_free_plenty(zone_t *zone) +{ + unsigned int free, target; - if (pte_dirty(pte)) - set_page_dirty(page); + target = max((int) zone->pages_high, zone->need_balance); - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - UnlockPage(page); - { - int freeable = page_count(page) - !!page->buffers <= 2; - page_cache_release(page); - return freeable; - } - } + free = zone->free_pages; + free += zone->inactive_clean_pages; - /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. - * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; + return free > target * FREE_PLENTY_FACTOR; +} - /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. - */ - if (page->buffers) - goto preserve; +static unsigned int free_plenty(void) +{ + unsigned int free; - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - if (add_to_swap_cache(page, entry) == 0) { - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; - } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } + free = nr_free_pages(); + free += nr_inactive_clean_pages; - /* No swap space left */ -preserve: - set_pte(page_table, pte); - UnlockPage(page); - return 0; + return free > freepages.high * FREE_PLENTY_FACTOR; } -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) +static inline int page_mapping_inuse(struct page * page) { - pte_t * pte; - unsigned long pmd_end; + struct address_space * mapping = page->mapping; - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* Page is in somebody's page tables. */ + if (page->pte_chain) + return 1; - do { - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - - if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - break; - } - } - } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - mm->swap_address = address; - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; - - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; - } + /* XXX: does this happen ? */ + if (!mapping) + return 0; - pmd = pmd_offset(dir, address); + /* File is mmaped by somebody. */ + if (mapping->i_mmap || mapping->i_mmap_shared) + return 1; - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) -{ - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; - - pgdir = pgd_offset(mm, address); - - end = vma->vm_end; - if (address >= end) - BUG(); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; + return 0; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; - -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +struct page * reclaim_page(zone_t * zone) { - unsigned long address; - struct vm_area_struct* vma; + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * Find the proper vm-area after freezing the vma chain - * and ptes. 
+ * We need to hold the pagecache_lock around all tests to make sure + * reclaim_page() cannot race with find_get_page() and friends. */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; - } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + maxscan = zone->inactive_clean_pages; + while ((page_lru = zone->inactive_clean_list.prev) != + &zone->inactive_clean_list && maxscan--) { + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageInactiveClean(page))) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + page->zone->inactive_clean_pages--; + continue; } - } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} + /* Page is being freed */ + if (unlikely(page_count(page)) == 0) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + continue; + } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; + /* Page cannot be reclaimed ? Move to inactive_dirty list. */ + if (unlikely(page->pte_chain || page->buffers || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TryLockPage(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + continue; + } - counter = mmlist_nr; - do { - if (unlikely(current->need_resched)) { - __set_current_state(TASK_RUNNING); - schedule(); + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; } - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); + /* We should never ever get here. 
*/ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + UnlockPage(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + return NULL; - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); +found_page: + del_page_from_inactive_clean_list(page); + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + if (entry.val) + swap_free(entry); + UnlockPage(page); + page->age = PAGE_AGE_START; + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; +} - mmput(mm); +static inline int page_dirty(struct page *page) +{ + struct buffer_head *tmp, *bh; - if (!nr_pages) - return 1; - } while (--counter >= 0); + if (PageDirty(page)) + return 1; - return 0; + if (page->mapping && !page->buffers) + return 0; + + tmp = bh = page->buffers; + + do { + if (tmp->b_state & ((1<b_this_page; + } while (tmp != bh); -empty: - spin_unlock(&mmlist_lock); return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * @sync: are we allowed to do synchronous IO in emergencies ? + * + * This function is called when we are low on free / inactive_clean + * pages, its purpose is to refill the free/clean list as efficiently + * as possible. + * + * This means we do writes asynchronously as long as possible and will + * only sleep on IO when we don't have another option. Since writeouts + * cause disk seeks and make read IO slower, we skip writes alltogether + * when the amount of dirty pages is small. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon. + */ +#define CAN_DO_FS ((gfp_mask & __GFP_FS) && should_write) +#define WRITE_LOW_WATER 5 +#define WRITE_HIGH_WATER 10 +int page_launder(int gfp_mask) { + int maxscan, cleaned_pages; struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = nr_pages << (9 - priority); + cleaned_pages = 0; + + /* The main launder loop. */ spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + maxscan = nr_inactive_dirty_pages; + while (--maxscan >= 0 && (entry = inactive_dirty_list.prev) != &inactive_dirty_list) { struct page * page; - if (unlikely(current->need_resched)) { - spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); - schedule(); - spin_lock(&pagemap_lru_lock); - continue; - } - page = list_entry(entry, struct page, lru); - if (unlikely(!PageLRU(page))) - BUG(); - if (unlikely(PageActive(page))) - BUG(); - list_del(entry); - list_add(entry, &inactive_list); + list_add(entry, &inactive_dirty_list); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveDirty(page)) { + printk("VM: page_launder, wrong page on list.\n"); + list_del(entry); + nr_inactive_dirty_pages--; + page->zone->inactive_dirty_pages--; + continue; + } /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. 
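+ * (page_referenced() folds in the referenced bits collected from
+ * the ptes on the page's pte_chain as well as PG_referenced, so a
+ * page touched through any of its mappings is promoted here.)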
*/ - if (unlikely(!page_count(page))) + if ((page_referenced(page) || page->age) && + page_mapping_inuse(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + page->age = max((int)page->age, PAGE_AGE_START); continue; + } - if (!memclass(page->zone, classzone)) + /* + * The page is still in the page tables of some process, + * move it to the active list but leave page age at 0; + * either swap_out() will make it freeable soon or it is + * mlock()ed... + * + * The !PageLocked() test is to protect us from ourselves, + * see the code around the writepage() call. + */ + if ((page_count(page) > (1 + !!page->buffers)) && + !PageLocked(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); continue; + } - /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; + /* + * If this zone has plenty of pages free, don't spend time + * on cleaning it but only move clean pages out of the way + * so we won't have to scan those again. + */ + if (zone_free_plenty(page->zone) || page_count(page) == 0) { + continue; + } /* * The page is locked. IO in progress? @@ -391,12 +309,49 @@ continue; } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte_chain && !page->mapping && !page->buffers) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + if (!add_to_swap(page)) { + activate_page(page); + UnlockPage(page); + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + continue; + } + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + } + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page->pte_chain) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + UnlockPage(page); + continue; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page) && page->mapping) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer * like O_DIRECT would set the PG_dirty bitflag - * on the phisical page after having successfully + * on the physical page after having successfully * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. */ @@ -425,7 +380,7 @@ if (page->buffers) { spin_unlock(&pagemap_lru_lock); - /* avoid to free a locked page */ + /* To avoid freeing our page before we're done. */ page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { @@ -443,14 +398,14 @@ /* effectively free the page here */ page_cache_release(page); - if (--nr_pages) - continue; - break; + cleaned_pages++; + continue; } else { /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. + * We freed the buffers but may have + * slept; undo the stuff we did before + * try_to_release_page and fall through + * to the next step. */ page_cache_release(page); @@ -466,224 +421,279 @@ } } - spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * If the page is really freeable now, move it to the + * inactive_clean list. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. 
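+ * The conditions below mirror the ones reclaim_page() checks:
+ * still in the page cache, not dirty, no pte_chain users and
+ * only the cache's reference left.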
+ * This test is not safe from races, but only the one + * in reclaim_page() needs to be. */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); + if (page->mapping && !PageDirty(page) && !page->pte_chain && + page_count(page) == 1) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; - + cleaned_pages++; + } else { /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it to + * the active list. */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } - - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. - */ - if (PageDirty(page)) { - spin_unlock(&pagecache_lock); +page_active: + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); UnlockPage(page); - continue; } - - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); - } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); - swap_free(swap); - } - - __lru_cache_del(page); - UnlockPage(page); - - /* effectively free the page here */ - page_cache_release(page); - - if (--nr_pages) - continue; - break; } spin_unlock(&pagemap_lru_lock); - return nr_pages; + /* Return the number of pages moved to the inactive_clean list. */ + return cleaned_pages; } -/* - * This moves pages from the active list to - * the inactive list. +/** + * refill_inactive - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan * - * We move them the other way when we see the - * reference bit on the page. + * This function will scan a portion of the active list to find + * unused pages, those pages will then be moved to the inactive list. */ -static void refill_inactive(int nr_pages) +int refill_inactive(int priority) { - struct list_head * entry; + struct list_head * page_lru; + struct page * page; + int maxscan = nr_active_pages >> priority; + int nr_deactivated = 0; + /* Take the lock while messing with the list... */ spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { - struct page * page; + while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { + page = list_entry(page_lru, struct page, lru); - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (PageTestandClearReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageActive(page))) { + printk("VM: refill_inactive, wrong page on list.\n"); + list_del(page_lru); + nr_active_pages--; continue; } - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* + * Do aging on the pages. Every time a page is referenced, + * page->age gets incremented. If it wasn't referenced, we + * decrement page->age. 
The page gets moved to the inactive + * list when one of the following is true: + * - the page age reaches 0 + * - the object the page belongs to isn't in active use + * - the object the page belongs to is hogging the cache + */ + if (PageTestandClearReferenced(page)) { + age_page_up(page); + } else { + age_page_down(page); + } + + /* + * Don't deactivate pages from zones which have + * plenty inactive pages. + */ + if (unlikely(zone_inactive_plenty(page->zone) && + zone_free_plenty(page->zone))) { + goto skip_page; + } + + /* + * If the page age is 'hot' AND the object the page + * is in is still in use, we keep the page. Otherwise + * we move it to the inactive_dirty list. + */ + if (page->age && page_mapping_inuse(page)) { +skip_page: + list_del(page_lru); + list_add(page_lru, &active_list); + } else { + deactivate_page_nolock(page); + nr_deactivated++; + } + + /* Low latency reschedule point. */ + if (unlikely(current->need_resched)) { + spin_unlock(&pagemap_lru_lock); + __set_current_state(TASK_RUNNING); + schedule(); + if (!inactive_shortage()) + return 1; + spin_lock(&pagemap_lru_lock); + } } spin_unlock(&pagemap_lru_lock); + + return nr_deactivated; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/* + * Check if there are zones with a severe shortage of free pages, + * or if all zones have a minor shortage. + */ +int free_shortage(void) { - int chunk_size = nr_pages; - unsigned long ratio; + pg_data_t *pgdat; + unsigned int global_free = 0; + unsigned int global_target = freepages.high; - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + /* Are we low on free pages anywhere? */ + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones+ i; + unsigned int free; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + if (!zone->size) + continue; - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; + free = zone->free_pages; + free += zone->inactive_clean_pages; - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + /* Local shortage? */ + if (free < zone->pages_low) + return 1; - return nr_pages; + global_free += free; + } + pgdat = pgdat->node_next; + } while (pgdat); + + /* Global shortage? */ + return global_free < global_target; } -int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) +static inline unsigned int inactive_target(void) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + unsigned int mem; - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + mem = nr_active_pages; + mem += nr_inactive_dirty_pages; + mem += nr_inactive_clean_pages; - /* - * Hmm.. Cache shrink failed - time to kill something? - * Mhwahahhaha! This is the part I really like. Giggle. - */ - out_of_memory(); - return 0; + return mem / 4; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) +/* + * Are we low on inactive pages globally or in any zone? 
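+ * (The global target is freepages.high plus inactive_target(),
+ * i.e. roughly a quarter of all pages currently on the LRU lists;
+ * each zone is also checked against max(pages_high, need_balance).)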
+ */ +int inactive_shortage(void) { - zone_t * first_classzone; + pg_data_t *pgdat; + unsigned int global_target = freepages.high + inactive_target(); + unsigned int global_inactive = 0; - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; -} + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + unsigned int inactive, target; -static int kswapd_balance_pgdat(pg_data_t * pgdat) -{ - int need_more_balance = 0, i; - zone_t * zone; + if (!zone->size) + continue; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (unlikely(current->need_resched)) - schedule(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - continue; + inactive = zone->inactive_dirty_pages; + inactive += zone->inactive_clean_pages; + inactive += zone->free_pages; + + target = max((int) zone->pages_high, zone->need_balance); + /* Local shortage? */ + if (inactive < target) + return 1; + + global_inactive += inactive; } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } + pgdat = pgdat->node_next; + } while (pgdat); - return need_more_balance; + /* Global shortage? */ + return global_inactive < global_target; } -static void kswapd_balance(void) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) { - int need_more_balance; - pg_data_t * pgdat; + int ret = 0; - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); - } while (need_more_balance); -} + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. + */ + ret += page_launder(gfp_mask); + shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + shrink_icache_memory(1, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - zone_t * zone; - int i; + /* + * If needed, we move pages from the active list + * to the inactive list. + */ + if (inactive_shortage() || free_shortage()) + ret += refill_inactive(0); - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; - } + /* + * Reclaim unused slab cache memory. + */ + kmem_cache_reap(gfp_mask); - return 1; + /* + * Hmm.. Cache shrink failed - time to kill something? + * Mhwahahhaha! This is the part I really like. Giggle. + */ + if (!ret) + out_of_memory(); + + return ret; } -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - pgdat = pgdat_list; +/* + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. + * + * We refill the freelist in a bump from pages_min to pages_low + * in order to give the buddy allocator something to play with. 
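+ * (Only zones that have dropped below pages_min are refilled, one
+ * reclaim_page() call at a time, until they are back at pages_low
+ * or their inactive_clean list runs dry.)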
+ */ +static void refill_freelist(void) +{ + pg_data_t * pgdat = pgdat_list; + int i; do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + if (!zone->size || zone->free_pages >= zone->pages_min) + continue; - return 1; + while (zone->free_pages < zone->pages_low) { + struct page * page; + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } + } + pgdat = pgdat->node_next; + } while (pgdat); } /* @@ -702,7 +712,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -726,24 +735,65 @@ * Kswapd main loop. */ for (;;) { - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + static long recalc = 0; - mb(); - if (kswapd_can_sleep()) - schedule(); + /* + * We try to rebalance the VM either when we are short + * on free pages or when we have a shortage of inactive + * pages and are getting low on free pages. + */ + if (free_shortage() || (inactive_shortage() && !free_plenty())) + do_try_to_free_pages(GFP_KSWAPD); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + refill_freelist(); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + refill_inactive(DEF_PRIORITY); + } + + /* + * We go to sleep if either the free page shortage + * or the inactive page shortage is gone. We do this + * because: + * 1) we need no more free pages or + * 2) the inactive pages need to be flushed to disk, + * it wouldn't help to eat CPU time now ... + * + * We go to sleep for one second, but if it's needed + * we'll be woken up earlier... */ - kswapd_balance(); - run_task_queue(&tq_disk); + if (!free_shortage() || !inactive_shortage()) { + interruptible_sleep_on_timeout(&kswapd_wait, HZ); + } + } +} + +void wakeup_kswapd(void) +{ + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); +} + +/* + * Called by non-kswapd processes when they want more + * memory but are unable to sleep on kswapd because + * they might be holding some IO locks ... + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + + return ret; } static int __init kswapd_init(void) diff -urN linux-2.4.17-rc1-virgin/net/socket.c linux-2.4.17-rc1-wli3/net/socket.c --- linux-2.4.17-rc1-virgin/net/socket.c Fri Dec 14 06:04:18 2001 +++ linux-2.4.17-rc1-wli3/net/socket.c Fri Dec 14 02:44:44 2001 @@ -133,7 +133,7 @@ static struct net_proto_family *net_families[NPROTO]; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t net_family_lockct = ATOMIC_INIT(0); static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED;