diff -urN linux-2.4.17-rc1-virgin/CREDITS linux-2.4.17-rc1-wli3/CREDITS --- linux-2.4.17-rc1-virgin/CREDITS Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/CREDITS Fri Dec 14 02:44:44 2001 @@ -971,8 +971,8 @@ N: Nigel Gamble E: nigel@nrg.org -E: nigel@sgi.com D: Interrupt-driven printer driver +D: Preemptible kernel S: 120 Alley Way S: Mountain View, California 94040 S: USA diff -urN linux-2.4.17-rc1-virgin/Changelog-wli linux-2.4.17-rc1-wli3/Changelog-wli --- linux-2.4.17-rc1-virgin/Changelog-wli Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/Changelog-wli Mon Dec 17 00:03:54 2001 @@ -0,0 +1,36 @@ +Changelog for 2.4.17-rc1-wli3 +---------------------------------------------------------------------- +(1) in FNV change shift/add to multiply (William Irwin) +(2) inode hash function like Lever pagecache (William Irwin) +(3) attribution on comment in pagecache hash function (William Irwin) +(4) lock breaking patch, minus vmscan.c (Robert Love) +(5) back out conditional_schedule in wait_for_buffers (William Irwin) +(6) reverted to Lever dcache but shifting D_HASHBITS (William Irwin) +(7) shifting for high-order bits in UID hash (William Irwin) +(8) shifting for high-order bits in PID hash (William Irwin) +(9) removed comment about inode.c quadratic hashing (William Irwin) + +Changelog for 2.4.17-rc1-wli2 +---------------------------------------------------------------------- +(1) switch dcache to Mersenne hash (William Irwin) +(2) convert partial_name_hash() to FNV (William Irwin) +(3) back off HZ from 600 to 256 (William Irwin) + +Changelog for 2.4.17-rc1-wli1 +---------------------------------------------------------------------- +(1) reverse-mapping VM (Rik van Riel) +(2) preemptive kernel (Robert Love) +(3) realtime scheduler that scans less (George Anziger) +(4) page cache hash function (Chuck Lever) +(5) pidhash hash function (William Irwin) +(6) dentry cache hash function (Chuck Lever) +(7) check for priority == 0 in shrink_dcache_memory() (William Irwin) +(8) buffer cache hash function (Chuck Lever) +(9) uid hash function (William Irwin) +(10) inode hash function restored to Lever paper form (Chuck Lever) +(11) removal of statm_pgd_range() (William Irwin) +(12) elevator read starvation prevention (Andrew Morton) + +revert before distribution: +(1) bootmem rewrite +(2) timeslice change (HZ in asm-i386/param.h) diff -urN linux-2.4.17-rc1-virgin/Documentation/Configure.help linux-2.4.17-rc1-wli3/Documentation/Configure.help --- linux-2.4.17-rc1-virgin/Documentation/Configure.help Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/Documentation/Configure.help Sun Dec 16 17:58:10 2001 @@ -266,6 +266,31 @@ If you have a system with several CPUs, you do not need to say Y here: the local APIC will be used automatically. +Preemptible Kernel +CONFIG_PREEMPT + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load due to other, lower priority, processes. + + Say Y here if you are building a kernel for a desktop system, embedded + system or real-time system. Say N if you are building a kernel for a + system where throughput is more important than interactive response, + such as a server system. Say N if you are unsure. + +Break Selected Locks +CONFIG_LOCK_BREAK + This option will break certain locks in high-latency regions + throughout the kernel. 
It is intended for use in conjunction with + the preemptible kernel (CONFIG_PREEMPT). Since in-kernel preemption + cannot occur while locks are held, temporarily releasing and then + reacquiring long-held locks will further improve system response. + + Say Y if you are compiling for a system with strict latency + requirements such as an embedded, real-time, or audio processing + system. Say N otherwise. + Kernel math emulation CONFIG_MATH_EMULATION Linux can emulate a math coprocessor (used for floating point @@ -289,6 +314,28 @@ If you are not sure, say Y; apart from resulting in a 66 KB bigger kernel, it won't hurt. + +Real Time Scheduler +CONFIG_RTSCHED + + This option replaces the standard linux scheduler with a real time + scheduler. The real time scheduler provides load independent fast + context switch times for real time tasks whereas the standard linux + scheduler slows down with increasing load (i.e. more tasks ready to + run). For non-real time tasks both schedulers' context switch times are + load dependent. The real time scheduler also provides a configure + option for real time priorities ranging from 1 to a max of 2047 while + the standard scheduler's real time priorities range from 1-99. + Real time tasks are tasks that have a scheduling policy of SCHED_FIFO + or SCHED_RR. Scheduling policy is set by the sched_setscheduler(2) + system call and is inherited through fork and thread creation. + +Maximum Priority? +CONFIG_MAX_PRI + This option lets you set the number of priorities available to real time + tasks. Priorities 1 through maximum priority are real time tasks. The + default here is 127. The system will quietly change anything less than + 99 to 99 and anything greater than 2047 to 2047. Timer and CPU usage LEDs CONFIG_LEDS diff -urN linux-2.4.17-rc1-virgin/Documentation/preempt-locking.txt linux-2.4.17-rc1-wli3/Documentation/preempt-locking.txt --- linux-2.4.17-rc1-virgin/Documentation/preempt-locking.txt Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/Documentation/preempt-locking.txt Fri Dec 14 02:44:44 2001 @@ -0,0 +1,94 @@ + Proper Locking Under a Preemptible Kernel: + Keeping Kernel Code Preempt-Safe + Robert Love + Last Updated: 21 Oct 2001 + + +INTRODUCTION + + +A preemptible kernel creates new locking issues. The issues are the same as +those under SMP: concurrency and reentrancy. Thankfully, the Linux preemptible +kernel model leverages existing SMP locking mechanisms. Thus, the kernel +requires explicit additional locking for very few additional situations. + +This document is for all kernel hackers. Developing code in the kernel +requires protecting these situations. As you will see, these situations would +normally require a lock, were they not per-CPU. + + +RULE #1: Per-CPU data structures need explicit protection + + +Two similar problems arise. An example code snippet: + + struct this_needs_locking tux[NR_CPUS]; + tux[smp_processor_id()] = some_value; + /* task is preempted here... */ + something = tux[smp_processor_id()]; + +First, since the data is per-CPU, it may not have explicit SMP locking, but +require it otherwise. Second, when a preempted task is finally rescheduled, +the previous value of smp_processor_id may not equal the current. You must +protect these situations by disabling preemption around them. + + +RULE #2: CPU state must be protected. + + +Under preemption, the state of the CPU must be protected. This is arch- +dependent, but includes CPU structures and state not preserved over a context +switch.
For example, on x86, entering and exiting FPU mode is now a critical +section that must occur while preemption is disabled. Think what would happen +if the kernel is executing a floating-point instruction and is then preempted. +Remember, the kernel does not save FPU state except for user tasks. Therefore, +upon preemption, the FPU registers will be sold to the lowest bidder. Thus, +preemption must be disabled around such regions. + +Note, some FPU functions are already explicitly preempt safe. For example, +kernel_fpu_begin and kernel_fpu_end will disable and enable preemption. +However, math_state_restore must be called with preemption disabled. + + +SOLUTION + + +Data protection under preemption is achieved by disabling preemption for the +duration of the critical region. + +preempt_enable() decrement the preempt counter +preempt_disable() increment the preempt counter +preempt_enable_no_resched() decrement, but do not immediately preempt + +The functions are nestable. In other words, you can call preempt_disable +n-times in a code path, and preemption will not be reenabled until the n-th +call to preempt_enable. The preempt statements define to nothing if +preemption is not enabled. + +Note that you do not need to explicitly prevent preemption if you are holding +any locks or interrupts are disabled, since preemption is implicitly disabled +in those cases. + +Example: + + cpucache_t *cc; /* this is per-CPU */ + preempt_disable(); + cc = cc_data(searchp); + if (cc && cc->avail) { + __free_block(searchp, cc_entry(cc), cc->avail); + cc->avail = 0; + } + preempt_enable(); + return 0; + +Notice how the preemption statements must encompass every reference of the +critical variables. Another example: + + int buf[NR_CPUS]; + set_cpu_val(buf); + if (buf[smp_processor_id()] == -1) printk(KERN_INFO "wee!\n"); + spin_lock(&buf_lock); + /* ... */ + +This code is not preempt-safe, but see how easily we can fix it by simply +moving the spin_lock up two lines. diff -urN linux-2.4.17-rc1-virgin/Documentation/rtsched.txt linux-2.4.17-rc1-wli3/Documentation/rtsched.txt --- linux-2.4.17-rc1-virgin/Documentation/rtsched.txt Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/Documentation/rtsched.txt Fri Dec 14 04:38:23 2001 @@ -0,0 +1,28 @@ + + Real Time Scheduler for Linux + ============================= + +The Real Time scheduler patch gives you an option to choose to build a +kernel with MontaVista's real time scheduler in it. If you don't choose +to enable the real time scheduler the kernel will be built the same as +if you had not installed the patch. + +If you enable the real time scheduler, you may also choose a max +priority for real time tasks. The available range is 99 to 2047. +Values outside this range are quietly moved to fall in the range. + +In order to enable the real time scheduler you must use one of the +kernel configure tools to turn it on. The question appears in the +processor options section of the configuration. + +Currently the scheduler is supported on all UP and SMP machines. + +Warning: The Real Time scheduler does not honor the "allowed_cpus" +member of the task_struct, thus it will not honor any attempt to define +cpu affinity. The latest preemption patch uses cpu affinity to prevent +cpu switching during preemption. This will not work with this scheduler +and may cause failures in kernels using preemption. In addition TUX +is known to use cpu affinity. It is believed that TUX will run without +cpu affinity, but may have degraded performance.
It is also known that +some soft irq tasks may use cpu affinity to improve performance. These +tasks will still work, however, the affinity will not happen. diff -urN linux-2.4.17-rc1-virgin/MAINTAINERS linux-2.4.17-rc1-wli3/MAINTAINERS --- linux-2.4.17-rc1-virgin/MAINTAINERS Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/MAINTAINERS Fri Dec 14 02:44:44 2001 @@ -1242,6 +1242,14 @@ M: mostrows@styx.uwaterloo.ca S: Maintained +PREEMPTIBLE KERNEL +P: Robert M. Love +M: rml@tech9.net +L: linux-kernel@vger.kernel.org +L: kpreempt-tech@lists.sourceforge.net +W: http://tech9.net/rml/linux +S: Maintained + PROMISE DC4030 CACHING DISK CONTROLLER DRIVER P: Peter Denison M: promise@pnd-pc.demon.co.uk diff -urN linux-2.4.17-rc1-virgin/Makefile linux-2.4.17-rc1-wli3/Makefile --- linux-2.4.17-rc1-virgin/Makefile Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/Makefile Sun Dec 16 22:41:12 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 17 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc1-wli3 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN linux-2.4.17-rc1-virgin/arch/alpha/config.in linux-2.4.17-rc1-wli3/arch/alpha/config.in --- linux-2.4.17-rc1-virgin/arch/alpha/config.in Tue Nov 20 15:49:31 2001 +++ linux-2.4.17-rc1-wli3/arch/alpha/config.in Fri Dec 14 04:38:23 2001 @@ -216,6 +216,10 @@ then bool 'Symmetric multi-processing support' CONFIG_SMP fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi if [ "$CONFIG_SMP" = "y" ]; then define_bool CONFIG_HAVE_DEC_LOCK y diff -urN linux-2.4.17-rc1-virgin/arch/arm/config.in linux-2.4.17-rc1-wli3/arch/arm/config.in --- linux-2.4.17-rc1-virgin/arch/arm/config.in Fri Nov 9 13:58:02 2001 +++ linux-2.4.17-rc1-wli3/arch/arm/config.in Fri Dec 14 04:38:23 2001 @@ -329,6 +329,10 @@ else define_bool CONFIG_DISCONTIGMEM n fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu @@ -437,6 +441,7 @@ if [ "$CONFIG_CPU_32" = "y" -a "$CONFIG_ARCH_EBSA110" != "y" ]; then bool 'Kernel-mode alignment trap handler' CONFIG_ALIGNMENT_TRAP fi +dep_bool 'Preemptible Kernel (experimental)' CONFIG_PREEMPT $CONFIG_CPU_32 $CONFIG_EXPERIMENTAL endmenu source drivers/parport/Config.in diff -urN linux-2.4.17-rc1-virgin/arch/arm/kernel/entry-armv.S linux-2.4.17-rc1-wli3/arch/arm/kernel/entry-armv.S --- linux-2.4.17-rc1-virgin/arch/arm/kernel/entry-armv.S Thu Oct 25 13:53:45 2001 +++ linux-2.4.17-rc1-wli3/arch/arm/kernel/entry-armv.S Fri Dec 14 02:44:44 2001 @@ -672,6 +672,12 @@ add r4, sp, #S_SP mov r6, lr stmia r4, {r5, r6, r7, r8, r9} @ save sp_SVC, lr_SVC, pc, cpsr, old_ro +#ifdef CONFIG_PREEMPT + get_current_task r9 + ldr r8, [r9, #TSK_PREEMPT] + add r8, r8, #1 + str r8, [r9, #TSK_PREEMPT] +#endif 1: get_irqnr_and_base r0, r6, r5, lr movne r1, sp @ @@ -679,6 +685,25 @@ @ adrsvc ne, lr, 1b bne do_IRQ +#ifdef CONFIG_PREEMPT +2: ldr r8, [r9, #TSK_PREEMPT] + subs r8, r8, #1 + bne 3f + ldr r7, [r9, #TSK_NEED_RESCHED] + teq r7, #0 + beq 3f + ldr r6, .LCirqstat + ldr r0, [r6, #IRQSTAT_BH_COUNT] + teq r0, #0 + bne 3f + mov r0, #MODE_SVC + msr cpsr_c, r0 @ enable interrupts + bl SYMBOL_NAME(preempt_schedule) + mov r0, #I_BIT | MODE_SVC + msr cpsr_c, r0 @ disable interrupts + b 2b +3: str r8, [r9, #TSK_PREEMPT] +#endif ldr r0, [sp, #S_PSR] @ irqs are already disabled msr spsr, r0 ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr @@ -736,6 +761,9 @@ .LCprocfns: .word SYMBOL_NAME(processor) #endif .LCfp: .word SYMBOL_NAME(fp_enter) +#ifdef CONFIG_PREEMPT +.LCirqstat: .word SYMBOL_NAME(irq_stat) +#endif irq_prio_table @@ -775,6 +803,12 @@ stmdb r8, {sp, lr}^ alignment_trap r4, r7, __temp_irq zero_fp + get_current_task tsk +#ifdef CONFIG_PREEMPT + ldr r0, [tsk, #TSK_PREEMPT] + add r0, r0, #1 + str r0, [tsk, #TSK_PREEMPT] +#endif 1: get_irqnr_and_base r0, r6, r5, lr movne r1, sp adrsvc ne, lr, 1b @@ -782,8 +816,12 @@ @ routine called with r0 = irq number, r1 = struct pt_regs * @ bne do_IRQ +#ifdef CONFIG_PREEMPT + ldr r0, [tsk, #TSK_PREEMPT] + sub r0, r0, #1 + str r0, [tsk, #TSK_PREEMPT] +#endif mov why, #0 - get_current_task tsk b ret_to_user .align 5 diff -urN linux-2.4.17-rc1-virgin/arch/arm/tools/getconstants.c linux-2.4.17-rc1-wli3/arch/arm/tools/getconstants.c --- linux-2.4.17-rc1-virgin/arch/arm/tools/getconstants.c Thu Oct 11 09:04:57 2001 +++ linux-2.4.17-rc1-wli3/arch/arm/tools/getconstants.c Fri Dec 14 02:44:44 2001 @@ -13,6 +13,7 @@ #include #include +#include /* * Make sure that the compiler and target are compatible. @@ -38,6 +39,11 @@ DEFN("TSS_SAVE", OFF_TSK(thread.save)); DEFN("TSS_FPESAVE", OFF_TSK(thread.fpstate.soft.save)); + +#ifdef CONFIG_PREEMPT +DEFN("TSK_PREEMPT", OFF_TSK(preempt_count)); +DEFN("IRQSTAT_BH_COUNT", (unsigned long)&(((irq_cpustat_t *)0)->__local_bh_count)); +#endif #ifdef CONFIG_CPU_32 DEFN("TSS_DOMAIN", OFF_TSK(thread.domain)); diff -urN linux-2.4.17-rc1-virgin/arch/cris/config.in linux-2.4.17-rc1-wli3/arch/cris/config.in --- linux-2.4.17-rc1-virgin/arch/cris/config.in Mon Oct 15 13:42:14 2001 +++ linux-2.4.17-rc1-wli3/arch/cris/config.in Fri Dec 14 04:38:23 2001 @@ -11,6 +11,10 @@ mainmenu_option next_comment comment 'Code maturity level options' bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/i386/config.in linux-2.4.17-rc1-wli3/arch/i386/config.in --- linux-2.4.17-rc1-virgin/arch/i386/config.in Fri Dec 14 06:04:00 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/config.in Sun Dec 16 17:58:10 2001 @@ -176,6 +176,10 @@ bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Preemptible Kernel' CONFIG_PREEMPT +if [ "$CONFIG_PREEMPT" = "y" ]; then + bool 'Break selected locks' CONFIG_LOCK_BREAK +fi if [ "$CONFIG_SMP" != "y" ]; then bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC @@ -188,10 +192,17 @@ else bool 'Multiquad NUMA system' CONFIG_MULTIQUAD fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi -if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then - define_bool CONFIG_HAVE_DEC_LOCK y +if [ "$CONFIG_SMP" = "y" -o "$CONFIG_PREEMPT" = "y" ]; then + if [ "$CONFIG_X86_CMPXCHG" = "y" ]; then + define_bool CONFIG_HAVE_DEC_LOCK y + fi fi + endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/i386/kernel/entry.S linux-2.4.17-rc1-wli3/arch/i386/kernel/entry.S --- linux-2.4.17-rc1-virgin/arch/i386/kernel/entry.S Fri Nov 2 17:18:49 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/kernel/entry.S Fri Dec 14 02:44:44 2001 @@ -71,7 +71,7 @@ * these are offsets into the task-struct. */ state = 0 -flags = 4 +preempt_count = 4 sigpending = 8 addr_limit = 12 exec_domain = 16 @@ -79,8 +79,28 @@ tsk_ptrace = 24 processor = 52 + /* These are offsets into the irq_stat structure + * There is one per cpu and it is aligned to 32 + * byte boundry (we put that here as a shift count) + */ +irq_array_shift = CONFIG_X86_L1_CACHE_SHIFT + +irq_stat_local_irq_count = 4 +irq_stat_local_bh_count = 8 + ENOSYS = 38 +#ifdef CONFIG_SMP +#define GET_CPU_INDX movl processor(%ebx),%eax; \ + shll $irq_array_shift,%eax +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx); \ + GET_CPU_INDX +#define CPU_INDX (,%eax) +#else +#define GET_CPU_INDX +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx) +#define CPU_INDX +#endif #define SAVE_ALL \ cld; \ @@ -247,12 +267,30 @@ ALIGN ENTRY(ret_from_intr) GET_CURRENT(%ebx) +#ifdef CONFIG_PREEMPT + cli + decl preempt_count(%ebx) +#endif ret_from_exception: movl EFLAGS(%esp),%eax # mix EFLAGS and CS movb CS(%esp),%al testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor? 
jne ret_from_sys_call +#ifdef CONFIG_PREEMPT + cmpl $0,preempt_count(%ebx) + jnz restore_all + cmpl $0,need_resched(%ebx) + jz restore_all + movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx + addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx + jnz restore_all + incl preempt_count(%ebx) + sti + call SYMBOL_NAME(preempt_schedule) + jmp ret_from_intr +#else jmp restore_all +#endif ALIGN reschedule: @@ -289,6 +327,9 @@ GET_CURRENT(%ebx) call *%edi addl $8,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(coprocessor_error) @@ -308,12 +349,18 @@ movl %cr0,%eax testl $0x4,%eax # EM (math emulation bit) jne device_not_available_emulate +#ifdef CONFIG_PREEMPT + cli +#endif call SYMBOL_NAME(math_state_restore) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP call SYMBOL_NAME(math_emulate) addl $4,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(debug) diff -urN linux-2.4.17-rc1-virgin/arch/i386/kernel/i387.c linux-2.4.17-rc1-wli3/arch/i386/kernel/i387.c --- linux-2.4.17-rc1-virgin/arch/i386/kernel/i387.c Fri Feb 23 10:09:08 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/kernel/i387.c Fri Dec 14 02:44:44 2001 @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -65,6 +66,8 @@ { struct task_struct *tsk = current; + preempt_disable(); + if (tsk->flags & PF_USEDFPU) { __save_init_fpu(tsk); return; diff -urN linux-2.4.17-rc1-virgin/arch/i386/kernel/traps.c linux-2.4.17-rc1-wli3/arch/i386/kernel/traps.c --- linux-2.4.17-rc1-virgin/arch/i386/kernel/traps.c Sun Sep 30 12:26:08 2001 +++ linux-2.4.17-rc1-wli3/arch/i386/kernel/traps.c Fri Dec 14 02:44:44 2001 @@ -697,6 +697,11 @@ */ asmlinkage void math_state_restore(struct pt_regs regs) { + /* + * CONFIG_PREEMPT + * Must be called with preemption disabled + */ + __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ if (current->used_math) { diff -urN linux-2.4.17-rc1-virgin/arch/i386/lib/dec_and_lock.c linux-2.4.17-rc1-wli3/arch/i386/lib/dec_and_lock.c --- linux-2.4.17-rc1-virgin/arch/i386/lib/dec_and_lock.c Fri Jul 7 18:20:16 2000 +++ linux-2.4.17-rc1-wli3/arch/i386/lib/dec_and_lock.c Fri Dec 14 02:44:44 2001 @@ -8,6 +8,7 @@ */ #include +#include #include int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) diff -urN linux-2.4.17-rc1-virgin/arch/ia64/config.in linux-2.4.17-rc1-wli3/arch/ia64/config.in --- linux-2.4.17-rc1-virgin/arch/ia64/config.in Fri Nov 9 14:26:17 2001 +++ linux-2.4.17-rc1-wli3/arch/ia64/config.in Fri Dec 14 04:38:23 2001 @@ -94,6 +94,10 @@ define_bool CONFIG_KCORE_ELF y # On IA-64, we always want an ELF /proc/kcore. bool 'SMP support' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi tristate 'Support running of Linux/x86 binaries' CONFIG_IA32_SUPPORT bool 'Performance monitor support' CONFIG_PERFMON tristate '/proc/pal support' CONFIG_IA64_PALINFO diff -urN linux-2.4.17-rc1-virgin/arch/m68k/config.in linux-2.4.17-rc1-wli3/arch/m68k/config.in --- linux-2.4.17-rc1-virgin/arch/m68k/config.in Mon Jun 11 19:15:27 2001 +++ linux-2.4.17-rc1-wli3/arch/m68k/config.in Fri Dec 14 04:38:23 2001 @@ -84,6 +84,10 @@ bool 'Use write-through caching for 68060 supervisor accesses' CONFIG_060_WRITETHROUGH fi fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/mips/config.in linux-2.4.17-rc1-wli3/arch/mips/config.in --- linux-2.4.17-rc1-virgin/arch/mips/config.in Mon Oct 15 13:41:34 2001 +++ linux-2.4.17-rc1-wli3/arch/mips/config.in Fri Dec 14 04:38:23 2001 @@ -275,6 +275,10 @@ fi fi fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/mips64/config.in linux-2.4.17-rc1-wli3/arch/mips64/config.in --- linux-2.4.17-rc1-virgin/arch/mips64/config.in Sun Sep 9 10:43:02 2001 +++ linux-2.4.17-rc1-wli3/arch/mips64/config.in Fri Dec 14 04:38:23 2001 @@ -25,6 +25,10 @@ bool ' Multi-Processing support' CONFIG_SMP #bool ' IP27 XXL' CONFIG_SGI_SN0_XXL fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y diff -urN linux-2.4.17-rc1-virgin/arch/parisc/config.in linux-2.4.17-rc1-wli3/arch/parisc/config.in --- linux-2.4.17-rc1-virgin/arch/parisc/config.in Tue Apr 17 17:19:25 2001 +++ linux-2.4.17-rc1-wli3/arch/parisc/config.in Fri Dec 14 04:38:23 2001 @@ -45,6 +45,10 @@ # # if [ "$CONFIG_PCI_EPIC" = "y" ]; then... # +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu diff -urN linux-2.4.17-rc1-virgin/arch/ppc/config.in linux-2.4.17-rc1-wli3/arch/ppc/config.in --- linux-2.4.17-rc1-virgin/arch/ppc/config.in Fri Nov 16 10:10:08 2001 +++ linux-2.4.17-rc1-wli3/arch/ppc/config.in Fri Dec 14 04:38:23 2001 @@ -108,6 +108,10 @@ if [ "$CONFIG_SMP" = "y" ]; then bool ' Distribute interrupts on all CPUs by default' CONFIG_IRQ_ALL_CPUS fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi if [ "$CONFIG_6xx" = "y" -a "$CONFIG_8260" = "n" ];then bool 'AltiVec Support' CONFIG_ALTIVEC diff -urN linux-2.4.17-rc1-virgin/arch/s390/config.in linux-2.4.17-rc1-wli3/arch/s390/config.in --- linux-2.4.17-rc1-virgin/arch/s390/config.in Fri Nov 9 13:58:02 2001 +++ linux-2.4.17-rc1-wli3/arch/s390/config.in Fri Dec 14 04:38:23 2001 @@ -32,6 +32,10 @@ comment 'Processor type and features' bool 'Symmetric multi-processing support' CONFIG_SMP bool 'IEEE FPU emulation' CONFIG_MATHEMU +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/s390x/config.in linux-2.4.17-rc1-wli3/arch/s390x/config.in --- linux-2.4.17-rc1-virgin/arch/s390x/config.in Thu Oct 11 09:04:57 2001 +++ linux-2.4.17-rc1-wli3/arch/s390x/config.in Fri Dec 14 04:38:23 2001 @@ -26,6 +26,10 @@ if [ "$CONFIG_S390_SUPPORT" = "y" ]; then tristate 'Kernel support for 31 bit ELF binaries' CONFIG_BINFMT_ELF32 fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc1-virgin/arch/sh/config.in linux-2.4.17-rc1-wli3/arch/sh/config.in --- linux-2.4.17-rc1-virgin/arch/sh/config.in Mon Oct 15 13:36:48 2001 +++ linux-2.4.17-rc1-wli3/arch/sh/config.in Fri Dec 14 04:38:23 2001 @@ -22,6 +22,10 @@ bool ' Set version information on all module symbols' CONFIG_MODVERSIONS bool ' Kernel module loader' CONFIG_KMOD fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment @@ -124,6 +128,8 @@ hex 'Physical memory start address' CONFIG_MEMORY_START 08000000 hex 'Physical memory size' CONFIG_MEMORY_SIZE 00400000 fi +# Preemptible kernel feature +bool 'Preemptible Kernel' CONFIG_PREEMPT endmenu if [ "$CONFIG_SH_HP690" = "y" ]; then diff -urN linux-2.4.17-rc1-virgin/arch/sh/kernel/entry.S linux-2.4.17-rc1-wli3/arch/sh/kernel/entry.S --- linux-2.4.17-rc1-virgin/arch/sh/kernel/entry.S Mon Oct 8 10:39:18 2001 +++ linux-2.4.17-rc1-wli3/arch/sh/kernel/entry.S Fri Dec 14 02:44:44 2001 @@ -60,10 +60,18 @@ /* * These are offsets into the task-struct. */ -flags = 4 +preempt_count = 4 sigpending = 8 need_resched = 20 tsk_ptrace = 24 +flags = 84 + +/* + * And these offsets are into irq_stat. + * (Find irq_cpustat_t in asm-sh/hardirq.h) + */ +local_irq_count = 8 +local_bh_count = 12 PT_TRACESYS = 0x00000002 PF_USEDFPU = 0x00100000 @@ -143,7 +151,7 @@ mov.l __INV_IMASK, r11; \ stc sr, r10; \ and r11, r10; \ - stc k_g_imask, r11; \ + stc k_g_imask, r11; \ or r11, r10; \ ldc r10, sr @@ -304,8 +312,8 @@ mov.l @(tsk_ptrace,r0), r0 ! Is current PTRACE_SYSCALL'd? mov #PT_TRACESYS, r1 tst r1, r0 - bt ret_from_syscall - bra syscall_ret_trace + bf syscall_ret_trace + bra ret_from_syscall nop .align 2 @@ -505,8 +513,6 @@ .long syscall_ret_trace __syscall_ret: .long syscall_ret -__INV_IMASK: - .long 0xffffff0f ! ~(IMASK) .align 2 @@ -518,7 +524,84 @@ .align 2 1: .long SYMBOL_NAME(schedule) +#ifdef CONFIG_PREEMPT + ! + ! Returning from interrupt during kernel mode: check if + ! preempt_schedule should be called. If need_resched flag + ! is set, preempt_count is zero, and we're not currently + ! in an interrupt handler (local irq or bottom half) then + ! call preempt_schedule. + ! + ! Increment preempt_count to prevent a nested interrupt + ! from reentering preempt_schedule, then decrement after + ! and drop through to regular interrupt return which will + ! jump back and check again in case such an interrupt did + ! come in (and didn't preempt due to preempt_count). + ! + ! NOTE: because we just checked that preempt_count was + ! zero before getting to the call, can't we use immediate + ! values (1 and 0) rather than inc/dec? Also, rather than + ! drop through to ret_from_irq, we already know this thread + ! is kernel mode, can't we go direct to ret_from_kirq? In + ! fact, with proper interrupt nesting and so forth could + ! the loop simply be on the need_resched w/o checking the + ! other stuff again? Optimize later... + ! + .align 2 +ret_from_kirq: + ! Nonzero preempt_count prevents scheduling + stc k_current, r1 + mov.l @(preempt_count,r1), r0 + cmp/eq #0, r0 + bf restore_all + ! Zero need_resched prevents scheduling + mov.l @(need_resched,r1), r0 + cmp/eq #0, r0 + bt restore_all + ! If in_interrupt(), don't schedule + mov.l __irq_stat, r1 + mov.l @(local_irq_count,r1), r0 + mov.l @(local_bh_count,r1), r1 + or r1, r0 + cmp/eq #0, r0 + bf restore_all + ! Allow scheduling using preempt_schedule + ! 
Adjust preempt_count and SR as needed. + stc k_current, r1 + mov.l @(preempt_count,r1), r0 ! Could replace this ... + add #1, r0 ! ... and this w/mov #1? + mov.l r0, @(preempt_count,r1) + STI() + mov.l __preempt_schedule, r0 + jsr @r0 + nop + /* CLI */ + stc sr, r0 + or #0xf0, r0 + ldc r0, sr + ! + stc k_current, r1 + mov.l @(preempt_count,r1), r0 ! Could replace this ... + add #-1, r0 ! ... and this w/mov #0? + mov.l r0, @(preempt_count,r1) + ! Maybe should bra ret_from_kirq, or loop over need_resched? + ! For now, fall through to ret_from_irq again... +#endif /* CONFIG_PREEMPT */ + ret_from_irq: + mov #OFF_SR, r0 + mov.l @(r0,r15), r0 ! get status register + shll r0 + shll r0 ! kernel space? +#ifndef CONFIG_PREEMPT + bt restore_all ! Yes, it's from kernel, go back soon +#else /* CONFIG_PREEMPT */ + bt ret_from_kirq ! From kernel: maybe preempt_schedule +#endif /* CONFIG_PREEMPT */ + ! + bra ret_from_syscall + nop + ret_from_exception: mov #OFF_SR, r0 mov.l @(r0,r15), r0 ! get status register @@ -564,6 +647,13 @@ .long SYMBOL_NAME(do_signal) __irq_stat: .long SYMBOL_NAME(irq_stat) +#ifdef CONFIG_PREEMPT +__preempt_schedule: + .long SYMBOL_NAME(preempt_schedule) +#endif /* CONFIG_PREEMPT */ +__INV_IMASK: + .long 0xffffff0f ! ~(IMASK) + .align 2 restore_all: @@ -679,7 +769,7 @@ __fpu_prepare_fd: .long SYMBOL_NAME(fpu_prepare_fd) __init_task_flags: - .long SYMBOL_NAME(init_task_union)+4 + .long SYMBOL_NAME(init_task_union)+flags __PF_USEDFPU: .long PF_USEDFPU #endif diff -urN linux-2.4.17-rc1-virgin/arch/sh/kernel/irq.c linux-2.4.17-rc1-wli3/arch/sh/kernel/irq.c --- linux-2.4.17-rc1-virgin/arch/sh/kernel/irq.c Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/arch/sh/kernel/irq.c Fri Dec 14 02:44:44 2001 @@ -229,6 +229,14 @@ struct irqaction * action; unsigned int status; + /* + * At this point we're now about to actually call handlers, + * and interrupts might get reenabled during them... bump + * preempt_count to prevent any preemption while the handler + * called here is pending... + */ + preempt_disable(); + /* Get IRQ number */ asm volatile("stc r2_bank, %0\n\t" "shlr2 %0\n\t" @@ -298,8 +306,17 @@ desc->handler->end(irq); spin_unlock(&desc->lock); + if (softirq_pending(cpu)) do_softirq(); + + /* + * We're done with the handlers, interrupts should be + * currently disabled; decrement preempt_count now so + * as we return preemption may be allowed... + */ + preempt_enable_no_resched(); + return 1; } diff -urN linux-2.4.17-rc1-virgin/arch/sparc/config.in linux-2.4.17-rc1-wli3/arch/sparc/config.in --- linux-2.4.17-rc1-virgin/arch/sparc/config.in Mon Jun 11 19:15:27 2001 +++ linux-2.4.17-rc1-wli3/arch/sparc/config.in Fri Dec 14 04:38:23 2001 @@ -28,6 +28,10 @@ define_bool CONFIG_VT_CONSOLE y bool 'Symmetric multi-processing support (does not work on sun4/sun4c)' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi # Identify this as a Sparc32 build define_bool CONFIG_SPARC32 y diff -urN linux-2.4.17-rc1-virgin/arch/sparc64/config.in linux-2.4.17-rc1-wli3/arch/sparc64/config.in --- linux-2.4.17-rc1-virgin/arch/sparc64/config.in Fri Dec 14 06:04:01 2001 +++ linux-2.4.17-rc1-wli3/arch/sparc64/config.in Fri Dec 14 04:38:23 2001 @@ -27,6 +27,10 @@ define_bool CONFIG_VT_CONSOLE y bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi # Identify this as a Sparc64 build define_bool CONFIG_SPARC64 y diff -urN linux-2.4.17-rc1-virgin/drivers/block/elevator.c linux-2.4.17-rc1-wli3/drivers/block/elevator.c --- linux-2.4.17-rc1-virgin/drivers/block/elevator.c Thu Jul 19 20:59:41 2001 +++ linux-2.4.17-rc1-wli3/drivers/block/elevator.c Sat Dec 15 14:54:07 2001 @@ -74,11 +74,10 @@ return 0; } - int elevator_linus_merge(request_queue_t *q, struct request **req, struct list_head * head, struct buffer_head *bh, int rw, - int max_sectors) + int max_sectors, int max_bomb_segments) { struct list_head *entry = &q->queue_head; unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE; @@ -116,6 +115,56 @@ } } + /* + * If we failed to merge a read anywhere in the request + * queue, we really don't want to place it at the end + * of the list, behind lots of writes. So place it near + * the front. + * + * We don't want to place it in front of _all_ writes: that + * would create lots of seeking, and isn't tunable. + * We try to avoid promoting this read in front of existing + * reads. + * + * max_bomb_sectors becomes the maximum number of write + * requests which we allow to remain in place in front of + * a newly introduced read. We weight things a little bit, + * so large writes are more expensive than small ones, but it's + * requests which count, not sectors. + */ + if (max_bomb_segments && rw == READ && ret == ELEVATOR_NO_MERGE) { + int cur_latency = 0; + struct request * const cur_request = *req; + + entry = head->next; + while (entry != &q->queue_head) { + struct request *__rq; + + if (entry == &q->queue_head) + BUG(); + if (entry == q->queue_head.next && + q->head_active && !q->plugged) + BUG(); + __rq = blkdev_entry_to_request(entry); + + if (__rq == cur_request) { + /* + * This is where the old algorithm placed it. + * There's no point pushing it further back, + * so leave it here, in sorted order. 
+ */ + break; + } + if (__rq->cmd == WRITE) { + cur_latency += 1 + __rq->nr_sectors / 64; + if (cur_latency >= max_bomb_segments) { + *req = __rq; + break; + } + } + entry = entry->next; + } + } return ret; } @@ -144,7 +193,7 @@ int elevator_noop_merge(request_queue_t *q, struct request **req, struct list_head * head, struct buffer_head *bh, int rw, - int max_sectors) + int max_sectors, int max_bomb_segments) { struct list_head *entry; unsigned int count = bh->b_size >> 9; @@ -188,7 +237,7 @@ output.queue_ID = elevator->queue_ID; output.read_latency = elevator->read_latency; output.write_latency = elevator->write_latency; - output.max_bomb_segments = 0; + output.max_bomb_segments = elevator->max_bomb_segments; if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t))) return -EFAULT; @@ -207,9 +256,12 @@ return -EINVAL; if (input.write_latency < 0) return -EINVAL; + if (input.max_bomb_segments < 0) + return -EINVAL; elevator->read_latency = input.read_latency; elevator->write_latency = input.write_latency; + elevator->max_bomb_segments = input.max_bomb_segments; return 0; } diff -urN linux-2.4.17-rc1-virgin/drivers/block/ll_rw_blk.c linux-2.4.17-rc1-wli3/drivers/block/ll_rw_blk.c --- linux-2.4.17-rc1-virgin/drivers/block/ll_rw_blk.c Mon Oct 29 12:11:17 2001 +++ linux-2.4.17-rc1-wli3/drivers/block/ll_rw_blk.c Sat Dec 15 14:54:07 2001 @@ -690,7 +690,8 @@ } else if (q->head_active && !q->plugged) head = head->next; - el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); + el_ret = elevator->elevator_merge_fn(q, &req, head, bh, + rw, max_sectors, elevator->max_bomb_segments); switch (el_ret) { case ELEVATOR_BACK_MERGE: diff -urN linux-2.4.17-rc1-virgin/drivers/char/mem.c linux-2.4.17-rc1-wli3/drivers/char/mem.c --- linux-2.4.17-rc1-virgin/drivers/char/mem.c Fri Dec 14 06:04:02 2001 +++ linux-2.4.17-rc1-wli3/drivers/char/mem.c Sun Dec 16 17:58:10 2001 @@ -272,8 +272,6 @@ return virtr + read; } -extern long vwrite(char *buf, char *addr, unsigned long count); - /* * This function writes to the *virtual* memory as seen by the kernel. 
*/ @@ -281,46 +279,12 @@ size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t wrote = 0; - ssize_t virtr = 0; - char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - wrote = count; - if (count > (unsigned long) high_memory - p) - wrote = (unsigned long) high_memory - p; - - wrote = do_write_mem(file, (void*)p, p, buf, wrote, ppos); - - p += wrote; - buf += wrote; - count -= wrote; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - if (len && copy_from_user(kbuf, buf, len)) { - free_page((unsigned long)kbuf); - return -EFAULT; - } - len = vwrite(kbuf, (char *)p, len); - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - *ppos = p; - return virtr + wrote; + if (p >= (unsigned long) high_memory) + return 0; + if (count > (unsigned long) high_memory - p) + count = (unsigned long) high_memory - p; + return do_write_mem(file, (void*)p, p, buf, count, ppos); } #if !defined(__mc68000__) @@ -400,7 +364,7 @@ if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, ZPR_NORMAL); zeromap_page_range(addr, count, PAGE_COPY); size -= count; diff -urN linux-2.4.17-rc1-virgin/drivers/char/mem.c~ linux-2.4.17-rc1-wli3/drivers/char/mem.c~ --- linux-2.4.17-rc1-virgin/drivers/char/mem.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/drivers/char/mem.c~ Fri Dec 14 03:53:03 2001 @@ -0,0 +1,642 @@ +/* + * linux/drivers/char/mem.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Added devfs support. + * Jan-11-1998, C. Scott Ananian + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_I2C +extern int i2c_init_all(void); +#endif +#ifdef CONFIG_FB +extern void fbmem_init(void); +#endif +#ifdef CONFIG_PROM_CONSOLE +extern void prom_con_init(void); +#endif +#ifdef CONFIG_MDA_CONSOLE +extern void mda_console_init(void); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) +extern void tapechar_init(void); +#endif + +static ssize_t do_write_mem(struct file * file, void *p, unsigned long realp, + const char * buf, size_t count, loff_t *ppos) +{ + ssize_t written; + + written = 0; +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (realp < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE-realp; + if (sz > count) sz = count; + /* Hmm. Do something? */ + buf+=sz; + p+=sz; + count-=sz; + written+=sz; + } +#endif + if (copy_from_user(p, buf, count)) + return -EFAULT; + written += count; + *ppos += written; + return written; +} + + +/* + * This funcion reads the *physical* memory. The f_pos points directly to the + * memory location. + */ +static ssize_t read_mem(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + unsigned long end_mem; + ssize_t read; + + end_mem = __pa(high_memory); + if (p >= end_mem) + return 0; + if (count > end_mem - p) + count = end_mem - p; + read = 0; +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. 
*/ + if (p < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE-p; + if (sz > count) + sz = count; + if (sz > 0) { + if (clear_user(buf, sz)) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + } +#endif + if (copy_to_user(buf, __va(p), count)) + return -EFAULT; + read += count; + *ppos += read; + return read; +} + +static ssize_t write_mem(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + unsigned long end_mem; + + end_mem = __pa(high_memory); + if (p >= end_mem) + return 0; + if (count > end_mem - p) + count = end_mem - p; + return do_write_mem(file, __va(p), p, buf, count, ppos); +} + +#ifndef pgprot_noncached + +/* + * This should probably be per-architecture in + */ +static inline pgprot_t pgprot_noncached(pgprot_t _prot) +{ + unsigned long prot = pgprot_val(_prot); + +#if defined(__i386__) || defined(__x86_64__) + /* On PPro and successors, PCD alone doesn't always mean + uncached because of interactions with the MTRRs. PCD | PWT + means definitely uncached. */ + if (boot_cpu_data.x86 > 3) + prot |= _PAGE_PCD | _PAGE_PWT; +#elif defined(__powerpc__) + prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; +#elif defined(__mc68000__) +#ifdef SUN3_PAGE_NOCACHE + if (MMU_IS_SUN3) + prot |= SUN3_PAGE_NOCACHE; + else +#endif + if (MMU_IS_851 || MMU_IS_030) + prot |= _PAGE_NOCACHE030; + /* Use no-cache mode, serialized */ + else if (MMU_IS_040 || MMU_IS_060) + prot = (prot & _CACHEMASK040) | _PAGE_NOCACHE_S; +#endif + + return __pgprot(prot); +} + +#endif /* !pgprot_noncached */ + +/* + * Architectures vary in how they handle caching for addresses + * outside of main memory. + */ +static inline int noncached_address(unsigned long addr) +{ +#if defined(__i386__) + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting PCD or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + return !( test_bit(X86_FEATURE_MTRR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, &boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, &boot_cpu_data.x86_capability) ) + && addr >= __pa(high_memory); +#else + return addr >= __pa(high_memory); +#endif +} + +static int mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Accessing memory above the top the kernel knows about or + * through a file pointer that was marked O_SYNC will be + * done non-cached. + */ + if (noncached_address(offset) || (file->f_flags & O_SYNC)) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* Don't try to swap out physical pages.. */ + vma->vm_flags |= VM_RESERVED; + + /* + * Don't dump addresses that are not real memory to a core file. + */ + if (offset >= __pa(high_memory) || (file->f_flags & O_SYNC)) + vma->vm_flags |= VM_IO; + + if (remap_page_range(vma->vm_start, offset, vma->vm_end-vma->vm_start, + vma->vm_page_prot)) + return -EAGAIN; + return 0; +} + +/* + * This function reads the *virtual* memory as seen by the kernel. 
+ */ +static ssize_t read_kmem(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read = 0; + ssize_t virtr = 0; + char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ + + if (p < (unsigned long) high_memory) { + read = count; + if (count > (unsigned long) high_memory - p) + read = (unsigned long) high_memory - p; + +#if defined(__sparc__) || defined(__mc68000__) + /* we don't have page 0 mapped on sparc and m68k.. */ + if (p < PAGE_SIZE && read > 0) { + size_t tmp = PAGE_SIZE - p; + if (tmp > read) tmp = read; + if (clear_user(buf, tmp)) + return -EFAULT; + buf += tmp; + p += tmp; + read -= tmp; + count -= tmp; + } +#endif + if (copy_to_user(buf, (char *)p, read)) + return -EFAULT; + p += read; + buf += read; + count -= read; + } + + if (count > 0) { + kbuf = (char *)__get_free_page(GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + while (count > 0) { + int len = count; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + len = vread(kbuf, (char *)p, len); + if (!len) + break; + if (copy_to_user(buf, kbuf, len)) { + free_page((unsigned long)kbuf); + return -EFAULT; + } + count -= len; + buf += len; + virtr += len; + p += len; + } + free_page((unsigned long)kbuf); + } + *ppos = p; + return virtr + read; +} + +/* + * This function writes to the *virtual* memory as seen by the kernel. + */ +static ssize_t write_kmem(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + + if (p >= (unsigned long) high_memory) + return 0; + if (count > (unsigned long) high_memory - p) + count = (unsigned long) high_memory - p; + return do_write_mem(file, (void*)p, p, buf, count, ppos); +} + +#if !defined(__mc68000__) +static ssize_t read_port(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long i = *ppos; + char *tmp = buf; + + if (verify_area(VERIFY_WRITE,buf,count)) + return -EFAULT; + while (count-- > 0 && i < 65536) { + if (__put_user(inb(i),tmp) < 0) + return -EFAULT; + i++; + tmp++; + } + *ppos = i; + return tmp-buf; +} + +static ssize_t write_port(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + unsigned long i = *ppos; + const char * tmp = buf; + + if (verify_area(VERIFY_READ,buf,count)) + return -EFAULT; + while (count-- > 0 && i < 65536) { + char c; + if (__get_user(c, tmp)) + return -EFAULT; + outb(c,i); + i++; + tmp++; + } + *ppos = i; + return tmp-buf; +} +#endif + +static ssize_t read_null(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t write_null(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return count; +} + +/* + * For fun, we are using the MMU for this. + */ +static inline size_t read_zero_pagealigned(char * buf, size_t size) +{ + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long addr=(unsigned long)buf; + + mm = current->mm; + /* Oops, this was forgotten before. -ben */ + down_read(&mm->mmap_sem); + + /* For private mappings, just map in zero pages. */ + for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + unsigned long count; + + if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) + goto out_up; + if (vma->vm_flags & VM_SHARED) + break; + count = vma->vm_end - addr; + if (count > size) + count = size; + + zap_page_range(mm, addr, count); + zeromap_page_range(addr, count, PAGE_COPY); + + size -= count; + buf += count; + addr += count; + if (size == 0) + goto out_up; + } + + up_read(&mm->mmap_sem); + + /* The shared case is hard. 
Let's do the conventional zeroing. */ + do { + unsigned long unwritten = clear_user(buf, PAGE_SIZE); + if (unwritten) + return size + unwritten - PAGE_SIZE; + if (current->need_resched) + schedule(); + buf += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size); + + return size; +out_up: + up_read(&mm->mmap_sem); + return size; +} + +static ssize_t read_zero(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + unsigned long left, unwritten, written = 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + left = count; + + /* do we want to be clever? Arbitrary cut-off */ + if (count >= PAGE_SIZE*4) { + unsigned long partial; + + /* How much left of the page? */ + partial = (PAGE_SIZE-1) & -(unsigned long) buf; + unwritten = clear_user(buf, partial); + written = partial - unwritten; + if (unwritten) + goto out; + left -= partial; + buf += partial; + unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); + written += (left & PAGE_MASK) - unwritten; + if (unwritten) + goto out; + buf += left & PAGE_MASK; + left &= ~PAGE_MASK; + } + unwritten = clear_user(buf, left); + written += left - unwritten; +out: + return written ? written : -EFAULT; +} + +static int mmap_zero(struct file * file, struct vm_area_struct * vma) +{ + if (vma->vm_flags & VM_SHARED) + return shmem_zero_setup(vma); + if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return -EAGAIN; + return 0; +} + +static ssize_t write_full(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return -ENOSPC; +} + +/* + * Special lseek() function for /dev/null and /dev/zero. Most notably, you + * can fopen() both devices with "a" now. This was previously impossible. + * -- SRB. + */ + +static loff_t null_lseek(struct file * file, loff_t offset, int orig) +{ + return file->f_pos = 0; +} + +/* + * The memory devices use the full 32/64 bits of the offset, and so we cannot + * check against negative addresses: they are ok. The return value is weird, + * though, in that case (0). + * + * also note that seeking relative to the "end of file" isn't supported: + * it has no meaning, so it returns -EINVAL. + */ +static loff_t memory_lseek(struct file * file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + return file->f_pos; + case 1: + file->f_pos += offset; + return file->f_pos; + default: + return -EINVAL; + } +} + +static int open_port(struct inode * inode, struct file * filp) +{ + return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + +#define mmap_kmem mmap_mem +#define zero_lseek null_lseek +#define full_lseek null_lseek +#define write_zero write_null +#define read_full read_zero +#define open_mem open_port +#define open_kmem open_mem + +static struct file_operations mem_fops = { + llseek: memory_lseek, + read: read_mem, + write: write_mem, + mmap: mmap_mem, + open: open_mem, +}; + +static struct file_operations kmem_fops = { + llseek: memory_lseek, + read: read_kmem, + write: write_kmem, + mmap: mmap_kmem, + open: open_kmem, +}; + +static struct file_operations null_fops = { + llseek: null_lseek, + read: read_null, + write: write_null, +}; + +#if !defined(__mc68000__) +static struct file_operations port_fops = { + llseek: memory_lseek, + read: read_port, + write: write_port, + open: open_port, +}; +#endif + +static struct file_operations zero_fops = { + llseek: zero_lseek, + read: read_zero, + write: write_zero, + mmap: mmap_zero, +}; + +static struct file_operations full_fops = { + llseek: full_lseek, + read: read_full, + write: write_full, +}; + +static int memory_open(struct inode * inode, struct file * filp) +{ + switch (MINOR(inode->i_rdev)) { + case 1: + filp->f_op = &mem_fops; + break; + case 2: + filp->f_op = &kmem_fops; + break; + case 3: + filp->f_op = &null_fops; + break; +#if !defined(__mc68000__) + case 4: + filp->f_op = &port_fops; + break; +#endif + case 5: + filp->f_op = &zero_fops; + break; + case 7: + filp->f_op = &full_fops; + break; + case 8: + filp->f_op = &random_fops; + break; + case 9: + filp->f_op = &urandom_fops; + break; + default: + return -ENXIO; + } + if (filp->f_op && filp->f_op->open) + return filp->f_op->open(inode,filp); + return 0; +} + +void __init memory_devfs_register (void) +{ + /* These are never unregistered */ + static const struct { + unsigned short minor; + char *name; + umode_t mode; + struct file_operations *fops; + } list[] = { /* list of minor devices */ + {1, "mem", S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops}, + {2, "kmem", S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops}, + {3, "null", S_IRUGO | S_IWUGO, &null_fops}, + {4, "port", S_IRUSR | S_IWUSR | S_IRGRP, &port_fops}, + {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, + {7, "full", S_IRUGO | S_IWUGO, &full_fops}, + {8, "random", S_IRUGO | S_IWUSR, &random_fops}, + {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} + }; + int i; + + for (i=0; i<(sizeof(list)/sizeof(*list)); i++) + devfs_register (NULL, list[i].name, DEVFS_FL_NONE, + MEM_MAJOR, list[i].minor, + list[i].mode | S_IFCHR, + list[i].fops, NULL); +} + +static struct file_operations memory_fops = { + open: memory_open, /* just a selector for the real open */ +}; + +int __init chr_dev_init(void) +{ + if (devfs_register_chrdev(MEM_MAJOR,"mem",&memory_fops)) + printk("unable to get major %d for memory devs\n", MEM_MAJOR); + memory_devfs_register(); + rand_initialize(); +#ifdef CONFIG_I2C + i2c_init_all(); +#endif +#if defined (CONFIG_FB) + fbmem_init(); +#endif +#if defined (CONFIG_PROM_CONSOLE) + prom_con_init(); +#endif +#if defined (CONFIG_MDA_CONSOLE) + mda_console_init(); +#endif + tty_init(); +#ifdef CONFIG_M68K_PRINTER + lp_m68k_init(); +#endif + misc_init(); +#if CONFIG_QIC02_TAPE + qic02_tape_init(); +#endif +#ifdef CONFIG_FTAPE + ftape_init(); +#endif +#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) + tapechar_init(); +#endif + return 0; +} + +__initcall(chr_dev_init); diff -urN linux-2.4.17-rc1-virgin/drivers/char/tty_io.c linux-2.4.17-rc1-wli3/drivers/char/tty_io.c --- linux-2.4.17-rc1-virgin/drivers/char/tty_io.c Fri Dec 14 06:04:02 
2001 +++ linux-2.4.17-rc1-wli3/drivers/char/tty_io.c Sun Dec 16 17:58:10 2001 @@ -722,6 +722,7 @@ ret = -ERESTARTSYS; if (signal_pending(current)) break; + debug_lock_break(551); if (current->need_resched) schedule(); } diff -urN linux-2.4.17-rc1-virgin/drivers/ieee1394/csr.c linux-2.4.17-rc1-wli3/drivers/ieee1394/csr.c --- linux-2.4.17-rc1-virgin/drivers/ieee1394/csr.c Thu Jul 19 17:48:15 2001 +++ linux-2.4.17-rc1-wli3/drivers/ieee1394/csr.c Fri Dec 14 02:44:44 2001 @@ -10,6 +10,7 @@ */ #include +#include #include "ieee1394_types.h" #include "hosts.h" diff -urN linux-2.4.17-rc1-virgin/fs/adfs/map.c linux-2.4.17-rc1-wli3/fs/adfs/map.c --- linux-2.4.17-rc1-virgin/fs/adfs/map.c Thu Oct 25 13:53:53 2001 +++ linux-2.4.17-rc1-wli3/fs/adfs/map.c Fri Dec 14 02:44:44 2001 @@ -12,6 +12,7 @@ #include #include #include +#include #include "adfs.h" diff -urN linux-2.4.17-rc1-virgin/fs/binfmt_elf.c linux-2.4.17-rc1-wli3/fs/binfmt_elf.c --- linux-2.4.17-rc1-virgin/fs/binfmt_elf.c Fri Dec 14 06:04:11 2001 +++ linux-2.4.17-rc1-wli3/fs/binfmt_elf.c Fri Dec 14 03:53:30 2001 @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -1032,25 +1032,6 @@ elf_fpregset_t fpu; /* NT_PRFPREG */ struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ - /* first copy the parameters from user space */ - memset(&psinfo, 0, sizeof(psinfo)); - { - int i, len; - - len = current->mm->arg_end - current->mm->arg_start; - if (len >= ELF_PRARGSZ) - len = ELF_PRARGSZ-1; - copy_from_user(&psinfo.pr_psargs, - (const char *)current->mm->arg_start, len); - for(i = 0; i < len; i++) - if (psinfo.pr_psargs[i] == 0) - psinfo.pr_psargs[i] = ' '; - psinfo.pr_psargs[len] = 0; - - } - - /* now stop all vm operations */ - down_write(¤t->mm->mmap_sem); segs = current->mm->map_count; #ifdef DEBUG @@ -1092,6 +1073,7 @@ * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. 
*/ + memset(&psinfo, 0, sizeof(psinfo)); memset(&prstatus, 0, sizeof(prstatus)); notes[0].name = "CORE"; @@ -1147,6 +1129,23 @@ psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); + { + int i, len; + + set_fs(fs); + + len = current->mm->arg_end - current->mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ-1; + copy_from_user(&psinfo.pr_psargs, + (const char *)current->mm->arg_start, len); + for(i = 0; i < len; i++) + if (psinfo.pr_psargs[i] == 0) + psinfo.pr_psargs[i] = ' '; + psinfo.pr_psargs[len] = 0; + + set_fs(KERNEL_DS); + } strncpy(psinfo.pr_fname, current->comm, sizeof(psinfo.pr_fname)); notes[2].name = "CORE"; @@ -1218,6 +1217,8 @@ if (!writenote(¬es[i], file)) goto end_coredump; + set_fs(fs); + DUMP_SEEK(dataoff); for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { @@ -1231,24 +1232,22 @@ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { - struct page* page; - struct vm_area_struct *vma; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgd)) + goto nextpage_coredump; + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto nextpage_coredump; + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) { +nextpage_coredump: DUMP_SEEK (file->f_pos + PAGE_SIZE); } else { - if (page == ZERO_PAGE(addr)) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); - } else { - void *kaddr; - flush_cache_page(vma, addr); - kaddr = kmap(page); - DUMP_WRITE(kaddr, PAGE_SIZE); - flush_page_to_ram(page); - kunmap(page); - } - put_page(page); + DUMP_WRITE((void*)addr, PAGE_SIZE); } } } @@ -1261,7 +1260,6 @@ end_coredump: set_fs(fs); - up_write(¤t->mm->mmap_sem); return has_dumped; } #endif /* USE_ELF_CORE_DUMP */ diff -urN linux-2.4.17-rc1-virgin/fs/buffer.c linux-2.4.17-rc1-wli3/fs/buffer.c --- linux-2.4.17-rc1-virgin/fs/buffer.c Fri Dec 14 06:04:11 2001 +++ linux-2.4.17-rc1-wli3/fs/buffer.c Sun Dec 16 22:28:34 2001 @@ -254,7 +254,6 @@ while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; - if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -262,7 +261,13 @@ } if (dev && bh->b_dev != dev) continue; - +#if 0 + if (conditional_schedule_needed()) { + debug_lock_break(1); + spin_unlock(&lru_list_lock); + return -EAGAIN; + } +#endif get_bh(bh); spin_unlock(&lru_list_lock); wait_on_buffer (bh); @@ -459,13 +464,19 @@ return err; } -/* After several hours of tedious analysis, the following hash - * function won. Do not mess with it... -DaveM +/* + * The shift/add buffer cache hash function from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 6 describes the behavior of various buffer cache hashes. + * + * The lack of an attempt to mix the bits of dev in this hash + * function appears disturbing to me, but I don't have the + * resources to investigate the value of attempting to do so. + * -- wli */ -#define _hashfn(dev,block) \ - ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ - (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ - ((block) << (bh_hash_shift - 12)))) +#define _hashfn(dev, block) \ + ( (block << 7) - block + (block >> 10) + (block >> 18) ) + #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] static inline void __insert_into_hash_list(struct buffer_head *bh) @@ -672,6 +683,13 @@ /* Not hashed? 
*/ if (!bh->b_pprev) continue; + if (conditional_schedule_needed()) { + debug_lock_break(2); /* bkl is held too */ + get_bh(bh); + break_spin_lock_and_resched(&lru_list_lock); + put_bh(bh); + slept = 1; + } if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -719,11 +737,9 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages(GFP_NOFS); run_task_queue(&tq_disk); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); @@ -823,6 +839,8 @@ struct buffer_head *bh; struct inode tmp; int err = 0, err2; + + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_buffers); @@ -844,6 +862,12 @@ spin_lock(&lru_list_lock); } } + /* haven't hit this code path ... */ + debug_lock_break(551); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + break_spin_lock(&lru_list_lock); + } } while (!list_empty(&tmp.i_dirty_buffers)) { @@ -873,6 +897,7 @@ struct inode tmp; int err = 0, err2; + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); spin_lock(&lru_list_lock); @@ -904,9 +929,14 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + debug_lock_break(1); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + conditional_schedule(); + } spin_lock(&lru_list_lock); } - + spin_unlock(&lru_list_lock); err2 = osync_inode_data_buffers(inode); @@ -933,6 +963,8 @@ struct list_head *list; int err = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&lru_list_lock); repeat: @@ -940,6 +972,17 @@ for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + /* untested code path ... */ + debug_lock_break(551); + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + if (conditional_schedule_needed()) { + break_spin_lock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); diff -urN linux-2.4.17-rc1-virgin/fs/buffer.c~ linux-2.4.17-rc1-wli3/fs/buffer.c~ --- linux-2.4.17-rc1-virgin/fs/buffer.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/fs/buffer.c~ Sat Dec 15 08:36:14 2001 @@ -0,0 +1,2869 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'buffer.c' implements the buffer-cache functions. Race-conditions have + * been avoided by NEVER letting an interrupt change a buffer (except for the + * data, of course), but instead letting the caller do it. + */ + +/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */ + +/* Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + */ + +/* Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. -DaveM + */ + +/* Added 32k buffer block sizes - these are required older ARM systems. + * - RMK + */ + +/* Thread it... 
-DaveM */ + +/* async buffer flushing, 1999 Andrea Arcangeli */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) +#define NR_RESERVED (10*MAX_BUF_PER_PAGE) +#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this + number of unused buffer heads */ + +/* Anti-deadlock ordering: + * lru_list_lock > hash_table_lock > unused_list_lock + */ + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers) + +/* + * Hash table gook.. + */ +static unsigned int bh_hash_mask; +static unsigned int bh_hash_shift; +static struct buffer_head **hash_table; +static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED; + +static struct buffer_head *lru_list[NR_LIST]; +static spinlock_t lru_list_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static int nr_buffers_type[NR_LIST]; +static unsigned long size_buffers_type[NR_LIST]; + +static struct buffer_head * unused_list; +static int nr_unused_buffer_heads; +static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; +static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); + +static int grow_buffers(kdev_t dev, unsigned long block, int size); +static void __refile_buffer(struct buffer_head *); + +/* This is used by some architectures to estimate available memory. */ +atomic_t buffermem_pages = ATOMIC_INIT(0); + +/* Here is the parameter block for the bdflush process. If you add or + * remove any of the parameters, make sure to update kernel/sysctl.c + * and the documentation at linux/Documentation/sysctl/vm.txt. + */ + +#define N_PARAM 9 + +/* The dummy values in this structure are left in there for compatibility + * with old programs that play with the /proc entries. + */ +union bdflush_param { + struct { + int nfract; /* Percentage of buffer cache dirty to + activate bdflush */ + int dummy1; /* old "ndirty" */ + int dummy2; /* old "nrefill" */ + int dummy3; /* unused */ + int interval; /* jiffies delay between kupdate flushes */ + int age_buffer; /* Time for normal buffer to age before we flush it */ + int nfract_sync;/* Percentage of buffer cache dirty to + activate bdflush synchronously */ + int dummy4; /* unused */ + int dummy5; /* unused */ + } b_un; + unsigned int data[N_PARAM]; +} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}}; + +/* These are the min and max parameter values that we will allow to be assigned */ +int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; +int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0}; + +void unlock_buffer(struct buffer_head *bh) +{ + clear_bit(BH_Wait_IO, &bh->b_state); + clear_bit(BH_launder, &bh->b_state); + clear_bit(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + if (waitqueue_active(&bh->b_wait)) + wake_up(&bh->b_wait); +} + +/* + * Rewrote the wait-routines to use the "new" wait-queue functionality, + * and getting rid of the cli-sti pairs. The wait-queue routines still + * need cli-sti, but now it's just a couple of 386 instructions or so. + * + * Note that the real wait_on_buffer() is an inline function that checks + * if 'b_wait' is set before calling this, so that the queues aren't set + * up unnecessarily. 
+ */ +void __wait_on_buffer(struct buffer_head * bh) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + get_bh(bh); + add_wait_queue(&bh->b_wait, &wait); + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!buffer_locked(bh)) + break; + schedule(); + } while (buffer_locked(bh)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&bh->b_wait, &wait); + put_bh(bh); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + put_bh(bh); +} + +/* + * The buffers have been marked clean and locked. Just submit the dang + * things.. + */ +static void write_locked_buffers(struct buffer_head **array, unsigned int count) +{ + do { + struct buffer_head * bh = *array++; + bh->b_end_io = end_buffer_io_sync; + submit_bh(WRITE, bh); + } while (--count); +} + +/* + * Write some buffers from the head of the dirty queue. + * + * This must be called with the LRU lock held, and will + * return without it! + */ +#define NRSYNC (32) +static int write_some_buffers(kdev_t dev) +{ + struct buffer_head *next; + struct buffer_head *array[NRSYNC]; + unsigned int count; + int nr; + + next = lru_list[BUF_DIRTY]; + nr = nr_buffers_type[BUF_DIRTY]; + count = 0; + while (next && --nr >= 0) { + struct buffer_head * bh = next; + next = bh->b_next_free; + + if (dev && bh->b_dev != dev) + continue; + if (test_and_set_bit(BH_Lock, &bh->b_state)) + continue; + if (atomic_set_buffer_clean(bh)) { + __refile_buffer(bh); + get_bh(bh); + array[count++] = bh; + if (count < NRSYNC) + continue; + + spin_unlock(&lru_list_lock); + write_locked_buffers(array, count); + return -EAGAIN; + } + unlock_buffer(bh); + __refile_buffer(bh); + } + spin_unlock(&lru_list_lock); + + if (count) + write_locked_buffers(array, count); + return 0; +} + +/* + * Write out all buffers on the dirty list. + */ +static void write_unlocked_buffers(kdev_t dev) +{ + do { + spin_lock(&lru_list_lock); + } while (write_some_buffers(dev)); + run_task_queue(&tq_disk); +} + +/* + * Wait for a buffer on the proper list. + * + * This must be called with the LRU lock held, and + * will return with it released. + */ +static int wait_for_buffers(kdev_t dev, int index, int refile) +{ + struct buffer_head * next; + int nr; + + next = lru_list[index]; + nr = nr_buffers_type[index]; + while (next && --nr >= 0) { + struct buffer_head *bh = next; + next = bh->b_next_free; + + if (!buffer_locked(bh)) { + if (refile) + __refile_buffer(bh); + continue; + } + if (dev && bh->b_dev != dev) + continue; + + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer (bh); + put_bh(bh); + return -EAGAIN; + } + spin_unlock(&lru_list_lock); + return 0; +} + +static inline void wait_for_some_buffers(kdev_t dev) +{ + spin_lock(&lru_list_lock); + wait_for_buffers(dev, BUF_LOCKED, 1); +} + +static int wait_for_locked_buffers(kdev_t dev, int index, int refile) +{ + do { + spin_lock(&lru_list_lock); + } while (wait_for_buffers(dev, index, refile)); + return 0; +} + +/* Call sync_buffers with wait!=0 to ensure that the call does not + * return until all buffer writes have completed. Sync() may return + * before the writes have finished; fsync() may not. + */ + +/* Godamity-damn. Some buffers (bitmaps for filesystems) + * spontaneously dirty themselves without ever brelse being called. 
+ * We will ultimately want to put these in a separate list, but for + * now we search all of the lists for dirty buffers. + */ +int sync_buffers(kdev_t dev, int wait) +{ + int err = 0; + + /* One pass for no-wait, three for wait: + * 0) write out all dirty, unlocked buffers; + * 1) wait for all dirty locked buffers; + * 2) write out all dirty, unlocked buffers; + * 2) wait for completion by waiting for all buffers to unlock. + */ + write_unlocked_buffers(dev); + if (wait) { + err = wait_for_locked_buffers(dev, BUF_DIRTY, 0); + write_unlocked_buffers(dev); + err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1); + } + return err; +} + +int fsync_super(struct super_block *sb) +{ + kdev_t dev = sb->s_dev; + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes_sb(sb); + DQUOT_SYNC(dev); + lock_super(sb); + if (sb->s_dirt && sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +int fsync_no_super(kdev_t dev) +{ + sync_buffers(dev, 0); + return sync_buffers(dev, 1); +} + +int fsync_dev(kdev_t dev) +{ + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes(dev); + DQUOT_SYNC(dev); + sync_supers(dev); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +/* + * There's no real reason to pretend we should + * ever do anything differently + */ +void sync_dev(kdev_t dev) +{ + fsync_dev(dev); +} + +asmlinkage long sys_sync(void) +{ + fsync_dev(0); + return 0; +} + +/* + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + kdev_t dev; + int ret; + + lock_kernel(); + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + dev = inode->i_dev; + ret = sync_buffers(dev, 1); + unlock_kernel(); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + /* We need to protect against concurrent writers.. */ + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 0); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int err; + + err = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + down(&inode->i_sem); + filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 1); + filemap_fdatawait(inode->i_mapping); + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return err; +} + +/* + * The shift/add buffer cache hash function from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 6 describes the behavior of various buffer cache hashes. 
+ * + * The lack of an attempt to mix the bits of dev in this hash + * function appears disturbing to me, but I don't have the + * resources to investigate the value of attempting to do so. + * -- wli + */ +#define _hashfn(dev, block) \ + ( (block << 7) - block + (block >> 10) + (block >> 18) ) + +#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] + +static inline void __insert_into_hash_list(struct buffer_head *bh) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head *next = *head; + + *head = bh; + bh->b_pprev = head; + bh->b_next = next; + if (next != NULL) + next->b_pprev = &bh->b_next; +} + +static __inline__ void __hash_unlink(struct buffer_head *bh) +{ + struct buffer_head **pprev = bh->b_pprev; + if (pprev) { + struct buffer_head *next = bh->b_next; + if (next) + next->b_pprev = pprev; + *pprev = next; + bh->b_pprev = NULL; + } +} + +static void __insert_into_lru_list(struct buffer_head * bh, int blist) +{ + struct buffer_head **bhp = &lru_list[blist]; + + if (bh->b_prev_free || bh->b_next_free) BUG(); + + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + nr_buffers_type[blist]++; + size_buffers_type[blist] += bh->b_size; +} + +static void __remove_from_lru_list(struct buffer_head * bh) +{ + struct buffer_head *next = bh->b_next_free; + if (next) { + struct buffer_head *prev = bh->b_prev_free; + int blist = bh->b_list; + + prev->b_next_free = next; + next->b_prev_free = prev; + if (lru_list[blist] == bh) { + if (next == bh) + next = NULL; + lru_list[blist] = next; + } + bh->b_next_free = NULL; + bh->b_prev_free = NULL; + nr_buffers_type[blist]--; + size_buffers_type[blist] -= bh->b_size; + } +} + +/* must be called with both the hash_table_lock and the lru_list_lock + held */ +static void __remove_from_queues(struct buffer_head *bh) +{ + __hash_unlink(bh); + __remove_from_lru_list(bh); +} + +static void remove_from_queues(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + __remove_from_queues(bh); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} + +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh, **p = &hash(dev, block); + + read_lock(&hash_table_lock); + + for (;;) { + bh = *p; + if (!bh) + break; + p = &bh->b_next; + if (bh->b_blocknr != block) + continue; + if (bh->b_size != size) + continue; + if (bh->b_dev != dev) + continue; + get_bh(bh); + break; + } + + read_unlock(&hash_table_lock); + return bh; +} + +void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); +} + +void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers); + spin_unlock(&lru_list_lock); +} + +/* The caller must have the lru_list lock before calling the + remove_inode_queue functions. 
*/ +static void __remove_inode_queue(struct buffer_head *bh) +{ + bh->b_inode = NULL; + list_del(&bh->b_inode_buffers); +} + +static inline void remove_inode_queue(struct buffer_head *bh) +{ + if (bh->b_inode) + __remove_inode_queue(bh); +} + +int inode_has_buffers(struct inode *inode) +{ + int ret; + + spin_lock(&lru_list_lock); + ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers); + spin_unlock(&lru_list_lock); + + return ret; +} + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash + dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to + be preserved. These buffers are simply skipped. + + We also skip buffers which are still in use. For example this can + happen if a userspace program is reading the block device. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completion) and + then an invalidate_buffers call that doesn't trash dirty buffers. + + For handling cache coherency with the blkdev pagecache the 'update' case + is been introduced. It is needed to re-read from disk any pinned + buffer. NOTE: re-reading from disk is destructive so we can do it only + when we assume nobody is changing the buffercache under our I/O and when + we think the disk contains more recent information than the buffercache. + The update == 1 pass marks the buffers we need to update, the update == 2 + pass does the actual I/O. */ +void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) +{ + int i, nlist, slept; + struct buffer_head * bh, * bh_next; + kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */ + + retry: + slept = 0; + spin_lock(&lru_list_lock); + for(nlist = 0; nlist < NR_LIST; nlist++) { + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { + bh_next = bh->b_next_free; + + /* Another device? */ + if (bh->b_dev != dev) + continue; + /* Not hashed? 
*/ + if (!bh->b_pprev) + continue; + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + slept = 1; + spin_lock(&lru_list_lock); + put_bh(bh); + } + + write_lock(&hash_table_lock); + /* All buffers in the lru lists are mapped */ + if (!buffer_mapped(bh)) + BUG(); + if (buffer_dirty(bh)) + printk("invalidate: dirty buffer\n"); + if (!atomic_read(&bh->b_count)) { + if (destroy_dirty_buffers || !buffer_dirty(bh)) { + remove_inode_queue(bh); + } + } else + printk("invalidate: busy buffer\n"); + + write_unlock(&hash_table_lock); + if (slept) + goto out; + } + } +out: + spin_unlock(&lru_list_lock); + if (slept) + goto retry; + + /* Get rid of the page cache */ + invalidate_inode_pages(bdev->bd_inode); +} + +void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) +{ + struct block_device *bdev = bdget(dev); + if (bdev) { + invalidate_bdev(bdev, destroy_dirty_buffers); + bdput(bdev); + } +} + +static void free_more_memory(void) +{ + balance_dirty(); + wakeup_bdflush(); + try_to_free_pages(GFP_NOFS); + run_task_queue(&tq_disk); + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); +} + +void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_list = BUF_CLEAN; + bh->b_end_io = handler; + bh->b_private = private; +} + +static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +{ + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + + mark_buffer_uptodate(bh, uptodate); + + /* This is a temporary buffer used for page I/O. */ + page = bh->b_page; + + if (!uptodate) + SetPageError(page); + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + * + * Async buffer_heads are here only as labels for IO, and get + * thrown away once the IO for this page is complete. IO is + * deemed complete once all buffers have been visited + * (b_count==0) and are now unlocked. We must make sure that + * only the _last_ buffer that decrements its count is the one + * that unlock the page.. + */ + spin_lock_irqsave(&page_uptodate_lock, flags); + mark_buffer_async(bh, 0); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async(tmp) && buffer_locked(tmp)) + goto still_busy; + tmp = tmp->b_this_page; + } + + /* OK, the async IO on this page is complete. */ + spin_unlock_irqrestore(&page_uptodate_lock, flags); + + /* + * if none of the buffers had errors then we can set the + * page uptodate: + */ + if (!PageError(page)) + SetPageUptodate(page); + + UnlockPage(page); + + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + +inline void set_buffer_async_io(struct buffer_head *bh) { + bh->b_end_io = end_buffer_io_async ; + mark_buffer_async(bh, 1); +} + +/* + * Synchronise all the inode's dirty buffers to the disk. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. 
+ * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. + */ + +int fsync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_buffers)) { + bh = BH_ENTRY(inode->i_dirty_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_buffers.prev); + remove_inode_queue(bh); + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_buffers(inode); + + if (err) + return err; + else + return err2; +} + +int fsync_inode_data_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct inode tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); + + spin_lock(&lru_list_lock); + + while (!list_empty(&inode->i_dirty_data_buffers)) { + bh = BH_ENTRY(inode->i_dirty_data_buffers.next); + list_del(&bh->b_inode_buffers); + if (!buffer_dirty(bh) && !buffer_locked(bh)) + bh->b_inode = NULL; + else { + bh->b_inode = &tmp; + list_add(&bh->b_inode_buffers, &tmp.i_dirty_data_buffers); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(&lru_list_lock); + } + } + } + + while (!list_empty(&tmp.i_dirty_data_buffers)) { + bh = BH_ENTRY(tmp.i_dirty_data_buffers.prev); + remove_inode_queue(bh); + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + } + + spin_unlock(&lru_list_lock); + err2 = osync_inode_data_buffers(inode); + + if (err) + return err; + else + return err2; +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. 
+ */ + +int osync_inode_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + +int osync_inode_data_buffers(struct inode *inode) +{ + struct buffer_head *bh; + struct list_head *list; + int err = 0; + + spin_lock(&lru_list_lock); + + repeat: + + for (list = inode->i_dirty_data_buffers.prev; + bh = BH_ENTRY(list), list != &inode->i_dirty_data_buffers; + list = bh->b_inode_buffers.prev) { + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(&lru_list_lock); + goto repeat; + } + } + + spin_unlock(&lru_list_lock); + return err; +} + + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + struct list_head * entry; + + spin_lock(&lru_list_lock); + while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) + remove_inode_queue(BH_ENTRY(entry)); + while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) + remove_inode_queue(BH_ENTRY(entry)); + spin_unlock(&lru_list_lock); +} + + +/* + * Ok, this is getblk, and it isn't very clear, again to hinder + * race-conditions. Most of the code is seldom used, (ie repeating), + * so it should be much more efficient than it looks. + * + * The algorithm is changed: hopefully better, and an elusive bug removed. + * + * 14.02.92: changed it to sync dirty buffers a bit: better performance + * when the filesystem starts to get full of dirty blocks (I hope). + */ +struct buffer_head * getblk(kdev_t dev, int block, int size) +{ + for (;;) { + struct buffer_head * bh; + + bh = get_hash_table(dev, block, size); + if (bh) + return bh; + + if (!grow_buffers(dev, block, size)) + free_more_memory(); + } +} + +/* -1 -> no need to flush + 0 -> async flush + 1 -> sync flush (wait for I/O completion) */ +static int balance_dirty_state(void) +{ + unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; + + dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; + dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT; + tot = nr_free_buffer_pages(); + + dirty *= 100; + soft_dirty_limit = tot * bdf_prm.b_un.nfract; + hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync; + + /* First, check for the "real" dirty limit. */ + if (dirty > soft_dirty_limit) { + if (dirty > hard_dirty_limit) + return 1; + return 0; + } + + return -1; +} + +/* + * if a new dirty buffer is created we need to balance bdflush. + * + * in the future we might want to make bdflush aware of different + * pressures on different devices - thus the (currently unused) + * 'dev' parameter. 
+ */ +void balance_dirty(void) +{ + int state = balance_dirty_state(); + + if (state < 0) + return; + + /* If we're getting into imbalance, start write-out */ + spin_lock(&lru_list_lock); + write_some_buffers(NODEV); + + /* + * And if we're _really_ out of balance, wait for + * some of the dirty/locked buffers ourselves and + * start bdflush. + * This will throttle heavy writers. + */ + if (state > 0) { + wait_for_some_buffers(NODEV); + wakeup_bdflush(); + } +} + +inline void __mark_dirty(struct buffer_head *bh) +{ + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; + refile_buffer(bh); +} + +/* atomic version, the user must call balance_dirty() by hand + as soon as it become possible to block */ +void __mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) + __mark_dirty(bh); +} + +void mark_buffer_dirty(struct buffer_head *bh) +{ + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + balance_dirty(); + } +} + +void set_buffer_flushtime(struct buffer_head *bh) +{ + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; +} +EXPORT_SYMBOL(set_buffer_flushtime); + +/* + * A buffer may need to be moved from one buffer list to another + * (e.g. in case it is not shared any more). Handle this. + */ +static void __refile_buffer(struct buffer_head *bh) +{ + int dispose = BUF_CLEAN; + if (buffer_locked(bh)) + dispose = BUF_LOCKED; + if (buffer_dirty(bh)) + dispose = BUF_DIRTY; + if (dispose != bh->b_list) { + __remove_from_lru_list(bh); + bh->b_list = dispose; + if (dispose == BUF_CLEAN) + remove_inode_queue(bh); + __insert_into_lru_list(bh, dispose); + } +} + +void refile_buffer(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + __refile_buffer(bh); + spin_unlock(&lru_list_lock); +} + +/* + * Release a buffer head + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); +} + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. + */ +void __bforget(struct buffer_head * buf) +{ + mark_buffer_clean(buf); + __brelse(buf); +} + +/** + * bread() - reads a specified block and returns the bh + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that + * contains it. It returns NULL if the block was unreadable. + */ +struct buffer_head * bread(kdev_t dev, int block, int size) +{ + struct buffer_head * bh; + + bh = getblk(dev, block, size); + touch_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* + * Note: the caller should wake up the buffer_wait list if needed. + */ +static void __put_unused_buffer_head(struct buffer_head * bh) +{ + if (bh->b_inode) + BUG(); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { + kmem_cache_free(bh_cachep, bh); + } else { + bh->b_dev = B_FREE; + bh->b_blocknr = -1; + bh->b_this_page = NULL; + + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + unused_list = bh; + } +} + +void put_unused_buffer_head(struct buffer_head *bh) +{ + spin_lock(&unused_list_lock); + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); +} +EXPORT_SYMBOL(put_unused_buffer_head); + +/* + * Reserve NR_RESERVED buffer heads for async IO requests to avoid + * no-buffer-head deadlock. Return NULL on failure; waiting for + * buffer heads is now handled in create_buffers(). 
+ */ +struct buffer_head * get_unused_buffer_head(int async) +{ + struct buffer_head * bh; + + spin_lock(&unused_list_lock); + if (nr_unused_buffer_heads > NR_RESERVED) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + + /* This is critical. We can't call out to the FS + * to get more buffer heads, because the FS may need + * more buffer-heads itself. Thus SLAB_NOFS. + */ + if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) { + bh->b_blocknr = -1; + bh->b_this_page = NULL; + return bh; + } + + /* + * If we need an async buffer, use the reserved buffer heads. + */ + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) { + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; + } + spin_unlock(&unused_list_lock); + } + + return NULL; +} +EXPORT_SYMBOL(get_unused_buffer_head); + +void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) +{ + bh->b_page = page; + if (offset >= PAGE_SIZE) + BUG(); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * The async flag is used to differentiate async IO (paging, swapping) + * from ordinary buffer allocations, and only async requests are allowed + * to sleep waiting for buffer heads. + */ +static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = get_unused_buffer_head(async); + if (!bh) + goto no_grow; + + bh->b_dev = NODEV; + bh->b_this_page = head; + head = bh; + + bh->b_state = 0; + bh->b_next_free = NULL; + bh->b_pprev = NULL; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + set_bh_page(bh, page, offset); + + bh->b_list = BUF_CLEAN; + bh->b_end_io = NULL; + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + spin_lock(&unused_list_lock); + do { + bh = head; + head = head->b_this_page; + __put_unused_buffer_head(bh); + } while (head); + spin_unlock(&unused_list_lock); + + /* Wake up any waiters ... */ + wake_up(&buffer_wait); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!async) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + run_task_queue(&tq_disk); + + free_more_memory(); + goto try_again; +} + +/* + * Called when truncating a buffer on a page completely. 
+ */ +static void discard_buffer(struct buffer_head * bh) +{ + if (buffer_mapped(bh)) { + mark_buffer_clean(bh); + lock_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + remove_from_queues(bh); + unlock_buffer(bh); + } +} + +/** + * try_to_release_page - release old fs-specific metadata on a page + * + */ + +int try_to_release_page(struct page * page, int gfp_mask) +{ + if (!PageLocked(page)) + BUG(); + + if (!page->mapping) + goto try_to_free; + if (!page->mapping->a_ops->releasepage) + goto try_to_free; + if (page->mapping->a_ops->releasepage(page, gfp_mask)) + goto try_to_free; + /* + * We couldn't release buffer metadata; don't even bother trying + * to release buffers. + */ + return 0; +try_to_free: + return try_to_free_buffers(page, gfp_mask); +} + +/* + * We don't have to release all buffers here, but + * we have to be sure that no dirty buffer is left + * and no IO is going on (no buffer is locked), because + * we have truncated the file and are going to free the + * blocks on-disk.. + */ +int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + if (!PageLocked(page)) + BUG(); + if (!page->buffers) + return 1; + + head = page->buffers; + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully flushed? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * subtle. We release buffer-heads only if this is + * the 'final' flushpage. We have invalidated the get_block + * cached value unconditionally, so real IO is not + * possible anymore. + * + * If the free doesn't work out, the buffers can be + * left around - they just turn into anonymous buffers + * instead. + */ + if (!offset) { + if (!try_to_release_page(page, 0)) + return 0; + } + + return 1; +} + +void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize) +{ + struct buffer_head *bh, *head, *tail; + + /* FIXME: create_buffers should fail if there's no enough memory */ + head = create_buffers(page, blocksize, 1); + if (page->buffers) + BUG(); + + bh = head; + do { + bh->b_dev = dev; + bh->b_blocknr = 0; + bh->b_end_io = NULL; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + */ + +static void unmap_underlying_metadata(struct buffer_head * bh) +{ + struct buffer_head *old_bh; + + old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + if (old_bh) { + mark_buffer_clean(old_bh); + wait_on_buffer(old_bh); + clear_bit(BH_Req, &old_bh->b_state); + __brelse(old_bh); + } +} + +/* + * NOTE! 
All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * block_write_full_page() is SMP threaded - the kernel lock is not held. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) +{ + int err, i; + unsigned long block; + struct buffer_head *bh, *head; + + if (!PageLocked(page)) + BUG(); + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits); + head = page->buffers; + + block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + bh = head; + i = 0; + + /* Stage 1: make sure we have all the buffers mapped! */ + do { + /* + * If the buffer isn't up-to-date, we can't be sure + * that the buffer has been initialized with the proper + * block number information etc.. + * + * Leave it to the low-level FS to make all those + * decisions (block #0 may actually be a valid block) + */ + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) + unmap_underlying_metadata(bh); + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* Stage 2: lock the buffers, mark them clean */ + do { + lock_buffer(bh); + set_buffer_async_io(bh); + set_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 3: submit the IO */ + do { + struct buffer_head *next = bh->b_this_page; + submit_bh(WRITE, bh); + bh = next; + } while (bh != head); + + /* Done - end_buffer_io_async will unlock */ + SetPageUptodate(page); + return 0; + +out: + ClearPageUptodate(page); + UnlockPage(page); + return err; +} + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + unsigned long block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + char *kaddr = kmap(page); + + blocksize = 1 << inode->i_blkbits; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + bbits = inode->i_blkbits; + block = page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + if (!bh) + BUG(); + block_end = block_start+blocksize; + if (block_end <= from) + continue; + if (block_start >= to) + break; + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh); + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (block_end > to) + memset(kaddr+to, 0, block_end-to); + if (block_start < from) + memset(kaddr+block_start, 0, from-block_start); + if (block_end > to || block_start < from) + flush_dcache_page(page); + continue; + } + } + if (Page_Uptodate(page)) { + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + if (!buffer_uptodate(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. 
+ */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + err = -EIO; + if (!buffer_uptodate(*wait_bh)) + goto out; + } + return 0; +out: + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0, need_balance_dirty = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page->buffers, block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_bit(BH_Uptodate, &bh->b_state); + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + buffer_insert_inode_data_queue(bh, inode); + need_balance_dirty = 1; + } + } + } + + if (need_balance_dirty) + balance_dirty(); + /* + * is this a partial write that happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' wether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * mark_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize, blocks; + int nr, i; + + if (!PageLocked(page)) + PAGE_BUG(page); + blocksize = 1 << inode->i_blkbits; + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + head = page->buffers; + + blocks = PAGE_CACHE_SIZE >> inode->i_blkbits; + iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + if (iblock < lblock) { + if (get_block(inode, iblock, bh, 0)) + continue; + } + if (!buffer_mapped(bh)) { + memset(kmap(page) + i*blocksize, 0, blocksize); + flush_dcache_page(page); + kunmap(page); + set_bit(BH_Uptodate, &bh->b_state); + continue; + } + /* get_block() might have updated the buffer synchronously */ + if (buffer_uptodate(bh)) + continue; + } + + arr[nr] = bh; + nr++; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (!nr) { + /* + * all buffers are uptodate - we can set the page + * uptodate as well. + */ + SetPageUptodate(page); + UnlockPage(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + lock_buffer(bh); + set_buffer_async_io(bh); + } + + /* Stage 3: start the IO */ + for (i = 0; i < nr; i++) + submit_bh(READ, arr[i]); + + return 0; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. 
+ */ + +int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + unsigned long pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = 1 << inode->i_blkbits; + char *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + UnlockPage(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = page_address(new_page); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + kaddr = page_address(page); + if (zerofrom < offset) { + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + kunmap(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) { + ClearPageUptodate(page); + kunmap(page); + } + return err; +} + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + kunmap(page); + return 0; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + kunmap(page); + if (pos > inode->i_size) { + inode->i_size = pos; + mark_inode_dirty(inode); + } + return 0; +} + +int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? 
Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + __mark_buffer_dirty(bh); + err = 0; + +unlock: + UnlockPage(page); + page_cache_release(page); +out: + return err; +} + +int block_write_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int err; + + /* easy case */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block); + + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) { + UnlockPage(page); + return -EIO; + } + + /* Sigh... will have to work, then... */ + err = __block_prepare_write(inode, page, 0, offset, get_block); + if (!err) { + memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + __block_commit_write(inode,page,0,offset); +done: + kunmap(page); + UnlockPage(page); + return err; + } + ClearPageUptodate(page); + goto done; +} + +int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) +{ + int i, nr_blocks, retval; + unsigned long * blocks = iobuf->blocks; + + nr_blocks = iobuf->length / blocksize; + /* build the blocklist */ + for (i = 0; i < nr_blocks; i++, blocknr++) { + struct buffer_head bh; + + bh.b_state = 0; + bh.b_dev = inode->i_dev; + bh.b_size = blocksize; + + retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1); + if (retval) + goto out; + + if (rw == READ) { + if (buffer_new(&bh)) + BUG(); + if (!buffer_mapped(&bh)) { + /* there was an hole in the filesystem */ + blocks[i] = -1UL; + continue; + } + } else { + if (buffer_new(&bh)) + unmap_underlying_metadata(&bh); + if (!buffer_mapped(&bh)) + BUG(); + } + blocks[i] = bh.b_blocknr; + } + + retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize); + + out: + return retval; +} + +/* + * IO completion routine for a buffer_head being used for kiobuf IO: we + * can't dispatch the kiobuf callback until io_count reaches 0. 
+ */ + +static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) +{ + struct kiobuf *kiobuf; + + mark_buffer_uptodate(bh, uptodate); + + kiobuf = bh->b_private; + unlock_buffer(bh); + end_kio_request(kiobuf, uptodate); +} + +/* + * For brw_kiovec: submit a set of buffer_head temporary IOs and wait + * for them to complete. Clean up the buffer_heads afterwards. + */ + +static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) +{ + int iosize, err; + int i; + struct buffer_head *tmp; + + iosize = 0; + err = 0; + + for (i = nr; --i >= 0; ) { + iosize += size; + tmp = bh[i]; + if (buffer_locked(tmp)) { + wait_on_buffer(tmp); + } + + if (!buffer_uptodate(tmp)) { + /* We are traversing bh'es in reverse order so + clearing iosize on error calculates the + amount of IO before the first error. */ + iosize = 0; + err = -EIO; + } + } + + if (iosize) + return iosize; + return err; +} + +/* + * Start I/O on a physical range of kernel memory, defined by a vector + * of kiobuf structs (much like a user-space iovec list). + * + * The kiobuf must already be locked for IO. IO is submitted + * asynchronously: you need to check page->locked, page->uptodate, and + * maybe wait on page->wait. + * + * It is up to the caller to make sure that there are enough blocks + * passed in to completely map the iobufs to disk. + */ + +int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], + kdev_t dev, unsigned long b[], int size) +{ + int err; + int length; + int transferred; + int i; + int bufind; + int pageind; + int bhind; + int offset; + unsigned long blocknr; + struct kiobuf * iobuf = NULL; + struct page * map; + struct buffer_head *tmp, **bhs = NULL; + + if (!nr) + return 0; + + /* + * First, do some alignment and validity checks + */ + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + if ((iobuf->offset & (size-1)) || + (iobuf->length & (size-1))) + return -EINVAL; + if (!iobuf->nr_pages) + panic("brw_kiovec: iobuf not initialised"); + } + + /* + * OK to walk down the iovec doing page IO on each page we find. 
+ */ + bufind = bhind = transferred = err = 0; + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + offset = iobuf->offset; + length = iobuf->length; + iobuf->errno = 0; + if (!bhs) + bhs = iobuf->bh; + + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { + map = iobuf->maplist[pageind]; + if (!map) { + err = -EFAULT; + goto finished; + } + + while (length > 0) { + blocknr = b[bufind++]; + if (blocknr == -1UL) { + if (rw == READ) { + /* there was an hole in the filesystem */ + memset(kmap(map) + offset, 0, size); + flush_dcache_page(map); + kunmap(map); + + transferred += size; + goto skip_block; + } else + BUG(); + } + tmp = bhs[bhind++]; + + tmp->b_size = size; + set_bh_page(tmp, map, offset); + tmp->b_this_page = tmp; + + init_buffer(tmp, end_buffer_io_kiobuf, iobuf); + tmp->b_dev = dev; + tmp->b_blocknr = blocknr; + tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); + + if (rw == WRITE) { + set_bit(BH_Uptodate, &tmp->b_state); + clear_bit(BH_Dirty, &tmp->b_state); + } else + set_bit(BH_Uptodate, &tmp->b_state); + + atomic_inc(&iobuf->io_count); + submit_bh(rw, tmp); + /* + * Wait for IO if we have got too much + */ + if (bhind >= KIO_MAX_SECTORS) { + kiobuf_wait_for_io(iobuf); /* wake-one */ + err = wait_kio(rw, bhind, bhs, size); + if (err >= 0) + transferred += err; + else + goto finished; + bhind = 0; + } + + skip_block: + length -= size; + offset += size; + + if (offset >= PAGE_SIZE) { + offset = 0; + break; + } + } /* End of block loop */ + } /* End of page loop */ + } /* End of iovec loop */ + + /* Is there any IO still left to submit? */ + if (bhind) { + kiobuf_wait_for_io(iobuf); /* wake-one */ + err = wait_kio(rw, bhind, bhs, size); + if (err >= 0) + transferred += err; + else + goto finished; + } + + finished: + if (transferred) + return transferred; + return err; +} + +/* + * Start I/O on a page. + * This function expects the page to be locked and may return + * before I/O is complete. You then have to check page->locked, + * page->uptodate, and maybe wait on page->wait. + * + * brw_page() is SMP-safe, although it's being called with the + * kernel lock held - but the code is ready. + * + * FIXME: we need a swapper_inode->get_block function to remove + * some of the bmap kludges and interface ugliness here. 
+ */ +int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) +{ + struct buffer_head *head, *bh; + + if (!PageLocked(page)) + panic("brw_page: page not locked for I/O"); + + if (!page->buffers) + create_empty_buffers(page, dev, size); + head = bh = page->buffers; + + /* Stage 1: lock all the buffers */ + do { + lock_buffer(bh); + bh->b_blocknr = *(b++); + set_bit(BH_Mapped, &bh->b_state); + set_buffer_async_io(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 2: start the IO */ + do { + struct buffer_head *next = bh->b_this_page; + submit_bh(rw, bh); + bh = next; + } while (bh != head); + return 0; +} + +int block_symlink(struct inode *inode, const char *symname, int len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + int err = -ENOMEM; + char *kaddr; + + if (!page) + goto fail; + err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); + if (err) + goto fail_map; + kaddr = page_address(page); + memcpy(kaddr, symname, len-1); + mapping->a_ops->commit_write(NULL, page, 0, len-1); + /* + * Notice that we are _not_ going to block here - end of page is + * unmapped, so this will only try to map the rest of page, see + * that it is unmapped (typically even will not look into inode - + * ->i_size will be enough for everything) and zero it out. + * OTOH it's obviously correct and should make the page up-to-date. + */ + err = mapping->a_ops->readpage(NULL, page); + wait_on_page(page); + page_cache_release(page); + if (err < 0) + goto fail; + mark_inode_dirty(inode); + return 0; +fail_map: + UnlockPage(page); + page_cache_release(page); +fail: + return err; +} + +static inline void link_dev_buffers(struct page * page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} + +/* + * Create the page-cache page that contains the requested block + */ +static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size) +{ + struct page * page; + struct buffer_head *bh; + + page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS); + if (IS_ERR(page)) + return NULL; + + if (!PageLocked(page)) + BUG(); + + bh = page->buffers; + if (bh) { + if (bh->b_size == size) + return page; + if (!try_to_free_buffers(page, GFP_NOFS)) + goto failed; + } + + bh = create_buffers(page, size, 0); + if (!bh) + goto failed; + link_dev_buffers(page, bh); + return page; + +failed: + UnlockPage(page); + page_cache_release(page); + return NULL; +} + +static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size) +{ + struct buffer_head *head = page->buffers; + struct buffer_head *bh = head; + unsigned int uptodate; + + uptodate = 1 << BH_Mapped; + if (Page_Uptodate(page)) + uptodate |= 1 << BH_Uptodate; + + write_lock(&hash_table_lock); + do { + if (!(bh->b_state & (1 << BH_Mapped))) { + init_buffer(bh, NULL, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; + bh->b_state = uptodate; + } + + /* Insert the buffer into the hash lists if necessary */ + if (!bh->b_pprev) + __insert_into_hash_list(bh); + + block++; + bh = bh->b_this_page; + } while (bh != head); + write_unlock(&hash_table_lock); +} + +/* + * Try to increase the number of buffers available: the size argument + * is used to determine what kind of buffers we want. 
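brw_page() and link_dev_buffers() above both depend on the buffers of a page forming a circular singly linked ring through b_this_page: link_dev_buffers() walks an open chain to its tail and points the tail back at the head, and the do/while loops then visit each buffer exactly once. A stripped-down user-space sketch of that ring (hypothetical struct buf, not the kernel's buffer_head):

  #include <stdio.h>

  struct buf {
          int blocknr;
          struct buf *next;               /* plays the role of b_this_page */
  };

  /* Close an open, NULL-terminated chain into a ring, as link_dev_buffers()
   * does for the buffers attached to a page. */
  static void close_ring(struct buf *head)
  {
          struct buf *tail = head;

          while (tail->next)
                  tail = tail->next;
          tail->next = head;
  }

  int main(void)
  {
          struct buf b[4];
          struct buf *cur;
          int i;

          for (i = 0; i < 4; i++) {
                  b[i].blocknr = 100 + i;
                  b[i].next = (i < 3) ? &b[i + 1] : NULL;
          }
          close_ring(&b[0]);

          cur = &b[0];                    /* walk the ring once, brw_page() style */
          do {
                  printf("block %d\n", cur->blocknr);
                  cur = cur->next;
          } while (cur != &b[0]);
          return 0;
  }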
+ */
+static int grow_buffers(kdev_t dev, unsigned long block, int size)
+{
+        struct page * page;
+        struct block_device *bdev;
+        unsigned long index;
+        int sizebits;
+
+        /* Size must be multiple of hard sectorsize */
+        if (size & (get_hardsect_size(dev)-1))
+                BUG();
+        /* Size must be within 512 bytes and PAGE_SIZE */
+        if (size < 512 || size > PAGE_SIZE)
+                BUG();
+
+        sizebits = -1;
+        do {
+                sizebits++;
+        } while ((size << sizebits) < PAGE_SIZE);
+
+        index = block >> sizebits;
+        block = index << sizebits;
+
+        bdev = bdget(kdev_t_to_nr(dev));
+        if (!bdev) {
+                printk("No block device for %s\n", kdevname(dev));
+                BUG();
+        }
+
+        /* Create a page with the proper size buffers.. */
+        page = grow_dev_page(bdev, index, size);
+
+        /* This is "wrong" - talk to Al Viro */
+        atomic_dec(&bdev->bd_count);
+        if (!page)
+                return 0;
+
+        /* Hash in the buffers on the hash list */
+        hash_page_buffers(page, dev, block, size);
+        UnlockPage(page);
+        page_cache_release(page);
+
+        /* We hashed up this page, so increment buffermem */
+        atomic_inc(&buffermem_pages);
+        return 1;
+}
+
+static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask)
+{
+        struct buffer_head * bh = head;
+        int tryagain = 0;
+
+        do {
+                if (!buffer_dirty(bh) && !buffer_locked(bh))
+                        continue;
+
+                /* Don't start IO first time around.. */
+                if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
+                        continue;
+
+                /* Second time through we start actively writing out.. */
+                if (test_and_set_bit(BH_Lock, &bh->b_state)) {
+                        if (!test_bit(BH_launder, &bh->b_state))
+                                continue;
+                        wait_on_buffer(bh);
+                        tryagain = 1;
+                        continue;
+                }
+
+                if (!atomic_set_buffer_clean(bh)) {
+                        unlock_buffer(bh);
+                        continue;
+                }
+
+                __mark_buffer_clean(bh);
+                get_bh(bh);
+                set_bit(BH_launder, &bh->b_state);
+                bh->b_end_io = end_buffer_io_sync;
+                submit_bh(WRITE, bh);
+                tryagain = 0;
+        } while ((bh = bh->b_this_page) != head);
+
+        return tryagain;
+}
+
+/*
+ * Can the buffer be thrown out?
+ */
+#define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+/*
+ * try_to_free_buffers() checks if all the buffers on this particular page
+ * are unused, and free's the page if so.
+ *
+ * Wake up bdflush() if this fails - if we're running low on memory due
+ * to dirty buffers, we need to flush them out as quickly as possible.
+ *
+ * NOTE: There are quite a number of ways that threads of control can
+ * obtain a reference to a buffer head within a page. So we must
+ * lock out all of these paths to cleanly toss the page.
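grow_buffers() above turns a block number into a page cache index with pure shift arithmetic: sizebits ends up as log2(PAGE_SIZE / size), index = block >> sizebits selects the page, and block = index << sizebits rounds the block number down to the first block backed by that page. A quick user-space replay of the calculation (4096-byte pages assumed):

  #include <stdio.h>

  #define PAGE_SIZE 4096UL

  int main(void)
  {
          unsigned long block = 1027;     /* arbitrary example block     */
          unsigned long index;
          int size = 1024;                /* 1K buffers, four per page   */
          int sizebits = -1;

          do {
                  sizebits++;
          } while ((size << sizebits) < PAGE_SIZE);

          index = block >> sizebits;      /* page that holds the block   */
          block = index << sizebits;      /* first block of that page    */

          /* prints: sizebits=2 index=256 first_block=1024 */
          printf("sizebits=%d index=%lu first_block=%lu\n",
                 sizebits, index, block);
          return 0;
  }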
+ */ +int try_to_free_buffers(struct page * page, unsigned int gfp_mask) +{ + struct buffer_head * tmp, * bh = page->buffers; + +cleaned_buffers_try_again: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + tmp = bh; + do { + if (buffer_busy(tmp)) + goto busy_buffer_page; + tmp = tmp->b_this_page; + } while (tmp != bh); + + spin_lock(&unused_list_lock); + tmp = bh; + + /* if this buffer was hashed, this page counts as buffermem */ + if (bh->b_pprev) + atomic_dec(&buffermem_pages); + do { + struct buffer_head * p = tmp; + tmp = tmp->b_this_page; + + if (p->b_dev == B_FREE) BUG(); + + remove_inode_queue(p); + __remove_from_queues(p); + __put_unused_buffer_head(p); + } while (tmp != bh); + spin_unlock(&unused_list_lock); + + /* Wake up anyone waiting for buffer heads */ + wake_up(&buffer_wait); + + /* And free the page */ + page->buffers = NULL; + page_cache_release(page); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return 1; + +busy_buffer_page: + /* Uhhuh, start writeback so that we don't end up with all dirty pages */ + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + if (gfp_mask & __GFP_IO) { + if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { + if (sync_page_buffers(bh, gfp_mask)) { + /* no IO or waiting next time */ + gfp_mask = 0; + goto cleaned_buffers_try_again; + } + } + } + if (balance_dirty_state() >= 0) + wakeup_bdflush(); + return 0; +} +EXPORT_SYMBOL(try_to_free_buffers); + +/* ================== Debugging =================== */ + +void show_buffers(void) +{ +#ifdef CONFIG_SMP + struct buffer_head * bh; + int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; + int nlist; + static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", }; +#endif + + printk("Buffer memory: %6dkB\n", + atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); + +#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */ + if (!spin_trylock(&lru_list_lock)) + return; + for(nlist = 0; nlist < NR_LIST; nlist++) { + found = locked = dirty = used = lastused = 0; + bh = lru_list[nlist]; + if(!bh) continue; + + do { + found++; + if (buffer_locked(bh)) + locked++; + if (buffer_dirty(bh)) + dirty++; + if (atomic_read(&bh->b_count)) + used++, lastused = found; + bh = bh->b_next_free; + } while (bh != lru_list[nlist]); + { + int tmp = nr_buffers_type[nlist]; + if (found != tmp) + printk("%9s: BUG -> found %d, reported %d\n", + buf_types[nlist], found, tmp); + } + printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " + "%d locked, %d dirty\n", + buf_types[nlist], found, size_buffers_type[nlist]>>10, + used, lastused, locked, dirty); + } + spin_unlock(&lru_list_lock); +#endif +} + +/* ===================== Init ======================= */ + +/* + * allocate the hash table and init the free list + * Use gfp() for the hash table to decrease TLB misses, use + * SLAB cache for buffer heads. + */ +void __init buffer_init(unsigned long mempages) +{ + int order, i; + unsigned int nr_hash; + + /* The buffer cache hash table is less important these days, + * trim it a bit. 
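Stepping back to try_to_free_buffers() above: the busy_buffer_page path retries at most once. After sync_page_buffers() has queued writeback, the code jumps back to cleaned_buffers_try_again with gfp_mask forced to 0, so the second attempt can neither start more I/O nor block. A minimal model of that control flow (the predicates here are stand-ins, not kernel calls):

  #include <stdio.h>

  #define GFP_IO 0x01                     /* stand-in for __GFP_IO */

  static int all_buffers_free(int pass)
  {
          return pass > 0;                /* pretend writeback freed them */
  }

  static int try_to_free(unsigned int gfp_mask)
  {
          int pass = 0;

  try_again:
          if (all_buffers_free(pass))
                  return 1;               /* page can be released              */
          if (gfp_mask & GFP_IO) {
                  pass++;                 /* start writeback once...           */
                  gfp_mask = 0;           /* ...then retry without I/O or waits */
                  goto try_again;
          }
          return 0;                       /* still busy, give up               */
  }

  int main(void)
  {
          printf("%d\n", try_to_free(GFP_IO));    /* 1, freed after one retry */
          printf("%d\n", try_to_free(0));         /* 0, not allowed to do I/O */
          return 0;
  }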
+ */ + mempages >>= 14; + + mempages *= sizeof(struct buffer_head *); + + for (order = 0; (1 << order) < mempages; order++) + ; + + /* try to allocate something until we get it or we're asking + for something that is really too small */ + + do { + unsigned long tmp; + + nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *); + bh_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + bh_hash_shift = 0; + while((tmp >>= 1UL) != 0UL) + bh_hash_shift++; + + hash_table = (struct buffer_head **) + __get_free_pages(GFP_ATOMIC, order); + } while (hash_table == NULL && --order > 0); + printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!hash_table) + panic("Failed to allocate buffer hash table\n"); + + /* Setup hash chains. */ + for(i = 0; i < nr_hash; i++) + hash_table[i] = NULL; + + /* Setup lru lists. */ + for(i = 0; i < NR_LIST; i++) + lru_list[i] = NULL; + +} + + +/* ====================== bdflush support =================== */ + +/* This is a simple kernel daemon, whose job it is to provide a dynamic + * response to dirty buffers. Once this process is activated, we write back + * a limited number of buffers to the disks and then go back to sleep again. + */ + +DECLARE_WAIT_QUEUE_HEAD(bdflush_wait); + +void wakeup_bdflush(void) +{ + wake_up_interruptible(&bdflush_wait); +} + +/* + * Here we attempt to write back old buffers. We also try to flush inodes + * and supers as well, since this function is essentially "update", and + * otherwise there would be no way of ensuring that these quantities ever + * get written back. Ideally, we would have a timestamp on the inodes + * and superblocks so that we could write back only the old ones as well + */ + +static int sync_old_buffers(void) +{ + lock_kernel(); + sync_unlocked_inodes(); + sync_supers(0); + unlock_kernel(); + + for (;;) { + struct buffer_head *bh; + + spin_lock(&lru_list_lock); + bh = lru_list[BUF_DIRTY]; + if (!bh || time_before(jiffies, bh->b_flushtime)) + break; + if (write_some_buffers(NODEV)) + continue; + return 0; + } + spin_unlock(&lru_list_lock); + return 0; +} + +int block_sync_page(struct page *page) +{ + run_task_queue(&tq_disk); + return 0; +} + +/* This is the interface to bdflush. As we get more sophisticated, we can + * pass tuning parameters to this "process", to adjust how it behaves. + * We would want to verify each parameter, however, to make sure that it + * is reasonable. */ + +asmlinkage long sys_bdflush(int func, long data) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (func == 1) { + /* do_exit directly and let kupdate to do its work alone. */ + do_exit(0); +#if 0 /* left here as it's the only example of lazy-mm-stuff used from + a syscall that doesn't care about the current mm context. */ + int error; + struct mm_struct *user_mm; + + /* + * bdflush will spend all of it's time in kernel-space, + * without touching user-space, so we can switch it into + * 'lazy TLB mode' to reduce the cost of context-switches + * to and from bdflush. 
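The sizing arithmetic in buffer_init() above works out to roughly one hash bucket per four pages of RAM on a 32-bit machine with 4 KB pages, power-of-two rounding aside: mempages >> 14 scaled by the pointer size is treated as the page order to allocate, and nr_hash is however many bucket pointers fit in those pages. A user-space replay of the calculation for a 128 MB machine (4 KB pages and 4-byte pointers assumed):

  #include <stdio.h>

  #define PAGE_SIZE 4096UL
  #define PTR_SIZE  4UL           /* sizeof(struct buffer_head *) on 32-bit */

  int main(void)
  {
          unsigned long mempages = 32768; /* 128 MB of 4 KB pages */
          unsigned long nr_hash, tmp;
          int order, shift = 0;

          mempages >>= 14;
          mempages *= PTR_SIZE;
          for (order = 0; (1UL << order) < mempages; order++)
                  ;

          nr_hash = (PAGE_SIZE << order) / PTR_SIZE;
          tmp = nr_hash;
          while ((tmp >>= 1) != 0)
                  shift++;

          /* prints: order=3 nr_hash=8192 mask=0x1fff shift=13 */
          printf("order=%d nr_hash=%lu mask=%#lx shift=%d\n",
                 order, nr_hash, nr_hash - 1, shift);
          return 0;
  }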
+ */ + user_mm = start_lazy_tlb(); + error = sync_old_buffers(); + end_lazy_tlb(user_mm); + return error; +#endif + } + + /* Basically func 1 means read param 1, 2 means write param 1, etc */ + if (func >= 2) { + int i = (func-2) >> 1; + if (i >= 0 && i < N_PARAM) { + if ((func & 1) == 0) + return put_user(bdf_prm.data[i], (int*)data); + + if (data >= bdflush_min[i] && data <= bdflush_max[i]) { + bdf_prm.data[i] = data; + return 0; + } + } + return -EINVAL; + } + + /* Having func 0 used to launch the actual bdflush and then never + * return (unless explicitly killed). We return zero here to + * remain semi-compatible with present update(8) programs. + */ + return 0; +} + +/* + * This is the actual bdflush daemon itself. It used to be started from + * the syscall above, but now we launch it ourselves internally with + * kernel_thread(...) directly after the first thread in init/main.c + */ +int bdflush(void *startup) +{ + struct task_struct *tsk = current; + + /* + * We have a bare-bones task_struct, and really should fill + * in a few more things so "top" and /proc/2/{exe,root,cwd} + * display semi-sane things. Not real crucial though... + */ + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "bdflush"); + + /* avoid getting signals */ + spin_lock_irq(&tsk->sigmask_lock); + flush_signals(tsk); + sigfillset(&tsk->blocked); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + complete((struct completion *)startup); + + for (;;) { + CHECK_EMERGENCY_SYNC + + spin_lock(&lru_list_lock); + if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) { + wait_for_some_buffers(NODEV); + interruptible_sleep_on(&bdflush_wait); + } + } +} + +/* + * This is the kernel update daemon. It was used to live in userspace + * but since it's need to run safely we want it unkillable by mistake. + * You don't need to change your userspace configuration since + * the userspace `update` will do_exit(0) at the first sys_bdflush(). 
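The func argument of sys_bdflush() above packs both a parameter index and a direction: for func >= 2, i = (func - 2) >> 1 selects the tuning parameter and the low bit picks the operation, even values reading the parameter back with put_user() and odd values writing it after the bdflush_min/bdflush_max range check. A small user-space decoder of the first few values:

  #include <stdio.h>

  int main(void)
  {
          int func;

          /* func 2,3 -> param 0; 4,5 -> param 1; 6,7 -> param 2; ... */
          for (func = 2; func < 8; func++) {
                  int i = (func - 2) >> 1;
                  const char *op = (func & 1) ? "write" : "read";

                  printf("func %2d -> %s bdf_prm.data[%d]\n", func, op, i);
          }
          return 0;
  }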
+ */ +int kupdate(void *startup) +{ + struct task_struct * tsk = current; + int interval; + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kupdated"); + + /* sigstop and sigcont will stop and wakeup kupdate */ + spin_lock_irq(&tsk->sigmask_lock); + sigfillset(&tsk->blocked); + siginitsetinv(¤t->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP)); + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + + complete((struct completion *)startup); + + for (;;) { + wait_for_some_buffers(NODEV); + + /* update interval */ + interval = bdf_prm.b_un.interval; + if (interval) { + tsk->state = TASK_INTERRUPTIBLE; + schedule_timeout(interval); + } else { + stop_kupdate: + tsk->state = TASK_STOPPED; + schedule(); /* wait for SIGCONT */ + } + /* check for sigstop */ + if (signal_pending(tsk)) { + int stopped = 0; + spin_lock_irq(&tsk->sigmask_lock); + if (sigismember(&tsk->pending.signal, SIGSTOP)) { + sigdelset(&tsk->pending.signal, SIGSTOP); + stopped = 1; + } + recalc_sigpending(tsk); + spin_unlock_irq(&tsk->sigmask_lock); + if (stopped) + goto stop_kupdate; + } +#ifdef DEBUG + printk(KERN_DEBUG "kupdate() activated...\n"); +#endif + sync_old_buffers(); + } +} + +static int __init bdflush_init(void) +{ + static struct completion startup __initdata = COMPLETION_INITIALIZER(startup); + + kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + wait_for_completion(&startup); + kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + wait_for_completion(&startup); + return 0; +} + +module_init(bdflush_init) + diff -urN linux-2.4.17-rc1-virgin/fs/dcache.c linux-2.4.17-rc1-wli3/fs/dcache.c --- linux-2.4.17-rc1-virgin/fs/dcache.c Fri Dec 14 06:04:11 2001 +++ linux-2.4.17-rc1-wli3/fs/dcache.c Sun Dec 16 23:49:37 2001 @@ -320,11 +320,24 @@ void prune_dcache(int count) { + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); + +redo: for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_LOCK_COUNT(100)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&dcache_lock); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -480,6 +493,8 @@ struct list_head *next; int found = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); repeat: next = this_parent->d_subdirs.next; @@ -493,6 +508,12 @@ list_add(&dentry->d_lru, dentry_unused.prev); found++; } + if (TEST_LOCK_COUNT(500) && found > 10) { + debug_lock_break(1); + if (conditional_schedule_needed()) + goto out; + RESET_LOCK_COUNT(); + } /* * Descend a level if the d_subdirs list is non-empty. */ @@ -517,6 +538,7 @@ #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -546,6 +568,11 @@ * 0 - very urgent: shrink everything * ... * 6 - base-level: try to shrink a bit. + * + * Chuck Lever's dcache hash function relies on the aggressive + * shrinking where dentry_stat.nr_used is divided by priority. + * I added in a check for a priority of 0 to avoid division by 0. + * -- wli */ int shrink_dcache_memory(int priority, unsigned int gfp_mask) { @@ -565,6 +592,9 @@ if (!(gfp_mask & __GFP_FS)) return 0; + if(!priority) + BUG(); + count = dentry_stat.nr_unused / priority; prune_dcache(count); @@ -683,10 +713,45 @@ return res; } +/* + * The mult + shift 11 hash function from Chuck Lever's paper + * This apparently requires help from shrink_dcache_memory() + * and so that is added. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 8 describes the hash function. 
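The prune_dcache() and select_parent() hunks above, like the debug_lock_break()/conditional_schedule() calls sprinkled through the ext3, jbd and reiserfs hunks further down, all follow one shape: do a bounded amount of work under a lock, then briefly drop the lock so a pending reschedule can happen, and carry on. DEFINE_LOCK_COUNT(), TEST_LOCK_COUNT() and break_spin_lock() are macros introduced by the lock-break patch itself; the sketch below only models their assumed behaviour with pthreads and is not the kernel implementation:

  #include <pthread.h>
  #include <sched.h>
  #include <stdio.h>

  static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Drop the lock briefly so waiters (or the scheduler) can get in, then
   * take it back: the moral equivalent of break_spin_lock(). */
  static void break_lock(pthread_mutex_t *lock)
  {
          pthread_mutex_unlock(lock);
          sched_yield();
          pthread_mutex_lock(lock);
  }

  static void prune(int nr_items)
  {
          int done = 0, since_break = 0;

          pthread_mutex_lock(&list_lock);
          while (done < nr_items) {
                  /* ... prune one item off the list here ... */
                  done++;
                  if (++since_break >= 100) {     /* TEST_LOCK_COUNT(100) */
                          since_break = 0;        /* RESET_LOCK_COUNT()   */
                          break_lock(&list_lock);
                  }
          }
          pthread_mutex_unlock(&list_lock);
          printf("pruned %d items\n", done);
  }

  int main(void)
  {
          prune(1234);
          return 0;
  }

The kernel versions additionally restart their list walk (the goto redo and goto free_unused labels) after retaking the lock, since the list may have changed while it was dropped.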
+ */ static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash) { - hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash ^ (hash >> D_HASHBITS); + hash += (unsigned long) parent; + + /* + * The integer multiply Lever hash function appears to be too + * expensive even with hardware multiply support. Here we + * enter the realm of voodoo. + * + * The multiplicative hash function was this: + * hash *= 2654435761UL; + * hash >>= 11; + * The hard constant 11 is disturbing, and perhaps + * has some bearing on why this did not work well. + * + * The hash function used here is the Mersenne prime + * multiplicative hash function described in Lever's + * paper, which uses a shift/add implementation afforded + * by bit pattern properties of Mersenne primes. + * -- wli + * + * Added in more special sauce to use the upper D_HASHBITS + * of the computed hash key (which is voodoo). + * -- wli + * + * Reverted to the Lever hash function. + * -- wli + */ + + /* hash = (hash << 7) - hash + (hash >> 10) + (hash >> 18); */ + hash *= 2654435761UL; + hash >>= BITS_PER_LONG - D_HASHBITS; return dentry_hashtable + (hash & D_HASHMASK); } diff -urN linux-2.4.17-rc1-virgin/fs/exec.c linux-2.4.17-rc1-wli3/fs/exec.c --- linux-2.4.17-rc1-virgin/fs/exec.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/exec.c Fri Dec 14 02:44:44 2001 @@ -35,6 +35,7 @@ #include #include #include +#include #define __NO_VERSION__ #include @@ -279,6 +280,7 @@ flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + page_add_rmap(page, pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); @@ -420,8 +422,8 @@ active_mm = current->active_mm; current->mm = mm; current->active_mm = mm; - task_unlock(current); activate_mm(active_mm, mm); + task_unlock(current); mm_release(); if (old_mm) { if (active_mm != old_mm) BUG(); diff -urN linux-2.4.17-rc1-virgin/fs/ext3/inode.c linux-2.4.17-rc1-wli3/fs/ext3/inode.c --- linux-2.4.17-rc1-virgin/fs/ext3/inode.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/ext3/inode.c Sun Dec 16 17:58:10 2001 @@ -1654,6 +1654,8 @@ } for (p = first; p < last; p++) { + debug_lock_break(1); /* bkl is held */ + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ @@ -1718,6 +1720,8 @@ /* Go read the buffer for the next level down */ bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + debug_lock_break(1); + conditional_schedule(); /* * A read failure? 
Report error and clear slot diff -urN linux-2.4.17-rc1-virgin/fs/ext3/namei.c linux-2.4.17-rc1-wli3/fs/ext3/namei.c --- linux-2.4.17-rc1-virgin/fs/ext3/namei.c Fri Nov 9 14:25:04 2001 +++ linux-2.4.17-rc1-wli3/fs/ext3/namei.c Sun Dec 16 17:58:10 2001 @@ -157,6 +157,8 @@ if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); diff -urN linux-2.4.17-rc1-virgin/fs/fat/cache.c linux-2.4.17-rc1-wli3/fs/fat/cache.c --- linux-2.4.17-rc1-virgin/fs/fat/cache.c Fri Oct 12 13:48:42 2001 +++ linux-2.4.17-rc1-wli3/fs/fat/cache.c Fri Dec 14 02:44:44 2001 @@ -14,6 +14,7 @@ #include #include #include +#include #if 0 # define PRINTK(x) printk x diff -urN linux-2.4.17-rc1-virgin/fs/inode.c linux-2.4.17-rc1-wli3/fs/inode.c --- linux-2.4.17-rc1-virgin/fs/inode.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/inode.c Sun Dec 16 23:57:18 2001 @@ -567,6 +567,12 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + debug_lock_break(2); /* bkl is also held */ + atomic_inc(&inode->i_count); + break_spin_lock_and_resched(&inode_lock); + atomic_dec(&inode->i_count); + if (inode->i_sb != sb) continue; invalidate_inode_buffers(inode); @@ -668,8 +674,11 @@ int count; struct inode * inode; + DEFINE_LOCK_COUNT(); + spin_lock(&inode_lock); +free_unused: count = 0; entry = inode_unused.prev; while (entry != &inode_unused) @@ -692,6 +701,14 @@ count++; if (!--goal) break; + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&inode_lock); + goto free_unused; + } + } } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); @@ -899,14 +916,23 @@ return inode; } +/* + * The properties have changed from Lever's paper. This is + * the multiplicative page cache hash function from Chuck Lever's paper, + * adapted to the inode hash table. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * iput() appears to be showing up in profiles, So I put what appears to + * be a theoretically sounder hash function here. + * -- wli + */ static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) { - unsigned long tmp = i_ino + ((unsigned long) sb / L1_CACHE_BYTES); - tmp = tmp + (tmp >> I_HASHBITS); - return tmp & I_HASHMASK; -} + unsigned long hashval = i_ino + (unsigned long) sb; -/* Yeah, I know about quadratic hash. Maybe, later. 
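Both the d_hash() change earlier in the patch and the inode hash() that follows rely on the same trick: multiply the key by a constant close to 2^32 divided by the golden ratio (Knuth's multiplicative hashing constant) and keep the top table-index bits, which are the best-mixed ones, rather than the bottom bits. A user-space comparison of that against the Mersenne-prime shift/add variant the dcache comment mentions (14 table bits picked arbitrarily for the demo):

  #include <stdint.h>
  #include <stdio.h>

  #define HASH_BITS 14                            /* arbitrary demo table size */
  #define HASH_MASK ((1u << HASH_BITS) - 1)

  /* Lever/Knuth multiplicative hash: keep the top HASH_BITS bits. */
  static unsigned int mult_hash(uint32_t key)
  {
          return (uint32_t)(key * 2654435761u) >> (32 - HASH_BITS);
  }

  /* The Mersenne-prime shift/add variant from the dcache comment:
   * (key << 7) - key is key * (2^7 - 1), plus two fold-down terms. */
  static unsigned int mersenne_hash(uint32_t key)
  {
          uint32_t h = (key << 7) - key + (key >> 10) + (key >> 18);

          return h & HASH_MASK;
  }

  int main(void)
  {
          uint32_t keys[] = { 0x1000, 0x1008, 0x1010, 0x1018 };
          int i;

          /* nearby keys, differing only in a few low bits, still spread out */
          for (i = 0; i < 4; i++)
                  printf("key %#x  mult %5u  mersenne %5u\n",
                         keys[i], mult_hash(keys[i]), mersenne_hash(keys[i]));
          return 0;
  }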
*/ + hashval = (hashval * 2654435761UL) >> (BITS_PER_LONG - I_HASHBITS); + + return hashval & I_HASHMASK; +} /** * iunique - get a unique inode number diff -urN linux-2.4.17-rc1-virgin/fs/jbd/commit.c linux-2.4.17-rc1-wli3/fs/jbd/commit.c --- linux-2.4.17-rc1-virgin/fs/jbd/commit.c Fri Dec 14 06:04:12 2001 +++ linux-2.4.17-rc1-wli3/fs/jbd/commit.c Sun Dec 16 17:58:10 2001 @@ -212,6 +212,9 @@ __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + debug_lock_break(2); + if (conditional_schedule_needed()) + break; } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -235,8 +238,7 @@ journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -272,6 +274,14 @@ */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + if (conditional_schedule_needed()) { + debug_lock_break(551); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; + } if (buffer_locked(bh)) { spin_unlock(&journal_datalist_lock); unlock_journal(journal); diff -urN linux-2.4.17-rc1-virgin/fs/proc/array.c linux-2.4.17-rc1-wli3/fs/proc/array.c --- linux-2.4.17-rc1-virgin/fs/proc/array.c Thu Oct 11 09:00:01 2001 +++ linux-2.4.17-rc1-wli3/fs/proc/array.c Fri Dec 14 06:05:17 2001 @@ -392,82 +392,11 @@ mmput(mm); return res; } - -static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t page = *pte; - struct page *ptpage; - - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - ++*total; - if (!pte_present(page)) - continue; - ptpage = pte_page(page); - if ((!VALID_PAGE(ptpage)) || PageReserved(ptpage)) - continue; - ++*pages; - if (pte_dirty(page)) - ++*dirty; - if (page_count(pte_page(page)) > 1) - ++*shared; - } while (address < end); -} - -static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*pgd)) - return; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - statm_pte_range(pmd, address, end - address, pages, shared, dirty, total); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end, - int * pages, int * shared, int * dirty, int * total) -{ - while (address < end) { - statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgd++; - } -} +/* + * This thing is slow so I've ripped out the page table scanning. + * The VMA scanning is slow enough. 
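The rewritten proc_pid_statm() that follows derives every figure from the VMA list alone: resident is simply mm->rss, size is mm->total_vm, and the text/library/data split is decided purely by each VMA's flags and end address rather than by walking page tables. A user-space rendition of that classification over a made-up VMA list (the VM_* values and TASK_UNMAPPED_BASE below are illustrative stand-ins, not the real architecture constants):

  #include <stdio.h>

  #define PAGE_SHIFT              12
  #define VM_SHARED               0x01            /* illustrative flag bits */
  #define VM_EXECUTABLE           0x02
  #define TASK_UNMAPPED_BASE      0x40000000UL    /* typical i386 value, assumed */

  struct vma {
          unsigned long start, end, flags;
  };

  int main(void)
  {
          struct vma vmas[] = {
                  { 0x08048000UL, 0x08060000UL, VM_EXECUTABLE },  /* program text */
                  { 0x40000000UL, 0x40100000UL, VM_EXECUTABLE },  /* shared lib   */
                  { 0x40100000UL, 0x40200000UL, VM_SHARED },      /* shared map   */
                  { 0xbfffd000UL, 0xc0000000UL, 0 },              /* stack        */
          };
          int i, trs = 0, lrs = 0, drs = 0, share = 0;

          for (i = 0; i < 4; i++) {
                  int pages = (vmas[i].end - vmas[i].start) >> PAGE_SHIFT;

                  if (vmas[i].flags & VM_SHARED)
                          share += pages;
                  if (vmas[i].flags & VM_EXECUTABLE) {
                          if (vmas[i].end > TASK_UNMAPPED_BASE)
                                  lrs += pages;   /* library */
                          else
                                  trs += pages;   /* text    */
                  } else
                          drs += pages;           /* stack and data */
          }
          printf("text=%d lib=%d data=%d shared=%d pages\n", trs, lrs, drs, share);
          return 0;
  }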
+ */ int proc_pid_statm(struct task_struct *task, char * buffer) { struct mm_struct *mm; @@ -482,23 +411,24 @@ struct vm_area_struct * vma; down_read(&mm->mmap_sem); vma = mm->mmap; + resident = mm->rss; + size = mm->total_vm; while (vma) { - pgd_t *pgd = pgd_offset(mm, vma->vm_start); - int pages = 0, shared = 0, dirty = 0, total = 0; + int pages, total; + + total = vma->vm_end - vma->vm_start; + pages = total >> PAGE_SHIFT; + + if (vma->vm_flags & VM_SHARED) + share += pages; - statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total); - resident += pages; - share += shared; - dt += dirty; - size += total; - if (vma->vm_flags & VM_EXECUTABLE) - trs += pages; /* text */ - else if (vma->vm_flags & VM_GROWSDOWN) - drs += pages; /* stack */ - else if (vma->vm_end > 0x60000000) - lrs += pages; /* library */ - else - drs += pages; + if (vma->vm_flags & VM_EXECUTABLE) { + if(vma->vm_end > TASK_UNMAPPED_BASE) + lrs += pages; /* library */ + else + trs += pages; /* text */ + } else + drs += pages; /* stack and data */ vma = vma->vm_next; } up_read(&mm->mmap_sem); diff -urN linux-2.4.17-rc1-virgin/fs/proc/proc_misc.c linux-2.4.17-rc1-wli3/fs/proc/proc_misc.c --- linux-2.4.17-rc1-virgin/fs/proc/proc_misc.c Tue Nov 20 21:29:09 2001 +++ linux-2.4.17-rc1-wli3/fs/proc/proc_misc.c Fri Dec 14 02:44:20 2001 @@ -164,7 +164,8 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inactive: %8u kB\n" + "Inact_dirty: %8u kB\n" + "Inact_clean: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -178,7 +179,8 @@ K(pg_size - swapper_space.nrpages), K(swapper_space.nrpages), K(nr_active_pages), - K(nr_inactive_pages), + K(nr_inactive_dirty_pages), + K(nr_inactive_clean_pages), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/bitmap.c linux-2.4.17-rc1-wli3/fs/reiserfs/bitmap.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/bitmap.c Fri Dec 14 06:04:14 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/bitmap.c Sun Dec 16 17:58:10 2001 @@ -410,19 +410,23 @@ amount_needed++ ; continue ; } - - reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; + RFALSE( is_reusable (s, search_start, 0) == 0, + "vs-4140: bad block number found"); - RFALSE( buffer_locked (SB_AP_BITMAP (s)[i]) || - is_reusable (s, search_start, 0) == 0, - "vs-4140: bitmap block is locked or bad block number found"); + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + /* if this block has been allocated while we slept, it is + ** impossible to find any more contiguous blocks for ourselves. + ** If we are doing preallocation, give up now and return. 
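Back in the fs/inode.c hunk above, invalidate_list() pins each inode with atomic_inc(&inode->i_count) before break_spin_lock_and_resched() drops inode_lock, and only drops the reference after the lock is retaken; the extra reference is what keeps the inode from disappearing while the lock is down. A pthreads model of that pin-then-drop-the-lock idiom (a plain counter and a mutex standing in for the kernel primitives):

  #include <pthread.h>
  #include <stdio.h>

  struct object {
          int refcount;
          int data;
  };

  static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

  /* While our extra reference is held, nobody else may free the object,
   * so dropping the list lock for a reschedule is safe. */
  static void visit(struct object *obj)
  {
          pthread_mutex_lock(&list_lock);

          obj->refcount++;                        /* atomic_inc(&inode->i_count)   */
          pthread_mutex_unlock(&list_lock);       /* break_spin_lock_and_resched() */
          /* ... a preemption or other work happens here ... */
          pthread_mutex_lock(&list_lock);
          obj->refcount--;                        /* atomic_dec(&inode->i_count)   */

          printf("data=%d refcount=%d\n", obj->data, obj->refcount);
          pthread_mutex_unlock(&list_lock);
  }

  int main(void)
  {
          struct object obj = { 1, 42 };

          visit(&obj);
          return 0;
  }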
+ */ + if (for_prealloc) + goto free_and_return; amount_needed++ ; continue ; } diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/buffer2.c linux-2.4.17-rc1-wli3/fs/reiserfs/buffer2.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/buffer2.c Fri Dec 14 06:04:14 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/buffer2.c Sun Dec 16 17:58:10 2001 @@ -55,6 +55,8 @@ PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); result = bread (super -> s_dev, n_block, n_size); + debug_lock_break(1); + conditional_schedule(); PROC_INFO_INC( super, breads ); PROC_EXP( if( kstat.context_swtch != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/journal.c linux-2.4.17-rc1-wli3/fs/reiserfs/journal.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/journal.c Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/journal.c Sun Dec 16 17:58:10 2001 @@ -574,6 +574,8 @@ /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + debug_lock_break(1); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -704,6 +706,8 @@ mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + debug_lock_break(1); + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -833,6 +837,8 @@ set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2092,6 +2098,8 @@ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2232,6 +2240,8 @@ } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2683,6 +2693,8 @@ RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + debug_lock_break(1); + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -2856,6 +2868,8 @@ /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + debug_lock_break(1); + conditional_schedule(); /* getblk can sleep, so... 
*/ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; diff -urN linux-2.4.17-rc1-virgin/fs/reiserfs/stree.c linux-2.4.17-rc1-wli3/fs/reiserfs/stree.c --- linux-2.4.17-rc1-virgin/fs/reiserfs/stree.c Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/fs/reiserfs/stree.c Sun Dec 16 17:58:10 2001 @@ -648,9 +648,8 @@ stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -662,7 +661,10 @@ #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + + debug_lock_break(1); + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -674,6 +676,8 @@ /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1099,6 +1103,9 @@ for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + debug_lock_break(1); + conditional_schedule(); if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; diff -urN linux-2.4.17-rc1-virgin/include/asm-alpha/bootmem.h linux-2.4.17-rc1-wli3/include/asm-alpha/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-alpha/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-alpha/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,12 @@ +/* + * include/asm-alpha/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Alpha has some NUMA systems, but it's uncertain to me what + * an appropriate value of NR_SEGMENTS should be. + * + * For the moment, the generic single-page definition is here, + * but those who run on Alpha may need to increase the value + * at least until the page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/bootmem.h linux-2.4.17-rc1-wli3/include/asm-arm/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-arm/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-arm/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,9 @@ +/* + * include/asm-arm/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * ARM appeared to have little trouble with a single-page-sized + * segment pool, so the generic NR_SEGMENTS is okay for now. + * This will go away once page stealing is in place. 
+ */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/dma.h linux-2.4.17-rc1-wli3/include/asm-arm/dma.h --- linux-2.4.17-rc1-virgin/include/asm-arm/dma.h Sun Aug 12 11:14:00 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/dma.h Fri Dec 14 02:44:44 2001 @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/hardirq.h linux-2.4.17-rc1-wli3/include/asm-arm/hardirq.h --- linux-2.4.17-rc1-virgin/include/asm-arm/hardirq.h Thu Oct 11 09:04:57 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/hardirq.h Fri Dec 14 02:44:44 2001 @@ -34,6 +34,7 @@ #define irq_exit(cpu,irq) (local_irq_count(cpu)--) #define synchronize_irq() do { } while (0) +#define release_irqlock(cpu) do { } while (0) #else #error SMP not supported diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/mmu_context.h linux-2.4.17-rc1-wli3/include/asm-arm/mmu_context.h --- linux-2.4.17-rc1-virgin/include/asm-arm/mmu_context.h Mon Sep 18 15:15:24 2000 +++ linux-2.4.17-rc1-wli3/include/asm-arm/mmu_context.h Fri Dec 14 02:44:44 2001 @@ -42,6 +42,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disable() == 0) + BUG(); +#endif if (prev != next) { cpu_switch_mm(next->pgd, tsk); clear_bit(cpu, &prev->cpu_vm_mask); diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/pgalloc.h linux-2.4.17-rc1-wli3/include/asm-arm/pgalloc.h --- linux-2.4.17-rc1-virgin/include/asm-arm/pgalloc.h Sun Aug 12 11:14:00 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/pgalloc.h Fri Dec 14 02:44:44 2001 @@ -57,40 +57,48 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)__pgd_next(ret); ret[1] = ret[2]; clean_dcache_entry(ret + 1); pgtable_cache_size--; } + preempt_enable(); return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); __pgd_next(pgd) = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) { unsigned long *ret; + preempt_disable(); if((ret = pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)__pte_next(ret); ret[0] = 0; clean_dcache_entry(ret); pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void free_pte_fast(pte_t *pte) { + preempt_disable(); __pte_next(pte) = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } #else /* CONFIG_NO_PGT_CACHE */ diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/smplock.h linux-2.4.17-rc1-wli3/include/asm-arm/smplock.h --- linux-2.4.17-rc1-virgin/include/asm-arm/smplock.h Sun Aug 12 11:14:00 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/smplock.h Fri Dec 14 02:44:44 2001 @@ -3,12 +3,17 @@ * * Default SMP lock implementation */ +#include #include #include extern spinlock_t kernel_flag; +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_is_disable() +#else #define kernel_locked() spin_is_locked(&kernel_flag) +#endif /* * Release global kernel lock and global interrupt lock @@ -40,8 +45,14 @@ */ static inline void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else if (!++current->lock_depth) spin_lock(&kernel_flag); +#endif } static inline void unlock_kernel(void) diff -urN linux-2.4.17-rc1-virgin/include/asm-arm/softirq.h 
linux-2.4.17-rc1-wli3/include/asm-arm/softirq.h --- linux-2.4.17-rc1-virgin/include/asm-arm/softirq.h Sat Sep 8 12:02:31 2001 +++ linux-2.4.17-rc1-wli3/include/asm-arm/softirq.h Fri Dec 14 02:44:44 2001 @@ -5,20 +5,22 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) #define in_softirq() (local_bh_count(smp_processor_id()) != 0) -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ if (!--*ptr && ptr[-2]) \ __asm__("bl%? __do_softirq": : : "lr");/* out of line */\ } while (0) + +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) #endif /* __ASM_SOFTIRQ_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-cris/bootmem.h linux-2.4.17-rc1-wli3/include/asm-cris/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-cris/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-cris/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-cris/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Cris hasn't been tested with this yet, so + * port maintainers may want to increase the value + * of NR_SEGMENTS if this becomes a problem. + * This will go away once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-generic/bootmem.h linux-2.4.17-rc1-wli3/include/asm-generic/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-generic/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-generic/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,25 @@ +#ifndef _ASM_BOOTMEM_H +#define _ASM_BOOTMEM_H + +/* + * include/asm-generic/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * NR_SEGMENTS is the number of line segment tree nodes held + * in the per-node segment pools. + * + * For the moment, this is a fixed size, because dynamically + * determining the number of segments per node would require + * a change of interface. On 32-bit machines with 4KB pages + * this is 170 distinct fragments of memory per page. + * + * So long as the arena for the tree nodes is statically + * allocated, this must be an arch-specific #define + * This can be eliminated entirely only by a change of + * interface. Page stealing is simple, but unsafe until + * after the absolutely necessary reservations are done. + */ + +#define NR_SEGMENTS (PAGE_SIZE/sizeof(segment_buf_t)) + +#endif /* _ASM_BOOTMEM_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-generic/rmap.h linux-2.4.17-rc1-wli3/include/asm-generic/rmap.h --- linux-2.4.17-rc1-virgin/include/asm-generic/rmap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-generic/rmap.h Fri Dec 14 02:44:20 2001 @@ -0,0 +1,51 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. 
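The helpers that follow reconstruct, from nothing but a pte pointer, which mm and which virtual address that pte maps: the page table page's struct page carries the mm in page->mapping and the aligned base address in page->index, and the pte's byte offset within its page scales up to the address offset. A user-space model of that last step, ptep_to_address(), assuming an i386-style layout where PTRS_PER_PTE * sizeof(pte_t) == PAGE_SIZE:

  #include <stdio.h>

  #define PAGE_SIZE       4096UL
  #define PAGE_MASK       (~(PAGE_SIZE - 1))
  #define PTE_SIZE        4UL                     /* sizeof(pte_t), assumed   */
  #define PTRS_PER_PTE    (PAGE_SIZE / PTE_SIZE)  /* 1024 ptes per table page */

  /* page_index plays the role of page->index: the base virtual address of
   * the 4 MB region covered by this page table page. */
  static unsigned long ptep_to_address(unsigned long page_index,
                                       unsigned long ptep)
  {
          unsigned long low_bits = (ptep & ~PAGE_MASK) * PTRS_PER_PTE;

          return page_index + low_bits;
  }

  int main(void)
  {
          /* the third pte in its table page sits at byte offset 8...         */
          unsigned long ptep = 0xc0123008UL;

          /* ...so it maps the third page of the region: base + 2 * PAGE_SIZE */
          printf("%#lx\n", ptep_to_address(0x40000000UL, ptep)); /* 0x40002000 */
          return 0;
  }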
+ * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#endif /* _GENERIC_RMAP_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/bootmem.h linux-2.4.17-rc1-wli3/include/asm-i386/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-i386/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-i386/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,9 @@ +/* + * include/asm-i386/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * i386 has been well-tested with this value of NR_SEGMENTS. + * There are some i386 architectures with highly-fragmented + * memory that may need to alter it. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/hardirq.h linux-2.4.17-rc1-wli3/include/asm-i386/hardirq.h --- linux-2.4.17-rc1-virgin/include/asm-i386/hardirq.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/hardirq.h Sun Dec 16 18:05:01 2001 @@ -36,6 +36,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #include diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/highmem.h linux-2.4.17-rc1-wli3/include/asm-i386/highmem.h --- linux-2.4.17-rc1-virgin/include/asm-i386/highmem.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/highmem.h Sun Dec 16 18:05:01 2001 @@ -88,6 +88,7 @@ enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); if (page < highmem_start_page) return page_address(page); @@ -109,8 +110,10 @@ unsigned long vaddr = (unsigned long) kvaddr; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) // FIXME + if (vaddr < FIXADDR_START) { // FIXME + preempt_enable(); return; + } if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) BUG(); @@ -122,6 +125,8 @@ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); #endif + + preempt_enable(); } #endif /* __KERNEL__ */ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/hw_irq.h linux-2.4.17-rc1-wli3/include/asm-i386/hw_irq.h --- linux-2.4.17-rc1-virgin/include/asm-i386/hw_irq.h Thu Nov 22 11:46:18 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/hw_irq.h Sun Dec 16 18:04:59 2001 @@ -95,6 +95,18 @@ #define __STR(x) #x #define STR(x) __STR(x) +#define GET_CURRENT \ + "movl %esp, %ebx\n\t" \ + "andl $-8192, %ebx\n\t" + +#ifdef CONFIG_PREEMPT +#define BUMP_LOCK_COUNT \ + GET_CURRENT \ + "incl 4(%ebx)\n\t" +#else +#define BUMP_LOCK_COUNT +#endif + #define SAVE_ALL \ "cld\n\t" \ "pushl %es\n\t" \ @@ -108,14 +120,11 @@ "pushl %ebx\n\t" \ 
"movl $" STR(__KERNEL_DS) ",%edx\n\t" \ "movl %edx,%ds\n\t" \ - "movl %edx,%es\n\t" + "movl %edx,%es\n\t" \ + BUMP_LOCK_COUNT #define IRQ_NAME2(nr) nr##_interrupt(void) #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -#define GET_CURRENT \ - "movl %esp, %ebx\n\t" \ - "andl $-8192, %ebx\n\t" /* * SMP has a few special interrupts for IPI messages diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/i387.h linux-2.4.17-rc1-wli3/include/asm-i386/i387.h --- linux-2.4.17-rc1-virgin/include/asm-i386/i387.h Thu Nov 22 11:48:58 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/i387.h Sun Dec 16 18:16:02 2001 @@ -12,6 +12,7 @@ #define __ASM_I386_I387_H #include +#include #include #include #include @@ -24,7 +25,7 @@ extern void restore_fpu( struct task_struct *tsk ); extern void kernel_fpu_begin(void); -#define kernel_fpu_end() stts() +#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0) #define unlazy_fpu( tsk ) do { \ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/mmu_context.h linux-2.4.17-rc1-wli3/include/asm-i386/mmu_context.h --- linux-2.4.17-rc1-virgin/include/asm-i386/mmu_context.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/mmu_context.h Sun Dec 16 18:05:01 2001 @@ -27,6 +27,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif if (prev != next) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/param.h linux-2.4.17-rc1-wli3/include/asm-i386/param.h --- linux-2.4.17-rc1-virgin/include/asm-i386/param.h Fri Oct 27 11:04:43 2000 +++ linux-2.4.17-rc1-wli3/include/asm-i386/param.h Sun Dec 16 01:24:48 2001 @@ -2,7 +2,8 @@ #define _ASMi386_PARAM_H #ifndef HZ -#define HZ 100 +/* #define HZ 100 */ +#define HZ 256 #endif #define EXEC_PAGESIZE 4096 diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/pgalloc.h linux-2.4.17-rc1-wli3/include/asm-i386/pgalloc.h --- linux-2.4.17-rc1-virgin/include/asm-i386/pgalloc.h Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/pgalloc.h Sun Dec 16 18:05:01 2001 @@ -75,20 +75,26 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)(*ret); ret[0] = 0; pgtable_cache_size--; - } else + preempt_enable(); + } else { + preempt_enable(); ret = (unsigned long *)get_pgd_slow(); + } return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); *(unsigned long *)pgd = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline void free_pgd_slow(pgd_t *pgd) @@ -119,19 +125,23 @@ { unsigned long *ret; + preempt_disable(); if ((ret = (unsigned long *)pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)(*ret); ret[0] = ret[1]; pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void pte_free_fast(pte_t *pte) { + preempt_disable(); *(unsigned long *)pte = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } static __inline__ void pte_free_slow(pte_t *pte) diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/processor.h linux-2.4.17-rc1-wli3/include/asm-i386/processor.h --- linux-2.4.17-rc1-virgin/include/asm-i386/processor.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/processor.h Sun Dec 16 18:04:59 2001 @@ -502,7 +502,10 @@ { __asm__ 
__volatile__ ("prefetchw (%0)" : : "r"(x)); } -#define spin_lock_prefetch(x) prefetchw(x) +#define spin_lock_prefetch(x) do { \ + prefetchw(x); \ + preempt_prefetch(¤t->preempt_count); \ +} while(0) #endif diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/rmap.h linux-2.4.17-rc1-wli3/include/asm-i386/rmap.h --- linux-2.4.17-rc1-virgin/include/asm-i386/rmap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-i386/rmap.h Fri Dec 14 02:44:20 2001 @@ -0,0 +1,7 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/smplock.h linux-2.4.17-rc1-wli3/include/asm-i386/smplock.h --- linux-2.4.17-rc1-virgin/include/asm-i386/smplock.h Thu Nov 22 11:46:20 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/smplock.h Sun Dec 16 18:16:02 2001 @@ -10,7 +10,15 @@ extern spinlock_t kernel_flag; +#ifdef CONFIG_SMP #define kernel_locked() spin_is_locked(&kernel_flag) +#else +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_is_disabled() +#else +#define kernel_locked() 1 +#endif +#endif /* * Release global kernel lock and global interrupt lock @@ -42,6 +50,11 @@ */ static __inline__ void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else #if 1 if (!++current->lock_depth) spin_lock(&kernel_flag); @@ -53,6 +66,7 @@ "\n9:" :"=m" (__dummy_lock(&kernel_flag)), "=m" (current->lock_depth)); +#endif #endif } diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/softirq.h linux-2.4.17-rc1-wli3/include/asm-i386/softirq.h --- linux-2.4.17-rc1-virgin/include/asm-i386/softirq.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/softirq.h Sun Dec 16 18:05:01 2001 @@ -5,9 +5,9 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) @@ -22,7 +22,7 @@ * If you change the offsets in irq_stat then you have to * update this code as well. 
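Both smplock.h variants above count BKL recursion in current->lock_depth, with -1 meaning not held, but the CONFIG_PREEMPT version takes the spinlock before bumping the counter: if the depth became non-negative first and the task were preempted at that instant, the context switch path, which unlocks kernel_flag whenever lock_depth >= 0, would release a lock that was never taken. A stand-alone model of the depth counting (a pthread mutex stands in for the kernel_flag spinlock, and a single global stands in for the per-task current->lock_depth):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t kernel_flag = PTHREAD_MUTEX_INITIALIZER;
  static int lock_depth = -1;             /* -1: BKL not held by this task */

  static void lock_kernel(void)
  {
          if (lock_depth == -1)           /* outermost entry takes the lock  */
                  pthread_mutex_lock(&kernel_flag);
          ++lock_depth;                   /* ...only then does depth go >= 0 */
  }

  static void unlock_kernel(void)
  {
          if (--lock_depth < 0)           /* innermost exit drops the lock   */
                  pthread_mutex_unlock(&kernel_flag);
  }

  int main(void)
  {
          lock_kernel();
          lock_kernel();                  /* nested callers just deepen the count */
          printf("depth=%d\n", lock_depth);       /* prints depth=1  */
          unlock_kernel();
          unlock_kernel();
          printf("depth=%d\n", lock_depth);       /* prints depth=-1 */
          return 0;
  }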
*/ -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ \ @@ -44,5 +44,7 @@ : "r" (ptr), "i" (do_softirq) \ /* no registers clobbered */ ); \ } while (0) + +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) #endif /* __ASM_SOFTIRQ_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-i386/spinlock.h linux-2.4.17-rc1-wli3/include/asm-i386/spinlock.h --- linux-2.4.17-rc1-virgin/include/asm-i386/spinlock.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/asm-i386/spinlock.h Sun Dec 16 18:04:59 2001 @@ -77,7 +77,7 @@ :"=m" (lock->lock) : : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { #if SPINLOCK_DEBUG if (lock->magic != SPINLOCK_MAGIC) @@ -97,7 +97,7 @@ :"=q" (oldval), "=m" (lock->lock) \ :"0" (oldval) : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { char oldval = 1; #if SPINLOCK_DEBUG @@ -113,7 +113,7 @@ #endif -static inline int spin_trylock(spinlock_t *lock) +static inline int _raw_spin_trylock(spinlock_t *lock) { char oldval; __asm__ __volatile__( @@ -123,7 +123,7 @@ return oldval > 0; } -static inline void spin_lock(spinlock_t *lock) +static inline void _raw_spin_lock(spinlock_t *lock) { #if SPINLOCK_DEBUG __label__ here; @@ -179,7 +179,7 @@ */ /* the spinlock helpers are in arch/i386/kernel/semaphore.c */ -static inline void read_lock(rwlock_t *rw) +static inline void _raw_read_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -188,7 +188,7 @@ __build_read_lock(rw, "__read_lock_failed"); } -static inline void write_lock(rwlock_t *rw) +static inline void _raw_write_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -197,10 +197,10 @@ __build_write_lock(rw, "__write_lock_failed"); } -#define read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") -#define write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") +#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") +#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") -static inline int write_trylock(rwlock_t *lock) +static inline int _raw_write_trylock(rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) diff -urN linux-2.4.17-rc1-virgin/include/asm-ia64/bootmem.h linux-2.4.17-rc1-wli3/include/asm-ia64/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-ia64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-ia64/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,19 @@ +#ifndef _ASM_BOOTMEM_H +#define _ASM_BOOTMEM_H + +/* + * include/asm-ia64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * ACPI on IA64 is one of the heaviest memory-reserving subsystems + * of any architecture. This leads to enough fragmentation to exhaust + * the segment pool with the default NR_SEGMENTS several times over. + * This value has been tested on Intel Lion systems, but the author + * is well-aware of systems requiring still higher values. + * + * This will go away entirely once page stealing is in place. 
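The _raw_ renames in the spinlock.h hunk above are what lets a preemptible kernel wrap every lock operation in preemption bookkeeping: the architecture keeps supplying only the raw primitives, and generic wrappers disable preemption before taking a lock and re-enable it after releasing one. A user-space model of that layering (the counters and the one-word lock are stand-ins, not the patch's actual wrapper definitions):

  #include <stdio.h>

  static int raw_lock;                    /* 0 = free, 1 = held           */
  static int preempt_count;               /* > 0 means preemption is off  */

  static void _raw_spin_lock(int *l)   { *l = 1; }
  static void _raw_spin_unlock(int *l) { *l = 0; }

  static void preempt_disable(void)
  {
          preempt_count++;
  }

  static void preempt_enable(void)
  {
          --preempt_count;
          /* in the kernel, a pending reschedule would be taken here once
           * the count drops back to zero */
  }

  static void spin_lock(int *l)
  {
          preempt_disable();              /* no preemption while spinning or holding */
          _raw_spin_lock(l);
  }

  static void spin_unlock(int *l)
  {
          _raw_spin_unlock(l);
          preempt_enable();               /* a preemption point */
  }

  int main(void)
  {
          spin_lock(&raw_lock);
          printf("held=%d preempt_count=%d\n", raw_lock, preempt_count);
          spin_unlock(&raw_lock);
          printf("held=%d preempt_count=%d\n", raw_lock, preempt_count);
          return 0;
  }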
+ */ + +#define NR_SEGMENTS ((8*PAGE_SIZE)/sizeof(segment_buf_t)) + +#endif /* _ASM_BOOTMEM_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-m68k/bootmem.h linux-2.4.17-rc1-wli3/include/asm-m68k/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-m68k/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-m68k/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-m68k/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * m68k should in all likelihood be happy with this value of + * NR_SEGMENTS, though testing has been obstructed + * by issues unrelated to bootmem. + * NR_SEGMENTS will go away entirely once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-mips/bootmem.h linux-2.4.17-rc1-wli3/include/asm-mips/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-mips/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-mips/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-mips/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * This value of NR_SEGMENTS has been tested on a DecStation 5000/200 + * and it was happy with it. That does not rule out a possible need to + * increase the value on systems I've not tested. + * NR_SEGMENTS will go away once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-mips64/bootmem.h linux-2.4.17-rc1-wli3/include/asm-mips64/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-mips64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-mips64/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-mips64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * mips64 includes some very large memory machines with very fragmented + * memory. There are also likely to be patch conflicts as the discontig + * patch touches bootmem. This value is almost certainly wrong. + * Fortunately, NR_SEGMENTS will go away soon. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-parisc/bootmem.h linux-2.4.17-rc1-wli3/include/asm-parisc/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-parisc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-parisc/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-parisc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * PA-RISC memory maps have relatively few contiguous + * ranges of available memory, and so the generic NR_SEGMENTS + * will suffice until NR_SEGMENTS is eliminated. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-ppc/bootmem.h linux-2.4.17-rc1-wli3/include/asm-ppc/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-ppc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-ppc/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-ppc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * According to sources, 32-bit PPC has relatively few fragments + * of available memory, and so the generic NR_SEGMENTS should + * suffice until NR_SEGMENTS is eliminated. 
+ */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-s390/bootmem.h linux-2.4.17-rc1-wli3/include/asm-s390/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-s390/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-s390/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-s390/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * S390 will probably not need to change NR_SEGMENTS, + * as setup.c tracks memory fragments on its own and + * insists on less than 16. + * NR_SEGMENTS will go away once page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-s390x/bootmem.h linux-2.4.17-rc1-wli3/include/asm-s390x/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-s390x/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-s390x/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-s390x/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * S390x is unlikely to need to change NR_SEGMENTS, as it tracks ranges + * itself in setup.c and uses less than 16. + * NR_SEGMENTS will go away once page stealing is in place in the + * bootmem allocator. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/bootmem.h linux-2.4.17-rc1-wli3/include/asm-sh/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-sh/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-sh/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,8 @@ +/* + * include/asm-sh/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Super-H has not been tested, so NR_SEGMENTS may need to change. + * NR_SEGMENTS will be eliminated once page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/hardirq.h linux-2.4.17-rc1-wli3/include/asm-sh/hardirq.h --- linux-2.4.17-rc1-virgin/include/asm-sh/hardirq.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/hardirq.h Fri Dec 14 02:44:44 2001 @@ -34,6 +34,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #error Super-H SMP is not available diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/mmu_context.h linux-2.4.17-rc1-wli3/include/asm-sh/mmu_context.h --- linux-2.4.17-rc1-virgin/include/asm-sh/mmu_context.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/mmu_context.h Fri Dec 14 02:44:44 2001 @@ -166,6 +166,10 @@ struct mm_struct *next, struct task_struct *tsk, unsigned int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif if (prev != next) { unsigned long __pgdir = (unsigned long)next->pgd; diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/smplock.h linux-2.4.17-rc1-wli3/include/asm-sh/smplock.h --- linux-2.4.17-rc1-virgin/include/asm-sh/smplock.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/smplock.h Fri Dec 14 02:44:44 2001 @@ -9,15 +9,88 @@ #include -#ifndef CONFIG_SMP - +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) +/* + * Should never happen, since linux/smp_lock.h catches this case; + * but in case this file is included directly with neither SMP nor + * PREEMPT configuration, provide the same dummies as linux/smp_lock.h + */ #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) -#define release_kernel_lock(task, cpu, depth) ((depth) = 1) -#define reacquire_kernel_lock(task, cpu, depth) do { } while(0) +#define release_kernel_lock(task, cpu) do { } while(0) +#define reacquire_kernel_lock(task) do { } while(0) +#define kernel_locked() 1 + +#else /* CONFIG_SMP || CONFIG_PREEMPT */ + 
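+/*
+ * A rough sketch of how the lock_depth-based BKL below is intended
+ * to behave under CONFIG_PREEMPT (lock_depth starts out at -1, so
+ * only the outermost lock_kernel()/unlock_kernel() pair touches
+ * kernel_flag; the depths shown are purely illustrative):
+ *
+ *	lock_kernel();		lock_depth -1 -> 0, takes kernel_flag
+ *	lock_kernel();		lock_depth  0 -> 1, no spinlock work
+ *	unlock_kernel();	lock_depth  1 -> 0, kernel_flag still held
+ *	unlock_kernel();	lock_depth  0 -> -1, drops kernel_flag
+ */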
+#if CONFIG_SMP +#error "We do not support SMP on SH yet" +#endif +/* + * Default SMP lock implementation (i.e. the i386 version) + */ + +#include +#include + +extern spinlock_t kernel_flag; +#define lock_bkl() spin_lock(&kernel_flag) +#define unlock_bkl() spin_unlock(&kernel_flag) +#ifdef CONFIG_SMP +#define kernel_locked() spin_is_locked(&kernel_flag) +#elif CONFIG_PREEMPT +#define kernel_locked() preempt_is_disabled() +#else /* neither */ +#define kernel_locked() 1 +#endif + +/* + * Release global kernel lock and global interrupt lock + */ +#define release_kernel_lock(task, cpu) \ +do { \ + if (task->lock_depth >= 0) \ + spin_unlock(&kernel_flag); \ + release_irqlock(cpu); \ + __sti(); \ +} while (0) + +/* + * Re-acquire the kernel lock + */ +#define reacquire_kernel_lock(task) \ +do { \ + if (task->lock_depth >= 0) \ + spin_lock(&kernel_flag); \ +} while (0) + +/* + * Getting the big kernel lock. + * + * This cannot happen asynchronously, + * so we only need to worry about other + * CPU's. + */ +static __inline__ void lock_kernel(void) +{ +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; #else -#error "We do not support SMP on SH" -#endif /* CONFIG_SMP */ + if (!++current->lock_depth) + spin_lock(&kernel_flag); +#endif +} + +static __inline__ void unlock_kernel(void) +{ + if (current->lock_depth < 0) + BUG(); + if (--current->lock_depth < 0) + spin_unlock(&kernel_flag); +} +#endif /* CONFIG_SMP || CONFIG_PREEMPT */ #endif /* __ASM_SH_SMPLOCK_H */ diff -urN linux-2.4.17-rc1-virgin/include/asm-sh/softirq.h linux-2.4.17-rc1-wli3/include/asm-sh/softirq.h --- linux-2.4.17-rc1-virgin/include/asm-sh/softirq.h Sat Sep 8 12:29:09 2001 +++ linux-2.4.17-rc1-wli3/include/asm-sh/softirq.h Fri Dec 14 02:44:44 2001 @@ -6,6 +6,7 @@ #define local_bh_disable() \ do { \ + preempt_disable(); \ local_bh_count(smp_processor_id())++; \ barrier(); \ } while (0) @@ -14,6 +15,7 @@ do { \ barrier(); \ local_bh_count(smp_processor_id())--; \ + preempt_enable(); \ } while (0) #define local_bh_enable() \ @@ -22,6 +24,7 @@ if (!--local_bh_count(smp_processor_id()) \ && softirq_pending(smp_processor_id())) { \ do_softirq(); \ + preempt_enable(); \ } \ } while (0) diff -urN linux-2.4.17-rc1-virgin/include/asm-sparc/bootmem.h linux-2.4.17-rc1-wli3/include/asm-sparc/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-sparc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-sparc/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-sparc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 32-bit SPARC generally doesn't feature discontiguous + * memory, so this value of NR_SEGMENTS likely to be good. + * NR_SEGMENTS will be eliminated once page stealing in + * the bootmem allocator is in place. + */ + +#include diff -urN linux-2.4.17-rc1-virgin/include/asm-sparc64/bootmem.h linux-2.4.17-rc1-wli3/include/asm-sparc64/bootmem.h --- linux-2.4.17-rc1-virgin/include/asm-sparc64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/asm-sparc64/bootmem.h Fri Dec 14 03:21:15 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-sparc64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 64-bit SPARC may need a larger NR_SEGMENTS than this + * but it's not clear what a better value would be. + * NR_SEGMENTS will be eliminated once page stealing + * in the bootmem allocator is in place. 
+ */ +#include diff -urN linux-2.4.17-rc1-virgin/include/linux/bootmem.h linux-2.4.17-rc1-wli3/include/linux/bootmem.h --- linux-2.4.17-rc1-virgin/include/linux/bootmem.h Thu Nov 22 11:47:23 2001 +++ linux-2.4.17-rc1-wli3/include/linux/bootmem.h Sun Dec 16 18:16:02 2001 @@ -1,5 +1,6 @@ /* * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Segment tree-based memory reservation system, William Irwin, IBM, Oct 2001 */ #ifndef _LINUX_BOOTMEM_H #define _LINUX_BOOTMEM_H @@ -9,6 +10,8 @@ #include #include #include +#include +#include /* * simple boot-time physical memory area allocator. @@ -25,8 +28,8 @@ unsigned long node_boot_start; unsigned long node_low_pfn; void *node_bootmem_map; - unsigned long last_offset; - unsigned long last_pos; + segment_tree_root_t segment_tree; + segment_buf_t *free_segments; } bootmem_data_t; extern unsigned long __init bootmem_bootmap_pages (unsigned long); diff -urN linux-2.4.17-rc1-virgin/include/linux/brlock.h linux-2.4.17-rc1-wli3/include/linux/brlock.h --- linux-2.4.17-rc1-virgin/include/linux/brlock.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/brlock.h Sun Dec 16 18:10:53 2001 @@ -171,11 +171,11 @@ } #else -# define br_read_lock(idx) ((void)(idx)) -# define br_read_unlock(idx) ((void)(idx)) -# define br_write_lock(idx) ((void)(idx)) -# define br_write_unlock(idx) ((void)(idx)) -#endif +# define br_read_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_read_unlock(idx) ({ (void)(idx); preempt_enable(); }) +# define br_write_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_write_unlock(idx) ({ (void)(idx); preempt_enable(); }) +#endif /* CONFIG_SMP */ /* * Now enumerate all of the possible sw/hw IRQ protected diff -urN linux-2.4.17-rc1-virgin/include/linux/dcache.h linux-2.4.17-rc1-wli3/include/linux/dcache.h --- linux-2.4.17-rc1-virgin/include/linux/dcache.h Thu Nov 22 11:46:18 2001 +++ linux-2.4.17-rc1-wli3/include/linux/dcache.h Sun Dec 16 18:05:51 2001 @@ -36,17 +36,58 @@ }; extern struct dentry_stat_t dentry_stat; -/* Name hashing routines. Initial hash value */ -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ -#define init_name_hash() 0 +/* + * Fowler, Noll, & Vo hash function + * -- wli + */ + +/* + * Initial hash value for Fowler, Noll, & Vo hash function. + * FreeBSD appears to use 33554467UL decimal / 0x2000023UL hex. + * Sources I see elsewhere (Noll's webpage) describe using an offset + * basis of 2166136261UL decimal / 0x811C9DC5UL hex. + * -- wli + */ +#define init_name_hash() 0x811C9DC5UL -/* partial hash update function. Assume roughly 4 bits per character */ -static __inline__ unsigned long partial_name_hash(unsigned long c, unsigned long prevhash) +/* + * This is a multiplicative hash function using the prime 16777619 + * The Fowler, Noll, and Vo hash function is rated the best in + * string hashing benchmarks published on gcc-patches and NetBSD + * mailing lists. + * -- wli + */ +static __inline__ unsigned long partial_name_hash(unsigned long c, + unsigned long prevhash) { - return (prevhash + (c << 4) + (c >> 4)) * 11; + /* + * A multiplicative definition would be: + * --wli + */ + return (prevhash * 0x01000193UL) ^ c; + + /* + * If I were to get overcomplicated, I would decode things + * for each bit of 0x01000193UL and then expand to the shift + * and add operations explicitly in order to avoid reliance on + * the compiler for this. + * The register pressure generated by this may not be a win + * on i386 vs. actual multiplication, but results remain + * to be seen. 
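+	 *
+	 * For reference, a caller builds a complete FNV-1 name hash out
+	 * of these helpers roughly as follows ("name" and "len" stand
+	 * for any counted string; this is only a sketch of typical use):
+	 *
+	 *	unsigned long hash = init_name_hash();
+	 *	while (len--)
+	 *		hash = partial_name_hash(*name++, hash);
+	 *	hash = end_name_hash(hash);
+	 *
+	 * The explicit shift-and-add expansion mentioned above would be: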
+ * + * prevhash += (prevhash << 24) + * + (prevhash << 8) + * + (prevhash << 7) + * + (prevhash << 4) + * + (prevhash << 1); + * return prevhash ^ c; + */ } -/* Finally: cut down the number of bits to a int value (and try to avoid losing bits) */ +/* + * Finally: cut down the number of bits to a int value (and try to + * avoid losing bits) + */ static __inline__ unsigned long end_name_hash(unsigned long hash) { return (unsigned int) hash; @@ -126,31 +167,6 @@ extern spinlock_t dcache_lock; -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent - * dentry hashes, so that it won't be found through - * a VFS lookup any more. Note that this is different - * from deleting the dentry - d_delete will try to - * mark the dentry negative if possible, giving a - * successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. - * - * d_drop() is used mainly for stuff that wants - * to invalidate a dentry for some reason (NFS - * timeouts or autofs deletes). - */ - -static __inline__ void d_drop(struct dentry * dentry) -{ - spin_lock(&dcache_lock); - list_del(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_hash); - spin_unlock(&dcache_lock); -} - static __inline__ int dname_external(struct dentry *d) { return d->d_name.name != d->d_iname; @@ -275,3 +291,34 @@ #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ + +#if !defined(__LINUX_DCACHE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define __LINUX_DCACHE_H_INLINES + +#ifdef __KERNEL__ +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent + * dentry hashes, so that it won't be found through + * a VFS lookup any more. Note that this is different + * from deleting the dentry - d_delete will try to + * mark the dentry negative if possible, giving a + * successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants + * to invalidate a dentry for some reason (NFS + * timeouts or autofs deletes). 
+ */ + +static __inline__ void d_drop(struct dentry * dentry) +{ + spin_lock(&dcache_lock); + list_del(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_hash); + spin_unlock(&dcache_lock); +} +#endif +#endif diff -urN linux-2.4.17-rc1-virgin/include/linux/elevator.h linux-2.4.17-rc1-wli3/include/linux/elevator.h --- linux-2.4.17-rc1-virgin/include/linux/elevator.h Thu Feb 15 16:58:34 2001 +++ linux-2.4.17-rc1-wli3/include/linux/elevator.h Sat Dec 15 14:54:07 2001 @@ -5,8 +5,9 @@ struct list_head *, struct list_head *, int); -typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *, - struct buffer_head *, int, int); +typedef int (elevator_merge_fn)(request_queue_t *, struct request **, + struct list_head *, struct buffer_head *bh, + int rw, int max_sectors, int max_bomb_segments); typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int); @@ -16,6 +17,7 @@ { int read_latency; int write_latency; + int max_bomb_segments; elevator_merge_fn *elevator_merge_fn; elevator_merge_cleanup_fn *elevator_merge_cleanup_fn; @@ -24,13 +26,13 @@ unsigned int queue_ID; }; -int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_noop_merge_req(struct request *, struct request *); - -int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_linus_merge_req(struct request *, struct request *); +elevator_merge_fn elevator_noop_merge; +elevator_merge_cleanup_fn elevator_noop_merge_cleanup; +elevator_merge_req_fn elevator_noop_merge_req; + +elevator_merge_fn elevator_linus_merge; +elevator_merge_cleanup_fn elevator_linus_merge_cleanup; +elevator_merge_req_fn elevator_linus_merge_req; typedef struct blkelv_ioctl_arg_s { int queue_ID; @@ -54,22 +56,6 @@ #define ELEVATOR_FRONT_MERGE 1 #define ELEVATOR_BACK_MERGE 2 -/* - * This is used in the elevator algorithm. We don't prioritise reads - * over writes any more --- although reads are more time-critical than - * writes, by treating them equally we increase filesystem throughput. - * This turns out to give better overall performance. 
-- sct - */ -#define IN_ORDER(s1,s2) \ - ((((s1)->rq_dev == (s2)->rq_dev && \ - (s1)->sector < (s2)->sector)) || \ - (s1)->rq_dev < (s2)->rq_dev) - -#define BHRQ_IN_ORDER(bh, rq) \ - ((((bh)->b_rdev == (rq)->rq_dev && \ - (bh)->b_rsector < (rq)->sector)) || \ - (bh)->b_rdev < (rq)->rq_dev) - static inline int elevator_request_latency(elevator_t * elevator, int rw) { int latency; @@ -85,7 +71,7 @@ ((elevator_t) { \ 0, /* read_latency */ \ 0, /* write_latency */ \ - \ + 0, /* max_bomb_segments */ \ elevator_noop_merge, /* elevator_merge_fn */ \ elevator_noop_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_noop_merge_req, /* elevator_merge_req_fn */ \ @@ -95,7 +81,7 @@ ((elevator_t) { \ 8192, /* read passovers */ \ 16384, /* write passovers */ \ - \ + 0, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ diff -urN linux-2.4.17-rc1-virgin/include/linux/fs.h linux-2.4.17-rc1-wli3/include/linux/fs.h --- linux-2.4.17-rc1-virgin/include/linux/fs.h Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/include/linux/fs.h Sun Dec 16 18:06:18 2001 @@ -283,7 +283,7 @@ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); -#define touch_buffer(bh) mark_page_accessed(bh->b_page) +#define touch_buffer(bh) touch_page(bh->b_page) #include diff -urN linux-2.4.17-rc1-virgin/include/linux/fs_struct.h linux-2.4.17-rc1-wli3/include/linux/fs_struct.h --- linux-2.4.17-rc1-virgin/include/linux/fs_struct.h Fri Jul 13 15:10:44 2001 +++ linux-2.4.17-rc1-wli3/include/linux/fs_struct.h Fri Dec 14 02:44:44 2001 @@ -20,6 +20,15 @@ extern void exit_fs(struct task_struct *); extern void set_fs_altroot(void); +struct fs_struct *copy_fs_struct(struct fs_struct *old); +void put_fs_struct(struct fs_struct *fs); + +#endif +#endif + +#if !defined(_LINUX_FS_STRUCT_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_FS_STRUCT_H_INLINES +#ifdef __KERNEL__ /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. Requires the big lock held. @@ -65,9 +74,5 @@ mntput(old_pwdmnt); } } - -struct fs_struct *copy_fs_struct(struct fs_struct *old); -void put_fs_struct(struct fs_struct *fs); - #endif #endif diff -urN linux-2.4.17-rc1-virgin/include/linux/highmem.h linux-2.4.17-rc1-wli3/include/linux/highmem.h --- linux-2.4.17-rc1-virgin/include/linux/highmem.h Fri Dec 14 06:04:15 2001 +++ linux-2.4.17-rc1-wli3/include/linux/highmem.h Sun Dec 16 18:05:01 2001 @@ -93,4 +93,15 @@ kunmap_atomic(vto, KM_USER1); } +static inline void copy_highpage(struct page *to, struct page *from) +{ + char *vfrom, *vto; + + vfrom = kmap(from); + vto = kmap(to); + copy_page(vto, vfrom); + kunmap(from); + kunmap(to); +} + #endif /* _LINUX_HIGHMEM_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/lock_break.h linux-2.4.17-rc1-wli3/include/linux/lock_break.h --- linux-2.4.17-rc1-virgin/include/linux/lock_break.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/lock_break.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,84 @@ +/* + * include/linux/lock_break.h - lock breaking routines + * + * since in-kernel preemption can not occur while a lock is held, + * we can just drop and reacquire long-held locks when they are + * in a natural quiescent state to further lower system latency. 
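+ *
+ * typical use in a long lock-held loop looks roughly like the sketch
+ * below (some_lock, more_work() and do_work() are only placeholders,
+ * not names from this patch):
+ *
+ *	spin_lock(&some_lock);
+ *	while (more_work()) {
+ *		do_work();
+ *		if (conditional_schedule_needed())
+ *			break_spin_lock_and_resched(&some_lock);
+ *	}
+ *	spin_unlock(&some_lock);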
+ * + * (C) 2001 Robert Love + * + */ + +#ifndef _LINUX_LOCK_BREAK_H +#define _LINUX_LOCK_BREAK_H + +#include + +/* + * setting this to 1 will instruct debug_lock_break to + * note when the expected lock count does not equal the + * actual count. if the lock count is higher than expected, + * we aren't dropping enough locks. if it is 0, we are + * wasting our time since the system is already preemptible. + */ +#ifndef DEBUG_LOCK_BREAK +#define DEBUG_LOCK_BREAK 0 +#endif + +#ifdef CONFIG_LOCK_BREAK + +#define conditional_schedule_needed() (unlikely(current->need_resched)) + +/* + * setting the task's state to TASK_RUNNING is nothing but paranoia, + * in the case where a task is delinquent in properly putting itself + * to sleep. we should test without it. + */ +#define unconditional_schedule() do { \ + __set_current_state(TASK_RUNNING); \ + schedule(); \ +} while(0) + +#define conditional_schedule() do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ +} while(0) + +#define break_spin_lock(n) do { \ + spin_unlock(n); \ + spin_lock(n); \ +} while(0) + +#define break_spin_lock_and_resched(n) do { \ + spin_unlock(n); \ + conditional_schedule(); \ + spin_lock(n); \ +} while(0) + +#if DEBUG_LOCK_BREAK +#define debug_lock_break(n) do { \ + if (current->preempt_count != n) \ + printk(KERN_ERR "lock_break: %s:%d: count was %d not %d\n", \ + __FILE__, __LINE__, current->preempt_count, n); \ +} while(0) +#else +#define debug_lock_break(n) +#endif + +#define DEFINE_LOCK_COUNT() int _lock_break_count = 0 +#define TEST_LOCK_COUNT(n) (++_lock_break_count > (n)) +#define RESET_LOCK_COUNT() _lock_break_count = 0 + +#else +#define unconditional_schedule() +#define conditional_schedule() +#define conditional_schedule_needed() 0 +#define break_spin_lock(n) +#define break_spin_lock_and_resched(n) +#define debug_lock_break(n) +#define DEFINE_LOCK_COUNT() +#define TEST_LOCK_COUNT(n) 0 +#define RESET_LOCK_COUNT() +#endif + +#endif /* _LINUX_LOCK_BREAK_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/mm.h linux-2.4.17-rc1-wli3/include/linux/mm.h --- linux-2.4.17-rc1-virgin/include/linux/mm.h Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/include/linux/mm.h Sun Dec 16 18:16:02 2001 @@ -19,7 +19,7 @@ extern int page_cluster; /* The inactive_clean lists are per zone. */ extern struct list_head active_list; -extern struct list_head inactive_list; +extern struct list_head inactive_dirty_list; #include #include @@ -121,6 +121,9 @@ */ extern pgprot_t protection_map[16]; +#define ZPR_MAX_BYTES 256*PAGE_SIZE +#define ZPR_NORMAL 0 /* perform zap_page_range request in one walk */ +#define ZPR_PARTITION 1 /* partition into a series of smaller operations */ /* * These are the virtual MM functions - opening of an area, closing and @@ -133,6 +136,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -159,6 +165,8 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ + unsigned long age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */ wait_queue_head_t wait; /* Page locked? Stand in line... */ struct page **pprev_hash; /* Complement to *next_hash. 
*/ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -275,9 +283,9 @@ #define PG_referenced 2 #define PG_uptodate 3 #define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 +#define PG_inactive_clean 5 +#define PG_active 6 +#define PG_inactive_dirty 7 #define PG_slab 8 #define PG_skip 10 #define PG_highmem 11 @@ -325,10 +333,16 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -339,6 +353,23 @@ #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) +#define PageLRU(pp) \ + (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp)) + +/* + * Called whenever the VM references a page. We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't particularly care about the inactive dirty ones because + * we're never sure if those are freeable anyway. + */ +static inline void touch_page(struct page * page) +{ + if (PageInactiveClean(page)) + activate_page(page); + else + SetPageReferenced(page); +} + /* * Error return values for the *_nopage functions */ @@ -404,7 +435,7 @@ extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); diff -urN linux-2.4.17-rc1-virgin/include/linux/mm.h~ linux-2.4.17-rc1-wli3/include/linux/mm.h~ --- linux-2.4.17-rc1-virgin/include/linux/mm.h~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/mm.h~ Sun Dec 16 03:08:39 2001 @@ -0,0 +1,627 @@ +#ifndef _LINUX_MM_H +#define _LINUX_MM_H + +#include +#include + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +extern unsigned long max_mapnr; +extern unsigned long num_physpages; +extern void * high_memory; +extern int page_cluster; +/* The inactive_clean lists are per zone. 
*/ +extern struct list_head active_list; +extern struct list_head inactive_dirty_list; + +#include +#include +#include + +/* + * Linux kernel virtual memory manager primitives. + * The idea being to have a "virtual" mm in the same way + * we have a virtual fs - giving a cleaner interface to the + * mm details, and allowing different kinds of memory mappings + * (from shared memory to executable loading to arbitrary + * mmap() functions). + */ + +/* + * This struct defines a memory VMM memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory + * space that has a special rule for the page-fault handlers (ie a shared + * library, the executable area etc). + */ +struct vm_area_struct { + struct mm_struct * vm_mm; /* The address space we belong to. */ + unsigned long vm_start; /* Our start address within vm_mm. */ + unsigned long vm_end; /* The first byte after our end address + within vm_mm. */ + + /* linked list of VM areas per task, sorted by address */ + struct vm_area_struct *vm_next; + + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + unsigned long vm_flags; /* Flags, listed below. */ + + rb_node_t vm_rb; + + /* + * For areas with an address space and backing store, + * one of the address_space->i_mmap{,shared} lists, + * for shm areas, the list of attaches, otherwise unused. + */ + struct vm_area_struct *vm_next_share; + struct vm_area_struct **vm_pprev_share; + + /* Function pointers to deal with this struct. */ + struct vm_operations_struct * vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ + unsigned long vm_raend; /* XXX: put full readahead info here. */ + void * vm_private_data; /* was vm_pte (shared mem) */ +}; + +/* + * vm_flags.. + */ +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_GROWSUP 0x00000200 +#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_EXECUTABLE 0x00001000 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ + +#define VM_STACK_FLAGS 0x00000177 + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK +#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) + +/* read ahead limits */ +extern int vm_min_readahead; +extern int vm_max_readahead; + +/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. 
+ */ +extern pgprot_t protection_map[16]; + + +/* + * These are the virtual MM functions - opening of an area, closing and + * unmapping it (needed to keep files on disk up-to-date etc), pointer + * to the functions called when a no-page or a wp-page exception occurs. + */ +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); +}; + +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + +/* + * Each physical page in the system has a struct page associated with + * it to keep track of whatever it is we are using the page for at the + * moment. Note that we have no way to track which tasks are using + * a page. + * + * Try to keep the most commonly accessed fields in single cache lines + * here (16 bytes or greater). This ordering should be particularly + * beneficial on 32-bit processors. + * + * The first line is data used in page cache lookup, the second line + * is used for linear searches (eg. clock algorithm scans). + * + * TODO: make this structure smaller, it could be as small as 32 bytes. + */ +typedef struct page { + struct list_head list; /* ->mapping has some page lists. */ + struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long index; /* Our offset within mapping. */ + struct page *next_hash; /* Next page sharing our hash bucket in + the pagecache hash table. */ + atomic_t count; /* Usage count, see below. */ + unsigned long flags; /* atomic flags, some possibly + updated asynchronously */ + struct list_head lru; /* Pageout list, eg. active_list; + protected by pagemap_lru_lock !! */ + unsigned long age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */ + wait_queue_head_t wait; /* Page locked? Stand in line... */ + struct page **pprev_hash; /* Complement to *next_hash. */ + struct buffer_head * buffers; /* Buffer maps us to a disk block. */ + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ + struct zone_struct *zone; /* Memory zone we are in. */ +} mem_map_t; + +/* + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - disk mapping (page->buffers) + * - page mapped in a task's page tables, each mapping + * is counted separately + * + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. + */ +#define get_page(p) atomic_inc(&(p)->count) +#define put_page(p) __free_page(p) +#define put_page_testzero(p) atomic_dec_and_test(&(p)->count) +#define page_count(p) atomic_read(&(p)->count) +#define set_page_count(p,v) atomic_set(&(p)->count, v) + +/* + * Various page->flags bits: + * + * PG_reserved is set for special pages, which can never be swapped + * out. Some of them might not even exist (eg empty_bad_page)... + * + * Multiple processes may "see" the same page. E.g. for untouched + * mappings of /dev/null, all processes see the same page full of + * zeroes, and text pages of executables and shared libraries have + * only one copy in memory, at most, normally. + * + * For the non-reserved pages, page->count denotes a reference count. + * page->count == 0 means the page is free. + * page->count == 1 means the page is used for exactly one purpose + * (e.g. a private data page of one process). 
+ * + * A page may be used for kmalloc() or anyone else who does a + * __get_free_page(). In this case the page->count is at least 1, and + * all other fields are unused but should be 0 or NULL. The + * management of this page is the responsibility of the one who uses + * it. + * + * The other pages (we may call them "process pages") are completely + * managed by the Linux memory manager: I/O, buffers, swapping etc. + * The following discussion applies only to them. + * + * A page may belong to an inode's memory mapping. In this case, + * page->mapping is the pointer to the inode, and page->index is the + * file offset of the page, in units of PAGE_CACHE_SIZE. + * + * A page may have buffers allocated to it. In this case, + * page->buffers is a circular list of these buffer heads. Else, + * page->buffers == NULL. + * + * For pages belonging to inodes, the page->count is the number of + * attaches, plus 1 if buffers are allocated to the page, plus one + * for the page cache itself. + * + * All pages belonging to an inode are in these doubly linked lists: + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; + * using the page->list list_head. These fields are also used for + * freelist managemet (when page->count==0). + * + * There is also a hash table mapping (mapping,index) to the page + * in memory if present. The lists for this hash table use the fields + * page->next_hash and page->pprev_hash. + * + * All process pages can do I/O: + * - inode pages may need to be read from disk, + * - inode pages which have been modified and are MAP_SHARED may need + * to be written to disk, + * - private pages which have been modified may need to be swapped out + * to swap space and (later) to be read back into memory. + * During disk I/O, PG_locked is used. This bit is set before I/O + * and reset when I/O completes. page->wait is a wait queue of all + * tasks waiting for the I/O on this page to complete. + * PG_uptodate tells whether the page's contents is valid. + * When a read completes, the page becomes uptodate, unless a disk I/O + * error happened. + * + * For choosing which pages to swap out, inode pages carry a + * PG_referenced bit, which is set any time the system accesses + * that page through the (mapping,index) hash table. This referenced + * bit, together with the referenced bit in the page tables, is used + * to manipulate page->age and move the page across the active, + * inactive_dirty and inactive_clean lists. + * + * Note that the referenced bit, the page->lru list_head and the + * active, inactive_dirty and inactive_clean lists are protected by + * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit! + * + * PG_skip is used on sparc/sparc64 architectures to "skip" certain + * parts of the address space. + * + * PG_error is set to indicate that an I/O error occurred on this page. + * + * PG_arch_1 is an architecture specific page state bit. The generic + * code guarantees that this bit is cleared for a page when it first + * is entered into the page cache. + * + * PG_highmem pages are not permanently mapped into the kernel virtual + * address space, they need to be kmapped separately for doing IO on + * the pages. The struct page (these bits with information) are always + * mapped into kernel address space... + */ +#define PG_locked 0 /* Page is locked. Don't touch. 
*/ +#define PG_error 1 +#define PG_referenced 2 +#define PG_uptodate 3 +#define PG_dirty 4 +#define PG_inactive_clean 5 +#define PG_active 6 +#define PG_inactive_dirty 7 +#define PG_slab 8 +#define PG_skip 10 +#define PG_highmem 11 +#define PG_checked 12 /* kill me in 2.5.. */ +#define PG_arch_1 13 +#define PG_reserved 14 +#define PG_launder 15 /* written out by VM pressure.. */ + +/* Make it prettier to test the above... */ +#define UnlockPage(page) unlock_page(page) +#define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) +#define SetPageUptodate(page) set_bit(PG_uptodate, &(page)->flags) +#define ClearPageUptodate(page) clear_bit(PG_uptodate, &(page)->flags) +#define PageDirty(page) test_bit(PG_dirty, &(page)->flags) +#define SetPageDirty(page) set_bit(PG_dirty, &(page)->flags) +#define ClearPageDirty(page) clear_bit(PG_dirty, &(page)->flags) +#define PageLocked(page) test_bit(PG_locked, &(page)->flags) +#define LockPage(page) set_bit(PG_locked, &(page)->flags) +#define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags) +#define PageChecked(page) test_bit(PG_checked, &(page)->flags) +#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) +#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) +#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) + +extern void FASTCALL(set_page_dirty(struct page *)); + +/* + * The first mb is necessary to safely close the critical section opened by the + * TryLockPage(), the second mb is necessary to enforce ordering between + * the clear_bit and the read of the waitqueue (to avoid SMP races with a + * parallel wait_on_page). + */ +#define PageError(page) test_bit(PG_error, &(page)->flags) +#define SetPageError(page) set_bit(PG_error, &(page)->flags) +#define ClearPageError(page) clear_bit(PG_error, &(page)->flags) +#define PageReferenced(page) test_bit(PG_referenced, &(page)->flags) +#define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags) +#define ClearPageReferenced(page) clear_bit(PG_referenced, &(page)->flags) +#define PageTestandClearReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags) +#define PageSlab(page) test_bit(PG_slab, &(page)->flags) +#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) +#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) +#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) + +#define PageActive(page) test_bit(PG_active, &(page)->flags) +#define SetPageActive(page) set_bit(PG_active, &(page)->flags) +#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) + +#ifdef CONFIG_HIGHMEM +#define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) +#else +#define PageHighMem(page) 0 /* needed to optimize away at compile time */ +#endif + +#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) +#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) 
+ +#define PageLRU(pp) \ + (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp)) + +/* + * Called whenever the VM references a page. We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't particularly care about the inactive dirty ones because + * we're never sure if those are freeable anyway. + */ +static inline void touch_page(struct page * page) +{ + if (PageInactiveClean(page)) + activate_page(page); + else + SetPageReferenced(page); +} + +/* + * Error return values for the *_nopage functions + */ +#define NOPAGE_SIGBUS (NULL) +#define NOPAGE_OOM ((struct page *) (-1)) + +/* The array of struct pages */ +extern mem_map_t * mem_map; + +/* + * There is only one page-allocator function, and two main namespaces to + * it. The alloc_page*() variants return 'struct page *' and as such + * can allocate highmem pages, the *get*page*() variants return + * virtual kernel addresses to the allocated page(s). + */ +extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); +extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)); +extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); + +static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + /* + * Gets optimized away by the compiler. + */ + if (order >= MAX_ORDER) + return NULL; + return _alloc_pages(gfp_mask, order); +} + +#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) + +extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); + +#define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask),0) + +#define __get_dma_pages(gfp_mask, order) \ + __get_free_pages((gfp_mask) | GFP_DMA,(order)) + +/* + * The old interface name will be removed in 2.5: + */ +#define get_free_page get_zeroed_page + +/* + * There is only one 'core' page-freeing function. 
+ */ +extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); +extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); + +#define __free_page(page) __free_pages((page), 0) +#define free_page(addr) free_pages((addr),0) + +extern void show_free_areas(void); +extern void show_free_areas_node(pg_data_t *pgdat); + +extern void clear_page_tables(struct mm_struct *, unsigned long, int); + +extern int fail_writepage(struct page *); +struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused); +struct file *shmem_file_setup(char * name, loff_t size); +extern void shmem_lock(struct file * file, int lock); +extern int shmem_zero_setup(struct vm_area_struct *); + +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); +extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); +extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); + +extern int vmtruncate(struct inode * inode, loff_t offset); +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); +extern int make_pages_present(unsigned long addr, unsigned long end); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len); +extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len); +extern int ptrace_attach(struct task_struct *tsk); +extern int ptrace_detach(struct task_struct *, unsigned int); +extern void ptrace_disable(struct task_struct *); +extern int ptrace_check_attach(struct task_struct *task, int kill); + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + +/* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc() that does all + * of this out-of-line. 
+ */ +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (pgd_none(*pgd)) + return __pmd_alloc(mm, pgd, address); + return pmd_offset(pgd, address); +} + +extern int pgt_cache_water[2]; +extern int check_pgt_cache(void); + +extern void free_area_init(unsigned long * zones_size); +extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long * zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size); +extern void mem_init(void); +extern void show_mem(void); +extern void si_meminfo(struct sysinfo * val); +extern void swapin_readahead(swp_entry_t); + +extern struct address_space swapper_space; +#define PageSwapCache(page) ((page)->mapping == &swapper_space) + +static inline int is_page_cache_freeable(struct page * page) +{ + return page_count(page) - !!page->buffers == 1; +} + +extern int can_share_swap_page(struct page *); +extern int remove_exclusive_swap_page(struct page *); + +extern void __free_pte(pte_t); + +/* mmap.c */ +extern void lock_vma_mappings(struct vm_area_struct *); +extern void unlock_vma_mappings(struct vm_area_struct *); +extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void __insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void build_mmap_rb(struct mm_struct *); +extern void exit_mmap(struct mm_struct *); + +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff); + +static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret = -EINVAL; + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +out: + return ret; +} + +extern int do_munmap(struct mm_struct *, unsigned long, size_t); + +extern unsigned long do_brk(unsigned long, unsigned long); + +static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags) +{ + if (!vma->vm_file && vma->vm_flags == vm_flags) + return 1; + else + return 0; +} + +struct zone_t; +/* filemap.c */ +extern void remove_inode_page(struct page *); +extern unsigned long page_unuse(struct page *); +extern void truncate_inode_pages(struct address_space *, loff_t); + +/* generic vm_area_ops exported for stackable file systems */ +extern int filemap_sync(struct vm_area_struct *, unsigned long, size_t, unsigned int); +extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); + +/* + * GFP bitmasks.. + */ +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */ +#define __GFP_DMA 0x01 +#define __GFP_HIGHMEM 0x02 + +/* Action modifiers - doesn't change the zoning */ +#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ +#define __GFP_HIGH 0x20 /* Should access emergency pools? */ +#define __GFP_IO 0x40 /* Can start low memory physical IO? */ +#define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */ +#define __GFP_FS 0x100 /* Can call down to low-level FS? 
*/ + +#define GFP_NOHIGHIO (__GFP_HIGH | __GFP_WAIT | __GFP_IO) +#define GFP_NOIO (__GFP_HIGH | __GFP_WAIT) +#define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO) +#define GFP_ATOMIC (__GFP_HIGH) +#define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM) +#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) +#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) + +/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some + platforms, used as appropriate on others */ + +#define GFP_DMA __GFP_DMA + +/* vma is the first one with address < vma->vm_end, + * and even address < vma->vm_start. Have to extend vma. */ +static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) +{ + unsigned long grow; + + /* + * vma->vm_start/vm_end cannot change under us because the caller is required + * to hold the mmap_sem in write mode. We need to get the spinlock only + * before relocating the vma range ourself. + */ + address &= PAGE_MASK; + spin_lock(&vma->vm_mm->page_table_lock); + grow = (vma->vm_start - address) >> PAGE_SHIFT; + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + vma->vm_start = address; + vma->vm_pgoff -= grow; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev); + +/* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. */ +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct * vma = find_vma(mm,start_addr); + + if (vma && end_addr <= vma->vm_start) + vma = NULL; + return vma; +} + +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); + +#endif /* __KERNEL__ */ + +#endif diff -urN linux-2.4.17-rc1-virgin/include/linux/mmzone.h linux-2.4.17-rc1-wli3/include/linux/mmzone.h --- linux-2.4.17-rc1-virgin/include/linux/mmzone.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/mmzone.h Sun Dec 16 18:05:00 2001 @@ -39,12 +39,15 @@ */ spinlock_t lock; unsigned long free_pages; + unsigned long inactive_clean_pages; + unsigned long inactive_dirty_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; /* * free areas of different sizes */ + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -112,9 +115,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; - -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. 
They are here as diff -urN linux-2.4.17-rc1-virgin/include/linux/pagemap.h linux-2.4.17-rc1-wli3/include/linux/pagemap.h --- linux-2.4.17-rc1-virgin/include/linux/pagemap.h Thu Nov 22 11:46:44 2001 +++ linux-2.4.17-rc1-wli3/include/linux/pagemap.h Sun Dec 16 18:16:02 2001 @@ -51,21 +51,17 @@ extern void page_cache_init(unsigned long); /* - * We use a power-of-two hash table to avoid a modulus, - * and get a reasonable hash by knowing roughly how the - * inode pointer and indexes are distributed (ie, we - * roughly know which bits are "significant") - * - * For the time being it will work for struct address_space too (most of - * them sitting inside the inodes). We might want to change it later. + * The multiplicative page cache hash from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 3 describes the behavior of the different page cache hash + * functions. This could be painful without integer multiplies, so + * perhaps for wider portability conditional definitions would win. + * -- wli */ -static inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index) +static inline unsigned long _page_hashfn (struct address_space *mapping, unsigned long index) { -#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1))) -#define s(x) ((x)+((x)>>PAGE_HASH_BITS)) - return s(i+index) & (PAGE_HASH_SIZE-1); -#undef i -#undef s + return ((((unsigned long) mapping + index) * 2654435761UL) >> + (32 - PAGE_HASH_BITS)) & (PAGE_HASH_SIZE - 1); } #define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index)) diff -urN linux-2.4.17-rc1-virgin/include/linux/sched.h linux-2.4.17-rc1-wli3/include/linux/sched.h --- linux-2.4.17-rc1-virgin/include/linux/sched.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/sched.h Mon Dec 17 00:12:14 2001 @@ -26,6 +26,7 @@ #include #include #include +#include struct exec_domain; @@ -88,6 +89,7 @@ #define TASK_UNINTERRUPTIBLE 2 #define TASK_ZOMBIE 4 #define TASK_STOPPED 8 +#define PREEMPT_ACTIVE 0x40000000 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -115,6 +117,21 @@ #define SCHED_OTHER 0 #define SCHED_FIFO 1 #define SCHED_RR 2 +#ifdef CONFIG_RTSCHED +#ifdef CONFIG_MAX_PRI +#if CONFIG_MAX_PRI < 99 +#define MAX_PRI 99 +#elif CONFIG_MAX_PRI > 2047 +#define MAX_PRI 2047 +#else +#define MAX_PRI CONFIG_MAX_PRI +#endif +#else +#define MAX_PRI 127 +#endif +#else +#define MAX_PRI 99 +#endif /* * This is an additional bit set when we want to @@ -154,6 +171,9 @@ #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); +#ifdef CONFIG_PREEMPT +asmlinkage void preempt_schedule(void); +#endif extern int schedule_task(struct tq_struct *task); extern void flush_scheduled_tasks(void); @@ -199,7 +219,9 @@ } /* Maximum number of active map areas.. 
This is a random (large) number */ -#define MAX_MAP_COUNT (65536) +#define DEFAULT_MAX_MAP_COUNT (65536) + +extern int max_map_count; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ @@ -283,7 +305,17 @@ * offsets of these are hardcoded elsewhere - touch with care */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - unsigned long flags; /* per process flags, defined below */ + /* + * We want the preempt_count in this cache line, but we + * a) don't want to mess up the offsets in asm code, and + * b) the alignment of the next line below, + * so we move "flags" down + * + * Also note we don't make preempt_count volatile, but we do + * need to make sure it is never hiding in a register when + * we have an interrupt, so we need to use barrier() + */ + int preempt_count; /* 0=> preemptable, < 0 => BUG */ int sigpending; mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead @@ -319,12 +351,14 @@ * that's just fine.) */ struct list_head run_list; +#ifdef CONFIG_RTSCHED + int counter_recalc; +#endif unsigned long sleep_time; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + unsigned long flags; /* task state */ struct linux_binfmt *binfmt; @@ -401,6 +435,10 @@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; +#ifdef CONFIG_RTSCHED + int effprio; /* effective real time priority */ + void (*newprio)(struct task_struct*, int); +#endif /* Thread group tracking */ u32 parent_exec_id; @@ -517,11 +555,22 @@ extern struct mm_struct init_mm; extern struct task_struct *init_tasks[NR_CPUS]; +/* + * A pid hash function using a prime near golden + * ratio to the machine word size (32 bits). The + * results of this are unknown. + * + * Added shift to extract high-order bits of computed + * hash function. + * -- wli + */ + /* PID hashing. (shouldnt this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) +#define PIDHASH_BITS 10 extern struct task_struct *pidhash[PIDHASH_SZ]; - -#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) +#define pid_hashfn(x) \ + (((2654435761UL*(x)) >> (BITS_PER_LONG-PIDHASH_BITS)) & (PIDHASH_SZ-1)) static inline void hash_pid(struct task_struct *p) { @@ -874,10 +923,16 @@ static inline void del_from_runqueue(struct task_struct * p) { +#ifdef CONFIG_RTSCHED +extern void __del_from_runqueue(struct task_struct * p); + + __del_from_runqueue(p); +#else nr_running--; p->sleep_time = jiffies; list_del(&p->run_list); p->run_list.next = NULL; +#endif } static inline int task_on_runqueue(struct task_struct *p) @@ -925,6 +980,11 @@ mntput(rootmnt); return res; } + +#define _TASK_STRUCT_DEFINED +#include +#include +#include #endif /* __KERNEL__ */ diff -urN linux-2.4.17-rc1-virgin/include/linux/segment_tree.h linux-2.4.17-rc1-wli3/include/linux/segment_tree.h --- linux-2.4.17-rc1-virgin/include/linux/segment_tree.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/segment_tree.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,362 @@ +/* + * linux/include/linux/segment_tree.h + * + * Copyright (C) Oct 2001 William Irwin, IBM + * + * Implementation of segment trees augmented with length information. + * + * In this context, "segment" refers to "line segment". In particular, + * I am storing closed intervals of numbers in this tree. One very + * important invariant maintained is that all the intervals in the + * tree are disjoint. 
This fact is actually used to help with efficient + * search, because since they are all disjoint, they are ordered + * according to any representative, in particular, the starting and + * ending points. + * + * The separate tree on length is used to help with searches for + * intervals of at least a particular length, and does not have + * any special properties otherwise. + */ + +#ifndef _SEGMENT_TREE_H +#define _SEGMENT_TREE_H + +#include +#include + +typedef struct segment_tree_node { + treap_node_t start; + treap_node_t length; +} segment_tree_node_t; + +typedef union segment_buf { + segment_tree_node_t segment; + union segment_buf *next; +} segment_buf_t; + +typedef struct segment_tree_root { + treap_node_t *start_tree; + treap_node_t *length_tree; +} segment_tree_root_t; + +#define segment_length(node) ((node)->length.value) +#define segment_start(node) ((node)->start.value) +#define segment_end(node) ((node)->start.value + (node)->length.value - 1) + +#define segment_above_point(node, point) \ + (segment_end(node) > (point)) + +#define segment_below_point(node, point) \ + (segment_start(node) < (point)) + +#define segment_contains_point(node, point) \ + (segment_start(node) <= (point) && segment_end(node) >= (point)) + +#define segment_above(node1, node2) \ + (segment_start(node1) > segment_end(node2)) + +#define segment_below(node1, node2) \ + (segment_end(node1) < segment_start(node2)) + +#define segment_disjoint(node1, node2) \ + (segment_above(node1, node2) || segment_below(node1, node2)) + +#define segment_intersect(node1, node2) \ + (segment_start(node1) <= segment_end(node2) \ + && segment_start(node2) <= segment_end(node1)) + +#define segment_contains(node1, node2) \ + (segment_start(node1) <= segment_start(node2) \ + && segment_end(node1) >= segment_end(node2)) + +#define segment_set_endpoints(node, start, end) \ + do { \ + segment_length(node) = (end) - (start) + 1; \ + segment_start(node) = (start); \ + } while(0) + +#define segment_unite(node1, node2) \ + segment_set_endpoints(node1, \ + min(segment_start(node1),segment_start(node2)), \ + max(segment_end(node1), segment_end(node2))) + +#define segment_union(seg_union, node1, node2) \ + segment_set_endpoints(seg_union, \ + min(segment_start(node1),segment_start(node2)), \ + max(segment_end(node1), segment_end(node2))) + +#define segment_intersection(intersect, node1, node2) \ + segment_set_endpoints(intersect, \ + max(segment_start(node1), segment_start(node2)), \ + min(segment_end(node1), segment_end(node2))) + +#define segment_set_start(node, start) \ + segment_set_endpoints(node, start, segment_end(node)) + +#define segment_set_end(node, end) \ + segment_set_endpoints(node, segment_start(node), end) + +#define start_segment_treap(node) \ + treap_entry((node), segment_tree_node_t, start) +#define length_segment_treap(node) \ + treap_entry((node), segment_tree_node_t, length) + +#define start_treap(node) segment_start(start_segment_treap(node)) +#define end_treap(node) segment_end(start_segment_treap(node)) + +static inline unsigned segment_tree_contains_point(segment_tree_node_t *root, + unsigned long point) +{ + treap_node_t *node; + + if(!root) + return 0; + + node = &root->start; + while(node) { + if(segment_contains_point(start_segment_treap(node), point)) + return 1; + else if(segment_below_point(start_segment_treap(node), point)) + node = node->right; + else if(segment_above_point(start_segment_treap(node), point)) + node = node->left; + else + BUG(); + } + return 0; +} + +static inline unsigned 
segment_tree_intersects(segment_tree_node_t *root, + segment_tree_node_t *segment) +{ + treap_node_t *node; + + if(!root) + return 0; + + node = &root->start; + while(node) { + if(segment_intersect(start_segment_treap(node), segment)) + return 1; + else if(segment_below(start_segment_treap(node), segment)) + node = node->right; + else if(segment_above(start_segment_treap(node), segment)) + node = node->left; + else + BUG(); + } + return 0; +} + +/* + * There are five cases here. + * (1) the segments are disjoint + * (2) the entire segment is removed + * (3) something from the beginning of the segment is removed + * (4) something from the end of the segment is removed + * (5) the segment is split into two fragments + */ +static inline void segment_complement( segment_tree_node_t **segment, + segment_tree_node_t *to_remove, + segment_tree_node_t **fragment) +{ + + if(segment_disjoint(*segment, to_remove)) { + + *fragment = NULL; + + } else if(segment_contains(to_remove, *segment)) { + + *segment = *fragment = NULL; + + } else if(segment_start(*segment) >= segment_start(to_remove)) { + unsigned long start, end; + *fragment = NULL; + start = segment_end(to_remove) + 1; + end = segment_end(*segment); + segment_set_endpoints(*segment, start, end); + + } else if(segment_end(*segment) <= segment_end(to_remove)) { + unsigned long start, end; + *fragment = NULL; + start = segment_start(*segment); + end = segment_start(to_remove) - 1; + segment_set_endpoints(*segment, start, end); + + } else { + unsigned long start_seg, end_seg, start_frag, end_frag; + + start_seg = segment_start(*segment); + end_seg = segment_start(to_remove) - 1; + + start_frag = segment_end(to_remove) + 1; + end_frag = segment_end(*segment); + + segment_set_endpoints(*segment, start_seg, end_seg); + segment_set_endpoints(*fragment, start_frag, end_frag); + + } +} + +/* + * Efficiently determining all possible line segments which intersect + * with another line segment requires splitting the start treap according + * to the endpoints. This is a derived key so it unfortunately may not be + * shared with the generic treap implementation. 
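+ *
+ * As a worked example of the intended use (see segment_all_intersect()
+ * below), suppose the tree holds the disjoint segments [0,9], [20,29]
+ * and [40,49] and we query for everything intersecting [25,44].  The
+ * end split first peels off the nodes whose end lies below 25 (here
+ * [0,9]); a second split then removes the nodes whose start lies above
+ * 44 (here none).  What remains, [20,29] and [40,49], is exactly the
+ * set of stored segments overlapping [25,44], and the two outer pieces
+ * are joined back together to form the residual tree.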
+ */ +static inline void segment_end_split(treap_root_t root, unsigned long end, + treap_root_t less, treap_root_t more) +{ + treap_root_t tree = root; + treap_node_t sentinel; + + sentinel.value = end; + sentinel.priority = ULONG_MAX; + sentinel.left = sentinel.right = sentinel.parent = NULL; + + while(1) { + if(!*root) { + *root = &sentinel; + goto finish; + } else if(end > end_treap(*root) && !(*root)->right) { + (*root)->right = &sentinel; + sentinel.parent = *root; + root = &(*root)->right; + goto upward; + } else if(end <= end_treap(*root) && !(*root)->left) { + (*root)->left = &sentinel; + sentinel.parent = *root; + root = &(*root)->left; + goto upward; + } else if(end > end_treap(*root)) + root = &(*root)->right; + else /* end <= end_treap(*root) */ + root = &(*root)->left; + } + +upward: + + while(1) { + if((*root)->left && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + goto finish; + else if(!(*root)->parent->parent) + root = tree; + else if((*root)->parent->parent->left == (*root)->parent) + root = &(*root)->parent->parent->left; + else if((*root)->parent->parent->right == (*root)->parent) + root = &(*root)->parent->parent->right; + } + +finish: + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + if(*more) (*more)->parent = NULL; + + *root = NULL; +} + +#define segment_length_link(node) \ + treap_node_link(&start_segment_treap(node)->length) + +#define segment_start_link(node) \ + treap_node_link(&start_segment_treap(node)->start) + +#define segment_delete(node) \ + do { \ + treap_root_delete(segment_start_link(node)); \ + treap_root_delete(segment_length_link(node)); \ + } while(0) + +static inline void segment_all_intersect(treap_root_t root, + unsigned long start, + unsigned long end, + treap_root_t intersect) +{ + treap_node_t *less_end, *more_end, *more_start, *less_start; + less_start = more_start = NULL; + + if(start) { + less_end = more_end = NULL; + segment_end_split(root, start, &less_end, &more_end); + treap_split(&more_end, end + 1, &less_start, &more_start); + *root = NULL; + treap_join(root, &less_end, &more_start); + } else { + treap_split(root, end + 1, &less_start, &more_start); + *root = more_start; + } + *intersect = less_start; +} + +#if 0 +/* + * If for some reason there is a reason to visualize the trees, + * the following routines may be useful examples as to how they + * may be rendered using dot from AT&T's graphviz. + */ + +extern void early_printk(const char *fmt, ...); + +static void print_ptr_graph(treap_root_t root) { + if(!*root) + return; + else if(!(*root)->marker) { + segment_tree_node_t *seg = start_segment_treap(*root); + (*root)->marker = 1UL; + early_printk("x%p [label=\"%p, start=%lu,\\nlength=%lu\"];\n", + *root, *root, segment_start(seg), segment_length(seg)); + if((*root)->parent) + early_printk("x%p -> x%p [label=\"parent\"];\n", + *root, (*root)->parent); + if((*root)->left) + early_printk("x%p -> x%p [label=\"left\"];\n", + *root, (*root)->left); + if((*root)->right) + early_printk("x%p -> x%p [label=\"right\"];\n", + *root, (*root)->right); + + print_ptr_graph(&(*root)->parent); + print_ptr_graph(&(*root)->left); + print_ptr_graph(&(*root)->right); + (*root)->marker = 0UL; + } + /* + * This is no good for cycle detection since we also traverse + * the parent links. It's -very- cyclic with those. 
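+ *
+ * To actually render the output, collect the node and edge statements
+ * printed above inside a "digraph G { ... }" wrapper and feed the file
+ * to dot (for example "dot -Tps tree.dot"), assuming graphviz is
+ * installed.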
+ */ +} +static void print_length_graph(treap_root_t root) { + if(!*root) + return; + else if(!(*root)->marker) { + segment_tree_node_t *seg = length_segment_treap(*root); + (*root)->marker = 1UL; + early_printk("x%p [label=\"%p: start=%lu,\\nlength=%lu\"];\n", + *root, *root, segment_start(seg), segment_length(seg)); + if((*root)->parent) + early_printk("x%p -> x%p [label=\"parent\"];\n", + *root, (*root)->parent); + if((*root)->left) + early_printk("x%p -> x%p [label=\"left\"];\n", + *root, (*root)->left); + if((*root)->right) + early_printk("x%p -> x%p [label=\"right\"];\n", + *root, (*root)->right); + + print_length_graph(&(*root)->parent); + print_length_graph(&(*root)->left); + print_length_graph(&(*root)->right); + (*root)->marker = 0UL; + } +} +#endif + +#endif /* _SEGMENT_TREE_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/smp.h linux-2.4.17-rc1-wli3/include/linux/smp.h --- linux-2.4.17-rc1-virgin/include/linux/smp.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/smp.h Sun Dec 16 18:05:00 2001 @@ -81,7 +81,9 @@ #define smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_threads_ready 1 +#ifndef CONFIG_PREEMPT #define kernel_lock() +#endif #define cpu_logical_map(cpu) 0 #define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) diff -urN linux-2.4.17-rc1-virgin/include/linux/smp_lock.h linux-2.4.17-rc1-wli3/include/linux/smp_lock.h --- linux-2.4.17-rc1-virgin/include/linux/smp_lock.h Thu Nov 22 11:46:27 2001 +++ linux-2.4.17-rc1-wli3/include/linux/smp_lock.h Sun Dec 16 18:16:02 2001 @@ -3,7 +3,7 @@ #include -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) diff -urN linux-2.4.17-rc1-virgin/include/linux/spinlock.h linux-2.4.17-rc1-wli3/include/linux/spinlock.h --- linux-2.4.17-rc1-virgin/include/linux/spinlock.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/spinlock.h Sun Dec 16 18:05:00 2001 @@ -2,6 +2,7 @@ #define __LINUX_SPINLOCK_H #include +#include /* * These are the generic versions of the spinlocks and read-write @@ -45,8 +46,10 @@ #if (DEBUG_SPINLOCKS < 1) +#ifndef CONFIG_PREEMPT #define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic) #define ATOMIC_DEC_AND_LOCK +#endif /* * Your basic spinlocks, allowing only a single CPU anywhere @@ -62,11 +65,11 @@ #endif #define spin_lock_init(lock) do { } while(0) -#define spin_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_spin_lock(lock) (void)(lock) /* Not "unused variable". */ #define spin_is_locked(lock) (0) -#define spin_trylock(lock) ({1; }) +#define _raw_spin_trylock(lock) ({1; }) #define spin_unlock_wait(lock) do { } while(0) -#define spin_unlock(lock) do { } while(0) +#define _raw_spin_unlock(lock) do { } while(0) #elif (DEBUG_SPINLOCKS < 2) @@ -125,12 +128,76 @@ #endif #define rwlock_init(lock) do { } while(0) -#define read_lock(lock) (void)(lock) /* Not "unused variable". */ -#define read_unlock(lock) do { } while(0) -#define write_lock(lock) (void)(lock) /* Not "unused variable". */ -#define write_unlock(lock) do { } while(0) +#define _raw_read_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_read_unlock(lock) do { } while(0) +#define _raw_write_lock(lock) (void)(lock) /* Not "unused variable". 
*/ +#define _raw_write_unlock(lock) do { } while(0) #endif /* !SMP */ + +#ifdef CONFIG_PREEMPT + +#define preempt_is_disabled() (current->preempt_count) +#define preempt_prefetch(a) prefetchw(a) + +#define preempt_disable() \ +do { \ + ++current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched() \ +do { \ + --current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable() \ +do { \ + --current->preempt_count; \ + barrier(); \ + if (unlikely((current->preempt_count == 0) && current->need_resched)) \ + preempt_schedule(); \ +} while (0) + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while(0) +#define spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _raw_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define read_lock(lock) ({preempt_disable(); _raw_read_lock(lock);}) +#define read_unlock(lock) ({_raw_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _raw_write_lock(lock);}) +#define write_unlock(lock) ({_raw_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable(); _raw_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) + +#else + +#define preempt_is_disabled() do { } while (0) +#define preempt_disable() do { } while (0) +#define preempt_enable_no_resched() +#define preempt_enable() do { } while (0) +#define preempt_prefetch(a) + +#define spin_lock(lock) _raw_spin_lock(lock) +#define spin_trylock(lock) _raw_spin_trylock(lock) +#define spin_unlock(lock) _raw_spin_unlock(lock) + +#define read_lock(lock) _raw_read_lock(lock) +#define read_unlock(lock) _raw_read_unlock(lock) +#define write_lock(lock) _raw_write_lock(lock) +#define write_unlock(lock) _raw_write_unlock(lock) +#define write_trylock(lock) _raw_write_trylock(lock) +#endif /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urN linux-2.4.17-rc1-virgin/include/linux/swap.h linux-2.4.17-rc1-wli3/include/linux/swap.h --- linux-2.4.17-rc1-virgin/include/linux/swap.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/swap.h Sun Dec 16 18:05:00 2001 @@ -86,8 +86,8 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; -extern int nr_inactive_pages; -extern atomic_t nr_async_pages; +extern int nr_inactive_dirty_pages; +extern int nr_inactive_clean_pages; extern atomic_t page_cache_size; extern atomic_t buffermem_pages; extern spinlock_t pagecache_lock; @@ -100,18 +100,42 @@ struct zone_t; +/* linux/mm/rmap.c */ +extern int FASTCALL(page_referenced(struct page *)); +extern void FASTCALL(page_add_rmap(struct page *, pte_t *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); + +/* try_to_unmap return values */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ +extern int total_swap_pages; extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * 
FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern void wakeup_kswapd(void); +extern int free_shortage(void); +extern int total_free_shortage(void); +extern int inactive_shortage(void); +extern int total_inactive_shortage(void); +extern unsigned int zone_free_shortage(zone_t *zone); +extern unsigned int zone_inactive_shortage(zone_t *zone); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -125,6 +149,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); @@ -158,7 +183,14 @@ extern spinlock_t pagemap_lru_lock; -extern void FASTCALL(mark_page_accessed(struct page *)); +/* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 /* * List add/del helper macros. These must be called @@ -166,39 +198,60 @@ */ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ - BUG(); \ if (PageActive(page)) \ BUG(); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ + BUG(); \ } while (0) -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ -} while (0) +#define add_page_to_active_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageActive(page); \ + list_add(&(page)->lru, &active_list); \ + nr_active_pages++; \ +} + +#define add_page_to_inactive_dirty_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactiveDirty(page); \ + list_add(&(page)->lru, &inactive_dirty_list); \ + nr_inactive_dirty_pages++; \ + page->zone->inactive_dirty_pages++; \ +} + +#define add_page_to_inactive_clean_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactiveClean(page); \ + list_add(&(page)->lru, &page->zone->inactive_clean_list); \ + page->zone->inactive_clean_pages++; \ + nr_inactive_clean_pages++; \ +} + +#define del_page_from_active_list(page) { \ + list_del(&(page)->lru); \ + ClearPageActive(page); \ + nr_active_pages--; \ + DEBUG_LRU_PAGE(page); \ +} + +#define del_page_from_inactive_dirty_list(page) { \ + list_del(&(page)->lru); \ + ClearPageInactiveDirty(page); \ + nr_inactive_dirty_pages--; \ + page->zone->inactive_dirty_pages--; \ + DEBUG_LRU_PAGE(page); \ +} + +#define del_page_from_inactive_clean_list(page) { \ + list_del(&(page)->lru); \ + ClearPageInactiveClean(page); \ + page->zone->inactive_clean_pages--; \ + nr_inactive_clean_pages--; \ + DEBUG_LRU_PAGE(page); \ +} -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - nr_inactive_pages--; \ -} while (0) extern spinlock_t swaplock; diff -urN linux-2.4.17-rc1-virgin/include/linux/swapctl.h linux-2.4.17-rc1-wli3/include/linux/swapctl.h --- linux-2.4.17-rc1-virgin/include/linux/swapctl.h Mon Sep 17 
16:15:02 2001 +++ linux-2.4.17-rc1-wli3/include/linux/swapctl.h Fri Dec 14 02:44:20 2001 @@ -10,4 +10,13 @@ typedef pager_daemon_v1 pager_daemon_t; extern pager_daemon_t pager_daemon; +typedef struct freepages_v1 +{ + unsigned int min; + unsigned int low; + unsigned int high; +} freepages_v1; +typedef freepages_v1 freepages_t; +extern freepages_t freepages; + #endif /* _LINUX_SWAPCTL_H */ diff -urN linux-2.4.17-rc1-virgin/include/linux/sysctl.h linux-2.4.17-rc1-wli3/include/linux/sysctl.h --- linux-2.4.17-rc1-virgin/include/linux/sysctl.h Mon Nov 26 05:29:17 2001 +++ linux-2.4.17-rc1-wli3/include/linux/sysctl.h Sun Dec 16 18:05:00 2001 @@ -140,6 +140,7 @@ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_MAX_MAP_COUNT=11, /* int: Maximum number of active map areas */ VM_MIN_READAHEAD=12, /* Min file readahead */ VM_MAX_READAHEAD=13 /* Max file readahead */ }; diff -urN linux-2.4.17-rc1-virgin/include/linux/tqueue.h linux-2.4.17-rc1-wli3/include/linux/tqueue.h --- linux-2.4.17-rc1-virgin/include/linux/tqueue.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc1-wli3/include/linux/tqueue.h Sun Dec 16 18:05:00 2001 @@ -94,6 +94,22 @@ extern spinlock_t tqueue_lock; /* + * Call all "bottom halfs" on a given list. + */ + +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + __run_task_queue(list); +} + +#endif /* _LINUX_TQUEUE_H */ + +#if !defined(_LINUX_TQUEUE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_TQUEUE_H_INLINES +/* * Queue a task on a tq. Return non-zero if it was successfully * added. */ @@ -109,17 +125,4 @@ } return ret; } - -/* - * Call all "bottom halfs" on a given list. - */ - -extern void __run_task_queue(task_queue *list); - -static inline void run_task_queue(task_queue *list) -{ - if (TQ_ACTIVE(*list)) - __run_task_queue(list); -} - -#endif /* _LINUX_TQUEUE_H */ +#endif diff -urN linux-2.4.17-rc1-virgin/include/linux/treap.h linux-2.4.17-rc1-wli3/include/linux/treap.h --- linux-2.4.17-rc1-virgin/include/linux/treap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/include/linux/treap.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,300 @@ +/* + * linux/include/linux/treap.h + * + * Copyright (C) 2001 William Irwin, IBM + * + * Simple treap implementation, following Aragon and Seidel. + * + * Treaps are a simple binary search tree structure, with a twist that + * radically simplifies their management. That is that they keep both + * the search key and a randomly generated priority. They are then both + * heap-ordered according to the priority and binary search tree ordered + * according to the search keys. They are specifically designed for, and + * also reputed to be effective at range tree and segment tree structures + * according to both Knuth and dynamic sets according to the + * Blelloch/Reid-Miller paper. + * + * The rotations themselves are simple, and they are done less often + * than for some kinds of trees, where splay trees where specifically + * mentioned by Knuth. The decision process as to when to perform a + * rotation is simplified by the heap structure. Rotations are done in + * two instances: when rotating a node down to a leaf position before + * deletion, and in restoring the heap ordering after an insertion. + * + * Treaps also support fast splitting and joining operations, which + * make them convenient for interval searches. 
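+ *
+ * As a small illustration of the two orderings, suppose keys 5, 2 and 8
+ * are inserted with (randomly drawn) priorities 7, 3 and 9.  Key 8 ends
+ * up at the root because it holds the largest priority, key 5 becomes
+ * its left child and key 2 the left child of 5; an in-order walk still
+ * visits 2, 5, 8.  Because the priorities are random, the expected
+ * depth stays logarithmic in the number of nodes.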
+ * + * One important fact to observe is that when joining, all of the + * members of the left tree must be less than all the members of + * the right tree, or otherwise the search tree ordering breaks. + */ + +#ifndef _TREAP_H +#define _TREAP_H + +#include + +typedef struct treap_node { + unsigned long priority; + unsigned long value; + struct treap_node *left, *right, *parent; + unsigned long marker; +} treap_node_t; + +typedef treap_node_t **treap_root_t; + +#define TREAP_INIT(root) \ + do { \ + *root = NULL; \ + } while(0) + +#define treap_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +#define treap_node_link(node) \ + ((!(node) || !(node)->parent) ? NULL : \ + ((node) == (node)->parent->left) ? &(node)->parent->left \ + : &(node)->parent->right) + +#define treap_find_parent_and_remove_child(tmp, parent) \ + do { \ + parent = tmp->parent; \ + if(parent && parent->left == tmp) \ + parent->left = NULL; \ + else if(parent && parent->right == tmp) \ + parent->right = NULL; \ + else if(parent) \ + BUG(); \ + } while(0) + + +#define treap_find_leftmost_leaf(node) \ + do { \ + if(!node) \ + break; \ + while(1) { \ + if(node->left) \ + node = node->left; \ + else if(node->right) \ + node = node->right; \ + else \ + break; \ + } \ + } while(0) + +/* + * The diagram according to which the assignments in rotation are done: + * + * T T + * | | + * y <- left x + * / \ / \ + * x C right -> A y + * / \ / \ + * A B B C + * + * Some of these assignments are not necessary, as the edges do + * not change. In these cases the assignments are retained as comments. + */ + +static inline void treap_rotate_left(treap_root_t root) +{ + treap_node_t *x, *y, *B, *T; + /* treap_node_t *A, *C; */ + + if(*root) { + x = *root; + T = x->parent; + y = x->right; + if(y) { + if(T && T->left == x) T->left = y; + if(T && T->right == x) T->right = y; + + y->parent = T; + *root = y; + + /* A = x->left; */ + + B = y->left; + + /* C = y->right; */ + + y->left = x; + x->parent = y; + + /* + x->left = A; + if(A) A->parent = x; + */ + + x->right = B; + if(B) B->parent = x; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +static inline void treap_rotate_right(treap_root_t root) +{ + treap_node_t *x, *y, *B, *T; + /* treap_node_t *A, *C; */ + + if(*root) { + y = *root; + T = y->parent; + x = y->left; + if(x) { + if(T && T->left == y) T->left = x; + if(T && T->right == y) T->right = x; + + x->parent = T; + *root = x; + + /* A = x->left; */ + + B = x->right; + + /* C = y->right; */ + + x->right = y; + y->parent = x; + + /* + x->left = A; + if(A) A->parent = x; + */ + + y->left = B; + if(B) B->parent = y; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +static inline treap_node_t *treap_root_delete(treap_root_t root) +{ + struct treap_node *tmp; + + while(1) { + + if(!root || !*root) return NULL; + else if(!(*root)->left && !(*root)->right) { + tmp = *root; + *root = tmp->parent = NULL; + return tmp; + } else if(!(*root)->left) { + treap_rotate_left(root); + root = &(*root)->left; + } else if(!(*root)->right) { + treap_rotate_right(root); + root = &(*root)->right; + } else if((*root)->left->priority > (*root)->right->priority) { + treap_rotate_right(root); + root = &(*root)->right; + } else { + treap_rotate_left(root); + root = &(*root)->left; + } + } +} + +static inline void treap_insert(treap_root_t root, treap_node_t *node) +{ + treap_root_t tree = root; + node->left = node->right = node->parent = NULL; + + while(1) { + if(!*root) { + *root = node; + 
return; + } else if(node->value <= (*root)->value && !(*root)->left) { + (*root)->left = node; + node->parent = *root; + root = &(*root)->left; + break; + } else if(node->value > (*root)->value && !(*root)->right) { + (*root)->right = node; + node->parent = *root; + root = &(*root)->right; + break; + } else if(node->value <= (*root)->value) { + root = &(*root)->left; + } else { /* node->value > (*root)->value */ + root = &(*root)->right; + } + } + while(1) { + if(!*root) return; + else if((*root)->left + && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + return; + else if(!(*root)->parent->parent) + root = tree; + else if((*root)->parent == (*root)->parent->parent->left) + root = &(*root)->parent->parent->left; + else if((*root)->parent == (*root)->parent->parent->right) + root = &(*root)->parent->parent->right; + + } +} + +static inline treap_node_t *treap_delete(treap_root_t root, unsigned long k) +{ + while(1) { + if(!*root) return NULL; + else if(k < (*root)->value) root = &(*root)->left; + else if(k > (*root)->value) root = &(*root)->right; + else return treap_root_delete(root); + } +} + +static inline void treap_split(treap_root_t root, unsigned long k, + treap_root_t less, treap_root_t more) +{ + treap_node_t sentinel; + + sentinel.value = k; + sentinel.priority = ULONG_MAX; + sentinel.parent = sentinel.left = sentinel.right = NULL; + + treap_insert(root, &sentinel); + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + if(*more) (*more)->parent = NULL; + + *root = NULL; +} + +static inline void treap_join(treap_root_t root, + treap_root_t left, treap_root_t right) +{ + treap_node_t sentinel; + sentinel.priority = 0UL; + sentinel.left = *left; + sentinel.right = *right; + sentinel.parent = NULL; + + if(*left) (*left)->parent = &sentinel; + if(*right) (*right)->parent = &sentinel; + + *root = &sentinel; + treap_root_delete(root); +} + +#endif /* _TREAP_H */ diff -urN linux-2.4.17-rc1-virgin/kernel/exit.c linux-2.4.17-rc1-wli3/kernel/exit.c --- linux-2.4.17-rc1-virgin/kernel/exit.c Wed Nov 21 14:42:27 2001 +++ linux-2.4.17-rc1-wli3/kernel/exit.c Sun Dec 16 17:58:10 2001 @@ -190,6 +190,8 @@ } i++; set >>= 1; + debug_lock_break(1); + conditional_schedule(); } } } @@ -273,6 +275,10 @@ struct mm_struct * start_lazy_tlb(void) { struct mm_struct *mm = current->mm; +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif current->mm = NULL; /* active_mm is still 'mm' */ atomic_inc(&mm->mm_count); @@ -284,6 +290,10 @@ { struct mm_struct *active_mm = current->active_mm; +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif current->mm = mm; if (mm != active_mm) { current->active_mm = mm; @@ -307,8 +317,8 @@ /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; - task_unlock(tsk); enter_lazy_tlb(mm, current, smp_processor_id()); + task_unlock(tsk); mmput(mm); } } diff -urN linux-2.4.17-rc1-virgin/kernel/fork.c linux-2.4.17-rc1-wli3/kernel/fork.c --- linux-2.4.17-rc1-virgin/kernel/fork.c Wed Nov 21 10:18:42 2001 +++ linux-2.4.17-rc1-wli3/kernel/fork.c Fri Dec 14 04:38:23 2001 @@ -260,9 +260,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; 
spin_unlock(&mmlist_lock); @@ -604,6 +601,12 @@ if (p->binfmt && p->binfmt->module) __MOD_INC_USE_COUNT(p->binfmt->module); +#ifdef CONFIG_PREEMPT + /* Since we are keeping the context switch off state as part + * of the context, make sure we start with it off. + */ + p->preempt_count = 1; +#endif p->did_exec = 0; p->swappable = 0; p->state = TASK_UNINTERRUPTIBLE; @@ -649,8 +652,6 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) @@ -682,10 +683,20 @@ * more scheduling fairness. This is only important in the first * timeslice, on the long run the scheduling behaviour is unchanged. */ + /* + * SCHED_FIFO tasks don't count down and have a negative counter. + * Don't change these, least they all end up at -1. + */ +#ifdef CONFIG_RTSCHED + if (p->policy != SCHED_FIFO) +#endif + { + p->counter = (current->counter + 1) >> 1; current->counter >>= 1; if (!current->counter) current->need_resched = 1; + } /* * Ok, add it to the run-queues and make it diff -urN linux-2.4.17-rc1-virgin/kernel/ksyms.c linux-2.4.17-rc1-wli3/kernel/ksyms.c --- linux-2.4.17-rc1-virgin/kernel/ksyms.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/ksyms.c Fri Dec 14 02:44:44 2001 @@ -436,6 +436,9 @@ EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(preempt_schedule); +#endif EXPORT_SYMBOL(schedule_timeout); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); diff -urN linux-2.4.17-rc1-virgin/kernel/ptrace.c linux-2.4.17-rc1-wli3/kernel/ptrace.c --- linux-2.4.17-rc1-virgin/kernel/ptrace.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/ptrace.c Fri Dec 14 04:06:29 2001 @@ -121,17 +121,119 @@ } /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space, one page at a time. */ +static int access_one_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + pgd_t * pgdir; + pmd_t * pgmiddle; + pte_t * pgtable; + char *maddr; + struct page *page; + +repeat: + spin_lock(&mm->page_table_lock); + pgdir = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgdir)) + goto fault_in_page; + if (pgd_bad(*pgdir)) + goto bad_pgd; + pgmiddle = pmd_offset(pgdir, addr); + if (pmd_none(*pgmiddle)) + goto fault_in_page; + if (pmd_bad(*pgmiddle)) + goto bad_pmd; + pgtable = pte_offset(pgmiddle, addr); + if (!pte_present(*pgtable)) + goto fault_in_page; + if (write && (!pte_write(*pgtable) || !pte_dirty(*pgtable))) + goto fault_in_page; + page = pte_page(*pgtable); + + /* ZERO_PAGE is special: reads from it are ok even though it's marked reserved */ + if (page != ZERO_PAGE(addr) || write) { + if ((!VALID_PAGE(page)) || PageReserved(page)) { + spin_unlock(&mm->page_table_lock); + return 0; + } + } + get_page(page); + spin_unlock(&mm->page_table_lock); + flush_cache_page(vma, addr); + + if (write) { + maddr = kmap(page); + memcpy(maddr + (addr & ~PAGE_MASK), buf, len); + flush_page_to_ram(page); + flush_icache_page(vma, page); + kunmap(page); + } else { + maddr = kmap(page); + memcpy(buf, maddr + (addr & ~PAGE_MASK), len); + flush_page_to_ram(page); + kunmap(page); + } + put_page(page); + return len; + +fault_in_page: + spin_unlock(&mm->page_table_lock); + /* -1: out of memory. 
0 - unmapped page */ + if (handle_mm_fault(mm, vma, addr, write) > 0) + goto repeat; + return 0; + +bad_pgd: + spin_unlock(&mm->page_table_lock); + pgd_ERROR(*pgdir); + return 0; + +bad_pmd: + spin_unlock(&mm->page_table_lock); + pmd_ERROR(*pgmiddle); + return 0; +} + +static int access_mm(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + int copied = 0; + + for (;;) { + unsigned long offset = addr & ~PAGE_MASK; + int this_len = PAGE_SIZE - offset; + int retval; + + if (this_len > len) + this_len = len; + retval = access_one_page(mm, vma, addr, buf, this_len, write); + copied += retval; + if (retval != this_len) + break; + + len -= retval; + if (!len) + break; + + addr += retval; + buf += retval; + + if (addr < vma->vm_end) + continue; + if (!vma->vm_next) + break; + if (vma->vm_next->vm_start != vma->vm_end) + break; + + vma = vma->vm_next; + } + return copied; +} int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) { + int copied; struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; + struct vm_area_struct * vma; /* Worry about races with exit() */ task_lock(tsk); @@ -143,41 +245,14 @@ return 0; down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(current, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; + vma = find_extend_vma(mm, addr); + copied = 0; + if (vma) + copied = access_mm(mm, vma, addr, buf, len, write); - flush_cache_page(vma, addr); - - maddr = kmap(page); - if (write) { - memcpy(maddr + offset, buf, bytes); - flush_page_to_ram(page); - flush_icache_page(vma, page); - } else { - memcpy(buf, maddr + offset, bytes); - flush_page_to_ram(page); - } - kunmap(page); - put_page(page); - len -= bytes; - buf += bytes; - } up_read(&mm->mmap_sem); mmput(mm); - - return buf - old_buf; + return copied; } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len) diff -urN linux-2.4.17-rc1-virgin/kernel/rtsched.h linux-2.4.17-rc1-wli3/kernel/rtsched.h --- linux-2.4.17-rc1-virgin/kernel/rtsched.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/kernel/rtsched.h Sun Dec 16 18:04:59 2001 @@ -0,0 +1,1218 @@ +/* + * linux/kernel/rtsched.h + * + * NOTE: This is a .h file that is mostly source, not the usual convention. + * It is coded this way to allow the depend rules to correctly set + * up the make file dependencies. This is an alternate scheduler + * that replaces the core scheduler in sched.c. It does not, however, + * replace most of the static support functions that call schedule. + * By making this an include file for sched.c, all of those functions + * are retained without the need for duplicate code and its attendant + * support issues. At the same time, keeping it a seperate file allows + * diff and patch to work most cleanly and correctly. + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001 MontaVista Software Inc. + * + * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 2000-03-15 Added the Real Time run queue support by George Anzinger + * 2000-8-29 Added code to do lazy recalculation of counters + * by George Anzinger + */ + +/* + * 'sched.c' is the main kernel file. 
It contains scheduling primitives + * (sleep_on, wakeup, schedule etc) as well as a number of simple system + * call functions (type getpid()), which just extract a field from + * current-task + */ + +#ifndef preempt_disable +#define preempt_disable() +#define preempt_enable() +#define preempt_is_disabled() 0 +#define preempt_enable_no_resched() +#endif + +/* + * scheduler variables + */ +#define VERSION_DATE "<20011203.1609.50>" +/* + * We align per-CPU scheduling data on cacheline boundaries, + * to prevent cacheline ping-pong. + */ +static union { + struct schedule_data { + struct task_struct * curr; + cycles_t last_schedule; + struct list_head schedule_data_list; + int cpu,effprio; + } schedule_data; + char __pad [SMP_CACHE_BYTES]; +} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0,{0,0},0,0}}}; + +#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr +static void newprio_ready_q(struct task_struct * tptr,int newprio); +#ifdef CONFIG_SMP +static void newprio_executing(struct task_struct *tptr,int newprio); +static struct list_head hed_cpu_prio __cacheline_aligned = + LIST_HEAD_INIT(hed_cpu_prio); +#endif +/* + * task_on_rq tests for task actually in the ready queue. + * task_on_runque tests for task either on ready queue or being executed + * (by virtue of our seting a running tasks run_list.next to 1) + */ +#define task_on_rq(p) ((unsigned)p->run_list.next > 1) + +static struct list_head rq[MAX_PRI+1] ____cacheline_aligned; + +static struct ready_queue { + int recalc; /* # of counter recalculations on SCHED_OTHER */ + int ticks; /* # of ticks for all in SCHED_OTHER ready Q */ +} runq ____cacheline_aligned; + +/* set the bit map up with guard bits below. This will result in + * priority -1 if there are no tasks in the ready queue which will + * happen as we are not putting the idle tasks in the ready queue. + */ +static struct { + int guard; + int rq_bit_ary[(MAX_PRI/32) +1]; +}rq_bits = {-1,{0,0,0,0}}; +#define rq_bit_map rq_bits.rq_bit_ary + +static int high_prio=0; + +#define Rdy_Q_Hed(pri) &rq[pri] + +#define PREEMPTION_THRESHOLD 1 + +#define NOT_RT 0 /* Use priority zero for non-RT processes */ +#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule + +struct kernel_stat kstat; + +#ifdef CONFIG_SMP + +/* + * At the moment, we will ignor cpus_allowed, primarily because if it were + * used, we would have a conflict in the runq.ticks count (i.e. since we + * are not scheduleing some tasks, the count would not reflect what is + * is really on the list). Oh, and also, nowhere is there code in the + * kernel to set cpus_allowed to anything but -1. In the long run, we + * would like to try seperate lists for each cpu, at which point + * cpus_allowed could be used to direct the task to the proper list. + + * Well, darn, now there is code that messes with cpus_allowed. We will change + * sometime soon.... + */ + +#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) +#define can_schedule(p,cpu) \ + ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) + +#else + +#define idle_task(cpu) (&init_task) +#define can_schedule(p,cpu) (1) + +#endif + +void scheduling_functions_start_here(void) { } + +/* + * This is the function that decides how desirable a process is.. + * You can weigh different processes against each other depending + * on what CPU they've run on lately etc to try to handle cache + * and TLB miss penalties. 
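+ *
+ * As a rough illustration on UP (the SMP same-processor bonus aside):
+ * a SCHED_OTHER task with, say, counter 6 and nice 0 scores 6 + 20 = 26,
+ * or 27 if it also shares the current mm, while a task whose counter
+ * has run down to 0 scores 0, which is the signal that the counters
+ * need to be recalculated.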
+ * + * Return values: + * -1000: never select this + * 0: out of time, recalculate counters (but it might still be + * selected) + * +ve: "goodness" value (the larger, the better) + */ + +static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +{ + int weight; + + /* + * goodness is NEVER called for Realtime processes! + * Realtime process, select the first one on the + * runqueue (taking priorities within processes + * into account). + + */ + /* + * Give the process a first-approximation goodness value + * according to the number of clock-ticks it has left. + * + * Don't do any other calculations if the time slice is + * over or if this is an idle task. + */ + weight = p->counter; + if (weight <= 0) + goto out; + +#ifdef CONFIG_SMP + /* Give a largish advantage to the same processor... */ + /* (this is equivalent to penalizing other processors) */ + if (p->processor == this_cpu) + weight += PROC_CHANGE_PENALTY; +#endif + + /* .. and a slight advantage to the current MM */ + if (p->mm == this_mm || !p->mm) + weight += 1; + weight += 20 - p->nice; + +out: + return weight; +} + +/* + * the 'goodness value' of replacing a process on a given CPU. + * positive value means 'replace', zero or negative means 'dont'. + */ +static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +{ + return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); +} + +/* + * This is ugly, but reschedule_idle() is very timing-critical. + * We are called with the runqueue spinlock held and we must + * not claim the tasklist_lock. + */ +static FASTCALL(void reschedule_idle(struct task_struct * p)); + +static void reschedule_idle(struct task_struct * p) +{ +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(), target_cpu; + struct task_struct *target_tsk; + struct list_head *cptr; + struct schedule_data *sch; + int best_cpu; + + /* + * shortcut if the woken up task's last CPU is + * idle now. + */ + best_cpu = p->processor; + target_tsk = idle_task(best_cpu); + if (cpu_curr(best_cpu) == target_tsk) + goto preempt_now; + /* + * For real time, the choice is simple. We just check + * if the most available processor is working on a lower + * priority task. If so we bounce it, if not, there is + * nothing more important than what we are doing. + * Note that this will pick up any idle cpu(s) we may + * have as they will have effprio of -1. + */ + cptr = hed_cpu_prio.prev; + sch = list_entry(cptr, + struct schedule_data, + schedule_data_list); + target_tsk = sch->curr; + if (p->effprio > sch->effprio){ + goto preempt_now; + } + /* + * If all cpus are doing real time and we failed + * above, then there is no help for this task. + */ + if ( sch->effprio ) + goto out_no_target; + /* + * Non-real time contender and one or more processors + * doing non-real time things. + + * So we have a non-real time task contending among + * other non-real time tasks on one or more processors + * We know we have no idle cpus. + */ + /* + * No CPU is idle, but maybe this process has enough priority + * to preempt it's preferred CPU. 
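+ *
+ * For illustration: only a non-real-time current (effprio == 0) is
+ * considered here, and any positive goodness difference is enough on
+ * the woken task's preferred CPU, so a task scoring, say, 26 displaces
+ * a current task scoring 25 or less.  The other CPUs scanned below are
+ * only preempted when the difference exceeds PREEMPTION_THRESHOLD.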
+ */ + target_tsk = cpu_curr(best_cpu); + if (target_tsk->effprio == 0 && + preemption_goodness(target_tsk, p, best_cpu) > 0) + goto preempt_now; + + for (; cptr != &hed_cpu_prio; cptr = cptr->prev ){ + sch =list_entry(cptr, + struct schedule_data, + schedule_data_list); + if (sch->effprio != 0) + break; + if (sch->cpu != best_cpu){ + target_tsk = sch->curr; + if ( preemption_goodness(target_tsk, p, sch->cpu) > + PREEMPTION_THRESHOLD) + goto preempt_now; + } + + } + +out_no_target: + return; + +preempt_now: + target_cpu = target_tsk->processor; + target_tsk->need_resched = 1; + /* + * the APIC stuff can go outside of the lock because + * it uses no task information, only CPU#. + */ + if ((target_cpu != this_cpu) + && (target_tsk != idle_task(target_cpu))) + smp_send_reschedule(target_cpu); + return; +#else /* UP */ + struct task_struct *tsk; + + tsk = cpu_curr(0); + if ((high_prio > tsk->effprio) || + (!tsk->effprio && preemption_goodness(tsk, p, 0) > + PREEMPTION_THRESHOLD)){ + tsk->need_resched = 1; + } +#endif +} + +/* + * This routine maintains the list of smp processors. This is + * a by directional list maintained in priority order. The above + * code used this list to find a processor to use for a new task. + * The search will be backward thru the list as we want to take + * the lowest prioity cpu first. We put equal prioities such that + * the new one will be ahead of the old, so the new should stay + * around a bit longer. + */ + +#ifdef CONFIG_SMP +static inline void re_queue_cpu(struct task_struct *next, + struct schedule_data *sch) +{ + struct list_head *cpuptr; + list_del(&sch->schedule_data_list); + sch->effprio = next->effprio; + cpuptr = hed_cpu_prio.next; + while (cpuptr != &hed_cpu_prio && + sch->effprio < list_entry(cpuptr, + struct schedule_data, + schedule_data_list)->effprio + ) + cpuptr = cpuptr->next; + list_add_tail(&sch->schedule_data_list,cpuptr); + next->newprio = &newprio_executing; +} +#else +#define re_queue_cpu(a,b) +#endif +/* + * Careful! + * + * This has to add the process to the _beginning_ of the + * run-queue, not the end. See the comment about "This is + * subtle" in the scheduler proper.. + * + * For real time tasks we do this a bit differently. We + * keep a priority list of ready tasks. We remove tasks + * from this list when they are running so a running real + * time task will not be in either the ready list or the run + * queue. Also, in the name of speed and real time, only + * priority is important so we spend a few bytes on the queue. + * We have a doubly linked list for each priority. This makes + * Insert and removal very fast. We also keep a bit map of + * the priority queues where a bit says if the queue is empty + * or not. We also keep loose track of the highest priority + * queue that is currently occupied. This high_prio mark + * is updated when a higher priority task enters the ready + * queue and only goes down when we look for a task in the + * ready queue at high_prio and find none. Then, and only + * then, we examine the bit map to find the true high_prio. 
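+ *
+ * For illustration, with MAX_PRI == 127 a priority p occupies bit
+ * (31 - (p & 0x1f)) of rq_bit_map[p >> 5], so priority 99 sits in bit
+ * 28 of word 3.  Storing the bits flipped this way means the lowest
+ * set bit of the highest non-zero word, which is what ffz(~word)
+ * returns, is the highest occupied priority, so the scan in
+ * get_next_task() recovers high_prio in a single pass over at most
+ * four words plus the guard.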
+ */ + +#define BF 31 /* bit flip constant */ +#define set_rq_bit(bit) set_bit(BF-((bit)&0x1f),&rq_bit_map[(bit) >> 5]) +#define clear_rq_bit(bit) clear_bit(BF-((bit)&0x1f),&rq_bit_map[(bit) >> 5]) + +static inline void _del_from_runqueue(struct task_struct * p) +{ + nr_running--; + list_del( &p->run_list ); + if (list_empty(Rdy_Q_Hed(p->effprio))){ + clear_rq_bit(p->effprio); + } + /* p->run_list.next = NULL; !=0 prevents requeue */ + p->run_list.next = NULL; + p->newprio = NULL; + if( !p->effprio) runq.ticks -= p->counter; +} +/* Exported for main.c, also used in init code here */ +void __del_from_runqueue(struct task_struct * p) +{ + _del_from_runqueue(p); +} +static inline struct task_struct * get_next_task(struct task_struct * prev, + int this_cpu) +{ + struct list_head *next, *rqptr; + struct task_struct *it=0; + int *i,c,oldcounter; + + repeat_schedule: + rqptr = Rdy_Q_Hed(high_prio); + next = rqptr->next; + if (unlikely( next == rqptr)){ + for (i=&rq_bit_map[MAX_PRI/32],high_prio=BF+((MAX_PRI/32)*32); + (*i == 0);high_prio -=32,i--); + high_prio -= ffz(~*i); + if (unlikely(high_prio < 0)){ + /* + * No tasks to run, return this cpu's idle task + * It is not in the ready queue, so no need to remove it. + * But first make sure its priority keeps it out of + * the way. + */ + high_prio = 0; + it = idle_task(this_cpu); + it->effprio = -1; + return it; + } + goto repeat_schedule; + } + /* + * If there is only one task on the list, it is a no brainer. + * But really, this also prevents us from looping on recalulation + * if the one and only task is trying to yield. These sort of + * loops are NOT_FUN. Note: we use likely() to tilt toward + * real-time tasks, even thou they are, usually unlikely. We + * are, after all, a real time scheduler. + */ + if ( likely(high_prio || next->next == rqptr)){ + it = list_entry(next, struct task_struct, run_list); + back_from_figure_non_rt_next: + _del_from_runqueue(it); + return it; + } + /* + * Here we set up a SCHED_OTHER yield. Note that for other policies + * yield is handled else where. This means we can use == and = + * instead of & and &= to test and clear the flag. If the prev + * task has all the runq.ticks, then we just do the recaculation + * version and let the winner take all (yield fails). Otherwise + * we fource the counter to zero for the loop and put it back + * after we found some other task. We must remember to update + * runq.ticks during all this. Also, we don't give it all back + * if the yielder has more than the next guy. + */ + oldcounter = 0; + if ( unlikely(prev->policy == (SCHED_YIELD | SCHED_OTHER)) ){ + if ( unlikely(prev->counter == runq.ticks)) { + prev->policy = SCHED_OTHER; + runq.ticks = 0; + }else{ + oldcounter = prev->counter; + prev->counter = 0; + } + } + c = -1000; + if (likely(runq.ticks > 0)) { + do { + int weight; + struct task_struct *p = + list_entry(next, struct task_struct, run_list); + /* if (can_schedule(p, this_cpu))*/ { + weight = goodness(p, this_cpu, prev->active_mm); + if (weight > c) + c = weight, it = p; + } + next = next->next; + } while (next != rqptr); + /* + * if we get out of sync with the runq.ticks counter + * force it to 0 and catch it next time around. Note we + * catch a negative counter on entry. + */ + if ( unlikely(c <= 0 )){ + runq.ticks = 0; + } + }else{ +#ifdef CONFIG_SMP + /* + * Here we update the tasks that are current on other + * processors + */ + struct list_head *wkptr, + *cptr=&aligned_data[(this_cpu)]. + schedule_data. 
+ schedule_data_list; + + runq.ticks = 0; + list_for_each ( wkptr, &hed_cpu_prio) { + struct task_struct *p; + if (cptr == wkptr ) continue; + p = list_entry(wkptr, + struct schedule_data, + schedule_data_list)->curr; + if ( p->effprio == 0){ + p->counter = (p->counter >> 1) + + NICE_TO_TICKS(p->nice); + p->counter_recalc++; + } + } +#else + runq.ticks = 0; +#endif + runq.recalc++; + do { + int weight; + struct task_struct *p = + list_entry(next, struct task_struct, run_list); + runq.ticks += + p->counter = NICE_TO_TICKS(p->nice); + p->counter_recalc++; + /* if (can_schedule(p, this_cpu)) */ + { + weight = goodness(p, this_cpu, prev->active_mm); + if (weight > c) + c = weight, it = p; + } + next = next->next; + } while (next != rqptr); + } + /* Undo the stuff we did for SCHED_YIELD. We know we did something + * if oldcounter != 0. + */ + if (unlikely(oldcounter)){ + + prev->counter = (it->counter < oldcounter) ? + it->counter : + oldcounter; + runq.ticks += prev->counter-oldcounter; + prev->policy &= ~SCHED_YIELD; + } + goto back_from_figure_non_rt_next; + +} +/* Add to the head of the run queue */ +static inline void add_to_runqueue(struct task_struct * p,int cpu) +{ + struct list_head *next; + int prio; + /* idle tasks, don't get put in the list */ + if (unlikely(p == idle_task(cpu))) return; + prio = p->effprio; + next = Rdy_Q_Hed(prio); + if (list_empty(next)) { /* an empty queue */ + set_rq_bit(prio); + if (high_prio < prio) { + high_prio = prio; + } + } + list_add(&p->run_list,next); + p->newprio = newprio_ready_q; + if ( likely(!p->effprio )) { + int diff,c; + if ((diff = runq.recalc - p->counter_recalc) != 0) { + p->counter_recalc = runq.recalc; + c = NICE_TO_TICKS(p->nice) << 1; + p->counter = diff > 8 ? c - 1 : /* max priority */ + c + ((p->counter - c) >> diff); + } + runq.ticks += p->counter; + } + nr_running++; +} + +/* + * This function is only called from schedule() so it need not worry + * about updating the counter as it should never be out of date. + * If you change this, remember to do the update. + */ +static inline void add_last_runqueue(struct task_struct * p) +{ + struct list_head *next = Rdy_Q_Hed(p->effprio); + + if (list_empty(next)) { /* empty list, set the bit */ + set_rq_bit(p->effprio); + if (p->effprio > high_prio){ + high_prio = p->effprio; + } + } + list_add_tail(&p->run_list,next); + p->newprio = newprio_ready_q; + if ( !p->effprio ) runq.ticks += p->counter; + nr_running++; +} + + +static inline void move_first_runqueue(struct task_struct * p) +{ + list_del(&p->run_list); + list_add_tail(&p->run_list, Rdy_Q_Hed(p->effprio)); +} +/* + * When we have a task in some queue by priority, we need + * to provide a way to change that priority. Depending on the + * queue we must do different things. We handle this by putting + * a function address in the task_struct (newprio()). + * + * First a front end routine to take care of the case were the task + * is not in any priority queues. We take the runqueue_lock + * here, so the caller must not. Since we may be called + * recursively, protect against a dead lock. + */ +static struct task_struct *newprio_inuse; +static int newprio_inuse_count; + +void set_newprio(struct task_struct * tptr, int newprio) +{ + if ( newprio_inuse != current){ + spin_lock_irq(&runqueue_lock); + newprio_inuse = current; + } + newprio_inuse_count++; + if (! tptr->newprio ) { + tptr->effprio = newprio; + }else if ( tptr->effprio != newprio) { + tptr->newprio(tptr,newprio); + } + if ( ! 
--newprio_inuse_count ){ + spin_unlock_irq(&runqueue_lock); + newprio_inuse = 0; + } +} + + +/* + * Here are the routines we use for the ready queue and an executing + * process. Note that the executing process may fall out of favor + * as a result of the change. We do the right thing. Note that newprio + * is not cleared so we test here to see if the task is still running. + */ + +static void newprio_ready_q(struct task_struct * tptr,int newprio) +{ + _del_from_runqueue(tptr); + tptr->effprio = newprio; + add_to_runqueue(tptr,0); + reschedule_idle(tptr); +} +#ifdef CONFIG_SMP +static void newprio_executing(struct task_struct *tptr,int newprio) +{ + int cpu; + struct schedule_data *sched_data; + if(!newprio || newprio < tptr->effprio){ + tptr->need_resched = 1; + } + cpu = tptr->processor; + sched_data = & aligned_data[cpu].schedule_data; + tptr->effprio = newprio; + if( sched_data->curr != tptr) return; /* if not expected, out of here */ + re_queue_cpu(tptr,sched_data); + if ((cpu != smp_processor_id()) && tptr->need_resched) + smp_send_reschedule(cpu); +} +#endif + + + +/* + * Wake up a process. Put it on the ready-queue if it's not + * already there. The "current" process is not on the + * ready-queue (it makes it much easier to figure out if we + * need to preempt, esp. the real time case). It is possible + * to wake the current process. This happens when it is waken + * before schedule has had a chance to put it properly to + * sleep. If schedule did not turn on ints in the middle of + * things this would all be ok, however, it does so we have the + * possibility of being in that window. + * The "current" process is never on the + * run-queue (except when the actual re-schedule is in + * progress), and as such you're allowed to do the simpler + * "current->state = TASK_RUNNING" to mark yourself runnable + * without the overhead of this. + */ +static inline int try_to_wake_up(struct task_struct * p, int synchronous) +{ + unsigned long flags; + int success = 0; + + /* + * We want the common case fall through straight, thus the goto. + */ + spin_lock_irqsave(&runqueue_lock, flags); + p->state = TASK_RUNNING; + if ( task_on_runqueue(p) ) + goto out; + add_to_runqueue(p,0); + if (!synchronous /*|| !(p->cpus_allowed & (1 << smp_processor_id())*/) + reschedule_idle(p); + success = 1; +out: + spin_unlock_irqrestore(&runqueue_lock, flags); + return success; +} + +inline int wake_up_process(struct task_struct * p) +{ + return try_to_wake_up(p, 0); +} +/* + * schedule_tail() is getting called from the fork return path. This + * cleans up all remaining scheduler things, without impacting the + * common case. + */ +static inline void __schedule_tail(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + + /* + * fast path falls through. We have to clear cpus_runnable before + * checking prev->state to avoid a wakeup race. Protect against + * the task exiting early. + */ + task_lock(prev); + task_release_cpu(prev); + mb(); + if (task_on_rq(prev)) + goto needs_resched; + +out_unlock: + task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ + return; + + /* + * Slow path - we 'push' the previous process and + * reschedule_idle() will attempt to find a new + * processor for it. (but it might preempt the + * current process as well.) We must take the runqueue + * lock and re-check prev->state to be correct. It might + * still happen that this process has a preemption + * 'in progress' already - but this is not a problem and + * might happen in other circumstances as well. 
+ */ +needs_resched: + { + unsigned long flags; + + /* + * Avoid taking the runqueue lock in cases where + * no preemption-check is necessery: + * Note: Idle task is NEVER on the ready queue so + * no need to check if prev was idle. + */ + + spin_lock_irqsave(&runqueue_lock, flags); + if (task_on_rq(prev) /* && !task_has_cpu(prev)*/ ) + reschedule_idle(prev); + spin_unlock_irqrestore(&runqueue_lock, flags); + goto out_unlock; + } +#define smp_label_a _smp_label_a: +#define smp_label_b _smp_label_b: +#else + prev->policy &= ~SCHED_YIELD; +#define smp_label_a +#define smp_label_b +#endif /* CONFIG_SMP */ +} + +asmlinkage void schedule_tail(struct task_struct *prev) +{ + __schedule_tail(prev); + preempt_enable(); +} + +/* + * 'schedule()' is the scheduler function. It's a very simple and nice + * scheduler: it's not perfect, but certainly works for most things. + * + * The goto is "interesting". + * + * NOTE!! Task 0 is the 'idle' task, which gets called when no other + * tasks can run. It can not be killed, and it cannot sleep. The 'state' + * information in task[0] is never used. + */ +asmlinkage void schedule(void) +{ + struct schedule_data * sched_data; + struct task_struct *prev, *next; + int this_cpu; + + spin_lock_prefetch(&runqueue_lock); + try_try_again: + + preempt_disable(); + + if (unlikely(!current->active_mm)) BUG(); + prev = current; + this_cpu = prev->processor; + + if (unlikely(in_interrupt())) { + printk("Scheduling in interrupt\n"); + BUG(); + } + + release_kernel_lock(prev, this_cpu); + + /* + * 'sched_data' is protected by the fact that we can run + * only one process per CPU. + */ + sched_data = & aligned_data[this_cpu].schedule_data; + + spin_lock_irq(&runqueue_lock); + +#ifdef CONFIG_PREEMPT + /* + * Note that this is an '&' NOT an '&&'... + */ + if (preempt_is_disabled() & PREEMPT_ACTIVE) goto sw_TASK_RUNNING; +#endif + if (prev->state == TASK_INTERRUPTIBLE) { + //case TASK_INTERRUPTIBLE: + if (likely( ! signal_pending(prev))) { + goto sw_default; + } + prev->state = TASK_RUNNING; + } + + if (prev->state != TASK_RUNNING) { + goto sw_default; + } + //case TASK_RUNNING: +#ifdef CONFIG_PREEMPT + sw_TASK_RUNNING: +#endif + /* + * move an exhausted RR process to be last.. + * Do the same for Yields + */ + if (!prev->counter && (prev->policy & SCHED_RR)) + goto move_rr_last; + if (prev->policy & SCHED_YIELD) + goto move_yield_last; + /* + * There is a case where current is already + * in the ready que. That is where it was + * on the way out, but the wait already + * expired, so wake_up_process has already + * done it. In this case, we don't!! + */ + if (!task_on_rq(prev)) + add_to_runqueue(prev,this_cpu); + goto move_rr_back; + //default: + sw_default: + prev->sleep_time = jiffies; + prev->run_list.next = 0; + + move_rr_back: + prev->need_resched = 0; + smp_label_a + next = get_next_task(prev, this_cpu); + smp_label_b + next->run_list.next = (struct list_head *)1; + sched_data->curr = next; + re_queue_cpu(next,sched_data); + spin_unlock_irq(&runqueue_lock); + + if (unlikely(prev == next)) { + goto same_process; + } + +#ifdef CONFIG_SMP + /* + * maintain the per-process 'last schedule' value. + * (this has to be recalculated even if we reschedule to + * the same process) Currently this is only used on SMP, + * and it's approximate, so we do not have to maintain + * it while holding the runqueue spinlock. 
+ */ + sched_data->last_schedule = get_cycles(); + + /* + * We drop the scheduler lock early (it's a global spinlock), + * thus we have to lock the previous process from getting + * rescheduled during switch_to() (since we are still on his stack). + * + * Here is how we do it. The cpus_runnable flag will be held until + * the task is truly available. On the other hand, this task + * is put in the ready queue during the above runqueue_lock so + * it may be picked up by another cpu. Suppose that cpu is this + * one. Now the prior cpu left the task in the ready queue and + * we have just pluck it from there. No conflict so far, but if + * cpus_runnable is not clear, the other cpu is still in the switch code. + * There are no locks there SAVE THIS ONE!!! Oh woe is me! + * At the same time, under these conditions, i.e. a task is + * coming out of the ready queue before we actually switch, it + * would be good to not switch cpus. So lets define a "wanted" + * bit in the cpus_runnable member. Oops, it is now a cpu bit mask + * so, since only a few folks look at it, we will fudge it a bit. + * Choose an addition that is more than on bit away from a single bit + * + + * We will spin here waiting for cpus_runnable to go to zero. Until + * this happens, we must not change the processor value as + * interrupt code depends on this being right for "current". + */ +#define WANTED 10 +#define TAKEN 20 + { + unsigned long cur_cpus_runnable = next->cpus_runnable; + + atomic_add(WANTED,(atomic_t *)&next->cpus_runnable); + /* + * It is either "WANTED+cur_cpus_runnable" which means we + * need to wait or is: + * A. The old cpu_id + WANTED or + * B. WANTED - 1 which means it cleared (or was clear). + * C. TAKEN + cur_cpus_runnable + */ + while ((cur_cpus_runnable != ~0UL) && + (volatile int)next->cpus_runnable == + WANTED + cur_cpus_runnable) { + unsigned long my_cpu = 1 << this_cpu; + + barrier(); + /* + * OK, so while we wait, lets look in on prev and see + * if he is wanted. + */ + if ( (volatile int)prev->cpus_runnable != my_cpu) { + /* + * Another cpu wants the task we have yet to + * switch away from. Lets steal it back. + * Once WANTED is set on prev, we can clear it + * either here or in schedule_tail. The other + * cpu can clear it by coming here where it will + * be known by him as next... + + * Here, we set it to (TAKEN+my_cpu), in + * schedule_tail it is set to my_cpu + */ + spin_lock_irq(&runqueue_lock); + if ( (volatile int)prev->cpus_runnable != my_cpu) { + spin_unlock_irq(&runqueue_lock); + continue; + } + /* + * Three possibilities on the state of next: + * 0.) cpus_runnable has gone to ~0UL. Means the + * prior cpu has finished and is not + * interested. So put back in ready queue. + * 5.) Other cpu noticed our interest and stoled + * it back (cpus_runnable will be + * TAKEN + his flag). Do nothing. + * 3.) No change, put back in the ready queue + * Note, case 3 presents a bit of a race on our + * clearing the WANTED bit. So, we subtract and + * if the result is negative, set it to zero. + */ + if ( (volatile int)next->cpus_runnable != + cur_cpus_runnable + TAKEN) { + atomic_add(-WANTED, + (atomic_t *)&next->cpus_runnable); + if ((volatile int)next->cpus_runnable < 0) { + next->cpus_runnable = ~0UL; + } + add_to_runqueue(next,this_cpu); + } + /* + * So much for "next". Now lets take prev. + * Setting cpus_runnable to TAKEN+old will pop the + * waiter out of the wait loop. + * We then wait for him to clear TAKEN to + * complete the handshake. 
We hand shake here + * to keep the other cpu from seeing some later + * state that may be wrong. + */ + prev->cpus_runnable = TAKEN + my_cpu; + next = prev; + spin_unlock_irq(&runqueue_lock); + while ((volatile int)prev->cpus_runnable == + TAKEN + my_cpu) { + barrier(); + } + spin_lock_irq(&runqueue_lock); + goto _smp_label_b; + } + } + /* + * if we poped out of the while because cpus_runnable has TAKEN + * set it means the prior owner stoled back the task. Time to + * rescan the ready queue (after clearing the TAKEN bit to + * complete the handshake). The other possibilities are: + * cpus_runnable = WANTED -1 ( was clear when we started) + * cpus_runnable = -1 (was his, but the other cpu finished, + * seting -1) + */ + if ((volatile int)next->cpus_runnable == + TAKEN + cur_cpus_runnable){ + atomic_add(-TAKEN,(atomic_t *)&next->cpus_runnable); + spin_lock_irq(&runqueue_lock); + goto _smp_label_a; + } + } + /* + * Gosh wasn't that fun! + */ + task_set_cpu(next,this_cpu); +#endif /* CONFIG_SMP */ + + /* + * An interesting problem here. Since we turned on interrupts, + * we could now have a need schedule flag set in prev. Actually + * this can only happen on interrupt and then only be meaningful + * if it is done by a wakeup() call to reschedule_idle(). This + * is covered as that code will set the need_resched flag in the + * task found by cpu_curr() which comes from the cpu structs + * which we have already updated. + + * The remaining problems come from left over timeouts against + * prev, but he was the target and he is gone now... unless + * we did not really switch. So in the switch path we will + * clear the need_resched flag, not in the no switch path. + */ + + kstat.context_swtch++; + /* + * there are 3 processes which are affected by a context switch: + * + * prev == .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). + * This might sound slightly confusing but makes tons of sense. + */ + prepare_to_switch(); + { + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + if (!mm) { + if (next->active_mm) BUG(); + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, this_cpu); + } else { + if (next->active_mm != mm) BUG(); + switch_mm(oldmm, mm, next, this_cpu); + } + + if (!prev->mm) { + prev->active_mm = NULL; + mmdrop(oldmm); + } + } + + /* + * This just switches the register state and the + * stack. + */ + switch_to(prev, next, prev); + __schedule_tail(prev); + prev->need_resched = 0; + +same_process: + reacquire_kernel_lock(current); + preempt_enable_no_resched(); + if ( ! current->need_resched) + return; + + /* The task managed to get its need_resched flag set already! + */ + goto try_try_again; + + + move_rr_last: + prev->counter = NICE_TO_TICKS(prev->nice); + + move_yield_last: + if (prev->effprio) /* non-real time tasks get cleared later */ + prev->policy &= ~SCHED_YIELD; + add_last_runqueue(prev); + goto move_rr_back; + +} +static inline struct task_struct *find_process_by_pid(pid_t pid); + +static int setscheduler(pid_t pid, int policy, + struct sched_param *param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + retval = -EINVAL; + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. 
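+ * "Safe" here means respecting the lock order declared in
+ * sched.c: tasklist_lock is the outer lock and runqueue_lock the
+ * inner one, so tasklist_lock is taken first below and
+ * runqueue_lock is dropped first on every exit path.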
+ */ + read_lock_irq(&tasklist_lock); + spin_lock(&runqueue_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; + + if (policy < 0) + policy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_OTHER) + goto out_unlock; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are 1..MAX_PRI, valid + * priority for SCHED_OTHER is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_PRI) + goto out_unlock; + if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = 0; + p->policy = policy; + if ( policy == SCHED_FIFO) { + p->counter = -100; /* we don't count down neg couters */ + }else{ + p->counter = NICE_TO_TICKS(p->nice); + } + + p->rt_priority = lp.sched_priority; + + spin_unlock_irq(&runqueue_lock); + set_newprio(p,lp.sched_priority); + goto out_readunlock; + +out_unlock: + spin_unlock_irq(&runqueue_lock); + out_readunlock: + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} +asmlinkage long sys_sched_yield(void) +{ + /* + * Trick. sched_yield() first checks to see if it will be REALLY + * lonly in the ready queue. It just returns if it is the only + * game in town. The multilple ready queues really help here. + * (This test does not have + * to be atomic.) In threaded applications this optimization + * gets triggered quite often. + */ + if ( ! list_empty(Rdy_Q_Hed(current->effprio))){ + /* + * I think this is safe as only the current task can + * here and only the current task will be clearing this bit + */ + current->policy |= SCHED_YIELD; + schedule(); + } + return 0; +} +/* Seems to be the first place we hear about a given cpu as it comes up. + * A new (including the first) cpu is reporting for duty. Since he is + * already running we must patch him into the processor queue. + * We get here the first time the processor enters the idle code and also + * one more time for the boot cpu so... be careful to not redo what is + * already done. Also note that the fork that created the task put it + * in the ready queue, so we need to take it out, except the initial cpus + * task was not created by a fork. No matter, the removal code works even + * then. + * We give the idle task prioity -1 to keep it out of the way of tasks + * that have real work to do. + */ +extern unsigned long wait_init_idle; + +void __init init_idle(void) +{ + struct schedule_data * sched_data; + int cpu=smp_processor_id(); + sched_data = &aligned_data[cpu].schedule_data; + + if (task_on_rq(current)) { + del_from_runqueue(current); + } + sched_data->curr = current; + sched_data->last_schedule = get_cycles(); + current->effprio = current->rt_priority = 0; + sched_data->effprio = -1; /* idle flag */ + sched_data->cpu = cpu; + clear_bit(current->processor, &wait_init_idle); +#ifdef CONFIG_SMP + if ( ! sched_data->schedule_data_list.next ) { + list_add_tail(&sched_data->schedule_data_list,&hed_cpu_prio); + } +#endif +} + +extern void init_timervecs (void); + +void __init sched_init(void) +{ + /* + * We have to do a little magic to get the first + * process right in SMP mode. 
+ */ + int cpu = smp_processor_id(); + int nr; + int i; + + init_task.processor = cpu; + /* Init the ready queue */ + for (i=0;i<=MAX_PRI ;i++){ + INIT_LIST_HEAD(Rdy_Q_Hed(i)); + } + + + for(nr = 0; nr < PIDHASH_SZ; nr++) + pidhash[nr] = NULL; + printk("rtsched version " VERSION_DATE "\n"); + + init_timervecs(); + + init_bh(TIMER_BH, timer_bh); + init_bh(TQUEUE_BH, tqueue_bh); + init_bh(IMMEDIATE_BH, immediate_bh); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current, cpu); +} diff -urN linux-2.4.17-rc1-virgin/kernel/sched.c linux-2.4.17-rc1-wli3/kernel/sched.c --- linux-2.4.17-rc1-virgin/kernel/sched.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/sched.c Fri Dec 14 04:38:23 2001 @@ -92,6 +92,10 @@ spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ +#ifdef CONFIG_RTSCHED +extern struct task_struct *child_reaper; +#include "rtsched.h" +#else static LIST_HEAD(runqueue_head); /* @@ -373,6 +377,7 @@ { return try_to_wake_up(p, 0); } +#endif /* ifdef CONFIG_RTSCHED */ static void process_timeout(unsigned long __data) { @@ -458,7 +463,7 @@ out: return timeout < 0 ? 0 : timeout; } - +#ifndef CONFIG_RTSCHED /* * schedule_tail() is getting called from the fork return path. This * cleans up all remaining scheduler things, without impacting the @@ -491,7 +496,7 @@ task_lock(prev); task_release_cpu(prev); mb(); - if (prev->state == TASK_RUNNING) + if (task_on_runqueue(prev)) goto needs_resched; out_unlock: @@ -521,7 +526,7 @@ goto out_unlock; spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) + if (task_on_runqueue(prev) && !task_has_cpu(prev)) reschedule_idle(prev); spin_unlock_irqrestore(&runqueue_lock, flags); goto out_unlock; @@ -534,6 +539,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) { __schedule_tail(prev); + preempt_enable(); } /* @@ -556,6 +562,8 @@ spin_lock_prefetch(&runqueue_lock); + preempt_disable(); + if (!current->active_mm) BUG(); need_resched_back: prev = current; @@ -583,6 +591,9 @@ move_last_runqueue(prev); } +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() & PREEMPT_ACTIVE) goto treat_like_run; +#endif switch (prev->state) { case TASK_INTERRUPTIBLE: if (signal_pending(prev)) { @@ -593,6 +604,9 @@ del_from_runqueue(prev); case TASK_RUNNING:; } +#ifdef CONFIG_PREEMPT + treat_like_run: +#endif prev->need_resched = 0; /* @@ -701,8 +715,10 @@ reacquire_kernel_lock(current); if (current->need_resched) goto need_resched_back; + preempt_enable_no_resched(); return; } +#endif /* ifndef CONFIG_RTSCHED */ /* * The core wakeup function. 
Non-exclusive wakeups (nr_exclusive == 0) just wake everything @@ -897,7 +913,7 @@ tsk = find_task_by_pid(pid); return tsk; } - +#ifndef CONFIG_RTSCHED static int setscheduler(pid_t pid, int policy, struct sched_param *param) { @@ -967,6 +983,7 @@ out_nounlock: return retval; } +#endif /* ifndef CONFIG_RTSCHED */ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) @@ -979,6 +996,34 @@ return setscheduler(pid, -1, param); } +#ifdef CONFIG_PREEMPT + +#ifdef CONFIG_SMP +#define lock_to_this_cpu() \ + unsigned long old_cpus_allowed = current->cpus_allowed; \ + current->cpus_allowed = 1UL << smp_processor_id() +#define restore_cpus_allowed() current->cpus_allowed = old_cpus_allowed +#else +#define lock_to_this_cpu() +#define restore_cpus_allowed() +#endif /* !CONFIG_SMP */ + +asmlinkage void preempt_schedule(void) +{ + while (current->need_resched) { + /* it would be ideal not to lock tasks to their cpu here, + * but only around the data that needs such locking */ + lock_to_this_cpu(); + current->preempt_count += PREEMPT_ACTIVE + 1; + barrier(); + schedule(); + current->preempt_count -= PREEMPT_ACTIVE + 1; + barrier(); + restore_cpus_allowed(); + } +} +#endif /* CONFIG_PREEMPT */ + asmlinkage long sys_sched_getscheduler(pid_t pid) { struct task_struct *p; @@ -1030,6 +1075,7 @@ return retval; } +#ifndef CONFIG_RTSCHED asmlinkage long sys_sched_yield(void) { /* @@ -1070,7 +1116,7 @@ } return 0; } - +#endif /* ifndef CONFIG_RTSCHED */ asmlinkage long sys_sched_get_priority_max(int policy) { int ret = -EINVAL; @@ -1078,7 +1124,7 @@ switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = 99; + ret = MAX_PRI; break; case SCHED_OTHER: ret = 0; @@ -1297,6 +1343,7 @@ atomic_inc(¤t->files->count); } +#ifndef CONFIG_RTSCHED extern unsigned long wait_init_idle; void __init init_idle(void) @@ -1342,3 +1389,4 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } +#endif /* ifndef CONFIG_RTSCHED */ diff -urN linux-2.4.17-rc1-virgin/kernel/sysctl.c linux-2.4.17-rc1-wli3/kernel/sysctl.c --- linux-2.4.17-rc1-virgin/kernel/sysctl.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/kernel/sysctl.c Fri Dec 14 02:44:20 2001 @@ -260,6 +260,8 @@ }; static ctl_table vm_table[] = { + {VM_FREEPG, "freepages", + &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, @@ -271,6 +273,8 @@ &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAX_MAP_COUNT, "max_map_count", + &max_map_count, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MIN_READAHEAD, "min-readahead", &vm_min_readahead,sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MAX_READAHEAD, "max-readahead", diff -urN linux-2.4.17-rc1-virgin/kernel/timer.c linux-2.4.17-rc1-wli3/kernel/timer.c --- linux-2.4.17-rc1-virgin/kernel/timer.c Mon Oct 8 10:41:41 2001 +++ linux-2.4.17-rc1-wli3/kernel/timer.c Fri Dec 14 04:38:23 2001 @@ -583,7 +583,15 @@ update_one_process(p, user_tick, system, cpu); if (p->pid) { +#ifdef CONFIG_RTSCHED + /* SCHED_FIFO and the idle(s) have counters set to -100, + * so we won't count them, seems like a good idea for + * both schedulers, but, being pure... 
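+ * With the counter at -100 the new test short-circuits on
+ * p->counter >= 0, so a SCHED_FIFO task (setscheduler() sets its
+ * counter to -100) or an idle task is never decremented here and
+ * never has need_resched set by this path.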
+ */ + if (p->counter >= 0 && --p->counter <= 0) { +#else if (--p->counter <= 0) { +#endif p->counter = 0; p->need_resched = 1; } diff -urN linux-2.4.17-rc1-virgin/kernel/user.c linux-2.4.17-rc1-wli3/kernel/user.c --- linux-2.4.17-rc1-virgin/kernel/user.c Tue Nov 28 22:43:39 2000 +++ linux-2.4.17-rc1-wli3/kernel/user.c Sun Dec 16 23:52:26 2001 @@ -19,7 +19,14 @@ #define UIDHASH_BITS 8 #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) ^ uid) & UIDHASH_MASK) + +/* + * hash function borrowed from Chuck Lever's paper + * The effects of this replacement have not been measured. + * -- wli + */ +#define __uidhashfn(uid) \ + (((2654435761UL*(uid)) >> (BITS_PER_LONG-UIDHASH_BITS)) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn(uid)) static kmem_cache_t *uid_cachep; diff -urN linux-2.4.17-rc1-virgin/lib/dec_and_lock.c linux-2.4.17-rc1-wli3/lib/dec_and_lock.c --- linux-2.4.17-rc1-virgin/lib/dec_and_lock.c Wed Oct 3 09:11:26 2001 +++ linux-2.4.17-rc1-wli3/lib/dec_and_lock.c Fri Dec 14 02:44:44 2001 @@ -1,5 +1,6 @@ #include #include +#include #include /* diff -urN linux-2.4.17-rc1-virgin/mm/Makefile linux-2.4.17-rc1-wli3/mm/Makefile --- linux-2.4.17-rc1-virgin/mm/Makefile Wed Oct 24 15:21:18 2001 +++ linux-2.4.17-rc1-wli3/mm/Makefile Fri Dec 14 02:44:20 2001 @@ -14,7 +14,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -urN linux-2.4.17-rc1-virgin/mm/TODO linux-2.4.17-rc1-wli3/mm/TODO --- linux-2.4.17-rc1-virgin/mm/TODO Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/TODO Fri Dec 14 02:44:20 2001 @@ -0,0 +1,31 @@ + VM TODO list + +Forever valid TODO entries: + - keep up with the official kernel + - port over bugfixes + - minimise the diff by keeping code in sync, where possible + +Easy short-term features: + - reclaim swap space from refill_inactive() + - simplify SMP locking + - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with + one single function using a for_each_pte() macro + for_each_pte(ptep, mm, start_address, end_address) + - stronger drop behind / unused object dropping, all the way + to the far end of the inactive list + - per-zone active/inactive list (wli) + - fix page_launder() to not eat horrible amounts of CPU or flush + all pages to disk at once + - better VM balancing, clean vs. dirty ratio + +Long-term features: + - extensive VM statistics + - IO clustering for page_launder() and sync_old_buffers() + - readahead on per-VMA level (+ drop behind?) 
+ - more graceful degradation when the load gets high + - reducing readahead + - unfair pageout so not all apps fall over + - memory objects, using pagecache and tmpfs for storage so + the memory object itself doesn't introduce any new overhead + - using the memory objects, removing page table copying from fork() + - load control able to deal with really extreme loads, swapping diff -urN linux-2.4.17-rc1-virgin/mm/bootmem.c linux-2.4.17-rc1-wli3/mm/bootmem.c --- linux-2.4.17-rc1-virgin/mm/bootmem.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/bootmem.c Fri Dec 14 03:21:15 2001 @@ -3,8 +3,9 @@ * * Copyright (C) 1999 Ingo Molnar * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Segment tree memory reservation system, William Irwin, IBM, Oct 2001 * - * simple boot-time physical memory area allocator and + * Simple boot-time physical memory area allocator and * free memory collector. It's used to deal with reserved * system memory and memory holes as well. */ @@ -17,40 +18,192 @@ #include #include #include -#include +#include /* - * Access to this subsystem has to be serialized externally. (this is - * true for the boot process anyway) + * Design notes: + * + * This design was arrived at by considering four principal concerns, + * beyond properly representing discontiguous memory machines: + * + * (1) Machines on which the physical address space is highly fragmented. + * (2) Machines where nodes' memory fragments may be interleaved. + * (3) Machines whose physical address space layouts are irregular. + * (4) Machines requiring heavy boot-time memory reservation activity. + * + * These design concerns led to an implementation which represented + * available physical memory explicitly in terms of intervals to save + * space and also one utilizing an efficient search structure. These + * design concerns may not be universally important; however, small + * benefits should be seen even on low-memory machines, or machines + * without significant boot-time memory reservation activity. + * + * Concern (3) is perhaps the principal concern. In this situation, + * there is very little prior knowledge of memory range to node + * mappings, so perhaps a large portion of the work the bootmem + * allocator is intended to do must be done "up front" when bitmaps + * associated with memory ranges are used to represent availability + * information. While it is possible to use bitmaps for that purpose, + * it is my belief that the reduced space overhead of the segment + * trees and the obliviousness of their storage management with + * respect to the address ranges they represent is advantageous. + * + * In order to motivate how (2) is addressed, the notion of + * "residency" is useful. When a memory range is associated with + * a node, only a certain portion of it is actually available. + * the ratio of available memory to the size of the memory range + * being tracked, sizeof(available memory)/sizeof(memory in map), + * is what I call the residency of the range. When the map of the + * available memory requires a contiguous range of memory that is + * a larger proportion of the range of memory being tracked than + * the residency of that range, then the algorithm can no longer + * properly function. + * So to address that, a representation has been chosen which does + * not grow with the size of the range of memory being represented. + * The residency requirements of the bitmap-based representation + * are 1/(8*sizeof(page)) on byte addressed machines. 
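+ * (With 4 KiB pages that is one contiguous byte of bitmap for
+ * every 32 KiB of memory tracked, i.e. 32 KiB of map to describe
+ * a 1 GiB range.)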
But the range + * set representation has no specific residency requirements. + * Segment pools need not be drawn from a contiguous range of memory + * larger than the combined size of a header for tracking all the + * segment pools and the size of a single range structure. Dynamic + * addition of segment pools is not implemented here yet. + */ + +/* + * Access to this subsystem has to be serialized externally. (This is + * true for the boot process anyway.) + */ + +/* + * Alignment has to be a power of 2 value. + * These macros abstract out common address calculations for alignments. + */ +#define RND_DN(x,n) ((x) & ~((n)-1)) +#define RND_UP(x,n) RND_DN((x) + (n) - 1, n) +#define DIV_DN(x,n) ((x) / (n)) +#define DIV_UP(x,n) DIV_DN((x) + ((n) - 1), n) + +/* + * The highest and lowest page frame numbers on the system. + * These refer to physical addresses backed by memory regardless + * of runtime availability. */ unsigned long max_low_pfn; unsigned long min_low_pfn; -/* return the number of _pages_ that will be allocated for the boot bitmap */ -unsigned long __init bootmem_bootmap_pages (unsigned long pages) +/* + * This is a poor choice of random seeds for deterministic + * behavior during debugging. Oddly enough it does not seem + * to damage the structure of the trees. + */ +static unsigned long __initdata random_seed = 1UL; + +/* + * Park-Miller random number generator, using Schrage's + * technique for overflow handling. + */ +static unsigned long __init rand(void) { - unsigned long mapsize; + unsigned long a = 16807; + unsigned long q = 12773; + unsigned long r = 2386; + unsigned long k; + + k = random_seed / q; + random_seed = a*(random_seed - k*q) - r*k; + return random_seed; +} - mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; +/* + * Initialize the segment pool, which occupies node_bootmem_map. + * This is the memory from which the tree nodes tracking available + * memory are allocated. + */ +static void __init segment_pool_init(bootmem_data_t *bdata) +{ + unsigned k; + segment_buf_t *segment_pool = (segment_buf_t *)bdata->node_bootmem_map; - return mapsize; + for(k = 0; k < NR_SEGMENTS - 1; ++k) + segment_pool[k].next = &segment_pool[k+1]; + segment_pool[NR_SEGMENTS-1].next = NULL; + bdata->free_segments = segment_pool; +} + +/* + * Allocates a tree node from a node's segment pool, initializing the + * whole of the memory block to zeroes. + */ +static segment_tree_node_t * __init segment_alloc(bootmem_data_t *bdata) +{ + segment_tree_node_t *tmp = (segment_tree_node_t *)bdata->free_segments; + + if(!bdata->free_segments) + return NULL; + + bdata->free_segments = bdata->free_segments->next; + memset(tmp, 0, sizeof(segment_tree_node_t)); + return tmp; +} + +/* + * Convenience operation to insert a tree node into both + * of the segment trees associated with a node. The randomized + * priorities are used here. + */ +static void __init segment_insert(segment_tree_root_t *root, + segment_tree_node_t *node) +{ + node->start.priority = rand(); + node->length.priority = rand(); + treap_insert(&root->start_tree, &node->start); + treap_insert(&root->length_tree, &node->length); +} + +/* + * Returns a segment tree node to the node-local pool of available + * tree nodes. 
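+ * The pool is a simple LIFO free list threaded through the
+ * NR_SEGMENTS buffers that segment_pool_init() carved out of
+ * node_bootmem_map: segment_free() pushes onto the head and
+ * segment_alloc() pops from it, both in O(1).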
+ */ +static void __init segment_free(bootmem_data_t *bdata, + segment_tree_node_t *node) +{ + segment_buf_t *tmp; + + if(!node) + return; + + tmp = (segment_buf_t *)node; + tmp->next = bdata->free_segments; + bdata->free_segments = tmp; +} + +/* + * Return the number of _pages_ that will be allocated for the bootmem + * segment pool. Its sole purpose is to warn callers of the bootmem + * interface in advance of its size, so that a suitably large range of + * physical memory may be found to hold it. + */ +unsigned long __init bootmem_bootmap_pages (unsigned long pages) +{ + return DIV_UP(NR_SEGMENTS*sizeof(segment_buf_t),PAGE_SIZE); } /* * Called once to set up the allocator itself. + * Its responsibilities are manipulate the bootmem_data_t within + * a node, initializing its address range and node-local segment + * pool fields. It is supposed to calculate the amount of memory + * required for the node_bootmem_map, but this is not possible + * without a change of interface. */ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; - unsigned long mapsize = ((end - start)+7)/8; pgdat->node_next = pgdat_list; pgdat_list = pgdat; - mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; @@ -59,300 +212,701 @@ * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ - memset(bdata->node_bootmem_map, 0xff, mapsize); + bdata->segment_tree.start_tree = NULL; + bdata->segment_tree.length_tree = NULL; + segment_pool_init(bdata); - return mapsize; + return RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE); } /* - * Marks a particular physical memory range as unallocatable. Usable RAM - * might be used for boot-time allocations - or it might get added - * to the free page pool later on. + * reserve_bootmem_core marks a particular segment of physical + * memory as unavailable. Available memory might be used for boot-time + * allocations, or it might be made available again later on. + * + * Its behavior is to mark the specified range of physical memory + * as unavailable, irrespective of alignment constraints (in contrast + * to prior incarnations, which page-aligned the starting and ending + * addresses of the unavailable interval of memory). */ -static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +static void __init reserve_bootmem_core(bootmem_data_t *bdata, + unsigned long addr, unsigned long size) { - unsigned long i; + unsigned long start; + unsigned long end; + segment_tree_node_t split_segment, segment; + segment_tree_node_t reserved_left, reserved_right; + segment_tree_node_t *multiple_left, *multiple_right; + treap_node_t *tmp, *parent, *intersect; + /* - * round up, partially reserved pages are considered - * fully reserved. + * Round up, partially reserved pages are considered fully reserved. 
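+ * (The page rounding now happens later: free_all_bootmem_core()
+ * only hands back whole page frames lying entirely inside an
+ * available segment, so a partially reserved page stays reserved.
+ * Here the reservation itself is tracked byte for byte as the
+ * closed interval [addr, addr + size - 1], e.g. addr 0x1000 with
+ * size 0x2000 reserves [0x1000, 0x2fff].)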
*/ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; - unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + start = addr; + end = start + size - 1; - if (!size) BUG(); + segment_set_endpoints(&segment, start, end); - if (sidx < 0) - BUG(); - if (eidx < 0) - BUG(); - if (sidx >= eidx) - BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) - BUG(); - for (i = sidx; i < eidx; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + segment_all_intersect(&bdata->segment_tree.start_tree, + start, end, &intersect); + + /* + * If the set of intersecting intervals is empty, report + * the entire interval as multiply-reserved. Then the + * condition of the loop ensures a proper exit will follow. + */ + if(!intersect) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved (!intersect)\n", + segment_start(&segment), + segment_end(&segment)); + + /* + * For error-checking, this must be called only for a single + * node per reservation. The next step in strict error checking + * would be to track the fragments of the interval to reserve + * that do not lie within any available interval and then report + * them as multiply-reserved. + * + * Unfortunately, error checking that way appears to require + * unbounded allocations in order to maintain the set of multiply + * reserved intervals, so it is not entirely robust. + * + * For the moment, a cruder form of error checking is done: + * if the available interval does not contain the interval + * to be reserved, then the complement of the reserved + * interval with respect to the available interval is reported + * as multiply reserved. This may multiply report multiply + * reserved ranges, but it is still less verbose than the + * mechanism used in the bitmap-based allocator. + */ + + /* + * Destructive post-order traversal of the set of + * intersecting intervals. 
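+ * The pattern, used again further down, is: walk to the leftmost
+ * leaf, detach it from its parent with
+ * treap_find_parent_and_remove_child(), deal with that segment,
+ * then continue from the parent. Each node in the intersection
+ * set is handled exactly once and the set is consumed as it is
+ * walked; the nodes themselves are either re-inserted into the
+ * per-node trees or handed back to the segment pool below.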
+ */ + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while(tmp) { + segment_tree_node_t *fragment = &split_segment; + segment_tree_node_t *avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + multiple_left = &reserved_left; + multiple_right = &reserved_right; + + if(!segment_contains(avail, &segment)) { + segment_set_endpoints(multiple_left, + segment_start(&segment), + segment_end(&segment)); + segment_complement(&multiple_left, avail, + &multiple_right); + if(multiple_left) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (left)\n", + segment_start(multiple_left), + segment_end(multiple_left)); + if(multiple_right) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (right)\n", + segment_start(multiple_right), + segment_end(multiple_right)); + } + + if(!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); + + segment_complement(&avail, &segment, &fragment); + + if(!avail) + segment_free(bdata, start_segment_treap(tmp)); + else + segment_insert(&bdata->segment_tree, avail); + + if(fragment) { + + avail = segment_alloc(bdata); + + if(!avail) + BUG(); + + segment_set_endpoints(avail, segment_start(fragment), + segment_end(fragment)); + segment_insert(&bdata->segment_tree, avail); + } + + tmp = parent; + treap_find_leftmost_leaf(tmp); + } } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +/* + * free_bootmem_core marks a particular segment of the physical + * address space as available. Its semantics are to make the range + * of addresses available, irrespective of alignment constraints. + */ +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) { - unsigned long i; - unsigned long start; + unsigned long start, end; + segment_tree_node_t segment, *avail, intersection, freed; + treap_node_t *tmp, *parent, *intersect = NULL; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + segment_set_endpoints(&freed, start, end); + + segment_all_intersect(&bdata->segment_tree.start_tree, + start ? start - 1 : start, end + 1, &intersect); + /* - * round down end of usable mem, partially free pages are - * considered reserved. + * Error checking here is simple: + * If the available segment and the segment being freed truly + * intersect, their intersection should be reported as multiply + * made available. */ - unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; - - if (!size) BUG(); - if (end > bdata->node_low_pfn) - BUG(); /* - * Round up the beginning of the address. + * Destructive post-order traversal of the set of intervals + * intersecting with the freed interval expanded by one. This + * provides for merging of available intervals, as all the + * adjacent intervals are united with newly available interval. 
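+ * For example, freeing [0x2000, 0x2fff] while [0x1000, 0x1fff]
+ * and [0x3000, 0x3fff] are already available makes the search
+ * interval [0x1fff, 0x3000] pick up both neighbours, so all
+ * three segments are replaced by the single available segment
+ * [0x1000, 0x3fff].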
*/ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while(tmp) { + + avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + if(segment_intersect(&freed, avail)) { + segment_intersection(&intersection, &freed, avail); + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply made available\n", + segment_start(&intersection), + segment_end(&intersection)); + } - for (i = sidx; i < eidx; i++) { - if (!test_and_clear_bit(i, bdata->node_bootmem_map)) - BUG(); + segment_unite(&segment, avail); + + if(!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); + + segment_free(bdata, avail); + + tmp = parent; + treap_find_leftmost_leaf(tmp); } + + avail = segment_alloc(bdata); + if(!avail) + BUG(); + + segment_set_endpoints(avail, segment_start(&segment), + segment_end(&segment)); + + segment_insert(&bdata->segment_tree, avail); } /* - * We 'merge' subsequent allocations to save space. We might 'lose' - * some fraction of a page if allocations cannot be satisfied due to - * size constraints on boxes where there is physical RAM space - * fragmentation - in these cases * (mostly large memory boxes) this - * is not a problem. + * The terms are borrowed from linear programming. + * A feasible line segment is one which contains a subinterval + * aligned on the appropriate boundary of sufficient length. + * + * The objective function is the magnitude of the least residue + * of the smallest aligned address within the subinterval minus the goal + * mod the largest page frame number. A conditional is used instead of + * of remainder so as to avoid the overhead of division. * - * On low memory boxes we get it right in 100% of the cases. + * The idea here is to iterate over the feasible set and minimize + * the objective function (by exhaustive search). The search space + * is "thinned" prior to the iteration by using the heuristic that + * the interval must be at least of the length requested, though + * that is not sufficient because of alignment constraints. */ +#define FEASIBLE(seg, len, align) \ +( \ + (segment_end(seg) >= RND_UP(segment_start(seg), align)) \ + && \ + ((segment_end(seg) - RND_UP(segment_start(seg), align)) > (len))\ +) + +#define STARTS_BELOW(seg,goal,align,len) \ + (RND_UP(segment_start(seg), align) <= (goal)) + +#define ENDS_ABOVE(seg, goal, align, len) \ + ((segment_end(seg) > (goal)) && ((segment_end(seg) - (goal)) > (len))) + +#define GOAL_WITHIN(seg,goal,align,len) \ + (STARTS_BELOW(seg,goal,align,len) && ENDS_ABOVE(seg,goal,align,len)) + +#define GOAL_ABOVE(seg, goal, align) \ + ((goal) > segment_end(seg)) + +#define DISTANCE_BELOW(seg, goal, align) \ + (segment_start(seg) - (goal)) + +#define DISTANCE_ABOVE(seg, goal, align) \ + (((ULONG_MAX - (goal)) + 1) + segment_start(seg)) + +#define OBJECTIVE(seg, goal, align, len) \ +( GOAL_WITHIN(seg,goal,align,len) \ + ? 0UL \ + : ( \ + GOAL_ABOVE(seg, goal, align) \ + ? DISTANCE_ABOVE(seg, goal, align) \ + : DISTANCE_BELOW(seg, goal, align) \ + ) \ +) + +#define UNVISITED 0 +#define LEFT_SEARCHED 1 +#define RIGHT_SEARCHED 2 +#define VISITED 3 + /* - * alignment has to be a power of 2 value. 
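+ * A concrete reading of the OBJECTIVE macro above: it measures
+ * the unsigned wraparound distance from the goal up to
+ * segment_start(seg), and is 0 when the goal itself lies inside
+ * the segment with room for the request after it. E.g. with a
+ * goal of 16MB, a feasible segment starting at 20MB scores 4MB,
+ * while one ending below 16MB scores nearly the whole address
+ * space, so segments at or above the goal normally win and,
+ * among those, the lowest start wins.
+ *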
+ * __alloc_bootmem_core attempts to satisfy reservation requests + * of a certain size with alignment constraints, so that the beginning + * of the allocated line segment is as near as possible to the goal + * in the following sense: + * + * The beginning of the allocated line segment is either the lowest + * possible address above the goal, or the lowest possible address + * overall. This actually has a simple notion of distance, namely + * (goal - start) % (MAX_ADDR + 1). The OBJECTIVE macros measures + * this distance, albeit with some arithmetic complications. + * + * The algorithm proceeds as follows: + * (1) Divide the set of available intervals into those which are + * long enough and those which are not long enough, ignoring + * alignment constraints. + * (2) Perform depth-first search over the tree of supposedly + * long enough intervals for the best possible interval. + * + * The FEASIBLE macro is used to determine whether it is truly + * possible to place an aligned interval of sufficient length + * within the interval, and it is needed because the true length + * of the interval is not sufficient to determine that, and + * because it is not truly possible to subdivide the set of available + * intervals according to this criterion with pure tree operations. + * + * As address ranges are the granularity of available interval tracking, + * this should provide optimal merging behavior. */ + static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal) { - unsigned long i, start = 0; + unsigned long length; + segment_tree_node_t left_half, right_half, reserved, *left, *right; + segment_tree_node_t *optimum, *node; + treap_node_t *tmp, *infeasible, *feasible; void *ret; - unsigned long offset, remaining_size; - unsigned long areasize, preferred, incr; - unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >> - PAGE_SHIFT); - if (!size) BUG(); + feasible = infeasible = NULL; - if (align & (align-1)) + if(!align) + align = 1; + + length = size; + if(!length) BUG(); - offset = 0; - if (align && - (bdata->node_boot_start & (align - 1UL)) != 0) - offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; - - /* - * We try to allocate bootmem pages above 'goal' - * first, then we try to allocate lower pages. - */ - if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { - preferred = goal - bdata->node_boot_start; - } else - preferred = 0; - - preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; - preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - incr = align >> PAGE_SHIFT ? 
: 1; - -restart_scan: - for (i = preferred; i < eidx; i += incr) { - unsigned long j; - if (test_bit(i, bdata->node_bootmem_map)) + treap_split(&bdata->segment_tree.length_tree, length, &infeasible, + &feasible); + optimum = NULL; + + tmp = feasible; + while(tmp) { + + if(tmp->marker == UNVISITED) { + if(tmp->left) { + tmp->marker = LEFT_SEARCHED; + tmp = tmp->left; + continue; + } else if(tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if(tmp->marker == LEFT_SEARCHED) { + if(tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if(tmp->marker == RIGHT_SEARCHED) + tmp->marker = VISITED; + else if(tmp->marker == VISITED) { + tmp->marker = UNVISITED; + tmp = tmp->parent; continue; - for (j = i + 1; j < i + areasize; ++j) { - if (j >= eidx) - goto fail_block; - if (test_bit (j, bdata->node_bootmem_map)) - goto fail_block; - } - start = i; - goto found; - fail_block:; + } else + BUG(); + + if(!tmp) + break; + + node = length_segment_treap(tmp); + + if(!optimum && FEASIBLE(node, length, align)) + + optimum = node; + + else if(FEASIBLE(node, length, align) + && (OBJECTIVE(node, goal, align, length) + < OBJECTIVE(optimum, goal, align, length))) + + optimum = node; + } - if (preferred) { - preferred = offset; - goto restart_scan; + + /* + * Restore the set of available intervals keyed by length, + * taking into account the need to remove the optimum from + * the set if it has been determined. + */ + if(!optimum) { + treap_join(&bdata->segment_tree.length_tree, &feasible, + &infeasible); + return NULL; } - return NULL; -found: - if (start >= eidx) - BUG(); + + if(!treap_root_delete(treap_node_link(&optimum->start))) + treap_root_delete(&bdata->segment_tree.start_tree); + + if(!treap_root_delete(treap_node_link(&optimum->length))) + treap_root_delete(&feasible); + + treap_join(&bdata->segment_tree.length_tree, &infeasible, &feasible); /* - * Is the next page of the previous allocation-end the start - * of this allocation's buffer? If yes then we can 'merge' - * the previous partial page with this allocation. - */ - if (align <= PAGE_SIZE - && bdata->last_offset && bdata->last_pos+1 == start) { - offset = (bdata->last_offset+align-1) & ~(align-1); - if (offset > PAGE_SIZE) + * Now the iteration has converged to the optimal feasible interval. + * Within that interval we must now choose a subinterval + * satisfying the alignment constraints and do the appropriate + * splitting of the interval from which it was drawn. 
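+ * For example, reserving 0x1000 bytes at goal 0x180000 out of an
+ * optimum segment [0x100000, 0x1fffff] takes [0x180000, 0x180fff]
+ * and puts the two leftovers [0x100000, 0x17ffff] and
+ * [0x181000, 0x1fffff] back into the trees; if the goal cannot be
+ * honoured, the reservation slides to the lowest suitably aligned
+ * address in the segment instead.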
+ */ + + segment_set_endpoints(&reserved, goal, goal + length - 1); + + if(!segment_contains_point(optimum, goal) + || !segment_contains(optimum, &reserved)) + + segment_set_endpoints(&reserved, + RND_UP(segment_start(optimum), align), + RND_UP(segment_start(optimum),align)+length-1); + + segment_set_endpoints(&left_half, segment_start(optimum), + segment_end(optimum)); + + left = &left_half; + right = &right_half; + segment_complement(&left, &reserved, &right); + + if(!left && !right) + segment_free(bdata, optimum); + + if(left) { + segment_set_endpoints(optimum, segment_start(left), + segment_end(left)); + segment_insert(&bdata->segment_tree, optimum); + } + + if(right) { + segment_tree_node_t *segment = segment_alloc(bdata); + if(!segment) BUG(); - remaining_size = PAGE_SIZE-offset; - if (size < remaining_size) { - areasize = 0; - // last_pos unchanged - bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - } else { - remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - bdata->last_pos = start+areasize-1; - bdata->last_offset = remaining_size; - } - bdata->last_offset &= ~PAGE_MASK; - } else { - bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + segment_set_endpoints(segment, segment_start(right), + segment_end(right)); + segment_insert(&bdata->segment_tree, segment); } + /* - * Reserve the area now: + * Convert the physical address to a kernel virtual address, + * zero out the memory within the interval, and return it. */ - for (i = start; i < start+areasize; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - BUG(); + ret = (void *)(phys_to_virt(segment_start(&reserved))); memset(ret, 0, size); + return ret; } +/* + * free_all_bootmem_core's responsibilities are to initialize the + * node_mem_map array of struct page with the availability information + * regarding physical memory, and to make available the memory the + * bootmem allocator itself used for tracking available physical memory. + * Here the prior behavior with respect to page alignment is emulated + * by reducing the granularity of the address ranges to page frames, + * using the conservative approximation of the largest page-aligned + * interval lying within the interval seen to be available, or making + * no memory available if the interval is smaller than a page in length. 
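+ * For instance, with 4 KiB pages an available segment
+ * [0x1800, 0x57ff] is trimmed to page frames 2 through 4
+ * (bytes 0x2000 to 0x4fff): the partial pages at either end are
+ * quietly left reserved, and a segment shorter than one page
+ * contributes nothing.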
+ */ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { - struct page *page = pgdat->node_mem_map; - bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; + unsigned long total = 0UL, mapstart, start, end; + unsigned long node_start = pgdat->bdata->node_boot_start >> PAGE_SHIFT; + struct page *page; + treap_node_t *parent, *tmp; - if (!bdata->node_bootmem_map) BUG(); + mapstart = virt_to_phys(pgdat->bdata->node_bootmem_map); - count = 0; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); - for (i = 0; i < idx; i++, page++) { - if (!test_bit(i, bdata->node_bootmem_map)) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - } - total += count; +#ifdef DEBUG_BOOTMEM + + printk("Available physical memory:\n"); + +#endif /* DEBUG_BOOTMEM */ + + free_bootmem_core(pgdat->bdata, mapstart, + RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE)); /* - * Now free the allocator bitmap itself, it's not - * needed anymore: + * Destructive post-order traversal of the length tree. + * The tree is never used again, so no attempt is made + * to restore it to working order. */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + tmp = pgdat->bdata->segment_tree.length_tree; + treap_find_leftmost_leaf(tmp); + while(tmp) { + segment_tree_node_t *segment = length_segment_treap(tmp); + + /* + * This calculation differs from that in prior + * incarnations in this subsystem, so I describe it + * in passing detail here. + * + ******************************************************* + * + * We have start so that start is the least pfn with + * + * PAGE_SIZE * start >= segment_start(segment) + * + * so after division and ceiling: + * + * start = DIV_UP(segment_start(segment), PAGE_SIZE) + * + ******************************************************* + * + * Now the last pfn is the greatest pfn such that + * + * PAGE_SIZE * last + PAGE_SIZE - 1 <= segment_end(segment) + * + * -or- + * + * PAGE_SIZE * (last + 1) <= segment_end(segment) + 1 + * + * giving us after division and flooring: + * + * last + 1 = DIV_DN(segment_end(segment) + 1, PAGE_SIZE) + * + * or using end as a -strict- upper bound (i.e. end > pfn), + * we have + * + * end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE) + * + */ + + start = DIV_UP(segment_start(segment), PAGE_SIZE); + end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE); + +#ifdef DEBUG_BOOTMEM + + if(start < end) + printk("available segment: [%lu,%lu]\n", + start * PAGE_SIZE, + end * PAGE_SIZE - 1); + +#endif /* DEBUG_BOOTMEM */ + + for( page = pgdat->node_mem_map + (start - node_start); + page < pgdat->node_mem_map + (end - node_start); + ++page) { + + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + + /* + * In most calculations in this file, closed intervals + * are considered. In this instance, a half-open interval + * is being considered, and so the usual end - start + 1 + * calculation does not apply. 
+ */ + if(start < end) + total += end - start; + + treap_find_parent_and_remove_child(tmp, parent); + tmp = parent; + treap_find_leftmost_leaf(tmp); } - total += count; - bdata->node_bootmem_map = NULL; return total; } -unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +/* + * Wrappers around the core routines so that they operate on the + * per-node memory structures (pg_data_t *pgdat). + */ +unsigned long __init init_bootmem_node (pg_data_t *pgdat, + unsigned long freepfn, + unsigned long startpfn, + unsigned long endpfn) { - return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); + return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { reserve_bootmem_core(pgdat->bdata, physaddr, size); } -void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if(ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { - return(free_bootmem_core(pgdat->bdata, physaddr, size)); + free_bootmem_core(pgdat->bdata, physaddr, size); } unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) { - return(free_all_bootmem_core(pgdat)); + return free_all_bootmem_core(pgdat); } +/* + * Non-node-aware wrappers for the core routines. The per-node + * structures are hidden by using the global variable contig_page_data. + */ unsigned long __init init_bootmem (unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return(init_bootmem_core(&contig_page_data, start, 0, pages)); + return init_bootmem_core(&contig_page_data, start, 0, pages); } -void __init reserve_bootmem (unsigned long addr, unsigned long size) +/* + * In multinode configurations it is not desirable to make memory + * available without information about the node assignment of the + * memory range, so even though reserve_bootmem() may operate + * without node information this cannot. + * + * This apparent inconsistency in the interface actually makes + * some sense, as when presented with irregular node to memory range + * assignments in firmware tables, the original request to make memory + * available will be aware of its node assignment. But an outstanding + * issue is that a non-node-aware memory reservation request (via + * alloc_bootmem()) will not know to which node to return the memory. + * + * Resolving that issue would involve tracking dynamic allocations + * separately from assertions regarding the presence of physical + * memory, which is feasible given a change of interface, or perhaps a + * separate tree in each node for memory reserved by dynamic allocations. + */ +void __init free_bootmem (unsigned long addr, unsigned long size) { - reserve_bootmem_core(contig_page_data.bdata, addr, size); + free_bootmem_core(contig_page_data.bdata, addr, size); } -void __init free_bootmem (unsigned long addr, unsigned long size) +/* + * reserve_bootmem operates without node information, yet is node + * aware. 
In situations where it may not be clear to where a given + * physical memory range is assigned this performs the task of + * searching the nodes on behalf of the caller. + */ +void __init reserve_bootmem (unsigned long addr, unsigned long size) { - return(free_bootmem_core(contig_page_data.bdata, addr, size)); + unsigned long start, end; + unsigned in_any_node = 0; + segment_tree_node_t segment, *tree; + pg_data_t *pgdat = pgdat_list; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + + /* + * For error checking, this must determine the node(s) within + * which an interval to be reserved lies. Otherwise, once the + * error checking is in place, the memory will be reported as + * multiply-reserved on those nodes not containing the memory. + */ + while(pgdat) { + unsigned in_node; + + tree = start_segment_treap(pgdat->bdata->segment_tree.start_tree); + in_node = segment_tree_intersects(tree, &segment); + in_any_node |= in_node; + + if(in_node) + reserve_bootmem_node(pgdat, addr, size); + + pgdat = pgdat->node_next; + } + if(!in_any_node) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved\n", + segment_start(&segment), + segment_end(&segment)); } +/* + * free_all_bootmem is now a convenience function, and iterates over + * all the nodes, performing free_all_bootmem_core. + */ unsigned long __init free_all_bootmem (void) { - return(free_all_bootmem_core(&contig_page_data)); + pg_data_t *pgdat = pgdat_list; + unsigned long total = 0UL; + + while(pgdat) { + total += free_all_bootmem_core(pgdat); + pgdat = pgdat->node_next; + } + + return total; } -void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +/* + * __alloc_bootmem performs a search over all nodes in order to satisfy + * an allocation request, for when it is unimportant from which node + * the memory used to satisfy an allocation is drawn. + */ +void * __init __alloc_bootmem (unsigned long size, unsigned long align, + unsigned long goal) { pg_data_t *pgdat = pgdat_list; void *ptr; while (pgdat) { - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal))) - return(ptr); - pgdat = pgdat->node_next; - } - /* - * Whoops, we cannot satisfy the allocation request. - */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); -void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) -{ - void *ptr; + if(ptr) + return ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); - if (ptr) - return (ptr); + pgdat = pgdat->node_next; + } - /* - * Whoops, we cannot satisfy the allocation request. - */ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } - diff -urN linux-2.4.17-rc1-virgin/mm/filemap.c linux-2.4.17-rc1-wli3/mm/filemap.c --- linux-2.4.17-rc1-virgin/mm/filemap.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/filemap.c Sun Dec 16 17:58:10 2001 @@ -53,7 +53,7 @@ EXPORT_SYMBOL(vm_min_readahead); -spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock * with the pagecache_lock held. 
@@ -63,7 +63,7 @@ * pagemap_lru_lock -> * pagecache_lock */ -spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) @@ -234,7 +234,7 @@ static void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) lru_cache_del(page); /* @@ -296,6 +296,7 @@ page_cache_release(page); + /* we hit this with lock depth of 1 or 2 */ if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -406,6 +407,8 @@ } page_cache_release(page); + + debug_lock_break(551); if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -454,6 +457,11 @@ return page; } +static struct page * __find_page(struct address_space * mapping, unsigned long index) +{ + return __find_page_nolock(mapping, index, *page_hash(mapping,index)); +} + /* * By the time this is called, the page is locked and * we don't have to worry about any races any more. @@ -594,12 +602,16 @@ list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - page_cache_get(page); spin_unlock(&pagecache_lock); + /* BKL is held ... */ + debug_lock_break(1); + conditional_schedule(); + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -607,7 +619,7 @@ writepage(page); } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -623,14 +635,28 @@ */ void filemap_fdatawait(struct address_space * mapping) { + DEFINE_LOCK_COUNT(); + spin_lock(&pagecache_lock); +restart: while (!list_empty(&mapping->locked_pages)) { struct page *page = list_entry(mapping->locked_pages.next, struct page, list); list_del(&page->list); list_add(&page->list, &mapping->clean_pages); - + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(2); + if (conditional_schedule_needed()) { + page_cache_get(page); + break_spin_lock_and_resched(&pagecache_lock); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -894,6 +920,7 @@ * the hash-list needs a held write-lock. */ repeat: + break_spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -970,7 +997,53 @@ /* - * Same as grab_cache_page, but do not wait if the page is unavailable. + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. + * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. 
Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + while (--index >= start) { + page = __find_page(mapping, index); + if (!page || !PageActive(page)) + break; + deactivate_page_nolock(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +/* Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -1240,6 +1313,12 @@ if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1248,25 +1327,6 @@ return; } -/* - * Mark a page as having seen activity. - * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. - */ -void mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page)) { - activate_page(page); - ClearPageReferenced(page); - return; - } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); -} /* * This is a generic file read routine, and uses the @@ -1375,7 +1435,7 @@ * beginning or we just did an lseek. */ if (!offset || !filp->f_reada) - mark_page_accessed(page); + touch_page(page); /* * Ok, we have the page, and it's up-to-date, so @@ -1492,8 +1552,8 @@ ssize_t retval; int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; struct kiobuf * iobuf; - struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; - struct inode * inode = mapping->host; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; new_iobuf = 0; iobuf = filp->f_iobuf; @@ -1774,7 +1834,7 @@ nr = max; /* And limit it to a sane percentage of the inactive list.. */ - max = nr_inactive_pages / 2; + max = nr_inactive_clean_pages / 2; if (nr > max) nr = max; @@ -1919,7 +1979,7 @@ * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. 
*/ - mark_page_accessed(page); + touch_page(page); flush_page_to_ram(page); return page; @@ -2055,6 +2115,8 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2085,6 +2147,9 @@ address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2343,7 +2408,7 @@ int error = 0; /* This caps the number of vma's this process can own */ - if (vma->vm_mm->map_count > MAX_MAP_COUNT) + if (vma->vm_mm->map_count > max_map_count) return -ENOMEM; if (start == vma->vm_start) { @@ -2443,7 +2508,7 @@ if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, ZPR_PARTITION); return 0; } @@ -2773,7 +2838,7 @@ page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) goto out; - mark_page_accessed(page); + touch_page(page); if (Page_Uptodate(page)) goto out; @@ -2970,6 +3035,7 @@ unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, @@ -2978,8 +3044,10 @@ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -3023,8 +3091,11 @@ unlock: kunmap(page); /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + touch_page(page); page_cache_release(page); if (status < 0) diff -urN linux-2.4.17-rc1-virgin/mm/filemap.c~ linux-2.4.17-rc1-wli3/mm/filemap.c~ --- linux-2.4.17-rc1-virgin/mm/filemap.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/filemap.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,3144 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli + */ + +atomic_t page_cache_size = ATOMIC_INIT(0); +unsigned int page_hash_bits; +struct page **page_hash_table; + +int vm_max_readahead = 31; +int vm_min_readahead = 3; +EXPORT_SYMBOL(vm_max_readahead); +EXPORT_SYMBOL(vm_min_readahead); + + +spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +/* + * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * with the pagecache_lock held. 
+ * + * Ordering: + * swap_lock -> + * pagemap_lru_lock -> + * pagecache_lock + */ +spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +#define CLUSTER_PAGES (1 << page_cluster) +#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) + +static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p)); +static void add_page_to_hash_queue(struct page * page, struct page **p) +{ + struct page *next = *p; + + *p = page; + page->next_hash = next; + page->pprev_hash = p; + if (next) + next->pprev_hash = &page->next_hash; + if (page->buffers) + PAGE_BUG(page); + atomic_inc(&page_cache_size); +} + +static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) +{ + struct list_head *head = &mapping->clean_pages; + + mapping->nrpages++; + list_add(&page->list, head); + page->mapping = mapping; +} + +static inline void remove_page_from_inode_queue(struct page * page) +{ + struct address_space * mapping = page->mapping; + + mapping->nrpages--; + list_del(&page->list); + page->mapping = NULL; +} + +static inline void remove_page_from_hash_queue(struct page * page) +{ + struct page *next = page->next_hash; + struct page **pprev = page->pprev_hash; + + if (next) + next->pprev_hash = pprev; + *pprev = next; + page->pprev_hash = NULL; + atomic_dec(&page_cache_size); +} + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. + */ +void __remove_inode_page(struct page *page) +{ + if (PageDirty(page)) BUG(); + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); +} + +void remove_inode_page(struct page *page) +{ + if (!PageLocked(page)) + PAGE_BUG(page); + + spin_lock(&pagecache_lock); + __remove_inode_page(page); + spin_unlock(&pagecache_lock); +} + +static inline int sync_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + return mapping->a_ops->sync_page(page); + return 0; +} + +/* + * Add a page to the dirty page list. + */ +void set_page_dirty(struct page *page) +{ + if (!test_and_set_bit(PG_dirty, &page->flags)) { + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock(&pagecache_lock); + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + spin_unlock(&pagecache_lock); + + if (mapping->host) + mark_inode_dirty_pages(mapping->host); + } + } +} + +/** + * invalidate_inode_pages - Invalidate all the unlocked pages of one inode + * @inode: the inode which pages we want to invalidate + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + */ + +void invalidate_inode_pages(struct inode * inode) +{ + struct list_head *head, *curr; + struct page * page; + + head = &inode->i_mapping->clean_pages; + + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + curr = head->next; + + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + + /* We cannot invalidate something in dirty.. 
*/ + if (PageDirty(page)) + continue; + + /* ..or locked */ + if (TryLockPage(page)) + continue; + + if (page->buffers && !try_to_free_buffers(page, 0)) + goto unlock; + + if (page_count(page) != 1) + goto unlock; + + __lru_cache_del(page); + __remove_inode_page(page); + UnlockPage(page); + page_cache_release(page); + continue; +unlock: + UnlockPage(page); + continue; + } + + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +static int do_flushpage(struct page *page, unsigned long offset) +{ + int (*flushpage) (struct page *, unsigned long); + flushpage = page->mapping->a_ops->flushpage; + if (flushpage) + return (*flushpage)(page, offset); + return block_flushpage(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (page->buffers) + do_flushpage(page, partial); +} + +static void truncate_complete_page(struct page *page) +{ + /* Leave it on the LRU if it gets converted into anonymous buffers */ + if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) + lru_cache_del(page); + + /* + * We remove the page from the page cache _after_ we have + * destroyed all buffer-cache references to it. Otherwise some + * other process might think this inode page is not in the + * page cache and creates a buffer-cache alias to it causing + * all sorts of fun problems ... + */ + ClearPageDirty(page); + ClearPageUptodate(page); + remove_inode_page(page); + page_cache_release(page); +} + +static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); +static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + unsigned long offset; + + page = list_entry(curr, struct page, list); + offset = page->index; + + /* Is one of the pages to truncate? */ + if ((offset >= start) || (*partial && (offset + 1) == start)) { + int failed; + + page_cache_get(page); + failed = TryLockPage(page); + + list_del(head); + if (!failed) + /* Restart after this page */ + list_add_tail(head, curr); + else + /* Restart on this page */ + list_add(head, curr); + + spin_unlock(&pagecache_lock); + unlocked = 1; + + if (!failed) { + if (*partial && (offset + 1) == start) { + truncate_partial_page(page, *partial); + *partial = 0; + } else + truncate_complete_page(page); + + UnlockPage(page); + } else + wait_on_page(page); + + page_cache_release(page); + + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + curr = curr->prev; + } + return unlocked; +} + + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * + * Truncate the page cache at a set offset, removing the pages + * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. 
+ */ +void truncate_inode_pages(struct address_space * mapping, loff_t lstart) +{ + unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + } while (unlocked); + /* Traversed all three lists without dropping the lock */ + spin_unlock(&pagecache_lock); +} + +static inline int invalidate_this_page2(struct page * page, + struct list_head * curr, + struct list_head * head) +{ + int unlocked = 1; + + /* + * The page is locked and we hold the pagecache_lock as well + * so both page_count(page) and page->buffers stays constant here. + */ + if (page_count(page) == 1 + !!page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + truncate_complete_page(page); + } else { + if (page->buffers) { + /* Restart after this page */ + list_del(head); + list_add_tail(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + block_invalidate_page(page); + } else + unlocked = 0; + + ClearPageDirty(page); + ClearPageUptodate(page); + } + + return unlocked; +} + +static int FASTCALL(invalidate_list_pages2(struct list_head *)); +static int invalidate_list_pages2(struct list_head *head) +{ + struct list_head *curr; + struct page * page; + int unlocked = 0; + + restart: + curr = head->prev; + while (curr != head) { + page = list_entry(curr, struct page, list); + + if (!TryLockPage(page)) { + int __unlocked; + + __unlocked = invalidate_this_page2(page, curr, head); + UnlockPage(page); + unlocked |= __unlocked; + if (!__unlocked) { + curr = curr->prev; + continue; + } + } else { + /* Restart on this page */ + list_del(head); + list_add(head, curr); + + page_cache_get(page); + spin_unlock(&pagecache_lock); + unlocked = 1; + wait_on_page(page); + } + + page_cache_release(page); + if (current->need_resched) { + __set_current_state(TASK_RUNNING); + schedule(); + } + + spin_lock(&pagecache_lock); + goto restart; + } + return unlocked; +} + +/** + * invalidate_inode_pages2 - Clear all the dirty bits around if it can't + * free the pages because they're mapped. + * @mapping: the address_space which pages we want to invalidate + */ +void invalidate_inode_pages2(struct address_space * mapping) +{ + int unlocked; + + spin_lock(&pagecache_lock); + do { + unlocked = invalidate_list_pages2(&mapping->clean_pages); + unlocked |= invalidate_list_pages2(&mapping->dirty_pages); + unlocked |= invalidate_list_pages2(&mapping->locked_pages); + } while (unlocked); + spin_unlock(&pagecache_lock); +} + +static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) +{ + goto inside; + + for (;;) { + page = page->next_hash; +inside: + if (!page) + goto not_found; + if (page->mapping != mapping) + continue; + if (page->index == offset) + break; + } + +not_found: + return page; +} + +static struct page * __find_page(struct address_space * mapping, unsigned long index) +{ + return __find_page_nolock(mapping, index, *page_hash(mapping,index)); +} + +/* + * By the time this is called, the page is locked and + * we don't have to worry about any races any more. + * + * Start the IO.. 
+ */ +static int writeout_one_page(struct page *page) +{ + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) + continue; + + bh->b_flushtime = jiffies; + ll_rw_block(WRITE, 1, &bh); + } while ((bh = bh->b_this_page) != head); + return 0; +} + +int waitfor_one_page(struct page *page) +{ + int error = 0; + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + wait_on_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + error = -EIO; + } while ((bh = bh->b_this_page) != head); + return error; +} + +static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +{ + struct list_head *curr; + struct page *page; + int retval = 0; + + spin_lock(&pagecache_lock); + curr = head->next; + while (curr != head) { + page = list_entry(curr, struct page, list); + curr = curr->next; + if (!page->buffers) + continue; + if (page->index >= end) + continue; + if (page->index < start) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + lock_page(page); + + /* The buffers could have been free'd while we waited for the page lock */ + if (page->buffers) + retval |= fn(page); + + UnlockPage(page); + spin_lock(&pagecache_lock); + curr = page->list.next; + page_cache_release(page); + } + spin_unlock(&pagecache_lock); + + return retval; +} + +/* + * Two-stage data sync: first start the IO, then go back and + * collect the information.. + */ +int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) +{ + int retval; + + /* writeout dirty buffers on pages from both clean and dirty lists */ + retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + + /* now wait for locked buffers on pages from both clean and dirty lists */ + retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + + return retval; +} + +/* + * In-memory filesystems have to fail their + * writepage function - and this has to be + * worked around in the VM layer.. + * + * We + * - mark the page dirty again (but do NOT + * add it back to the inode dirty list, as + * that would livelock in fdatasync) + * - activate the page so that the page stealer + * doesn't try to write it out over and over + * again. + */ +int fail_writepage(struct page *page) +{ + /* Only activate on memory-pressure, not fsync.. */ + if (PageLaunder(page)) { + activate_page(page); + SetPageReferenced(page); + } + + /* Set the page dirty again, unlock */ + SetPageDirty(page); + UnlockPage(page); + return 0; +} + +EXPORT_SYMBOL(fail_writepage); + +/** + * filemap_fdatasync - walk the list of dirty pages of the given address space + * and writepage() all of them. 
+ * + * @mapping: address space structure to write + * + */ +void filemap_fdatasync(struct address_space * mapping) +{ + int (*writepage)(struct page *) = mapping->a_ops->writepage; + + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->dirty_pages)) { + struct page *page = list_entry(mapping->dirty_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->locked_pages); + + if (!PageDirty(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + lock_page(page); + + if (PageDirty(page)) { + ClearPageDirty(page); + writepage(page); + } else + UnlockPage(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); +} + +/** + * filemap_fdatawait - walk the list of locked pages of the given address space + * and wait for all of them. + * + * @mapping: address space structure to wait for + * + */ +void filemap_fdatawait(struct address_space * mapping) +{ + spin_lock(&pagecache_lock); + + while (!list_empty(&mapping->locked_pages)) { + struct page *page = list_entry(mapping->locked_pages.next, struct page, list); + + list_del(&page->list); + list_add(&page->list, &mapping->clean_pages); + + if (!PageLocked(page)) + continue; + + page_cache_get(page); + spin_unlock(&pagecache_lock); + + ___wait_on_page(page); + + page_cache_release(page); + spin_lock(&pagecache_lock); + } + spin_unlock(&pagecache_lock); +} + +/* + * Add a page to the inode page cache. + * + * The caller must have locked the page and + * set all the page flags correctly.. + */ +void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) +{ + if (!PageLocked(page)) + BUG(); + + page->index = index; + page_cache_get(page); + spin_lock(&pagecache_lock); + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, page_hash(mapping, index)); + spin_unlock(&pagecache_lock); + + lru_cache_add(page); +} + +/* + * This adds a page to the page cache, starting out as locked, + * owned by us, but unreferenced, not uptodate and with no errors. + */ +static inline void __add_to_page_cache(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + unsigned long flags; + + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); + page->flags = flags | (1 << PG_locked); + page_cache_get(page); + page->index = offset; + add_page_to_inode_queue(mapping, page); + add_page_to_hash_queue(page, hash); +} + +void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) +{ + spin_lock(&pagecache_lock); + __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); + spin_unlock(&pagecache_lock); + lru_cache_add(page); +} + +int add_to_page_cache_unique(struct page * page, + struct address_space *mapping, unsigned long offset, + struct page **hash) +{ + int err; + struct page *alias; + + spin_lock(&pagecache_lock); + alias = __find_page_nolock(mapping, offset, *hash); + + err = 1; + if (!alias) { + __add_to_page_cache(page,mapping,offset,hash); + err = 0; + } + + spin_unlock(&pagecache_lock); + if (!err) + lru_cache_add(page); + return err; +} + +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. 
+ */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page **hash = page_hash(mapping, offset); + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + if (page) + return 0; + + page = page_cache_alloc(mapping); + if (!page) + return -ENOMEM; + + if (!add_to_page_cache_unique(page, mapping, offset, hash)) { + int error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first. + */ + page_cache_release(page); + return 0; +} + +/* + * Read in an entire cluster at once. A cluster is usually a 64k- + * aligned block that includes the page requested in "offset." + */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); +static int read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize) +{ + unsigned long pages = CLUSTER_PAGES; + + offset = CLUSTER_OFFSET(offset); + while ((pages-- > 0) && (offset < filesize)) { + int error = page_cache_read(file, offset); + if (error < 0) + return error; + offset ++; + } + + return 0; +} + +/* + * Wait for a page to get unlocked. + * + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. + */ +void ___wait_on_page(struct page *page) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&page->wait, &wait); + do { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page)) + break; + sync_page(page); + schedule(); + } while (PageLocked(page)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + +void unlock_page(struct page *page) +{ + clear_bit(PG_launder, &(page)->flags); + smp_mb__before_clear_bit(); + if (!test_and_clear_bit(PG_locked, &(page)->flags)) + BUG(); + smp_mb__after_clear_bit(); + if (waitqueue_active(&(page)->wait)) + wake_up(&(page)->wait); +} + +/* + * Get a lock on the page, assuming we need to sleep + * to get it.. + */ +static void __lock_page(struct page *page) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue_exclusive(&page->wait, &wait); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + sync_page(page); + schedule(); + } + if (!TryLockPage(page)) + break; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + + +/* + * Get an exclusive lock on the page, optimistically + * assuming it's not locked.. + */ +void lock_page(struct page *page) +{ + if (TryLockPage(page)) + __lock_page(page); +} + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * __find_get_page(struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. 
+ */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) + page_cache_get(page); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + struct page **hash = page_hash(mapping, offset); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, offset, *hash); + if (page) { + if (TryLockPage(page)) + page = NULL; + } + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Must be called with the pagecache lock held, + * will return with it held (but it may be dropped + * during blocking operations.. + */ +static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *)); +static struct page * __find_lock_page_helper(struct address_space *mapping, + unsigned long offset, struct page *hash) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ +repeat: + page = __find_page_nolock(mapping, offset, hash); + if (page) { + page_cache_get(page); + if (TryLockPage(page)) { + spin_unlock(&pagecache_lock); + lock_page(page); + spin_lock(&pagecache_lock); + + /* Has the page been re-allocated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + UnlockPage(page); + page_cache_release(page); + goto repeat; + } + } + } + return page; +} + +/* + * Same as the above, but lock the page too, verifying that + * it's still valid once we own it. + */ +struct page * __find_lock_page (struct address_space *mapping, + unsigned long offset, struct page **hash) +{ + struct page *page; + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, offset, *hash); + spin_unlock(&pagecache_lock); + return page; +} + +/* + * Same as above, but create the page if required.. + */ +struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask) +{ + struct page *page; + struct page **hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + spin_unlock(&pagecache_lock); + if (!page) { + struct page *newpage = alloc_page(gfp_mask); + page = ERR_PTR(-ENOMEM); + if (newpage) { + spin_lock(&pagecache_lock); + page = __find_lock_page_helper(mapping, index, *hash); + if (likely(!page)) { + page = newpage; + __add_to_page_cache(page, mapping, index, hash); + newpage = NULL; + } + spin_unlock(&pagecache_lock); + if (newpage == NULL) + lru_cache_add(page); + else + page_cache_release(newpage); + } + } + return page; +} + +/* + * Returns locked page at given index in given cache, creating it if needed. + */ +struct page *grab_cache_page(struct address_space *mapping, unsigned long index) +{ + return find_or_create_page(mapping, index, mapping->gfp_mask); +} + + +/* + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. 
+ * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + while (--index >= start) { + page = __find_page(mapping, index); + if (!page || !PageActive(page)) + break; + deactivate_page_nolock(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +/* Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + */ +struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page, **hash; + + hash = page_hash(mapping, index); + page = __find_get_page(mapping, index, hash); + + if ( page ) { + if ( !TryLockPage(page) ) { + /* Page found and locked */ + /* This test is overly paranoid, but what the heck... */ + if ( unlikely(page->mapping != mapping || page->index != index) ) { + /* Someone reallocated this page under us. */ + UnlockPage(page); + page_cache_release(page); + return NULL; + } else { + return page; + } + } else { + /* Page locked by someone else */ + page_cache_release(page); + return NULL; + } + } + + page = page_cache_alloc(mapping); + if ( unlikely(!page) ) + return NULL; /* Failed to allocate a page */ + + if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) { + /* Someone else grabbed the page already. */ + page_cache_release(page); + return NULL; + } + + return page; +} + +#if 0 +#define PROFILE_READAHEAD +#define DEBUG_READAHEAD +#endif + +/* + * Read-ahead profiling information + * -------------------------------- + * Every PROFILE_MAXREADCOUNT, the following information is written + * to the syslog: + * Percentage of asynchronous read-ahead. + * Average of read-ahead fields context value. + * If DEBUG_READAHEAD is defined, a snapshot of these fields is written + * to the syslog. 
+ */ + +#ifdef PROFILE_READAHEAD + +#define PROFILE_MAXREADCOUNT 1000 + +static unsigned long total_reada; +static unsigned long total_async; +static unsigned long total_ramax; +static unsigned long total_ralen; +static unsigned long total_rawin; + +static void profile_readahead(int async, struct file *filp) +{ + unsigned long flags; + + ++total_reada; + if (async) + ++total_async; + + total_ramax += filp->f_ramax; + total_ralen += filp->f_ralen; + total_rawin += filp->f_rawin; + + if (total_reada > PROFILE_MAXREADCOUNT) { + save_flags(flags); + cli(); + if (!(total_reada > PROFILE_MAXREADCOUNT)) { + restore_flags(flags); + return; + } + + printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", + total_ramax/total_reada, + total_ralen/total_reada, + total_rawin/total_reada, + (total_async*100)/total_reada); +#ifdef DEBUG_READAHEAD + printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", + filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); +#endif + + total_reada = 0; + total_async = 0; + total_ramax = 0; + total_ralen = 0; + total_rawin = 0; + + restore_flags(flags); + } +} +#endif /* defined PROFILE_READAHEAD */ + +/* + * Read-ahead context: + * ------------------- + * The read ahead context fields of the "struct file" are the following: + * - f_raend : position of the first byte after the last page we tried to + * read ahead. + * - f_ramax : current read-ahead maximum size. + * - f_ralen : length of the current IO read block we tried to read-ahead. + * - f_rawin : length of the current read-ahead window. + * if last read-ahead was synchronous then + * f_rawin = f_ralen + * otherwise (was asynchronous) + * f_rawin = previous value of f_ralen + f_ralen + * + * Read-ahead limits: + * ------------------ + * MIN_READAHEAD : minimum read-ahead size when read-ahead. + * MAX_READAHEAD : maximum read-ahead size when read-ahead. + * + * Synchronous read-ahead benefits: + * -------------------------------- + * Using reasonable IO xfer length from peripheral devices increase system + * performances. + * Reasonable means, in this context, not too large but not too small. + * The actual maximum value is: + * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined + * and 32K if defined (4K page size assumed). + * + * Asynchronous read-ahead benefits: + * --------------------------------- + * Overlapping next read request and user process execution increase system + * performance. + * + * Read-ahead risks: + * ----------------- + * We have to guess which further data are needed by the user process. + * If these data are often not really needed, it's bad for system + * performances. + * However, we know that files are often accessed sequentially by + * application programs and it seems that it is possible to have some good + * strategy in that guessing. + * We only try to read-ahead files that seems to be read sequentially. + * + * Asynchronous read-ahead risks: + * ------------------------------ + * In order to maximize overlapping, we must start some asynchronous read + * request from the device, as soon as possible. + * We must be very careful about: + * - The number of effective pending IO read requests. + * ONE seems to be the only reasonable value. + * - The total memory pool usage for the file access stream. + * This maximum memory usage is implicitly 2 IO read chunks: + * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, + * 64k if defined (4K page size assumed). 
+ */ + +static inline int get_max_readahead(struct inode * inode) +{ + if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) + return vm_max_readahead; + return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; +} + +static void generic_file_readahead(int reada_ok, + struct file * filp, struct inode * inode, + struct page * page) +{ + unsigned long end_index; + unsigned long index = page->index; + unsigned long max_ahead, ahead; + unsigned long raend; + int max_readahead = get_max_readahead(inode); + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + raend = filp->f_raend; + max_ahead = 0; + +/* + * The current page is locked. + * If the current position is inside the previous read IO request, do not + * try to reread previously read ahead pages. + * Otherwise decide or not to read ahead some pages synchronously. + * If we are not going to read ahead, set the read ahead context for this + * page only. + */ + if (PageLocked(page)) { + if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { + raend = index; + if (raend < end_index) + max_ahead = filp->f_ramax; + filp->f_rawin = 0; + filp->f_ralen = 1; + if (!max_ahead) { + filp->f_raend = index + filp->f_ralen; + filp->f_rawin += filp->f_ralen; + } + } + } +/* + * The current page is not locked. + * If we were reading ahead and, + * if the current max read ahead size is not zero and, + * if the current position is inside the last read-ahead IO request, + * it is the moment to try to read ahead asynchronously. + * We will later force unplug device in order to force asynchronous read IO. + */ + else if (reada_ok && filp->f_ramax && raend >= 1 && + index <= raend && index + filp->f_ralen >= raend) { +/* + * Add ONE page to max_ahead in order to try to have about the same IO max size + * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. + * Compute the position of the last page we have tried to read in order to + * begin to read ahead just at the next page. + */ + raend -= 1; + if (raend < end_index) + max_ahead = filp->f_ramax + 1; + + if (max_ahead) { + filp->f_rawin = filp->f_ralen; + filp->f_ralen = 0; + reada_ok = 2; + } + } +/* + * Try to read ahead pages. + * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the + * scheduler, will work enough for us to avoid too bad actuals IO requests. + */ + ahead = 0; + while (ahead < max_ahead) { + ahead ++; + if ((raend + ahead) >= end_index) + break; + if (page_cache_read(filp, raend + ahead) < 0) + break; + } +/* + * If we tried to read ahead some pages, + * If we tried to read ahead asynchronously, + * Try to force unplug of the device in order to start an asynchronous + * read IO request. + * Update the read-ahead context. + * Store the length of the current read-ahead window. + * Double the current max read ahead size. + * That heuristic avoid to do some large IO for files that are not really + * accessed sequentially. + */ + if (ahead) { + filp->f_ralen += ahead; + filp->f_rawin += filp->f_ralen; + filp->f_raend = raend + ahead + 1; + + filp->f_ramax += filp->f_ramax; + + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + +#ifdef PROFILE_READAHEAD + profile_readahead((reada_ok == 2), filp); +#endif + } + + return; +} + + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. 
But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) +{ + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long index, offset; + struct page *cached_page; + int reada_ok; + int error; + int max_readahead = get_max_readahead(inode); + + cached_page = NULL; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + +/* + * If the current position is outside the previous read-ahead window, + * we reset the current read-ahead context and set read ahead max to zero + * (will be set to just needed value later), + * otherwise, we assume that the file accesses are sequential enough to + * continue read-ahead. + */ + if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { + reada_ok = 0; + filp->f_raend = 0; + filp->f_ralen = 0; + filp->f_ramax = 0; + filp->f_rawin = 0; + } else { + reada_ok = 1; + } +/* + * Adjust the current value of read-ahead max. + * If the read operation stay in the first half page, force no readahead. + * Otherwise try to increase read ahead max just enough to do the read request. + * Then, at least MIN_READAHEAD if read ahead is ok, + * and at most MAX_READAHEAD in all cases. + */ + if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { + filp->f_ramax = 0; + } else { + unsigned long needed; + + needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; + + if (filp->f_ramax < needed) + filp->f_ramax = needed; + + if (reada_ok && filp->f_ramax < vm_min_readahead) + filp->f_ramax = vm_min_readahead; + if (filp->f_ramax > max_readahead) + filp->f_ramax = max_readahead; + } + + for (;;) { + struct page *page, **hash; + unsigned long end_index, nr, ret; + + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + + if (index > end_index) + break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + nr = nr - offset; + + /* + * Try to find the data in the page cache.. + */ + hash = page_hash(mapping, index); + + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (!page) + goto no_cached_page; +found_page: + page_cache_get(page); + spin_unlock(&pagecache_lock); + + if (!Page_Uptodate(page)) + goto page_not_up_to_date; + generic_file_readahead(reada_ok, filp, inode, page); +page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping->i_mmap_shared != NULL) + flush_dcache_page(page); + + /* + * Mark the page accessed if we read the + * beginning or we just did an lseek. + */ + if (!offset || !filp->f_reada) + touch_page(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). 
+ */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + break; + +/* + * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. + */ +page_not_up_to_date: + generic_file_readahead(reada_ok, filp, inode, page); + + if (Page_Uptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (!error) { + if (Page_Uptodate(page)) + goto page_ok; + + /* Again, try some read-ahead while waiting for the page to finish.. */ + generic_file_readahead(reada_ok, filp, inode, page); + wait_on_page(page); + if (Page_Uptodate(page)) + goto page_ok; + error = -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + * + * We get here with the page cache lock held. + */ + if (!cached_page) { + spin_unlock(&pagecache_lock); + cached_page = page_cache_alloc(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + break; + } + + /* + * Somebody may have added the page while we + * dropped the page cache lock. Check for that. + */ + spin_lock(&pagecache_lock); + page = __find_page_nolock(mapping, index, *hash); + if (page) + goto found_page; + } + + /* + * Ok, add the new page to the hash-queues... + */ + page = cached_page; + __add_to_page_cache(page, mapping, index, hash); + spin_unlock(&pagecache_lock); + lru_cache_add(page); + cached_page = NULL; + + goto readpage; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + filp->f_reada = 1; + if (cached_page) + page_cache_release(cached_page); + UPDATE_ATIME(inode); +} + +static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset) +{ + ssize_t retval; + int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; + struct kiobuf * iobuf; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, &filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. + */ + retval = alloc_kiovec(1, &iobuf); + if (retval) + goto out; + new_iobuf = 1; + } + + blocksize = 1 << inode->i_blkbits; + blocksize_bits = inode->i_blkbits; + blocksize_mask = blocksize - 1; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + + retval = -EINVAL; + if ((offset & blocksize_mask) || (count & blocksize_mask)) + goto out_free; + if (!mapping->a_ops->direct_IO) + goto out_free; + + /* + * Flush to disk exlusively the _data_, metadata must remains + * completly asynchronous or performance will go to /dev/null. 
+ */ + filemap_fdatasync(mapping); + retval = fsync_inode_data_buffers(inode); + filemap_fdatawait(mapping); + if (retval < 0) + goto out_free; + + progress = retval = 0; + while (count > 0) { + iosize = count; + if (iosize > chunk_size) + iosize = chunk_size; + + retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); + if (retval) + break; + + retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + + if (rw == READ && retval > 0) + mark_dirty_kiobuf(iobuf, retval); + + if (retval >= 0) { + count -= retval; + buf += retval; + progress += retval; + } + + unmap_kiobuf(iobuf); + + if (retval != iosize) + break; + } + + if (progress) + retval = progress; + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return retval; +} + +int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (filp->f_flags & O_DIRECT) + goto o_direct; + + retval = -EFAULT; + if (access_ok(VERIFY_WRITE, buf, count)) { + retval = 0; + + if (count) { + read_descriptor_t desc; + + desc.written = 0; + desc.count = count; + desc.buf = buf; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + } + } + out: + return retval; + + o_direct: + { + loff_t pos = *ppos, size; + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + retval = 0; + if (!count) + goto out; /* skip atime */ + size = inode->i_size; + if (pos < size) { + if (pos + count > size) + count = size - pos; + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; + } + UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } +} + +static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = (struct file *) desc->buf; + + if (size > count) + size = count; + + if (file->f_op->sendpage) { + written = file->f_op->sendpage(file, page, offset, + size, &file->f_pos, sizef_op->write(file, kaddr + offset, size, &file->f_pos); + kunmap(page); + + set_fs(old_fs); + } + if (written < 0) { + desc->error = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + ssize_t retval; + struct file * in_file, * out_file; + struct inode * in_inode, * out_inode; + + /* + * Get input file, and verify that it is ok.. 
+ */ + retval = -EBADF; + in_file = fget(in_fd); + if (!in_file) + goto out; + if (!(in_file->f_mode & FMODE_READ)) + goto fput_in; + retval = -EINVAL; + in_inode = in_file->f_dentry->d_inode; + if (!in_inode) + goto fput_in; + if (!in_inode->i_mapping->a_ops->readpage) + goto fput_in; + retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count); + if (retval) + goto fput_in; + + /* + * Get output file, and verify that it is ok.. + */ + retval = -EBADF; + out_file = fget(out_fd); + if (!out_file) + goto fput_in; + if (!(out_file->f_mode & FMODE_WRITE)) + goto fput_out; + retval = -EINVAL; + if (!out_file->f_op || !out_file->f_op->write) + goto fput_out; + out_inode = out_file->f_dentry->d_inode; + retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count); + if (retval) + goto fput_out; + + retval = 0; + if (count) { + read_descriptor_t desc; + loff_t pos = 0, *ppos; + + retval = -EFAULT; + ppos = &in_file->f_pos; + if (offset) { + if (get_user(pos, offset)) + goto fput_out; + ppos = &pos; + } + + desc.written = 0; + desc.count = count; + desc.buf = (char *) out_file; + desc.error = 0; + do_generic_file_read(in_file, ppos, &desc, file_send_actor); + + retval = desc.written; + if (!retval) + retval = desc.error; + if (offset) + put_user(pos, offset); + } + +fput_out: + fput(out_file); +fput_in: + fput(in_file); +out: + return retval; +} + +static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + unsigned long max; + + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + /* Limit it to the size of the file.. */ + max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT; + if (index > max) + return 0; + max -= index; + if (nr > max) + nr = max; + + /* And limit it to a sane percentage of the inactive list.. */ + max = nr_inactive_clean_pages / 2; + if (nr > max) + nr = max; + + while (nr) { + page_cache_read(file, index); + index++; + nr--; + } + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT; + ret = do_readahead(file, start, len); + } + fput(file); + } + return ret; +} + +/* + * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are + * sure this is sequential access, we don't need a flexible read-ahead + * window size -- we can always use a large fixed size window. + */ +static void nopage_sequential_readahead(struct vm_area_struct * vma, + unsigned long pgoff, unsigned long filesize) +{ + unsigned long ra_window; + + ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); + ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); + + /* vm_raend is zero if we haven't read ahead in this area yet. */ + if (vma->vm_raend == 0) + vma->vm_raend = vma->vm_pgoff + ra_window; + + /* + * If we've just faulted the page half-way through our window, + * then schedule reads for the next window, and release the + * pages in the previous window. 
+ */ + if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { + unsigned long start = vma->vm_pgoff + vma->vm_raend; + unsigned long end = start + ra_window; + + if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) + end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; + if (start > end) + return; + + while ((start < end) && (start < filesize)) { + if (read_cluster_nonblocking(vma->vm_file, + start, filesize) < 0) + break; + start += CLUSTER_PAGES; + } + run_task_queue(&tq_disk); + + /* if we're far enough past the beginning of this area, + recycle pages that are in the previous window. */ + if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { + unsigned long window = ra_window << PAGE_SHIFT; + + end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); + end -= window + window; + filemap_sync(vma, end - window, window, MS_INVALIDATE); + } + + vma->vm_raend += ra_window; + } + + return; +} + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + struct page *page, **hash; + unsigned long size, pgoff, endoff; + + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if ((pgoff >= size) && (area->vm_mm == current->mm)) + return NULL; + + /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ + if (size > endoff) + size = endoff; + + /* + * Do we have something in the page cache already? + */ + hash = page_hash(mapping, pgoff); +retry_find: + page = __find_get_page(mapping, pgoff, hash); + if (!page) + goto no_cached_page; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!Page_Uptodate(page)) + goto page_not_uptodate; + +success: + /* + * Try read-ahead for sequential areas. + */ + if (VM_SequentialReadHint(area)) + nopage_sequential_readahead(area, pgoff, size); + + /* + * Found the page and have a reference on it, need to check sharing + * and possibly copy it over to another page.. + */ + touch_page(page); + flush_page_to_ram(page); + return page; + +no_cached_page: + /* + * If the requested offset is within our file, try to read a whole + * cluster of pages at once. + * + * Otherwise, we're off the end of a privately mapped file, + * so we need to map a zero page. + */ + if ((pgoff < size) && !VM_RandomReadHint(area)) + error = read_cluster_nonblocking(file, pgoff, size); + else + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. 
+ */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + UnlockPage(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (Page_Uptodate(page)) { + UnlockPage(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page(page); + if (Page_Uptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +/* Called with mm->page_table_lock held to protect against other + * threads/the swapper from ripping pte's out from under us. + */ +static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t pte = *ptep; + + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { + flush_tlb_page(vma, address); + set_page_dirty(page); + } + } + return 0; +} + +static inline int filemap_sync_pte_range(pmd_t * pmd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned long offset, unsigned int flags) +{ + pte_t * pte; + unsigned long end; + int error; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + pte = pte_offset(pmd, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + error = 0; + do { + error |= filemap_sync_pte(pte, vma, address + offset, flags); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + return error; +} + +static inline int filemap_sync_pmd_range(pgd_t * pgd, + unsigned long address, unsigned long size, + struct vm_area_struct *vma, unsigned int flags) +{ + pmd_t * pmd; + unsigned long offset, end; + int error; + + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return 0; + } + pmd = pmd_offset(pgd, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + error = 0; + do { + error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return error; +} + +int filemap_sync(struct vm_area_struct * vma, unsigned long address, + size_t size, unsigned int flags) +{ + pgd_t * dir; + unsigned long end = address + size; + int error = 0; + + /* Aquire the lock early; it may be possible to avoid dropping + * and reaquiring it repeatedly. 
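+	 *
+	 * As a rough sizing example (i386 without PAE, 4 KB pages): a
+	 * pgd-aligned 4 MB range is covered by one pgd entry, so the loop
+	 * below enters filemap_sync_pmd_range() once and sweeps 1024 ptes
+	 * while the lock is held.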
+	 */
+	spin_lock(&vma->vm_mm->page_table_lock);
+
+	dir = pgd_offset(vma->vm_mm, address);
+	flush_cache_range(vma->vm_mm, end - size, end);
+	if (address >= end)
+		BUG();
+	do {
+		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		dir++;
+	} while (address && (address < end));
+	flush_tlb_range(vma->vm_mm, end - size, end);
+
+	spin_unlock(&vma->vm_mm->page_table_lock);
+
+	return error;
+}
+
+static struct vm_operations_struct generic_file_vm_ops = {
+	nopage:		filemap_nopage,
+};
+
+/* This is used for a general mmap of a disk file */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
+		if (!mapping->a_ops->writepage)
+			return -EINVAL;
+	}
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	UPDATE_ATIME(inode);
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+/*
+ * The msync() system call.
+ */
+
+static int msync_interval(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int flags)
+{
+	struct file * file = vma->vm_file;
+	if (file && (vma->vm_flags & VM_SHARED)) {
+		int error;
+		error = filemap_sync(vma, start, end-start, flags);
+
+		if (!error && (flags & MS_SYNC)) {
+			struct inode * inode = file->f_dentry->d_inode;
+			down(&inode->i_sem);
+			filemap_fdatasync(inode->i_mapping);
+			if (file->f_op && file->f_op->fsync)
+				error = file->f_op->fsync(file, file->f_dentry, 1);
+			filemap_fdatawait(inode->i_mapping);
+			up(&inode->i_sem);
+		}
+		return error;
+	}
+	return 0;
+}
+
+asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error, error = -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+		goto out;
+	error = 0;
+	if (end == start)
+		goto out;
+	/*
+	 * If the interval [start,end) covers some unmapped address ranges,
+	 * just ignore them, but return -EFAULT at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	unmapped_error = 0;
+	for (;;) {
+		/* Still start < end. */
+		error = -EFAULT;
+		if (!vma)
+			goto out;
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -EFAULT;
+			start = vma->vm_start;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = msync_interval(vma, start, end, flags);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = msync_interval(vma, start, vma->vm_end, flags);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+}
+
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+	int behavior)
+{
+	VM_ClearReadHint(vma);
+	switch(behavior) {
+		case MADV_SEQUENTIAL:
+			vma->vm_flags |= VM_SEQ_READ;
+			break;
+		case MADV_RANDOM:
+			vma->vm_flags |= VM_RAND_READ;
+			break;
+		default:
+			break;
+	}
+	return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+	unsigned long end, int behavior)
+{
+	struct vm_area_struct * n;
+	struct mm_struct * mm = vma->vm_mm;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_end = end;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	if (n->vm_file)
+		get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_start = end;
+	__insert_vm_struct(mm, n);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+	unsigned long start, int behavior)
+{
+	struct vm_area_struct * n;
+	struct mm_struct * mm = vma->vm_mm;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_start = start;
+	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	if (n->vm_file)
+		get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_end = start;
+	__insert_vm_struct(mm, n);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	struct vm_area_struct * left, * right;
+	struct mm_struct * mm = vma->vm_mm;
+
+	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!left)
+		return -EAGAIN;
+	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!right) {
+		kmem_cache_free(vm_area_cachep, left);
+		return -EAGAIN;
+	}
+	*left = *vma;
+	*right = *vma;
+	left->vm_end = start;
+	right->vm_start = end;
+	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
+	if (vma->vm_file)
+		atomic_add(2, &vma->vm_file->f_count);
+
+	if (vma->vm_ops && vma->vm_ops->open) {
+		vma->vm_ops->open(left);
+		vma->vm_ops->open(right);
+	}
+	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_raend = 0;
+	lock_vma_mappings(vma);
+	spin_lock(&mm->page_table_lock);
+	vma->vm_start = start;
+	vma->vm_end = end;
+	setup_read_behavior(vma, behavior);
+	__insert_vm_struct(mm, left);
+	__insert_vm_struct(mm, right);
+	spin_unlock(&mm->page_table_lock);
+	unlock_vma_mappings(vma);
+	return 0;
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
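+ *
+ * For example (sizes are illustrative): advising MADV_RANDOM on the
+ * middle 16 KB of a 32 KB vma leaves three vmas behind; the head and
+ * tail keep the old behavior and only the middle one gets VM_RAND_READ
+ * (madvise_fixup_middle).  Touching exactly one end of the vma needs a
+ * single split (madvise_fixup_start or madvise_fixup_end), and covering
+ * the whole vma needs no split at all.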
+ */ +static long madvise_behavior(struct vm_area_struct * vma, + unsigned long start, unsigned long end, int behavior) +{ + int error = 0; + + /* This caps the number of vma's this process can own */ + if (vma->vm_mm->map_count > max_map_count) + return -ENOMEM; + + if (start == vma->vm_start) { + if (end == vma->vm_end) { + setup_read_behavior(vma, behavior); + vma->vm_raend = 0; + } else + error = madvise_fixup_start(vma, end, behavior); + } else { + if (end == vma->vm_end) + error = madvise_fixup_end(vma, start, behavior); + else + error = madvise_fixup_middle(vma, start, end, behavior); + } + + return error; +} + +/* + * Schedule all required I/O operations, then run the disk queue + * to make sure they are started. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + unsigned long start, unsigned long end) +{ + long error = -EBADF; + struct file * file; + unsigned long size, rlim_rss; + + /* Doesn't work if there's no mapped file. */ + if (!vma->vm_file) + return error; + file = vma->vm_file; + size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + /* Make sure this doesn't exceed the process's max rss. */ + error = -EIO; + rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : + LONG_MAX; /* default: see resource.h */ + if ((vma->vm_mm->rss + (end - start)) > rlim_rss) + return error; + + /* round to cluster boundaries if this isn't a "random" area. */ + if (!VM_RandomReadHint(vma)) { + start = CLUSTER_OFFSET(start); + end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); + + while ((start < end) && (start < size)) { + error = read_cluster_nonblocking(file, start, size); + start += CLUSTER_PAGES; + if (error < 0) + break; + } + } else { + while ((start < end) && (start < size)) { + error = page_cache_read(file, start); + start++; + if (error < 0) + break; + } + } + + /* Don't wait for someone else to push these requests. */ + run_task_queue(&tq_disk); + + return error; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for refill_inactive to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * refill_inactive to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). 
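+ *
+ * A minimal sketch of that pattern (buf, len and log_fd are purely
+ * illustrative; userspace code, needs <sys/mman.h> and <unistd.h>):
+ *
+ *	write(log_fd, buf, len);
+ *	fsync(log_fd);
+ *	if (madvise(buf, len, MADV_DONTNEED) < 0)
+ *		perror("madvise");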
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	if (vma->vm_flags & VM_LOCKED)
+		return -EINVAL;
+
+	zap_page_range(vma->vm_mm, start, end - start);
+	return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+	unsigned long end, int behavior)
+{
+	long error = -EBADF;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		error = madvise_behavior(vma, start, end, behavior);
+		break;
+
+	case MADV_WILLNEED:
+		error = madvise_willneed(vma, start, end);
+		break;
+
+	case MADV_DONTNEED:
+		error = madvise_dontneed(vma, start, end);
+		break;
+
+	default:
+		error = -EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+
+	down_write(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_write(&current->mm->mmap_sem);
+	return error;
+}
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
+	struct page * page, ** hash = page_hash(as, pgoff);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(as, pgoff, *hash);
+	if ((page) && (Page_Uptodate(page)))
+		present = 1;
+	spin_unlock(&pagecache_lock);
+
+	return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec" */
+	remaining = (end - start),
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ *		or len has a nonpositive value
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char * vec)
+{
+	int index = 0;
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	long error = -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_CACHE_MASK)
+		goto out;
+	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return error;
+}
+
+static inline
+struct page *__read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page **hash = page_hash(mapping, index);
+	struct page *page, *cached_page = NULL;
+	int err;
+repeat:
+	page = __find_get_page(mapping, index, hash);
+	if (!page) {
+		if (!cached_page) {
+			cached_page = page_cache_alloc(mapping);
+			if (!cached_page)
+				return ERR_PTR(-ENOMEM);
+		}
+		page = cached_page;
+		if (add_to_page_cache_unique(page, mapping, index, hash))
+			goto repeat;
+		cached_page = NULL;
+		err = filler(data, page);
+		if (err < 0) {
+			page_cache_release(page);
+			page = ERR_PTR(err);
+		}
+	}
+	if (cached_page)
+		page_cache_release(cached_page);
+	return page;
+}
+
+/*
+ * Read into the page cache. If a page already exists,
+ * and Page_Uptodate() is not set, try to fill the page.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+				unsigned long index,
+				int (*filler)(void *,struct page*),
+				void *data)
+{
+	struct page *page;
+	int err;
+
+retry:
+	page = __read_cache_page(mapping, index, filler, data);
+	if (IS_ERR(page))
+		goto out;
+	touch_page(page);
+	if (Page_Uptodate(page))
+		goto out;
+
+	lock_page(page);
+	if (!page->mapping) {
+		UnlockPage(page);
+		page_cache_release(page);
+		goto retry;
+	}
+	if (Page_Uptodate(page)) {
+		UnlockPage(page);
+		goto out;
+	}
+	err = filler(data, page);
+	if (err < 0) {
+		page_cache_release(page);
+		page = ERR_PTR(err);
+	}
+ out:
+	return page;
+}
+
+static inline struct page * __grab_cache_page(struct address_space *mapping,
+				unsigned long index, struct page **cached_page)
+{
+	struct page *page, **hash = page_hash(mapping, index);
+repeat:
+	page = __find_lock_page(mapping, index, hash);
+	if (!page) {
+		if (!*cached_page) {
+			*cached_page = page_cache_alloc(mapping);
+			if (!*cached_page)
+				return NULL;
+		}
+		page = *cached_page;
+		if (add_to_page_cache_unique(page, mapping, index, hash))
+			goto repeat;
+		*cached_page = NULL;
+	}
+	return page;
+}
+
+inline void remove_suid(struct inode *inode)
+{
+	unsigned int mode;
+
+	/* set S_IGID if S_IXGRP is set, and always set S_ISUID */
+	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
+
+	/* was any of the uid bits set? */
+	mode &= inode->i_mode;
+	if (mode && !capable(CAP_FSETID)) {
+		inode->i_mode &= ~mode;
+		mark_inode_dirty(inode);
+	}
+}
+
+/*
+ * Write to a file through the page cache.
+ *
+ * We currently put everything into the page cache prior to writing it.
+ * This is not a problem when writing full pages. With partial pages,
+ * however, we first have to read the data into the cache, then
+ * dirty the page, and finally schedule it for writing.
Alternatively, we + * could write-through just the portion of data that would go into that + * page, but that would kill performance for applications that write data + * line by line, and it's prone to race conditions. + * + * Note that this routine doesn't try to keep track of dirty pages. Each + * file system has to do this all by itself, unfortunately. + * okir@monad.swb.de + */ +ssize_t +generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + loff_t pos; + struct page *page, *cached_page; + unsigned long written; + long status = 0; + int err; + unsigned bytes; + + if ((ssize_t) count < 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + cached_page = NULL; + + down(&inode->i_sem); + + pos = *ppos; + err = -EINVAL; + if (pos < 0) + goto out; + + err = file->f_error; + if (err) { + file->f_error = 0; + goto out; + } + + written = 0; + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) + pos = inode->i_size; + + /* + * Check whether we've reached the file size limit. + */ + err = -EFBIG; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = limit - (u32)pos; + } + } + + /* + * LFS rule + */ + if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { + if (pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (count > MAX_NON_LFS - (u32)pos) { + /* send_sig(SIGXFSZ, current, 0); */ + count = MAX_NON_LFS - (u32)pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write + * If we have exceeded without writing data we send + * a signal and give them an EFBIG. + * + * Linus frestrict idea will clean these up nicely.. + */ + + if (!S_ISBLK(inode->i_mode)) { + if (pos >= inode->i_sb->s_maxbytes) + { + if (count || pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + err = -EFBIG; + goto out; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (pos + count > inode->i_sb->s_maxbytes) + count = inode->i_sb->s_maxbytes - pos; + } else { + if (is_read_only(inode->i_rdev)) { + err = -EPERM; + goto out; + } + if (pos >= inode->i_size) { + if (count || pos > inode->i_size) { + err = -ENOSPC; + goto out; + } + } + + if (pos + count > inode->i_size) + count = inode->i_size - pos; + } + + err = 0; + if (count == 0) + goto out; + + remove_suid(inode); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + + if (file->f_flags & O_DIRECT) + goto o_direct; + + do { + unsigned long index, offset; + long page_fault; + char *kaddr; + int deactivate = 1; + + /* + * Try to find the page in the cache. If it isn't there, + * allocate a free page. + */ + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) { + bytes = count; + deactivate = 0; + } + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. 
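+		 *
+		 * The classic way to trip this (a sketch; src, fd, off and n
+		 * are illustrative) is writing from an mmap of the same file
+		 * back into the page being written:
+		 *
+		 *	char *src = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+		 *	write(fd, src + off, n);
+		 *
+		 * If the source page is not yet uptodate the copy faults while
+		 * we hold the destination page locked, so both ends of the
+		 * user buffer are touched first to take that fault here,
+		 * outside the critical section.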
+ */ + { volatile unsigned char dummy; + __get_user(dummy, buf); + __get_user(dummy, buf+bytes-1); + } + + status = -ENOMEM; /* we'll assign it later anyway */ + page = __grab_cache_page(mapping, index, &cached_page); + if (!page) + break; + + /* We have exclusive IO access to the page.. */ + if (!PageLocked(page)) { + PAGE_BUG(page); + } + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); + if (status) + goto unlock; + page_fault = __copy_from_user(kaddr+offset, buf, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); + if (page_fault) + goto fail_write; + if (!status) + status = bytes; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } +unlock: + kunmap(page); + /* Mark it unlocked again and drop the page.. */ + UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + touch_page(page); + page_cache_release(page); + + if (status < 0) + break; + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* For now, when the user asks for O_SYNC, we'll actually + * provide O_DSYNC. */ + if (status >= 0) { + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); + } + +out_status: + err = written ? written : status; +out: + + up(&inode->i_sem); + return err; +fail_write: + status = -EFAULT; + goto unlock; + +o_direct: + written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); + if (written > 0) { + loff_t end = pos + written; + if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { + inode->i_size = end; + mark_inode_dirty(inode); + } + *ppos = end; + invalidate_inode_pages2(mapping); + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. 
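+	 *
+	 * (The buffered path above asks for OSYNC_METADATA | OSYNC_DATA at
+	 * the equivalent point; after direct IO the data pass would have
+	 * nothing to write back, so only OSYNC_METADATA is requested below.)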
+ */ + if (written >= 0 && file->f_flags & O_SYNC) + status = generic_osync_inode(inode, OSYNC_METADATA); + goto out_status; +} + +void __init page_cache_init(unsigned long mempages) +{ + unsigned long htable_size, order; + + htable_size = mempages; + htable_size *= sizeof(struct page *); + for(order = 0; (PAGE_SIZE << order) < htable_size; order++) + ; + + do { + unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); + + page_hash_bits = 0; + while((tmp >>= 1UL) != 0UL) + page_hash_bits++; + + page_hash_table = (struct page **) + __get_free_pages(GFP_ATOMIC, order); + } while(page_hash_table == NULL && --order > 0); + + printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", + (1 << page_hash_bits), order, (PAGE_SIZE << order)); + if (!page_hash_table) + panic("Failed to allocate page hash table\n"); + memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); +} diff -urN linux-2.4.17-rc1-virgin/mm/highmem.c linux-2.4.17-rc1-wli3/mm/highmem.c --- linux-2.4.17-rc1-virgin/mm/highmem.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/highmem.c Mon Oct 22 15:01:57 2001 @@ -32,7 +32,7 @@ */ static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; -static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static spinlock_t kmap_lock = SPIN_LOCK_UNLOCKED; pte_t * pkmap_page_table; diff -urN linux-2.4.17-rc1-virgin/mm/memory.c linux-2.4.17-rc1-wli3/mm/memory.c --- linux-2.4.17-rc1-virgin/mm/memory.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/memory.c Sun Dec 16 17:58:10 2001 @@ -46,6 +46,7 @@ #include #include +#include #include #include @@ -101,6 +102,7 @@ } pte = pte_offset(dir, 0); pmd_clear(dir); + pgtable_remove_rmap(pte); pte_free(pte); } @@ -235,9 +237,11 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || @@ -258,6 +262,7 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; @@ -313,8 +318,10 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte. */ tlb_remove_page(tlb, ptep, address + offset); } else { @@ -355,7 +362,8 @@ /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +void do_zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -397,16 +405,17 @@ spin_unlock(&mm->page_table_lock); } + /* * Do a quick page-table lookup for a single page. 
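+ * (For a write lookup this succeeds only if the pte is already present,
+ * writable and dirty; callers such as map_user_kiobuf() fall back to
+ * handle_mm_fault() and retry when NULL comes back.)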
*/ -static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) +static struct page * follow_page(unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; - pgd = pgd_offset(mm, address); + pgd = pgd_offset(current->mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; @@ -442,74 +451,21 @@ return page; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) -{ - int i = 0; - - do { - struct vm_area_struct * vma; - - vma = find_extend_vma(mm, start); - - if ( !vma || - (!force && - ((write && (!(vma->vm_flags & VM_WRITE))) || - (!write && (!(vma->vm_flags & VM_READ))) ) )) { - if (i) return i; - return -EFAULT; - } - - spin_lock(&mm->page_table_lock); - do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { - spin_unlock(&mm->page_table_lock); - switch (handle_mm_fault(mm, vma, start, write)) { - case 1: - tsk->min_flt++; - break; - case 2: - tsk->maj_flt++; - break; - case 0: - if (i) return i; - return -EFAULT; - default: - if (i) return i; - return -ENOMEM; - } - spin_lock(&mm->page_table_lock); - } - if (pages) { - pages[i] = get_page_map(map); - /* FIXME: call the correct function, - * depending on the type of the found page - */ - if (pages[i]) - page_cache_get(pages[i]); - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - } while(len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); - } while(len); - return i; -} - /* * Force in an entire range of pages from the current process's user VA, * and pin them in physical memory. */ -#define dprintk(x...) +#define dprintk(x...) int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) { - int pgcount, err; + unsigned long ptr, end; + int err; struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); /* Make sure the iobuf is not already mapped somewhere. 
*/ if (iobuf->nr_pages) @@ -518,37 +474,79 @@ mm = current->mm; dprintk ("map_user_kiobuf: begin\n"); - pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE; - /* mapping 0 bytes is not permitted */ - if (!pgcount) BUG(); - err = expand_kiobuf(iobuf, pgcount); + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); if (err) return err; + down_read(&mm->mmap_sem); + + err = -EFAULT; iobuf->locked = 0; - iobuf->offset = va & (PAGE_SIZE-1); + iobuf->offset = va & ~PAGE_MASK; iobuf->length = len; - /* Try to fault in all of the necessary pages */ - down_read(&mm->mmap_sem); - /* rw==READ means read from disk, write into memory area */ - err = get_user_pages(current, mm, va, pgcount, - (rw==READ), 0, iobuf->maplist, NULL); - up_read(&mm->mmap_sem); - if (err < 0) { - unmap_kiobuf(iobuf); - dprintk ("map_user_kiobuf: end %d\n", err); - return err; - } - iobuf->nr_pages = err; - while (pgcount--) { - /* FIXME: flush superflous for rw==READ, - * probably wrong function for rw==WRITE - */ - flush_dcache_page(iobuf->maplist[pgcount]); + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; } + + up_read(&mm->mmap_sem); dprintk ("map_user_kiobuf: end OK\n"); return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; } /* @@ -598,9 +596,6 @@ if (map) { if (iobuf->locked) UnlockPage(map); - /* FIXME: cache flush missing for rw==READ - * FIXME: call the correct reference counting function - */ page_cache_release(map); } } @@ -609,6 +604,20 @@ iobuf->locked = 0; } +void zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + + if (actions & ZPR_PARTITION && chunk > ZPR_MAX_BYTES) + chunk = ZPR_MAX_BYTES; + do_zap_page_range(mm, address, chunk); + + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. 
@@ -718,11 +727,15 @@ return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + debug_lock_break(1); + break_spin_lock(&mm->page_table_lock); + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -750,7 +763,7 @@ pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -953,7 +966,9 @@ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); lru_cache_add(new_page); /* Free the old page.. */ @@ -984,7 +999,7 @@ /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); continue; } @@ -997,7 +1012,7 @@ /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -1035,10 +1050,16 @@ do_expand: limit = current->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } inode->i_size = offset; out_truncate: @@ -1047,11 +1068,8 @@ inode->i_op->truncate(inode); unlock_kernel(); } - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); out: - return -EFBIG; + return 0; } /* @@ -1114,8 +1132,6 @@ ret = 2; } - mark_page_accessed(page); - lock_page(page); /* @@ -1145,6 +1161,7 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + page_add_rmap(page, page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1160,14 +1177,13 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. 
*/ spin_unlock(&mm->page_table_lock); @@ -1186,10 +1202,10 @@ flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); @@ -1234,11 +1250,9 @@ */ if (write_access && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) return -1; - } - copy_user_highpage(page, new_page, address); + copy_highpage(page, new_page); page_cache_release(new_page); lru_cache_add(page); new_page = page; @@ -1264,6 +1278,7 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + page_add_rmap(new_page, page_table); } else { /* One of our sibling threads was faster, back out. */ page_cache_release(new_page); @@ -1421,25 +1436,30 @@ goto out; } } + pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } out: return pte_offset(pmd, address); } +/* + * Simplistic page force-in.. + */ int make_pages_present(unsigned long addr, unsigned long end) { - int ret, len, write; + int write; + struct mm_struct *mm = current->mm; struct vm_area_struct * vma; - vma = find_vma(current->mm, addr); + vma = find_vma(mm, addr); write = (vma->vm_flags & VM_WRITE) != 0; if (addr >= end) BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - return ret == len ? 0 : -1; + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; } diff -urN linux-2.4.17-rc1-virgin/mm/memory.c~ linux-2.4.17-rc1-wli3/mm/memory.c~ --- linux-2.4.17-rc1-virgin/mm/memory.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/memory.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,1446 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. 
+ * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +unsigned long max_mapnr; +unsigned long num_physpages; +void * high_memory; +struct page *highmem_start_page; + +/* + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). + */ +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +{ + if (from == ZERO_PAGE(address)) { + clear_user_highpage(to, address); + return; + } + copy_user_highpage(to, from, address); +} + +mem_map_t * mem_map; + +/* + * Called by TLB shootdown + */ +void __free_pte(pte_t pte) +{ + struct page *page = pte_page(pte); + if ((!VALID_PAGE(page)) || PageReserved(page)) + return; + if (pte_dirty(pte)) + set_page_dirty(page); + free_page_and_swap_cache(page); +} + + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void free_one_pmd(pmd_t * dir) +{ + pte_t * pte; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, 0); + pmd_clear(dir); + pgtable_remove_rmap(pte); + pte_free(pte); +} + +static inline void free_one_pgd(pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd+j+(PREFETCH_STRIDE/16)); + free_one_pmd(pmd+j); + } + pmd_free(pmd); +} + +/* Low and high watermarks for page table cache. + The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] + */ +int pgt_cache_water[2] = { 25, 50 }; + +/* Returns the number of pages freed */ +int check_pgt_cache(void) +{ + return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); +} + + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. + */ +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) +{ + pgd_t * page_dir = mm->pgd; + + spin_lock(&mm->page_table_lock); + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); + spin_unlock(&mm->page_table_lock); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +} + +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc(). 
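+ *
+ * For example, after fork() a private writable page is left
+ * write-protected in both parent and child (the cow case below), so
+ * whichever process writes first faults and gets its own copy via
+ * do_wp_page(); a VM_SHARED page is simply mapped into the child with
+ * the pte marked clean.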
+ */ +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_alloc(dst, dst_pmd, address); + if (!dst_pte) + goto nomem; + + spin_lock(&src->page_table_lock); + do { + pte_t pte = *src_pte; + struct page *ptepage; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; + } + ptepage = pte_page(pte); + if ((!VALID_PAGE(ptepage)) || + PageReserved(ptepage)) + goto cont_copy_pte_range; + + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow) { + ptep_set_wrprotect(src_pte); + pte = *src_pte; + } + + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(ptepage); + dst->rss++; + +cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); +cont_copy_pte_range_noset: address += PAGE_SIZE; + if (address >= end) + goto out_unlock; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out_unlock: + spin_unlock(&src->page_table_lock); +out: + return 0; +nomem: + return -ENOMEM; +} + +/* + * Return indicates whether a page was freed so caller can adjust rss + */ +static inline void forget_pte(pte_t page) +{ + if (!pte_none(page)) { + printk("forget_pte: old mapping existed!\n"); + BUG(); + } +} + +static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t * ptep; + int freed = 0; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + ptep = pte_offset(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page)) { + freed ++; + page_remove_rmap(page, ptep); + } + /* This will eventually call __free_pte on the pte. 
*/ + tlb_remove_page(tlb, ptep, address + offset); + } else { + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); + } + } + + return freed; +} + +static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + int freed; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return 0; + } + pmd = pmd_offset(dir, address); + end = address + size; + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + freed = 0; + do { + freed += zap_pte_range(tlb, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return freed; +} + +/* + * remove user pages in a given range. + */ +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +{ + mmu_gather_t *tlb; + pgd_t * dir; + unsigned long start = address, end = address + size; + int freed = 0; + + dir = pgd_offset(mm, address); + + /* + * This is a long-lived spinlock. That's fine. + * There's no contention, because the page table + * lock only protects against kswapd anyway, and + * even if kswapd happened to be looking at this + * process we _want_ it to get stuck. + */ + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, address, end); + tlb = tlb_gather_mmu(mm); + + do { + freed += zap_pmd_range(tlb, dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* + * Update rss for the mm_struct (not necessarily current->mm) + * Notice that rss is an unsigned long. + */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + spin_unlock(&mm->page_table_lock); +} + + +/* + * Do a quick page-table lookup for a single page. + */ +static struct page * follow_page(unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + + pgd = pgd_offset(current->mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + + ptep = pte_offset(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + if (pte_present(pte)) { + if (!write || + (pte_write(pte) && pte_dirty(pte))) + return pte_page(pte); + } + +out: + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages in kiobufs. + */ + +static inline struct page * get_page_map(struct page *page) +{ + if (!VALID_PAGE(page)) + return 0; + return page; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ + +#define dprintk(x...) +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); + + /* Make sure the iobuf is not already mapped somewhere. 
*/ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + down_read(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = 0; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; +} + +/* + * Mark all of the pages in a kiobuf as dirty + * + * We need to be able to deal with short reads from disk: if an IO error + * occurs, the number of bytes read into memory may be less than the + * size of the kiobuf, so we have to stop marking pages dirty once the + * requested byte count has been reached. + */ + +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; + + index = iobuf->offset >> PAGE_SHIFT; + offset = iobuf->offset & ~PAGE_MASK; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + + if (!PageReserved(page)) + SetPageDirty(page); + + remaining -= (PAGE_SIZE - offset); + offset = 0; + index++; + } +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + if (map) { + if (iobuf->locked) + UnlockPage(map); + page_cache_release(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + + +/* + * Lock down all of the pages of a kiovec for IO. + * + * If any page is mapped twice in the kiovec, we return the error -EINVAL. + * + * The optional wait parameter causes the lock call to block until all + * pages can be locked if set. If wait==0, the lock operation is + * aborted if any locked pages are found and -EAGAIN is returned. 
+ */ + +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + int doublepage = 0; + int repeat = 0; + + repeat: + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (iobuf->locked) + continue; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + + if (TryLockPage(page)) { + while (j--) { + struct page *tmp = *--ppage; + if (tmp) + UnlockPage(tmp); + } + goto retry; + } + } + iobuf->locked = 1; + } + + return 0; + + retry: + + /* + * We couldn't lock one of the pages. Undo the locking so far, + * wait on the page we got to, and try again. + */ + + unlock_kiovec(nr, iovec); + if (!wait) + return -EAGAIN; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(page)) { + /* + * If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ just be a coincidence, + * but if it happens more than once, chances + * are we have a double-mapped page. + */ + if (++doublepage >= 3) + return -EINVAL; + + /* Try again... */ + wait_on_page(page); + } + + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + +/* + * Unlock all of the pages of a kiovec after IO. + */ + +int unlock_kiovec(int nr, struct kiobuf *iovec[]) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (!iobuf->locked) + continue; + iobuf->locked = 0; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + UnlockPage(page); + } + } + return 0; +} + +static inline void zeromap_pte_range(pte_t * pte, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t oldpage = ptep_get_and_clear(pte); + set_pte(pte, zero_pte); + forget_pte(oldpage); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, address, end - address, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = address; + unsigned long end = address + size; + struct mm_struct *mm = current->mm; + + dir = pgd_offset(mm, address); + flush_cache_range(mm, beg, end); + if (address >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. 
any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + struct page *page; + pte_t oldpage; + oldpage = ptep_get_and_clear(pte); + + page = virt_to_page(__va(phys_addr)); + if ((!VALID_PAGE(page)) || PageReserved(page)) + set_pte(pte, mk_pte_phys(phys_addr, prot)); + forget_pte(oldpage); + address += PAGE_SIZE; + phys_addr += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, address, end - address, address + phys_addr, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. */ +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = from; + unsigned long end = from + size; + struct mm_struct *mm = current->mm; + + phys_addr -= from; + dir = pgd_offset(mm, from); + flush_cache_range(mm, beg, end); + if (from >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (from && (from < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * Establish a new mapping: + * - flush the old one + * - update the page tables + * - inform the TLB about the new one + * + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +{ + set_pte(page_table, entry); + flush_tlb_page(vma, address); + update_mmu_cache(vma, address, entry); +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + flush_page_to_ram(new_page); + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. 
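+ *
+ * With the reverse-mapping VM in this tree, tearing down or
+ * installing a pte here must also be reflected in the owning page's
+ * pte_chain, so the copy path below pairs each pte update with the
+ * rmap bookkeeping:
+ *
+ *	page_remove_rmap(old_page, page_table);
+ *	break_cow(vma, new_page, address, page_table);
+ *	page_add_rmap(new_page, page_table);
+ *
+ * where break_cow() installs the writable, dirty pte for the copy.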
+ * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pte_t pte) +{ + struct page *old_page, *new_page; + + old_page = pte_page(pte); + if (!VALID_PAGE(old_page)) + goto bad_wp_page; + + if (!TryLockPage(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + } + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + copy_cow_page(old_page,new_page,address); + + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + if (pte_same(*page_table, pte)) { + if (PageReserved(old_page)) + ++mm->rss; + page_remove_rmap(old_page, page_table); + break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); + lru_cache_add(new_page); + + /* Free the old page.. */ + new_page = old_page; + } + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + page_cache_release(old_page); + return 1; /* Minor fault */ + +bad_wp_page: + spin_unlock(&mm->page_table_lock); + printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); + return -1; +no_mem: + page_cache_release(old_page); + return -1; +} + +static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) +{ + do { + struct mm_struct *mm = mpnt->vm_mm; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long len = end - start; + unsigned long diff; + + /* mapping wholly truncated? */ + if (mpnt->vm_pgoff >= pgoff) { + zap_page_range(mm, start, len); + continue; + } + + /* mapping wholly unaffected? */ + len = len >> PAGE_SHIFT; + diff = pgoff - mpnt->vm_pgoff; + if (diff >= len) + continue; + + /* Ok, partially affected.. */ + start += diff << PAGE_SHIFT; + len = (len - diff) << PAGE_SHIFT; + zap_page_range(mm, start, len); + } while ((mpnt = mpnt->vm_next_share) != NULL); +} + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. 
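+ *
+ * A small worked example, assuming PAGE_CACHE_SIZE == 4096:
+ * truncating to offset 10240 (two and a half pages) gives
+ *
+ *	pgoff = (10240 + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT = 3
+ *
+ * so vmtruncate_list() above zaps file pages 3 and upward from every
+ * mapping, while page 2, which holds the new partial end of file,
+ * stays mapped and is left to truncate_inode_pages() to clean up.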
+ */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + unsigned long pgoff; + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + inode->i_size = offset; + spin_lock(&mapping->i_shared_lock); + if (!mapping->i_mmap && !mapping->i_mmap_shared) + goto out_unlock; + + pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (mapping->i_mmap != NULL) + vmtruncate_list(mapping->i_mmap, pgoff); + if (mapping->i_mmap_shared != NULL) + vmtruncate_list(mapping->i_mmap_shared, pgoff); + +out_unlock: + spin_unlock(&mapping->i_shared_lock); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } + inode->i_size = offset; + +out_truncate: + if (inode->i_op && inode->i_op->truncate) { + lock_kernel(); + inode->i_op->truncate(inode); + unlock_kernel(); + } +out: + return 0; +} + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(swp_entry_t entry) +{ + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. + */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); + } + return; +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. + */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t * page_table, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = 1; + + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + int retval; + spin_lock(&mm->page_table_lock); + retval = pte_same(*page_table, orig_pte) ? -1 : 1; + spin_unlock(&mm->page_table_lock); + return retval; + } + + /* Had to read the page from swap area: Major fault */ + ret = 2; + } + + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + if (!pte_same(*page_table, orig_pte)) { + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + return 1; + } + + /* The page isn't present yet, go ahead with the fault. 
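+ *
+ * (The back-out tests above follow the shape every fault handler in
+ * this file uses once page_table_lock has to be dropped for an
+ * operation that may sleep; schematically, with back_out standing in
+ * for the handler's own cleanup path:
+ *
+ *	spin_unlock(&mm->page_table_lock);
+ *	... allocate memory or wait for I/O ...
+ *	spin_lock(&mm->page_table_lock);
+ *	if (!pte_same(*page_table, orig_pte))
+ *		goto back_out;		someone else serviced the fault
+ *
+ * do_no_page() and do_anonymous_page() use pte_none() for the same
+ * recheck because their pte started out empty.)
+ *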
*/ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + mm->rss++; + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) + pte = pte_mkdirty(pte_mkwrite(pte)); + unlock_page(page); + + flush_page_to_ram(page); + flush_icache_page(vma, page); + set_pte(page_table, pte); + page_add_rmap(page, page_table); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry; + struct page * page = ZERO_PAGE(addr); + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + /* Allocate our own private page. */ + spin_unlock(&mm->page_table_lock); + + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto no_mem; + clear_user_highpage(page, addr); + + spin_lock(&mm->page_table_lock); + if (!pte_none(*page_table)) { + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + return 1; + } + mm->rss++; + flush_page_to_ram(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + lru_cache_add(page); + } + + set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + +no_mem: + return -1; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. + */ +static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) +{ + struct page * new_page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, write_access, address); + spin_unlock(&mm->page_table_lock); + + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + + if (new_page == NULL) /* no page was available -- SIGBUS */ + return 0; + if (new_page == NOPAGE_OOM) + return -1; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page * page = alloc_page(GFP_HIGHUSER); + if (!page) + return -1; + copy_highpage(page, new_page); + page_cache_release(new_page); + lru_cache_add(page); + new_page = page; + } + + spin_lock(&mm->page_table_lock); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... 
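+ *
+ * (pte_none() is the right recheck here: the pte was empty when the
+ * fault started and kswapd only ever removes entries, so any change
+ * while the lock was dropped means a sibling thread, which also
+ * holds the mm semaphore for reading, resolved the fault first and
+ * we can simply back out.)
+ *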
*/ + if (pte_none(*page_table)) { + ++mm->rss; + flush_page_to_ram(new_page); + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = pte_mkwrite(pte_mkdirty(entry)); + set_pte(page_table, entry); + page_add_rmap(new_page, page_table); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + return 1; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + spin_unlock(&mm->page_table_lock); + return 2; /* Major fault */ +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t * pte) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. + */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte); + return do_swap_page(mm, vma, address, pte, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + establish_pte(vma, address, pte, entry); + spin_unlock(&mm->page_table_lock); + return 1; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + + current->state = TASK_RUNNING; + pgd = pgd_offset(mm, address); + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); + + if (pmd) { + pte_t * pte = pte_alloc(mm, pmd, address); + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte); + } + spin_unlock(&mm->page_table_lock); + return -1; +} + +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level page table, this ends up actually being entirely + * optimized away. + */ +pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + /* "fast" allocation can happen without dropping the lock.. 
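+ *
+ * ("fast" here means an allocation that cannot sleep, e.g. a
+ * quicklist of previously freed page tables where the architecture
+ * provides one; if it fails, the lock is dropped for the sleeping
+ * allocator and the pgd entry is re-checked afterwards, since a
+ * sibling thread may have populated it in the meantime.)
+ *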
*/ + new = pmd_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pgd_none(*pgd)) { + pmd_free(new); + goto out; + } + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset(pgd, address); +} + +/* + * Allocate the page table directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (pmd_none(*pmd)) { + pte_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pte_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pmd_none(*pmd)) { + pte_free(new); + goto out; + } + } + pgtable_add_rmap(new, mm, address); + pmd_populate(mm, pmd, new); + } +out: + return pte_offset(pmd, address); +} + +/* + * Simplistic page force-in.. + */ +int make_pages_present(unsigned long addr, unsigned long end) +{ + int write; + struct mm_struct *mm = current->mm; + struct vm_area_struct * vma; + + vma = find_vma(mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; +} diff -urN linux-2.4.17-rc1-virgin/mm/mmap.c linux-2.4.17-rc1-wli3/mm/mmap.c --- linux-2.4.17-rc1-virgin/mm/mmap.c Sun Nov 4 10:17:20 2001 +++ linux-2.4.17-rc1-wli3/mm/mmap.c Sun Dec 16 17:58:10 2001 @@ -45,6 +45,7 @@ }; int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; /* Check that a process has enough memory to allocate a * new virtual mapping. @@ -413,7 +414,7 @@ return -EINVAL; /* Too many mappings? */ - if (mm->map_count > MAX_MAP_COUNT) + if (mm->map_count > max_map_count) return -ENOMEM; /* Obtain the address to map to. we verify (or select) it and ensure @@ -569,7 +570,7 @@ fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, ZPR_NORMAL); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -919,7 +920,7 @@ /* If we'll make "hole", check the vm areas limit */ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) - && mm->map_count >= MAX_MAP_COUNT) + && mm->map_count >= max_map_count) return -ENOMEM; /* @@ -967,7 +968,7 @@ remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_PARTITION); /* * Fix the mapping, and free the old area if it wasn't reused. 
@@ -1040,7 +1041,7 @@ > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; - if (mm->map_count > MAX_MAP_COUNT) + if (mm->map_count > max_map_count) return -ENOMEM; if (!vm_enough_memory(len >> PAGE_SHIFT)) @@ -1127,7 +1128,7 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_PARTITION); if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); diff -urN linux-2.4.17-rc1-virgin/mm/mmap.c~ linux-2.4.17-rc1-wli3/mm/mmap.c~ --- linux-2.4.17-rc1-virgin/mm/mmap.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/mmap.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,1173 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * WARNING: the debugging will use recursive algorithms so never enable this + * unless you know what you are doing. + */ +#undef DEBUG_MM_RB + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; + +/* Check that a process has enough memory to allocate a + * new virtual mapping. + */ +int vm_enough_memory(long pages) +{ + /* Stupid algorithm to decide if we have enough memory: while + * simple, it hopefully works in most obvious cases.. Easy to + * fool it, but this should catch most mistakes. + */ + /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * which tries to do "TheRightThing". Instead of using half of + * (buffers+cache), use the minimum values. Allow an extra 2% + * of num_physpages for safety margin. + */ + + unsigned long free; + + /* Sometimes we want to use more memory than we have. */ + if (sysctl_overcommit_memory) + return 1; + + /* The page cache contains buffer pages these days.. */ + free = atomic_read(&page_cache_size); + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* + * The code below doesn't account for free space in the inode + * and dentry slab cache, slab cache fragmentation, inodes and + * dentries which will become freeable under VM load, etc. + * Lets just hope all these (complex) factors balance out... + */ + free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + + return free > pages; +} + +/* Remove one vm structure from the inode's i_mapping address space. 
*/ +static inline void __remove_shared_vm_struct(struct vm_area_struct *vma) +{ + struct file * file = vma->vm_file; + + if (file) { + struct inode *inode = file->f_dentry->d_inode; + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&inode->i_writecount); + if(vma->vm_next_share) + vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; + *vma->vm_pprev_share = vma->vm_next_share; + } +} + +static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +{ + lock_vma_mappings(vma); + __remove_shared_vm_struct(vma); + unlock_vma_mappings(vma); +} + +void lock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_lock(&mapping->i_shared_lock); +} + +void unlock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_unlock(&mapping->i_shared_lock); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against rlimit.. */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Check if we have enough memory.. */ + if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used + * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits + * into "VM_xxx". 
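+ *
+ * A concrete example: mmap() with PROT_READ|PROT_WRITE and
+ * MAP_PRIVATE translates as
+ *
+ *	calc_vm_flags(PROT_READ|PROT_WRITE, MAP_PRIVATE)
+ *		== VM_READ | VM_WRITE
+ *
+ * (MAP_PRIVATE itself contributes no bits); do_mmap_pgoff() then ORs
+ * in VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC and mm->def_flags.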
+ */ +static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags) +{ +#define _trans(x,bit1,bit2) \ +((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0) + + unsigned long prot_bits, flag_bits; + prot_bits = + _trans(prot, PROT_READ, VM_READ) | + _trans(prot, PROT_WRITE, VM_WRITE) | + _trans(prot, PROT_EXEC, VM_EXEC); + flag_bits = + _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) | + _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) | + _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE); + return prot_bits | flag_bits; +#undef _trans +} + +#ifdef DEBUG_MM_RB +static int browse_rb(rb_node_t * rb_node) { + int i = 0; + if (rb_node) { + i++; + i += browse_rb(rb_node->rb_left); + i += browse_rb(rb_node->rb_right); + } + return i; +} + +static void validate_mm(struct mm_struct * mm) { + int bug = 0; + int i = 0; + struct vm_area_struct * tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(mm->mm_rb.rb_node); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (bug) + BUG(); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct ** pprev, + rb_node_t *** rb_link, rb_node_t ** rb_parent) +{ + struct vm_area_struct * vma; + rb_node_t ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + return vma; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t * rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static inline void __vma_link_file(struct vm_area_struct * vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct inode * inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct vm_area_struct **head; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + head = &mapping->i_mmap; + if (vma->vm_flags & VM_SHARED) + head = &mapping->i_mmap_shared; + + /* insert vma into inode's share list */ + if((vma->vm_next_share = *head) != NULL) + (*head)->vm_pprev_share = &vma->vm_next_share; + *head = vma; + vma->vm_pprev_share = head; + } +} + +static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + 
__vma_link_file(vma); +} + +static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + __vma_link(mm, vma, prev, rb_link, rb_parent); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + + mm->map_count++; + validate_mm(mm); +} + +static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev, + rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags) +{ + spinlock_t * lock = &mm->page_table_lock; + if (!prev) { + prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + goto merge_next; + } + if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) { + struct vm_area_struct * next; + + spin_lock(lock); + prev->vm_end = end; + next = prev->vm_next; + if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) { + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + return 1; + } + spin_unlock(lock); + return 1; + } + + prev = prev->vm_next; + if (prev) { + merge_next: + if (!can_vma_merge(prev, vm_flags)) + return 0; + if (end == prev->vm_start) { + spin_lock(lock); + prev->vm_start = addr; + spin_unlock(lock); + return 1; + } + } + + return 0; +} + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned int vm_flags; + int correct_wcount = 0; + int error; + rb_node_t ** rb_link, * rb_parent; + + if (file && (!file->f_op || !file->f_op->mmap)) + return -ENODEV; + + if ((len = PAGE_ALIGN(len)) == 0) + return addr; + + if (len > TASK_SIZE) + return -EINVAL; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EINVAL; + + /* Too many mappings? */ + if (mm->map_count > max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* Make sure we don't allow writing to an append-only file.. */ + if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* make sure there are no mandatory locks on the file. 
*/ + if (locks_verify_locked(file->f_dentry->d_inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + break; + + default: + return -EINVAL; + } + } else { + vm_flags |= VM_SHARED | VM_MAYSHARE; + switch (flags & MAP_TYPE) { + default: + return -EINVAL; + case MAP_PRIVATE: + vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + /* fall through */ + case MAP_SHARED: + break; + } + } + + /* Clear old maps */ + error = -ENOMEM; +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + /* Private writable mapping? Check memory availability.. */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + /* Can we just expand an old anonymous mapping? */ + if (!file && !(vm_flags & VM_SHARED) && rb_parent) + if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags)) + goto out; + + /* Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = pgoff; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + vma->vm_raend = 0; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + } else if (flags & MAP_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + + vma_link(mm, vma, prev, rb_link, rb_parent); + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (vm_flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return error; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. 
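+ *
+ * Concretely, on a 32-bit machine -ENOMEM is (unsigned long)-12,
+ * i.e. 0xfffffff4, whose low PAGE_SHIFT bits are non-zero, so the
+ *
+ *	if (addr & ~PAGE_MASK)
+ *		return addr;
+ *
+ * test in do_mmap_pgoff() catches it. The scheme works because errno
+ * values are smaller than PAGE_SIZE and a successful return is
+ * always page aligned.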
+ */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct vm_area_struct *vma; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(current->mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); + + for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) + return -ENOMEM; + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = vma->vm_end; + } +} +#else +extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#endif + +unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + if (flags & MAP_FIXED) { + if (addr > TASK_SIZE - len) + return -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + return addr; + } + + if (file && file->f_op && file->f_op->get_unmapped_area) + return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); + + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + rb_node_t * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + if (mm) { + /* Go through the RB tree quickly. */ + struct vm_area_struct * vma; + rb_node_t * rb_node, * rb_last_right, * rb_prev; + + rb_node = mm->mm_rb.rb_node; + rb_last_right = rb_prev = NULL; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + rb_prev = rb_last_right; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else { + rb_last_right = rb_node; + rb_node = rb_node->rb_right; + } + } + if (vma) { + if (vma->vm_rb.rb_left) { + rb_prev = vma->vm_rb.rb_left; + while (rb_prev->rb_right) + rb_prev = rb_prev->rb_right; + } + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + if ((rb_prev ? 
(*pprev)->vm_next : mm->mmap) != vma) + BUG(); + return vma; + } + } + *pprev = NULL; + return NULL; +} + +struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. If possible, we reuse the existing area rather than + * allocate a new one, and the return indicates whether the old + * area was reused. + */ +static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, + struct vm_area_struct *area, unsigned long addr, size_t len, + struct vm_area_struct *extra) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + + /* Unmapping the whole area. */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_file) + fput(area->vm_file); + kmem_cache_free(vm_area_cachep, area); + return extra; + } + + /* Work out to one of the ends. */ + if (end == area->vm_end) { + /* + * here area isn't visible to the semaphore-less readers + * so we don't need to update it under the spinlock. + */ + area->vm_end = addr; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else if (addr == area->vm_start) { + area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; + /* same locking considerations of the above case */ + area->vm_start = end; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else { + /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ + /* Add end mapping -- leave beginning for below */ + mpnt = extra; + extra = NULL; + + mpnt->vm_mm = area->vm_mm; + mpnt->vm_start = end; + mpnt->vm_end = area->vm_end; + mpnt->vm_page_prot = area->vm_page_prot; + mpnt->vm_flags = area->vm_flags; + mpnt->vm_raend = 0; + mpnt->vm_ops = area->vm_ops; + mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT); + mpnt->vm_file = area->vm_file; + mpnt->vm_private_data = area->vm_private_data; + if (mpnt->vm_file) + get_file(mpnt->vm_file); + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + + /* Because mpnt->vm_file == area->vm_file this locks + * things correctly. 
+ */ + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + __insert_vm_struct(mm, mpnt); + } + + __insert_vm_struct(mm, area); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(area); + return extra; +} + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. + */ +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + unsigned long start_index, end_index; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + /* + * If the PGD bits are not consecutive in the virtual address, the + * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. + */ + start_index = pgd_index(first); + end_index = pgd_index(last); + if (end_index > start_index) { + clear_page_tables(mm, start_index, end_index - start_index); + flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); + } +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine + */ +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + mpnt = find_vma_prev(mm, addr, &prev); + if (!mpnt) + return 0; + /* we have addr < mpnt->vm_end */ + + if (mpnt->vm_start >= addr+len) + return 0; + + /* If we'll make "hole", check the vm areas limit */ + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) + && mm->map_count >= max_map_count) + return -ENOMEM; + + /* + * We may need one additional vma to fix up the mappings ... + * and this is the last chance for an easy error exit. + */ + extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!extra) + return -ENOMEM; + + npp = (prev ? &prev->vm_next : &mm->mmap); + free = NULL; + spin_lock(&mm->page_table_lock); + for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + rb_erase(&mpnt->vm_rb, &mm->mm_rb); + } + mm->mmap_cache = NULL; /* Kill the cache. */ + spin_unlock(&mm->page_table_lock); + + /* Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. 
+ * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + * In that case we have to be careful with VM_DENYWRITE. + */ + while ((mpnt = free) != NULL) { + unsigned long st, end, size; + struct file *file = NULL; + + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? mpnt->vm_end : end; + size = end - st; + + if (mpnt->vm_flags & VM_DENYWRITE && + (st != mpnt->vm_start || end != mpnt->vm_end) && + (file = mpnt->vm_file) != NULL) { + atomic_dec(&file->f_dentry->d_inode->i_writecount); + } + remove_shared_vm_struct(mpnt); + mm->map_count--; + + zap_page_range(mm, st, size); + + /* + * Fix the mapping, and free the old area if it wasn't reused. + */ + extra = unmap_fixup(mm, mpnt, st, size, extra); + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + } + validate_mm(mm); + + /* Release the extra vma struct if it wasn't used */ + if (extra) + kmem_cache_free(vm_area_cachep, extra); + + free_pgtables(mm, prev, addr, addr+len); + + return 0; +} + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + rb_node_t ** rb_link, * rb_parent; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > max_map_count) + return -ENOMEM; + + if (!vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = calc_vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE) | mm->def_flags; + + flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* Can we just expand an old anonymous mapping? */ + if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = 0; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + + vma_link(mm, vma, prev, rb_link, rb_parent); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +/* Build the RB tree corresponding to the VMA list. 
*/ +void build_mmap_rb(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + rb_node_t ** rb_link, * rb_parent; + + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + __vma_link_rb(mm, vma, rb_link, rb_parent); + rb_parent = &vma->vm_rb; + rb_link = &rb_parent->rb_right; + } +} + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct * mm) +{ + struct vm_area_struct * mpnt; + + release_segments(mm); + spin_lock(&mm->page_table_lock); + mpnt = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + mm->rss = 0; + spin_unlock(&mm->page_table_lock); + mm->total_vm = 0; + mm->locked_vm = 0; + + flush_cache_mm(mm); + while (mpnt) { + struct vm_area_struct * next = mpnt->vm_next; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long size = end - start; + + if (mpnt->vm_ops) { + if (mpnt->vm_ops->close) + mpnt->vm_ops->close(mpnt); + } + mm->map_count--; + remove_shared_vm_struct(mpnt); + zap_page_range(mm, start, size); + if (mpnt->vm_file) + fput(mpnt->vm_file); + kmem_cache_free(vm_area_cachep, mpnt); + mpnt = next; + } + flush_tlb_mm(mm); + + /* This is just debugging */ + if (mm->map_count) + BUG(); + + clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap ring. If vm_file is non-NULL + * then the i_shared_lock must be held here. + */ +void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; + validate_mm(mm); +} + +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + vma_link(mm, vma, prev, rb_link, rb_parent); + validate_mm(mm); +} diff -urN linux-2.4.17-rc1-virgin/mm/mremap.c linux-2.4.17-rc1-wli3/mm/mremap.c --- linux-2.4.17-rc1-virgin/mm/mremap.c Thu Sep 20 20:31:26 2001 +++ linux-2.4.17-rc1-wli3/mm/mremap.c Sun Dec 16 17:58:10 2001 @@ -61,8 +61,14 @@ { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if (!dst) { /* No dest? We must put it back. 
*/ @@ -70,6 +76,8 @@ error++; } set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); } return error; } @@ -118,7 +126,7 @@ flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, ZPR_NORMAL); return -1; } diff -urN linux-2.4.17-rc1-virgin/mm/mremap.c~ linux-2.4.17-rc1-wli3/mm/mremap.c~ --- linux-2.4.17-rc1-virgin/mm/mremap.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/mremap.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,360 @@ +/* + * linux/mm/remap.c + * + * (C) Copyright 1996 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +#include +#include + +extern int vm_enough_memory(long pages); + +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + goto end; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + goto end; + } + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto end; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto end; + } + + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) + pte = NULL; +end: + return pte; +} + +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pmd_t * pmd; + pte_t * pte = NULL; + + pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + if (pmd) + pte = pte_alloc(mm, pmd, addr); + return pte; +} + +static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +{ + int error = 0; + pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); + + if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); + pte = ptep_get_and_clear(src); + if (!dst) { + /* No dest? We must put it back. */ + dst = src; + error++; + } + set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); + } + return error; +} + +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +{ + int error = 0; + pte_t * src; + + spin_lock(&mm->page_table_lock); + src = get_one_pte(mm, old_addr); + if (src) + error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); + spin_unlock(&mm->page_table_lock); + return error; +} + +static int move_page_tables(struct mm_struct * mm, + unsigned long new_addr, unsigned long old_addr, unsigned long len) +{ + unsigned long offset = len; + + flush_cache_range(mm, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + while (offset) { + offset -= PAGE_SIZE; + if (move_one_page(mm, old_addr + offset, new_addr + offset)) + goto oops_we_failed; + } + flush_tlb_range(mm, old_addr, old_addr + len); + return 0; + + /* + * Ok, the move failed because we didn't have enough pages for + * the new page table tree. This is unlikely, but we have to + * take the possibility into account. 
In that case we just move + * all the pages back (this will work, because we still have + * the old page tables) + */ +oops_we_failed: + flush_cache_range(mm, new_addr, new_addr + len); + while ((offset += PAGE_SIZE) < len) + move_one_page(mm, new_addr + offset, old_addr + offset); + zap_page_range(mm, new_addr, len); + return -1; +} + +static inline unsigned long move_vma(struct vm_area_struct * vma, + unsigned long addr, unsigned long old_len, unsigned long new_len, + unsigned long new_addr) +{ + struct mm_struct * mm = vma->vm_mm; + struct vm_area_struct * new_vma, * next, * prev; + int allocated_vma; + + new_vma = NULL; + next = find_vma_prev(mm, new_addr, &prev); + if (next) { + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + if (next != prev->vm_next) + BUG(); + if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(&mm->page_table_lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + } + } else if (next->vm_start == new_addr + new_len && + can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + next->vm_start = new_addr; + spin_unlock(&mm->page_table_lock); + new_vma = next; + } + } else { + prev = find_vma(mm, new_addr-1); + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + } + } + + allocated_vma = 0; + if (!new_vma) { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new_vma) + goto out; + allocated_vma = 1; + } + + if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + if (allocated_vma) { + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_raend = 0; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + } + do_munmap(current->mm, addr, old_len); + current->mm->total_vm += new_len >> PAGE_SHIFT; + if (new_vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + make_pages_present(new_vma->vm_start, + new_vma->vm_end); + } + return new_addr; + } + if (allocated_vma) + kmem_cache_free(vm_area_cachep, new_vma); + out: + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. 
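+ *
+ * The order of preference below is: plain shrink (the tail is just
+ * unmapped), growing in place when the following vma leaves room,
+ * and only then move_vma(), which requires MREMAP_MAYMOVE. From
+ * userspace the common call is simply
+ *
+ *	new = mremap(old, old_len, new_len, MREMAP_MAYMOVE);
+ *
+ * with MREMAP_FIXED additionally letting the caller pick new_addr.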
+ */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + do_munmap(current->mm, new_addr, new_len); + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + */ + ret = addr; + if (old_len >= new_len) { + do_munmap(current->mm, addr+new_len, old_len - new_len); + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + goto out; + } + ret = -ENOMEM; + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->rlim[RLIMIT_AS].rlim_cur) + goto out; + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) + goto out; + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = addr + new_len; + spin_unlock(&vma->vm_mm->page_table_lock); + current->mm->total_vm += pages; + if (vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. 
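+ * move_vma() relocates the mapping one pte at a time through
+ * move_page_tables(); with the reverse-mapping VM, copy_one_pte()
+ * also moves each page's pte_chain entry (page_remove_rmap() on the
+ * old pte, page_add_rmap() on the new one) under mm->page_table_lock.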
+ */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + if (!(flags & MREMAP_FIXED)) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_SHARED) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; + } + ret = move_vma(vma, addr, old_len, new_len, new_addr); + } +out: + return ret; +} + +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} diff -urN linux-2.4.17-rc1-virgin/mm/page_alloc.c linux-2.4.17-rc1-wli3/mm/page_alloc.c --- linux-2.4.17-rc1-virgin/mm/page_alloc.c Mon Nov 19 16:35:40 2001 +++ linux-2.4.17-rc1-wli3/mm/page_alloc.c Fri Dec 14 02:44:20 2001 @@ -21,8 +21,9 @@ int nr_swap_pages; int nr_active_pages; -int nr_inactive_pages; -struct list_head inactive_list; +int nr_inactive_dirty_pages; +int nr_inactive_clean_pages; +struct list_head inactive_dirty_list; struct list_head active_list; pg_data_t *pgdat_list; @@ -80,16 +81,17 @@ BUG(); if (PageLocked(page)) BUG(); - if (PageLRU(page)) - BUG(); if (PageActive(page)) BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte_chain) + BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - + page->age = PAGE_AGE_START; + zone = page->zone; mask = (~0UL) << order; @@ -134,17 +136,6 @@ memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -203,10 +194,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -225,78 +213,87 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do the work ourselves, call kswapd. 
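+ *
+ * Called from the first allocation pass in __alloc_pages() when a
+ * zone has dropped below zone->pages_min; direct_reclaim is set only
+ * for order-0 allocations that are allowed to wait (__GFP_WAIT).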
+ */ +static void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +static void fixup_freespace(zone_t * zone, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(); +} + +#define PAGES_MIN 0 +#define PAGES_LOW 1 +#define PAGES_HIGH 2 - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageSwapCache(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) +{ + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - break; - } - } while ((entry = entry->next) != local_pages); + for (;;) { + zone_t *z = *(zone++); + + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of free + inactive_clean + * pages is above the watermark. + */ + switch (limit) { + default: + case PAGES_MIN: + water_mark += z->pages_min; + break; + case PAGES_LOW: + water_mark += z->pages_low; + break; + case PAGES_HIGH: + water_mark += z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; + } else if (water_mark > z->need_balance) { + /* Set kswapd's free+clean target for the zone. + * we could do this in the init code, but this way + * we support arbitrary fallback between zones. + * + * XXX: how about DISCONTIGMEM boxes ? + */ + z->need_balance = water_mark; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. 
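+ * None of the zones reached the requested watermark; the caller
+ * retries with a lower limit or falls back to kswapd and direct
+ * reclaim.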
*/ + return NULL; } /* @@ -304,100 +301,239 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We are falling back to lower-level zones if allocation + * in a higher zone fails. + */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data ... DUH! + */ zone = zonelist->zones; - classzone = *zone; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); min += z->pages_low; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Try to allocate a page from a zone with a HIGH + * amount of free + inactive_clean pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + + wakeup_kswapd(); + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low free + inactive_clean pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + if ((gfp_mask & __GFP_WAIT) && !(current->flags & (PF_MEMALLOC | PF_MEMDIE))) + try_to_free_pages(gfp_mask); + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Damn, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * Try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * When we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we try to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... 
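+ *
+ * (For reference: GFP_ATOMIC does not set __GFP_WAIT and so never
+ * enters this blocking path; GFP_KERNEL sets both __GFP_WAIT and
+ * __GFP_FS and will loop back to try_again above.)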
+ * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. + */ + if (gfp_mask & __GFP_WAIT) { + __set_current_state(TASK_RUNNING); + current->policy |= SCHED_YIELD; + schedule(); + if (!order || free_shortage()) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail in case no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... + */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * instant execution... + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages and we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int freed = 0; zone = zonelist->zones; +defragment_again: for (;;) { zone_t *z = *(zone++); if (!z) break; - - page = rmqueue(z, order); - if (page) - return page; + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - return NULL; - - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* XXX: do real defragmentation instead of calling launder ? 
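+ * For now we make a single PF_MEMALLOC-protected try_to_free_pages()
+ * pass and then retry the inactive_clean reclaim loop above once.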
*/ + if (!freed) { + freed = 1; + current->flags |= PF_MEMALLOC; + try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; - - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + +out_failed: + /* No luck.. */ +// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); + return NULL; } /* @@ -429,7 +565,8 @@ void page_cache_release(struct page *page) { if (!PageReserved(page) && put_page_testzero(page)) { - if (PageLRU(page)) + if (PageActive(page) || PageInactiveDirty(page) || + PageInactiveClean(page)) lru_cache_del(page); __free_pages_ok(page, 0); } @@ -537,10 +674,18 @@ tmpdat = tmpdat->node_next; } - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + nr_free_highpages() << (PAGE_SHIFT-10)); + + printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", + nr_active_pages, + nr_inactive_dirty_pages, + nr_inactive_clean_pages, + nr_free_pages(), + freepages.min, + freepages.low, + freepages.high); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -660,7 +805,7 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); INIT_LIST_HEAD(&active_list); - INIT_LIST_HEAD(&inactive_list); + INIT_LIST_HEAD(&inactive_dirty_list); /* * Some architectures (with lots of mem and discontinous memory @@ -709,7 +854,10 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_dirty_pages = 0; zone->need_balance = 0; + INIT_LIST_HEAD(&zone->inactive_clean_list); if (!size) continue; @@ -723,7 +871,20 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - + /* + * Add these free targets to the global free target; + * we have to be SURE that freepages.high is higher + * than SUM [zone->pages_min] for all zones, otherwise + * we may have bad bad problems. + * + * This means we cannot make the freepages array writable + * in /proc, but have to add a separate extra_free_target + * for people who require it to catch load spikes in eg. + * gigabit ethernet routing... + */ + freepages.min += mask; + freepages.low += mask*2; + freepages.high += mask*3; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; diff -urN linux-2.4.17-rc1-virgin/mm/rmap.c linux-2.4.17-rc1-wli3/mm/rmap.c --- linux-2.4.17-rc1-virgin/mm/rmap.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/rmap.c Fri Dec 14 04:21:37 2001 @@ -0,0 +1,354 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. 
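+ *
+ * Typical usage, as in copy_one_pte() (mm/mremap.c) and unuse_pte()
+ * (mm/swapfile.c) in this patch; sketch only, with the locking rules
+ * documented below:
+ *
+ *	set_pte(ptep, pte);
+ *	page_add_rmap(page, ptep);	(pte now maps page)
+ *	...
+ *	page_remove_rmap(page, ptep);	(about to unmap page)
+ *	pte = ptep_get_and_clear(ptep);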
+ */ + +/* + * Locking: + * - the page->pte_chain is protected by the pagemap_lru_lock, + * we probably want to change this to a per-page lock in the + * future + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include + +#include +#include +#include + +#ifdef DEBUG +/* #define DEBUG */ +#undef DEBUG +#endif + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * A singly linked list should be fine for most, if not all, workloads. + * On fork-after-exec the mapping we'll be removing will still be near + * the start of the list, on mixed application systems the short-lived + * processes will have their mappings near the start of the list and + * in systems with long-lived applications the relative overhead of + * exit() will be lower since the applications are long-lived. + */ +struct pte_chain { + struct pte_chain * next; + pte_t * ptep; +}; + +static struct pte_chain * pte_chain_freelist; +static inline struct pte_chain * pte_chain_alloc(void); +static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *); +static void alloc_new_pte_chains(void); + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * Caller needs to hold the pagemap_lru_lock. + */ +int FASTCALL(page_referenced(struct page *)); +int page_referenced(struct page * page) +{ + struct pte_chain * pc; + int referenced = 0; + + if (PageTestandClearReferenced(page)) + referenced++; + + /* Check all the page tables mapping this page. */ + for (pc = page->pte_chain; pc; pc = pc->next) { + if (ptep_test_and_clear_young(pc->ptep)) + referenced++; + } + + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +void FASTCALL(page_add_rmap(struct page *, pte_t *)); +void page_add_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain, * pc; + struct page * pte_page = virt_to_page(ptep); + + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!pte_page->mapping) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); +#ifdef DEBUG + for (pc = page->pte_chain; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); + } +#endif + pte_chain = pte_chain_alloc(); + + /* Hook up the pte_chain to the page. */ + pte_chain->ptep = ptep; + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + spin_unlock(&pagemap_lru_lock); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. 
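+ * Like page_add_rmap(), this is a no-op for reserved or otherwise
+ * invalid pages, which never carry pte_chains.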
+ */ +void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + + if (!page || !ptep) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); + for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, page); + goto out; + } + } +#ifdef DEBUG + /* Not found. This should NEVER happen! */ + printk("page_remove_rmap: pte_chain %p not present...\n", ptep); + printk("page_remove_rmap: only found: "); + for (pc = page->pte_chain; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + /* panic("page_remove_rmap: giving up.\n"); */ +#endif + +out: + spin_unlock(&pagemap_lru_lock); + return; + +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks. + * Locking: + * pagemap_lru_lock page_launder() + * page lock page_launder(), trylock + * mm->page_table_lock try_to_unmap_one(), trylock + */ +int FASTCALL(try_to_unmap_one(struct page *, pte_t *)); +int try_to_unmap_one(struct page * page, pte_t * ptep) +{ + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_AGAIN; + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the page table entry. */ + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry; + entry.val = page->index; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold pagemap_lru_lock + * and the page lock. Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_ERROR - an error occurred + */ +int FASTCALL(try_to_unmap(struct page *)); +int try_to_unmap(struct page * page) +{ + struct pte_chain * pc, * next_pc, * prev_pc = NULL; + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + if (!VALID_PAGE(page) || PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. 
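+ * (the pageout path is expected to have moved anonymous pages into
+ * the swap cache, e.g. via add_to_swap(), before calling us, so a
+ * NULL page->mapping here is a bug)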
*/ + if (!page->mapping) + BUG(); + + for (pc = page->pte_chain; pc; pc = next_pc) { + next_pc = pc->next; + switch (try_to_unmap_one(page, pc->ptep)) { + case SWAP_SUCCESS: + /* Free the pte_chain struct. */ + pte_chain_free(pc, prev_pc, page); + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + prev_pc = pc; + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + return SWAP_FAIL; + case SWAP_ERROR: + return SWAP_ERROR; + } + } + + return ret; +} + +/** + * pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + * @prev_pte_chain: previous pte_chain on the list (may be NULL) + * @page: page this pte_chain hangs off (may be NULL) + * + * This function unlinks pte_chain from the singly linked list it + * may be on and adds the pte_chain to the free list. May also be + * called for new pte_chain structures which aren't on any list yet. + * Caller needs to hold the pagemap_lru_list. + */ +static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page) +{ + if (prev_pte_chain) + prev_pte_chain->next = pte_chain->next; + else if (page) + page->pte_chain = pte_chain->next; + + pte_chain->ptep = NULL; + pte_chain->next = pte_chain_freelist; + pte_chain_freelist = pte_chain; +} + +/** + * pte_chain_alloc - allocate a pte_chain struct + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the pagemap_lru_lock. + */ +static inline struct pte_chain * pte_chain_alloc(void) +{ + struct pte_chain * pte_chain; + + /* Allocate new pte_chain structs as needed. */ + if (!pte_chain_freelist) + alloc_new_pte_chains(); + + /* Grab the first pte_chain from the freelist. */ + pte_chain = pte_chain_freelist; + pte_chain_freelist = pte_chain->next; + pte_chain->next = NULL; + + return pte_chain; +} + +/** + * alloc_new_pte_chains - convert a free page to pte_chain structures + * + * Grabs a free page and converts it to pte_chain structures. We really + * should pre-allocate these earlier in the pagefault path or come up + * with some other trick. + */ +static void alloc_new_pte_chains(void) +{ + struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC); + int i = PAGE_SIZE / sizeof(struct pte_chain); + + if (pte_chain) { + for (; i-- > 0; pte_chain++) + pte_chain_free(pte_chain, NULL, NULL); + } else { + /* Yeah yeah, I'll fix the pte_chain allocation ... 
*/ + panic("Fix pte_chain allocation, you lazy bastard!\n"); + } +} diff -urN linux-2.4.17-rc1-virgin/mm/shmem.c linux-2.4.17-rc1-wli3/mm/shmem.c --- linux-2.4.17-rc1-virgin/mm/shmem.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/shmem.c Wed Nov 21 09:57:57 2001 @@ -1193,7 +1193,7 @@ follow_link: shmem_follow_link, }; -static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long * blocks, unsigned long *inodes) +static int shmem_parse_options(char *options, int *mode, unsigned long * blocks, unsigned long *inodes) { char *this_char, *value, *rest; @@ -1205,7 +1205,7 @@ *value++ = 0; } else { printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", + "shmem_parse_options: No value for option '%s'\n", this_char); return 1; } @@ -1230,20 +1230,8 @@ *mode = simple_strtoul(value,&rest,8); if (*rest) goto bad_val; - } else if (!strcmp(this_char,"uid")) { - if (!uid) - continue; - *uid = simple_strtoul(value,&rest,0); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"gid")) { - if (!gid) - continue; - *gid = simple_strtoul(value,&rest,0); - if (*rest) - goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", + printk(KERN_ERR "shmem_parse_options: Bad option %s\n", this_char); return 1; } @@ -1251,7 +1239,7 @@ return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + printk(KERN_ERR "shmem_parse_options: Bad value '%s' for option '%s'\n", value, this_char); return 1; @@ -1263,7 +1251,7 @@ unsigned long max_blocks = sbinfo->max_blocks; unsigned long max_inodes = sbinfo->max_inodes; - if (shmem_parse_options (data, NULL, NULL, NULL, &max_blocks, &max_inodes)) + if (shmem_parse_options (data, NULL, &max_blocks, &max_inodes)) return -EINVAL; return shmem_set_size(sbinfo, max_blocks, max_inodes); } @@ -1280,8 +1268,6 @@ struct dentry * root; unsigned long blocks, inodes; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current->fsuid; - gid_t gid = current->fsgid; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct sysinfo si; @@ -1293,8 +1279,10 @@ blocks = inodes = si.totalram / 2; #ifdef CONFIG_TMPFS - if (shmem_parse_options (data, &mode, &uid, &gid, &blocks, &inodes)) + if (shmem_parse_options (data, &mode, &blocks, &inodes)) { + printk(KERN_ERR "tmpfs invalid option\n"); return NULL; + } #endif spin_lock_init (&sbinfo->stat_lock); @@ -1311,8 +1299,6 @@ if (!inode) return NULL; - inode->i_uid = uid; - inode->i_gid = gid; root = d_alloc_root(inode); if (!root) { iput(inode); diff -urN linux-2.4.17-rc1-virgin/mm/slab.c linux-2.4.17-rc1-wli3/mm/slab.c --- linux-2.4.17-rc1-virgin/mm/slab.c Fri Dec 14 06:04:16 2001 +++ linux-2.4.17-rc1-wli3/mm/slab.c Fri Dec 14 02:44:44 2001 @@ -49,7 +49,9 @@ * constructors and destructors are called without any locking. * Several members in kmem_cache_t and slab_t never change, they * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * they are however called with local interrupts disabled so no + * preempt_disable needed. * The non-constant members are protected with a per-cache irq spinlock. 
* * Further notes from the original documentation: @@ -109,11 +111,9 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN) + SLAB_NO_REAP | SLAB_CACHE_DMA) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN) +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | SLAB_CACHE_DMA) #endif /* @@ -651,7 +651,7 @@ flags &= ~SLAB_POISON; } #if FORCED_DEBUG - if ((size < (PAGE_SIZE>>3)) && !(flags & SLAB_MUST_HWCACHE_ALIGN)) + if (size < (PAGE_SIZE>>3)) /* * do not red zone large object, causes severe * fragmentation. @@ -1282,9 +1282,10 @@ }) #ifdef CONFIG_SMP -void* kmem_cache_alloc_batch(kmem_cache_t* cachep, cpucache_t* cc, int flags) +void* kmem_cache_alloc_batch(kmem_cache_t* cachep, int flags) { int batchcount = cachep->batchcount; + cpucache_t* cc = cc_data(cachep); spin_lock(&cachep->spinlock); while (batchcount--) { @@ -1333,7 +1334,7 @@ objp = cc_entry(cc)[--cc->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = kmem_cache_alloc_batch(cachep,cc,flags); + objp = kmem_cache_alloc_batch(cachep,flags); if (!objp) goto alloc_new_slab_nolock; } @@ -1921,13 +1922,12 @@ #endif #ifdef CONFIG_SMP { - cpucache_t *cc = cc_data(cachep); unsigned int batchcount = cachep->batchcount; unsigned int limit; - if (cc) - limit = cc->limit; - else + if (cc_data(cachep)) + limit = cc_data(cachep)->limit; + else limit = 0; len += sprintf(page+len, " : %4u %4u", limit, batchcount); diff -urN linux-2.4.17-rc1-virgin/mm/swap.c linux-2.4.17-rc1-wli3/mm/swap.c --- linux-2.4.17-rc1-virgin/mm/swap.c Tue Nov 6 22:44:20 2001 +++ linux-2.4.17-rc1-wli3/mm/swap.c Fri Dec 14 02:44:20 2001 @@ -24,6 +24,20 @@ #include /* for copy_to/from_user */ #include +/* + * We identify three levels of free memory. We never let free mem + * fall below the freepages.min except for atomic allocations. We + * start background swapping if we fall below freepages.high free + * pages, and we begin intensive swapping below freepages.low. + * + * Actual initialization is done in mm/page_alloc.c + */ +freepages_t freepages = { + 0, /* freepages.min */ + 0, /* freepages.low */ + 0 /* freepages.high */ +}; + /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -33,17 +47,59 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * @nolock - are we already holding the pagemap_lru_lock? + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void FASTCALL(deactivate_page_nolock(struct page *)); +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. + * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + if (PageActive(page)) { + page->age = 0; + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void FASTCALL(deactivate_page(struct page *)); +void deactivate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + /* * Move an inactive page to the active list. 
*/ -static inline void activate_page_nolock(struct page * page) +void FASTCALL(activate_page_nolock(struct page *)); +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); add_page_to_active_list(page); } + + /* Make sure the page gets a fair chance at staying active. */ + page->age = max((int)page->age, PAGE_AGE_START); } +void FASTCALL(activate_page(struct page *)); void activate_page(struct page * page) { spin_lock(&pagemap_lru_lock); @@ -55,11 +111,12 @@ * lru_cache_add: add a page to the page lists * @page: the page to add */ +void FASTCALL(lru_cache_add(struct page *)); void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + add_page_to_active_list(page); spin_unlock(&pagemap_lru_lock); } } @@ -71,14 +128,15 @@ * This function is for when the caller already holds * the pagemap_lru_lock. */ +void FASTCALL(__lru_cache_del(struct page *)); void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } } @@ -86,6 +144,7 @@ * lru_cache_del: remove a page from the page lists * @page: the page to remove */ +void FASTCALL(lru_cache_del(struct page *)); void lru_cache_del(struct page * page) { spin_lock(&pagemap_lru_lock); diff -urN linux-2.4.17-rc1-virgin/mm/swap_state.c linux-2.4.17-rc1-wli3/mm/swap_state.c --- linux-2.4.17-rc1-virgin/mm/swap_state.c Wed Oct 31 15:31:03 2001 +++ linux-2.4.17-rc1-wli3/mm/swap_state.c Fri Dec 14 02:44:20 2001 @@ -89,6 +89,40 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); + return 1; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache. diff -urN linux-2.4.17-rc1-virgin/mm/swapfile.c linux-2.4.17-rc1-wli3/mm/swapfile.c --- linux-2.4.17-rc1-virgin/mm/swapfile.c Sat Nov 3 17:05:25 2001 +++ linux-2.4.17-rc1-wli3/mm/swapfile.c Sun Dec 16 17:58:10 2001 @@ -374,6 +374,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); swap_free(entry); ++vma->vm_mm->rss; } @@ -696,6 +697,7 @@ * interactive performance. Interruptible check on * signal_pending() would be nice, but changes the spec? 
*/ + debug_lock_break(551); if (current->need_resched) schedule(); } @@ -1121,6 +1123,13 @@ if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + debug_lock_break(551); + swap_list_unlock(); + debug_lock_break(551); + unconditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: diff -urN linux-2.4.17-rc1-virgin/mm/swapfile.c~ linux-2.4.17-rc1-wli3/mm/swapfile.c~ --- linux-2.4.17-rc1-virgin/mm/swapfile.c~ Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc1-wli3/mm/swapfile.c~ Fri Dec 14 02:44:20 2001 @@ -0,0 +1,1291 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include +#include +#include +#include +#include +#include /* for blk_size */ +#include +#include +#include +#include + +#include + +spinlock_t swaplock = SPIN_LOCK_UNLOCKED; +unsigned int nr_swapfiles; +int total_swap_pages; +static int swap_overflow; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +struct swap_list_t swap_list = {-1, -1}; + +struct swap_info_struct swap_info[MAX_SWAPFILES]; + +#define SWAPFILE_CLUSTER 256 + +static inline int scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset; + /* + * We try to cluster swap pages by allocating them + * sequentially in swap. Once we've allocated + * SWAPFILE_CLUSTER pages this way, however, we resort to + * first-free allocation, starting a new cluster. This + * prevents us from scattering swap pages all over the entire + * swap partition, so that we reduce overall disk seek times + * between swap pages. -- sct */ + if (si->cluster_nr) { + while (si->cluster_next <= si->highest_bit) { + offset = si->cluster_next++; + if (si->swap_map[offset]) + continue; + si->cluster_nr--; + goto got_page; + } + } + si->cluster_nr = SWAPFILE_CLUSTER; + + /* try to find an empty (even not aligned) cluster. */ + offset = si->lowest_bit; + check_next_cluster: + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) + { + int nr; + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) + if (si->swap_map[nr]) + { + offset = nr+1; + goto check_next_cluster; + } + /* We found a completly empty cluster, so start + * using it. + */ + goto got_page; + } + /* No luck, so now go finegrined as usual. 
-Andrea */ + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + si->lowest_bit = offset+1; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + if (si->lowest_bit > si->highest_bit) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + nr_swap_pages--; + si->cluster_next = offset+1; + return offset; + } + si->lowest_bit = si->max; + si->highest_bit = 0; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset; + swp_entry_t entry; + int type, wrapped = 0; + + entry.val = 0; /* Out of memory */ + swap_list_lock(); + type = swap_list.next; + if (type < 0) + goto out; + if (nr_swap_pages <= 0) + goto out; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + swap_device_lock(p); + offset = scan_swap_map(p); + swap_device_unlock(p); + if (offset) { + entry = SWP_ENTRY(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) { + swap_list.next = swap_list.head; + } else { + swap_list.next = type; + } + goto out; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else + if (type < 0) + goto out; /* out of swap space */ + } +out: + swap_list_unlock(); + return entry; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static void swap_info_put(struct swap_info_struct * p) +{ + swap_device_unlock(p); + swap_list_unlock(); +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, SWP_OFFSET(entry)); + swap_info_put(p); + } +} + +/* + * Check if we're the only user of a swap page, + * when the page is locked. + */ +static int exclusive_swap_page(struct page *page) +{ + int retval = 0; + struct swap_info_struct * p; + swp_entry_t entry; + + entry.val = page->index; + p = swap_info_get(entry); + if (p) { + /* Is the only swap cache user the cache itself? 
*/ + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) + retval = 1; + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + } + return retval; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + * + * Here "exclusive_swap_page()" does the real + * work, but we opportunistically check whether + * we need to get all the locks first.. + */ +int can_share_swap_page(struct page *page) +{ + int retval = 0; + + if (!PageLocked(page)) + BUG(); + switch (page_count(page)) { + case 3: + if (!page->buffers) + break; + /* Fallthrough */ + case 2: + if (!PageSwapCache(page)) + break; + retval = exclusive_swap_page(page); + break; + case 1: + if (PageReserved(page)) + break; + retval = 1; + } + return retval; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +int remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + + entry.val = page->index; + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + + if (retval) { + block_flushpage(page, 0); + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) + page = find_trylock_page(&swapper_space, entry.val); + swap_info_put(p); + } + if (page) { + page_cache_get(page); + /* Only cache user (+us), or swap space full? Free it! */ + if (page_count(page) == 2 || vm_swap_full()) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + UnlockPage(page); + page_cache_release(page); + } +} + +/* + * The swap entry has been read in advance, and we return 1 to indicate + * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. 
+ */ +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, swp_entry_t entry, struct page* page) +{ + pte_t pte = *dir; + + if (likely(pte_to_swp_entry(pte).val != entry.val)) + return; + if (unlikely(pte_none(pte) || pte_present(pte))) + return; + get_page(page); + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); + swap_free(entry); + ++vma->vm_mm->rss; +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + swp_entry_t entry, struct page* page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + swp_entry_t entry, struct page* page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page* page) +{ + unsigned long start = vma->vm_start, end = vma->vm_end; + + if (start >= end) + BUG(); + do { + unuse_pgd(vma, pgdir, start, end - start, entry, page); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (start && (start < end)); +} + +static void unuse_process(struct mm_struct * mm, + swp_entry_t entry, struct page* page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + unuse_vma(vma, pgd, entry, page); + } + spin_unlock(&mm->page_table_lock); + return; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static int find_next_to_unuse(struct swap_info_struct *si, int prev) +{ + int max = si->max; + int i = prev; + int count; + + /* + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). + */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. 
+ */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering (now preserved by swap_out()), + * which clusters forked address spaces together, most recent + * child immediately after parent. If we race with dup_mmap(), + * we very much want to resolve parent before child, otherwise + * we may miss some entries: using last mm would invert that. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * mmput() removes mm from mmlist before exit_mmap() and its + * zap_page_range(). That's not too bad, those entries are + * on their way out, and handled faster there than here. + * do_munmap() behaves similarly, taking the range out of mm's + * vma list before zap_page_range(). But unfortunately, when + * unmapping a part of a vma, it takes the whole out first, + * then reinserts what's left after (might even reschedule if + * open() method called) - so swap entries may be invisible + * to swapoff for a while, then reappear - but that is rare. + */ + while ((i = find_next_to_unuse(si, i))) { + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = SWP_ENTRY(type, i); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page(page); + lock_page(page); + + /* + * Remove all references to entry, without blocking. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. 
+ */ + swcount = *swap_map; + if (swcount > 1) { + flush_page_to_ram(page); + if (start_mm == &init_mm) + shmem_unuse(entry, page); + else + unuse_process(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *mm; + + spin_lock(&mmlist_lock); + while (*swap_map > 1 && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + swcount = *swap_map; + if (mm == &init_mm) { + set_start_mm = 1; + shmem_unuse(entry, page); + } else + unuse_process(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + new_start_mm = mm; + set_start_mm = 0; + } + } + atomic_inc(&new_start_mm->mm_users); + spin_unlock(&mmlist_lock); + mmput(start_mm); + start_mm = new_start_mm; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_list_lock(); + swap_device_lock(si); + nr_swap_pages++; + *swap_map = 1; + swap_device_unlock(si); + swap_list_unlock(); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_swap_out could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * Note shmem_unuse already deleted its from swap cache. + */ + swcount = *swap_map; + if ((swcount > 0) != PageSwapCache(page)) + BUG(); + if ((swcount > 1) && PageDirty(page)) { + rw_swap_page(WRITE, page); + lock_page(page); + } + if (PageSwapCache(page)) + delete_from_swap_cache(page); + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so try_to_swap_out will preserve it. + */ + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. Interruptible check on + * signal_pending() would be nice, but changes the spec? 
+ */ + if (current->need_resched) + schedule(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +asmlinkage long sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct nameidata nd; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = user_path_walk(specialfile, &nd); + if (err) + goto out; + + lock_kernel(); + prev = -1; + swap_list_lock(); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + if (p->swap_file == nd.dentry) + break; + } + prev = type; + } + err = -EINVAL; + if (type < 0) { + swap_list_unlock(); + goto out_dput; + } + + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags = SWP_USED; + swap_list_unlock(); + unlock_kernel(); + err = try_to_unuse(type); + lock_kernel(); + if (err) { + /* re-insert swap space back into swap_list */ + swap_list_lock(); + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags = SWP_WRITEOK; + swap_list_unlock(); + goto out_dput; + } + if (p->swap_device) + blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); + path_release(&nd); + + swap_list_lock(); + swap_device_lock(p); + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_vfsmnt = NULL; + p->swap_file = NULL; + p->swap_device = 0; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + swap_device_unlock(p); + swap_list_unlock(); + vfree(swap_map); + err = 0; + +out_dput: + unlock_kernel(); + path_release(&nd); +out: + return err; +} + +int get_swaparea_info(char *buf) +{ + char * page = (char *) __get_free_page(GFP_KERNEL); + struct swap_info_struct *ptr = swap_info; + int i, j, len = 0, usedswap; + + if (!page) + return -ENOMEM; + + len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if ((ptr->flags & SWP_USED) && ptr->swap_map) { + char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, + page, PAGE_SIZE); + + len += sprintf(buf + len, "%-31s ", path); + + if (!ptr->swap_device) + len += sprintf(buf + len, "file\t\t"); + else + len += sprintf(buf + len, "partition\t"); + + usedswap = 0; + for (j = 0; j < ptr->max; ++j) + switch (ptr->swap_map[j]) { + case SWAP_MAP_BAD: + case 0: + continue; + default: + usedswap++; + } + len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), + usedswap << (PAGE_SHIFT - 10), ptr->prio); + } + } + free_page((unsigned long) page); + return len; +} + +int is_swap_partition(kdev_t dev) { + struct swap_info_struct *ptr = swap_info; + int i; + + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if (ptr->flags & SWP_USED) + if (ptr->swap_device == dev) + return 1; + } + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
+ * + * The swapon system call + */ +asmlinkage long sys_swapon(const char * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + struct nameidata nd; + struct inode * swap_inode; + unsigned int type; + int i, j, prev; + int error; + static int least_priority = 0; + union swap_header *swap_header = 0; + int swap_header_version; + int nr_good_pages = 0; + unsigned long maxpages = 1; + int swapfilesize; + struct block_device *bdev = NULL; + unsigned short *swap_map; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + lock_kernel(); + swap_list_lock(); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + if (type >= MAX_SWAPFILES) { + swap_list_unlock(); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->sdev_lock = SPIN_LOCK_UNLOCKED; + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + swap_list_unlock(); + error = user_path_walk(specialfile, &nd); + if (error) + goto bad_swap_2; + + p->swap_file = nd.dentry; + p->swap_vfsmnt = nd.mnt; + swap_inode = nd.dentry->d_inode; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + kdev_t dev = swap_inode->i_rdev; + struct block_device_operations *bdops; + + p->swap_device = dev; + set_blocksize(dev, PAGE_SIZE); + + bd_acquire(swap_inode); + bdev = swap_inode->i_bdev; + bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode)); + if (bdops) bdev->bd_op = bdops; + + error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); + if (error) + goto bad_swap_2; + set_blocksize(dev, PAGE_SIZE); + error = -ENODEV; + if (!dev || (blk_size[MAJOR(dev)] && + !blk_size[MAJOR(dev)][MINOR(dev)])) + goto bad_swap; + swapfilesize = 0; + if (blk_size[MAJOR(dev)]) + swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] + >> (PAGE_SHIFT - 10); + } else if (S_ISREG(swap_inode->i_mode)) + swapfilesize = swap_inode->i_size >> PAGE_SHIFT; + else + goto bad_swap; + + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + struct swap_info_struct *q = &swap_info[i]; + if (i == type || !q->swap_file) + continue; + if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + goto bad_swap; + } + + swap_header = (void *) __get_free_page(GFP_USER); + if (!swap_header) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + + lock_page(virt_to_page(swap_header)); + rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + memset(((char *) swap_header)+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,(char *) swap_header)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + maxpages = i+1; + j++; + } + } + nr_good_pages = j; + p->swap_map = vmalloc(maxpages * sizeof(short)); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < maxpages ; i++) { + if (test_bit(i,(char 
*) swap_header))
+ p->swap_map[i] = 0;
+ else
+ p->swap_map[i] = SWAP_MAP_BAD;
+ }
+ break;
+
+ case 2:
+ /* Check the swap header's sub-version and the size of
+ the swap file and bad block lists */
+ if (swap_header->info.version != 1) {
+ printk(KERN_WARNING
+ "Unable to handle swap header version %d\n",
+ swap_header->info.version);
+ error = -EINVAL;
+ goto bad_swap;
+ }
+
+ p->lowest_bit = 1;
+ maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
+ if (maxpages > swap_header->info.last_page)
+ maxpages = swap_header->info.last_page;
+ p->highest_bit = maxpages - 1;
+
+ error = -EINVAL;
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ goto bad_swap;
+
+ /* OK, set up the swap map and apply the bad block list */
+ if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+
+ error = 0;
+ memset(p->swap_map, 0, maxpages * sizeof(short));
+ for (i=0; i<swap_header->info.nr_badpages; i++) {
+ int page = swap_header->info.badpages[i];
+ if (page <= 0 || page >= swap_header->info.last_page)
+ error = -EINVAL;
+ else
+ p->swap_map[page] = SWAP_MAP_BAD;
+ }
+ nr_good_pages = swap_header->info.last_page -
+ swap_header->info.nr_badpages -
+ 1 /* header page */;
+ if (error)
+ goto bad_swap;
+ }
+
+ if (swapfilesize && maxpages > swapfilesize) {
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ if (!nr_good_pages) {
+ printk(KERN_WARNING "Empty swap-file\n");
+ error = -EINVAL;
+ goto bad_swap;
+ }
+ p->swap_map[0] = SWAP_MAP_BAD;
+ swap_list_lock();
+ swap_device_lock(p);
+ p->max = maxpages;
+ p->flags = SWP_WRITEOK;
+ p->pages = nr_good_pages;
+ nr_swap_pages += nr_good_pages;
+ total_swap_pages += nr_good_pages;
+ printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
+ nr_good_pages<<(PAGE_SHIFT-10), p->prio);
+
+ /* insert swap space into swap_list: */
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+ if (p->prio >= swap_info[i].prio) {
+ break;
+ }
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0) {
+ swap_list.head = swap_list.next = p - swap_info;
+ } else {
+ swap_info[prev].next = p - swap_info;
+ }
+ swap_device_unlock(p);
+ swap_list_unlock();
+ error = 0;
+ goto out;
+bad_swap:
+ if (bdev)
+ blkdev_put(bdev, BDEV_SWAP);
+bad_swap_2:
+ swap_list_lock();
+ swap_map = p->swap_map;
+ nd.mnt = p->swap_vfsmnt;
+ nd.dentry = p->swap_file;
+ p->swap_device = 0;
+ p->swap_file = NULL;
+ p->swap_vfsmnt = NULL;
+ p->swap_map = NULL;
+ p->flags = 0;
+ if (!(swap_flags & SWAP_FLAG_PREFER))
+ ++least_priority;
+ swap_list_unlock();
+ if (swap_map)
+ vfree(swap_map);
+ path_release(&nd);
+out:
+ if (swap_header)
+ free_page((long) swap_header);
+ unlock_kernel();
+ return error;
+}
+
+void si_swapinfo(struct sysinfo *val)
+{
+ unsigned int i;
+ unsigned long nr_to_be_unused = 0;
+
+ swap_list_lock();
+ for (i = 0; i < nr_swapfiles; i++) {
+ unsigned int j;
+ if (swap_info[i].flags != SWP_USED)
+ continue;
+ for (j = 0; j < swap_info[i].max; ++j) {
+ switch (swap_info[i].swap_map[j]) {
+ case 0:
+ case SWAP_MAP_BAD:
+ continue;
+ default:
+ nr_to_be_unused++;
+ }
+ }
+ }
+ val->freeswap = nr_swap_pages + nr_to_be_unused;
+ val->totalswap = total_swap_pages + nr_to_be_unused;
+ swap_list_unlock();
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
+ * "permanent", but will be reclaimed by the next swapoff.
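+ *
+ * (A typical caller pattern - roughly what copy_page_range() does
+ * when fork() meets a swap pte - looks like:
+ *
+ *	swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ *	swap_duplicate(entry);
+ *	set_pte(dst_pte, *src_pte);
+ *
+ * src_pte and dst_pte are illustrative names here, not code from
+ * this patch.)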
+ */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + + swap_device_lock(p); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + swap_device_unlock(p); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +/* + * Page lock needs to be held in all cases to prevent races with + * swap file deletion. + */ +int swap_count(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + swp_entry_t entry; + int retval = 0; + + entry.val = page->index; + if (!entry.val) + goto bad_entry; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +out: + return retval; + +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_unused: + printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val); + goto out; +} + +/* + * Prior swap_duplicate protects against swap device deletion. + */ +void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, + kdev_t *dev, struct inode **swapf) +{ + unsigned long type; + struct swap_info_struct *p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); + return; + } + + p = &swap_info[type]; + *offset = SWP_OFFSET(entry); + if (*offset >= p->max && *offset != 0) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); + return; + } + if (p->swap_map && !p->swap_map[*offset]) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); + return; + } + if (!(p->flags & SWP_USED)) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); + return; + } + + if (p->swap_device) { + *dev = p->swap_device; + } else if (p->swap_file) { + *swapf = p->swap_file->d_inode; + } else { + printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + } + return; +} + +/* + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. 
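+ * (Worked example: with page_cluster == 4, a fault on swap offset
+ * 53 gives toff = 48 and a window of 1 << 4 = 16 slots, so offsets
+ * 48-63 are offered for readahead, stopping early at a free slot,
+ * a bad slot, or the end of the swap area.)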
+ */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + int ret = 0, i = 1 << page_cluster; + unsigned long toff; + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + if (!page_cluster) /* no readahead */ + return 0; + toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; + + swap_device_lock(swapdev); + do { + /* Don't read-ahead past the end of the swap area */ + if (toff >= swapdev->max) + break; + /* Don't read in free or bad pages */ + if (!swapdev->swap_map[toff]) + break; + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + break; + toff++; + ret++; + } while (--i); + swap_device_unlock(swapdev); + return ret; +} diff -urN linux-2.4.17-rc1-virgin/mm/vmalloc.c linux-2.4.17-rc1-wli3/mm/vmalloc.c --- linux-2.4.17-rc1-virgin/mm/vmalloc.c Fri Dec 14 06:04:17 2001 +++ linux-2.4.17-rc1-wli3/mm/vmalloc.c Mon Sep 17 13:16:31 2001 @@ -6,7 +6,6 @@ * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 */ -#include #include #include #include @@ -274,43 +273,6 @@ if (count == 0) goto finished; *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} - -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; buf++; addr++; count--; diff -urN linux-2.4.17-rc1-virgin/mm/vmscan.c linux-2.4.17-rc1-wli3/mm/vmscan.c --- linux-2.4.17-rc1-virgin/mm/vmscan.c Sat Nov 17 19:18:17 2001 +++ linux-2.4.17-rc1-wli3/mm/vmscan.c Fri Dec 14 02:44:20 2001 @@ -32,349 +32,267 @@ */ #define DEF_PRIORITY (6) -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ +int vm_static_inactive_target; -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +static inline void age_page_up(struct page *page) { - pte_t pte; - swp_entry_t entry; + page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); +} - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } +static inline void age_page_down(struct page *page) +{ + page->age -= min(PAGE_AGE_DECL, (int)page->age); +} - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; +/* + * Estimate whether a zone has enough inactive or free pages.. + */ +static unsigned int zone_inactive_plenty(zone_t *zone) +{ + unsigned int inactive; - /* Don't bother replenishing zones not under pressure.. 
*/ - if (!memclass(page->zone, classzone)) + if (!zone->size) return 0; + + inactive = zone->inactive_dirty_pages; + inactive += zone->inactive_clean_pages; + inactive += zone->free_pages; - if (TryLockPage(page)) - return 0; + return (inactive > (zone->size * 2 / 5)); +} - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); +#define FREE_PLENTY_FACTOR 4 +static unsigned int zone_free_plenty(zone_t *zone) +{ + unsigned int free, target; - if (pte_dirty(pte)) - set_page_dirty(page); + target = max((int) zone->pages_high, zone->need_balance); - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - UnlockPage(page); - { - int freeable = page_count(page) - !!page->buffers <= 2; - page_cache_release(page); - return freeable; - } - } + free = zone->free_pages; + free += zone->inactive_clean_pages; - /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. - * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; + return free > target * FREE_PLENTY_FACTOR; +} - /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. - */ - if (page->buffers) - goto preserve; +static unsigned int free_plenty(void) +{ + unsigned int free; - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - if (add_to_swap_cache(page, entry) == 0) { - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; - } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } + free = nr_free_pages(); + free += nr_inactive_clean_pages; - /* No swap space left */ -preserve: - set_pte(page_table, pte); - UnlockPage(page); - return 0; + return free > freepages.high * FREE_PLENTY_FACTOR; } -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) +static inline int page_mapping_inuse(struct page * page) { - pte_t * pte; - unsigned long pmd_end; + struct address_space * mapping = page->mapping; - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* Page is in somebody's page tables. */ + if (page->pte_chain) + return 1; - do { - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - - if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - break; - } - } - } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - mm->swap_address = address; - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; - - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; - } + /* XXX: does this happen ? */ + if (!mapping) + return 0; - pmd = pmd_offset(dir, address); + /* File is mmaped by somebody. */ + if (mapping->i_mmap || mapping->i_mmap_shared) + return 1; - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) -{ - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; - - pgdir = pgd_offset(mm, address); - - end = vma->vm_end; - if (address >= end) - BUG(); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; + return 0; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; - -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +struct page * reclaim_page(zone_t * zone) { - unsigned long address; - struct vm_area_struct* vma; + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * Find the proper vm-area after freezing the vma chain - * and ptes. 
+ * We need to hold the pagecache_lock around all tests to make sure + * reclaim_page() cannot race with find_get_page() and friends. */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; - } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + maxscan = zone->inactive_clean_pages; + while ((page_lru = zone->inactive_clean_list.prev) != + &zone->inactive_clean_list && maxscan--) { + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageInactiveClean(page))) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + page->zone->inactive_clean_pages--; + continue; } - } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} + /* Page is being freed */ + if (unlikely(page_count(page)) == 0) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + continue; + } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; + /* Page cannot be reclaimed ? Move to inactive_dirty list. */ + if (unlikely(page->pte_chain || page->buffers || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TryLockPage(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + continue; + } - counter = mmlist_nr; - do { - if (unlikely(current->need_resched)) { - __set_current_state(TASK_RUNNING); - schedule(); + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; } - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); + /* We should never ever get here. 
*/ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + UnlockPage(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + return NULL; - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); +found_page: + del_page_from_inactive_clean_list(page); + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + if (entry.val) + swap_free(entry); + UnlockPage(page); + page->age = PAGE_AGE_START; + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; +} - mmput(mm); +static inline int page_dirty(struct page *page) +{ + struct buffer_head *tmp, *bh; - if (!nr_pages) - return 1; - } while (--counter >= 0); + if (PageDirty(page)) + return 1; - return 0; + if (page->mapping && !page->buffers) + return 0; + + tmp = bh = page->buffers; + + do { + if (tmp->b_state & ((1<b_this_page; + } while (tmp != bh); -empty: - spin_unlock(&mmlist_lock); return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * @sync: are we allowed to do synchronous IO in emergencies ? + * + * This function is called when we are low on free / inactive_clean + * pages, its purpose is to refill the free/clean list as efficiently + * as possible. + * + * This means we do writes asynchronously as long as possible and will + * only sleep on IO when we don't have another option. Since writeouts + * cause disk seeks and make read IO slower, we skip writes alltogether + * when the amount of dirty pages is small. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon. + */ +#define CAN_DO_FS ((gfp_mask & __GFP_FS) && should_write) +#define WRITE_LOW_WATER 5 +#define WRITE_HIGH_WATER 10 +int page_launder(int gfp_mask) { + int maxscan, cleaned_pages; struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = nr_pages << (9 - priority); + cleaned_pages = 0; + + /* The main launder loop. */ spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + maxscan = nr_inactive_dirty_pages; + while (--maxscan >= 0 && (entry = inactive_dirty_list.prev) != &inactive_dirty_list) { struct page * page; - if (unlikely(current->need_resched)) { - spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); - schedule(); - spin_lock(&pagemap_lru_lock); - continue; - } - page = list_entry(entry, struct page, lru); - if (unlikely(!PageLRU(page))) - BUG(); - if (unlikely(PageActive(page))) - BUG(); - list_del(entry); - list_add(entry, &inactive_list); + list_add(entry, &inactive_dirty_list); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveDirty(page)) { + printk("VM: page_launder, wrong page on list.\n"); + list_del(entry); + nr_inactive_dirty_pages--; + page->zone->inactive_dirty_pages--; + continue; + } /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. 
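+ * (page_referenced() folds in the referenced bits collected from
+ * the ptes on the page's pte_chain as well as PG_referenced, so a
+ * page touched through any of its mappings is promoted here.)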
*/ - if (unlikely(!page_count(page))) + if ((page_referenced(page) || page->age) && + page_mapping_inuse(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + page->age = max((int)page->age, PAGE_AGE_START); continue; + } - if (!memclass(page->zone, classzone)) + /* + * The page is still in the page tables of some process, + * move it to the active list but leave page age at 0; + * either swap_out() will make it freeable soon or it is + * mlock()ed... + * + * The !PageLocked() test is to protect us from ourselves, + * see the code around the writepage() call. + */ + if ((page_count(page) > (1 + !!page->buffers)) && + !PageLocked(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); continue; + } - /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; + /* + * If this zone has plenty of pages free, don't spend time + * on cleaning it but only move clean pages out of the way + * so we won't have to scan those again. + */ + if (zone_free_plenty(page->zone) || page_count(page) == 0) { + continue; + } /* * The page is locked. IO in progress? @@ -391,12 +309,49 @@ continue; } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte_chain && !page->mapping && !page->buffers) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + if (!add_to_swap(page)) { + activate_page(page); + UnlockPage(page); + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + continue; + } + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + } + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page->pte_chain) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + UnlockPage(page); + continue; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page) && page->mapping) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer * like O_DIRECT would set the PG_dirty bitflag - * on the phisical page after having successfully + * on the physical page after having successfully * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. */ @@ -425,7 +380,7 @@ if (page->buffers) { spin_unlock(&pagemap_lru_lock); - /* avoid to free a locked page */ + /* To avoid freeing our page before we're done. */ page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { @@ -443,14 +398,14 @@ /* effectively free the page here */ page_cache_release(page); - if (--nr_pages) - continue; - break; + cleaned_pages++; + continue; } else { /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. + * We freed the buffers but may have + * slept; undo the stuff we did before + * try_to_release_page and fall through + * to the next step. */ page_cache_release(page); @@ -466,224 +421,279 @@ } } - spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * If the page is really freeable now, move it to the + * inactive_clean list. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. 
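+ * The conditions below mirror the ones reclaim_page() checks:
+ * still in the page cache, not dirty, no pte_chain users and
+ * only the cache's reference left.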
+ * This test is not safe from races, but only the one + * in reclaim_page() needs to be. */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); + if (page->mapping && !PageDirty(page) && !page->pte_chain && + page_count(page) == 1) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; - + cleaned_pages++; + } else { /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it to + * the active list. */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } - - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. - */ - if (PageDirty(page)) { - spin_unlock(&pagecache_lock); +page_active: + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); UnlockPage(page); - continue; } - - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); - } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); - swap_free(swap); - } - - __lru_cache_del(page); - UnlockPage(page); - - /* effectively free the page here */ - page_cache_release(page); - - if (--nr_pages) - continue; - break; } spin_unlock(&pagemap_lru_lock); - return nr_pages; + /* Return the number of pages moved to the inactive_clean list. */ + return cleaned_pages; } -/* - * This moves pages from the active list to - * the inactive list. +/** + * refill_inactive - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan * - * We move them the other way when we see the - * reference bit on the page. + * This function will scan a portion of the active list to find + * unused pages, those pages will then be moved to the inactive list. */ -static void refill_inactive(int nr_pages) +int refill_inactive(int priority) { - struct list_head * entry; + struct list_head * page_lru; + struct page * page; + int maxscan = nr_active_pages >> priority; + int nr_deactivated = 0; + /* Take the lock while messing with the list... */ spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { - struct page * page; + while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { + page = list_entry(page_lru, struct page, lru); - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (PageTestandClearReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageActive(page))) { + printk("VM: refill_inactive, wrong page on list.\n"); + list_del(page_lru); + nr_active_pages--; continue; } - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* + * Do aging on the pages. Every time a page is referenced, + * page->age gets incremented. If it wasn't referenced, we + * decrement page->age. 
The page gets moved to the inactive + * list when one of the following is true: + * - the page age reaches 0 + * - the object the page belongs to isn't in active use + * - the object the page belongs to is hogging the cache + */ + if (PageTestandClearReferenced(page)) { + age_page_up(page); + } else { + age_page_down(page); + } + + /* + * Don't deactivate pages from zones which have + * plenty inactive pages. + */ + if (unlikely(zone_inactive_plenty(page->zone) && + zone_free_plenty(page->zone))) { + goto skip_page; + } + + /* + * If the page age is 'hot' AND the object the page + * is in is still in use, we keep the page. Otherwise + * we move it to the inactive_dirty list. + */ + if (page->age && page_mapping_inuse(page)) { +skip_page: + list_del(page_lru); + list_add(page_lru, &active_list); + } else { + deactivate_page_nolock(page); + nr_deactivated++; + } + + /* Low latency reschedule point. */ + if (unlikely(current->need_resched)) { + spin_unlock(&pagemap_lru_lock); + __set_current_state(TASK_RUNNING); + schedule(); + if (!inactive_shortage()) + return 1; + spin_lock(&pagemap_lru_lock); + } } spin_unlock(&pagemap_lru_lock); + + return nr_deactivated; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/* + * Check if there are zones with a severe shortage of free pages, + * or if all zones have a minor shortage. + */ +int free_shortage(void) { - int chunk_size = nr_pages; - unsigned long ratio; + pg_data_t *pgdat; + unsigned int global_free = 0; + unsigned int global_target = freepages.high; - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + /* Are we low on free pages anywhere? */ + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones+ i; + unsigned int free; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + if (!zone->size) + continue; - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; + free = zone->free_pages; + free += zone->inactive_clean_pages; - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + /* Local shortage? */ + if (free < zone->pages_low) + return 1; - return nr_pages; + global_free += free; + } + pgdat = pgdat->node_next; + } while (pgdat); + + /* Global shortage? */ + return global_free < global_target; } -int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) +static inline unsigned int inactive_target(void) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + unsigned int mem; - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + mem = nr_active_pages; + mem += nr_inactive_dirty_pages; + mem += nr_inactive_clean_pages; - /* - * Hmm.. Cache shrink failed - time to kill something? - * Mhwahahhaha! This is the part I really like. Giggle. - */ - out_of_memory(); - return 0; + return mem / 4; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) +/* + * Are we low on inactive pages globally or in any zone? 
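+ * (The global target is freepages.high plus inactive_target(),
+ * i.e. roughly a quarter of all pages currently on the LRU lists;
+ * each zone is also checked against max(pages_high, need_balance).)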
+ */ +int inactive_shortage(void) { - zone_t * first_classzone; + pg_data_t *pgdat; + unsigned int global_target = freepages.high + inactive_target(); + unsigned int global_inactive = 0; - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; -} + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + unsigned int inactive, target; -static int kswapd_balance_pgdat(pg_data_t * pgdat) -{ - int need_more_balance = 0, i; - zone_t * zone; + if (!zone->size) + continue; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (unlikely(current->need_resched)) - schedule(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - continue; + inactive = zone->inactive_dirty_pages; + inactive += zone->inactive_clean_pages; + inactive += zone->free_pages; + + target = max((int) zone->pages_high, zone->need_balance); + /* Local shortage? */ + if (inactive < target) + return 1; + + global_inactive += inactive; } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } + pgdat = pgdat->node_next; + } while (pgdat); - return need_more_balance; + /* Global shortage? */ + return global_inactive < global_target; } -static void kswapd_balance(void) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) { - int need_more_balance; - pg_data_t * pgdat; + int ret = 0; - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); - } while (need_more_balance); -} + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. + */ + ret += page_launder(gfp_mask); + shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + shrink_icache_memory(1, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - zone_t * zone; - int i; + /* + * If needed, we move pages from the active list + * to the inactive list. + */ + if (inactive_shortage() || free_shortage()) + ret += refill_inactive(0); - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; - } + /* + * Reclaim unused slab cache memory. + */ + kmem_cache_reap(gfp_mask); - return 1; + /* + * Hmm.. Cache shrink failed - time to kill something? + * Mhwahahhaha! This is the part I really like. Giggle. + */ + if (!ret) + out_of_memory(); + + return ret; } -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - pgdat = pgdat_list; +/* + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. + * + * We refill the freelist in a bump from pages_min to pages_low + * in order to give the buddy allocator something to play with. 
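+ * (Only zones that have dropped below pages_min are refilled, one
+ * reclaim_page() call at a time, until they are back at pages_low
+ * or their inactive_clean list runs dry.)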
+ */ +static void refill_freelist(void) +{ + pg_data_t * pgdat = pgdat_list; + int i; do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + if (!zone->size || zone->free_pages >= zone->pages_min) + continue; - return 1; + while (zone->free_pages < zone->pages_low) { + struct page * page; + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } + } + pgdat = pgdat->node_next; + } while (pgdat); } /* @@ -702,7 +712,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -726,24 +735,65 @@ * Kswapd main loop. */ for (;;) { - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + static long recalc = 0; - mb(); - if (kswapd_can_sleep()) - schedule(); + /* + * We try to rebalance the VM either when we are short + * on free pages or when we have a shortage of inactive + * pages and are getting low on free pages. + */ + if (free_shortage() || (inactive_shortage() && !free_plenty())) + do_try_to_free_pages(GFP_KSWAPD); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + refill_freelist(); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + refill_inactive(DEF_PRIORITY); + } + + /* + * We go to sleep if either the free page shortage + * or the inactive page shortage is gone. We do this + * because: + * 1) we need no more free pages or + * 2) the inactive pages need to be flushed to disk, + * it wouldn't help to eat CPU time now ... + * + * We go to sleep for one second, but if it's needed + * we'll be woken up earlier... */ - kswapd_balance(); - run_task_queue(&tq_disk); + if (!free_shortage() || !inactive_shortage()) { + interruptible_sleep_on_timeout(&kswapd_wait, HZ); + } + } +} + +void wakeup_kswapd(void) +{ + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); +} + +/* + * Called by non-kswapd processes when they want more + * memory but are unable to sleep on kswapd because + * they might be holding some IO locks ... + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + + return ret; } static int __init kswapd_init(void) diff -urN linux-2.4.17-rc1-virgin/net/socket.c linux-2.4.17-rc1-wli3/net/socket.c --- linux-2.4.17-rc1-virgin/net/socket.c Fri Dec 14 06:04:18 2001 +++ linux-2.4.17-rc1-wli3/net/socket.c Fri Dec 14 02:44:44 2001 @@ -133,7 +133,7 @@ static struct net_proto_family *net_families[NPROTO]; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t net_family_lockct = ATOMIC_INIT(0); static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED;