diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/arch/i386/Kconfig 950-shpte/arch/i386/Kconfig
--- 900-mjb1/arch/i386/Kconfig	Thu Mar 27 22:15:28 2003
+++ 950-shpte/arch/i386/Kconfig	Sat Mar 29 07:53:14 2003
@@ -751,6 +751,15 @@ config 4K_STACK
 	  detection. It is much more reliable than the currently in-kernel
 	  version.
 
+config SHAREPTE
+	bool "Share 3rd-level pagetables between processes"
+	help
+	  Normally each address space has its own complete page table for all
+	  its mappings. This can mean many mappings of a set of shared data
+	  pages. With this option, the VM will attempt to share the bottom
+	  level of the page table between address spaces that are sharing data
+	  pages.
+
 config MATH_EMULATION
 	bool "Math emulation"
 	---help---
diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/arch/i386/kernel/vm86.c 950-shpte/arch/i386/kernel/vm86.c
--- 900-mjb1/arch/i386/kernel/vm86.c	Mon Mar 17 21:43:39 2003
+++ 950-shpte/arch/i386/kernel/vm86.c	Sat Mar 29 07:53:14 2003
@@ -41,6 +41,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -125,6 +127,7 @@ struct pt_regs * save_v86_state(struct k
 static void mark_screen_rdonly(struct task_struct * tsk)
 {
+	struct ptpage *ptepage;
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte, *mapped;
@@ -148,6 +151,8 @@ static void mark_screen_rdonly(struct ta
 		pmd_clear(pmd);
 		goto out;
 	}
+	ptepage = pmd_ptpage(*pmd);
+	pte_page_lock(ptepage);
 	pte = mapped = pte_offset_map(pmd, 0xA0000);
 	for (i = 0; i < 32; i++) {
 		if (pte_present(*pte))
@@ -155,6 +160,7 @@
 		pte++;
 	}
 	pte_unmap(mapped);
+	pte_page_unlock(ptepage);
 out:
 	spin_unlock(&tsk->mm->page_table_lock);
 	preempt_enable();
diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/arch/i386/mm/pgtable.c 950-shpte/arch/i386/mm/pgtable.c
--- 900-mjb1/arch/i386/mm/pgtable.c	Mon Mar 17 21:43:39 2003
+++ 950-shpte/arch/i386/mm/pgtable.c	Sat Mar 29 07:53:14 2003
@@ -146,24 +146,27 @@ pte_t *pte_alloc_one_kernel(struct mm_st
 	return pte;
 }
 
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+struct ptpage *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
 	int count = 0;
-	struct page *pte;
+	struct ptpage *pte;
 
 	do {
 #if CONFIG_HIGHPTE
-		pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
+		pte = (struct ptpage *)alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
 #else
-		pte = alloc_pages(GFP_KERNEL, 0);
+		pte = (struct ptpage *)alloc_pages(GFP_KERNEL, 0);
 #endif
-		if (pte)
-			clear_highpage(pte);
-		else {
+		if (pte) {
+			clear_highpage((struct page *)pte);
+			pte->mapcount = pte->swapcount= 0;
+			pte->pte.mmdirect = 0;
+			break;
+		} else {
 			current->state = TASK_UNINTERRUPTIBLE;
 			schedule_timeout(HZ);
 		}
-	} while (!pte && (count++ < 10));
+	} while (count++ < 10);
 	return pte;
 }
diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/fs/exec.c 950-shpte/fs/exec.c
--- 900-mjb1/fs/exec.c	Thu Mar 27 21:57:38 2003
+++ 950-shpte/fs/exec.c	Sat Mar 29 07:53:14 2003
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef CONFIG_KMOD
 #include
@@ -320,7 +321,7 @@ void put_dirty_page(struct task_struct *
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
 	pte_chain = page_add_rmap(page, pte, pte_chain);
 	pte_unmap(pte);
-	tsk->mm->rss++;
+	increment_rss(pmd_ptpage(*pmd));
 	spin_unlock(&tsk->mm->page_table_lock);
 
 	/* no need for flush_tlb */
diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/asm-generic/rmap.h 950-shpte/include/asm-generic/rmap.h
--- 900-mjb1/include/asm-generic/rmap.h	Thu Feb 13 11:08:13 2003
+++ 950-shpte/include/asm-generic/rmap.h	Sat Mar 29 07:53:14
2003 @@ -26,39 +26,12 @@ */ #include -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - static inline unsigned long ptep_to_address(pte_t * ptep) { - struct page * page = kmap_atomic_to_page(ptep); + struct ptpage * page = (struct ptpage *)kmap_atomic_to_page(ptep); unsigned long low_bits; low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; + return page->virtual + low_bits; } #if CONFIG_HIGHPTE @@ -86,5 +59,10 @@ static inline void rmap_ptep_unmap(pte_t return; } #endif + +extern void pgtable_add_rmap(struct ptpage * ptepage, struct mm_struct * mm, unsigned long address); +extern void pgtable_add_rmap_locked(struct ptpage * ptepage, struct mm_struct * mm, unsigned long address); +extern void pgtable_remove_rmap(struct ptpage * ptepage, struct mm_struct *mm); +extern void pgtable_remove_rmap_locked(struct ptpage * ptepage, struct mm_struct *mm); #endif /* _GENERIC_RMAP_H */ diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/asm-generic/tlb.h 950-shpte/include/asm-generic/tlb.h --- 900-mjb1/include/asm-generic/tlb.h Wed Mar 26 22:54:36 2003 +++ 950-shpte/include/asm-generic/tlb.h Sat Mar 29 07:53:14 2003 @@ -85,13 +85,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, un static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - int freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - int rss = mm->rss; - - if (rss < freed) - freed = rss; - mm->rss = rss - freed; tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/asm-i386/pgalloc.h 950-shpte/include/asm-i386/pgalloc.h --- 900-mjb1/include/asm-i386/pgalloc.h Thu Feb 13 11:08:13 2003 +++ 950-shpte/include/asm-i386/pgalloc.h Sat Mar 29 07:53:14 2003 @@ -10,10 +10,10 @@ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct ptpage *pte) { set_pmd(pmd, __pmd(_PAGE_TABLE + - ((unsigned long long)page_to_pfn(pte) << + ((unsigned long long)page_to_pfn((struct page *)pte) << (unsigned long long) PAGE_SHIFT))); } /* @@ -24,20 +24,20 @@ pgd_t *pgd_alloc(struct mm_struct *); void pgd_free(pgd_t *pgd); pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -struct page *pte_alloc_one(struct mm_struct *, unsigned long); +struct ptpage *pte_alloc_one(struct mm_struct *, unsigned long); static inline void pte_free_kernel(pte_t *pte) { free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) +static inline void pte_free(struct ptpage *pte) { - __free_page(pte); + __free_page((struct page *)pte); } -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pte_free_tlb(tlb,pte) 
tlb_remove_page((tlb),((struct page *)pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/asm-i386/pgtable.h 950-shpte/include/asm-i386/pgtable.h --- 900-mjb1/include/asm-i386/pgtable.h Thu Mar 27 22:12:25 2003 +++ 950-shpte/include/asm-i386/pgtable.h Sat Mar 29 07:53:14 2003 @@ -115,6 +115,7 @@ void pgtable_cache_init(void); #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_TABLE_RDONLY (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -123,6 +124,10 @@ void pgtable_cache_init(void); #define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PMD_NONE __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED) +#define PMD_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) +#define PMD_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) + #define _PAGE_KERNEL \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) @@ -159,6 +164,15 @@ extern unsigned long __PAGE_KERNEL; #define __S110 PAGE_SHARED #define __S111 PAGE_SHARED +#define __PMD000 PMD_NONE +#define __PMD001 PMD_READONLY +#define __PMD010 PMD_SHARED +#define __PMD011 PMD_SHARED +#define __PMD100 PMD_READONLY +#define __PMD101 PMD_READONLY +#define __PMD110 PMD_SHARED +#define __PMD111 PMD_SHARED + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are @@ -175,8 +189,8 @@ extern unsigned long pg0[1024]; #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) - +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_RW)) != \ + (_KERNPG_TABLE & ~_PAGE_RW)) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -201,6 +215,9 @@ static inline pte_t pte_mkexec(pte_t pte static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } +static inline int pmd_write(pmd_t pmd) { return (pmd).pmd & _PAGE_RW; } +static inline pmd_t pmd_wrprotect(pmd_t pmd) { (pmd).pmd &= ~_PAGE_RW; return pmd; } +static inline pmd_t pmd_mkwrite(pmd_t pmd) { (pmd).pmd |= _PAGE_RW; return pmd; } static inline int ptep_test_and_clear_dirty(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } static inline int ptep_test_and_clear_young(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } @@ -222,6 +239,13 @@ static inline pte_t pte_modify(pte_t pte return pte; } +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmd.pmd &= ~(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER); + pmd.pmd |= pgprot_val(newprot); + return pmd; +} + #define page_pte(page) page_pte_prot(page, __pgprot(0)) #define pmd_page_kernel(pmd) \ @@ -231,6 +255,8 @@ static inline pte_t pte_modify(pte_t pte #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #endif /* !CONFIG_DISCONTIGMEM */ +#define pmd_ptpage(pmd) ((struct ptpage 
*)pmd_page(pmd)) + #define pmd_large(pmd) \ ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) @@ -279,12 +305,20 @@ static inline pte_t pte_modify(pte_t pte ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) #define pte_offset_map_nested(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) +#define pte_page_map(__page, address) \ + ((pte_t *)kmap_atomic(__page,KM_PTE0) + pte_index(address)) +#define pte_page_map_nested(__page, address) \ + ((pte_t *)kmap_atomic(__page,KM_PTE1) + pte_index(address)) #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_page_map(__page, address) \ + ((pte_t *)page_address(__page) + pte_index(address)) +#define pte_page_map_nested(__page, address) \ + ((pte_t *)page_address(__page) + pte_index(address)) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) #endif diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/linux/mm.h 950-shpte/include/linux/mm.h --- 900-mjb1/include/linux/mm.h Thu Mar 27 21:57:40 2003 +++ 950-shpte/include/linux/mm.h Sat Mar 29 07:53:14 2003 @@ -105,6 +105,7 @@ struct vm_area_struct { #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* VM contains nonlinear mappings */ #ifdef CONFIG_STACK_GROWSUP #define VM_STACK_FLAGS (VM_GROWSUP | VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT) @@ -124,7 +125,6 @@ struct vm_area_struct { */ extern pgprot_t protection_map[16]; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -137,8 +137,9 @@ struct vm_operations_struct { int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ +/* forward declaration; pte_chain and mm_chain are meant to be internal to rmap.c */ struct pte_chain; +struct mm_chain; struct mmu_gather; struct inode; @@ -196,6 +197,26 @@ struct page { */ #include +struct ptpage { + unsigned long flags; /* atomic flags, some possibly + updated asynchronously */ + atomic_t count; /* Usage count, see below. */ + unsigned long virtual; /* virtual address this page maps */ + unsigned short mapcount; /* Number of pages mapped to this page */ + unsigned short swapcount; /* Number of swap pages in this page */ + union { + struct mm_chain *mmchain;/* Reverse mm_struct mapping pointer */ + struct mm_struct *mmdirect; + } pte; + struct semaphore sem; +}; + +static inline void clear_pte_page(struct ptpage *ptepage) +{ + ClearPagePtepage(ptepage); + memset(&ptepage->sem, 0, sizeof(struct semaphore)); +} + /* * Methods to modify the page usage count. 
* @@ -400,14 +421,19 @@ struct file *shmem_file_setup(char * nam void shmem_lock(struct file * file, int lock); int shmem_zero_setup(struct vm_area_struct *); +void increment_rss(struct ptpage *ptpage); +void decrement_rss(struct ptpage *ptpage); +void increment_swapcount(struct ptpage *ptpage); +void decrement_swapcount(struct ptpage *ptpage); + void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted); -void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, +void unmap_page_range(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long address, unsigned long size); -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr); +void unmap_all_pages(struct mm_struct *mm); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/linux/page-flags.h 950-shpte/include/linux/page-flags.h --- 900-mjb1/include/linux/page-flags.h Thu Mar 27 21:57:38 2003 +++ 950-shpte/include/linux/page-flags.h Sat Mar 29 07:53:14 2003 @@ -74,7 +74,8 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ -#define PG_anon 20 /* Anonymous page */ +#define PG_ptepage 20 /* This page is a pte page */ +#define PG_anon 22 /* Anonymous page */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -246,6 +247,12 @@ extern void get_full_page_state(struct p #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) + +#define PagePtepage(page) test_bit(PG_ptepage, &(page)->flags) +#define SetPagePtepage(page) set_bit(PG_ptepage, &(page)->flags) +#define TestSetPagePtepage(page) test_and_set_bit(PG_ptepage, &(page)->flags) +#define ClearPagePtepage(page) clear_bit(PG_ptepage, &(page)->flags) +#define TestClearPagePtepage(page) test_and_clear_bit(PG_ptepage, &(page)->flags) #define PageReclaim(page) test_bit(PG_reclaim, &(page)->flags) #define SetPageReclaim(page) set_bit(PG_reclaim, &(page)->flags) diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/linux/ptshare.h 950-shpte/include/linux/ptshare.h --- 900-mjb1/include/linux/ptshare.h Wed Dec 31 16:00:00 1969 +++ 950-shpte/include/linux/ptshare.h Sat Mar 29 07:53:14 2003 @@ -0,0 +1,159 @@ +#ifndef _LINUX_PTSHARE_H +#define _LINUX_PTSHARE_H + +#include + +#include +#include + +/* + * Lock primitives for the pte page. They're aliased to the + * pte chain lock in struct page, since pte pages can't have + * pte chains. + */ + + +static inline void pte_page_lock(struct ptpage *ptepage) +{ + pte_chain_lock((struct page *)ptepage); +} + +static inline int pte_page_trylock(struct ptpage *ptepage) +{ + return pte_chain_trylock((struct page *)ptepage); +} + +static inline void pte_page_unlock(struct ptpage *ptepage) +{ + pte_chain_unlock((struct page *)ptepage); +} + +/* + * Provide a primitive for taking a pmd entry and using it to + * get the corresponding pte_page_lock. 
This function takes + * the page_table_lock briefly to freeze the pmd entry, so it can + * only be used in places where the page_table_lock is not held. + * The pte page pointer is returned, since most callers will want it + * and it's handy. + */ + +static inline struct ptpage *pte_page_lock_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct ptpage *ptepage; + + spin_lock(&mm->page_table_lock); + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + spin_unlock(&mm->page_table_lock); + return ptepage; +} + +/* + * Functions to handle shared page tables + */ + +#ifdef CONFIG_SHAREPTE + +int zap_shared_range(struct mmu_gather **tlb, pmd_t *pmd, unsigned long address, + unsigned long end); +int zap_shared_pmd(struct mm_struct *mm, pmd_t *pmd); +pte_t *pte_alloc_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +pte_t *pte_map_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +int share_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd); +void unshare_page_range(struct mm_struct *mm, unsigned long address, + unsigned long len); +pte_t *mprotect_shared_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long end); +void mremap_unshare(struct mm_struct *mm, pmd_t *src_pmd, pmd_t *dst_pmd, + unsigned long src_addr, unsigned long dst_addr); +pte_t *pte_fault_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access); +int fork_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd); + +#else + +static inline void unshare_page_range(struct mm_struct *mm, + unsigned long address, unsigned long len) +{ + return; +} + +static inline int fork_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) +{ + return copy_page_range(dst, src, vma); +} + + +static inline int zap_shared_range(struct mmu_gather **tlb, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + pte_page_lock(pmd_ptpage(*pmd)); + return 1; +} + +static inline int zap_shared_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + return 1; +} + +static inline pte_t *pte_alloc_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + + return pte; +} + +static inline pte_t *pte_map_unshare(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + pte_page_lock(pmd_ptpage(*pmd)); + pte = pte_offset_map(pmd, address); + } else + pte = NULL; + + return pte; +} + +static inline pte_t * +mprotect_shared_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + pte_page_lock(pmd_ptpage(*pmd)); + return pte_offset_map(pmd, address); +} + +static inline void +mremap_unshare(struct mm_struct *mm, pmd_t *src_pmd, pmd_t *dst_pmd, + unsigned long src_addr, unsigned long dst_addr) +{ + return; +} + +static inline pte_t * +pte_fault_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + + return pte; +} +#endif /* CONFIG_SHARE_PTE */ + +#endif diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/linux/rmap-locking.h 950-shpte/include/linux/rmap-locking.h --- 900-mjb1/include/linux/rmap-locking.h Thu Jan 9 19:16:14 2003 +++ 
950-shpte/include/linux/rmap-locking.h Sat Mar 29 07:53:14 2003 @@ -28,6 +28,18 @@ static inline void pte_chain_lock(struct #endif } +static inline int pte_chain_trylock(struct page *page) +{ + preempt_disable(); +#ifdef CONFIG_SMP + if (test_and_set_bit(PG_chainlock, &page->flags)) { + preempt_enable(); + return 0; + } +#endif + return 1; +} + static inline void pte_chain_unlock(struct page *page) { #ifdef CONFIG_SMP diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/linux/sched.h 950-shpte/include/linux/sched.h --- 900-mjb1/include/linux/sched.h Thu Mar 27 22:18:13 2003 +++ 950-shpte/include/linux/sched.h Sat Mar 29 07:53:14 2003 @@ -192,6 +192,7 @@ struct mm_struct { struct vm_area_struct * mmap_cache; /* last find_vma result */ unsigned long free_area_cache; /* first hole */ pgd_t * pgd; + atomic_t ptepages; /* Number of pte pages allocated */ atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ int map_count; /* number of VMAs */ diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/include/linux/swapops.h 950-shpte/include/linux/swapops.h --- 900-mjb1/include/linux/swapops.h Wed Mar 26 22:54:38 2003 +++ 950-shpte/include/linux/swapops.h Sat Mar 29 10:20:23 2003 @@ -1,3 +1,5 @@ +#include + /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/kernel/fork.c 950-shpte/kernel/fork.c --- 900-mjb1/kernel/fork.c Thu Mar 27 22:09:08 2003 +++ 950-shpte/kernel/fork.c Sat Mar 29 07:53:14 2003 @@ -30,6 +30,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -251,6 +255,7 @@ static inline int dup_mmap(struct mm_str struct vm_area_struct * mpnt, *tmp, **pprev; int retval; unsigned long charge = 0; + pmd_t *prev_pmd = 0; down_write(&oldmm->mmap_sem); flush_cache_mm(current->mm); @@ -260,6 +265,7 @@ static inline int dup_mmap(struct mm_str mm->free_area_cache = TASK_UNMAPPED_BASE; mm->map_count = 0; mm->rss = 0; + atomic_set(&mm->ptepages, 0); mm->cpu_vm_mask = 0; pprev = &mm->mmap; @@ -314,7 +320,7 @@ static inline int dup_mmap(struct mm_str *pprev = tmp; pprev = &tmp->vm_next; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); + retval = fork_page_range(mm, current->mm, tmp, &prev_pmd); spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/Makefile 950-shpte/mm/Makefile --- 900-mjb1/mm/Makefile Thu Mar 27 23:23:43 2003 +++ 950-shpte/mm/Makefile Sat Mar 29 07:53:14 2003 @@ -12,3 +12,5 @@ obj-y := bootmem.o fadvise.o filemap.o slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +obj-$(CONFIG_SHAREPTE) += ptshare.o diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/fremap.c 950-shpte/mm/fremap.c --- 900-mjb1/mm/fremap.c Wed Mar 26 22:54:38 2003 +++ 950-shpte/mm/fremap.c Sat Mar 29 07:53:14 2003 @@ -13,11 +13,13 @@ #include #include #include +#include #include #include #include -static inline int zap_pte(struct mm_struct *mm, pte_t *ptep) +static inline int +zap_pte(struct mm_struct *mm, struct ptpage *ptepage, pte_t *ptep) { pte_t pte = *ptep; @@ -34,7 +36,7 @@ static inline int zap_pte(struct mm_stru set_page_dirty(page); page_remove_rmap(page, ptep); page_cache_release(page); - mm->rss--; + decrement_rss(ptepage); } } return 1; @@ -42,6 +44,7 @@ static inline int zap_pte(struct 
mm_stru if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(ptep); + decrement_swapcount(ptepage); return 0; } } @@ -54,6 +57,7 @@ int install_page(struct mm_struct *mm, s unsigned long addr, struct page *page, pgprot_t prot) { int err = -ENOMEM, flush; + struct ptpage *ptepage; pte_t *pte, entry; pgd_t *pgd; pmd_t *pmd; @@ -62,29 +66,31 @@ int install_page(struct mm_struct *mm, s pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; - pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, addr); pmd = pmd_alloc(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_unshare(mm, pmd, addr); if (!pte) goto err_unlock; - flush = zap_pte(mm, pte); + ptepage = pmd_ptpage(*pmd); + flush = zap_pte(mm, ptepage, pte); - mm->rss++; flush_page_to_ram(page); flush_icache_page(vma, page); entry = mk_pte(page, prot); set_pte(pte, entry); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); + increment_rss(ptepage); if (flush) flush_tlb_page(vma, addr); + pte_page_unlock(ptepage); spin_unlock(&mm->page_table_lock); pte_chain_free(pte_chain); return 0; @@ -151,9 +157,15 @@ long sys_remap_file_pages(unsigned long if (vma && (vma->vm_flags & VM_SHARED) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && - end <= vma->vm_end) + end <= vma->vm_end) { + vma->vm_flags |= VM_NONLINEAR; + + /* Unshare all the pte pages in the entire vma range */ + unshare_page_range(mm, vma->vm_start, vma->vm_end); + err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot, pgoff, flags & MAP_NONBLOCK); + } up_read(&mm->mmap_sem); diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/memory.c 950-shpte/mm/memory.c --- 900-mjb1/mm/memory.c Thu Mar 27 21:57:38 2003 +++ 950-shpte/mm/memory.c Sat Mar 29 07:53:14 2003 @@ -36,6 +36,20 @@ * (Gerhard.Wichert@pdb.siemens.de) */ +/* + * A note on locking of the page table structure: + * + * The top level lock that protects the page table is the + * mm->page_table_lock. This lock protects the pgd and pmd layer. + * However, with the advent of shared pte pages, this lock is not + * sufficient. The pte layer is now protected by the pte_page_lock, + * set in the struct page of the pte page. Note that with this + * locking scheme, once the pgd and pmd layers have been set in the + * page fault path and the pte_page_lock has been taken, the + * page_table_lock can be released. + * + */ + #include #include #include @@ -45,6 +59,7 @@ #include #include #include +#include #include #include @@ -78,79 +93,10 @@ static inline void copy_cow_page(struct copy_user_highpage(to, from, address); } -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. - */ -static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir) -{ - struct page *page; - - if (pmd_none(*dir)) - return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return; - } - page = pmd_page(*dir); - pmd_clear(dir); - pgtable_remove_rmap(page); - pte_free_tlb(tlb, page); -} - -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) -{ - pmd_t * pmd, * md, * emd; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, 0); - pgd_clear(dir); - /* - * Beware if changing the loop below. It once used int j, - * for (j = 0; j < PTRS_PER_PMD; j++) - * free_one_pmd(pmd+j); - * but some older i386 compilers (e.g. 
egcs-2.91.66, gcc-2.95.3) - * terminated the loop with a _signed_ address comparison - * using "jle", when configured for HIGHMEM64GB (X86_PAE). - * If also configured for 3GB of kernel virtual address space, - * if page at physical 0x3ffff000 virtual 0x7ffff000 is used as - * a pmd, when that mm exits the loop goes on to free "entries" - * found at 0x80000000 onwards. The loop below compiles instead - * to be terminated by unsigned address comparison using "jb". - */ - for (md = pmd, emd = pmd + PTRS_PER_PMD; md < emd; md++) - free_one_pmd(tlb,md); - pmd_free_tlb(tlb, pmd); -} - -/* - * This function clears all user-level page tables of a process - this - * is needed by execve(), so that old pages aren't in the way. - * - * Must be called with pagetable lock held. - */ -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) -{ - pgd_t * page_dir = tlb->mm->pgd; - - page_dir += first; - do { - free_one_pgd(tlb, page_dir); - page_dir++; - } while (--nr); -} - pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { if (!pmd_present(*pmd)) { - struct page *new; + struct ptpage *new; spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); @@ -166,8 +112,12 @@ pte_t * pte_alloc_map(struct mm_struct * pte_free(new); goto out; } + SetPagePtepage(new); pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); + atomic_inc(&mm->ptepages); + inc_page_state(nr_page_table_pages); + init_MUTEX(&new->sem); } out: return pte_offset_map(pmd, address); @@ -192,7 +142,6 @@ pte_t * pte_alloc_kernel(struct mm_struc pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -261,6 +210,7 @@ skip_copy_pmd_range: address = (address goto nomem; do { + struct ptpage *src_page, *dst_page; pte_t * src_pte, * dst_pte; /* copy_pte_range */ @@ -280,7 +230,10 @@ skip_copy_pte_range: dst_pte = pte_alloc_map(dst, dst_pmd, address); if (!dst_pte) goto nomem; - spin_lock(&src->page_table_lock); + spin_lock(&src->page_table_lock); + src_page = pmd_ptpage(*src_pmd); + dst_page = pmd_ptpage(*dst_pmd); + pte_page_lock(src_page); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -296,6 +249,7 @@ skip_copy_pte_range: if (!pte_file(pte)) swap_duplicate(pte_to_swp_entry(pte)); set_pte(dst_pte, pte); + increment_swapcount(dst_page); goto cont_copy_pte_range_noset; } pfn = pte_pfn(pte); @@ -329,7 +283,7 @@ skip_copy_pte_range: pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst->rss++; + increment_rss(dst_page); cont_copy_pte_range: set_pte(dst_pte, pte); @@ -345,6 +299,7 @@ cont_copy_pte_range: * pte_chain allocation failed, and we need to * run page reclaim. 
*/ + pte_page_unlock(src_page); pte_unmap_nested(src_pte); pte_unmap(dst_pte); spin_unlock(&src->page_table_lock); @@ -354,12 +309,15 @@ cont_copy_pte_range: if (!pte_chain) goto nomem; spin_lock(&src->page_table_lock); + src_page = pmd_ptpage(*src_pmd); + pte_page_lock(src_page); dst_pte = pte_offset_map(dst_pmd, address); src_pte = pte_offset_map_nested(src_pmd, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pte_page_unlock(src_page); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -367,6 +325,7 @@ cont_copy_pte_range_noset: src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); + pte_page_unlock(src_page); pte_unmap_nested(src_pte-1); pte_unmap(dst_pte-1); spin_unlock(&src->page_table_lock); @@ -392,23 +351,17 @@ zap_pte_range(struct mmu_gather *tlb, pm { unsigned long offset; pte_t *ptep; + struct ptpage *ptepage = pmd_ptpage(*pmd); - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - ptep = pte_offset_map(pmd, address); offset = address & ~PMD_MASK; if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; size &= PAGE_MASK; + + ptep = pte_offset_map(pmd, address); + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { pte_t pte = *ptep; - if (pte_none(pte)) - continue; if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); @@ -424,20 +377,32 @@ zap_pte_range(struct mmu_gather *tlb, pm mark_page_accessed(page); tlb->freed++; page_remove_rmap(page, ptep); + decrement_rss(ptepage); tlb_remove_page(tlb, page); } } - } else { - if (!pte_file(pte)) + } else if (!pte_none(pte)) { + if (!pte_file(pte)) { free_swap_and_cache(pte_to_swp_entry(pte)); + decrement_swapcount(ptepage); + } pte_clear(ptep); } + if (!ptepage->mapcount && !ptepage->swapcount) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, tlb->mm); + atomic_dec(&tlb->mm->ptepages); + dec_page_state(nr_page_table_pages); + clear_pte_page(ptepage); + pte_free_tlb(tlb, ptepage); + break; + } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather **tlb, pgd_t * dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -455,13 +420,27 @@ zap_pmd_range(struct mmu_gather *tlb, pg if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + if (pmd_none(*pmd)) + goto skip_pmd; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto skip_pmd; + } + + if (zap_shared_range(tlb, pmd, address, end)) { + struct ptpage *ptepage = pmd_ptpage(*pmd); + zap_pte_range(*tlb, pmd, address, end - address); + pte_page_unlock(ptepage); + } +skip_pmd: address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); } -void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, +void +unmap_page_range(struct mmu_gather **tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end) { pgd_t * dir; @@ -474,13 +453,13 @@ void unmap_page_range(struct mmu_gather BUG_ON(address >= end); dir = pgd_offset(vma->vm_mm, address); - tlb_start_vma(tlb, vma); + tlb_start_vma(*tlb, vma); do { zap_pmd_range(tlb, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - tlb_end_vma(tlb, vma); + tlb_end_vma(*tlb, vma); } /* Dispose of an entire struct mmu_gather per rescheduling point */ @@ -570,7 +549,7 @@ int unmap_vmas(struct mmu_gather **tlbp, 
tlb_start_valid = 1; } - unmap_page_range(*tlbp, vma, start, start + block); + unmap_page_range(tlbp, vma, start, start + block); start += block; zap_bytes -= block; if ((long)zap_bytes > 0) @@ -620,6 +599,179 @@ void zap_page_range(struct vm_area_struc spin_unlock(&mm->page_table_lock); } +/** + * unmap_all_pages - unmap all the pages for an mm_struct + * @mm: the mm_struct to unmap + * + * This function is only called when an mm_struct is about to be + * released. It walks through all vmas and removes their pages + * from the page table. It understands shared pte pages and will + * decrement the count appropriately. + */ +void unmap_all_pages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + struct ptpage *ptepage; + struct page *pagevec[16]; + int npages = 0; + unsigned long address; + unsigned long vm_end, pmd_end, pte_end; + + lru_add_drain(); + + vma = mm->mmap; + + /* On the off chance that the first vma is hugetlb... */ + if (is_vm_hugetlb_page(vma)) { + unmap_hugepage_range(vma, vma->vm_start, vma->vm_end); + vma = vma->vm_next; + mm->map_count--; + } + + for (;;) { + if (!vma) + goto out; + + address = vma->vm_start; +next_vma: + vm_end = vma->vm_end; + mm->map_count--; + /* + * Advance the vma pointer to the next vma. + * To facilitate coalescing adjacent vmas, the + * pointer always points to the next one + * beyond the range we're currently working + * on, which means vma will be null on the + * last iteration. + */ + vma = vma->vm_next; + if (vma) { + /* + * Go ahead and include hugetlb vmas + * in the range we process. The pmd + * entry will be cleared by close, so + * we'll just skip over them. This is + * easier than trying to avoid them. + */ + if (is_vm_hugetlb_page(vma)) + unmap_hugepage_range(vma, vma->vm_start, vma->vm_end); + + /* + * Coalesce adjacent vmas and process + * them all in one iteration. 
+ */ + if (vma->vm_start == vm_end) { + goto next_vma; + } + } + pgd = pgd_offset(mm, address); + do { + if (pgd_none(*pgd)) + goto skip_pgd; + + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); +skip_pgd: + address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (address > vm_end) + address = vm_end; + goto next_pgd; + } + pmd = pmd_offset(pgd, address); + if (vm_end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + pmd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + else + pmd_end = vm_end; + + do { + if (pmd_none(*pmd)) + goto skip_pmd; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); +skip_pmd: + address = (address + PMD_SIZE) & PMD_MASK; + if (address > pmd_end) + address = pmd_end; + goto next_pmd; + } + if (!zap_shared_pmd(mm, pmd)) + goto skip_pmd; + + ptepage = pmd_ptpage(*pmd); + pte = pte_offset_map(pmd, address); + if (pmd_end > ((address + PMD_SIZE) & PMD_MASK)) + pte_end = (address + PMD_SIZE) & PMD_MASK; + else + pte_end = pmd_end; + do { + pte_t pteval = *pte; + + if (pte_none(pteval)) + goto next_pte; + if (pte_present(pteval)) { + unsigned long pfn = pte_pfn(pteval); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) { + if (pte_dirty(pteval)) + set_page_dirty(page); + if (page->mapping && + pte_young(pteval) && + !PageSwapCache(page)) + mark_page_accessed(page); + page_remove_rmap(page, pte); + decrement_rss(ptepage); + pagevec[npages++] = page; + if (npages == 16) { + free_pages_and_swap_cache(pagevec, npages); + npages = 0; + } + + } + } + } else { + free_swap_and_cache(pte_to_swp_entry(pteval)); + decrement_swapcount(ptepage); + } + pte_clear(pte); + if (!ptepage->mapcount && !ptepage->swapcount) { + pmd_clear(pmd); + pgtable_remove_rmap(ptepage, mm); + atomic_dec(&mm->ptepages); + dec_page_state(nr_page_table_pages); + clear_pte_page(ptepage); + pte_free(ptepage); + address = pte_end; + break; + } +next_pte: + address += PAGE_SIZE; + pte++; + } while (address < pte_end); + pte_unmap(pte-1); +next_pmd: + pmd++; + } while (address < pmd_end); +next_pgd: + pgd++; + } while (address < vm_end); + } + +out: + if (npages) + free_pages_and_swap_cache(pagevec, npages); + + if (atomic_read(&mm->ptepages) != 0) + BUG(); + + flush_tlb_mm(mm); +} + /* * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. @@ -863,11 +1015,14 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte; + pte = pte_alloc_unshare(mm, pmd, base + address); if (!pte) return -ENOMEM; + remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); + pte_page_unlock(pmd_ptpage(*pmd)); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -957,6 +1112,7 @@ static int do_wp_page(struct mm_struct * unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) { struct page *old_page, *new_page; + struct ptpage *ptepage = pmd_ptpage(*pmd); unsigned long pfn = pte_pfn(pte); struct pte_chain *pte_chain = NULL; int ret; @@ -992,7 +1148,7 @@ static int do_wp_page(struct mm_struct * * Ok, we need to copy. Oh, well.. 
*/ page_cache_get(old_page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) @@ -1005,11 +1161,11 @@ static int do_wp_page(struct mm_struct * /* * Re-check the pte - we dropped the lock */ - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { - if (PageReserved(old_page)) - ++mm->rss; + if (PageReserved(old_page)) + increment_rss(ptepage); page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); SetPageAnon(new_page); @@ -1030,7 +1186,7 @@ no_mem: oom: ret = VM_FAULT_OOM; out: - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); pte_chain_free(pte_chain); return ret; } @@ -1152,13 +1308,14 @@ static int do_swap_page(struct mm_struct pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) { struct page *page; + struct ptpage *ptepage = pmd_ptpage(*pmd); swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; struct pte_chain *pte_chain = NULL; pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry); @@ -1168,14 +1325,14 @@ static int do_swap_page(struct mm_struct * Back out if somebody else faulted in this pte while * we released the page table lock. */ - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); goto out; } @@ -1196,11 +1353,11 @@ static int do_swap_page(struct mm_struct * Back out if somebody else faulted in this pte while we * released the page table lock. */ - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); unlock_page(page); page_cache_release(page); ret = VM_FAULT_MINOR; @@ -1213,7 +1370,6 @@ static int do_swap_page(struct mm_struct if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); @@ -1224,11 +1380,13 @@ static int do_swap_page(struct mm_struct set_pte(page_table, pte); SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); + increment_rss(ptepage); + decrement_swapcount(ptepage); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); out: pte_chain_free(pte_chain); return ret; @@ -1246,20 +1404,10 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; + struct ptpage *ptepage = pmd_ptpage(*pmd); + struct pte_chain *pte_chain = NULL; int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. 
*/ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1267,44 +1415,48 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); + + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + ret = VM_FAULT_OOM; + goto out; + } page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } clear_user_highpage(page, addr); - spin_lock(&mm->page_table_lock); + ptepage = pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { pte_unmap(page_table); page_cache_release(page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MINOR; goto out; } - mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); SetPageAnon(page); + pte_chain = page_add_rmap(page, page_table, pte_chain); + increment_rss(ptepage); } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MINOR; - goto out; -no_mem: - ret = VM_FAULT_OOM; out: pte_chain_free(pte_chain); return ret; @@ -1327,6 +1479,7 @@ do_no_page(struct mm_struct *mm, struct unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) { struct page * new_page; + struct ptpage *ptepage = pmd_ptpage(*pmd); pte_t entry; struct pte_chain *pte_chain; int ret; @@ -1335,7 +1488,7 @@ do_no_page(struct mm_struct *mm, struct return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); @@ -1365,7 +1518,7 @@ do_no_page(struct mm_struct *mm, struct new_page = page; } - spin_lock(&mm->page_table_lock); + pte_page_lock_pmd(mm, pmd); page_table = pte_offset_map(pmd, address); /* @@ -1380,7 +1533,6 @@ do_no_page(struct mm_struct *mm, struct */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - ++mm->rss; flush_page_to_ram(new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); @@ -1389,18 +1541,19 @@ do_no_page(struct mm_struct *mm, struct set_pte(page_table, entry); pte_chain = page_add_rmap(new_page, page_table, pte_chain); pte_unmap(page_table); + increment_rss(ptepage); } else { /* One of our sibling threads was faster, back out. 
*/ pte_unmap(page_table); page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MINOR; goto out; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(ptepage); ret = VM_FAULT_MAJOR; goto out; oom: @@ -1495,7 +1648,7 @@ static inline int handle_pte_fault(struc entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_page_unlock(pmd_ptpage(*pmd)); return VM_FAULT_MINOR; } @@ -1524,9 +1677,13 @@ int handle_mm_fault(struct mm_struct *mm pmd = pmd_alloc(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); - if (pte) + pte_t * pte; + + pte = pte_fault_alloc(mm, vma, pmd, address, write_access); + if (pte) { + spin_unlock(&mm->page_table_lock); return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + } } spin_unlock(&mm->page_table_lock); return VM_FAULT_OOM; diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/mmap.c 950-shpte/mm/mmap.c --- 900-mjb1/mm/mmap.c Wed Mar 26 22:54:38 2003 +++ 950-shpte/mm/mmap.c Sat Mar 29 07:53:14 2003 @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -606,6 +608,7 @@ munmap_back: return -ENOMEM; goto munmap_back; } + unshare_page_range(mm, addr, len); /* Check against address space limit. */ if ((mm->total_vm << PAGE_SHIFT) + len @@ -1015,69 +1018,6 @@ find_extend_vma(struct mm_struct * mm, u } #endif -/* - * Try to free as many page directory entries as we can, - * without having to work very hard at actually scanning - * the page tables themselves. - * - * Right now we try to free page tables if we have a nice - * PGDIR-aligned area that got free'd up. We could be more - * granular if we want to, but this is fast and simple, - * and covers the bad cases. - * - * "prev", if it exists, points to a vma before the one - * we just free'd - but there's no telling how much before. - */ -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, - unsigned long start, unsigned long end) -{ - unsigned long first = start & PGDIR_MASK; - unsigned long last = end + PGDIR_SIZE - 1; - unsigned long start_index, end_index; - struct mm_struct *mm = tlb->mm; - - if (!prev) { - prev = mm->mmap; - if (!prev) - goto no_mmaps; - if (prev->vm_end > start) { - if (last > prev->vm_start) - last = prev->vm_start; - goto no_mmaps; - } - } - for (;;) { - struct vm_area_struct *next = prev->vm_next; - - if (next) { - if (next->vm_start < start) { - prev = next; - continue; - } - if (last > next->vm_start) - last = next->vm_start; - } - if (prev->vm_end > first) - first = prev->vm_end + PGDIR_SIZE - 1; - break; - } -no_mmaps: - if (last < first) /* for arches with discontiguous pgd indices */ - return; - /* - * If the PGD bits are not consecutive in the virtual address, the - * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. - */ - start_index = pgd_index(first); - if (start_index < FIRST_USER_PGD_NR) - start_index = FIRST_USER_PGD_NR; - end_index = pgd_index(last); - if (end_index > start_index) { - clear_page_tables(tlb, start_index, end_index - start_index); - flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); - } -} - /* Normal function to fix up a mapping * This function is the default for when an area has no specific * function. This may be used as part of a more specific routine. 
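(Aside, not part of the patch: the free_pgtables()/clear_page_tables() pass removed above becomes unnecessary because a pte page is now freed as soon as its own counters say it is empty. A minimal sketch of that invariant, assuming only the mapcount/swapcount fields this patch adds to struct ptpage; the helper name is made up for illustration:)

	static inline int example_pte_page_is_empty(struct ptpage *ptepage)
	{
		/* kept in step by increment_rss()/decrement_rss() and
		 * increment_swapcount()/decrement_swapcount() */
		return ptepage->mapcount == 0 && ptepage->swapcount == 0;
	}

(zap_pte_range() and unmap_all_pages() make the equivalent check inline, then pmd_clear() the entry and free the pte page, so no later PGDIR-granular sweep is required.)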
@@ -1143,7 +1083,6 @@ static void unmap_region(struct mm_struc tlb = tlb_gather_mmu(mm, 0); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted); vm_unacct_memory(nr_accounted); - free_pgtables(tlb, prev, start, end); tlb_finish_mmu(tlb, start, end); } @@ -1402,25 +1341,16 @@ void build_mmap_rb(struct mm_struct * mm /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) { - struct mmu_gather *tlb; struct vm_area_struct *vma; - unsigned long nr_accounted = 0; profile_exit_mmap(mm); lru_add_drain(); - spin_lock(&mm->page_table_lock); - - tlb = tlb_gather_mmu(mm, 1); flush_cache_mm(mm); - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, - ~0UL, &nr_accounted); - vm_unacct_memory(nr_accounted); - BUG_ON(mm->map_count); /* This is just debugging */ - clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); - tlb_finish_mmu(tlb, 0, TASK_SIZE); + unmap_all_pages(mm); + + BUG_ON(mm->map_count); /* This is just debugging */ vma = mm->mmap; mm->mmap = mm->mmap_cache = NULL; @@ -1429,14 +1359,20 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; mm->locked_vm = 0; - spin_unlock(&mm->page_table_lock); - /* * Walk the list again, actually closing and freeing it * without holding any MM locks. */ while (vma) { struct vm_area_struct *next = vma->vm_next; + + /* + * If the VMA has been charged for, account for its + * removal + */ + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + remove_shared_vm_struct(vma); if (vma->vm_ops) { if (vma->vm_ops->close) diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/mprotect.c 950-shpte/mm/mprotect.c --- 900-mjb1/mm/mprotect.c Fri Dec 13 23:18:15 2002 +++ 950-shpte/mm/mprotect.c Sat Mar 29 07:53:14 2003 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -24,7 +26,7 @@ #include static inline void -change_pte_range(pmd_t *pmd, unsigned long address, +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; @@ -37,11 +39,14 @@ change_pte_range(pmd_t *pmd, unsigned lo pmd_clear(pmd); return; } - pte = pte_offset_map(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; + end = (address + PMD_SIZE) & PMD_MASK; + if (end > (address + size)) + end = address + size; + + pte = mprotect_shared_range(vma, pmd, address, end); + if (pte == NULL) + return; + do { if (pte_present(*pte)) { pte_t entry; @@ -56,11 +61,12 @@ change_pte_range(pmd_t *pmd, unsigned lo address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_page_unlock(pmd_ptpage(*pmd)); pte_unmap(pte - 1); } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; @@ -74,12 +80,12 @@ change_pmd_range(pgd_t *pgd, unsigned lo return; } pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; + end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (end > (address + size)) + end = address + size; + do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -98,7 +104,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(¤t->mm->page_table_lock); do 
{ - change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/mremap.c 950-shpte/mm/mremap.c --- 900-mjb1/mm/mremap.c Mon Mar 17 21:43:50 2003 +++ 950-shpte/mm/mremap.c Sat Mar 29 07:53:14 2003 @@ -16,106 +16,23 @@ #include #include #include +#include #include #include #include #include -static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte = NULL; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - goto end; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - goto end; - } - - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd)) - goto end; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - goto end; - } - - pte = pte_offset_map_nested(pmd, addr); - if (pte_none(*pte)) { - pte_unmap_nested(pte); - pte = NULL; - } -end: - return pte; -} - -#ifdef CONFIG_HIGHPTE /* Save a few cycles on the sane machines */ -static inline int page_table_present(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pmd_t *pmd; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); -} -#else -#define page_table_present(mm, addr) (1) -#endif - -static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) -{ - pmd_t *pmd; - pte_t *pte = NULL; - - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); - return pte; -} - -static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, - struct pte_chain **pte_chainp) -{ - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); - - if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. */ - dst = src; - error++; - } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); - } - return error; -} - static int move_one_page(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; int error = 0; - pte_t *src, *dst; + struct ptpage *src_page, *dst_page; + pgd_t *src_pgd, *dst_pgd; + pmd_t *src_pmd, *dst_pmd; + pte_t *src_pte, *dst_pte; struct pte_chain *pte_chain; pte_chain = pte_chain_alloc(GFP_KERNEL); @@ -124,28 +41,61 @@ move_one_page(struct vm_area_struct *vma goto out; } spin_lock(&mm->page_table_lock); - src = get_one_pte_map_nested(mm, old_addr); - if (src) { - /* - * Look to see whether alloc_one_pte_map needs to perform a - * memory allocation. 
If it does then we need to drop the - * atomic kmap - */ - if (!page_table_present(mm, new_addr)) { - pte_unmap_nested(src); - src = NULL; + src_pgd = pgd_offset(mm, old_addr); + dst_pgd = pgd_offset(mm, new_addr); + src_pmd = pmd_offset(src_pgd, old_addr); + + /* If there isn't a pmd to copy from, we're done */ + if (!src_pmd) + goto out_unlock; + if (!pmd_present(*src_pmd)) + goto out_unlock; + + dst_pmd = pmd_alloc(mm, dst_pgd, new_addr); + if (!dst_pmd) { + error++; + goto out_unlock; + } + + mremap_unshare(vma->vm_mm, src_pmd, dst_pmd, old_addr, new_addr); + + dst_pte = pte_alloc_map(mm, dst_pmd, new_addr); + if (!dst_pte) { + error++; + goto out_unlock; + } + dst_page = pmd_ptpage(*dst_pmd); + pte_page_lock(dst_page); + + src_page = pmd_ptpage(*src_pmd); + if (src_page != dst_page) + pte_page_lock(src_page); + src_pte = pte_offset_map_nested(src_pmd, old_addr); + + if (!pte_none(*src_pte)) { + pte_t pte = ptep_get_and_clear(src_pte); + set_pte(dst_pte, pte); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + page_remove_rmap(page, src_pte); + if (src_page != dst_page) { + decrement_rss(src_page); + increment_rss(dst_page); + } + pte_chain = page_add_rmap(page, dst_pte, pte_chain); } - dst = alloc_one_pte_map(mm, new_addr); - if (src == NULL) - src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); - pte_unmap_nested(src); - pte_unmap(dst); } + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + pte_page_unlock(dst_page); + if (src_page != dst_page) + pte_page_unlock(src_page); flush_tlb_page(vma, old_addr); + +out_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); out: + pte_chain_free(pte_chain); return error; } diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/msync.c 950-shpte/mm/msync.c --- 900-mjb1/mm/msync.c Sun Nov 17 20:29:31 2002 +++ 950-shpte/mm/msync.c Sat Mar 29 07:53:14 2003 @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -43,6 +45,7 @@ static int filemap_sync_pte_range(pmd_t unsigned long address, unsigned long end, struct vm_area_struct *vma, unsigned int flags) { + struct ptpage *ptepage; pte_t *pte; int error; @@ -53,6 +56,8 @@ static int filemap_sync_pte_range(pmd_t pmd_clear(pmd); return 0; } + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); pte = pte_offset_map(pmd, address); if ((address & PMD_MASK) != (end & PMD_MASK)) end = (address & PMD_MASK) + PMD_SIZE; @@ -64,6 +69,7 @@ static int filemap_sync_pte_range(pmd_t } while (address && (address < end)); pte_unmap(pte - 1); + pte_page_unlock(ptepage); return error; } diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/ptshare.c 950-shpte/mm/ptshare.c --- 900-mjb1/mm/ptshare.c Wed Dec 31 16:00:00 1969 +++ 950-shpte/mm/ptshare.c Sat Mar 29 07:53:14 2003 @@ -0,0 +1,841 @@ +/* + * mm/ptshare.c + * + * Shared page table support. + * + * Created 2002 by Dave McCracken (dmccr@us.ibm.com) + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * Protections that can be set on the pmd entry (see discussion in mmap.c). + */ +static pgprot_t protection_pmd[8] = { + __PMD000, __PMD001, __PMD010, __PMD011, __PMD100, __PMD101, __PMD110, __PMD111 +}; + +/** + * is_pte_shared - Basic test for whether a pte page is shared + * @ptepage - the struct page of the pte page to test + * + * The count field in the page struct counts how many page tables are using this pte + * page. 
The share test simply tests for more than one reference. + */ +static inline int is_pte_shared(struct ptpage *ptepage) +{ + return page_count(ptepage) > 1; +} + +/** + * pte_needs_unshare - Test whether a pte page needs to be unshared at fault time + * @mm - The mm_struct being faulted + * @vma - The vma describing the range the faulting address is in + * @pmd - The pmd entry of the faulting address + * @address - The faulting address itself + * @write_access - True if it was a write fault + * + * This function makes the decision whether a pte page needs to be + * unshared or not. Note that page_count() == 1 isn't even tested + * here. The assumption is that if the pmd entry is marked writeable, + * then the page is either already unshared or doesn't need to be + * unshared. This catches the situation where task B unshares the pte + * page, then task A faults and needs to unprotect the pmd entry. + * This is actually done in pte_unshare. + * + * This function should be called with the page_table_lock held. + */ +static int pte_needs_unshare(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, + int write_access) +{ + struct ptpage *ptepage; + + /* It's not even there, nothing to unshare. */ + if (!pmd_present(*pmd)) + return 0; + + /* + * If it's already writable, then it doesn't need to be unshared. + * It's either already not shared or it's part of a large shared + * region that will never need to be unshared. + */ + if (pmd_write(*pmd)) + return 0; + + /* If this isn't a write fault we don't need to unshare. */ + if (!write_access) + return 0; + + /* + * If this page fits entirely inside a shared region, don't unshare it. + */ + ptepage = pmd_ptpage(*pmd); + if ((vma->vm_flags & VM_SHARED) && + (vma->vm_start <= ptepage->virtual) && + (vma->vm_end >= (ptepage->virtual + PMD_SIZE))) { + return 0; + } + /* + * Ok, we have to unshare. + */ + return 1; +} + +/** + * pte_unshare - Unshare a pte page + * @mm: the mm_struct that gets an unshared pte page + * @pmd: a pointer to the pmd entry that needs unsharing + * @address: the virtual address that triggered the unshare + * + * Here is where a pte page is actually unshared. It actually covers + * a couple of possible conditions. If the page_count() is already 1, + * then that means it just needs to be set writeable. Otherwise, a + * new page needs to be allocated. + * + * When each pte entry is copied, it is evaluated for COW protection, + * as well as checking whether the swap count needs to be incremented. + * + * This function must be called with the page_table_lock held. It + * will release and reacquire the lock when it allocates a new page. + * + * The function must also be called with the pte_page_lock held on the + * old page. This lock will also be dropped, then reacquired when we + * allocate a new page. The pte_page_lock will be taken on the new + * page. Whichever pte page is returned will have its pte_page_lock + * held.
+ */ + +static pte_t *pte_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pte_t *src_ptb, *dst_ptb; + struct ptpage *oldpage, *newpage, *tmppage; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + int base, addr; + int end, page_end; + int src_unshare; + +retry: + tmppage = oldpage = pmd_ptpage(*pmd); + + /* If it's already unshared, we just need to set it writeable */ + if (!is_pte_shared(oldpage)) + goto is_unshared; + + pte_page_unlock(oldpage); + spin_unlock(&mm->page_table_lock); + newpage = pte_alloc_one(mm, address); + if (newpage) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (pte_chain) { + down(&oldpage->sem); + } + } + spin_lock(&mm->page_table_lock); + if (unlikely(!newpage)) + return NULL; + if (!pte_chain) { + put_page((struct page *)newpage); + return NULL; + } + + /* + * Fetch the ptepage pointer again in case it changed while + * the lock was dropped. + */ + oldpage = pmd_ptpage(*pmd); + pte_page_lock(oldpage); + if (tmppage != oldpage) { + up(&tmppage->sem); + pte_free(newpage); + pte_chain_free(pte_chain); + goto retry; + } + + /* See if it got unshared while we dropped the lock */ + if (!is_pte_shared(oldpage)) { + pte_free(newpage); + up(&oldpage->sem); + goto is_unshared; + } + + pte_page_lock(newpage); + + init_MUTEX(&newpage->sem); + newpage->mapcount = newpage->swapcount = 0; + + base = addr = oldpage->virtual; + page_end = base + PMD_SIZE; + vma = find_vma(mm, base); + src_unshare = page_count(oldpage) == 2; + dst_ptb = pte_page_map((struct page *)newpage, base); + + if (!vma || (page_end <= vma->vm_start)) { + goto no_vma; + } + + if (vma->vm_start > addr) + addr = vma->vm_start; + + if (vma->vm_end < page_end) + end = vma->vm_end; + else + end = page_end; + + src_ptb = pte_page_map_nested((struct page *)oldpage, base); + + do { + unsigned int cow = 0; + pte_t *src_pte = src_ptb + pte_index(addr); + pte_t *dst_pte = dst_ptb + pte_index(addr); + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + + do { + pte_t pte = *src_pte; + struct page *page; + + if (pte_none(pte)) + goto unshare_skip_set; + + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_pte, pte); + newpage->swapcount++; + goto unshare_skip_set; + } + page = pte_page(pte); + if (!PageReserved(page)) { + /* COW mappings require write protecting both sides */ + if (cow) { + pte = pte_wrprotect(pte); + if (src_unshare) + set_pte(src_pte, pte); + } + /* If it's a shared mapping, + * mark it clean in the new mapping + */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(page); + newpage->mapcount++; + } + set_pte(dst_pte, pte); + pte_chain = page_add_rmap(page, dst_pte, pte_chain); + if (!pte_chain) + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap_nested(src_ptb); + pte_unmap(dst_ptb); + pte_page_unlock(newpage); + pte_page_unlock(oldpage); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + spin_lock(&mm->page_table_lock); + return NULL; + } + spin_lock(&mm->page_table_lock); + oldpage = pmd_ptpage(*pmd); + pte_page_lock(oldpage); + pte_page_lock(newpage); + dst_ptb = pte_page_map((struct page *)newpage, addr); + src_ptb = pte_page_map_nested((struct page *)oldpage, addr); + } +unshare_skip_set: + src_pte++; + dst_pte++; + addr += PAGE_SIZE; + } while (addr < end); + + if (addr >= page_end) + break; + + vma = vma->vm_next; + if (!vma) + break; + + if (page_end <= vma->vm_start) + break; + + addr = 
vma->vm_start; + if (vma->vm_end < page_end) + end = vma->vm_end; + else + end = page_end; + } while (1); + + pte_unmap_nested(src_ptb); + +no_vma: + up(&oldpage->sem); + SetPagePtepage(newpage); + pgtable_remove_rmap_locked(oldpage, mm); + pgtable_add_rmap_locked(newpage, mm, base); + pmd_populate(mm, pmd, newpage); + inc_page_state(nr_page_table_pages); + + flush_tlb_mm(mm); + + put_page((struct page *)oldpage); + + pte_page_unlock(oldpage); + pte_chain_free(pte_chain); + return dst_ptb + pte_index(address); + +is_unshared: + pmd_populate(mm, pmd, oldpage); + flush_tlb_mm(mm); + pte_chain_free(pte_chain); + return pte_offset_map(pmd, address); +} + +/** + * pte_try_to_share - Attempt to find a pte page that can be shared + * @mm: the mm_struct that needs a pte page + * @vma: the vm_area the address is in + * @pmd: a pointer to the pmd entry that needs filling + * @address: the address that caused the fault + * + * This function is called during a page fault. If there is no pte + * page for this address, it checks the vma to see if it is shared, + * and if it spans the pte page. If so, it goes to the address_space + * structure and looks through for matching vmas from other tasks that + * already have a pte page that can be shared. If it finds one, it + * attaches it and makes it a shared page. + */ + +static pte_t *pte_try_to_share(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) +{ + struct address_space *as; + struct vm_area_struct *lvma; + struct ptpage *ptepage; + unsigned long base; + pte_t *pte = NULL; + + /* It's not even shared memory. We definitely can't share the page. */ + if (!(vma->vm_flags & VM_SHARED)) + return NULL; + + /* Areas with nonlinear mappings can't be shared */ + if (vma->vm_flags & VM_NONLINEAR) + return NULL; + + /* We can only share if the entire pte page fits inside the vma */ + base = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + if ((base < vma->vm_start) || (vma->vm_end < (base + PMD_SIZE))) + return NULL; + + as = vma->vm_file->f_dentry->d_inode->i_mapping; + + down(&as->i_shared_sem); + + list_for_each_entry(lvma, &as->i_mmap_shared, shared) { + pgd_t *lpgd; + pmd_t *lpmd; + pmd_t pmdval; + + /* Skip the one we're working on */ + if (lvma == vma) + continue; + + /* We can't share with a nonlinear vma */ + if (lvma->vm_flags & VM_NONLINEAR) + return NULL; + + /* It has to be mapping to the same address */ + if ((lvma->vm_start != vma->vm_start) || + (lvma->vm_end != vma->vm_end) || + (lvma->vm_pgoff != vma->vm_pgoff)) + continue; + + lpgd = pgd_offset(lvma->vm_mm, address); + lpmd = pmd_offset(lpgd, address); + + /* This page table doesn't have a pte page either, so skip it. */ + if (!pmd_present(*lpmd)) + continue; + + /* Ok, we can share it. */ + + ptepage = pmd_ptpage(*lpmd); + pte_page_lock(ptepage); + get_page(ptepage); + pgtable_add_rmap_locked(ptepage, mm, address); + /* + * If this vma is only mapping it read-only, set the + * pmd entry read-only to protect it from writes. + * Otherwise set it writeable. 
+ */ + pmdval = *lpmd; + pmdval = pmd_modify(pmdval, protection_pmd[vma->vm_flags & 0x7]); + set_pmd(pmd, pmdval); + pte = pte_page_map((struct page *)ptepage, address); + break; + } + up(&as->i_shared_sem); + return pte; +} + +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/** + * share_page_range - share a range of pages at the pte page level at fork time + * @dst: the mm_struct of the forked child + * @src: the mm_struct of the forked parent + * @vma: the vm_area to be shared + * @prev_pmd: A pointer to the pmd entry we did at last invocation + * + * This function shares pte pages between parent and child at fork. + * If the vm_area is shared and spans the page, it sets it + * writeable. Otherwise it sets it read-only. The prev_pmd parameter + * is used to keep track of pte pages we've already shared, since this + * function can be called with multiple vmas that point to the same + * pte page. + */ +int share_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst, src, vma); + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + if (pgd_none(*src_pgd)) + goto skip_share_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_share_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + spin_lock(&src->page_table_lock); + + do { + pmd_t pmdval = *src_pmd; + struct ptpage *ptepage = pmd_ptpage(pmdval); + + if (pmd_none(pmdval)) + goto skip_share_pte_range; + if (pmd_bad(pmdval)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); + goto skip_share_pte_range; + } + + /* + * We set the pmd read-only in both the parent and the + * child unless it's a writeable shared region that + * spans the entire pte page. + */ + if ((((vma->vm_flags & (VM_SHARED|VM_WRITE)) != + (VM_SHARED|VM_WRITE)) || + (ptepage->virtual < vma->vm_start) || + ((ptepage->virtual + PMD_SIZE) > vma->vm_end)) && + pmd_write(pmdval)) { + pmdval = pmd_wrprotect(pmdval); + set_pmd(src_pmd, pmdval); + } + set_pmd(dst_pmd, pmdval); + + /* Only do this if we haven't seen this pte page before */ + if (src_pmd != *prev_pmd) { + get_page(ptepage); + pgtable_add_rmap(ptepage, dst, address); + atomic_inc(&dst->ptepages); + *prev_pmd = src_pmd; + dst->rss += ptepage->mapcount; + } + +skip_share_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out_unlock; + + src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + spin_unlock(&src->page_table_lock); + } + +out_unlock: + spin_unlock(&src->page_table_lock); + +out: + return 0; +nomem: + return -ENOMEM; +} + +/** + * fork_page_range - Either copy or share a page range at fork time + * @dst: the mm_struct of the forked child + * @src: the mm_struct of the forked parent + * @vma: the vm_area to be shared + * @prev_pmd: A pointer to the pmd entry we did at last invocation + * + * This wrapper decides whether to share page tables on fork or just make + * a copy. 
The current criterion is whether a page table has more than 3 + * pte pages, since all forked processes will unshare 3 pte pages after fork, + * even the ones doing an immediate exec. Tests indicate that if a page + * table has more than 3 pte pages, it's a performance win to share. + */ +int fork_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma, pmd_t **prev_pmd) +{ + if (atomic_read(&src->ptepages) > 3) + return share_page_range(dst, src, vma, prev_pmd); + + return copy_page_range(dst, src, vma); +} + +/** + * unshare_page_range - Make sure no pte pages are shared in a given range + * @mm: the mm_struct whose page table we unshare from + * @address: the base address of the range + * @len: the size of the range + * + * This function is called when a memory region is mapped. It makes sure there + * are no shared pte pages in the region. This is necessary to make sure the + * parent and child don't try to map competing regions into the same shared + * pte page. + */ +void unshare_page_range(struct mm_struct *mm, unsigned long address, unsigned long len) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + struct ptpage *ptepage; + unsigned long end = address + len; + unsigned long pmd_end; + + spin_lock(&mm->page_table_lock); + + do { + pmd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (pmd_end > end) + pmd_end = end; + + pgd = pgd_offset(mm, address); + if (pgd_present(*pgd)) do { + pmd = pmd_offset(pgd, address); + if (pmd_present(*pmd)) { + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, pmd, address); + pte_unmap(pte); + ptepage = pmd_ptpage(*pmd); + } + pte_page_unlock(ptepage); + } + address = (address + PMD_SIZE) & PMD_MASK; + } while (address < pmd_end); + /* The end of the last time around is the start of the next one */ + address = pmd_end; + } while (address < end); + spin_unlock(&mm->page_table_lock); +} + +/** + * pte_alloc_unshare - Map and return an unshared pte page, allocating one if necessary + * @mm - The current mm_struct + * @pmd - The pmd entry that needs to be mapped and/or allocated + * @address - The current address, needed if a new pte page is allocated + * + * For a given pmd entry, make sure a pte page exists and is not shared, then map + * it and return it locked. + * + * This function must be called with the page_table_lock held. It takes the + * pte_page_lock for the pte page being returned and returns with it locked. + * It is up to the caller to unlock it. If the pte_alloc_map fails, NULL is + * returned and no lock is taken. + */ +pte_t *pte_alloc_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + struct ptpage *ptepage; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, pmd, address); + } else + pte = pte_offset_map(pmd, address); + } else { + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + } + return pte; +} + +/** + * pte_map_unshare - if a pmd_entry exists, make sure it is unshared and map it + * @mm - The current mm_struct + * @pmd - The pmd entry that needs to be mapped + * @address - The current address, needed if it's unshared. + * + * If a pmd entry is valid, make sure the pte page is unshared, then map it + * and return it locked. If none exists, return NULL. + * + * This function must be called with the page_table_lock held. 
It takes the + * pte_page_lock for the pte page being returned and returns with it locked + * if one exists. It is up to the caller to unlock it. If no pte page exists + * no lock is taken. + */ +pte_t *pte_map_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + struct ptpage *ptepage; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, pmd, address); + } else + pte = pte_offset_map(pmd, address); + } else + pte = NULL; + + return pte; +} + +/** + * zap_shared_range - helper function for zap_pmd_range in mm/memory.c + * @tlb - The mmu_gather_t being used to coalesce deleted pages + * @pmd - The pmd entry currently being worked on + * @address - The start of the current range + * @end - The end of the current range + * + * Returns false if the pte page was shared and the count decremented, + * true if the page wasn't shared or was unshared. + * + * This function is called as part of deleting a range of pages from a page + * table. It takes care of detecting when a pmd entry points to a shared pte + * page. + * + * If the pte page is shared and the range covers the entire pte page, + * the share count is decremented and the function returns false. If + * the range does not cover the entire pte page, the pte page is unshared. + * If the pte page is not shared or was unshared, the pte_page_lock is taken + * and the function returns true. It is the responsibility of the caller + * to unlock it. + */ +int zap_shared_range(struct mmu_gather **tlb, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + struct mm_struct *mm = (*tlb)->mm; + struct ptpage *ptepage; + int ret = 1; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + if ((address <= ptepage->virtual) && + (end >= (ptepage->virtual + PMD_SIZE))) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, mm); + mm->rss -= ptepage->mapcount; + atomic_dec(&mm->ptepages); + put_page((struct page *)ptepage); + pte_page_unlock(ptepage); + ret = 0; + } else { + pte_t *pte; + + tlb_finish_mmu(*tlb, address, end); + pte = pte_unshare(mm, pmd, address); + pte_unmap(pte); + *tlb = tlb_gather_mmu(mm, 0); + } + + } + return ret; +} + +/** + * zap_shared_pmd - helper function for unmap_all_pages in mm/memory.c + * @mm - The mm_struct this page table is associated with + * @pmd - The pmd entry currently being worked on + * + * Returns false if the pte page was shared and the count decremented, + * true if the page wasn't shared. + * + * This function is called when an entire page table is being removed. It + * detects when a pte page is shared and takes care of decrementing the count. + */ +int zap_shared_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct ptpage *ptepage; + int ret = 1; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pmd_clear(pmd); + pgtable_remove_rmap_locked(ptepage, mm); + mm->rss -= ptepage->mapcount; + atomic_dec(&mm->ptepages); + put_page((struct page *)ptepage); + ret = 0; + } + pte_page_unlock(ptepage); + return ret; +} + +/** + * mprotect_shared_range - Helper function for change_pte_range in mm/mprotect.c + * @vma - The memory area being changed + * @pmd - The current pmd entry + * @address - The base of the current range + * @end - The end of the current range + * + * If the current range spans the entire pte page, set protections at the pmd entry + * level and return NULL to show nothing else needs to be done.
Otherwise lock and + * map the pte page to be worked on. It is up to the caller to unmap the pte pointer + * and unlock the pte_page_lock if the pte page is returned. + */ +pte_t *mprotect_shared_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long end) +{ + struct ptpage *ptepage; + pte_t *pte; + + ptepage = pmd_ptpage(*pmd); + pte_page_lock(ptepage); + + if (is_pte_shared(ptepage)) { + if (((address & ~PMD_MASK) == 0) && ((end & ~PMD_MASK) == 0)) { + pmd_t pmdval = *pmd; + + pmdval = pmd_modify(pmdval, protection_pmd[vma->vm_flags & 0x7]); + set_pmd(pmd, pmdval); + pte_page_unlock(ptepage); + pte = NULL; + } else + pte = pte_unshare(vma->vm_mm, pmd, address); + } else + pte = pte_offset_map(pmd, address); + + return pte; +} + +/** + * mremap_unshare - Helper function for move_one_page in mm/mremap.c + * @mm - The current mm_struct + * @src_pmd - The originating pmd entry + * @dst_pmd - The target pmd entry + * @src_addr - The source address + * @dst_addr - The destination address + * + * Make sure both source and destination are unshared for mremap. Note that + * the existence of src_pmd is guaranteed by the caller, but dst_pmd may + * not exist. The mappings are discarded here since mremap needs them mapped + * differently. + * + * Both the page_table_lock and the mmap_sem are held when this function is called, + * so it is safe to not keep the pte_page_locks for these pages when it's finished. + */ + +void mremap_unshare(struct mm_struct *mm, pmd_t *src_pmd, pmd_t *dst_pmd, + unsigned long src_addr, unsigned long dst_addr) +{ + struct ptpage *ptepage; + pte_t *pte; + + ptepage = pmd_ptpage(*src_pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, src_pmd, src_addr); + pte_unmap(pte); + ptepage = pmd_ptpage(*src_pmd); + } + pte_page_unlock(ptepage); + + if ((src_pmd != dst_pmd) && + (pmd_present(*dst_pmd))) { + ptepage = pmd_ptpage(*dst_pmd); + pte_page_lock(ptepage); + if (is_pte_shared(ptepage)) { + pte = pte_unshare(mm, dst_pmd, dst_addr); + pte_unmap(pte); + ptepage = pmd_ptpage(*dst_pmd); + } + pte_page_unlock(ptepage); + } +} + +/** + * pte_fault_alloc - Helper function for handle_mm_fault in mm/memory.c + * @mm - The faulting mm_struct + * @vma - The area the fault is in + * @pmd - The pmd entry that needs handling + * @address - The faulting address + * @write_access - True if it's a write fault + * + * This function takes care of allocating and/or sharing/unsharing the pte + * page on a page fault. It determines the shareability of the pte page based + * on the type of fault and the flags in the vma. It then locks and maps + * the pte page before returning a pointer to the pte entry that needs to + * be filled in by the fault.
+ */ +pte_t *pte_fault_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address, int write_access) +{ + pte_t *pte; + + if (pmd_present(*pmd)) { + pte_page_lock(pmd_ptpage(*pmd)); + if (pte_needs_unshare(mm, vma, pmd, address, write_access)) + pte = pte_unshare(mm, pmd, address); + else + pte = pte_offset_map(pmd, address); + } else { + pte = pte_try_to_share(mm, vma, pmd, address); + if (!pte) { + pte = pte_alloc_map(mm, pmd, address); + if (pte) + pte_page_lock(pmd_ptpage(*pmd)); + } + } + return pte; +} diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/rmap.c 950-shpte/mm/rmap.c --- 900-mjb1/mm/rmap.c Thu Mar 27 21:57:40 2003 +++ 950-shpte/mm/rmap.c Sat Mar 29 09:24:22 2003 @@ -14,11 +14,11 @@ /* * Locking: * - the page->pte.chain is protected by the PG_chainlock bit, - * which nests within the zone->lru_lock, then the - * mm->page_table_lock, and then the page lock. + * which nests within the zone->lru_lock, then the pte_page_lock, + * and then the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks - * on the mm->page_table_lock + * on the pte_page_lock. */ #include #include @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -48,11 +49,17 @@ */ #define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t)) +struct mm_chain { + struct mm_chain *next; + struct mm_struct *mm; +}; + struct pte_chain { struct pte_chain *next; pte_addr_t ptes[NRPTE]; } ____cacheline_aligned; +kmem_cache_t *mm_chain_cache; kmem_cache_t *pte_chain_cache; /* @@ -76,6 +83,25 @@ kmem_cache_t *pte_chain_cache; ** VM stuff below this comment **/ +static inline struct mm_chain *mm_chain_alloc(void) +{ + struct mm_chain *ret; + + ret = kmem_cache_alloc(mm_chain_cache, GFP_ATOMIC); + return ret; +} + +static void mm_chain_free(struct mm_chain *mc, + struct mm_chain *prev_mc, struct ptpage *ptepage) +{ + if (prev_mc) + prev_mc->next = mc->next; + else if (ptepage) + ptepage->pte.mmchain = mc->next; + + kmem_cache_free(mm_chain_cache, mc); +} + /** * page_referenced - test if the page was referenced * @page: the page to test @@ -219,13 +245,140 @@ out: return referenced; } +/* + * pgtable_add_rmap_locked - Add an mm_struct to the chain for a pte page. + * @ptepage: The pte page to add the mm_struct to + * @mm: The mm_struct to add + * @address: The address of the page we're mapping + * + * Pte pages maintain a chain of mm_structs that use it. This adds a new + * mm_struct to the chain. + * + * This function must be called with the pte_page_lock held for the page + */ +void pgtable_add_rmap_locked(struct ptpage * ptepage, struct mm_struct * mm, + unsigned long address) +{ + struct mm_chain *mc; + +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif +#ifdef RMAP_DEBUG + BUG_ON(mm == NULL); + BUG_ON(!PagePtepage(ptepage)); +#endif + + if (PageDirect(ptepage)) { + mc = mm_chain_alloc(); + mc->mm = ptepage->pte.mmdirect; + mc->next = NULL; + ptepage->pte.mmchain = mc; + ClearPageDirect(ptepage); + } + if (ptepage->pte.mmchain) { + /* Hook up the mm_chain to the page. 
*/ + mc = mm_chain_alloc(); + mc->mm = mm; + mc->next = ptepage->pte.mmchain; + ptepage->pte.mmchain = mc; + } else { + ptepage->pte.mmdirect = mm; + SetPageDirect(ptepage); + ptepage->virtual = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); + } +} + +/* + * pgtable_remove_rmap_locked - Remove an mm_struct from the chain for a pte page. + * @ptepage: The pte page to remove the mm_struct from + * @mm: The mm_struct to remove + * + * Pte pages maintain a chain of mm_structs that use it. This removes an + * mm_struct from the chain. + * + * This function must be called with the pte_page_lock held for the page + */ +void pgtable_remove_rmap_locked(struct ptpage *ptepage, struct mm_struct *mm) +{ + struct mm_chain * mc, * prev_mc = NULL; + +#ifdef DEBUG_RMAP + BUG_ON(mm == NULL); + BUG_ON(!PagePtepage(ptepage)); +#endif + + if (PageDirect(ptepage)) { + if (ptepage->pte.mmdirect == mm) { + ptepage->pte.mmdirect = NULL; + ClearPageDirect(ptepage); + ptepage->virtual = 0; + goto out; + } + } else { +#ifdef DEBUG_RMAP + BUG_ON(ptepage->pte.mmchain->next == NULL); +#endif + for (mc = ptepage->pte.mmchain; mc; prev_mc = mc, mc = mc->next) { + if (mc->mm == mm) { + mm_chain_free(mc, prev_mc, ptepage); + /* Check whether we can convert to direct */ + mc = ptepage->pte.mmchain; + if (!mc->next) { + ptepage->pte.mmdirect = mc->mm; + SetPageDirect(ptepage); + mm_chain_free(mc, NULL, NULL); + } + goto out; + } + } + } + BUG(); +out: + return; +} + +/* + * pgtable_add_rmap - Add an mm_struct to the chain for a pte page. + * @ptepage: The pte page to add the mm_struct to + * @mm: The mm_struct to add + * @address: The address of the page we're mapping + * + * This is a wrapper for pgtable_add_rmap_locked that takes the lock + */ +void pgtable_add_rmap(struct ptpage *ptepage, struct mm_struct *mm, + unsigned long address) +{ + pte_page_lock(ptepage); + pgtable_add_rmap_locked(ptepage, mm, address); + pte_page_unlock(ptepage); +} + +/* + * pgtable_remove_rmap - Remove an mm_struct from the chain for a pte page. + * @ptepage: The pte page to remove the mm_struct from + * @mm: The mm_struct to remove + * + * This is a wrapper for pgtable_remove_rmap_locked that takes the lock + */ +void pgtable_remove_rmap(struct ptpage *ptepage, struct mm_struct *mm) +{ + pte_page_lock(ptepage); + pgtable_remove_rmap_locked(ptepage, mm); + pte_page_unlock(ptepage); +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte_page_lock.
*/ struct pte_chain * page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) @@ -239,8 +392,7 @@ page_add_rmap(struct page *page, pte_t * BUG(); if (!pte_present(*ptep)) BUG(); - if (!ptep_to_mm(ptep)) - BUG(); + BUG_ON(PagePtepage(page)); #endif if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) @@ -270,12 +422,15 @@ page_add_rmap(struct page *page, pte_t * if (page->pte.direct == pte_paddr) BUG(); } else { + int count = 0; for (pc = page->pte.chain; pc; pc = pc->next) { - for (i = 0; i < NRPTE; i++) { + for (i = 0; i < NRPTE; i++, count++) { pte_addr_t p = pc->ptes[i]; - if (p && p == pte_paddr) + if (p && p == pte_paddr) { + printk(KERN_ERR "page_add_rmap: page %08lx (count %d), ptep %08lx, rmap count %d\n", page, page_count(page), ptep, count); BUG(); + } } } } @@ -332,7 +487,7 @@ out: * Removes the reverse mapping from the pte_chain of the page, * after that the caller can clear the page table entry and free * the page. - * Caller needs to hold the mm->page_table_lock. + * Caller needs to hold the pte_page_lock. */ void page_remove_rmap(struct page * page, pte_t * ptep) { @@ -346,6 +501,10 @@ void page_remove_rmap(struct page * page if (!page_mapped(page)) return; /* remap_page_range() from a driver? */ +#ifdef DEBUG_RMAP + BUG_ON(PagePtepage(page)); +#endif + pte_chain_lock(page); if (!PageAnon(page)) { @@ -425,6 +584,117 @@ out: return; } +static int pgtable_check_mlocked_mm(struct mm_struct *mm, unsigned long address) +{ + struct vm_area_struct *vma; + int ret = SWAP_SUCCESS; + + /* + * If this mm is in the process of exiting, skip this page + * for now to let the exit finish. + */ + if (atomic_read(&mm->mm_users) == 0) { + ret = SWAP_AGAIN; + goto out; + } + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + } +out: + return ret; +} + +static int pgtable_check_mlocked(struct ptpage *ptepage, unsigned long address) +{ + struct mm_chain *mc; + int ret = SWAP_SUCCESS; + +#ifdef DEBUG_RMAP + BUG_ON(!PagePtepage(ptepage)); +#endif + if (PageDirect(ptepage)) { + ret = pgtable_check_mlocked_mm(ptepage->pte.mmdirect, address); + goto out; + } + + for (mc = ptepage->pte.mmchain; mc; mc = mc->next) { +#ifdef DEBUG_RMAP + BUG_ON(mc->mm == NULL); +#endif + ret = pgtable_check_mlocked_mm(mc->mm, address); + if (ret != SWAP_SUCCESS) + goto out; + } +out: + return ret; +} + +/** + * pgtable_unmap_one_mm - Decrement the rss count and flush for an mm_struct + * @mm: - the mm_struct to decrement + * @address: - The address of the page we're removing + * + * All pte pages keep a chain of mm_struct that are using it. This does a flush + * of the address for that mm_struct and decrements the rss count. + */ +static int pgtable_unmap_one_mm(struct mm_struct *mm, unsigned long address) +{ + struct vm_area_struct *vma; + int ret = SWAP_SUCCESS; + + /* During mremap, it's possible pages are not in a VMA. 
*/ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out; + } + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + mm->rss--; + +out: + return ret; +} + +/** + * pgtable_unmap_one - Decrement all rss counts and flush caches for a pte page + * @ptepage: the pte page to decrement the count for + * @address: the address of the page we're removing + * + * This decrements the rss counts of all mm_structs that map this pte page + * and flushes the tlb and cache for these mm_structs and address + */ +static int pgtable_unmap_one(struct ptpage *ptepage, unsigned long address) +{ + struct mm_chain *mc; + int ret = SWAP_SUCCESS; + +#ifdef DEBUG_RMAP + BUG_ON(!PagePtepage(ptepage)); +#endif + + if (PageDirect(ptepage)) { + ret = pgtable_unmap_one_mm(ptepage->pte.mmdirect, address); + if (ret != SWAP_SUCCESS) + goto out; + } else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) { + ret = pgtable_unmap_one_mm(mc->mm, address); + if (ret != SWAP_SUCCESS) + goto out; + } +out: + return ret; +} + static inline int try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) { @@ -433,6 +703,7 @@ try_to_unmap_obj_one(struct vm_area_stru pmd_t *pmd; pte_t *pte; pte_t pteval; + struct ptpage *ptepage; unsigned long loffset; unsigned long address; int ret = SWAP_SUCCESS; @@ -465,15 +736,22 @@ try_to_unmap_obj_one(struct vm_area_stru if (page_to_pfn(page) != pte_pfn(*pte)) { goto out_unmap; } - - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; + ptepage = pmd_ptpage(*pmd); + if (!pte_page_trylock(ptepage)) { + ret = SWAP_AGAIN; goto out_unmap; } - flush_cache_page(vma, address); + ret = pgtable_check_mlocked(ptepage, address); + if (ret != SWAP_SUCCESS) + goto out_unlock_pt; + pteval = ptep_get_and_clear(pte); - flush_tlb_page(vma, address); + ret = pgtable_unmap_one(ptepage, address); + if (ret != SWAP_SUCCESS) { + set_pte(pte, pteval); + goto out_unlock_pt; + } if (pte_dirty(pteval)) set_page_dirty(page); @@ -481,10 +759,13 @@ try_to_unmap_obj_one(struct vm_area_stru if (!page->pte.mapcount) BUG(); - mm->rss--; + ptepage->mapcount--; page->pte.mapcount--; page_cache_release(page); +out_unlock_pt: + pte_page_unlock(ptepage); + out_unmap: pte_unmap(pte); @@ -543,49 +824,37 @@ out: * zone->lru_lock page_launder() * page lock page_launder(), trylock * pte_chain_lock page_launder() - * mm->page_table_lock try_to_unmap_one(), trylock + * pte_page_lock try_to_unmap_one(), trylock */ static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); static int try_to_unmap_one(struct page * page, pte_addr_t paddr) { pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; pte_t pte; + struct ptpage *ptepage = (struct ptpage *)kmap_atomic_to_page(ptep); + unsigned long address = ptep_to_address(ptep); int ret; - if (!mm) - BUG(); - - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - if (!spin_trylock(&mm->page_table_lock)) { +#ifdef DEBUG_RMAP + BUG_ON(!PagePtepage(ptepage)); +#endif + if (!pte_page_trylock(ptepage)) { rmap_ptep_unmap(ptep); return SWAP_AGAIN; } - - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { - ret = SWAP_FAIL; + ret = pgtable_check_mlocked(ptepage, address); + if (ret != SWAP_SUCCESS) goto out_unlock; - } - /* The page is mlock()d, we cannot swap it out. 
*/ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; + pte = ptep_get_and_clear(ptep); + + ret = pgtable_unmap_one(ptepage, address); + if (ret != SWAP_SUCCESS) { + set_pte(ptep, pte); goto out_unlock; } - /* Nuke the page table entry. */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(ptep); - flush_tlb_page(vma, address); - if (PageSwapCache(page)) { /* * Store the swap location in the pte. @@ -594,13 +863,27 @@ static int try_to_unmap_one(struct page swp_entry_t entry = { .val = page->index }; swap_duplicate(entry); set_pte(ptep, swp_entry_to_pte(entry)); + increment_swapcount(ptepage); BUG_ON(pte_file(*ptep)); - } else { + } else if (PageDirect(ptepage)) { + /* + * We're looking for nonlinear pages, which never have shared + * pagetable pages, so only need this if PageDirect(ptepage) + */ unsigned long pgidx; + struct mm_struct *mm = ptepage->pte.mmdirect; + struct vm_area_struct * vma = find_vma(mm, address); + + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + /* * If a nonlinear mapping then store the file page offset * in the pte. */ + pgidx = (address - vma->vm_start) >> PAGE_SHIFT; pgidx += vma->vm_pgoff; pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; @@ -609,18 +892,21 @@ static int try_to_unmap_one(struct page BUG_ON(!pte_file(*ptep)); } } + ptepage->mapcount--; + pte_page_unlock(ptepage); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pte)) set_page_dirty(page); - mm->rss--; page_cache_release(page); ret = SWAP_SUCCESS; + goto out; out_unlock: + pte_page_unlock(ptepage); +out: rmap_ptep_unmap(ptep); - spin_unlock(&mm->page_table_lock); return ret; } @@ -721,6 +1007,58 @@ out: } /** + * increment_rss - increment the rss count by one + * @ptepage: The pte page that's getting a new paged mapped + * + * Since mapping a page into a pte page can increment the rss + * for multiple mm_structs, this function iterates through all + * the mms and increments them. It also keeps an rss count + * per pte page. + */ +void increment_rss(struct ptpage *ptepage) +{ + struct mm_chain *mc; + + if (PageDirect(ptepage)) + ptepage->pte.mmdirect->rss++; + else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) + mc->mm->rss++; + + ptepage->mapcount++; +} + +/** + * decrement_rss - decrement the rss count by one + * @ptepage: The pte page that's unmapping a page + * + * Since unmapping a page can decrement the rss + * for multiple mm_structs, this function iterates through all + * the mms and decrements them. It also keeps an rss count + * per pte page. + */ +void decrement_rss(struct ptpage *ptepage) +{ + struct mm_chain *mc; + + if (PageDirect(ptepage)) + ptepage->pte.mmdirect->rss--; + else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) + mc->mm->rss--; + + ptepage->mapcount--; +} + +void increment_swapcount(struct ptpage *ptepage) +{ + ptepage->swapcount++; +} + +void decrement_swapcount(struct ptpage *ptepage) +{ + ptepage->swapcount--; +} + +/** ** No more VM stuff below this comment, only pte_chain helper ** functions. 
**/ @@ -786,6 +1124,17 @@ struct pte_chain *pte_chain_alloc(int gf void __init pte_chain_init(void) { + + mm_chain_cache = kmem_cache_create( "mm_chain", + sizeof(struct mm_chain), + 0, + 0, + NULL, + NULL); + + if (!mm_chain_cache) + panic("failed to create mm_chain cache!\n"); + pte_chain_cache = kmem_cache_create( "pte_chain", sizeof(struct pte_chain), 0, diff -urpN -X /home/fletch/.diff.exclude 900-mjb1/mm/swapfile.c 950-shpte/mm/swapfile.c --- 900-mjb1/mm/swapfile.c Thu Mar 27 21:57:38 2003 +++ 950-shpte/mm/swapfile.c Sat Mar 29 07:53:14 2003 @@ -21,8 +21,10 @@ #include #include #include +#include #include +#include #include spinlock_t swaplock = SPIN_LOCK_UNLOCKED; @@ -379,7 +381,7 @@ void free_swap_and_cache(swp_entry_t ent */ /* mmlist_lock and vma->vm_mm->page_table_lock are held */ static void -unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, +unuse_pte(struct vm_area_struct *vma, struct ptpage *ptepage, pte_t *dir, swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) { pte_t pte = *dir; @@ -394,8 +396,9 @@ unuse_pte(struct vm_area_struct *vma, un set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + increment_rss(ptepage); + decrement_swapcount(ptepage); swap_free(entry); - ++vma->vm_mm->rss; } /* mmlist_lock and vma->vm_mm->page_table_lock are held */ @@ -403,6 +406,7 @@ static void unuse_pmd(struct vm_area_str unsigned long address, unsigned long size, unsigned long offset, swp_entry_t entry, struct page* page) { + struct ptpage *ptepage; pte_t * pte; unsigned long end; struct pte_chain *pte_chain = NULL; @@ -414,6 +418,8 @@ static void unuse_pmd(struct vm_area_str pmd_clear(dir); return; } + ptepage = pmd_ptpage(*dir); + pte_page_lock(ptepage); pte = pte_offset_map(dir, address); offset += address & PMD_MASK; address &= ~PMD_MASK; @@ -426,11 +432,11 @@ static void unuse_pmd(struct vm_area_str */ if (pte_chain == NULL) pte_chain = pte_chain_alloc(GFP_ATOMIC); - unuse_pte(vma, offset+address-vma->vm_start, - pte, entry, page, &pte_chain); + unuse_pte(vma, ptepage, pte, entry, page, &pte_chain); address += PAGE_SIZE; pte++; } while (address && (address < end)); + pte_page_unlock(ptepage); pte_unmap(pte - 1); pte_chain_free(pte_chain); }
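As a rough illustration of the policy mm/ptshare.c implements (a pte page counts how many address spaces reference it, fork_page_range() shares the existing pte pages only when the parent already has more than 3 of them, and a write fault on a shared pte page forces a private copy via pte_unshare()), here is a small standalone userspace sketch. It is a model only, not kernel code: every model_* name is invented for the example, and only the "more than one reference means shared" test and the "more than 3 pte pages" fork threshold come from the patch above.

/*
 * Sketch only: a userspace model of the share/unshare policy.
 * None of these names exist in the kernel; error handling is minimal.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_PTES 8			/* entries per modelled pte page */

struct model_ptpage {
	int count;			/* how many address spaces share this pte page */
	unsigned long pte[MODEL_PTES];
};

static struct model_ptpage *model_alloc(void)
{
	struct model_ptpage *p = calloc(1, sizeof(*p));

	if (!p)
		abort();
	p->count = 1;
	return p;
}

/* Mirrors is_pte_shared(): shared means more than one reference. */
static int model_is_shared(struct model_ptpage *p)
{
	return p->count > 1;
}

/* Fork-time choice: share the existing pte page or make a private copy. */
static struct model_ptpage *model_fork(struct model_ptpage *parent, int nr_pte_pages)
{
	struct model_ptpage *child;

	if (nr_pte_pages > 3) {		/* same threshold fork_page_range() uses */
		parent->count++;
		return parent;		/* child references the same pte page */
	}
	child = model_alloc();
	memcpy(child->pte, parent->pte, sizeof(child->pte));
	return child;
}

/* Write fault: write in place if exclusive, copy first if shared (like pte_unshare). */
static struct model_ptpage *model_write_fault(struct model_ptpage *p, int idx, unsigned long val)
{
	if (model_is_shared(p)) {
		struct model_ptpage *priv = model_alloc();

		memcpy(priv->pte, p->pte, sizeof(priv->pte));
		p->count--;		/* the old pte page loses one sharer */
		p = priv;
	}
	p->pte[idx] = val;
	return p;
}

int main(void)
{
	struct model_ptpage *parent = model_alloc();
	struct model_ptpage *child = model_fork(parent, 5);	/* above threshold: shared */

	printf("shared after fork: %d\n", model_is_shared(parent));
	child = model_write_fault(child, 0, 42);		/* unshare on write */
	printf("shared after child write: %d\n", model_is_shared(parent));
	if (child != parent)
		free(child);
	free(parent);
	return 0;
}

The real code has to make the same transitions while holding pte_page_lock, keeping the rmap chains and every sharer's rss consistent, and coping with the page_table_lock being dropped for the allocation; that bookkeeping is where most of the complexity in pte_unshare() comes from.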
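The rmap.c half of the patch keeps a second piece of bookkeeping: each pte page records the mm_structs that use it, as a single direct pointer while only one mm maps it (PageDirect) and as a chain of mm_chain nodes once a second mm starts sharing it, and increment_rss()/decrement_rss() walk that record so every sharer's rss stays correct. The sketch below is only a userspace model of that data structure under the same assumptions; the model_* names are invented and nothing here is the kernel API.

/*
 * Sketch only: the "direct pointer or chain" pattern used for the
 * per-pte-page mm bookkeeping, modelled in plain C.
 */
#include <stdio.h>
#include <stdlib.h>

struct model_mm {
	long rss;
};

struct model_chain {
	struct model_chain *next;
	struct model_mm *mm;
};

struct model_ptpage_owner {
	int direct;			/* 1: u.mmdirect is valid, 0: u.chain is valid */
	union {
		struct model_mm *mmdirect;
		struct model_chain *chain;
	} u;
};

/* Mirrors pgtable_add_rmap_locked(): stay direct for one user, chain for more. */
static void model_add_mm(struct model_ptpage_owner *o, struct model_mm *mm)
{
	struct model_chain *mc;

	if (o->direct) {
		/* Second user: convert the direct pointer into a one-entry chain. */
		mc = malloc(sizeof(*mc));
		if (!mc)
			abort();
		mc->mm = o->u.mmdirect;
		mc->next = NULL;
		o->u.chain = mc;
		o->direct = 0;
	}
	if (o->u.chain) {
		mc = malloc(sizeof(*mc));
		if (!mc)
			abort();
		mc->mm = mm;
		mc->next = o->u.chain;
		o->u.chain = mc;
	} else {
		/* First user: a bare pointer, no allocation needed. */
		o->u.mmdirect = mm;
		o->direct = 1;
	}
}

/* Mirrors increment_rss(): every mm sharing the pte page pays for the new mapping. */
static void model_account_mapping(struct model_ptpage_owner *o)
{
	struct model_chain *mc;

	if (o->direct) {
		o->u.mmdirect->rss++;
		return;
	}
	for (mc = o->u.chain; mc; mc = mc->next)
		mc->mm->rss++;
}

int main(void)
{
	struct model_mm a = { 0 }, b = { 0 };
	struct model_ptpage_owner owner = { 0 };

	model_add_mm(&owner, &a);		/* stays direct */
	model_account_mapping(&owner);
	model_add_mm(&owner, &b);		/* converts to a chain */
	model_account_mapping(&owner);
	printf("rss: a=%ld b=%ld\n", a.rss, b.rss);	/* prints a=2 b=1 */

	while (!owner.direct && owner.u.chain) {
		struct model_chain *mc = owner.u.chain;

		owner.u.chain = mc->next;
		free(mc);
	}
	return 0;
}

Keeping the single-user case as a bare pointer avoids allocating a chain node for the common case of an unshared pte page, mirroring the direct form rmap.c already uses for pte_chains.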