--- linux-2.3.48-pre2/Documentation/kiobuf.sample.c.~1~ Fri Feb 25 15:17:17 2000 +++ linux-2.3.48-pre2/Documentation/kiobuf.sample.c Fri Feb 25 16:26:01 2000 @@ -0,0 +1,216 @@ +/* + * Example code for using kiobufs within a device driver for memory + * mapping of kernel memory into user space. + * + * This module creates a device driver which allocates memory in the + * kernel from arbitrary pages via vmalloc(), and then uses kiobufs to + * map those into user space. + * + * This module registers a misc char device driver called "test". Its + * major number will be 10; the minor number is registered dynamically + * and can be found by looking up /proc/misc (the usual minor number + * will be 63 unless other misc devices have already been registered). + * + * The device driver here can be opened, but read and write functions + * are not declared. However, the file descriptor to the driver can be + * mmap()ed, and the driver will use the kiobuf mmaping routines to map + * an area of kernel memory into a process's address space. + * + * Any number of processes may map the same memory at once, and it will + * act as shared memory. The kiobuf_vmap code will track the number of + * references to the memory, and the driver's module reference count is + * adjusted to make sure that the driver can be unloaded only when there + * are no users active. + * + * Written by Stephen C. Tweedie, 2000 + * (C) Red Hat, Inc. 2000 + */ + +#include +#include +#include +#include +#include +#include + +#if 1 +#define dprintk(x...) +#else +#define dprintk printk +#endif + +static int kiomap_mmap(struct file * file, struct vm_area_struct * vma); +static int kiomap_open(struct inode * inode, struct file * file); +static int kiomap_release(struct inode * inode, struct file * file); + + +static struct file_operations kiomap_fops = { + mmap: kiomap_mmap, + open: kiomap_open, + release:kiomap_release, +}; + +static struct miscdevice kiomap_device = { + minor: MISC_DYNAMIC_MINOR, + name: "kiomap", + fops: &kiomap_fops +}; + + +static void * local_data_area; +#define DATA_SIZE (100 * PAGE_SIZE) + +static struct kiobuf local_kiobuf; +static struct kiobuf_vmap local_vmap; + + +/* A pointer to the kiomap_finished function will be stored in the + * kiobuf_vmap we use for mmap()ing, and this function will be called + * once there are no more users of the kvmap. */ + +static void kiomap_finished(struct kiobuf_vmap *vmap) +{ + MOD_DEC_USE_COUNT; + dprintk(KERN_INFO __FUNCTION__ ": decremented module count\n"); +} + + +/* The initialisation function here creates three data structures. + * First of all it uses vmalloc() to reserve an area of memory. + * Secondly, it initialises a kiobuf into which that vmalloced memory is + * mapped. Finally, it initialises a kiobuf_vmap which can be used to + * mmap that kiobuf into user space. */ + +void __init create_local_heap(void) +{ + int err; + + /* Get our kernel memory for the data heap first */ + + local_data_area = vmalloc(DATA_SIZE); + if (!local_data_area) + return; + + /* Now initialise a kiobuf and kiobuf_vmap structure */ + + kiobuf_init(&local_kiobuf); + kvmap_init(&local_vmap, &local_kiobuf); + local_vmap.kiobuf = &local_kiobuf; + local_vmap.deref_callback = kiomap_finished; + + /* map_kernel_kiobuf will find all of the physical pages + * referred to by the vmalloc()ed virtual memory area, and will + * map those physical pages into the kiobuf we have just + * prepared. 
*/ + + err = map_kernel_kiobuf(&local_kiobuf, + (unsigned long) local_data_area, + DATA_SIZE); + if (err) { + vfree(local_data_area); + local_data_area = 0; + } + + /* Initialise the vmalloced area --- we don't want users mapping + * this memory and peeking into stale kernel data! */ + + memset(local_data_area, 0xff, DATA_SIZE); +} + + +/* All we have to do on device open/close is to maintain the module + * reference counts. */ + +static int kiomap_open(struct inode * inode, struct file * file) +{ + MOD_INC_USE_COUNT; + dprintk(KERN_INFO __FUNCTION__ ": incremented module count\n"); + return 0; +} + +static int kiomap_release(struct inode * inode, struct file * file) +{ + MOD_DEC_USE_COUNT; + dprintk(KERN_INFO __FUNCTION__ ": decremented module count\n"); + return 0; +} + +/* This is our example device's mmap function, as declared to the rest + * of the VM. All we have to do in our case is call mmap_kiobuf() + * supplying the pre-initialised kiobuf_vmap struct that we created in + * create_local_heap() above. + * + * We have to be careful to take the vmap semaphore here before calling + * mmap_kiobuf --- that's something all callers of that function will + * always have to do to be safe on SMP systems. While we hold that + * semaphore we can change the module reference count safely, since the + * same semaphore will protect the call to MOD_DEC_USE_COUNT in the + * kvmap close function above. + */ + +static int kiomap_mmap(struct file * file, struct vm_area_struct * vma) +{ + int err; + + dprintk (KERN_INFO __FUNCTION__ ": begin(file %p, vma %p)\n", + file, vma); + + /* A quick check to make sure we were initialised properly... */ + + if (!local_data_area) + return -ENOMEM; + + /* Now we can take the kvmap semaphore and perform the mmap. */ + + down(&local_vmap.sem); + dprintk (KERN_INFO __FUNCTION__ ": Attempting mmap.\n"); + err = mmap_kiobuf(&local_vmap, vma); + dprintk (KERN_INFO __FUNCTION__ ": mmap_kiobuf returned %d\n", err); + + /* A return value of one means the kvmap was not in use when we + * called mmap_kiobuf() */ + + if (err == 1) { + MOD_INC_USE_COUNT; + dprintk(KERN_INFO __FUNCTION__ ": incremented module count\n"); + } + up(&local_vmap.sem); + + /* and a negative return value means an error. */ + + if (err < 0) + return err; + + return 0; +} + + +/* The module initialisation and cleanup functions just create and + * destroy our local kvmap data structures, and register and deregister + * the testing character device driver. 
*/ + +int kiomap_init_module(void) +{ + int err; + + create_local_heap(); + if (!local_data_area) + return -ENOMEM; + + err = misc_register(&kiomap_device); + return err; +} + +int kiomap_destroy_module(void) +{ + int err; + + vfree(local_data_area); + err = misc_deregister(&kiomap_device); + return err; +} + +module_init(kiomap_init_module); +module_exit(kiomap_destroy_module); + +MODULE_DESCRIPTION("kiobuf vmap test driver"); --- linux-2.3.48-pre2/drivers/char/raw.c.~1~ Fri Feb 25 10:08:48 2000 +++ linux-2.3.48-pre2/drivers/char/raw.c Fri Feb 25 15:18:58 2000 @@ -197,14 +197,17 @@ raw_device_bindings[minor] = bdget(kdev_t_to_nr(MKDEV(rq.block_major, rq.block_minor))); } else { + struct block_device *bdev; kdev_t dev; - if (!raw_device_bindings[minor]) { - err = -ENODEV; - break; + + bdev = raw_device_bindings[minor]; + if (bdev) { + dev = to_kdev_t(bdev->bd_dev); + rq.block_major = MAJOR(dev); + rq.block_minor = MINOR(dev); + } else { + rq.block_major = rq.block_minor = 0; } - dev = to_kdev_t(raw_device_bindings[minor]->bd_dev); - rq.block_major = MAJOR(dev); - rq.block_minor = MINOR(dev); err = copy_to_user((void *) arg, &rq, sizeof(rq)); } break; @@ -304,7 +307,12 @@ err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize); if (err) break; - +#if 0 + err = lock_kiovec(1, &iobuf, 1); + if (err) + break; +#endif + for (i=0; i < blocks; i++) b[i] = blocknr++; @@ -316,7 +324,7 @@ buf += err; } - unmap_kiobuf(iobuf); + unmap_kiobuf(iobuf); /* The unlock_kiobuf is implicit here */ if (err != iosize) break; --- linux-2.3.48-pre2/fs/buffer.c.~1~ Fri Feb 25 10:08:49 2000 +++ linux-2.3.48-pre2/fs/buffer.c Fri Feb 25 15:17:17 2000 @@ -1754,10 +1754,10 @@ mark_buffer_uptodate(bh, uptodate); kiobuf = bh->b_kiobuf; - if (!uptodate) - kiobuf->errno = -EIO; - if (atomic_dec_and_test(&kiobuf->io_count)) - kiobuf->end_io(kiobuf); + unlock_buffer(bh); + + kiobuf = bh->b_kiobuf; + end_kio_request(kiobuf, uptodate); } @@ -1766,8 +1766,7 @@ * for them to complete. Clean up the buffer_heads afterwards. 
*/ -static int do_kio(struct kiobuf *kiobuf, - int rw, int nr, struct buffer_head *bh[], int size) +static int do_kio(int rw, int nr, struct buffer_head *bh[], int size) { int iosize; int i; @@ -1778,18 +1777,20 @@ if (rw == WRITE) rw = WRITERAW; - atomic_add(nr, &kiobuf->io_count); - kiobuf->errno = 0; ll_rw_block(rw, nr, bh); - kiobuf_wait_for_io(kiobuf); - - spin_lock(&unused_list_lock); - iosize = 0; + spin_lock(&unused_list_lock); + for (i = nr; --i >= 0; ) { iosize += size; tmp = bh[i]; + if (buffer_locked(tmp)) { + spin_unlock(&unused_list_lock); + wait_on_buffer(tmp); + spin_lock(&unused_list_lock); + } + if (!buffer_uptodate(tmp)) { /* We are traversing bh'es in reverse order so clearing iosize on error calculates the @@ -1801,11 +1802,7 @@ spin_unlock(&unused_list_lock); - if (iosize) - return iosize; - if (kiobuf->errno) - return kiobuf->errno; - return -EIO; + return iosize; } /* @@ -1847,8 +1844,6 @@ if ((iobuf->offset & (size-1)) || (iobuf->length & (size-1))) return -EINVAL; - if (!iobuf->locked) - panic("brw_kiovec: iobuf not locked for I/O"); if (!iobuf->nr_pages) panic("brw_kiovec: iobuf not initialised"); } @@ -1861,10 +1856,15 @@ iobuf = iovec[i]; offset = iobuf->offset; length = iobuf->length; - + iobuf->errno = 0; + for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { map = iobuf->maplist[pageind]; - + if (!map) { + err = -EFAULT; + goto error; + } + while (length > 0) { blocknr = b[bufind++]; tmp = get_unused_buffer_head(0); @@ -1893,11 +1893,13 @@ length -= size; offset += size; + atomic_inc(&iobuf->io_count); + /* * Start the IO if we have got too much */ if (bhind >= KIO_MAX_SECTORS) { - err = do_kio(iobuf, rw, bhind, bh, size); + err = do_kio(rw, bhind, bh, size); if (err >= 0) transferred += err; else @@ -1915,7 +1917,7 @@ /* Is there any IO still left to submit? */ if (bhind) { - err = do_kio(iobuf, rw, bhind, bh, size); + err = do_kio(rw, bhind, bh, size); if (err >= 0) transferred += err; else --- linux-2.3.48-pre2/fs/iobuf.c.~1~ Sun Jan 23 20:34:33 2000 +++ linux-2.3.48-pre2/fs/iobuf.c Fri Feb 25 15:17:17 2000 @@ -12,18 +12,21 @@ static kmem_cache_t *kiobuf_cachep; -/* - * The default IO completion routine for kiobufs: just wake up - * the kiobuf, nothing more. 
- */ -void simple_wakeup_kiobuf(struct kiobuf *kiobuf) +void end_kio_request(struct kiobuf *kiobuf, int uptodate) { - wake_up(&kiobuf->wait_queue); + if ((!uptodate) && !kiobuf->errno) + kiobuf->errno = -EIO; + + if (atomic_dec_and_test(&kiobuf->io_count)) { + if (kiobuf->end_io) + kiobuf->end_io(kiobuf); + wake_up(&kiobuf->wait_queue); + } } -void __init kiobuf_init(void) +void __init kiobuf_setup(void) { kiobuf_cachep = kmem_cache_create("kiobuf", sizeof(struct kiobuf), @@ -33,6 +36,13 @@ panic("Cannot create kernel iobuf cache\n"); } +void kiobuf_init(struct kiobuf *iobuf) +{ + memset(iobuf, 0, sizeof(*iobuf)); + init_waitqueue_head(&iobuf->wait_queue); + iobuf->array_len = KIO_STATIC_PAGES; + iobuf->maplist = iobuf->map_array; +} int alloc_kiovec(int nr, struct kiobuf **bufp) { @@ -45,12 +55,7 @@ free_kiovec(i, bufp); return -ENOMEM; } - - memset(iobuf, 0, sizeof(*iobuf)); - init_waitqueue_head(&iobuf->wait_queue); - iobuf->end_io = simple_wakeup_kiobuf; - iobuf->array_len = KIO_STATIC_PAGES; - iobuf->maplist = iobuf->map_array; + kiobuf_init(iobuf); *bufp++ = iobuf; } @@ -64,6 +69,8 @@ for (i = 0; i < nr; i++) { iobuf = bufp[i]; + if (iobuf->locked) + unlock_kiovec(1, &iobuf); if (iobuf->array_len > KIO_STATIC_PAGES) kfree (iobuf->maplist); kmem_cache_free(kiobuf_cachep, bufp[i]); @@ -103,6 +110,9 @@ { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + + if (atomic_read(&kiobuf->io_count) == 0) + return; add_wait_queue(&kiobuf->wait_queue, &wait); repeat: --- linux-2.3.48-pre2/include/linux/iobuf.h.~1~ Fri Feb 25 14:59:49 2000 +++ linux-2.3.48-pre2/include/linux/iobuf.h Fri Feb 25 15:17:17 2000 @@ -29,6 +29,8 @@ #define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1) #define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2) +/* The main kiobuf struct used for all our IO! */ + struct kiobuf { int nr_pages; /* Pages actually referenced */ @@ -46,7 +48,6 @@ unsigned int locked : 1; /* If set, pages has been locked */ /* Always embed enough struct pages for 64k of IO */ - unsigned long page_array[KIO_STATIC_PAGES]; struct page * map_array[KIO_STATIC_PAGES]; /* Dynamic state for IO completion: */ @@ -57,14 +58,51 @@ }; +/* For true mmap() of kiobufs, we need to be able to refcount the number + * of vmas accessing the kiobuf to be able to clean up properly when the + * entire kiobuf is no longer being accessed. + * + * The kvmap semaphore is necessary to synchronise reference counting in + * all cases. It is not sufficient to rely on the mm semaphore for + * this, as vmap references are inherited over fork() and we need to do + * the right thing for vmaps which end up shared as a result. 
+ */ + +struct kiobuf_vmap; +typedef void kvmap_deref_fn (struct kiobuf_vmap *); + +struct kiobuf_vmap +{ + struct kiobuf * kiobuf; + void * private_data; + + struct semaphore sem; + + /* The following are always protected by the semaphore mutex */ + int refcount; + kvmap_deref_fn *deref_callback; +}; + + /* mm/memory.c */ int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len); +int map_kernel_kiobuf(struct kiobuf *, unsigned long va, size_t len); void unmap_kiobuf(struct kiobuf *iobuf); +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait); +int unlock_kiovec(int nr, struct kiobuf *iovec[]); + +/* mm/iomap.c */ + +#define KIOMAP_PREFAULT 0x0001 +int mmap_kiobuf(struct kiobuf_vmap *iobuf, struct vm_area_struct * vma); +void kvmap_init(struct kiobuf_vmap *, struct kiobuf *); /* fs/iobuf.c */ -void __init kiobuf_init(void); +void __init kiobuf_setup(void); +void kiobuf_init(struct kiobuf *); +void end_kio_request(struct kiobuf *, int); void simple_wakeup_kiobuf(struct kiobuf *); int alloc_kiovec(int nr, struct kiobuf **); void free_kiovec(int nr, struct kiobuf **); --- linux-2.3.48-pre2/init/main.c.~1~ Thu Feb 17 11:50:55 2000 +++ linux-2.3.48-pre2/init/main.c Fri Feb 25 15:17:17 2000 @@ -534,7 +534,7 @@ vma_init(); buffer_init(mempages); page_cache_init(mempages); - kiobuf_init(); + kiobuf_setup(); signals_init(); bdev_init(); inode_init(); --- linux-2.3.48-pre2/kernel/ksyms.c.~1~ Fri Feb 25 10:08:50 2000 +++ linux-2.3.48-pre2/kernel/ksyms.c Fri Feb 25 15:17:17 2000 @@ -43,6 +43,7 @@ #include #include #include +#include #if defined(CONFIG_PROC_FS) #include @@ -156,11 +157,6 @@ EXPORT_SYMBOL(mark_buffer_dirty); EXPORT_SYMBOL(__mark_buffer_dirty); EXPORT_SYMBOL(__mark_inode_dirty); -EXPORT_SYMBOL(free_kiovec); -EXPORT_SYMBOL(brw_kiovec); -EXPORT_SYMBOL(alloc_kiovec); -EXPORT_SYMBOL(expand_kiobuf); -EXPORT_SYMBOL(unmap_kiobuf); EXPORT_SYMBOL(get_empty_filp); EXPORT_SYMBOL(init_private_file); EXPORT_SYMBOL(filp_open); @@ -339,6 +335,23 @@ /* Various random spinlocks we want to export */ EXPORT_SYMBOL(tqueue_lock); #endif + +/* Kiobufs */ +EXPORT_SYMBOL(kiobuf_init); +EXPORT_SYMBOL(kvmap_init); + +EXPORT_SYMBOL(alloc_kiovec); +EXPORT_SYMBOL(free_kiovec); +EXPORT_SYMBOL(expand_kiobuf); + +EXPORT_SYMBOL(map_user_kiobuf); +EXPORT_SYMBOL(map_kernel_kiobuf); +EXPORT_SYMBOL(unmap_kiobuf); +EXPORT_SYMBOL(mmap_kiobuf); + +EXPORT_SYMBOL(lock_kiovec); +EXPORT_SYMBOL(unlock_kiovec); +EXPORT_SYMBOL(brw_kiovec); /* autoirq from drivers/net/auto_irq.c */ EXPORT_SYMBOL(autoirq_setup); --- linux-2.3.48-pre2/mm/Makefile.~1~ Fri Dec 10 15:24:41 1999 +++ linux-2.3.48-pre2/mm/Makefile Fri Feb 25 15:17:17 2000 @@ -10,7 +10,7 @@ O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ - page_alloc.o swap_state.o swapfile.o numa.o + page_alloc.o swap_state.o swapfile.o numa.o iomap.o ifeq ($(CONFIG_HIGHMEM),y) O_OBJS += highmem.o --- linux-2.3.48-pre2/mm/iomap.c.~1~ Fri Feb 25 15:17:17 2000 +++ linux-2.3.48-pre2/mm/iomap.c Fri Feb 25 15:17:17 2000 @@ -0,0 +1,144 @@ +/* + * iomap.c + * + * Perform mmap()ing of arbitrary kiobufs. + * + * Written by Stephen C. Tweedie, 2000 + * (C) Red Hat, Inc. 2000 + * + * Refer to Documentation/kiobuf* for instructions. + * + * The kiobuf_vmap structure contains the information necessary to track + * the mmap of a kiobuf into user space. The rest of this file defines + * functions necessary to maintain that mapping. + */ + +#include +#include +#include + +#define dprintk(x...) 
+ +/* + * Open/close methods for the kvmap only need to track the reference counts. + * + * Both open and close will be called with the vma mm semaphore held, + * but without the mm page lock. + */ + +static void kvmap_open(struct vm_area_struct *vma) +{ + struct kiobuf_vmap *vmap; + vmap = (struct kiobuf_vmap *) vma->vm_private_data; + + /* Just increment the refcount on open: there has been an unmap + * or fork increasing the number of vmas on this kvmap. */ + down(&vmap->sem); + vmap->refcount++; + up(&vmap->sem); +} + +static void kvmap_close(struct vm_area_struct *vma) +{ + struct kiobuf_vmap *vmap; + vmap = (struct kiobuf_vmap *) vma->vm_private_data; + + down(&vmap->sem); + if (--vmap->refcount == 0) { + if (vmap->deref_callback) + vmap->deref_callback(vmap); + } + up(&vmap->sem); +} + + +static struct page * kvmap_nopage(struct vm_area_struct * vma, + unsigned long address, + int no_share) +{ + unsigned long pgoff; + struct kiobuf_vmap *vmap; + struct kiobuf *iobuf; + struct page *page; + + vmap = (struct kiobuf_vmap *) vma->vm_private_data; + iobuf = vmap->kiobuf; + pgoff = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; + dprintk(KERN_INFO __FUNCTION__ "(%p, %p, %d): offset %lu\n", + vma, (void *) address, no_share, pgoff); + + if (no_share) + BUG(); + + /* mmap() of a larger region than specified by the kiobuf + * results in a SIGBUS, as does faulting on a page not present + * in the kiobuf. */ + + if (pgoff > iobuf->nr_pages) { + dprintk(KERN_INFO __FUNCTION__ ": no such page in iobuf\n"); + return NULL; + } + + page = iobuf->maplist[pgoff]; + dprintk(KERN_INFO __FUNCTION__ ": found page %p\n", page); + if (!page) + return NULL; + + /* We need to obey the same rules as copy_page_range() when it + * comes to maintaining reference counts on pages in the kvmap. + * We only increment the refcount for normal, physically + * present, unreserved pages. */ + + if ((page-mem_map) < max_mapnr && ! PageReserved(page)) { + get_page(page); + dprintk(KERN_INFO __FUNCTION__ ": page count now %d\n", + atomic_read(&page->count)); + } + + return page; +} + + +static struct vm_operations_struct kio_vmops = +{ + open: kvmap_open, + close: kvmap_close, + nopage: kvmap_nopage, +}; + +void kvmap_init(struct kiobuf_vmap *vmap, struct kiobuf *iobuf) +{ + memset(vmap, 0, sizeof(*vmap)); + vmap->kiobuf = iobuf; + init_MUTEX(&vmap->sem); +} + +/* + * This routine is intended to be called by the mmap_* methods of other + * device drivers. + * + * Returns <0 on error, 1 if this is the first reference to the vmap, else 0. + * + * The vmap semaphore must be held before calling this. + */ + +int mmap_kiobuf(struct kiobuf_vmap *vmap, struct vm_area_struct * vma) +{ + int retval = 0; + + dprintk(KERN_INFO __FUNCTION__ "(vmap %p, vma %p)\n", vmap, vma); + + vma->vm_ops = &kio_vmops; + vma->vm_private_data = vmap; + vma->vm_flags |= VM_LOCKED; /* Don't swap out kvmaps! */ + + /* This is supposed to be a shared map --- reject COW mappings. */ + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vmap->refcount++; + if (vmap->refcount == 1) + retval = 1; + + return retval; +} --- linux-2.3.48-pre2/mm/memory.c.~1~ Fri Feb 25 10:08:50 2000 +++ linux-2.3.48-pre2/mm/memory.c Fri Feb 25 15:17:17 2000 @@ -408,28 +408,25 @@ return pte_page(*pte); } - printk(KERN_ERR "Missing page in follow_page\n"); return NULL; } /* - * Given a physical address, is there a useful struct page pointing to it? + * Given a physical address, is there a useful struct page pointing to + * it? 
This may become more complex in the future if we start dealing + * with IO-aperture pages in kiobufs. */ -struct page * get_page_map(struct page *page, unsigned long vaddr) +static inline struct page * get_page_map(struct page *page) { - if (MAP_NR(vaddr) >= max_mapnr) - return 0; - if (page == ZERO_PAGE(vaddr)) - return 0; - if (PageReserved(page)) + if (page > (mem_map + max_mapnr)) return 0; return page; } /* * Force in an entire range of pages from the current process's user VA, - * and pin and lock the pages for IO. + * and pin them in physical memory. */ #define dprintk(x...) @@ -440,8 +437,6 @@ struct mm_struct * mm; struct vm_area_struct * vma = 0; struct page * map; - int doublepage = 0; - int repeat = 0; int i; /* Make sure the iobuf is not already mapped somewhere. */ @@ -457,11 +452,10 @@ if (err) return err; - repeat: down(&mm->mmap_sem); err = -EFAULT; - iobuf->locked = 1; + iobuf->locked = 0; iobuf->offset = va & ~PAGE_MASK; iobuf->length = len; @@ -481,16 +475,15 @@ spin_lock(&mm->page_table_lock); map = follow_page(ptr); if (!map) { + spin_unlock(&mm->page_table_lock); dprintk (KERN_ERR "Missing page in map_user_kiobuf\n"); - goto retry; + goto out_unlock; } - map = get_page_map(map, ptr); - if (map) { - if (TryLockPage(map)) { - goto retry; - } + map = get_page_map(map); + if (map) atomic_inc(&map->count); - } + else + printk (KERN_INFO "Mapped page missing [%d]\n", i); spin_unlock(&mm->page_table_lock); iobuf->maplist[i] = map; iobuf->nr_pages = ++i; @@ -507,42 +500,110 @@ unmap_kiobuf(iobuf); dprintk ("map_user_kiobuf: end %d\n", err); return err; +} - retry: - /* - * Undo the locking so far, wait on the page we got to, and try again. - */ - spin_unlock(&mm->page_table_lock); - unmap_kiobuf(iobuf); - up(&mm->mmap_sem); +/* + * Force in an entire range of pages from the current process's kernel + * VA. We do not expect to see unmapped pages here, so no page faults + * will be taken. The map_kernel_kiobuf() routine should work happily + * both for normal kernel allocations and for vmalloc()ed regions. + */ - /* - * Did the release also unlock the page we got stuck on? - */ - if (map) { - if (!PageLocked(map)) { - /* If so, we may well have the page mapped twice - * in the IO address range. Bad news. Of - * course, it _might_ * just be a coincidence, - * but if it happens more than * once, chances - * are we have a double-mapped page. */ - if (++doublepage >= 3) { - return -EINVAL; - } +static inline int map_pte_range(struct page **pmap, pmd_t * pmd, unsigned long va, unsigned long end) +{ + pte_t * pte; + int nr_pages = 0; + struct page *page; + + if (pmd_none(*pmd)) + return 0; + + pte = pte_offset(pmd, va); + + do { + if (pte_none(*pte)) + page = NULL; + else { + page = pte_page(*pte); + atomic_inc(&page->count); } + *pmap++ = page; + + va += PAGE_SIZE; + pte++; + nr_pages++; + } while (va < end && (va && PMD_MASK)); - /* - * Try again... 
- */ - wait_on_page(map); - } + return nr_pages; +} + +static inline int map_pmd_range(struct page **pmap, pgd_t * dir, unsigned long va, unsigned long end) +{ + pmd_t * pmd; + int total_pages = 0; + int nr_pages; - if (++repeat < 16) { - ptr = va & PAGE_MASK; - goto repeat; + if (pgd_none(*dir)) + return 0; + pmd = pmd_offset(dir, va); + + do { + nr_pages = map_pte_range(pmap, pmd, va, end); + pmd++; + pmap += nr_pages; + va += (nr_pages << PAGE_SHIFT); + total_pages += nr_pages; + } while (nr_pages && va < end && (va && PGDIR_MASK)); + return total_pages; +} + +int map_kernel_kiobuf(struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct page ** pmap; + int nr_pages; + pgd_t * dir; + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_kernel_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + err = -EFAULT; + iobuf->locked = 0; + iobuf->nr_pages = 0; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + spin_lock(&mm->page_table_lock); + + dir = pgd_offset(mm, va); + pmap = iobuf->maplist; + + while (ptr < end) { + nr_pages = map_pmd_range(pmap, dir, ptr, end); + if (!nr_pages) + break; + dir++; + ptr += (nr_pages << PAGE_SHIFT); + pmap += nr_pages; + iobuf->nr_pages += nr_pages; } - return -EAGAIN; + + spin_unlock(&mm->page_table_lock); + dprintk ("map_kernel_kiobuf: end OK\n"); + return 0; } @@ -558,9 +619,9 @@ for (i = 0; i < iobuf->nr_pages; i++) { map = iobuf->maplist[i]; - - if (map && iobuf->locked) { - UnlockPage(map); + if (map) { + if (iobuf->locked) + UnlockPage(map); __free_page(map); } } @@ -568,6 +629,109 @@ iobuf->nr_pages = 0; iobuf->locked = 0; } + +/* + * Lock down all of the pages of a kiovec for IO. + * + * If any page is mapped twice in the kiovec, we return the error -EINVAL. + * + * The optional wait parameter causes the lock call to block until all + * pages can be locked if set. If wait==0, the lock operation is + * aborted if any locked pages are found and -EAGAIN is returned. + */ + +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + int doublepage = 0; + int repeat = 0; + + repeat: + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (iobuf->locked) + continue; + iobuf->locked = 1; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + + if (TryLockPage(page)) + goto retry; + } + } + + return 0; + + retry: + + /* + * We couldn't lock one of the pages. Undo the locking so far, + * wait on the page we got to, and try again. + */ + + unlock_kiovec(nr, iovec); + if (!wait) + return -EAGAIN; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(page)) { + /* + * If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ just be a coincidence, + * but if it happens more than once, chances + * are we have a double-mapped page. + */ + if (++doublepage >= 3) + return -EINVAL; + + /* Try again... */ + wait_on_page(page); + } + + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + +/* + * Unlock all of the pages of a kiovec after IO. 
+ */ + +int unlock_kiovec(int nr, struct kiobuf *iovec[]) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (!iobuf->locked) + continue; + iobuf->locked = 0; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + UnlockPage(page); + } + } + return 0; +} + static inline void zeromap_pte_range(pte_t * pte, unsigned long address, unsigned long size, pgprot_t prot) --- linux-2.3.48-pre2/mm/vmscan.c.~1~ Thu Feb 17 11:50:53 2000 +++ linux-2.3.48-pre2/mm/vmscan.c Fri Feb 25 15:17:17 2000 @@ -259,7 +259,7 @@ unsigned long end; /* Don't swap out areas which are locked down */ - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & (VM_LOCKED | VM_IO)) return 0; pgdir = pgd_offset(vma->vm_mm, address);