SOURCES: kernel-desktop-fcache.patch (NEW) - filesystem cache, fro...

Wed Jun 21 16:02:01 CEST 2006

Author: sparky                       Date: Wed Jun 21 14:02:01 2006 GMT
Module: SOURCES                       Tag: HEAD
---- Log message:
- filesystem cache, from Con Kolivas patchset:
  http://www.kernel.org/pub/linux/kernel/people/ck/patches/2.6/2.6.17/2.6.17-ck1/patch-2.6.17-ck1.bz2

---- Files affected:
SOURCES:
   kernel-desktop-fcache.patch (NONE -> 1.1)  (NEW)

---- Diffs:

================================================================
Index: SOURCES/kernel-desktop-fcache.patch
diff -u /dev/null SOURCES/kernel-desktop-fcache.patch:1.1

--- /dev/null	Wed Jun 21 16:02:01 2006
+++ SOURCES/kernel-desktop-fcache.patch	Wed Jun 21 16:01:56 2006
@@ -0,0 +1,1783 @@
+diff -Nur linux-2.6.17/block/ll_rw_blk.c linux-2.6.17.fcache/block/ll_rw_blk.c
+--- linux-2.6.17/block/ll_rw_blk.c	2006-06-21 15:52:12.000000000 +0200
++++ linux-2.6.17.fcache/block/ll_rw_blk.c	2006-06-21 15:58:45.000000000 +0200
+@@ -2817,12 +2817,10 @@
+ 	 */
+ 	if (bio_rw_ahead(bio) || bio_failfast(bio))
+ 		req->flags |= REQ_FAILFAST;
+-
+-	/*
+-	 * REQ_BARRIER implies no merging, but lets make it explicit
+-	 */
+ 	if (unlikely(bio_barrier(bio)))
+-		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
++		req->flags |= REQ_HARDBARRIER;
++	if (!bio_mergeable(bio))
++		req->flags |= REQ_NOMERGE;
+ 
+ 	req->errors = 0;
+ 	req->hard_sector = req->sector = bio->bi_sector;
+@@ -2870,7 +2868,7 @@
+ 
+ 	spin_lock_irq(q->queue_lock);
+ 
+-	if (unlikely(barrier) || elv_queue_empty(q))
++	if (!bio_mergeable(bio) || elv_queue_empty(q))
+ 		goto get_rq;
+ 
+ 	el_ret = elv_merge(q, &req, bio);
+@@ -3109,6 +3107,7 @@
+ 
+ 	BIO_BUG_ON(!bio->bi_size);
+ 	BIO_BUG_ON(!bio->bi_io_vec);
++	BIO_BUG_ON(bio->bi_next);
+ 	bio->bi_rw |= rw;
+ 	if (rw & WRITE)
+ 		mod_page_state(pgpgout, count);
+diff -Nur linux-2.6.17/drivers/block/fcache.c linux-2.6.17.fcache/drivers/block/fcache.c
+--- linux-2.6.17/drivers/block/fcache.c	1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.17.fcache/drivers/block/fcache.c	2006-06-21 15:58:45.000000000 +0200
+@@ -0,0 +1,1475 @@
++/*
++ * A frontend cache for a block device. The purpose is to speedup a
++ * fairly random but repeated read work load, like the boot of a system.
++ *
++ * When run in priming mode, fcache allocates and writes data read from
++ * the source drive to our extent cache in the order in which they are
++ * accessed. When later run in non-priming mode, data accessed in the same
++ * order will be linearly available in the cache.
++ *
++ * Performance when priming is slower than non-fcache usage would be. If
++ * the fcache is located on another disk, the hit should be small. If the
++ * fcache is located on the same disk (another partition), it runs
++ * at about half the speed. Non-priming performance should be fairly
++ * similar on same/other disk.
++ *
++ * On-disk format is as follows:
++ *	Block0:		header
++ *	Block1..X	extent maps
++ *	BlockX+1..Y	extent data
++ *
++ * Copyright (C) 2006 Jens Axboe <axboe at suse.de>
++ *
++ */
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/moduleparam.h>
++#include <linux/sched.h>
++#include <linux/blkdev.h>
++#include <linux/prio_tree.h>
++#include <linux/buffer_head.h>
++#include <linux/slab.h>
++
++#define FCACHE_MAGIC	0x61786663
++#define FCACHE_VERSION	0x02
++
++#define FCACHE_HEADER_BLOCK	0
++#define FCACHE_EXTENT_BLOCK	1
++
++#undef FCACHE_PAGES_PROTECTED
++
++/*
++ * State of the cache. A single file-scope instance, fcache_dev, is defined
++ * right after this structure.
++ */
++struct fcache_dev {
++	/* device that fcache_rw_page() issues header/extent io to */
++	struct block_device *bdev;
++	/* device whose name fcache_fill_header() stores in header->fs_dev */
++	struct block_device *fs_bdev;
++	make_request_fn *mfn;
++	/* extents, keyed by the fs sector range they cover */
++	struct prio_tree_root prio_root;
++	/* stored in the header as cache_blocks_used */
++	unsigned long next_cache_block;
++	/* updated by fcache_write_extents() */
++	unsigned long nr_extents;
++	/* copied from the header by fcache_write_new_header() */
++	unsigned long max_extents;
++	unsigned int old_bs;
++	/* taken around prio tree insertion, and around lookups when priming */
++	spinlock_t lock;
++
++	sector_t cache_start_sector;
++	unsigned long cache_blocks;
++	sector_t fs_start_sector;
++	sector_t fs_sectors;
++
++	/* NOTE(review): presumably a bitmask of FDEV_F_* bits - confirm */
++	unsigned long flags;
++	/* non-zero makes fcache_lookup_extent() lock the prio tree */
++	int priming;
++	/* stored in the header as serial */
++	int serial;
++	int chop_ios;
++
++	struct list_head list;
++	struct work_struct work;
++
++	/*
++	 * stats
++	 */
++	unsigned int ios[2];
++	unsigned int hits;
++	unsigned int misses;
++	unsigned int overwrites;
++};
++
++/*
++ * NOTE(review): presumably bit numbers for fcache_dev->flags; the users are
++ * not visible in this part of the file - confirm.
++ */
++enum {
++	FDEV_F_DOWN = 0,
++};
++
++/*
++ * The file-scope cache device instance.
++ */
++static struct fcache_dev fcache_dev;
++
++/*
++ * Module parameter, world readable and not writable (0444). How it is
++ * consumed is not visible in this part of the file.
++ */
++static int disable;
++module_param(disable, int, 0444);
++
++/*
++ * Per-request bookkeeping shared by the ios a request is chopped into.
++ * Released with mempool_free() to fed_pool once the last io completes.
++ */
++struct fcache_endio_data {
++	struct fcache_dev *fdev;
++	/* copied into the new extent by fcache_instantiate_extent() */
++	sector_t fs_sector;
++	unsigned int fs_size;
++	sector_t cache_sector;
++	/* outstanding ios; whoever drops it to zero ends the original bio */
++	atomic_t completions;
++	/* the original bio, ended once all ios are done */
++	struct bio *bio;
++	/* set to -EIO by the endio handlers if an io was not uptodate */
++	int io_error;
++	struct list_head list;
++};
++
++/*
++ * Maps a file system block to the fcache.
++ *
++ * fcache_write_extents() copies the whole structure, prio_node included,
++ * into the on-disk extent pages, and fcache_fill_header() records its size
++ * in header->extent_size. Changing the layout therefore changes the on-disk
++ * format.
++ */
++struct fcache_extent {
++	sector_t fs_sector;	/* real device offset */
++	unsigned int fs_size;	/* extent length */
++	sector_t cache_sector;	/* cache device offset */
++
++	struct prio_tree_node prio_node;
++};
++
++/*
++ * Header on fcache device - will take up the first page of data, so
++ * plenty of room to go around.
++ *
++ * Filled in by fcache_fill_header() and written to FCACHE_HEADER_BLOCK with
++ * fcache_rw_page(). This is the on-disk layout, so the field order matters.
++ */
++struct fcache_header {
++	u32 magic;		/* fcache magic */
++	u32 version;		/* fcache version */
++	u32 nr_extents;		/* nr of extents in cache */
++	u32 max_extents;	/* max nr of extents available */
++	u32 serial;		/* fs and cache serial */
++	u32 extent_offset;	/* where extents start */
++	u64 fs_start_sector;	/* where fs starts */
++	u64 fs_sectors;		/* how big fs is */
++	char fs_dev[BDEVNAME_SIZE];	/* fs partition */
++	u64 cache_blocks;	/* number of blocks in cache */
++	u64 cache_blocks_used;	/* used blocks in cache */
++	u16 sector_t_size;	/* user space helper */
++	u16 extent_size;	/* user space helper */
++};
++
++/*
++ * Shift that turns a page sized block index into a 512-byte sector number,
++ * as done by fcache_rw_page().
++ */
++#define BLOCK_SHIFT	(PAGE_SHIFT - 9)
++
++/* slab that struct fcache_extent objects are allocated from and freed to */
++static struct kmem_cache *fcache_slab;
++static struct kmem_cache *fcache_fed_slab;
++/* pool that struct fcache_endio_data objects are returned to */
++static mempool_t *fed_pool;
++static struct workqueue_struct *fcache_workqueue;
++
++/*
++ * End io handler for the bio issued by fcache_rw_page(). Returns 1 while the
++ * bio still has bytes outstanding; once bi_size is zero it signals the
++ * completion stored in bi_private and returns 0.
++ */
++static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err)
++{
++	if (!bio->bi_size) {
++		complete(bio->bi_private);
++		return 0;
++	}
++
++	return 1;
++}
++
++/*
++ * Reads or writes (as selected by 'rw') a single page at page sized block
++ * 'index' of fdev->bdev, and waits for the io to complete.
++ *
++ * Returns 0 on success, or -EIO if the bio did not end up uptodate.
++ */
++static int fcache_rw_page(struct fcache_dev *fdev, sector_t index,
++			  struct page *page, int rw)
++{
++	DECLARE_COMPLETION(done);
++	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
++	int err;
++
++	bio->bi_bdev = fdev->bdev;
++	bio->bi_sector = index << BLOCK_SHIFT;
++	bio->bi_end_io = fcache_rw_page_endio;
++	bio->bi_private = &done;
++	bio->bi_rw |= (1 << BIO_RW_SYNC);
++	bio_add_page(bio, page, PAGE_SIZE, 0);
++
++	submit_bio(rw, bio);
++	wait_for_completion(&done);
++
++	/* sample the result before dropping our reference to the bio */
++	err = bio_flagged(bio, BIO_UPTODATE) ? 0 : -EIO;
++	bio_put(bio);
++	return err;
++}
++
++/*
++ * Fill 'header' from the current state of 'fdev', recording 'nr_extents' as
++ * the number of extents held in the cache.
++ */
++static inline void fcache_fill_header(struct fcache_dev *fdev,
++				      struct fcache_header *header,
++				      unsigned int nr_extents)
++{
++	const size_t fe_size = sizeof(struct fcache_extent);
++
++	header->magic = FCACHE_MAGIC;
++	header->version = FCACHE_VERSION;
++	header->serial = fdev->serial;
++	header->nr_extents = nr_extents;
++
++	/*
++	 * Work out how many extents can be described, and from that how many
++	 * pages the extent maps need - data starts right after them. Worst
++	 * case is 1 page of data per extent, and the first page is reserved
++	 * for the header.
++	 */
++	header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - fe_size);
++	header->extent_offset = 1 + (header->max_extents * fe_size / PAGE_SIZE);
++
++	bdevname(fdev->fs_bdev, header->fs_dev);
++	header->fs_start_sector = fdev->fs_start_sector;
++	header->fs_sectors = fdev->fs_sectors;
++
++	header->cache_blocks = fdev->cache_blocks;
++	header->cache_blocks_used = fdev->next_cache_block;
++
++	header->sector_t_size = sizeof(sector_t);
++	header->extent_size = fe_size;
++}
++
++/*
++ * Build a header describing an empty cache (0 extents), take over its
++ * extent_offset and max_extents as the device's next_cache_block and
++ * max_extents, and write it to the header block.
++ *
++ * Returns 0 on success, -ENOMEM if no page could be allocated, or the error
++ * from fcache_rw_page().
++ */
++static int fcache_write_new_header(struct fcache_dev *fdev)
++{
++	struct page *hdr_page = alloc_page(GFP_HIGHUSER);
++	struct fcache_header *hdr;
++	int err;
++
++	if (unlikely(!hdr_page))
++		return -ENOMEM;
++
++	/* the page is allocated with GFP_HIGHUSER, so map it to fill it in */
++	hdr = kmap_atomic(hdr_page, KM_USER0);
++	clear_page(hdr);
++	fcache_fill_header(fdev, hdr, 0);
++	fdev->max_extents = hdr->max_extents;
++	fdev->next_cache_block = hdr->extent_offset;
++	kunmap_atomic(hdr, KM_USER0);
++
++	printk("fcache: new header: first block %lu, max %lu\n",
++				fdev->next_cache_block, fdev->max_extents);
++
++	err = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, hdr_page, WRITE);
++	__free_page(hdr_page);
++	return err;
++}
++
++/*
++ * Remove every extent from the prio tree and return it to fcache_slab.
++ */
++static void fcache_free_prio_tree(struct fcache_dev *fdev)
++{
++	struct prio_tree_iter iter;
++	struct prio_tree_node *node;
++
++	/*
++	 * The iterator is set up afresh for each node we take out, wish
++	 * there was a better way...
++	 */
++	for (;;) {
++		prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
++		node = prio_tree_next(&iter);
++		if (!node)
++			return;
++
++		prio_tree_remove(&fdev->prio_root, node);
++		kmem_cache_free(fcache_slab,
++			prio_tree_entry(node, struct fcache_extent, prio_node));
++	}
++}
++
++/*
++ * Write the extents in the prio tree to the cache device.
++ *
++ * First a header with nr_extents set to 0 is written, then the extents are
++ * packed into page size chunks starting at FCACHE_EXTENT_BLOCK. The real
++ * header is not written here, that is left to fcache_write_header().
++ *
++ * fdev->nr_extents is set to the number of extents walked.
++ *
++ * Returns 0 on success, -ENOMEM if no page could be allocated, or the error
++ * from the first fcache_rw_page() call that failed.
++ */
++static int fcache_write_extents(struct fcache_dev *fdev)
++{
++	struct fcache_header *header;
++	sector_t index, sectors;
++	unsigned int nr_extents, this_extents;
++	struct fcache_extent *fe;
++	struct prio_tree_iter iter;
++	struct prio_tree_node *node;
++	struct page *page;
++	void *p;
++	int ret;
++
++	page = alloc_page(GFP_KERNEL);
++	if (unlikely(!page))
++		return -ENOMEM;
++
++	header = page_address(page);
++	clear_page(header);
++	fcache_fill_header(fdev, header, 0);
++	ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
++	if (ret)
++		goto err;
++
++	/*
++	 * Now write the extents in page size chunks.
++	 */
++	p = page_address(page);
++	clear_page(p);
++	index = FCACHE_EXTENT_BLOCK;
++	sectors = 0;
++	this_extents = nr_extents = 0;
++
++	prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
++
++	do {
++		node = prio_tree_next(&iter);
++		if (!node)
++			break;
++
++		fe = prio_tree_entry(node, struct fcache_extent, prio_node);
++		nr_extents++;
++		this_extents++;
++		sectors += fe->fs_size >> 9;
++		memcpy(p, fe, sizeof(*fe));
++		p += sizeof(*fe);
++
++		/*
++		 * Flush the page once another extent would no longer fit.
++		 */
++		if ((this_extents + 1) * sizeof(*fe) > PAGE_SIZE) {
++			ret = fcache_rw_page(fdev, index, page, WRITE);
++			if (ret)
++				break;
++
++			this_extents = 0;
++			index++;
++			p = page_address(page);
++		}
++	} while (1);
++
++	/*
++	 * Flush the trailing, partially filled page. If a write failed above
++	 * we left the loop with this_extents still set; don't retry in that
++	 * case, as a successful retry would overwrite (and so hide) the
++	 * error.
++	 */
++	if (!ret && this_extents)
++		ret = fcache_rw_page(fdev, index, page, WRITE);
++
++	fdev->nr_extents = nr_extents;
++	printk("fcache: wrote %d extents, holding %llu sectors of data\n",
++				nr_extents, (unsigned long long) sectors);
++err:
++	__free_page(page);
++	return ret;
++}
++
++/*
++ * Read the header block, refresh it from 'fdev' with the current extent
++ * count, and write it back.
++ *
++ * Returns 0 on success, -ENOMEM if no page could be allocated, or the error
++ * from fcache_rw_page().
++ */
++static int fcache_write_header(struct fcache_dev *fdev)
++{
++	struct page *hdr_page = alloc_page(GFP_KERNEL);
++	int err;
++
++	if (unlikely(!hdr_page))
++		return -ENOMEM;
++
++	err = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, hdr_page, READ);
++	if (err)
++		goto out;
++
++	fcache_fill_header(fdev, page_address(hdr_page), fdev->nr_extents);
++	err = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, hdr_page, WRITE);
++	printk("fcache: wrote header (extents=%lu,serial=%d)\n",
++					fdev->nr_extents, fdev->serial);
++out:
++	__free_page(hdr_page);
++	return err;
++}
++
++/*
++ * Insert 'fe' into the prio tree, indexed by the range of fs sectors it
++ * covers. The insertion is done under fdev->lock with interrupts disabled.
++ */
++static void fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe)
++{
++	unsigned long irq_flags;
++
++	INIT_PRIO_TREE_NODE(&fe->prio_node);
++	fe->prio_node.start = fe->fs_sector;
++	/* fs_size is in bytes, the index is in sectors and is inclusive */
++	fe->prio_node.last = fe->fs_sector + (fe->fs_size >> 9) - 1;
++
++	spin_lock_irqsave(&fdev->lock, irq_flags);
++	prio_tree_insert(&fdev->prio_root, &fe->prio_node);
++	spin_unlock_irqrestore(&fdev->lock, irq_flags);
++}
++
++/*
++ * Upper bound on the number of extents a single lookup returns.
++ */
++#define MAX_FE	16
++
++/*
++ * Find the extents that overlap 'bytes' bytes starting at sector 'offset'.
++ * This serves both reads that may be satisfied from the cache, and writes
++ * that need to know which existing extents they conflict with.
++ *
++ * The extents found are stored in 'map', which must have room for MAX_FE
++ * entries. Returns the number of entries stored.
++ */
++static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset,
++				unsigned int bytes, struct fcache_extent **map)
++{
++	struct prio_tree_iter iter;
++	struct prio_tree_node *pn;
++	int found;
++
++	prio_tree_iter_init(&iter, &fdev->prio_root, offset,
++			    offset + (bytes >> 9) - 1);
++
++	/*
++	 * The prio tree is only modified while priming, so that is the only
++	 * time the walk needs the lock.
++	 */
++	if (fdev->priming)
++		spin_lock_irq(&fdev->lock);
++
++	for (found = 0; found < MAX_FE; found++) {
++		pn = prio_tree_next(&iter);
++		if (!pn)
++			break;
++
++		map[found] = prio_tree_entry(pn, struct fcache_extent, prio_node);
++	}
++
++	if (fdev->priming)
++		spin_unlock_irq(&fdev->lock);
++
++	return found;
++}
++
++/*
++ * Our data write is done, now allocate an extent describing it and insert
++ * that into the prio tree.
++ *
++ * Returns 0 on success, -ENOMEM if the extent could not be allocated.
++ */
++static int fcache_instantiate_extent(struct fcache_dev *fdev,
++				     struct fcache_endio_data *fed)
++{
++	struct fcache_extent *fe = kmem_cache_alloc(fcache_slab, GFP_ATOMIC);
++
++	if (!fe)
++		return -ENOMEM;
++
++	fe->fs_sector = fed->fs_sector;
++	fe->fs_size = fed->fs_size;
++	fe->cache_sector = fed->cache_sector;
++	fcache_tree_link(fdev, fe);
++
++	return 0;
++}
++
++/*
++ * Hang on to the bio and its pages - ideally we would want to ensure
++ * that the page data doesn't change between calling this function and
++ * fcache_put_bio_pages() as well...
++ *
++ * The body is only compiled if FCACHE_PAGES_PROTECTED is defined; this file
++ * #undefs it near the top, so as it stands this function does nothing.
++ */
++static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio)
++{
++	/*
++	 * Currently stubbed out, as we cannot end the bio read before
++	 * the write completes without also making sure that the pages
++	 * don't get reused for something else in the mean time.
++	 */
++#ifdef FCACHE_PAGES_PROTECTED
++	struct bio_vec *bvec;
++	int i;
++
++	/* one reference on the bio itself, and one on each of its pages */
++	bio_get(bio);
++
++	__bio_for_each_segment(bvec, bio, i, 0)
++		get_page(bvec->bv_page);
++#endif
++}
++
++/*
++ * Counterpart of fcache_get_bio_pages(): drop the reference on each page of
++ * the bio, then the reference on the bio itself. Like its counterpart, the
++ * body is only compiled if FCACHE_PAGES_PROTECTED is defined.
++ */
++static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio)
++{
++#ifdef FCACHE_PAGES_PROTECTED
++	struct bio_vec *bvec;
++	int i;
++
++	__bio_for_each_segment(bvec, bio, i, 0)
++		put_page(bvec->bv_page);
++
++	bio_put(bio);
++#endif
++}
++
++/*
++ * Drop one completion from 'fed'. Whoever drops the last one ends the
++ * original bio (without error), adds the extent to the prio tree if the
++ * cache write went ok, and frees 'fed'.
++ */
++static void fcache_chop_write_done(struct fcache_endio_data *fed)
++{
++	struct fcache_dev *fdev;
++	struct bio *orig_bio;
++
++	/*
++	 * Only the last io to complete has anything to do.
++	 */
++	if (!atomic_dec_and_test(&fed->completions))
++		return;
++
++	fdev = fed->fdev;
++	orig_bio = fed->bio;
++
++	/*
++	 * Let go of the original bio and its pages, then end the read.
++	 */
++	fcache_put_bio_pages(fdev, orig_bio);
++	bio_endio(orig_bio, orig_bio->bi_size, 0);
++
++	/*
++	 * The extent is only added if the write completed ok. A failure to
++	 * allocate the extent is not reported any further.
++	 */
++	if (!fed->io_error)
++		fcache_instantiate_extent(fdev, fed);
++
++	mempool_free(fed, fed_pool);
++}
++
++/*
++ * End io handler for a data write to the cache. Returns 1 while the bio
++ * still has bytes outstanding. Once it is fully done, a failure is noted in
++ * the shared endio data, the bio is put, and the completion is passed on to
++ * fcache_chop_write_done().
++ */
++static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes,
++				     int err)
++{
++	struct fcache_endio_data *fed = bio->bi_private;
++
++	if (bio->bi_size)
++		return 1;
++
++	if (!bio_flagged(bio, BIO_UPTODATE))
++		fed->io_error = -EIO;
++
++	bio_put(bio);
++	fcache_chop_write_done(fed);
++
++	return 0;
++}
++
++/*
++ * Drop one completion from 'fed'. Whoever drops the last one ends the
++ * original bio with the recorded io error, and frees 'fed'.
++ */
++static void fcache_chop_read_done(struct fcache_endio_data *fed)
++{
++	struct bio *orig_bio;
++
++	if (!atomic_dec_and_test(&fed->completions))
++		return;
++
++	orig_bio = fed->bio;
++	bio_endio(orig_bio, orig_bio->bi_size, fed->io_error);
++	mempool_free(fed, fed_pool);
++}
++
++/*
++ * End io handler for one piece of a chopped up read. Returns 1 while the
++ * bio still has bytes outstanding. Once it is fully done, a failure is noted
++ * in the shared endio data, the bio is put, and the completion is passed on
++ * to fcache_chop_read_done().
++ */
++static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err)
++{
++	struct fcache_endio_data *fed = bio->bi_private;
++
++	if (bio->bi_size)
++		return 1;
++
++	if (!bio_flagged(bio, BIO_UPTODATE))
++		fed->io_error = -EIO;
++
++	bio_put(bio);
++	fcache_chop_read_done(fed);
++
++	return 0;
++}
++
++/*
++ * Type of the 'done' function handed to fcache_io_chopper(); matches
++ * fcache_chop_read_done() and fcache_chop_write_done().
++ */
++typedef void (chopper_done_t) (struct fcache_endio_data *);
++
++/*
++ * This is our io chopper - it hacks a bio into smaller pieces, suitable
++ * for the target device. Caller supplies suitable end_io and done functions.
++ */
++static void fcache_io_chopper(struct fcache_dev *fdev,
++			      struct fcache_endio_data *fed,
++			      bio_end_io_t *endio, chopper_done_t *done, int rw)
++{
++	struct bio *bio = NULL;
++	struct bio_vec *bv;
++	unsigned int total_bytes;
++	sector_t sector;
++	int i, vecs;
++
++	/*
++	 * Make sure 'fed' doesn't disappear while we are still issuing
++	 * ios, the artificial reference is dropped at the end.
++	 */
++	atomic_set(&fed->completions, 1);
++
++	sector = fed->cache_sector;
++	total_bytes = fed->fs_size;
++	vecs = fed->bio->bi_vcnt;
++	bio_for_each_segment(bv, fed->bio, i) {
<<Diff was trimmed, longer than 597 lines>>