SOURCES (LINUX_2_6_16): linux-dmcache.patch - http://www.acis.ufl....

glen glen at pld-linux.org
Wed Nov 14 01:08:44 CET 2007


Author: glen                         Date: Wed Nov 14 00:08:44 2007 GMT
Module: SOURCES                       Tag: LINUX_2_6_16
---- Log message:
- http://www.acis.ufl.edu/~ming/dmcache/patch-2.6.19.1

---- Files affected:
SOURCES:
   linux-dmcache.patch (1.1 -> 1.1.2.1) 

---- Diffs:

================================================================
Index: SOURCES/linux-dmcache.patch
diff -u SOURCES/linux-dmcache.patch:1.1 SOURCES/linux-dmcache.patch:1.1.2.1
--- SOURCES/linux-dmcache.patch:1.1	Wed Nov 14 01:07:34 2007
+++ SOURCES/linux-dmcache.patch	Wed Nov 14 01:08:39 2007
@@ -1,1797 +1,1786 @@
-diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
---- linux-2.6.21.7-orig/drivers/md/dm-cache.c	1969-12-31 19:00:00.000000000 -0500
-+++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c	2007-08-23 14:10:58.000000000 -0400
-@@ -0,0 +1,1766 @@
-+/****************************************************************************
-+ *  dm-cache.c
-+ *  Device mapper target for block-level disk caching
-+ *
-+ *  Copyright (C) International Business Machines Corp., 2006
-+ *  Author: Ming Zhao (mingzhao at ufl.edu)
-+ *
-+ *  This program is free software; you can redistribute it and/or modify
-+ *  it under the terms of the GNU General Public License as published by
-+ *  the Free Software Foundation; under version 2 of the License.
-+ *
-+ *  This program is distributed in the hope that it will be useful,
-+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ *  GNU General Public License for more details.
-+ *
-+ *  You should have received a copy of the GNU General Public License
-+ *  along with this program; if not, write to the Free Software
-+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-+ *
-+ ****************************************************************************/
-+
-+#include <asm/atomic.h>
-+#include <asm/checksum.h>
-+#include <linux/module.h>
-+#include <linux/init.h>
-+#include <linux/list.h>
-+#include <linux/blkdev.h>
-+#include <linux/bio.h>
-+#include <linux/slab.h>
-+#include <linux/hash.h>
-+#include <linux/spinlock.h>
-+#include <linux/workqueue.h>
-+#include <linux/pagemap.h>
-+
-+#include "dm.h"
-+#include "dm-io.h"
-+#include "dm-bio-list.h"
-+#include "kcopyd.h"
-+
-+#define DMC_DEBUG 0
-+
-+#define DM_MSG_PREFIX "cache"
-+#define DMC_PREFIX "dm-cache: "
-+
-+#if DMC_DEBUG
-+#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
-+#else
-+#define DPRINTK( s, arg... )
-+#endif
-+
-+/* Default cache parameters */
-+#define DEFAULT_CACHE_SIZE	65536
-+#define DEFAULT_CACHE_ASSOC	1024
-+#define DEFAULT_BLOCK_SIZE	8
-+#define CONSECUTIVE_BLOCKS	512
-+
-+/* Write policy */
-+#define WRITE_THROUGH 0
-+#define WRITE_BACK 1
-+#define DEFAULT_WRITE_POLICY WRITE_THROUGH
-+
-+/* Number of pages for I/O */
-+#define DMCACHE_COPY_PAGES 1024
-+
-+/* States of a cache block */
-+#define INVALID		0
-+#define VALID		1	/* Valid */
-+#define RESERVED	2	/* Allocated but data not in place yet */
-+#define DIRTY		4	/* Locally modified */
-+#define WRITEBACK	8	/* In the process of write back */
-+
-+#define is_state(x, y)		(x & y)
-+#define set_state(x, y)		(x |= y)
-+#define clear_state(x, y)	(x &= ~y)
-+
-+/*
-+ * Cache context
-+ */
-+struct cache_c {
-+	struct dm_dev *src_dev;		/* Source device */
-+	struct dm_dev *cache_dev;	/* Cache device */
-+	struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
-+
-+	struct cacheblock *cache;	/* Hash table for cache blocks */
-+	sector_t size;			/* Cache size */
-+	unsigned int bits;		/* Cache size in bits */
-+	unsigned int assoc;		/* Cache associativity */
-+	unsigned int block_size;	/* Cache block size */
-+	unsigned int block_shift;	/* Cache block size in bits */
-+	unsigned int block_mask;	/* Cache block mask */
-+	unsigned int consecutive_shift;	/* Consecutive blocks size in bits */
-+	unsigned long counter;		/* Logical timestamp of last access */
-+	unsigned int write_policy;	/* Cache write policy */
-+	sector_t dirty_blocks;		/* Number of dirty blocks */
-+
-+	spinlock_t lock;		/* Lock to protect page allocation/deallocation */
-+	struct page_list *pages;	/* Pages for I/O */
-+	unsigned int nr_pages;		/* Number of pages */
-+	unsigned int nr_free_pages;	/* Number of free pages */
-+	wait_queue_head_t destroyq;	/* Wait queue for I/O completion */
-+	atomic_t nr_jobs;		/* Number of I/O jobs */
-+	/* Stats */
-+	unsigned long reads;		/* Number of reads */
-+	unsigned long writes;		/* Number of writes */
-+	unsigned long cache_hits;	/* Number of cache hits */
-+	unsigned long replace;		/* Number of cache replacements */
-+	unsigned long writeback;	/* Number of replaced dirty blocks */
-+	unsigned long dirty;		/* Number of submitted dirty blocks */
-+};
-+
-+/* Cache block metadata structure */
-+struct cacheblock {
-+	spinlock_t lock;	/* Lock to protect operations on the bio list */
-+	sector_t block;		/* Sector number of the cached block */
-+	unsigned short state;	/* State of a block */
-+	unsigned long counter;	/* Logical timestamp of the block's last access */
-+	struct bio_list bios;	/* List of pending bios */
-+};
-+
-+
-+/****************************************************************************
-+ *  Functions and data structures for implementing a kcached to handle async
-+ *  I/O. Code for page and queue handling is borrowed from kcopyd.c.
-+ ****************************************************************************/
-+
-+/*
-+ * Functions for handling pages used by async I/O.
-+ * The data requested by a bio may not be aligned with cache blocks, in
-+ * which case additional pages are required for the request that is forwarded
-+ * to the server. A pool of pages is reserved for this purpose.
-+ */
-+
-+static struct page_list *alloc_pl(void)
-+{
-+	struct page_list *pl;
-+
-+	pl = kmalloc(sizeof(*pl), GFP_KERNEL);
-+	if (!pl)
-+		return NULL;
-+
-+	pl->page = alloc_page(GFP_KERNEL);
-+	if (!pl->page) {
-+		kfree(pl);
-+		return NULL;
-+	}
-+
-+	return pl;
-+}
-+
-+static void free_pl(struct page_list *pl)
-+{
-+	__free_page(pl->page);
-+	kfree(pl);
-+}
-+
-+static void drop_pages(struct page_list *pl)
-+{
-+	struct page_list *next;
-+
-+	while (pl) {
-+		next = pl->next;
-+		free_pl(pl);
-+		pl = next;
-+	}
-+}
-+
-+static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
-+	                         struct page_list **pages)
-+{
-+	struct page_list *pl;
-+
-+	spin_lock(&dmc->lock);
-+	if (dmc->nr_free_pages < nr) {
-+		DPRINTK("kcached_get_pages: No free pages: %u<%u",
-+		        dmc->nr_free_pages, nr);
-+		spin_unlock(&dmc->lock);
-+		return -ENOMEM;
-+	}
-+
-+	dmc->nr_free_pages -= nr;
-+	for (*pages = pl = dmc->pages; --nr; pl = pl->next)
-+		;
-+
-+	dmc->pages = pl->next;
-+	pl->next = NULL;
-+
-+	spin_unlock(&dmc->lock);
-+
-+	return 0;
-+}
-+
-+static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
-+{
-+	struct page_list *cursor;
-+
-+	spin_lock(&dmc->lock);
-+	for (cursor = pl; cursor->next; cursor = cursor->next)
-+		dmc->nr_free_pages++;
-+
-+	dmc->nr_free_pages++;
-+	cursor->next = dmc->pages;
-+	dmc->pages = pl;
-+
-+	spin_unlock(&dmc->lock);
-+}
-+
-+static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
-+{
-+	unsigned int i;
-+	struct page_list *pl = NULL, *next;
-+
-+	for (i = 0; i < nr; i++) {
-+		next = alloc_pl();
-+		if (!next) {
-+			if (pl)
-+				drop_pages(pl);
-+			return -ENOMEM;
-+		}
-+		next->next = pl;
-+		pl = next;
-+	}
-+
-+	kcached_put_pages(dmc, pl);
-+	dmc->nr_pages += nr;
-+
-+	return 0;
-+}
-+
-+static void free_bio_pages(struct cache_c *dmc)
-+{
-+	BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
-+	drop_pages(dmc->pages);
-+	dmc->pages = NULL;
-+	dmc->nr_free_pages = dmc->nr_pages = 0;
-+}
-+
-+/* Structure for a kcached job */
-+struct kcached_job {
-+	struct list_head list;
-+	struct cache_c *dmc;
-+	struct bio *bio;	/* Original bio */
-+	struct io_region src;
-+	struct io_region dest;
-+	struct cacheblock *cacheblock;
-+	int rw;
-+	/*
-+	 * When the original bio is not aligned with cache blocks,
-+	 * we need extra bvecs and pages for padding.
-+	 */
-+	struct bio_vec *bvec;
-+	unsigned int nr_pages;
-+	struct page_list *pages;
-+};
-+
-+static struct workqueue_struct *_kcached_wq;
-+static struct work_struct _kcached_work;
-+
-+static inline void wake(void)
-+{
-+	queue_work(_kcached_wq, &_kcached_work);
-+}
-+
-+#define MIN_JOBS 1024
-+
-+static struct kmem_cache *_job_cache;
-+static mempool_t *_job_pool;
-+
-+static DEFINE_SPINLOCK(_job_lock);
-+
-+static LIST_HEAD(_complete_jobs);
-+static LIST_HEAD(_io_jobs);
-+static LIST_HEAD(_pages_jobs);
-+
-+static int jobs_init(void)
-+{
-+	_job_cache = kmem_cache_create("kcached-jobs",
-+	                               sizeof(struct kcached_job),
-+	                               __alignof__(struct kcached_job),
-+	                               0, NULL, NULL);
-+	if (!_job_cache)
-+		return -ENOMEM;
-+
-+	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
-+	                           mempool_free_slab, _job_cache);
-+	if (!_job_pool) {
-+		kmem_cache_destroy(_job_cache);
-+		return -ENOMEM;
-+	}
-+
-+	return 0;
-+}
-+
-+static void jobs_exit(void)
-+{
-+	BUG_ON(!list_empty(&_complete_jobs));
-+	BUG_ON(!list_empty(&_io_jobs));
-+	BUG_ON(!list_empty(&_pages_jobs));
-+
-+	mempool_destroy(_job_pool);
-+	kmem_cache_destroy(_job_cache);
-+	_job_pool = NULL;
-+	_job_cache = NULL;
-+}
-+
-+/*
-+ * Functions to pop a job from the head and push a job onto the tail of a job list.
-+ */
-+static inline struct kcached_job *pop(struct list_head *jobs)
-+{
-+	struct kcached_job *job = NULL;
-+	unsigned long flags;
-+
-+	spin_lock_irqsave(&_job_lock, flags);
-+
-+	if (!list_empty(jobs)) {
-+		job = list_entry(jobs->next, struct kcached_job, list);
-+		list_del(&job->list);
-+	}
-+	spin_unlock_irqrestore(&_job_lock, flags);
-+
-+	return job;
-+}
-+
-+static inline void push(struct list_head *jobs, struct kcached_job *job)
-+{
-+	unsigned long flags;
-+
-+	spin_lock_irqsave(&_job_lock, flags);
-+	list_add_tail(&job->list, jobs);
-+	spin_unlock_irqrestore(&_job_lock, flags);
-+}
-+
-+
-+/****************************************************************************
-+ * Functions for asynchronously fetching data from source device and storing
-+ * data in cache device. Because the requested data may not align with the
-+ * cache blocks, extra handling is required to pad a block request and extract
-+ * the requested data from the results.
-+ ****************************************************************************/
-+
-+static void io_callback(unsigned long error, void *context)
-+{
-+	struct kcached_job *job = (struct kcached_job *) context;
-+
-+	if (error) {
-+		/* TODO */
-+		DMERR("io_callback: io error");
-+		return;
-+	}
-+
-+	if (job->rw == READ) {
-+		job->rw = WRITE;
-+		push(&_io_jobs, job);
-+	} else
-+		push(&_complete_jobs, job);
-+	wake();
-+}
-+
-+/*
-+ * Fetch data from the source device asynchronously.
-+ * For a READ bio, if a cache block is larger than the requested data, then
-+ * additional data are prefetched. Larger cache block size enables more
-+ * aggressive read prefetching, which is useful for read-mostly usage.
-+ * For a WRITE bio, if a cache block is larger than the requested data, the
-+ * entire block needs to be fetched, and larger block size incurs more overhead.
-+ * In scenarios where writes are frequent, 4KB is a good cache block size.
-+ */
-+static int do_fetch(struct kcached_job *job)
-+{
-+	int r = 0, i, j;
-+	struct bio *bio = job->bio;
-+	struct cache_c *dmc = job->dmc;
-+	unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
-+	struct bio_vec *bvec;
-+	struct page_list *pl;
-+
-+	offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
-+	head = to_bytes(offset);
-+	tail = to_bytes(dmc->block_size) - bio->bi_size - head;
-+
-+	DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
-+	        bio->bi_sector, job->src.sector, job->dest.sector,
-+	        job->src.count, head, tail);
-+
-+	if (bio_data_dir(bio) == READ) { /* The original request is a READ */
-+		if (0 == job->nr_pages) { /* The request is aligned to cache block */
-+			r = dm_io_async_bvec(1, &job->src, READ,
-+			                     bio->bi_io_vec + bio->bi_idx,
-+			                     io_callback, job);
-+			return r;
-+		}
-+
-+		nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
-+		bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
-+		if (!bvec) {
-+			DMERR("do_fetch: No memory");
-+			return 1;
-+		}
-+
-+		pl = job->pages;
-+		i = 0;
-+		while (head) {
-+			bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
-+			bvec[i].bv_offset = 0;
-+			bvec[i].bv_page = pl->page;
-+			head -= bvec[i].bv_len;
-+			pl = pl->next;
-+			i++;
-+		}
-+
-+		remaining = bio->bi_size;
-+		j = bio->bi_idx;
-+		while (remaining) {
-+			bvec[i] = bio->bi_io_vec[j];
-+			remaining -= bvec[i].bv_len;
-+			i++; j++;
-+		}
-+
-+		while (tail) {
-+			bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
-+			bvec[i].bv_offset = 0;
-+			bvec[i].bv_page = pl->page;
-+			tail -= bvec[i].bv_len;
-+			pl = pl->next;
-+			i++;
-+		}
-+
-+		job->bvec = bvec;
-+		r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
-+		return r;
-+	} else { /* The original request is a WRITE */
-+		pl = job->pages;
-+
-+		if (head && tail) { /* Special case */
-+			bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
-+			if (!bvec) {
-+				DMERR("do_fetch: No memory");
-+				return 1;
-+			}
-+			for (i=0; i<job->nr_pages; i++) {
-+				bvec[i].bv_len = PAGE_SIZE;
-+				bvec[i].bv_offset = 0;
-+				bvec[i].bv_page = pl->page;
-+				pl = pl->next;
-+			}
-+			job->bvec = bvec;
-+			r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
-+			                     io_callback, job);
-+			return r;
-+		}
-+
-+		bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
-+				* sizeof(*bvec), GFP_KERNEL);
-+		if (!bvec) {
-+			DMERR("do_fetch: No memory");
-+			return 1;
-+		}
-+
-+		i = 0;
-+		while (head) {
-+			bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
-+			bvec[i].bv_offset = 0;
-+			bvec[i].bv_page = pl->page;
-+			head -= bvec[i].bv_len;
-+			pl = pl->next;
-+			i++;
-+		}
-+
-+		remaining = bio->bi_size;
-+		j = bio->bi_idx;
-+		while (remaining) {
-+			bvec[i] = bio->bi_io_vec[j];
-+			remaining -= bvec[i].bv_len;
-+			i++; j++;
-+		}
-+
-+		if (tail) {
-+			idx = i;
-+			bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
-+			                    (PAGE_SIZE - 1);
-+			bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
-+			bvec[i].bv_page = pl->page;
-+			tail -= bvec[i].bv_len;
-+			pl = pl->next; i++;
-+			while (tail) {
-+				bvec[i].bv_len = PAGE_SIZE;
-+				bvec[i].bv_offset = 0;
-+				bvec[i].bv_page = pl->page;
-+				tail -= bvec[i].bv_len;
-+				pl = pl->next; i++;
-+			}
-+		}
-+
-+		job->bvec = bvec;
-+		r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
-+		                     io_callback, job);
-+
-+		return r;
-+	}
-+}
-+
-+/*
-+ * Store data to the cache source device asynchronously.
-+ * For a READ bio request, the data fetched from the source device are returned
-+ * to kernel and stored in cache at the same time.
-+ * For a WRITE bio request, the data are written to the cache and source device
-+ * at the same time.
-+ */
-+static int do_store(struct kcached_job *job)
-+{
-+	int i, j, r = 0;
-+	struct bio *bio = job->bio, *clone;
-+	struct cache_c *dmc = job->dmc;
-+	unsigned int offset, head, tail, remaining, nr_vecs;
-+	struct bio_vec *bvec;
-+
-+	offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
-+	head = to_bytes(offset);
-+	tail = to_bytes(dmc->block_size) - bio->bi_size - head;
-+
-+	DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
-+	        bio->bi_sector, job->src.sector, job->dest.sector,
-+	        job->src.count, head, tail);
-+
-+	/* A READ is acknowledged as soon as the requested data is fetched, and
-+	   does not have to wait for it being stored in cache. The bio is cloned
-+	   so that the original one can be ended here. But to avoid copying
-+	   pages, we reuse the pages allocated for the original bio, and mark
-+	   each of them to prevent the pages being freed before the cache
-+	   insertion is completed.
-+	 */
-+	if (bio_data_dir(bio) == READ) {
-+		clone = bio_clone(bio, GFP_NOIO);
-+		for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
-+			get_page(bio->bi_io_vec[i].bv_page);
-+		}
-+		DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
-+		bio_endio(bio, bio->bi_size, 0);
-+		bio = clone;
-+		job->bio = clone;
-+	}
-+
-+	if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
-+		r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
-+		                     io_callback, job);
-+	else {
-+		if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
-+			DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
-+			nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
-+			if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
-+			DPRINTK("Create %u new vecs", nr_vecs);
-+			bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
-+			if (!bvec) {
-+				DMERR("do_store: No memory");
-+				return 1;
-+			}
-+
-+			i = 0;
-+			while (head) {
-+				bvec[i].bv_len = min(head, job->bvec[i].bv_len);
-+				bvec[i].bv_offset = 0;
-+				bvec[i].bv_page = job->bvec[i].bv_page;
-+				head -= bvec[i].bv_len;
-+				i++;
-+			}
-+			remaining = bio->bi_size;
-+			j = bio->bi_idx;
-+			while (remaining) {
-+				bvec[i] = bio->bi_io_vec[j];
-+				remaining -= bvec[i].bv_len;
-+				i++; j++;
-+			}
-+			j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
-+			bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
-+			                    j * PAGE_SIZE;
-+			bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
-+			bvec[i].bv_page = job->bvec[j].bv_page;
-+			tail -= bvec[i].bv_len;
-+			i++; j++;
-+			while (tail) {
-+				bvec[i] = job->bvec[j];
-+				tail -= bvec[i].bv_len;
-+				i++; j++;
-+			}
-+			kfree(job->bvec);
-+			job->bvec = bvec;
<<Diff was trimmed, longer than 597 lines>>

---- CVS-web:
    http://cvs.pld-linux.org/cgi-bin/cvsweb.cgi/SOURCES/linux-dmcache.patch?r1=1.1&r2=1.1.2.1&f=u



More information about the pld-cvs-commit mailing list