SOURCES: linux-dmcache.patch (NEW) - DM-Cache: A Generic Block-lev...

glen glen at pld-linux.org
Wed Nov 14 01:07:39 CET 2007


Author: glen                         Date: Wed Nov 14 00:07:39 2007 GMT
Module: SOURCES                       Tag: HEAD
---- Log message:
- DM-Cache: A Generic Block-level Disk Cache - http://www.acis.ufl.edu/~ming/dmcache/index.html
  saved http://www.acis.ufl.edu/~ming/dmcache/patch-2.6.21

---- Files affected:
SOURCES:
   linux-dmcache.patch (NONE -> 1.1)  (NEW)

---- Diffs:

================================================================
Index: SOURCES/linux-dmcache.patch
diff -u /dev/null SOURCES/linux-dmcache.patch:1.1
--- /dev/null	Wed Nov 14 01:07:39 2007
+++ SOURCES/linux-dmcache.patch	Wed Nov 14 01:07:34 2007
@@ -0,0 +1,1797 @@
+diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
+--- linux-2.6.21.7-orig/drivers/md/dm-cache.c	1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c	2007-08-23 14:10:58.000000000 -0400
+@@ -0,0 +1,1766 @@
++/****************************************************************************
++ *  dm-cache.c
++ *  Device mapper target for block-level disk caching
++ *
++ *  Copyright (C) International Business Machines Corp., 2006
++ *  Author: Ming Zhao (mingzhao at ufl.edu)
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; under version 2 of the License.
++ *
++ *  This program is distributed in the hope that it will be useful,
++ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *  GNU General Public License for more details.
++ *
++ *  You should have received a copy of the GNU General Public License
++ *  along with this program; if not, write to the Free Software
++ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ *
++ ****************************************************************************/
++
++#include <asm/atomic.h>
++#include <asm/checksum.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/blkdev.h>
++#include <linux/bio.h>
++#include <linux/slab.h>
++#include <linux/hash.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <linux/pagemap.h>
++
++#include "dm.h"
++#include "dm-io.h"
++#include "dm-bio-list.h"
++#include "kcopyd.h"
++
++#define DMC_DEBUG 0
++
++#define DM_MSG_PREFIX "cache"
++#define DMC_PREFIX "dm-cache: "
++
++#if DMC_DEBUG
++#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
++#else
++#define DPRINTK( s, arg... )
++#endif
++
++/* Default cache parameters */
++#define DEFAULT_CACHE_SIZE	65536
++#define DEFAULT_CACHE_ASSOC	1024
++#define DEFAULT_BLOCK_SIZE	8
++#define CONSECUTIVE_BLOCKS	512
++
++/* Write policy */
++#define WRITE_THROUGH 0
++#define WRITE_BACK 1
++#define DEFAULT_WRITE_POLICY WRITE_THROUGH
++
++/* Number of pages for I/O */
++#define DMCACHE_COPY_PAGES 1024
++
++/* States of a cache block */
++#define INVALID		0
++#define VALID		1	/* Valid */
++#define RESERVED	2	/* Allocated but data not in place yet */
++#define DIRTY		4	/* Locally modified */
++#define WRITEBACK	8	/* In the process of write back */
++
++#define is_state(x, y)		((x) & (y))
++#define set_state(x, y)		((x) |= (y))
++#define clear_state(x, y)	((x) &= ~(y))
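
Because the state field is a bitmask, several of these flags can be set on a
block at once, and is_state() tests whether any of the given bits are set. A
minimal sketch of a block's lifecycle under these macros (illustrative only;
the real transitions live in the lookup and write-back paths later in the
patch):

	/* Illustrative lifecycle of a cache block's state bitmask. */
	unsigned short state = INVALID;

	set_state(state, RESERVED);	/* allocated, fetch in flight */
	clear_state(state, RESERVED);
	set_state(state, VALID);	/* data in place on the cache device */

	set_state(state, DIRTY);	/* write-back policy: locally modified */
	set_state(state, WRITEBACK);	/* flush to the source device started */

	if (is_state(state, DIRTY | WRITEBACK))	/* true if either bit is set */
		clear_state(state, DIRTY | WRITEBACK);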
++
++/*
++ * Cache context
++ */
++struct cache_c {
++	struct dm_dev *src_dev;		/* Source device */
++	struct dm_dev *cache_dev;	/* Cache device */
++	struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
++
++	struct cacheblock *cache;	/* Hash table for cache blocks */
++	sector_t size;			/* Cache size */
++	unsigned int bits;		/* Cache size in bits */
++	unsigned int assoc;		/* Cache associativity */
++	unsigned int block_size;	/* Cache block size */
++	unsigned int block_shift;	/* Cache block size in bits */
++	unsigned int block_mask;	/* Cache block mask */
++	unsigned int consecutive_shift;	/* Consecutive blocks size in bits */
++	unsigned long counter;		/* Logical timestamp of last access */
++	unsigned int write_policy;	/* Cache write policy */
++	sector_t dirty_blocks;		/* Number of dirty blocks */
++
++	spinlock_t lock;		/* Lock to protect page allocation/deallocation */
++	struct page_list *pages;	/* Pages for I/O */
++	unsigned int nr_pages;		/* Number of pages */
++	unsigned int nr_free_pages;	/* Number of free pages */
++	wait_queue_head_t destroyq;	/* Wait queue for I/O completion */
++	atomic_t nr_jobs;		/* Number of I/O jobs */
++	/* Stats */
++	unsigned long reads;		/* Number of reads */
++	unsigned long writes;		/* Number of writes */
++	unsigned long cache_hits;	/* Number of cache hits */
++	unsigned long replace;		/* Number of cache replacements */
++	unsigned long writeback;	/* Number of replaced dirty blocks */
++	unsigned long dirty;		/* Number of submitted dirty blocks */
++};
++
++/* Cache block metadata structure */
++struct cacheblock {
++	spinlock_t lock;	/* Lock to protect operations on the bio list */
++	sector_t block;		/* Sector number of the cached block */
++	unsigned short state;	/* State of a block */
++	unsigned long counter;	/* Logical timestamp of the block's last access */
++	struct bio_list bios;	/* List of pending bios */
++};
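
struct cache_c above describes a set-associative cache: the table holds
dmc->size blocks grouped into sets of dmc->assoc entries. The actual lookup
code falls in the trimmed portion of this diff, so the following is only a
generic sketch of how such parameters can map a request onto a set, not the
patch's exact hash function:

	/* Illustrative set-associative mapping, NOT the patch's actual lookup:
	 * derive the request's cache-block number, pick a set by modulo, and
	 * return the index of that set's first slot in dmc->cache. */
	static sector_t example_set_start(struct cache_c *dmc, sector_t request)
	{
		sector_t block = request >> dmc->block_shift;
		sector_t num_sets = dmc->size / dmc->assoc; /* assumes size counts blocks */

		return (block % num_sets) * dmc->assoc;
	}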
++
++
++/****************************************************************************
++ *  Functions and data structures implementing kcached, which handles async
++ *  I/O. Code for page and queue handling is borrowed from kcopyd.c.
++ ****************************************************************************/
++
++/*
++ * Functions for handling pages used by async I/O.
++ * The data requested by a bio may not be aligned with cache blocks, in
++ * which case additional pages are required for the request forwarded to
++ * the source device. A pool of pages is reserved for this purpose.
++ */
++
++static struct page_list *alloc_pl(void)
++{
++	struct page_list *pl;
++
++	pl = kmalloc(sizeof(*pl), GFP_KERNEL);
++	if (!pl)
++		return NULL;
++
++	pl->page = alloc_page(GFP_KERNEL);
++	if (!pl->page) {
++		kfree(pl);
++		return NULL;
++	}
++
++	return pl;
++}
++
++static void free_pl(struct page_list *pl)
++{
++	__free_page(pl->page);
++	kfree(pl);
++}
++
++static void drop_pages(struct page_list *pl)
++{
++	struct page_list *next;
++
++	while (pl) {
++		next = pl->next;
++		free_pl(pl);
++		pl = next;
++	}
++}
++
++static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
++	                         struct page_list **pages)
++{
++	struct page_list *pl;
++
++	spin_lock(&dmc->lock);
++	if (dmc->nr_free_pages < nr) {
++		DPRINTK("kcached_get_pages: No free pages: %u<%u",
++		        dmc->nr_free_pages, nr);
++		spin_unlock(&dmc->lock);
++		return -ENOMEM;
++	}
++
++	dmc->nr_free_pages -= nr;
++	for (*pages = pl = dmc->pages; --nr; pl = pl->next)
++		;
++
++	dmc->pages = pl->next;
++	pl->next = NULL;
++
++	spin_unlock(&dmc->lock);
++
++	return 0;
++}
++
++static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
++{
++	struct page_list *cursor;
++
++	spin_lock(&dmc->lock);
++	for (cursor = pl; cursor->next; cursor = cursor->next)
++		dmc->nr_free_pages++;
++
++	dmc->nr_free_pages++;
++	cursor->next = dmc->pages;
++	dmc->pages = pl;
++
++	spin_unlock(&dmc->lock);
++}
++
++static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
++{
++	unsigned int i;
++	struct page_list *pl = NULL, *next;
++
++	for (i = 0; i < nr; i++) {
++		next = alloc_pl();
++		if (!next) {
++			if (pl)
++				drop_pages(pl);
++			return -ENOMEM;
++		}
++		next->next = pl;
++		pl = next;
++	}
++
++	kcached_put_pages(dmc, pl);
++	dmc->nr_pages += nr;
++
++	return 0;
++}
++
++static void free_bio_pages(struct cache_c *dmc)
++{
++	BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
++	drop_pages(dmc->pages);
++	dmc->pages = NULL;
++	dmc->nr_free_pages = dmc->nr_pages = 0;
++}
++
++/* Structure for a kcached job */
++struct kcached_job {
++	struct list_head list;
++	struct cache_c *dmc;
++	struct bio *bio;	/* Original bio */
++	struct io_region src;
++	struct io_region dest;
++	struct cacheblock *cacheblock;
++	int rw;
++	/*
++	 * When the original bio is not aligned with cache blocks,
++	 * we need extra bvecs and pages for padding.
++	 */
++	struct bio_vec *bvec;
++	unsigned int nr_pages;
++	struct page_list *pages;
++};
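
A hypothetical helper showing how the page reserve and a kcached job fit
together: padding pages are taken with kcached_get_pages() before the I/O is
issued and handed back with kcached_put_pages() afterwards; on -ENOMEM the
job would be parked on the _pages_jobs list declared just below and retried
(that worker logic falls in the trimmed portion of the diff):

	/* Hypothetical caller; example_prepare_pages() is not in the patch. */
	static int example_prepare_pages(struct cache_c *dmc, struct kcached_job *job)
	{
		if (job->nr_pages == 0)
			return 0;	/* bio aligned to a cache block: no padding */

		if (kcached_get_pages(dmc, job->nr_pages, &job->pages))
			return -ENOMEM;	/* reserve exhausted: retry from the worker */

		/* do_fetch()/do_store() then build head/tail bvecs from job->pages,
		 * and completion returns them via kcached_put_pages(). */
		return 0;
	}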
++
++static struct workqueue_struct *_kcached_wq;
++static struct work_struct _kcached_work;
++
++static inline void wake(void)
++{
++	queue_work(_kcached_wq, &_kcached_work);
++}
++
++#define MIN_JOBS 1024
++
++static struct kmem_cache *_job_cache;
++static mempool_t *_job_pool;
++
++static DEFINE_SPINLOCK(_job_lock);
++
++static LIST_HEAD(_complete_jobs);
++static LIST_HEAD(_io_jobs);
++static LIST_HEAD(_pages_jobs);
++
++static int jobs_init(void)
++{
++	_job_cache = kmem_cache_create("kcached-jobs",
++	                               sizeof(struct kcached_job),
++	                               __alignof__(struct kcached_job),
++	                               0, NULL, NULL);
++	if (!_job_cache)
++		return -ENOMEM;
++
++	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
++	                           mempool_free_slab, _job_cache);
++	if (!_job_pool) {
++		kmem_cache_destroy(_job_cache);
++		return -ENOMEM;
++	}
++
++	return 0;
++}
++
++static void jobs_exit(void)
++{
++	BUG_ON(!list_empty(&_complete_jobs));
++	BUG_ON(!list_empty(&_io_jobs));
++	BUG_ON(!list_empty(&_pages_jobs));
++
++	mempool_destroy(_job_pool);
++	kmem_cache_destroy(_job_cache);
++	_job_pool = NULL;
++	_job_cache = NULL;
++}
++
++/*
++ * Functions to pop a job from the head of a given job list, and to push a
++ * job onto its tail.
++ */
++static inline struct kcached_job *pop(struct list_head *jobs)
++{
++	struct kcached_job *job = NULL;
++	unsigned long flags;
++
++	spin_lock_irqsave(&_job_lock, flags);
++
++	if (!list_empty(jobs)) {
++		job = list_entry(jobs->next, struct kcached_job, list);
++		list_del(&job->list);
++	}
++	spin_unlock_irqrestore(&_job_lock, flags);
++
++	return job;
++}
++
++static inline void push(struct list_head *jobs, struct kcached_job *job)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&_job_lock, flags);
++	list_add_tail(&job->list, jobs);
++	spin_unlock_irqrestore(&_job_lock, flags);
++}
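
These lists form a three-stage pipeline: _pages_jobs holds jobs waiting for
reserved pages, _io_jobs holds jobs with device I/O to issue, and
_complete_jobs holds finished jobs awaiting cleanup. The work function bound
to _kcached_work falls in the trimmed portion of the diff; a sketch of the
kcopyd-style drain loop it presumably follows (the handler callback is a
placeholder):

	/* Sketch of a kcopyd-style drain loop. fn() returning nonzero means
	 * "not ready yet": the job is pushed back and the loop stops. */
	static int process_jobs(struct list_head *jobs,
				int (*fn)(struct kcached_job *))
	{
		struct kcached_job *job;
		int processed = 0;

		while ((job = pop(jobs))) {
			if (fn(job)) {
				push(jobs, job);
				break;
			}
			processed++;
		}
		return processed;
	}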
++
++
++/****************************************************************************
++ * Functions for asynchronously fetching data from the source device and
++ * storing data on the cache device. Because the requested data may not
++ * align with the
++ * cache blocks, extra handling is required to pad a block request and extract
++ * the requested data from the results.
++ ****************************************************************************/
++
++static void io_callback(unsigned long error, void *context)
++{
++	struct kcached_job *job = (struct kcached_job *) context;
++
++	if (error) {
++		/* TODO */
++		DMERR("io_callback: io error");
++		return;
++	}
++
++	if (job->rw == READ) {
++		job->rw = WRITE;
++		push(&_io_jobs, job);
++	} else
++		push(&_complete_jobs, job);
++	wake();
++}
++
++/*
++ * Fetch data from the source device asynchronously.
++ * For a READ bio, if a cache block is larger than the requested data, then
++ * additional data are prefetched. A larger cache block size enables more
++ * aggressive read prefetching, which is useful for read-mostly workloads.
++ * For a WRITE bio, if a cache block is larger than the requested data, the
++ * entire block needs to be fetched, so a larger block size incurs more
++ * overhead. In scenarios where writes are frequent, 4KB is a good cache
++ * block size.
++ */
++static int do_fetch(struct kcached_job *job)
++{
++	int r = 0, i, j;
++	struct bio *bio = job->bio;
++	struct cache_c *dmc = job->dmc;
++	unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
++	struct bio_vec *bvec;
++	struct page_list *pl;
++
++	offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
++	head = to_bytes(offset);
++	tail = to_bytes(dmc->block_size) - bio->bi_size - head;
++
++	DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
++	        bio->bi_sector, job->src.sector, job->dest.sector,
++	        job->src.count, head, tail);
++
++	if (bio_data_dir(bio) == READ) { /* The original request is a READ */
++		if (0 == job->nr_pages) { /* The request is aligned to cache block */
++			r = dm_io_async_bvec(1, &job->src, READ,
++			                     bio->bi_io_vec + bio->bi_idx,
++			                     io_callback, job);
++			return r;
++		}
++
++		nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
++		bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
++		if (!bvec) {
++			DMERR("do_fetch: No memory");
++			return 1;
++		}
++
++		pl = job->pages;
++		i = 0;
++		while (head) {
++			bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
++			bvec[i].bv_offset = 0;
++			bvec[i].bv_page = pl->page;
++			head -= bvec[i].bv_len;
++			pl = pl->next;
++			i++;
++		}
++
++		remaining = bio->bi_size;
++		j = bio->bi_idx;
++		while (remaining) {
++			bvec[i] = bio->bi_io_vec[j];
++			remaining -= bvec[i].bv_len;
++			i++; j++;
++		}
++
++		while (tail) {
++			bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
++			bvec[i].bv_offset = 0;
++			bvec[i].bv_page = pl->page;
++			tail -= bvec[i].bv_len;
++			pl = pl->next;
++			i++;
++		}
++
++		job->bvec = bvec;
++		r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
++		return r;
++	} else { /* The original request is a WRITE */
++		pl = job->pages;
++
++		if (head && tail) { /* Special case: padding needed on both sides */
++			bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
++			if (!bvec) {
++				DMERR("do_fetch: No memory");
++				return 1;
++			}
++			for (i=0; i<job->nr_pages; i++) {
++				bvec[i].bv_len = PAGE_SIZE;
++				bvec[i].bv_offset = 0;
++				bvec[i].bv_page = pl->page;
++				pl = pl->next;
++			}
++			job->bvec = bvec;
++			r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
++			                     io_callback, job);
++			return r;
++		}
++
++		bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
++				* sizeof(*bvec), GFP_KERNEL);
++		if (!bvec) {
++			DMERR("do_fetch: No memory");
++			return 1;
++		}
++
++		i = 0;
++		while (head) {
++			bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
++			bvec[i].bv_offset = 0;
++			bvec[i].bv_page = pl->page;
++			head -= bvec[i].bv_len;
++			pl = pl->next;
++			i++;
++		}
++
++		remaining = bio->bi_size;
++		j = bio->bi_idx;
++		while (remaining) {
++			bvec[i] = bio->bi_io_vec[j];
++			remaining -= bvec[i].bv_len;
++			i++; j++;
++		}
++
++		if (tail) {
++			idx = i;
++			bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
++			                    (PAGE_SIZE - 1);
++			bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
++			bvec[i].bv_page = pl->page;
++			tail -= bvec[i].bv_len;
++			pl = pl->next; i++;
++			while (tail) {
++				bvec[i].bv_len = PAGE_SIZE;
++				bvec[i].bv_offset = 0;
++				bvec[i].bv_page = pl->page;
++				tail -= bvec[i].bv_len;
++				pl = pl->next; i++;
++			}
++		}
++
++		job->bvec = bvec;
++		r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
++		                     io_callback, job);
++
++		return r;
++	}
++}
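
To make the head/tail arithmetic in do_fetch() concrete, consider the default
8-sector (4096-byte) cache block and a 2048-byte bio starting two sectors into
its block:

	/* Worked example of the padding math shared by do_fetch()/do_store():
	 *   bio->bi_sector = 10, block_mask = 7, bio->bi_size = 2048
	 *   offset = 10 & 7             = 2 sectors into the block
	 *   head   = to_bytes(2)        = 1024 bytes to fetch before the bio
	 *   tail   = 4096 - 2048 - 1024 = 1024 bytes to fetch after it
	 * One reserved page covers each span, so such a job would carry two
	 * padding pages. */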
++
++/*
++ * Store data to the cache device asynchronously.
++ * For a READ bio request, the data fetched from the source device are returned
++ * to the kernel and stored in the cache at the same time.
++ * For a WRITE bio request, the data are written to the cache and source device
++ * at the same time.
++ */
++static int do_store(struct kcached_job *job)
++{
++	int i, j, r = 0;
++	struct bio *bio = job->bio, *clone;
++	struct cache_c *dmc = job->dmc;
++	unsigned int offset, head, tail, remaining, nr_vecs;
++	struct bio_vec *bvec;
++
++	offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
++	head = to_bytes(offset);
++	tail = to_bytes(dmc->block_size) - bio->bi_size - head;
++
++	DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
++	        bio->bi_sector, job->src.sector, job->dest.sector,
++	        job->src.count, head, tail);
++
++	/* A READ is acknowledged as soon as the requested data are fetched, and
++	   does not have to wait for them to be stored in the cache. The bio is
++	   cloned so that the original one can be ended here. To avoid copying
++	   pages, we reuse those allocated for the original bio and take a
++	   reference on each of them, so they are not freed before the cache
++	   insertion completes.
++	 */
++	if (bio_data_dir(bio) == READ) {
++		clone = bio_clone(bio, GFP_NOIO);
++		for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
++			get_page(bio->bi_io_vec[i].bv_page);
++		}
++		DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
++		bio_endio(bio, bio->bi_size, 0);
++		bio = clone;
++		job->bio = clone;
++	}
++
++	if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
++		r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
++		                     io_callback, job);
++	else {
++		if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
++			DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
++			nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
++			if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
++			DPRINTK("Create %u new vecs", nr_vecs);
++			bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
++			if (!bvec) {
++				DMERR("do_store: No memory");
++				return 1;
++			}
++
++			i = 0;
++			while (head) {
++				bvec[i].bv_len = min(head, job->bvec[i].bv_len);
++				bvec[i].bv_offset = 0;
++				bvec[i].bv_page = job->bvec[i].bv_page;
++				head -= bvec[i].bv_len;
++				i++;
++			}
++			remaining = bio->bi_size;
++			j = bio->bi_idx;
++			while (remaining) {
++				bvec[i] = bio->bi_io_vec[j];
++				remaining -= bvec[i].bv_len;
++				i++; j++;
++			}
++			j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
++			bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
++			                    j * PAGE_SIZE;
++			bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
++			bvec[i].bv_page = job->bvec[j].bv_page;
++			tail -= bvec[i].bv_len;
++			i++; j++;
++			while (tail) {
++				bvec[i] = job->bvec[j];
++				tail -= bvec[i].bv_len;
++				i++; j++;
++			}
++			kfree(job->bvec);
++			job->bvec = bvec;
<<Diff was trimmed, longer than 597 lines>>

