SOURCES: linux-dmcache.patch (NEW) - DM-Cache: A Generic Block-lev...
glen
glen at pld-linux.org
Wed Nov 14 01:07:39 CET 2007
Author: glen Date: Wed Nov 14 00:07:39 2007 GMT
Module: SOURCES Tag: HEAD
---- Log message:
- DM-Cache: A Generic Block-level Disk Cache - http://www.acis.ufl.edu/~ming/dmcache/index.html
saved http://www.acis.ufl.edu/~ming/dmcache/patch-2.6.21
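
A minimal usage sketch (not part of the commit; the optional parameters and
their order -- <block size> <cache size> <associativity> <write policy> -- are
an assumption based on the defaults in the patch, and should be verified
against the target's constructor in the full diff):

  SRC=/dev/sdb1; CACHE=/dev/sdc1
  # 8-sector (4KB) blocks, 65536 blocks, 1024-way, write policy 0 (write-through)
  echo "0 `blockdev --getsize $SRC` cache $SRC $CACHE 8 65536 1024 0" \
    | dmsetup create cached-disk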
---- Files affected:
SOURCES:
linux-dmcache.patch (NONE -> 1.1) (NEW)
---- Diffs:
================================================================
Index: SOURCES/linux-dmcache.patch
diff -u /dev/null SOURCES/linux-dmcache.patch:1.1
--- /dev/null Wed Nov 14 01:07:39 2007
+++ SOURCES/linux-dmcache.patch Wed Nov 14 01:07:34 2007
@@ -0,0 +1,1797 @@
+diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
+--- linux-2.6.21.7-orig/drivers/md/dm-cache.c 1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c 2007-08-23 14:10:58.000000000 -0400
+@@ -0,0 +1,1766 @@
++/****************************************************************************
++ * dm-cache.c
++ * Device mapper target for block-level disk caching
++ *
++ * Copyright (C) International Business Machines Corp., 2006
++ * Author: Ming Zhao (mingzhao at ufl.edu)
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; under version 2 of the License.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ *
++ ****************************************************************************/
++
++#include <asm/atomic.h>
++#include <asm/checksum.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/blkdev.h>
++#include <linux/bio.h>
++#include <linux/slab.h>
++#include <linux/hash.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <linux/pagemap.h>
++
++#include "dm.h"
++#include "dm-io.h"
++#include "dm-bio-list.h"
++#include "kcopyd.h"
++
++#define DMC_DEBUG 0
++
++#define DM_MSG_PREFIX "cache"
++#define DMC_PREFIX "dm-cache: "
++
++#if DMC_DEBUG
++#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
++#else
++#define DPRINTK( s, arg... )
++#endif
++
++/* Default cache parameters */
++#define DEFAULT_CACHE_SIZE 65536
++#define DEFAULT_CACHE_ASSOC 1024
++#define DEFAULT_BLOCK_SIZE 8
++#define CONSECUTIVE_BLOCKS 512
++
++/* Write policy */
++#define WRITE_THROUGH 0
++#define WRITE_BACK 1
++#define DEFAULT_WRITE_POLICY WRITE_THROUGH
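++
++/* With WRITE_THROUGH, a write is sent to the cache and the source device at
++ * the same time (see do_store below). With WRITE_BACK, the block is only
++ * marked DIRTY in the cache, and the source copy is updated when the dirty
++ * block is later written back or replaced. */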
++
++/* Number of pages for I/O */
++#define DMCACHE_COPY_PAGES 1024
++
++/* States of a cache block */
++#define INVALID 0
++#define VALID 1 /* Valid */
++#define RESERVED 2 /* Allocated but data not in place yet */
++#define DIRTY 4 /* Locally modified */
++#define WRITEBACK 8 /* In the process of write back */
++
++#define is_state(x, y)		((x) & (y))
++#define set_state(x, y)		((x) |= (y))
++#define clear_state(x, y)	((x) &= ~(y))
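++
++/* Illustrative (cb is a hypothetical struct cacheblock pointer): the states
++ * are bit flags, so a dirty block being written back holds two flags at once:
++ *   set_state(cb->state, WRITEBACK);   while is_state(cb->state, DIRTY) != 0
++ *   clear_state(cb->state, DIRTY | WRITEBACK);   once the copy completes
++ */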
++
++/*
++ * Cache context
++ */
++struct cache_c {
++ struct dm_dev *src_dev; /* Source device */
++ struct dm_dev *cache_dev; /* Cache device */
++ struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
++
++ struct cacheblock *cache; /* Hash table for cache blocks */
++ sector_t size; /* Cache size */
++ unsigned int bits; /* Cache size in bits */
++ unsigned int assoc; /* Cache associativity */
++ unsigned int block_size; /* Cache block size */
++ unsigned int block_shift; /* Cache block size in bits */
++ unsigned int block_mask; /* Cache block mask */
++ unsigned int consecutive_shift; /* Consecutive blocks size in bits */
++ unsigned long counter; /* Logical timestamp of last access */
++ unsigned int write_policy; /* Cache write policy */
++ sector_t dirty_blocks; /* Number of dirty blocks */
++
++ spinlock_t lock; /* Lock to protect page allocation/deallocation */
++ struct page_list *pages; /* Pages for I/O */
++ unsigned int nr_pages; /* Number of pages */
++ unsigned int nr_free_pages; /* Number of free pages */
++ wait_queue_head_t destroyq; /* Wait queue for I/O completion */
++ atomic_t nr_jobs; /* Number of I/O jobs */
++ /* Stats */
++ unsigned long reads; /* Number of reads */
++ unsigned long writes; /* Number of writes */
++ unsigned long cache_hits; /* Number of cache hits */
++ unsigned long replace; /* Number of cache replacements */
++ unsigned long writeback; /* Number of replaced dirty blocks */
++ unsigned long dirty; /* Number of submitted dirty blocks */
++};
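++
++/* Taken together, 'size', 'assoc', and the per-block 'counter' suggest a
++ * set-associative cache with LRU-style replacement within each set; the
++ * lookup and replacement code falls in the trimmed part of this diff. */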
++
++/* Cache block metadata structure */
++struct cacheblock {
++ spinlock_t lock; /* Lock to protect operations on the bio list */
++ sector_t block; /* Sector number of the cached block */
++ unsigned short state; /* State of a block */
++ unsigned long counter; /* Logical timestamp of the block's last access */
++ struct bio_list bios; /* List of pending bios */
++};
++
++
++/****************************************************************************
++ * Functions and data structures implementing kcached, a worker that handles
++ * async I/O. The page and queue handling code is borrowed from kcopyd.c.
++ ****************************************************************************/
++
++/*
++ * Functions for handling pages used by async I/O.
++ * The data requested by a bio may not be aligned with cache blocks, in
++ * which case additional pages are required to pad the request that is
++ * forwarded to the source device. A pool of pages is reserved for this purpose.
++ */
++
++static struct page_list *alloc_pl(void)
++{
++ struct page_list *pl;
++
++ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
++ if (!pl)
++ return NULL;
++
++ pl->page = alloc_page(GFP_KERNEL);
++ if (!pl->page) {
++ kfree(pl);
++ return NULL;
++ }
++
++ return pl;
++}
++
++static void free_pl(struct page_list *pl)
++{
++ __free_page(pl->page);
++ kfree(pl);
++}
++
++static void drop_pages(struct page_list *pl)
++{
++ struct page_list *next;
++
++ while (pl) {
++ next = pl->next;
++ free_pl(pl);
++ pl = next;
++ }
++}
++
++static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
++ struct page_list **pages)
++{
++ struct page_list *pl;
++
++ spin_lock(&dmc->lock);
++ if (dmc->nr_free_pages < nr) {
++ DPRINTK("kcached_get_pages: No free pages: %u<%u",
++ dmc->nr_free_pages, nr);
++ spin_unlock(&dmc->lock);
++ return -ENOMEM;
++ }
++
++ dmc->nr_free_pages -= nr;
++ for (*pages = pl = dmc->pages; --nr; pl = pl->next)
++ ;
++
++ dmc->pages = pl->next;
++ pl->next = NULL;
++
++ spin_unlock(&dmc->lock);
++
++ return 0;
++}
++
++static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
++{
++ struct page_list *cursor;
++
++ spin_lock(&dmc->lock);
++ for (cursor = pl; cursor->next; cursor = cursor->next)
++ dmc->nr_free_pages++;
++
++ dmc->nr_free_pages++;
++ cursor->next = dmc->pages;
++ dmc->pages = pl;
++
++ spin_unlock(&dmc->lock);
++}
++
++static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
++{
++ unsigned int i;
++ struct page_list *pl = NULL, *next;
++
++ for (i = 0; i < nr; i++) {
++ next = alloc_pl();
++ if (!next) {
++ if (pl)
++ drop_pages(pl);
++ return -ENOMEM;
++ }
++ next->next = pl;
++ pl = next;
++ }
++
++ kcached_put_pages(dmc, pl);
++ dmc->nr_pages += nr;
++
++ return 0;
++}
++
++static void free_bio_pages(struct cache_c *dmc)
++{
++ BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
++ drop_pages(dmc->pages);
++ dmc->pages = NULL;
++ dmc->nr_free_pages = dmc->nr_pages = 0;
++}
++
++/* Structure for a kcached job */
++struct kcached_job {
++ struct list_head list;
++ struct cache_c *dmc;
++ struct bio *bio; /* Original bio */
++ struct io_region src;
++ struct io_region dest;
++ struct cacheblock *cacheblock;
++ int rw;
++ /*
++ * When the original bio is not aligned with cache blocks,
++ * we need extra bvecs and pages for padding.
++ */
++ struct bio_vec *bvec;
++ unsigned int nr_pages;
++ struct page_list *pages;
++};
++
++static struct workqueue_struct *_kcached_wq;
++static struct work_struct _kcached_work;
++
++static inline void wake(void)
++{
++ queue_work(_kcached_wq, &_kcached_work);
++}
++
++#define MIN_JOBS 1024
++
++static struct kmem_cache *_job_cache;
++static mempool_t *_job_pool;
++
++static DEFINE_SPINLOCK(_job_lock);
++
++static LIST_HEAD(_complete_jobs);
++static LIST_HEAD(_io_jobs);
++static LIST_HEAD(_pages_jobs);
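++
++/*
++ * Presumed job flow, based on io_callback() below: a job that needs pad
++ * pages waits on _pages_jobs, performs its READ from the source device via
++ * _io_jobs, is re-queued on _io_jobs as a WRITE to the cache, and finally
++ * moves to _complete_jobs for cleanup.
++ */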
++
++static int jobs_init(void)
++{
++ _job_cache = kmem_cache_create("kcached-jobs",
++ sizeof(struct kcached_job),
++ __alignof__(struct kcached_job),
++ 0, NULL, NULL);
++ if (!_job_cache)
++ return -ENOMEM;
++
++ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
++ mempool_free_slab, _job_cache);
++ if (!_job_pool) {
++ kmem_cache_destroy(_job_cache);
++ return -ENOMEM;
++ }
++
++ return 0;
++}
++
++static void jobs_exit(void)
++{
++ BUG_ON(!list_empty(&_complete_jobs));
++ BUG_ON(!list_empty(&_io_jobs));
++ BUG_ON(!list_empty(&_pages_jobs));
++
++ mempool_destroy(_job_pool);
++ kmem_cache_destroy(_job_cache);
++ _job_pool = NULL;
++ _job_cache = NULL;
++}
++
++/*
++ * Functions to push a job onto the tail and pop a job off the head of a
++ * given job list.
++ */
++static inline struct kcached_job *pop(struct list_head *jobs)
++{
++ struct kcached_job *job = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&_job_lock, flags);
++
++ if (!list_empty(jobs)) {
++ job = list_entry(jobs->next, struct kcached_job, list);
++ list_del(&job->list);
++ }
++ spin_unlock_irqrestore(&_job_lock, flags);
++
++ return job;
++}
++
++static inline void push(struct list_head *jobs, struct kcached_job *job)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&_job_lock, flags);
++ list_add_tail(&job->list, jobs);
++ spin_unlock_irqrestore(&_job_lock, flags);
++}
++
++
++/****************************************************************************
++ * Functions for asynchronously fetching data from source device and storing
++ * data in cache device. Because the requested data may not align with the
++ * cache blocks, extra handling is required to pad a block request and extract
++ * the requested data from the results.
++ ****************************************************************************/
++
++static void io_callback(unsigned long error, void *context)
++{
++ struct kcached_job *job = (struct kcached_job *) context;
++
++ if (error) {
++ /* TODO */
++ DMERR("io_callback: io error");
++ return;
++ }
++
++ if (job->rw == READ) {
++ job->rw = WRITE;
++ push(&_io_jobs, job);
++ } else
++ push(&_complete_jobs, job);
++ wake();
++}
++
++/*
++ * Fetch data from the source device asynchronously.
++ * For a READ bio, if a cache block is larger than the requested data, then
++ * additional data are prefetched. A larger cache block size enables more
++ * aggressive read prefetching, which is useful for read-mostly workloads.
++ * For a WRITE bio, if a cache block is larger than the requested data, the
++ * entire block needs to be fetched, so a larger block size incurs more
++ * overhead. In scenarios where writes are frequent, 4KB is a good block size.
++ */
++static int do_fetch(struct kcached_job *job)
++{
++ int r = 0, i, j;
++ struct bio *bio = job->bio;
++ struct cache_c *dmc = job->dmc;
++ unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
++ struct bio_vec *bvec;
++ struct page_list *pl;
++
++ offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
++ head = to_bytes(offset);
++ tail = to_bytes(dmc->block_size) - bio->bi_size - head;
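++ /* Worked example (illustrative): with block_size = 8 sectors (4KB) and a
++    1KB bio starting 3 sectors into its cache block: offset = 3, head =
++    to_bytes(3) = 1536 bytes, and tail = 4096 - 1024 - 1536 = 1536 bytes of
++    padding after the data. */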
++
++ DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
++ bio->bi_sector, job->src.sector, job->dest.sector,
++ job->src.count, head, tail);
++
++ if (bio_data_dir(bio) == READ) { /* The original request is a READ */
++ if (0 == job->nr_pages) { /* The request is aligned to cache block */
++ r = dm_io_async_bvec(1, &job->src, READ,
++ bio->bi_io_vec + bio->bi_idx,
++ io_callback, job);
++ return r;
++ }
++
++ nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
++ bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
++ if (!bvec) {
++ DMERR("do_fetch: No memory");
++ return 1;
++ }
++
++ pl = job->pages;
++ i = 0;
++ while (head) {
++ bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
++ bvec[i].bv_offset = 0;
++ bvec[i].bv_page = pl->page;
++ head -= bvec[i].bv_len;
++ pl = pl->next;
++ i++;
++ }
++
++ remaining = bio->bi_size;
++ j = bio->bi_idx;
++ while (remaining) {
++ bvec[i] = bio->bi_io_vec[j];
++ remaining -= bvec[i].bv_len;
++ i++; j++;
++ }
++
++ while (tail) {
++ bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
++ bvec[i].bv_offset = 0;
++ bvec[i].bv_page = pl->page;
++ tail -= bvec[i].bv_len;
++ pl = pl->next;
++ i++;
++ }
++
++ job->bvec = bvec;
++ r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
++ return r;
++ } else { /* The original request is a WRITE */
++ pl = job->pages;
++
++ if (head && tail) { /* Special case: request is in the middle of a block */
++ bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
++ if (!bvec) {
++ DMERR("do_fetch: No memory");
++ return 1;
++ }
++ for (i=0; i<job->nr_pages; i++) {
++ bvec[i].bv_len = PAGE_SIZE;
++ bvec[i].bv_offset = 0;
++ bvec[i].bv_page = pl->page;
++ pl = pl->next;
++ }
++ job->bvec = bvec;
++ r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
++ io_callback, job);
++ return r;
++ }
++
++ bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
++ * sizeof(*bvec), GFP_KERNEL);
++ if (!bvec) {
++ DMERR("do_fetch: No memory");
++ return 1;
++ }
++
++ i = 0;
++ while (head) {
++ bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
++ bvec[i].bv_offset = 0;
++ bvec[i].bv_page = pl->page;
++ head -= bvec[i].bv_len;
++ pl = pl->next;
++ i++;
++ }
++
++ remaining = bio->bi_size;
++ j = bio->bi_idx;
++ while (remaining) {
++ bvec[i] = bio->bi_io_vec[j];
++ remaining -= bvec[i].bv_len;
++ i++; j++;
++ }
++
++ if (tail) {
++ idx = i;
++ bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
++ (PAGE_SIZE - 1);
++ bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
++ bvec[i].bv_page = pl->page;
++ tail -= bvec[i].bv_len;
++ pl = pl->next; i++;
++ while (tail) {
++ bvec[i].bv_len = PAGE_SIZE;
++ bvec[i].bv_offset = 0;
++ bvec[i].bv_page = pl->page;
++ tail -= bvec[i].bv_len;
++ pl = pl->next; i++;
++ }
++ }
++
++ job->bvec = bvec;
++ r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
++ io_callback, job);
++
++ return r;
++ }
++}
++
++/*
++ * Store data to the cache device asynchronously.
++ * For a READ bio request, the data fetched from the source device are returned
++ * to the kernel and stored in the cache at the same time.
++ * For a WRITE bio request, the data are written to the cache and source device
++ * at the same time.
++ */
++static int do_store(struct kcached_job *job)
++{
++ int i, j, r = 0;
++ struct bio *bio = job->bio, *clone;
++ struct cache_c *dmc = job->dmc;
++ unsigned int offset, head, tail, remaining, nr_vecs;
++ struct bio_vec *bvec;
++
++ offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
++ head = to_bytes(offset);
++ tail = to_bytes(dmc->block_size) - bio->bi_size - head;
++
++ DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
++ bio->bi_sector, job->src.sector, job->dest.sector,
++ job->src.count, head, tail);
++
++ /* A READ is acknowledged as soon as the requested data is fetched, and
++ does not have to wait for the data to be stored in the cache. The bio
++ is cloned so that the original one can be ended here. To avoid copying
++ pages, we reuse the pages allocated for the original bio, and take a
++ reference on each of them to prevent the pages from being freed before
++ the cache insertion is completed.
++ */
++ if (bio_data_dir(bio) == READ) {
++ clone = bio_clone(bio, GFP_NOIO);
++ for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
++ get_page(bio->bi_io_vec[i].bv_page);
++ }
++ DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
++ bio_endio(bio, bio->bi_size, 0);
++ bio = clone;
++ job->bio = clone;
++ }
++
++ if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
++ r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
++ io_callback, job);
++ else {
++ if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
++ DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
++ nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
++ if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
++ DPRINTK("Create %u new vecs", nr_vecs);
++ bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
++ if (!bvec) {
++ DMERR("do_store: No memory");
++ return 1;
++ }
++
++ i = 0;
++ while (head) {
++ bvec[i].bv_len = min(head, job->bvec[i].bv_len);
++ bvec[i].bv_offset = 0;
++ bvec[i].bv_page = job->bvec[i].bv_page;
++ head -= bvec[i].bv_len;
++ i++;
++ }
++ remaining = bio->bi_size;
++ j = bio->bi_idx;
++ while (remaining) {
++ bvec[i] = bio->bi_io_vec[j];
++ remaining -= bvec[i].bv_len;
++ i++; j++;
++ }
++ j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
++ bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
++ j * PAGE_SIZE;
++ bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
++ bvec[i].bv_page = job->bvec[j].bv_page;
++ tail -= bvec[i].bv_len;
++ i++; j++;
++ while (tail) {
++ bvec[i] = job->bvec[j];
++ tail -= bvec[i].bv_len;
++ i++; j++;
++ }
++ kfree(job->bvec);
++ job->bvec = bvec;
<<Diff was trimmed, longer than 597 lines>>