/*	$NetBSD: rumpblk.c,v 1.64 2016/07/07 06:55:44 msaitoh Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 *
 * We provide fault injection.  The driver can be made to fail
 * I/O occasionally.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.64 2016/07/07 06:55:44 msaitoh Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/disklabel.h>
#include <sys/dkio.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>

#include <rump-sys/kern.h>
#include <rump-sys/vfs.h>

#include <rump/rumpuser.h>

#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

#define RUMPBLK_SIZE 16
static struct rblkdev {
	char *rblk_path;
	int rblk_fd;
	int rblk_mode;

	uint64_t rblk_size;
	uint64_t rblk_hostoffset;
	uint64_t rblk_hostsize;
	int rblk_ftype;

	struct disklabel rblk_label;
} minors[RUMPBLK_SIZE];

static struct evcnt ev_io_total;
static struct evcnt ev_io_async;
static struct evcnt ev_bwrite_total;
static struct evcnt ev_bwrite_async;
static struct evcnt ev_bread_total;

dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

static const struct bdevsw rumpblk_bdevsw = {
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static const struct bdevsw rumpblk_bdevsw_fail = {
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_strategy = rumpblk_strategy_fail,
	.d_ioctl = rumpblk_ioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static const struct cdevsw rumpblk_cdevsw = {
	.d_open = rumpblk_open,
	.d_close = rumpblk_close,
	.d_read = rumpblk_read,
	.d_write = rumpblk_write,
	.d_ioctl = rumpblk_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);
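/*
 * Usage sketch for the fault injection knobs defined below.  They are
 * read with rumpuser_getparam(), which the POSIX rumpuser implementation
 * typically satisfies from the host environment.  A hypothetical
 * invocation that fails roughly 1% of requests (100 out of
 * BLKFAIL_MAX == 10000) with a fixed, repeatable seed:
 *
 *	env RUMP_BLKFAIL=100 RUMP_BLKFAIL_SEED=12345 ./rumpserver
 *
 * "./rumpserver" is a placeholder for whatever binary hosts this
 * rump kernel.
 */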
/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;
static unsigned randstate;
static kmutex_t rumpblk_lock;
static int sectshift = DEV_BSHIFT;

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
	int i;

	memset(lp, 0, sizeof(*lp));

	lp->d_secperunit = size;
	lp->d_secsize = 1 << sectshift;
	lp->d_nsectors = size >> sectshift;
	lp->d_ntracks = 1;
	lp->d_ncylinders = 1;
	lp->d_secpercyl = lp->d_nsectors;

	/* oh dear oh dear */
	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

	lp->d_type = DKTYPE_RUMPD;
	lp->d_rpm = 11;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	/* XXX: RAW_PART handling? */
	for (i = 0; i < part; i++) {
		lp->d_partitions[i].p_fstype = FS_UNUSED;
	}
	lp->d_partitions[part].p_size = size >> sectshift;
	lp->d_npartitions = part+1;
	/* XXX: file system type? */

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = 0; /* XXX */
}

int
rumpblk_init(void)
{
	char buf[64];
	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
	unsigned tmp;
	int i;

	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

	if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
		blkfail = strtoul(buf, NULL, 10);

		/* fail everything */
		if (blkfail > BLKFAIL_MAX)
			blkfail = BLKFAIL_MAX;
		if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
		    buf, sizeof(buf)) == 0) {
			randstate = strtoul(buf, NULL, 10);
		} else {
			randstate = cprng_fast32();
		}
		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
	} else {
		blkfail = 0;
	}

	if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
		printf("rumpblk: ");
		tmp = strtoul(buf, NULL, 10);
		if (tmp >= DEV_BSHIFT)
			sectshift = tmp;
		else
			printf("RUMP_BLKSECTSHIFT must be at least %d "
			    "(now %d), ", DEV_BSHIFT, tmp);
		printf("using %d for sector shift (size %d)\n",
		    sectshift, 1<<sectshift);
	}

	/* mark all minors unconfigured until they are registered */
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		minors[i].rblk_fd = -1;
	}

	if (blkfail) {
		return devsw_attach("rumpblk", &rumpblk_bdevsw_fail,
		    &rumpblkmaj, &rumpblk_cdevsw, &rumpblkmaj);
	} else {
		return devsw_attach("rumpblk", &rumpblk_bdevsw,
		    &rumpblkmaj, &rumpblk_cdevsw, &rumpblkmaj);
	}
}

int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if ((error = rumpuser_getfileinfo(path, &flen, &ftype)) != 0)
		return error;

	/* verify host file is of a supported type */
	if (!(ftype == RUMPUSER_FT_REG
	   || ftype == RUMPUSER_FT_BLK
	   || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	/*
	 * If the path is already registered, hand out the existing
	 * minor.  Otherwise reserve the first free slot.
	 */
	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path
		    && strcmp(minors[i].rblk_path, path) == 0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}

/*
 * Unregister rumpblk.  It's the caller's responsibility to make
 * sure it's no longer in use.
 */
int
rumpblk_deregister(const char *path)
{
	struct rblkdev *rblk;
	int i;

	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path
		    && strcmp(minors[i].rblk_path, path) == 0) {
			break;
		}
	}
	mutex_exit(&rumpblk_lock);

	if (i == RUMPBLK_SIZE)
		return ENOENT;

	rblk = &minors[i];
	backend_close(rblk);

	free(rblk->rblk_path, M_TEMP);
	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
	rblk->rblk_path = NULL;

	return 0;
}
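/*
 * Usage sketch (hypothetical values): expose a host image file as a
 * rumpblk minor for the lifetime of a mount, using the whole file:
 *
 *	devminor_t dmin;
 *	int error;
 *
 *	error = rumpblk_register("/tmp/disk.img", &dmin, 0,
 *	    RUMPBLK_SIZENOTSET);
 *	...
 *	error = rumpblk_deregister("/tmp/disk.img");
 *
 * "/tmp/disk.img" is a placeholder path, not something this driver
 * mandates.
 */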
/*
 * Release all backend resources, to be called only when the rump
 * kernel is being shut down.
 * This routine does not do a full "fini" since we're going down anyway.
 */
void
rumpblk_fini(void)
{
	int i;

	for (i = 0; i < RUMPBLK_SIZE; i++) {
		struct rblkdev *rblk;

		rblk = &minors[i];
		if (rblk->rblk_fd != -1)
			backend_close(rblk);
	}
}

static int
backend_open(struct rblkdev *rblk, const char *path)
{
	int error, fd;

	KASSERT(rblk->rblk_fd == -1);
	error = rumpuser_open(path,
	    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
	if (error) {
		error = rumpuser_open(path,
		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
		if (error)
			return error;
		rblk->rblk_mode = FREAD;
	} else {
		rblk->rblk_mode = FREAD|FWRITE;
	}

	rblk->rblk_fd = fd;
	KASSERT(rblk->rblk_fd != -1);
	return 0;
}

static int
backend_close(struct rblkdev *rblk)
{

	rumpuser_close(rblk->rblk_fd);
	rblk->rblk_fd = -1;

	return 0;
}

int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
	struct rblkdev *rblk = &minors[minor(dev)];

	if (rblk->rblk_fd == -1)
		return ENXIO;

	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
		return EACCES;
	}

	return 0;
}

int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}

int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
	devminor_t dmin = minor(dev);
	struct rblkdev *rblk = &minors[dmin];
	struct partinfo *pi;
	struct partition *dp;
	int error = 0;

	/* well, we should support a few more, but we don't for now */
	switch (xfer) {
	case DIOCGDINFO:
		*(struct disklabel *)addr = rblk->rblk_label;
		break;

	case DIOCGPARTINFO:
		dp = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
		pi = addr;
		pi->pi_offset = dp->p_offset;
		pi->pi_size = dp->p_size;
		pi->pi_secsize = rblk->rblk_label.d_secsize;
		pi->pi_bsize = BLKDEV_IOSIZE;
		pi->pi_fstype = dp->p_fstype;
		pi->pi_fsize = dp->p_fsize;
		pi->pi_frag = dp->p_frag;
		pi->pi_cpg = dp->p_cpg;
		break;

	/* it's synced enough along the write path */
	case DIOCCACHESYNC:
		break;

	case DIOCGMEDIASIZE:
		*(off_t *)addr = (off_t)rblk->rblk_size;
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

static int
do_physio(dev_t dev, struct uio *uio, int which)
{
	void (*strat)(struct buf *);

	if (blkfail)
		strat = rumpblk_strategy_fail;
	else
		strat = rumpblk_strategy;

	return physio(strat, NULL, dev, which, minphys, uio);
}

int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}

int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int op;

	/* the request must be a multiple of the backing sector size */
	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
	if (BUF_ISWRITE(bp) && !async)
		op |= RUMPUSER_BIO_SYNC;

	rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
	    rump_biodone, bp);
}
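/*
 * Worked example of the offset math in dostrategy(), assuming the usual
 * DEV_BSHIFT of 9 (DEV_BSIZE == 512): a buffer with b_blkno == 100 maps
 * to host byte offset 100 << 9 == 51200.  If the backing file is 51712
 * bytes and b_bcount is 1024, the bounds check truncates the transfer
 * to 512 bytes; at exactly 51200 bytes it reports EOF, and beyond that
 * it fails the request with EINVAL.
 */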
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}

/*
 * Simple random number generator.  This is private so that we can
 * very repeatably control which blocks will fail.
 *
 * pooka, rand()
 * [paste]
 */
static unsigned
gimmerand(void)
{

	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
}

/*
 * Block device with very simple fault injection.  Fails every
 * n out of BLKFAIL_MAX I/O with EIO.  n is determined by the env
 * variable RUMP_BLKFAIL.
 */
void
rumpblk_strategy_fail(struct buf *bp)
{

	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
		dostrategy(bp);
	} else {
		printf("block fault injection: failing I/O on block %lld\n",
		    (long long)bp->b_blkno);
		bp->b_error = EIO;
		biodone(bp);
	}
}
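/*
 * Note on the failure rate: gimmerand() uses the classic ANSI C
 * rand()-style linear congruential generator, so for a given
 * RUMP_BLKFAIL_SEED the same sequence of I/Os fails on every run.
 * A request fails when gimmerand() % BLKFAIL_MAX < blkfail, i.e. with
 * probability of roughly blkfail/BLKFAIL_MAX; for example,
 * RUMP_BLKFAIL=100 fails about 1% of requests.
 */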