[RFC] vfs_bio additions, motivated by XFS for FreeBSD project

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[RFC] vfs_bio additions, motivated by XFS for FreeBSD project

Craig Rodrigues
Hi,

Now that FreeBSD 6.0 is released, I would like to work
on integrating code from the XFS for FreeBSD project into
FreeBSD-CURRENT.

Alexander Kabaev made some changes to vfs_bio.c which are
needed by the XFS for FreeBSD code.  In addition to some
new functions, this patch adds three new filesystem-private
fields to struct buf (b_fsprivate1, b_fsprivate2, b_fsprivate3),
plus a b_pin_count field used by the new bpin()/bunpin() functions.
You don't see the b_fsprivate fields used here, but in the XFS for FreeBSD
code (which you can get from http://people.freebsd.org/~rodrigc/xfs/ ),
they are used to cache certain information.

Comments?


--- //depot/vendor/freebsd/src/sys/kern/vfs_bio.c 2005/10/08 15:01:11
+++ //depot/projects/src/sys/kern/vfs_bio.c 2005/10/08 16:09:54
@@ -216,7 +216,7 @@
  */
 static struct mtx rbreqlock;
 
-/*
+/*
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
@@ -233,8 +233,12 @@
 /*
  * Lock that protects against bwait()/bdone()/B_DONE races.
  */
+static struct mtx bdonelock;
 
-static struct mtx bdonelock;
+/*
+ * Lock that protects b_pin_count in bpin()/bunpin()/bunpin_wait().
+ */
+static struct mtx bpinlock;
 
 /*
  * Definitions for the buffer free lists.
@@ -523,6 +527,7 @@
  mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
  mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
  mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
+ mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
 
  /* next, make a null set of free lists */
  for (i = 0; i < BUFFER_QUEUES; i++)
@@ -636,7 +641,7 @@
  * bremfree:
  *
  * Mark the buffer for removal from the appropriate free list in brelse.
- *
+ *
  */
 void
 bremfree(struct buf *bp)
@@ -720,18 +725,51 @@
 }
 
 /*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+    int cnt, struct ucred * cred)
+{
+ struct buf *rabp;
+ int i;
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (curthread != PCPU_GET(idlethread))
+ curthread->td_proc->p_stats->p_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ rabp->b_iooffset = dbtob(rabp->b_blkno);
+ bstrategy(rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+}
+
+/*
  * Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
- * to initiating I/O . If B_CACHE is set, the buffer is valid
- * and we do not have to do anything.
+ * read-ahead blocks.
  */
 int
 breadn(struct vnode * vp, daddr_t blkno, int size,
     daddr_t * rablkno, int *rabsize,
     int cnt, struct ucred * cred, struct buf **bpp)
 {
- struct buf *bp, *rabp;
- int i;
+ struct buf *bp;
  int rv = 0, readwait = 0;
 
  CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
@@ -752,29 +790,8 @@
  ++readwait;
  }
 
- for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
- if (inmem(vp, *rablkno))
- continue;
- rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+ breada(vp, rablkno, rabsize, cnt, cred);
 
- if ((rabp->b_flags & B_CACHE) == 0) {
- if (curthread != PCPU_GET(idlethread))
- curthread->td_proc->p_stats->p_ru.ru_inblock++;
- rabp->b_flags |= B_ASYNC;
- rabp->b_flags &= ~B_INVAL;
- rabp->b_ioflags &= ~BIO_ERROR;
- rabp->b_iocmd = BIO_READ;
- if (rabp->b_rcred == NOCRED && cred != NOCRED)
- rabp->b_rcred = crhold(cred);
- vfs_busy_pages(rabp, 0);
- BUF_KERNPROC(rabp);
- rabp->b_iooffset = dbtob(rabp->b_blkno);
- bstrategy(rabp);
- } else {
- brelse(rabp);
- }
- }
-
  if (readwait) {
  rv = bufwait(bp);
  }
@@ -807,6 +824,10 @@
 
  if (BUF_REFCNT(bp) == 0)
  panic("bufwrite: buffer is not busy???");
+
+ if (bp->b_pin_count > 0)
+ bunpin_wait(bp);
+
  KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
     ("FFS background buffer should not get here %p", bp));
 
@@ -1117,6 +1138,11 @@
  KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
     ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
+ if (bp->b_flags & B_MANAGED) {
+ bqrelse(bp);
+ return;
+ }
+
  if (bp->b_iocmd == BIO_WRITE &&
     (bp->b_ioflags & BIO_ERROR) &&
     !(bp->b_flags & B_INVAL)) {
@@ -1286,7 +1312,7 @@
  }
 
  }
-
+
  if (BUF_REFCNT(bp) > 1) {
  /* do not release to free list */
  BUF_UNLOCK(bp);
@@ -1394,6 +1420,18 @@
  BUF_UNLOCK(bp);
  return;
  }
+
+ if (bp->b_flags & B_MANAGED) {
+ if (bp->b_flags & B_REMFREE) {
+ mtx_lock(&bqlock);
+ bremfreel(bp);
+ mtx_unlock(&bqlock);
+ }
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ BUF_UNLOCK(bp);
+ return;
+ }
+
  mtx_lock(&bqlock);
  /* Handle delayed bremfree() processing. */
  if (bp->b_flags & B_REMFREE)
@@ -1821,6 +1859,10 @@
  bp->b_npages = 0;
  bp->b_dirtyoff = bp->b_dirtyend = 0;
  bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
 
  LIST_INIT(&bp->b_dep);
 
@@ -2059,6 +2101,10 @@
 
  if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
  continue;
+ if (bp->b_pin_count > 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
  BO_LOCK(bp->b_bufobj);
  if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
     (bp->b_flags & B_DELWRI) == 0) {
@@ -2393,6 +2439,19 @@
  if ((bp->b_flags & B_VMIO) == 0 ||
     (size > bp->b_kvasize)) {
  if (bp->b_flags & B_DELWRI) {
+ /*
+ * If the buffer is pinned and the caller
+ * does not want to sleep waiting for it
+ * to be unpinned, bail out.
+ */
+ if (bp->b_pin_count > 0) {
+ if (flags & GB_LOCK_NOWAIT) {
+ bqrelse(bp);
+ return (NULL);
+ } else {
+ bunpin_wait(bp);
+ }
+ }
  bp->b_flags |= B_NOCACHE;
  bwrite(bp);
  } else {
@@ -3034,11 +3093,11 @@
  struct bufobj *dropobj;
  void    (*biodone)(struct buf *);
 
-
  CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
  dropobj = NULL;
 
- KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+    BUF_REFCNT(bp)));
  KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 
  runningbufwakeup(bp);
@@ -3053,6 +3112,19 @@
  bufobj_wdrop(dropobj);
  return;
  }
+
+ bufdone_finish(bp);
+
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+}
+
+void
+bufdone_finish(struct buf *bp)
+{
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+    BUF_REFCNT(bp)));
+
  if (LIST_FIRST(&bp->b_dep) != NULL)
  buf_complete(bp);
 
@@ -3118,7 +3190,8 @@
  if (m == NULL)
  panic("biodone: page disappeared!");
  bp->b_pages[i] = m;
- pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+    bp->b_pages, bp->b_npages);
  }
 #if defined(VFS_BIO_DEBUG)
  if (OFF_TO_IDX(foff) != m->pindex) {
@@ -3130,7 +3203,7 @@
 
  /*
  * In the write case, the valid and clean bits are
- * already changed correctly ( see bdwrite() ), so we
+ * already changed correctly ( see bdwrite() ), so we
  * only need to do this here in the read case.
  */
  if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
@@ -3185,8 +3258,6 @@
  bqrelse(bp);
  } else
  bdone(bp);
- if (dropobj)
- bufobj_wdrop(dropobj);
 }
 
 /*
@@ -3742,6 +3813,32 @@
  return (error);
 }
 
+void
+bpin(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ bp->b_pin_count ++;
+ mtx_unlock(&bpinlock);
+}
+
+void
+bunpin(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ if ( --bp->b_pin_count == 0)
+ wakeup(bp);
+ mtx_unlock(&bpinlock);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ while (bp->b_pin_count > 0)
+ msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
+ mtx_unlock(&bpinlock);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -3794,3 +3891,4 @@
  }
 }
 #endif /* DDB */
+
--- //depot/vendor/freebsd/src/sys/kern/vfs_cluster.c 2005/08/14 09:53:08
+++ //depot/projects/src/sys/kern/vfs_cluster.c 2005/08/14 10:01:58
@@ -765,6 +765,12 @@
  --len;
  continue;
  }
+ if (tbp->b_pin_count >  0) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
  bremfree(tbp);
  tbp->b_flags &= ~B_DONE;
 
@@ -868,6 +874,15 @@
  BUF_UNLOCK(tbp);
  break;
  }
+
+ /*
+ * Do not pull in pinned buffers.
+ */
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
  /*
  * Ok, it's passed all the tests,
  * so remove it from the free list
@@ -979,3 +994,4 @@
  buflist->bs_nchildren = i + 1;
  return (buflist);
 }
+
--- //depot/vendor/freebsd/src/sys/sys/buf.h 2005/10/08 15:01:11
+++ //depot/projects/src/sys/sys/buf.h 2005/10/08 16:09:54
@@ -135,6 +135,10 @@
  struct vm_page *b_pages[btoc(MAXPHYS)];
  int b_npages;
  struct workhead b_dep; /* (D) List of filesystem dependencies. */
+ void *b_fsprivate1;
+ void *b_fsprivate2;
+ void *b_fsprivate3;
+ int b_pin_count;
 };
 
 #define b_object b_bufobj->bo_object
@@ -214,7 +218,7 @@
 #define B_01000000 0x01000000 /* Available flag. */
 #define B_02000000 0x02000000 /* Available flag. */
 #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
-#define B_08000000 0x08000000 /* Available flag. */
+#define B_MANAGED 0x08000000 /* Managed by FS. */
 #define B_RAM 0x10000000 /* Read ahead mark (flag) */
 #define B_VMIO 0x20000000 /* VMIO flag */
 #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
@@ -486,6 +490,7 @@
 void bremfree(struct buf *);
 void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */
 int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **);
+void breada(struct vnode *, daddr_t *, int *, int, struct ucred *);
 int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int,
     struct ucred *, struct buf **);
 void bdwrite(struct buf *);
@@ -504,6 +509,7 @@
 int bufwait(struct buf *);
 int bufwrite(struct buf *);
 void bufdone(struct buf *);
+void bufdone_finish(struct buf *);
 
 int cluster_read(struct vnode *, u_quad_t, daddr_t, long,
     struct ucred *, long, int, struct buf **);
@@ -527,7 +533,11 @@
 struct buf *trypbuf(int *);
 void bwait(struct buf *, u_char, const char *);
 void bdone(struct buf *);
+void bpin(struct buf *);
+void bunpin(struct buf *);
+void bunpin_wait(struct buf *);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF_H_ */
+
--
Craig Rodrigues        
[hidden email]
_______________________________________________
[hidden email] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-arch
To unsubscribe, send any mail to "[hidden email]"