On Thu, Nov 06, 2008 at 11:33:23AM -0800, Bangaru, Sailaja wrote:
> One of the problems I'm running into is a deadlock where I have threads
> blocked on newbuf (with vnode lock held) due to buffer shortage and
> bufdaemon unable to flush buffers because it can't get the exclusive
> lock for the vnode. From the code:
>
> struct buf *
> getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int
> slptimeo,
> int flags)
> {
> .
> .
>
> /*
> * Buffer is not in-core, create new buffer. The buffer
> * returned by getnewbuf() is locked. Note that the
> returned
> * buffer is also considered valid (not marked B_INVAL).
> */
> BO_UNLOCK(bo);
> .
> .
> bp = getnewbuf(slpflag, slptimeo, size, maxsize,
> vp_to_buf_index(vp));
> if (bp == NULL) {
> .
> .
> /*
> * This code is used to make sure that a buffer is not
> * created while the getnewbuf routine is blocked.
> * This can be a problem whether the vnode is locked or
> not.
> * If the buffer is created out from under us, we have
> to
> * throw away the one we just created.
> *
> * Note: this must occur before we associate the buffer
> * with the vp especially considering limitations in
> * the splay tree implementation when dealing with
> duplicate
> * lblkno's.
> */
> BO_LOCK(bo);
> if (gbincore(bo, blkno)) {
> BO_UNLOCK(bo);
> bp->b_flags |= B_INVAL;
> brelse(bp);
> goto loop;
> }
>
> Given that this code rechecks if a buffer has already been created with
> the possibility that this thread might have blocked on getnewbuf (), Is
> there really
> any need to hold the vnode lock across getnewbuf call?


Vnode lock is not needed for the fragment you cited. But dropping it
would allow for vnode to be reclaimed, that cause problems for the
callers. Callers actually do not expect the vnode to become VI_DOOMED
after call to bread().

Some time ago I developed patch that handles these bufdaemon deadlocks
with another strategy. I loaned threads that called getblk, to help
bufdaemon. The thread that is to blocked in getblk due to buffer or buffer
KVA shortage, instead does flushing of the buffers belonging to the
vnode locked by that thread.

Also, I fixed the situation where ffs_bufwrite, called to flush buffer
due to shortage, tries to allocate shadow buffer for cg block.

I did not committed this because there is a problem, at least for UFS.
Writing buffer while curthread may hold a buffer lock on the indirect
inode block might cause a panic with recursive lock attempt. This is
fixable, by changing UFS/FFS to never hold two buffer locks, but I
did not got to do it.

Peter Holm tested the patch, it eliminated "nbufkv" deadlocks for our
test scenarious.

diff --git a/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c b/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
index 9759d5a..ff06865 100644
--- a/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
+++ b/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
@@ -81,7 +81,7 @@ xfs_buf_get_empty(size_t size, xfs_buftarg_t *target)
{
struct buf *bp;

- bp = geteblk(0);
+ bp = geteblk(0, 0);
if (bp != NULL) {
bp->b_bufsize = size;
bp->b_bcount = size;
@@ -100,7 +100,7 @@ xfs_buf_get_noaddr(size_t len, xfs_buftarg_t *target)
if (len >= MAXPHYS)
return (NULL);

- bp = geteblk(len);
+ bp = geteblk(len, 0);
if (bp != NULL) {
BUF_ASSERT_HELD(bp);

diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index f75963d..b5a5154 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -105,7 +105,8 @@ static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int, int);
+static int buf_do_flush(struct vnode *vp);
+static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);

@@ -247,6 +248,7 @@ static struct mtx nblock;
#define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */
#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY 5 /* empty buffer headers */
+#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
@@ -1673,21 +1675,23 @@ vfs_bio_awrite(struct buf *bp)
*/

static struct buf *
-getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+ int gbflags)
{
+ struct thread *td;
struct buf *bp;
struct buf *nbp;
int defrag = 0;
int nqindex;
static int flushingbufs;

+ td = curthread;
/*
* We can't afford to block since we might be holding a vnode lock,
* which may prevent system daemons from running. We deal with
* low-memory situations by proactively returning memory and running
* async I/O rather then sync I/O.
*/
-
atomic_add_int(&getnewbufcalls, 1);
atomic_subtract_int(&getnewbufrestarts, 1);
restart:
@@ -1919,8 +1923,9 @@ restart:
*/

if (bp == NULL) {
- int flags;
+ int flags, norunbuf;
char *waitmsg;
+ int fl;

if (defrag) {
flags = VFS_BIO_NEED_BUFSPACE;
@@ -1938,9 +1943,35 @@ restart:
mtx_unlock(&bqlock);

bd_speedup(); /* heeeelp */
+ if (gbflags & GB_NOWAIT_BD)
+ return (NULL);

mtx_lock(&nblock);
while (needsbuffer & flags) {
+ if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+ mtx_unlock(&nblock);
+ /*
+ * getblk() is called with a vnode
+ * locked, and some majority of the
+ * dirty buffers may as well belong to
+ * the vnode. Flushing the buffers
+ * there would make a progress that
+ * cannot be achieved by the
+ * buf_daemon, that cannot lock the
+ * vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+ /* play bufdaemon */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_do_flush(vp);
+ td->td_pflags &= norunbuf;
+ mtx_lock(&nblock);
+ if (fl != 0)
+ continue;
+ if ((needsbuffer & flags) == 0)
+ break;
+ }
if (msleep(&needsbuffer, &nblock,
(PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
mtx_unlock(&nblock);
@@ -2009,6 +2040,35 @@ static struct kproc_desc buf_kp = {
};
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);

+static int
+buf_do_flush(struct vnode *vp)
+{
+ int flushed;
+
+ flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+ /* The list empty check here is slightly racy */
+ if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+ mtx_lock(&Giant);
+ flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
+ mtx_unlock(&Giant);
+ }
+ if (flushed == 0) {
+ /*
+ * Could not find any buffers without rollback
+ * dependencies, so just write the first one
+ * in the hopes of eventually making progress.
+ */
+ flushbufqueues(vp, QUEUE_DIRTY, 1);
+ if (!TAILQ_EMPTY(
+ &bufqueues[QUEUE_DIRTY_GIANT])) {
+ mtx_lock(&Giant);
+ flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
+ mtx_unlock(&Giant);
+ }
+ }
+ return (flushed);
+}
+
static void
buf_daemon()
{
@@ -2022,7 +2082,7 @@ buf_daemon()
/*
* This process is allowed to take the buffer cache to the limit
*/
- curthread->td_pflags |= TDP_NORUNNINGBUF;
+ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
mtx_lock(&bdlock);
for (; {
bd_request = 0;
@@ -2037,30 +2097,8 @@ buf_daemon()
* normally would so they can run in parallel with our drain.
*/
while (numdirtybuffers > lodirtybuffers) {
- int flushed;
-
- flushed = flushbufqueues(QUEUE_DIRTY, 0);
- /* The list empty check here is slightly racy */
- if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
- mtx_lock(&Giant);
- flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
- mtx_unlock(&Giant);
- }
- if (flushed == 0) {
- /*
- * Could not find any buffers without rollback
- * dependencies, so just write the first one
- * in the hopes of eventually making progress.
- */
- flushbufqueues(QUEUE_DIRTY, 1);
- if (!TAILQ_EMPTY(
- &bufqueues[QUEUE_DIRTY_GIANT])) {
- mtx_lock(&Giant);
- flushbufqueues(QUEUE_DIRTY_GIANT, 1);
- mtx_unlock(&Giant);
- }
+ if (buf_do_flush(NULL) == 0)
break;
- }
uio_yield();
}

@@ -2106,7 +2144,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW,&flushwithdeps,
0, "Number of buffers flushed with dependecies that require rollbacks");

static int
-flushbufqueues(int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
{
struct buf sentinel;
struct vnode *vp;
@@ -2121,15 +2159,29 @@ flushbufqueues(int queue, int flushdeps)
target /= 2;
flushed = 0;
bp = NULL;
+ sentinel.b_qindex = QUEUE_SENTINEL;
mtx_lock(&bqlock);
- TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
+ TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist);
while (flushed != target) {
- bp = TAILQ_FIRST(&bufqueues[queue]);
- if (bp == &sentinel)
+ bp = TAILQ_NEXT(&sentinel, b_freelist);
+ if (bp != NULL) {
+ TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
+ TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel,
+ b_freelist);
+ } else
break;
- TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
- TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
-
+ /*
+ * Skip sentinels inserted by other invocations of the
+ * flushbufqueues(), taking care to not reorder them.
+ */
+ if (bp->b_qindex == QUEUE_SENTINEL)
+ continue;
+ /*
+ * Only flush the buffers that belong to the
+ * vnode locked by the curthread.
+ */
+ if (lvp != NULL && bp->b_vp != lvp)
+ continue;
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
continue;
if (bp->b_pin_count > 0) {
@@ -2177,16 +2229,26 @@ flushbufqueues(int queue, int flushdeps)
BUF_UNLOCK(bp);
continue;
}
- if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
mtx_unlock(&bqlock);
CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
- vfs_bio_awrite(bp);
+ if (curproc == bufdaemonproc)
+ vfs_bio_awrite(bp);
+ else {
+ bremfree(bp);
+ bwrite(bp);
+ }
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
flushwithdeps += hasdeps;
flushed++;
- waitrunningbufspace();
+ /*
+ * Sleeping on runningbufspace while holding
+ * vnode lock leads to deadlock.
+ */
+ if (curproc == bufdaemonproc)
+ waitrunningbufspace();
numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
mtx_lock(&bqlock);
continue;
@@ -2568,7 +2630,7 @@ loop:
maxsize = vmio ? size + (offset & PAGE_MASK) : size;
maxsize = imax(maxsize, bsize);

- bp = getnewbuf(slpflag, slptimeo, size, maxsize);
+ bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
if (bp == NULL) {
if (slpflag || slptimeo)
return NULL;
@@ -2643,14 +2705,17 @@ loop:
* set to B_INVAL.
*/
struct buf *
-geteblk(int size)
+geteblk(int size, int flags)
{
struct buf *bp;
int maxsize;

maxsize = (size + BKVAMASK) & ~BKVAMASK;
- while ((bp = getnewbuf(0, 0, size, maxsize)) == 0)
- continue;
+ while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+ if ((flags & GB_NOWAIT_BD) &&
+ (curthread->td_pflags & TDP_BUFNEED) != 0)
+ return (NULL);
+ }
allocbuf(bp, size);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
BUF_ASSERT_HELD(bp);
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index e05b20c..09e5c03 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -443,6 +443,7 @@ buf_countdeps(struct buf *bp, int i)
*/
#define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */
#define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */
+#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */

#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
@@ -487,7 +488,7 @@ struct buf * getpbuf(int *);
struct buf *incore(struct bufobj *, daddr_t);
struct buf *gbincore(struct bufobj *, daddr_t);
struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
-struct buf *geteblk(int);
+struct buf *geteblk(int, int);
int bufwait(struct buf *);
int bufwrite(struct buf *);
void bufdone(struct buf *);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index b326ba7..f38f0e4 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -344,7 +344,7 @@ do { \
#define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */
#define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */
#define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */
-#define TDP_UNUSED8 0x00000008 /* available */
+#define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */
#define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress.. */
#define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */
#define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment.. */
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 563473c..8118b80 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1824,7 +1824,9 @@ ffs_bufwrite(struct buf *bp)
("bufwrite: needs chained iodone (%p)", bp->b_iodone));

/* get a new block */
- newbp = geteblk(bp->b_bufsize);
+ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
+ if (newbp == NULL)
+ goto normal_write;

/*
* set it to be identical to the old block. We have to
@@ -1864,6 +1866,7 @@ ffs_bufwrite(struct buf *bp)
}

/* Let the normal bufwrite do the rest for us */
+normal_write:
return (bufwrite(bp));
}


-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.9 (FreeBSD)

iEYEARECAAYFAkkTUO0ACgkQC3+MBN1Mb4i8XACgsyI7qFwRYX Yz3mncLcQ+sdJB
hpUAn3WOW7DkNnjCjlq0iDdj6mt4qTBp
=ap9I
-----END PGP SIGNATURE-----