// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>

#include "md.h"
#include "md-bitmap.h"

/*
 * #### Background
 *
 * Redundant data is used to enhance data fault tolerance, and the storage
 * method for redundant data varies depending on the RAID level. It is
 * important to maintain the consistency of this redundant data.
 *
 * The bitmap records which data blocks have been synchronized and which ones
 * need to be resynchronized or recovered. Each bit in the bitmap represents a
 * segment of data in the array. When a bit is set, it indicates that the
 * multiple redundant copies of that data segment may not be consistent. Data
 * synchronization can be performed based on the bitmap after a power failure
 * or after re-adding a disk. Without a bitmap, a full disk synchronization is
 * required.
 *
 * #### Key Features
 *
 *  - The IO fastpath is lockless; if the user issues lots of write IO to the
 *  same bitmap bit in a short time, only the first write has the additional
 *  overhead of updating the bitmap bit, and the following writes have none;
 *  - Only written data is resynced or recovered, meaning that when creating a
 *  new array or replacing a disk with a new one, there is no need to do a full
 *  disk resync/recovery;
 *
 * #### Key Concept
 *
 * ##### State Machine
 *
 * Each bit occupies one byte and holds one of 6 different states, see
 * llbitmap_state. There are in total 8 different actions, see
 * llbitmap_action, that can change a bit's state:
 *
 * llbitmap state machine: transitions between states
 *
 * |           | Startwrite | Startsync | Endsync | Abortsync|
 * | --------- | ---------- | --------- | ------- | -------  |
 * | Unwritten | Dirty      | x         | x       | x        |
 * | Clean     | Dirty      | x         | x       | x        |
 * | Dirty     | x          | x         | x       | x        |
 * | NeedSync  | x          | Syncing   | x       | x        |
 * | Syncing   | x          | Syncing   | Dirty   | NeedSync |
 *
 * |           | Reload   | Daemon | Discard   | Stale     |
 * | --------- | -------- | ------ | --------- | --------- |
 * | Unwritten | x        | x      | x         | x         |
 * | Clean     | x        | x      | Unwritten | NeedSync  |
 * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
 * | NeedSync  | x        | x      | Unwritten | x         |
 * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
 *
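 * These tables map one-to-one onto the state_machine[][] array defined below.
 * An illustrative lookup (not extra code in this file):
 *
 *   state_machine[BitDirty][BitmapActionDaemon]    == BitClean
 *   state_machine[BitSyncing][BitmapActionEndsync] == BitDirty
 *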
 * Typical scenarios:
 *
 * 1) Create new array
 * All bits will be set to Unwritten by default; if --assume-clean is set,
 * all bits will be set to Clean instead.
 *
 * 2) write data; raid1/raid10 have a full copy of the data, while raid456
 * doesn't and relies on xor data
 *
 * 2.1) write new data to raid1/raid10:
 * Unwritten --StartWrite--> Dirty
 *
 * 2.2) write new data to raid456:
 * Unwritten --StartWrite--> NeedSync
 *
 * Because the initial recovery for raid456 is skipped, the xor data is not
 * built yet; the bit must be set to NeedSync first, and after the lazy initial
 * recovery is finished, the bit will finally be set to Dirty (see 5.1 and 5.4);
 *
 * 2.3) overwrite existing data
 * Clean --StartWrite--> Dirty
 *
 * 3) daemon, if the array is not degraded:
 * Dirty --Daemon--> Clean
 *
 * 4) discard
 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
 *
 * 5) resync and recover
 *
 * 5.1) common process
 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
 *
 * 5.2) resync after power failure
 * Dirty --Reload--> NeedSync
 *
 * 5.3) recover while replacing with a new disk
 * By default, the old bitmap framework will recover all data; llbitmap
 * implements this with a new helper, llbitmap_skip_sync_blocks, which skips
 * recovery for bits other than Dirty or Clean;
 *
 * 5.4) lazy initial recovery for raid5:
 * By default, the old bitmap framework only allows a new recovery when there
 * are spares (a new disk); a new recovery flag, MD_RECOVERY_LAZY_RECOVER, is
 * added to perform raid456 lazy recovery for set bits (from 2.2).
 *
 * 6) special handling for a degraded array:
 *
 * - Dirty bits will never be cleared and the daemon will just do nothing, so
 *   that if a disk is re-added, Clean bits can be skipped during recovery;
 * - Dirty bits are converted to Syncing on Startsync, to do data recovery
 *   for newly added disks;
 * - New writes convert bits to NeedSync directly;
 *
 * ##### Bitmap IO
 *
 * ##### Chunksize
 *
 * The default bitmap size is 128k, including the 1k bitmap super block, and
 * the default size of the data segment each bit covers (the chunksize) is 64k;
 * the chunksize is doubled until the total number of bits no longer exceeds
 * 127k (see llbitmap_init).
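 *
 * A sketch of the sizing loop (this mirrors llbitmap_init below; 'blocks' is
 * the array size in sectors and 'space' is the usable bitmap area in bytes,
 * one byte per bit):
 *
 *   chunksize = MIN_CHUNK_SIZE;
 *   chunks = DIV_ROUND_UP(blocks, chunksize);
 *   while (chunks > space) {
 *           chunksize <<= 1;
 *           chunks = DIV_ROUND_UP(blocks, chunksize);
 *   }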
 *
 * ##### READ
 *
 * When the bitmap is created, all pages are allocated and read for llbitmap;
 * there are no further reads afterwards.
 *
 * ##### WRITE
 *
 * WRITE IO is divided into blocks of the array's logical_block_size, and the
 * dirty state of each block is tracked independently. For example:
 *
 * each page is 4k and contains 8 blocks; each block is 512 bytes and contains
 * 512 bits;
 *
 * | page0 | page1 | ... | page 31 |
 * |       |
 * |        \-----------------------\
 * |                                |
 * | block0 | block1 | ... | block 7|
 * |        |
 * |         \-----------------\
 * |                            |
 * | bit0 | bit1 | ... | bit511 |
 *
 * On the IO path, if one bit is changed to Dirty or NeedSync, the
 * corresponding subpage will be marked dirty, and such a block must be written
 * first, before the IO is issued. This behaviour will affect IO performance;
 * to reduce the impact, if multiple bits are changed in the same block in a
 * short time, all bits in this block will be changed to Dirty/NeedSync, so
 * that there won't be any overhead until the daemon clears dirty bits.
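 *
 * An illustrative bit-to-location mapping (this mirrors llbitmap_read() and
 * llbitmap_set_page_dirty() below; io_size is the logical block size):
 *
 *   pos    = bit + BITMAP_DATA_OFFSET;    // skip the 1k super block
 *   page   = pos >> PAGE_SHIFT;
 *   offset = offset_in_page(pos);
 *   block  = offset / io_size;            // dirty-tracking granularity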
 *
 * ##### Dirty Bits synchronization
 *
 * The IO fast path sets bits to Dirty, and those dirty bits are cleared by the
 * daemon after the IO is done. llbitmap_page_ctl is used to synchronize
 * between the IO path and the daemon;
 *
 * IO path:
 *  1) try to grab a reference; on success, set the expiry time 5s into the
 *  future and return;
 *  2) if grabbing a reference fails, wait for the daemon to finish clearing
 *  dirty bits;
 *
 * Daemon (woken up every daemon_sleep seconds):
 * For each page:
 *  1) check if the page has expired; if not, skip it; for an expired page:
 *  2) suspend the page and wait for inflight write IO to be done;
 *  3) change dirty bits to clean;
 *  4) resume the page;
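 *
 * For example, with the defaults barrier_idle = 5 and daemon_sleep = 30: a
 * write at time T sets the page's expiry to T + 5s, and the first daemon run
 * after that expiry suspends the page, converts its Dirty bits to Clean and
 * resumes it.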
 */

#define BITMAP_DATA_OFFSET 1024

/* 64k is the max IO size of sync IO for raid1/raid10 */
#define MIN_CHUNK_SIZE (64 * 2)

/* By default, daemon will be woken up every 30s */
#define DEFAULT_DAEMON_SLEEP 30

/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by daemon.
 */
#define DEFAULT_BARRIER_IDLE 5

enum llbitmap_state {
	/* No valid data; init state after assembling the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data needs to be resynchronized:
	 * 1) set directly for writes if the array is degraded, to prevent a
	 * full disk synchronization after re-adding a disk;
	 * 2) the array is reassembled after a power failure, and dirty bits
	 * are found when reloading the bitmap;
	 * 3) set on the first write for raid5, to build the initial xor data
	 * lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	BitStateCount,
	BitNone = 0xff,
};

enum llbitmap_action {
	/* User writes new data; this is the only action from the IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale; mark all bits other than BitUnwritten as
	 * BitNeedSync.
	 */
	BitmapActionStale,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

enum llbitmap_page_state {
	LLPageFlush = 0,
	LLPageDirty,
};

struct llbitmap_page_ctl {
	char *state;
	struct page *page;
	unsigned long expire;
	unsigned long flags;
	wait_queue_head_t wait;
	struct percpu_ref active;
	/* Per-block dirty state; at most 64k page / 1 sector (512 bytes) = 128 */
	unsigned long dirty[];
};

struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sectors */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds after which dirty bits are cleared if the page is
	 * not accessed.
	 */
	unsigned long barrier_idle;
	/* fires on first BitDirty state */
	struct timer_list pending_timer;
	struct work_struct daemon_work;

	unsigned long flags;
	__u64	events_cleared;

	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};

struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};

static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;

static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitNone,
		[BitmapActionStale]		= BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite]	= BitDirty,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
	[BitDirty] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitNone,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNeedSync,
		[BitmapActionDaemon]		= BitClean,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitSyncing,
		[BitmapActionEndsync]		= BitNone,
		[BitmapActionAbortsync]		= BitNone,
		[BitmapActionReload]		= BitNone,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite]	= BitNone,
		[BitmapActionStartsync]		= BitSyncing,
		[BitmapActionEndsync]		= BitDirty,
		[BitmapActionAbortsync]		= BitNeedSync,
		[BitmapActionReload]		= BitNeedSync,
		[BitmapActionDaemon]		= BitNone,
		[BitmapActionDiscard]		= BitUnwritten,
		[BitmapActionStale]		= BitNeedSync,
	},
};

static void __llbitmap_flush(struct mddev *mddev);

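/*
 * Each bit is stored as one byte; BITMAP_DATA_OFFSET skips the 1k super block
 * kept at the start of page 0.
 */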
static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
{
	unsigned int idx;
	unsigned int offset;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	offset = offset_in_page(pos);

	return llbitmap->pctl[idx]->state[offset];
}

/* set all the bits in the subpage as dirty */
static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
				       struct llbitmap_page_ctl *pctl,
				       unsigned int block)
{
	bool level_456 = raid_is_456(llbitmap->mddev);
	unsigned int io_size = llbitmap->io_size;
	int pos;

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		switch (pctl->state[pos]) {
		case BitUnwritten:
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
			pctl->state[pos] = BitDirty;
			break;
		}
	}
}

static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
	int block = offset / io_size;
	int pos;

	if (!test_bit(LLPageDirty, &pctl->flags))
		set_bit(LLPageDirty, &pctl->flags);

	/*
	 * For a degraded array, dirty bits will never be cleared, and we must
	 * resync all the dirty bits; hence skip infecting new dirty bits to
	 * avoid resyncing unnecessary data.
	 */
	if (llbitmap->mddev->degraded) {
		set_bit(block, pctl->dirty);
		return;
	}

	/*
	 * The subpage usually contains a total of 512 bits. If any single bit
	 * within the subpage is marked as dirty, the entire sector will be
	 * written. To avoid impacting write performance, when multiple bits
	 * within the same sector are modified within llbitmap->barrier_idle,
	 * all bits in the sector will be collectively marked as dirty at once.
	 */
	if (test_and_set_bit(block, pctl->dirty)) {
		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
		return;
	}

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		if (pos == offset)
			continue;
		if (pctl->state[pos] == BitDirty ||
		    pctl->state[pos] == BitNeedSync) {
			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
			return;
		}
	}
}

static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
			   loff_t pos)
{
	unsigned int idx;
	unsigned int bit;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	bit = offset_in_page(pos);

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit);
}

static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
{
	struct mddev *mddev = llbitmap->mddev;
	struct page *page = NULL;
	struct md_rdev *rdev;

	if (llbitmap->pctl && llbitmap->pctl[idx])
		page = llbitmap->pctl[idx]->page;
	if (page)
		return page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return ERR_PTR(-ENOMEM);

	rdev_for_each(rdev, mddev) {
		sector_t sector;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		sector = mddev->bitmap_info.offset +
			 (idx << PAGE_SECTORS_SHIFT);

		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
				 true))
			return page;

		md_error(mddev, rdev);
	}

	__free_page(page);
	return ERR_PTR(-EIO);
}

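/*
 * Write back only the io_size-sized blocks of page @idx that are marked dirty
 * in pctl->dirty, to every active rdev.
 */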
static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
{
	struct page *page = llbitmap->pctl[idx]->page;
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	int block;

	for (block = 0; block < llbitmap->blocks_per_page; block++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (!test_and_clear_bit(block, pctl->dirty))
			continue;

		rdev_for_each(rdev, mddev) {
			sector_t sector;
			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;

			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
				continue;

			sector = mddev->bitmap_info.offset + rdev->sb_start +
				 (idx << PAGE_SECTORS_SHIFT) +
				 block * bit_sector;
			md_write_metadata(mddev, rdev, sector,
					  llbitmap->io_size, page,
					  block * llbitmap->io_size);
		}
	}
}

static void active_release(struct percpu_ref *ref)
{
	struct llbitmap_page_ctl *pctl =
		container_of(ref, struct llbitmap_page_ctl, active);

	wake_up(&pctl->wait);
}

static void llbitmap_free_pages(struct llbitmap *llbitmap)
{
	int i;

	if (!llbitmap->pctl)
		return;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		if (!pctl || !pctl->page)
			break;

		__free_page(pctl->page);
		percpu_ref_exit(&pctl->active);
	}

	kfree(llbitmap->pctl[0]);
	kfree(llbitmap->pctl);
	llbitmap->pctl = NULL;
}

static int llbitmap_cache_pages(struct llbitmap *llbitmap)
{
	struct llbitmap_page_ctl *pctl;
	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
					     BITMAP_DATA_OFFSET, PAGE_SIZE);
	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
						llbitmap->blocks_per_page));
	int i;

	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
				       GFP_KERNEL | __GFP_ZERO);
	if (!llbitmap->pctl)
		return -ENOMEM;

	size = round_up(size, cache_line_size());
	pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
	if (!pctl) {
		kfree(llbitmap->pctl);
		return -ENOMEM;
	}

	llbitmap->nr_pages = nr_pages;

	for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
		struct page *page = llbitmap_read_page(llbitmap, i);

		llbitmap->pctl[i] = pctl;

		if (IS_ERR(page)) {
			llbitmap_free_pages(llbitmap);
			return PTR_ERR(page);
		}

		if (percpu_ref_init(&pctl->active, active_release,
				    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
			__free_page(page);
			llbitmap_free_pages(llbitmap);
			return -ENOMEM;
		}

		pctl->page = page;
		pctl->state = page_address(page);
		init_waitqueue_head(&pctl->wait);
	}

	return 0;
}

static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
		state = BitClean;

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
}

/* The return value is only used from resync, where @start == @end. */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		if (c == BitNeedSync)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For a degraded array, resync dirty data as well;
			 * note that if the array is still degraded after the
			 * resync is done, all new data will still be dirty
			 * until the array is clean.
			 */
			else if (c == BitDirty &&
				action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to first write. */
			state = BitNeedSync;
		}

		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}

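/*
 * Barrier pairing sketch (illustrative; see llbitmap_start_write() and
 * md_llbitmap_daemon_fn() below):
 *
 *   IO path:                               daemon, per expired page:
 *     llbitmap_raise_barrier(lb, idx);       llbitmap_suspend_timeout(lb, idx);
 *     ... issue write IO ...                 ... clear dirty bits ...
 *     llbitmap_release_barrier(lb, idx);     llbitmap_resume(lb, idx);
 */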
static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

retry:
	if (likely(percpu_ref_tryget_live(&pctl->active))) {
		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
		return;
	}

	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
	goto retry;
}

static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_put(&pctl->active);
}

static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
			llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
		return -ETIMEDOUT;

	return 0;
}

static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	pctl->expire = LONG_MAX;
	percpu_ref_resurrect(&pctl->active);
	wake_up(&pctl->wait);
}

static int llbitmap_check_support(struct mddev *mddev)
{
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
			  mdname(mddev));
		return -EBUSY;
	}

	if (mddev->bitmap_info.space == 0) {
		if (mddev->bitmap_info.default_space == 0) {
			pr_notice("md/llbitmap: %s: no space for bitmap\n",
				  mdname(mddev));
			return -ENOSPC;
		}
	}

	if (!mddev->persistent) {
		pr_notice("md/llbitmap: %s: array must be persistent\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.file) {
		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.external) {
		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev_is_dm(mddev)) {
		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	return 0;
}

static int llbitmap_init(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	sector_t blocks = mddev->resync_max_sectors;
	unsigned long chunksize = MIN_CHUNK_SIZE;
	unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
	int ret;

	while (chunks > space) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;
	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;

	ret = llbitmap_cache_pages(llbitmap);
	if (ret)
		return ret;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionInit);
	/* flush initial llbitmap to disk */
	__llbitmap_flush(mddev);

	return 0;
}

static int llbitmap_read_sb(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	unsigned long daemon_sleep;
	unsigned long chunksize;
	unsigned long events;
	struct page *sb_page;
	bitmap_super_t *sb;
	int ret = -EINVAL;

	if (!mddev->bitmap_info.offset) {
		pr_err("md/llbitmap: %s: no super block found", mdname(mddev));
		return -EINVAL;
	}

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("md/llbitmap: %s: read super block failed",
		       mdname(mddev));
		return -EIO;
	}

	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_err("md/llbitmap: %s: invalid super block magic number",
		       mdname(mddev));
		goto out_put_page;
	}

	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
		pr_err("md/llbitmap: %s: invalid super block version",
		       mdname(mddev));
		goto out_put_page;
	}

	if (memcmp(sb->uuid, mddev->uuid, 16)) {
		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (mddev->bitmap_info.space == 0) {
		int room = le32_to_cpu(sb->sectors_reserved);

		if (room)
			mddev->bitmap_info.space = room;
		else
			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
	}
	llbitmap->flags = le32_to_cpu(sb->state);
	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
		ret = llbitmap_init(llbitmap);
		goto out_put_page;
	}

	chunksize = le32_to_cpu(sb->chunksize);
	if (!is_power_of_2(chunksize)) {
		pr_err("md/llbitmap: %s: chunksize not a power of 2",
		       mdname(mddev));
		goto out_put_page;
	}

	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu",
		       mdname(mddev), chunksize, mddev->resync_max_sectors,
		       mddev->bitmap_info.space);
		goto out_put_page;
	}

	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
		pr_err("md/llbitmap: %s: daemon sleep %lu period out of range",
		       mdname(mddev), daemon_sleep);
		goto out_put_page;
	}

	events = le64_to_cpu(sb->events);
	if (events < mddev->events) {
		pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery",
			mdname(mddev), events, mddev->events);
		set_bit(BITMAP_STALE, &llbitmap->flags);
	}

	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	mddev->bitmap_info.chunksize = chunksize;
	mddev->bitmap_info.daemon_sleep = daemon_sleep;

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
	llbitmap->chunkshift = ffz(~chunksize);
	ret = llbitmap_cache_pages(llbitmap);

out_put_page:
	kunmap_local(sb);
	__free_page(sb_page);
	return ret;
}

static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
{
	struct llbitmap *llbitmap =
		container_of(pending_timer, struct llbitmap, pending_timer);

	if (work_busy(&llbitmap->daemon_work)) {
		pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
			mdname(llbitmap->mddev),
			llbitmap->mddev->bitmap_info.daemon_sleep);
		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
		return;
	}

	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
}

static void md_llbitmap_daemon_fn(struct work_struct *work)
{
	struct llbitmap *llbitmap =
		container_of(work, struct llbitmap, daemon_work);
	unsigned long start;
	unsigned long end;
	bool restart;
	int idx;

	if (llbitmap->mddev->degraded)
		return;
retry:
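	/*
	 * Page 0 covers fewer bits because it also holds the 1k super block
	 * (BITMAP_DATA_OFFSET); every later page covers PAGE_SIZE bits.
	 */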
	start = 0;
	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
	restart = false;

	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (idx > 0) {
			start = end + 1;
			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
		}

		if (!test_bit(LLPageFlush, &pctl->flags) &&
		    time_before(jiffies, pctl->expire)) {
			restart = true;
			continue;
		}

		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
			pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
				mdname(llbitmap->mddev), __func__, idx);
			continue;
		}

		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
		llbitmap_resume(llbitmap, idx);
	}

	/*
	 * If the daemon took a long time to finish, retry so that clearing
	 * dirty bits is not missed.
	 */
	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
		goto retry;

	/* If some page is dirty but not expired, set up the timer again */
	if (restart)
		mod_timer(&llbitmap->pending_timer,
			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
}

static int llbitmap_create(struct mddev *mddev)
{
	struct llbitmap *llbitmap;
	int ret;

	ret = llbitmap_check_support(mddev);
	if (ret)
		return ret;

	llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
	if (!llbitmap)
		return -ENOMEM;

	llbitmap->mddev = mddev;
	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;

	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
	atomic_set(&llbitmap->behind_writes, 0);
	init_waitqueue_head(&llbitmap->behind_wait);

	mutex_lock(&mddev->bitmap_info.mutex);
	mddev->bitmap = llbitmap;
	ret = llbitmap_read_sb(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
	if (ret) {
		kfree(llbitmap);
		mddev->bitmap = NULL;
	}

	return ret;
}

static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long chunks;

	if (chunksize == 0)
		chunksize = llbitmap->chunksize;

	/* If there is enough space, leave the chunksize unchanged. */
	chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;

	return 0;
}

static int llbitmap_load(struct mddev *mddev)
{
	enum llbitmap_action action = BitmapActionReload;
	struct llbitmap *llbitmap = mddev->bitmap;

	if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
		action = BitmapActionStale;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
	return 0;
}

static void llbitmap_destroy(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	mutex_lock(&mddev->bitmap_info.mutex);

	timer_delete_sync(&llbitmap->pending_timer);
	flush_workqueue(md_llbitmap_io_wq);
	flush_workqueue(md_llbitmap_unplug_wq);

	mddev->bitmap = NULL;
	llbitmap_free_pages(llbitmap);
	kfree(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
			       unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
				   unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_unplug_fn(struct work_struct *work)
{
	struct llbitmap_unplug_work *unplug_work =
		container_of(work, struct llbitmap_unplug_work, work);
	struct llbitmap *llbitmap = unplug_work->llbitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);

	for (i = 0; i < llbitmap->nr_pages; i++) {
		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			continue;

		llbitmap_write_page(llbitmap, i);
	}

	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
	complete(unplug_work->done);
}

static bool llbitmap_dirty(struct llbitmap *llbitmap)
{
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			return true;

	return false;
}

static void llbitmap_unplug(struct mddev *mddev, bool sync)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct llbitmap *llbitmap = mddev->bitmap;
	struct llbitmap_unplug_work unplug_work = {
		.llbitmap = llbitmap,
		.done = &done,
	};

	if (!llbitmap_dirty(llbitmap))
		return;

	/*
	 * Issuing new bitmap IO under the submit_bio() context will deadlock:
	 *  - the bio will wait for the bitmap bio to be done, before it can be
	 *  issued;
	 *  - the bitmap bio will be added to current->bio_list and wait for
	 *  this bio to be issued;
	 */
	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}

/*
 * Force all bitmap pages to be written to disk; called when stopping the
 * array, or every daemon_sleep seconds while the sync_thread is running.
 */
static void __llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* mark all blocks as dirty */
		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
		llbitmap_write_page(llbitmap, i);
	}
	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);

	timer_delete_sync(&llbitmap->pending_timer);
	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
	flush_work(&llbitmap->daemon_work);

	__llbitmap_flush(mddev);
}

/* This is used for raid5 lazy initial recovery */
static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	return c == BitClean || c == BitDirty;
}

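/*
 * Return how many sectors can be skipped starting at @offset; with a
 * 128-sector chunk, for example, offset 1000 leaves 128 - (1000 & 127) = 24
 * sectors to the end of its chunk (illustrative numbers).
 */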
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	/* always skip unwritten blocks */
	if (c == BitUnwritten)
		return blocks;

	/* For degraded array, don't skip */
	if (mddev->degraded)
		return 0;

	/* For resync also skip clean/dirty blocks */
	if ((c == BitClean || c == BitDirty) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		return blocks;

	return 0;
}

static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
				sector_t *blocks, bool degraded)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	/*
	 * Handle one bit at a time; this is much simpler, and it doesn't
	 * matter if md_do_sync() loops a few more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	return llbitmap_state_machine(llbitmap, p, p,
				      BitmapActionStartsync) == BitSyncing;
}

/* Something went wrong; sync_thread stopped at @offset */
static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
			       BitmapActionAbortsync);
}

/* A full sync_thread is finished */
static void llbitmap_close_sync(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* let daemon_fn clear dirty bits immediately */
		WRITE_ONCE(pctl->expire, jiffies);
	}

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionEndsync);
}

/*
 * sync_thread has reached @sector; update metadata every daemon_sleep seconds,
 * in case sync_thread has to restart after a power failure.
 */
static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
				   bool force)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (sector == 0) {
		llbitmap->last_end_sync = jiffies;
		return;
	}

	if (time_before(jiffies, llbitmap->last_end_sync +
				 HZ * mddev->bitmap_info.daemon_sleep))
		return;

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	mddev->curr_resync_completed = sector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
			       BitmapActionEndsync);
	__llbitmap_flush(mddev);

	llbitmap->last_end_sync = jiffies;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
}

static bool llbitmap_enabled(void *data, bool flush)
{
	struct llbitmap *llbitmap = data;

	return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
}

static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
				unsigned long e)
{
	llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
}

static void llbitmap_write_sb(struct llbitmap *llbitmap)
{
	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);

	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
	llbitmap_write_page(llbitmap, 0);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_update_sb(void *data)
{
	struct llbitmap *llbitmap = data;
	struct mddev *mddev = llbitmap->mddev;
	struct page *sb_page;
	bitmap_super_t *sb;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return;

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("%s: %s: read super block failed", __func__,
		       mdname(mddev));
		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
		return;
	}

	if (mddev->events < llbitmap->events_cleared)
		llbitmap->events_cleared = mddev->events;

	sb = kmap_local_page(sb_page);
	sb->events = cpu_to_le64(mddev->events);
	sb->state = cpu_to_le32(llbitmap->flags);
	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);

	kunmap_local(sb);
	llbitmap_write_sb(llbitmap);
}

static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
	struct llbitmap *llbitmap = data;

	memset(stats, 0, sizeof(*stats));

	stats->missing_pages = 0;
	stats->pages = llbitmap->nr_pages;
	stats->file_pages = llbitmap->nr_pages;

	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
	stats->events_cleared = llbitmap->events_cleared;

	return 0;
}

/* just flag all pages as needing to be written */
static void llbitmap_write_all(struct mddev *mddev)
{
	int i;
	struct llbitmap *llbitmap = mddev->bitmap;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
	}
}

static void llbitmap_start_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	atomic_inc(&llbitmap->behind_writes);
}

static void llbitmap_end_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (atomic_dec_and_test(&llbitmap->behind_writes))
		wake_up(&llbitmap->behind_wait);
}

static void llbitmap_wait_behind_writes(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	wait_event(llbitmap->behind_wait,
		   atomic_read(&llbitmap->behind_writes) == 0);
}

static ssize_t bits_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	int bits[BitStateCount] = {0};
	loff_t start = 0;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "bitmap io error\n");
	}

	while (start < llbitmap->chunks) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount)
			pr_err("%s: invalid bit %llu state %d\n",
			       __func__, start, c);
		else
			bits[c]++;
		start++;
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);

static ssize_t metadata_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	ssize_t ret;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
		       llbitmap->chunksize, llbitmap->chunkshift,
		       llbitmap->chunks, mddev->bitmap_info.offset,
		       llbitmap->mddev->bitmap_info.daemon_sleep);
	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}

static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);

static ssize_t
daemon_sleep_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
}

static ssize_t
daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	mddev->bitmap_info.daemon_sleep = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);

static ssize_t
barrier_idle_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
}

static ssize_t
barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	llbitmap->barrier_idle = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);

static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	NULL
};

static struct attribute_group md_llbitmap_group = {
	.name = "llbitmap",
	.attrs = md_llbitmap_attrs,
};
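
/*
 * These attributes appear in the array's md sysfs directory, e.g. the
 * (illustrative) path /sys/block/md0/md/llbitmap/bits.
 */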

static struct bitmap_operations llbitmap_ops = {
	.head = {
		.type	= MD_BITMAP,
		.id	= ID_LLBITMAP,
		.name	= "llbitmap",
	},

	.enabled		= llbitmap_enabled,
	.create			= llbitmap_create,
	.resize			= llbitmap_resize,
	.load			= llbitmap_load,
	.destroy		= llbitmap_destroy,

	.start_write		= llbitmap_start_write,
	.end_write		= llbitmap_end_write,
	.start_discard		= llbitmap_start_discard,
	.end_discard		= llbitmap_end_discard,
	.unplug			= llbitmap_unplug,
	.flush			= llbitmap_flush,

	.start_behind_write	= llbitmap_start_behind_write,
	.end_behind_write	= llbitmap_end_behind_write,
	.wait_behind_writes	= llbitmap_wait_behind_writes,

	.blocks_synced		= llbitmap_blocks_synced,
	.skip_sync_blocks	= llbitmap_skip_sync_blocks,
	.start_sync		= llbitmap_start_sync,
	.end_sync		= llbitmap_end_sync,
	.close_sync		= llbitmap_close_sync,
	.cond_end_sync		= llbitmap_cond_end_sync,

	.update_sb		= llbitmap_update_sb,
	.get_stats		= llbitmap_get_stats,
	.dirty_bits		= llbitmap_dirty_bits,
	.write_all		= llbitmap_write_all,

	.group			= &md_llbitmap_group,
};

int md_llbitmap_init(void)
{
	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
					 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_io_wq)
		return -ENOMEM;

	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
					 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_unplug_wq) {
		destroy_workqueue(md_llbitmap_io_wq);
		md_llbitmap_io_wq = NULL;
		return -ENOMEM;
	}

	return register_md_submodule(&llbitmap_ops.head);
}

void md_llbitmap_exit(void)
{
	destroy_workqueue(md_llbitmap_io_wq);
	md_llbitmap_io_wq = NULL;
	destroy_workqueue(md_llbitmap_unplug_wq);
	md_llbitmap_unplug_wq = NULL;
	unregister_md_submodule(&llbitmap_ops.head);
}