/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */
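
/*
 * A hedged sketch (not taken from the VDO sources) of how a caller might route a block map
 * operation to the correct logical zone, assuming struct vdo exposes its block_map and that
 * queueing a completion on a zone's thread_id happens elsewhere:
 *
 *	zone_count_t zone_number = vdo_compute_logical_zone(data_vio);
 *	struct block_map_zone *zone = &vdo->block_map->zones[zone_number];
 *
 * The lookup is then launched as a completion on zone->thread_id, which keeps all updates to that
 * zone's radix trees single-threaded as described above.
 */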

enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
 * Generation counter for page references.
 */
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
	/* the VDO which owns this cache */
	struct vdo *vdo;
	/* number of pages in cache */
	page_count_t page_count;
	/* number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* array of page information entries */
	struct page_info *infos;
	/* raw memory for pages */
	char *pages;
	/* the most recently found page info */
	struct page_info *last_found;
	/* map of page number to info */
	struct int_map *page_map;
	/* main LRU list (all infos) */
	struct list_head lru_list;
	/* free page list (oldest first) */
	struct list_head free_list;
	/* outgoing page list */
	struct list_head outgoing_list;
	/* number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* number of discards in progress */
	unsigned int discard_count;
	/* the number of page completions (VPCs) waiting for a free page */
	unsigned int waiter_count;
	/* queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* counter for pressure reports */
	u32 pressure_report;
	/* the block map zone to which this cache belongs */
	struct block_map_zone *zone;
};

/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state */
	PAGE_STATE_COUNT,
} __packed;
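
/*
 * Illustrative only: the sort of name table the @note above refers to. The real data lives in
 * get_page_state_name() in block-map.c and may differ; the point is that the table must stay in
 * sync with the enumeration, which a BUILD_BUG_ON inside that function can enforce:
 *
 *	static const char * const state_names[] = {
 *		"FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING",
 *	};
 *
 *	BUILD_BUG_ON(ARRAY_SIZE(state_names) != PAGE_STATE_COUNT);
 */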

/*
 * The write status of a page
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
	/* Preallocated page struct vio */
	struct vio *vio;
	/* back-link for references */
	struct vdo_page_cache *cache;
	/* the pbn of the page */
	physical_block_number_t pbn;
	/* page is busy (temporarily locked) */
	u16 busy;
	/* the write status of the page */
	enum vdo_page_write_status write_status;
	/* page state */
	enum vdo_page_buffer_state state;
	/* queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* state linked list entry */
	struct list_head state_entry;
	/* LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};

/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};

struct forest;

struct tree_page {
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
	 * this page
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};

typedef struct list_head dirty_era_t[2];

struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages */
	dirty_era_t eras[];
};
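
/*
 * A hedged illustration of the intended bookkeeping (the exact logic lives in block-map.c): at
 * most maximum_age periods are live at once, with oldest_period <= period < next_period. For
 * example, with maximum_age = 5 and oldest_period = 7, pages dirtied in periods 7 through 11 sit
 * on the eras[] lists; advancing to period 12 first moves period 7's pages to the expired lists
 * and bumps oldest_period to 8.
 */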

struct block_map_zone {
	zone_count_t zone_number;
	thread_id_t thread_id;
	struct admin_state state;
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	struct vdo_page_cache page_cache;
	data_vio_count_t active_lookups;
	struct int_map *loading_pages;
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	u8 oldest_generation;
	/* The counts of dirty pages in each generation */
	u32 dirty_page_counts[256];
};

struct block_map {
	struct vdo *vdo;
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in the block map */
	block_count_t entry_count;
	nonce_t nonce;
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	zone_count_t zone_count;
	struct block_map_zone zones[];
};

/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);
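
/*
 * A hedged example (not part of the VDO sources) of a callback matching this signature; it might
 * be handed to vdo_traverse_forest(), declared below, to visit every allocated tree node. The
 * counter reached through the completion's parent pointer is hypothetical.
 *
 *	static int count_tree_node(physical_block_number_t pbn __always_unused,
 *				   struct vdo_completion *completion)
 *	{
 *		u64 *node_count = completion->parent;
 *
 *		(*node_count)++;
 *		return VDO_SUCCESS;
 *	}
 */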

static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
	return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);
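
/*
 * A hedged sketch of the usual acquire/use/release pattern on a logical zone thread. The callback
 * names and the page_completion field on the caller are hypothetical, not taken from this header:
 *
 *	vdo_get_page(&caller->page_completion, zone, pbn, true, caller,
 *		     handle_page_ready, handle_page_error, false);
 *
 *	static void handle_page_ready(struct vdo_completion *completion)
 *	{
 *		struct block_map_page *page;
 *		int result = vdo_get_cached_page(completion, &page);
 *
 *		if (result == VDO_SUCCESS) {
 *			... read or update entries in *page ...
 *		}
 *
 *		vdo_release_page_completion(completion);
 *	}
 */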

void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);

/**
 * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
 * @age: The configured maximum age.
 *
 * Return: The converted age.
 *
 * In the old recovery journal format, each journal block held 311 entries, and every write bio
 * made two entries. The old maximum age was half the usable journal length. In the new format,
 * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 * age so that the number of writes in a block map era is the same in the old and new formats. This
 * keeps the bound on the amount of work required to recover the block map from the recovery
 * journal the same across the format change. It also keeps the amortization of block map page
 * writes to write bios the same.
 */
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}
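
/*
 * Worked example using the entry counts quoted above (311 two-entry-per-bio blocks before, 217
 * one-entry-per-bio blocks after): a configured age of 10 old-format blocks allowed
 * 10 * 311 / 2 = 1555 write bios per era, and DIV_ROUND_UP(10 * 311, 2 * 217) = 8 new-format
 * blocks cover a comparable 8 * 217 = 1736 entries, so the converted age is 8.
 */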

#endif /* VDO_BLOCK_MAP_H */