diff mbox series

[07/20] bcache: add initial data structures for nvm pages

Message ID 20210210050742.31237-8-colyli@suse.de (mailing list archive)
State New, archived
Headers show
Series bcache patches for Linux v5.12 | expand

Commit Message

Coly Li Feb. 10, 2021, 5:07 a.m. UTC
This patch adds the initial prototype data structures for the nvm pages
allocator:

- struct bch_nvm_pages_sb
This is the super block allocated on each nvdimm namespace. A nvdimm
set may have multiple namespaces, bch_nvm_pages_sb->set_uuid is used
to mark which nvdimm set this name space belongs to. Normally we will
use the bcache's cache set UUID to initialize this uuid, to connect this
nvdimm set to a specified bcache cache set.

- struct bch_owner_list_head
This is a table for all heads of all owner lists. An owner list records
which page(s) are allocated to which owner. After reboot from power failure,
the owner may find all its requested and allocated pages from the owner
list by a handler which is converted from a UUID.

- struct bch_nvm_pages_owner_head
This is a head of an owner list. Each owner only has one owner list,
and an nvm page only belongs to a specific owner. uuid[] will be set to
the owner's uuid; for bcache it is the bcache's cache set uuid. label is
not mandatory, it is a human-readable string for debugging purposes. The
pointers in recs[] reference separate nvm pages which hold the tables of
struct bch_pgalloc_rec.

- struct bch_nvm_pgalloc_recs
This struct occupies a whole page, owner_uuid should match the uuid
in struct bch_nvm_pages_owner_head. recs[] is the real table that contains
all allocated records.

- struct bch_pgalloc_rec
Each structure records a range of allocated nvm pages. pgoff is the
offset, in units of page size, of this allocated nvm page range. Adjacent
page ranges of the same owner can be merged into a larger one, therefore
nr is NOT always a power of 2.

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Jianpeng Ma <jianpeng.ma@intel.com>
Cc: Qiaowei Ren <qiaowei.ren@intel.com>
---
 include/uapi/linux/bcache-nvm.h | 195 ++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 include/uapi/linux/bcache-nvm.h

Comments

Jens Axboe Feb. 10, 2021, 3:09 p.m. UTC | #1
On 2/9/21 10:07 PM, Coly Li wrote:
> +struct bch_nvm_pgalloc_recs {
> +union {
> +	struct {
> +		struct bch_nvm_pages_owner_head	*owner;
> +		struct bch_nvm_pgalloc_recs	*next;
> +		__u8				magic[16];
> +		__u8				owner_uuid[16];
> +		__u32				size;
> +		__u32				used;
> +		__u64				_pad[4];
> +		struct bch_pgalloc_rec		recs[];
> +	};
> +	__u8	pad[8192];
> +};
> +};

This doesn't look right in a user header, any user API should be 32-bit
and 64-bit agnostic.

> +struct bch_nvm_pages_owner_head {
> +	__u8			uuid[16];
> +	char			label[BCH_NVM_PAGES_LABEL_SIZE];
> +	/* Per-namespace own lists */
> +	struct bch_nvm_pgalloc_recs	*recs[BCH_NVM_PAGES_NAMESPACES_MAX];
> +};

Same here.

> +/* heads[0] is always for nvm_pages internal usage */
> +struct bch_owner_list_head {
> +union {
> +	struct {
> +		__u32				size;
> +		__u32				used;
> +		__u64				_pad[4];
> +		struct bch_nvm_pages_owner_head	heads[];
> +	};
> +	__u8	pad[8192];
> +};
> +};

And here.

> +#define BCH_MAX_OWNER_LIST				\
> +	((sizeof(struct bch_owner_list_head) -		\
> +	 offsetof(struct bch_owner_list_head, heads)) /	\
> +	 sizeof(struct bch_nvm_pages_owner_head))
> +
> +/* The on-media bit order is local CPU order */
> +struct bch_nvm_pages_sb {
> +	__u64			csum;
> +	__u64			ns_start;
> +	__u64			sb_offset;
> +	__u64			version;
> +	__u8			magic[16];
> +	__u8			uuid[16];
> +	__u32			page_size;
> +	__u32			total_namespaces_nr;
> +	__u32			this_namespace_nr;
> +	union {
> +		__u8		set_uuid[16];
> +		__u64		set_magic;
> +	};

This doesn't look like it packs right either.

> +
> +	__u64			flags;
> +	__u64			seq;
> +
> +	__u64			feature_compat;
> +	__u64			feature_incompat;
> +	__u64			feature_ro_compat;
> +
> +	/* For allocable nvm pages from buddy systems */
> +	__u64			pages_offset;
> +	__u64			pages_total;
> +
> +	__u64			pad[8];
> +
> +	/* Only on the first name space */
> +	struct bch_owner_list_head	*owner_list_head;

And here's another pointer...
Coly Li Feb. 11, 2021, 3:58 a.m. UTC | #2
On 2/10/21 11:09 PM, Jens Axboe wrote:
> On 2/9/21 10:07 PM, Coly Li wrote:
>> +struct bch_nvm_pgalloc_recs {
>> +union {
>> +	struct {
>> +		struct bch_nvm_pages_owner_head	*owner;
>> +		struct bch_nvm_pgalloc_recs	*next;
>> +		__u8				magic[16];
>> +		__u8				owner_uuid[16];
>> +		__u32				size;
>> +		__u32				used;
>> +		__u64				_pad[4];
>> +		struct bch_pgalloc_rec		recs[];
>> +	};
>> +	__u8	pad[8192];
>> +};
>> +};
> 

Hi Jens,

> This doesn't look right in a user header, any user API should be 32-bit
> and 64-bit agnostic.

The above data structure is stored in NVDIMM as the allocator's metadata.
It is designed to be directly accessed (in a future update) as an in-memory
object, but stored on non-volatile memory like an on-disk data structure.

To me, it is fine to use unsigned int/long/long long to define the
members, because the nvdimm driver only works on 64-bit platforms. It is
just unclear to me which form/style I should use to define such a data
structure. On one side they are stored on non-volatile media, on the other
side they are accessed directly as in-memory objects...


> 
>> +struct bch_nvm_pages_owner_head {
>> +	__u8			uuid[16];
>> +	char			label[BCH_NVM_PAGES_LABEL_SIZE];
>> +	/* Per-namespace own lists */
>> +	struct bch_nvm_pgalloc_recs	*recs[BCH_NVM_PAGES_NAMESPACES_MAX];
>> +};
> 
> Same here.

For the above pointer, it is the same reason. In later version, such
object on NVDIMM will be referenced directly by an in-memory pointer
like we normally do for an in-memory object.

Therefore I do treat the data structure as in-memory object after the
DAX mapping accomplished. If not define it as an in-memory pointer, I
have to cast it into (void *) every time when I use it.


> 
>> +/* heads[0] is always for nvm_pages internal usage */
>> +struct bch_owner_list_head {
>> +union {
>> +	struct {
>> +		__u32				size;
>> +		__u32				used;
>> +		__u64				_pad[4];
>> +		struct bch_nvm_pages_owner_head	heads[];
>> +	};
>> +	__u8	pad[8192];
>> +};
>> +};
> 
> And here.
> 
>> +#define BCH_MAX_OWNER_LIST				\
>> +	((sizeof(struct bch_owner_list_head) -		\
>> +	 offsetof(struct bch_owner_list_head, heads)) /	\
>> +	 sizeof(struct bch_nvm_pages_owner_head))
>> +
>> +/* The on-media bit order is local CPU order */
>> +struct bch_nvm_pages_sb {
>> +	__u64			csum;
>> +	__u64			ns_start;
>> +	__u64			sb_offset;
>> +	__u64			version;
>> +	__u8			magic[16];
>> +	__u8			uuid[16];
>> +	__u32			page_size;
>> +	__u32			total_namespaces_nr;
>> +	__u32			this_namespace_nr;
>> +	union {
>> +		__u8		set_uuid[16];
>> +		__u64		set_magic;
>> +	};
> 
> This doesn't look like it packs right either.

This is my mimicry from bcache code, which uses the least significant 8
bytes from the randomly generated UUID as a magic number. It is solid
and not changed during the whole life cycle for the nvm pages set.


> 
>> +
>> +	__u64			flags;
>> +	__u64			seq;
>> +
>> +	__u64			feature_compat;
>> +	__u64			feature_incompat;
>> +	__u64			feature_ro_compat;
>> +
>> +	/* For allocable nvm pages from buddy systems */
>> +	__u64			pages_offset;
>> +	__u64			pages_total;
>> +
>> +	__u64			pad[8];
>> +
>> +	/* Only on the first name space */
>> +	struct bch_owner_list_head	*owner_list_head;
> 
> And here's another pointer...
> 

Same reason for I use it as an in-memory pointer.

The above definitions just use all the structures as in-memory
objects; the only difference is that they are non-volatile across reboots.

Thanks.

Coly Li
diff mbox series

Patch

diff --git a/include/uapi/linux/bcache-nvm.h b/include/uapi/linux/bcache-nvm.h
new file mode 100644
index 000000000000..61108bf2a63e
--- /dev/null
+++ b/include/uapi/linux/bcache-nvm.h
@@ -0,0 +1,195 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_BCACHE_NVM_H
+#define _UAPI_BCACHE_NVM_H
+
+/*
+ * Bcache on NVDIMM data structures
+ */
+
+/*
+ * - struct bch_nvm_pages_sb
+ *   This is the super block allocated on each nvdimm namespace. A nvdimm
+ * set may have multiple namespaces, bch_nvm_pages_sb->set_uuid is used to mark
+ * which nvdimm set this name space belongs to. Normally we will use the
+ * bcache's cache set UUID to initialize this uuid, to connect this nvdimm
+ * set to a specified bcache cache set.
+ *
+ * - struct bch_owner_list_head
+ *   This is a table for all heads of all owner lists. An owner list records
+ * which page(s) are allocated to which owner. After reboot from power failure,
+ * the owner may find all its requested and allocated pages from the owner
+ * list by a handler which is converted from a UUID.
+ *
+ * - struct bch_nvm_pages_owner_head
+ *   This is a head of an owner list. Each owner only has one owner list,
+ * and an nvm page only belongs to a specific owner. uuid[] will be set to
+ * the owner's uuid; for bcache it is the bcache's cache set uuid. label is
+ * not mandatory, it is a human-readable string for debugging purposes. The
+ * pointers in recs[] reference separate nvm pages which hold the tables of
+ * struct bch_pgalloc_rec.
+ *
+ * - struct bch_nvm_pgalloc_recs
+ *   This structure occupies a whole page, owner_uuid should match the uuid
+ * in struct bch_nvm_pages_owner_head. recs[] is the real table that contains
+ * all allocated records.
+ *
+ * - struct bch_pgalloc_rec
+ *   Each structure records a range of allocated nvm pages. pgoff is the
+ * offset, in units of page size, of this allocated nvm page range. Adjacent
+ * page ranges of the same owner can be merged into a larger one, therefore
+ * nr is NOT always a power of 2.
+ *
+ *
+ * Memory layout on nvdimm namespace 0
+ *
+ *    0 +---------------------------------+
+ *      |                                 |
+ *  4KB +---------------------------------+
+ *      |         bch_nvm_pages_sb        |
+ *  8KB +---------------------------------+ <--- bch_nvm_pages_sb.owner_list_head
+ *      |       bch_owner_list_head       |
+ *      |                                 |
+ * 16KB +---------------------------------+ <--- bch_owner_list_head.heads[0].recs[0]
+ *      |       bch_nvm_pgalloc_recs      |
+ *      |  (nvm pages internal usage)     |
+ * 24KB +---------------------------------+
+ *      |                                 |
+ *      |                                 |
+ * 16MB +---------------------------------+
+ *      |      allocable nvm pages        |
+ *      |      for buddy allocator        |
+ * end  +---------------------------------+
+ *
+ *
+ *
+ * Memory layout on nvdimm namespace N
+ * (doesn't have owner list)
+ *
+ *    0 +---------------------------------+
+ *      |                                 |
+ *  4KB +---------------------------------+
+ *      |         bch_nvm_pages_sb        |
+ *  8KB +---------------------------------+
+ *      |                                 |
+ *      |                                 |
+ *      |                                 |
+ *      |                                 |
+ *      |                                 |
+ *      |                                 |
+ * 16MB +---------------------------------+
+ *      |      allocable nvm pages        |
+ *      |      for buddy allocator        |
+ * end  +---------------------------------+
+ *
+ */
+
+#include <linux/types.h>
+
+/* In bytes */
+#define BCH_NVM_PAGES_SB_OFFSET			4096	/* 4KB */
+#define BCH_NVM_PAGES_OFFSET			(16 << 20)	/* 16MB */
+
+#define BCH_NVM_PAGES_LABEL_SIZE		32
+#define BCH_NVM_PAGES_NAMESPACES_MAX		8
+
+#define BCH_NVM_PAGES_OWNER_LIST_HEAD_OFFSET	(8<<10)	/* 8KB */
+#define BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET	(16<<10)	/* 16KB */
+
+#define BCH_NVM_PAGES_SB_VERSION		0
+#define BCH_NVM_PAGES_SB_VERSION_MAX		0
+
+static const char bch_nvm_pages_magic[] = {	/* 16 bytes; super block magic (assumed) */
+	0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83,
+	0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; /* NOTE(review): >0x7f in plain char */
+static const char bch_nvm_pages_pgalloc_magic[] = {	/* 16 bytes; pgalloc recs magic (assumed) */
+	0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9,
+	0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; /* NOTE(review): >0x7f in plain char */
+
+struct bch_pgalloc_rec {	/* one range of allocated nvm pages */
+	__u32			pgoff;	/* range offset, in units of page size */
+	__u32			nr;	/* presumably pages in the range -- confirm */
+};
+
+struct bch_nvm_pgalloc_recs {	/* allocation record table, padded to 8192 bytes */
+union {
+	struct {
+		struct bch_nvm_pages_owner_head	*owner;
+		struct bch_nvm_pgalloc_recs	*next;
+		__u8				magic[16];
+		__u8				owner_uuid[16];	/* should match owner head uuid[] */
+		__u32				size;
+		__u32				used;
+		__u64				_pad[4];
+		struct bch_pgalloc_rec		recs[];	/* the allocation records */
+	};
+	__u8	pad[8192];	/* makes the union 8192 bytes */
+};
+};
+#define BCH_MAX_RECS					\
+	((sizeof(struct bch_nvm_pgalloc_recs) -		\
+	 offsetof(struct bch_nvm_pgalloc_recs, recs)) /	\
+	 sizeof(struct bch_pgalloc_rec))	/* recs[] entries that fit in the structure */
+
+struct bch_nvm_pages_owner_head {	/* head of one owner's list */
+	__u8			uuid[16];	/* owner's uuid */
+	char			label[BCH_NVM_PAGES_LABEL_SIZE];	/* optional, for debug */
+	/* Per-namespace own lists */
+	struct bch_nvm_pgalloc_recs	*recs[BCH_NVM_PAGES_NAMESPACES_MAX];
+};
+
+/* heads[0] is always for nvm_pages internal usage */
+struct bch_owner_list_head {	/* table of owner list heads, padded to 8192 bytes */
+union {
+	struct {
+		__u32				size;
+		__u32				used;
+		__u64				_pad[4];
+		struct bch_nvm_pages_owner_head	heads[];	/* one entry per owner */
+	};
+	__u8	pad[8192];	/* makes the union 8192 bytes */
+};
+};
+#define BCH_MAX_OWNER_LIST				\
+	((sizeof(struct bch_owner_list_head) -		\
+	 offsetof(struct bch_owner_list_head, heads)) /	\
+	 sizeof(struct bch_nvm_pages_owner_head))	/* heads[] entries that fit in the structure */
+
+/* The on-media bit order is local CPU order */
+struct bch_nvm_pages_sb {	/* super block kept on each nvdimm namespace */
+	__u64			csum;
+	__u64			ns_start;
+	__u64			sb_offset;
+	__u64			version;
+	__u8			magic[16];
+	__u8			uuid[16];
+	__u32			page_size;
+	__u32			total_namespaces_nr;
+	__u32			this_namespace_nr;
+	union {	/* NOTE(review): 64-bit ABIs insert a 4-byte hole before this union */
+		__u8		set_uuid[16];	/* identifies the owning nvdimm set */
+		__u64		set_magic;	/* overlays the first 8 bytes of set_uuid */
+	};
+
+	__u64			flags;
+	__u64			seq;
+
+	__u64			feature_compat;
+	__u64			feature_incompat;
+	__u64			feature_ro_compat;
+
+	/* For allocable nvm pages from buddy systems */
+	__u64			pages_offset;
+	__u64			pages_total;
+
+	__u64			pad[8];
+
+	/* Only on the first name space */
+	struct bch_owner_list_head	*owner_list_head;	/* NOTE(review): pointer size is ABI-dependent */
+
+	/* Just for csum_set() */
+	__u32			keys;
+	__u64			d[0];	/* zero-length trailing array */
+};
+
+#endif /* _UAPI_BCACHE_NVM_H */