@@ -174,6 +174,8 @@ typedef struct xfs_sb {
xfs_lsn_t sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
+ xfs_ino_t sb_metadirino; /* metadata directory tree root */
+
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -259,6 +261,8 @@ struct xfs_dsb {
__be64 sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
+ __be64 sb_metadirino; /* metadata directory tree root */
+
/* must be padded to 64 bit alignment */
};
@@ -374,6 +378,7 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -790,6 +795,27 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
}
+enum xfs_metafile_type { /* values stored in di_metatype */
+ XFS_METAFILE_UNKNOWN, /* unknown; first enumerator, so zero */
+ XFS_METAFILE_DIR, /* directory in the metadata directory tree */
+ XFS_METAFILE_USRQUOTA, /* user quota file */
+ XFS_METAFILE_GRPQUOTA, /* group quota file */
+ XFS_METAFILE_PRJQUOTA, /* project quota file */
+ XFS_METAFILE_RTBITMAP, /* realtime bitmap file */
+ XFS_METAFILE_RTSUMMARY, /* realtime summary file */
+
+ XFS_METAFILE_MAX /* count of the types above, not a type itself */
+} __packed; /* keep XFS_METAFILE_TYPE_STR below in sync */
+
+#define XFS_METAFILE_TYPE_STR \
+ { XFS_METAFILE_UNKNOWN, "unknown" }, \
+ { XFS_METAFILE_DIR, "dir" }, \
+ { XFS_METAFILE_USRQUOTA, "usrquota" }, \
+ { XFS_METAFILE_GRPQUOTA, "grpquota" }, \
+ { XFS_METAFILE_PRJQUOTA, "prjquota" }, \
+ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \
+ { XFS_METAFILE_RTSUMMARY, "rtsummary" }
+
/*
* On-disk inode structure.
*
@@ -812,7 +838,7 @@ struct xfs_dinode {
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
__u8 di_format; /* format of di_c data */
- __be16 di_onlink; /* old number of links to file */
+ __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */
__be32 di_uid; /* owner's user id */
__be32 di_gid; /* owner's group id */
__be32 di_nlink; /* number of links to file */
@@ -1088,21 +1114,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* Values for di_flags2 These start by being exposed to userspace in the upper
* 16 bits of the XFS_XFLAG_s range.
*/
-#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
-#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
-#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
-#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
-#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
-
-#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
-#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
-#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
-#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+/* use DAX for this inode */
+#define XFS_DIFLAG2_DAX_BIT 0
+
+/* file's blocks may be shared */
+#define XFS_DIFLAG2_REFLINK_BIT 1
+
+/* copy on write extent size hint */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2
+
+/* big timestamps */
+#define XFS_DIFLAG2_BIGTIME_BIT 3
+
+/* large extent counters */
+#define XFS_DIFLAG2_NREXT64_BIT 4
+
+/*
+ * The inode contains filesystem metadata and can be found through the metadata
+ * directory tree. Metadata inodes must satisfy the following constraints:
+ *
+ * - V5 filesystem (and ftype) are enabled;
+ * - The only valid modes are regular files and directories;
+ * - The access bits must be zero;
+ * - DMAPI event and state masks are zero;
+ * - The user and group IDs must be zero;
+ * - The project ID can be used as a u32 annotation;
+ * - The immutable, sync, noatime, nodump, nodefrag flags must be set;
+ * - The dax flag must not be set;
+ * - Directories must have nosymlinks set.
+ *
+ * These requirements are chosen defensively to minimize the ability of
+ * userspace to read or modify the contents, should a metadata file ever
+ * escape to userspace.
+ *
+ * There are further constraints on the directory tree itself:
+ *
+ * - Metadata inodes must never be resolvable through the root directory;
+ * - They must never be accessed by userspace;
+ * - Metadata directory entries must have correct ftype.
+ *
+ * Superblock-rooted metadata files must have the METADATA iflag set even
+ * though they do not have a parent directory.
+ */
+#define XFS_DIFLAG2_METADATA_BIT 5
+
+#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1117,6 +1182,12 @@ static inline bool xfs_dinode_has_large_extent_counts(
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
}
+/*
+ * Is the METADATA flag set on this ondisk inode?  Inodes older than v3 are
+ * never treated as metadata inodes, whatever di_flags2 contains.
+ */
+static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip)
+{
+	if (dip->di_version < 3)
+		return false;
+
+	return (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)) != 0;
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -209,12 +209,15 @@ xfs_inode_from_disk(
* They will also be unconditionally written back to disk as v2 inodes.
*/
if (unlikely(from->di_version == 1)) {
- set_nlink(inode, be16_to_cpu(from->di_onlink));
+ /* di_metatype used to be di_onlink */
+ set_nlink(inode, be16_to_cpu(from->di_metatype));
ip->i_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
be16_to_cpu(from->di_projid_lo);
+ if (xfs_dinode_is_metadir(from))
+ ip->i_metatype = be16_to_cpu(from->di_metatype);
}
i_uid_write(inode, be32_to_cpu(from->di_uid));
@@ -315,7 +318,10 @@ xfs_inode_to_disk(
struct inode *inode = VFS_I(ip);
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- to->di_onlink = 0;
+ if (xfs_is_metadir_inode(ip))
+ to->di_metatype = cpu_to_be16(ip->i_metatype);
+ else
+ to->di_metatype = 0;
to->di_format = xfs_ifork_format(&ip->i_df);
to->di_uid = cpu_to_be32(i_uid_read(inode));
@@ -523,8 +529,11 @@ xfs_dinode_verify(
* di_nlink==0 on a V1 inode. V2/3 inodes would get written out with
* di_onlink==0, so we can check that.
*/
- if (dip->di_version >= 2) {
- if (dip->di_onlink)
+ if (dip->di_version == 2) {
+ if (dip->di_metatype)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (!xfs_dinode_is_metadir(dip) && dip->di_metatype)
return __this_address;
}
@@ -546,7 +555,8 @@ xfs_dinode_verify(
if (dip->di_nlink)
return __this_address;
} else {
- if (dip->di_onlink)
+ /* di_metatype used to be di_onlink */
+ if (dip->di_metatype)
return __this_address;
}
}
@@ -224,6 +224,8 @@ xfs_inode_inherit_flags2(
}
if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+ if (xfs_is_metadir_inode(pip))
+ ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
/* Don't let invalid cowextsize hints propagate. */
failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
@@ -404,7 +404,7 @@ struct xfs_log_dinode {
uint16_t di_mode; /* mode and type of file */
int8_t di_version; /* inode version */
int8_t di_format; /* format of di_c data */
- uint8_t di_pad3[2]; /* unused in v2/3 inodes */
+ uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */
uint32_t di_uid; /* owner's user id */
uint32_t di_gid; /* owner's group id */
uint32_t di_nlink; /* number of links to file */
@@ -37,7 +37,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
- XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 272);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
@@ -180,6 +180,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_EXCHANGE_RANGE;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
features |= XFS_FEAT_PARENT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+ features |= XFS_FEAT_METADIR;
return features;
}
@@ -689,6 +691,11 @@ __xfs_sb_from_disk(
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+ to->sb_metadirino = be64_to_cpu(from->sb_metadirino);
+ else
+ to->sb_metadirino = NULLFSINO;
}
void
@@ -836,6 +843,9 @@ xfs_sb_to_disk(
to->sb_lsn = cpu_to_be64(from->sb_lsn);
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+ to->sb_metadirino = cpu_to_be64(from->sb_metadirino);
}
/*
@@ -421,7 +421,7 @@ xchk_dinode(
break;
case 2:
case 3:
- if (dip->di_onlink != 0)
+ if (!xfs_dinode_is_metadir(dip) && dip->di_metatype)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_mode == 0 && sc->ip)
@@ -521,10 +521,13 @@ STATIC void
xrep_dinode_nlinks(
struct xfs_dinode *dip)
{
- if (dip->di_version > 1)
- dip->di_onlink = 0;
- else
+ if (dip->di_version < 2) {
dip->di_nlink = 0;
+ return;
+ }
+
+ if (!xfs_dinode_is_metadir(dip))
+ dip->di_metatype = 0;
}
/* Fix any conflicting flags that the verifiers complain about. */
@@ -65,6 +65,7 @@ typedef struct xfs_inode {
uint16_t i_flushiter; /* incremented on flush */
};
uint8_t i_forkoff; /* attr fork offset >> 3 */
+ enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */
uint16_t i_diflags; /* XFS_DIFLAG_... */
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
@@ -276,10 +277,23 @@ static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
}
+/* Is the METADATA flag set in this incore inode's i_diflags2? */
+static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip)
+{
+	return (ip->i_diflags2 & XFS_DIFLAG2_METADATA) != 0;
+}
+
static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
+ /* Any file in the metadata directory tree is a metadata inode. */
+ if (xfs_has_metadir(mp))
+ return xfs_is_metadir_inode(ip);
+
+ /*
+ * Before metadata directories, the only metadata inodes were the
+ * three quota files, the realtime bitmap, and the realtime summary.
+ */
return ip->i_ino == mp->m_sb.sb_rbmino ||
ip->i_ino == mp->m_sb.sb_rsumino ||
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
@@ -556,7 +556,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
@@ -590,10 +589,16 @@ xfs_inode_to_log_dinode(
/* dummy value for initialisation */
to->di_crc = 0;
+
+ if (xfs_is_metadir_inode(ip))
+ to->di_metatype = ip->i_metatype;
+ else
+ to->di_metatype = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ to->di_metatype = 0;
}
xfs_inode_to_log_dinode_iext_counters(ip, to);
@@ -175,7 +175,7 @@ xfs_log_dinode_to_disk(
to->di_mode = cpu_to_be16(from->di_mode);
to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_onlink = 0;
+ to->di_metatype = cpu_to_be16(from->di_metatype);
to->di_uid = cpu_to_be32(from->di_uid);
to->di_gid = cpu_to_be32(from->di_gid);
to->di_nlink = cpu_to_be32(from->di_nlink);
@@ -165,6 +165,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_PPTR,
.name = "parent pointer",
},
+ [XFS_EXPERIMENTAL_METADIR] = {
+ .opstate = XFS_OPSTATE_WARNED_METADIR,
+ .name = "metadata directory tree",
+ },
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
@@ -97,6 +97,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_LBS,
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
+ XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_MAX,
};
@@ -332,6 +332,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
+#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -387,6 +388,7 @@ __XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
+__XFS_HAS_FEAT(metadir, METADIR)
/*
* Some features are always on for v5 file systems, allow the compiler to
@@ -485,6 +487,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_EXCHRANGE 14
/* Kernel has logged a warning about parent pointers being used on this fs. */
#define XFS_OPSTATE_WARNED_PPTR 15
+/* Kernel has logged a warning about metadata dirs being used on this fs. */
+#define XFS_OPSTATE_WARNED_METADIR 16
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -1731,6 +1731,9 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
+ if (xfs_has_metadir(mp))
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+
if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,