diff mbox

[RFC,v2,05/83] Add NOVA filesystem definitions and useful helper routines.

Message ID 1520705944-6723-6-git-send-email-jix024@eng.ucsd.edu (mailing list archive)
State Changes Requested
Headers show

Commit Message

Andiry Xu March 10, 2018, 6:17 p.m. UTC
From: Andiry Xu <jix024@cs.ucsd.edu>

NOVA stores offset rather than absolute addresses in pmem.
nova_get_block() and nova_get_addr_off() provide transitions
between these two kinds of addresses.

Signed-off-by: Andiry Xu <jix024@cs.ucsd.edu>
---
 fs/nova/nova.h | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 299 insertions(+)
 create mode 100644 fs/nova/nova.h

Comments

Nikolay Borisov March 11, 2018, noon UTC | #1
[Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
maintainer]

On 10.03.2018 20:17, Andiry Xu wrote:
<snip>

> +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> +{
> +	u8 *ptr = (u8 *) data;
> +	u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> +	u32 csum;
> +
> +	/* x86 instruction crc32 is part of SSE-4.2 */
> +	if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> +		/* This inline assembly implementation should be equivalent
> +		 * to the kernel's crc32c_intel_le_hw() function used by
> +		 * crc32c(), but this performs better on test machines.
> +		 */
> +		while (len > 8) {
> +			asm volatile(/* 64b quad words */
> +				"crc32q (%1), %0"
> +				: "=r" (acc)
> +				: "r"  (ptr), "0" (acc)
> +			);
> +			ptr += 8;
> +			len -= 8;
> +		}
> +
> +		while (len > 0) {
> +			asm volatile(/* trailing bytes */
> +				"crc32b (%1), %0"
> +				: "=r" (acc)
> +				: "r"  (ptr), "0" (acc)
> +			);
> +			ptr++;
> +			len--;
> +		}
> +
> +		csum = (u32) acc;
> +	} else {
> +		/* The kernel's crc32c() function should also detect and use the
> +		 * crc32 instruction of SSE-4.2. But calling in to this function
> +		 * is about 3x to 5x slower than the inline assembly version on
> +		 * some test machines.

That is really odd. Did you try to characterize why this is the case? Is
it purely the overhead of dispatching to the correct backend function?
That's a rather big performance hit.

> +		 */
> +		csum = crc32c(crc, data, len);
> +	}
> +
> +	return csum;
> +}
> +

<snip>
Eric Biggers March 11, 2018, 7:22 p.m. UTC | #2
On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
> maintainer]
> 
> On 10.03.2018 20:17, Andiry Xu wrote:
> <snip>
> 
> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> > +{
> > +	u8 *ptr = (u8 *) data;
> > +	u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> > +	u32 csum;
> > +
> > +	/* x86 instruction crc32 is part of SSE-4.2 */
> > +	if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> > +		/* This inline assembly implementation should be equivalent
> > +		 * to the kernel's crc32c_intel_le_hw() function used by
> > +		 * crc32c(), but this performs better on test machines.
> > +		 */
> > +		while (len > 8) {
> > +			asm volatile(/* 64b quad words */
> > +				"crc32q (%1), %0"
> > +				: "=r" (acc)
> > +				: "r"  (ptr), "0" (acc)
> > +			);
> > +			ptr += 8;
> > +			len -= 8;
> > +		}
> > +
> > +		while (len > 0) {
> > +			asm volatile(/* trailing bytes */
> > +				"crc32b (%1), %0"
> > +				: "=r" (acc)
> > +				: "r"  (ptr), "0" (acc)
> > +			);
> > +			ptr++;
> > +			len--;
> > +		}
> > +
> > +		csum = (u32) acc;
> > +	} else {
> > +		/* The kernel's crc32c() function should also detect and use the
> > +		 * crc32 instruction of SSE-4.2. But calling in to this function
> > +		 * is about 3x to 5x slower than the inline assembly version on
> > +		 * some test machines.
> 
> That is really odd. Did you try to characterize why this is the case? Is
> it purely the overhead of dispatching to the correct backend function?
> That's a rather big performance hit.
> 
> > +		 */
> > +		csum = crc32c(crc, data, len);
> > +	}
> > +
> > +	return csum;
> > +}
> > +

Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
(defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
don't hack around performance problems like this; if they exist, they need to be
fixed for everyone.

Eric
Andiry Xu March 11, 2018, 9:45 p.m. UTC | #3
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> <snip>
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +           /* This inline assembly implementation should be equivalent
>> > +            * to the kernel's crc32c_intel_le_hw() function used by
>> > +            * crc32c(), but this performs better on test machines.
>> > +            */
>> > +           while (len > 8) {
>> > +                   asm volatile(/* 64b quad words */
>> > +                           "crc32q (%1), %0"
>> > +                           : "=r" (acc)
>> > +                           : "r"  (ptr), "0" (acc)
>> > +                   );
>> > +                   ptr += 8;
>> > +                   len -= 8;
>> > +           }
>> > +
>> > +           while (len > 0) {
>> > +                   asm volatile(/* trailing bytes */
>> > +                           "crc32b (%1), %0"
>> > +                           : "=r" (acc)
>> > +                           : "r"  (ptr), "0" (acc)
>> > +                   );
>> > +                   ptr++;
>> > +                   len--;
>> > +           }
>> > +
>> > +           csum = (u32) acc;
>> > +   } else {
>> > +           /* The kernel's crc32c() function should also detect and use the
>> > +            * crc32 instruction of SSE-4.2. But calling in to this function
>> > +            * is about 3x to 5x slower than the inline assembly version on
>> > +            * some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +            */
>> > +           csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to be
> fixed for everyone.
>

I think we found the issue when implementing NOVA-Fortis metadata and
data protections, which use crc32c a lot. They have been removed in
this patchset; but I will double check and make sure if the issue
exists or not.

Thanks,
Andiry

> Eric
Andiry Xu March 19, 2018, 7:39 p.m. UTC | #4
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> <snip>
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +           /* This inline assembly implementation should be equivalent
>> > +            * to the kernel's crc32c_intel_le_hw() function used by
>> > +            * crc32c(), but this performs better on test machines.
>> > +            */
>> > +           while (len > 8) {
>> > +                   asm volatile(/* 64b quad words */
>> > +                           "crc32q (%1), %0"
>> > +                           : "=r" (acc)
>> > +                           : "r"  (ptr), "0" (acc)
>> > +                   );
>> > +                   ptr += 8;
>> > +                   len -= 8;
>> > +           }
>> > +
>> > +           while (len > 0) {
>> > +                   asm volatile(/* trailing bytes */
>> > +                           "crc32b (%1), %0"
>> > +                           : "=r" (acc)
>> > +                           : "r"  (ptr), "0" (acc)
>> > +                   );
>> > +                   ptr++;
>> > +                   len--;
>> > +           }
>> > +
>> > +           csum = (u32) acc;
>> > +   } else {
>> > +           /* The kernel's crc32c() function should also detect and use the
>> > +            * crc32 instruction of SSE-4.2. But calling in to this function
>> > +            * is about 3x to 5x slower than the inline assembly version on
>> > +            * some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +            */
>> > +           csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to be
> fixed for everyone.
>

I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
memory at 1066MHz platform.
You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
performance significantly. nova_crc32c() is still slightly faster than
crc32c() with the flag enabled.

Result numbers are follows: data size in bytes, latency in ns, column
3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
disabled.

data size (bytes)        nova_crc32c()        crc32c() -enabled
crc32c() -disabled
64                              19                           21
                        56
128                            28                           29
                       99
256                            46                           43
                       182
512                            82                           149
                      354
1024                          157                         232
                    728
2048                          305                         415
                    1440
4096                          603                         725
                    2869

Thanks,
Andiry
Eric Biggers March 19, 2018, 8:30 p.m. UTC | #5
On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
> >> maintainer]
> >>
> >> On 10.03.2018 20:17, Andiry Xu wrote:
> >> <snip>
> >>
> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> >> > +{
> >> > +   u8 *ptr = (u8 *) data;
> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> >> > +   u32 csum;
> >> > +
> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> >> > +           /* This inline assembly implementation should be equivalent
> >> > +            * to the kernel's crc32c_intel_le_hw() function used by
> >> > +            * crc32c(), but this performs better on test machines.
> >> > +            */
> >> > +           while (len > 8) {
> >> > +                   asm volatile(/* 64b quad words */
> >> > +                           "crc32q (%1), %0"
> >> > +                           : "=r" (acc)
> >> > +                           : "r"  (ptr), "0" (acc)
> >> > +                   );
> >> > +                   ptr += 8;
> >> > +                   len -= 8;
> >> > +           }
> >> > +
> >> > +           while (len > 0) {
> >> > +                   asm volatile(/* trailing bytes */
> >> > +                           "crc32b (%1), %0"
> >> > +                           : "=r" (acc)
> >> > +                           : "r"  (ptr), "0" (acc)
> >> > +                   );
> >> > +                   ptr++;
> >> > +                   len--;
> >> > +           }
> >> > +
> >> > +           csum = (u32) acc;
> >> > +   } else {
> >> > +           /* The kernel's crc32c() function should also detect and use the
> >> > +            * crc32 instruction of SSE-4.2. But calling in to this function
> >> > +            * is about 3x to 5x slower than the inline assembly version on
> >> > +            * some test machines.
> >>
> >> That is really odd. Did you try to characterize why this is the case? Is
> >> it purely the overhead of dispatching to the correct backend function?
> >> That's a rather big performance hit.
> >>
> >> > +            */
> >> > +           csum = crc32c(crc, data, len);
> >> > +   }
> >> > +
> >> > +   return csum;
> >> > +}
> >> > +
> >
> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> > that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> > don't hack around performance problems like this; if they exist, they need to be
> > fixed for everyone.
> >
> 
> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
> memory at 1066MHz platform.
> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
> performance significantly. nova_crc32c() is still slightly faster than
> crc32c() with the flag enabled.
> 
> Result numbers are follows: data size in bytes, latency in ns, column
> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
> disabled.
> 
> data size (bytes)        nova_crc32c()        crc32c() -enabled
> crc32c() -disabled
> 64                              19                           21 56
> 128                            28                           29 99
> 256                            46                           43 182
> 512                            82                           149 354
> 1024                          157                         232 728
> 2048                          305                         415 1440
> 4096                          603                         725 2869
> 

Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I suggested
may be the case; notice that your measured speeds are about the same before 512
(CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
bytes.   It would be possible to set the breakeven point in
crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is not
good enough you need to fix it for everyone, not hack around it.

Thanks,

Eric
Andiry Xu March 19, 2018, 9:59 p.m. UTC | #6
On Mon, Mar 19, 2018 at 1:30 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
>> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
>> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> >> maintainer]
>> >>
>> >> On 10.03.2018 20:17, Andiry Xu wrote:
>> >> <snip>
>> >>
>> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> >> > +{
>> >> > +   u8 *ptr = (u8 *) data;
>> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> >> > +   u32 csum;
>> >> > +
>> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> >> > +           /* This inline assembly implementation should be equivalent
>> >> > +            * to the kernel's crc32c_intel_le_hw() function used by
>> >> > +            * crc32c(), but this performs better on test machines.
>> >> > +            */
>> >> > +           while (len > 8) {
>> >> > +                   asm volatile(/* 64b quad words */
>> >> > +                           "crc32q (%1), %0"
>> >> > +                           : "=r" (acc)
>> >> > +                           : "r"  (ptr), "0" (acc)
>> >> > +                   );
>> >> > +                   ptr += 8;
>> >> > +                   len -= 8;
>> >> > +           }
>> >> > +
>> >> > +           while (len > 0) {
>> >> > +                   asm volatile(/* trailing bytes */
>> >> > +                           "crc32b (%1), %0"
>> >> > +                           : "=r" (acc)
>> >> > +                           : "r"  (ptr), "0" (acc)
>> >> > +                   );
>> >> > +                   ptr++;
>> >> > +                   len--;
>> >> > +           }
>> >> > +
>> >> > +           csum = (u32) acc;
>> >> > +   } else {
>> >> > +           /* The kernel's crc32c() function should also detect and use the
>> >> > +            * crc32 instruction of SSE-4.2. But calling in to this function
>> >> > +            * is about 3x to 5x slower than the inline assembly version on
>> >> > +            * some test machines.
>> >>
>> >> That is really odd. Did you try to characterize why this is the case? Is
>> >> it purely the overhead of dispatching to the correct backend function?
>> >> That's a rather big performance hit.
>> >>
>> >> > +            */
>> >> > +           csum = crc32c(crc, data, len);
>> >> > +   }
>> >> > +
>> >> > +   return csum;
>> >> > +}
>> >> > +
>> >
>> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
>> > that the accelerated version was being called?  Or, perhaps CRC32C_PCL_BREAKEVEN
>> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
>> > don't hack around performance problems like this; if they exist, they need to be
>> > fixed for everyone.
>> >
>>
>> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
>> memory at 1066MHz platform.
>> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
>> performance significantly. nova_crc32c() is still slightly faster than
>> crc32c() with the flag enabled.
>>
>> Result numbers are follows: data size in bytes, latency in ns, column
>> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
>> disabled.
>>
>> data size (bytes)        nova_crc32c()        crc32c() -enabled
>> crc32c() -disabled
>> 64                              19                           21 56
>> 128                            28                           29 99
>> 256                            46                           43 182
>> 512                            82                           149 354
>> 1024                          157                         232 728
>> 2048                          305                         415 1440
>> 4096                          603                         725 2869
>>
>
> Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I suggested
> may be the case; notice that your measured speeds are about the same before 512
> (CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
> bytes.   It would be possible to set the breakeven point in
> crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is not
> good enough you need to fix it for everyone, not hack around it.
>

We verify that by setting CRC32C_PCL_BREAKEVEN to 8192, the
performance difference between nova_crc32c() and kernel's crc32c() is
negligible. Thanks for the comments, and I will use kernel's crc32c()
in the next version.

Thanks,
Andiry
diff mbox

Patch

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
new file mode 100644
index 0000000..5eb696c
--- /dev/null
+++ b/fs/nova/nova.h
@@ -0,0 +1,299 @@ 
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix024@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __NOVA_H
+#define __NOVA_H
+
+#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/rtc.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+#include <linux/iomap.h>
+#include <linux/crc32c.h>
+#include <asm/tlbflush.h>
+#include <linux/version.h>
+#include <linux/pfn_t.h>
+#include <linux/pagevec.h>
+
+#include "nova_def.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define nova_dbg(s, args...)		pr_debug(s, ## args) */
+#define nova_dbg(s, args ...)		pr_info(s, ## args)
+#define nova_err(sb, s, args ...)	nova_error_mng(sb, s, ## args)
+#define nova_warn(s, args ...)		pr_warn(s, ## args)
+#define nova_info(s, args ...)		pr_info(s, ## args)
+
+extern unsigned int nova_dbgmask;
+#define NOVA_DBGMASK_MMAPHUGE	       (0x00000001)
+#define NOVA_DBGMASK_MMAP4K	       (0x00000002)
+#define NOVA_DBGMASK_MMAPVERBOSE       (0x00000004)
+#define NOVA_DBGMASK_MMAPVVERBOSE      (0x00000008)
+#define NOVA_DBGMASK_VERBOSE	       (0x00000010)
+#define NOVA_DBGMASK_TRANSACTION       (0x00000020)
+
+#define nova_dbg_mmap4k(s, args ...)		 \
+	((nova_dbgmask & NOVA_DBGMASK_MMAP4K) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapv(s, args ...)		 \
+	((nova_dbgmask & NOVA_DBGMASK_MMAPVERBOSE) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapvv(s, args ...)		 \
+	((nova_dbgmask & NOVA_DBGMASK_MMAPVVERBOSE) ? nova_dbg(s, args) : 0)
+
+#define nova_dbg_verbose(s, args ...)		 \
+	((nova_dbgmask & NOVA_DBGMASK_VERBOSE) ? nova_dbg(s, ##args) : 0)
+#define nova_dbgv(s, args ...)	nova_dbg_verbose(s, ##args)
+#define nova_dbg_trans(s, args ...)		 \
+	((nova_dbgmask & NOVA_DBGMASK_TRANSACTION) ? nova_dbg(s, ##args) : 0)
+
+#define NOVA_ASSERT(x) do {\
+			       if (!(x))\
+				       nova_warn("assertion failed %s:%d: %s\n", \
+			       __FILE__, __LINE__, #x);\
+		       } while (0)
+
+#define nova_set_bit		       __test_and_set_bit_le
+#define nova_clear_bit		       __test_and_clear_bit_le
+#define nova_find_next_zero_bit	       find_next_zero_bit_le
+
+#define clear_opt(o, opt)	(o &= ~NOVA_MOUNT_ ## opt)
+#define set_opt(o, opt)		(o |= NOVA_MOUNT_ ## opt)
+#define test_opt(sb, opt)	(NOVA_SB(sb)->s_mount_opt & NOVA_MOUNT_ ## opt)
+
+#define NOVA_LARGE_INODE_TABLE_SIZE    (0x200000)
+/* NOVA size threshold for using 2M blocks for inode table */
+#define NOVA_LARGE_INODE_TABLE_THREASHOLD    (0x20000000)
+/*
+ * nova inode flags
+ *
+ * NOVA_EOFBLOCKS_FL	There are blocks allocated beyond eof
+ */
+#define NOVA_EOFBLOCKS_FL      0x20000000
+/* Flags that should be inherited by new inodes from their parent. */
+#define NOVA_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+			    FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL |	\
+			    FS_COMPRBLK_FL | FS_NOCOMP_FL | \
+			    FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define NOVA_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define NOVA_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define NOVA_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | NOVA_EOFBLOCKS_FL)
+
+/* IOCTLs */
+#define	NOVA_PRINT_TIMING		0xBCD00010
+#define	NOVA_CLEAR_STATS		0xBCD00011
+#define	NOVA_PRINT_LOG			0xBCD00013
+#define	NOVA_PRINT_LOG_BLOCKNODE	0xBCD00014
+#define	NOVA_PRINT_LOG_PAGES		0xBCD00015
+#define	NOVA_PRINT_FREE_LISTS		0xBCD00018
+
+
+#define	READDIR_END			(ULONG_MAX)
+#define	ANY_CPU				(65536)
+#define	FREE_BATCH			(16)
+
+extern unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX];
+extern unsigned int blk_type_to_size[NOVA_BLOCK_TYPE_MAX];
+
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __le32 nova_mask_flags(umode_t mode, __le32 flags)
+{
+	flags &= cpu_to_le32(NOVA_FL_INHERITED);
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & cpu_to_le32(NOVA_REG_FLMASK);
+	else
+		return flags & cpu_to_le32(NOVA_OTHER_FLMASK);
+}
+
+static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
+{
+	u8 *ptr = (u8 *) data;
+	u64 acc = crc; /* accumulator, crc32c value in lower 32b */
+	u32 csum;
+
+	/* x86 instruction crc32 is part of SSE-4.2 */
+	if (static_cpu_has(X86_FEATURE_XMM4_2)) {
+		/* This inline assembly implementation should be equivalent
+		 * to the kernel's crc32c_intel_le_hw() function used by
+		 * crc32c(), but this performs better on test machines.
+		 */
+		while (len > 8) {
+			asm volatile(/* 64b quad words */
+				"crc32q (%1), %0"
+				: "=r" (acc)
+				: "r"  (ptr), "0" (acc)
+			);
+			ptr += 8;
+			len -= 8;
+		}
+
+		while (len > 0) {
+			asm volatile(/* trailing bytes */
+				"crc32b (%1), %0"
+				: "=r" (acc)
+				: "r"  (ptr), "0" (acc)
+			);
+			ptr++;
+			len--;
+		}
+
+		csum = (u32) acc;
+	} else {
+		/* The kernel's crc32c() function should also detect and use the
+		 * crc32 instruction of SSE-4.2. But calling in to this function
+		 * is about 3x to 5x slower than the inline assembly version on
+		 * some test machines.
+		 */
+		csum = crc32c(crc, data, len);
+	}
+
+	return csum;
+}
+
+static inline int memcpy_to_pmem_nocache(void *dst, const void *src,
+	unsigned int size)
+{
+	int ret;
+
+	ret = __copy_from_user_inatomic_nocache(dst, src, size);
+
+	return ret;
+}
+
+
+/* assumes the length to be 4-byte aligned */
+static inline void memset_nt(void *dest, uint32_t dword, size_t length)
+{
+	uint64_t dummy1, dummy2;
+	uint64_t qword = ((uint64_t)dword << 32) | dword;
+
+	asm volatile ("movl %%edx,%%ecx\n"
+		"andl $63,%%edx\n"
+		"shrl $6,%%ecx\n"
+		"jz 9f\n"
+		"1:	 movnti %%rax,(%%rdi)\n"
+		"2:	 movnti %%rax,1*8(%%rdi)\n"
+		"3:	 movnti %%rax,2*8(%%rdi)\n"
+		"4:	 movnti %%rax,3*8(%%rdi)\n"
+		"5:	 movnti %%rax,4*8(%%rdi)\n"
+		"8:	 movnti %%rax,5*8(%%rdi)\n"
+		"7:	 movnti %%rax,6*8(%%rdi)\n"
+		"8:	 movnti %%rax,7*8(%%rdi)\n"
+		"leaq 64(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 1b\n"
+		"9:	movl %%edx,%%ecx\n"
+		"andl $7,%%edx\n"
+		"shrl $3,%%ecx\n"
+		"jz 11f\n"
+		"10:	 movnti %%rax,(%%rdi)\n"
+		"leaq 8(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 10b\n"
+		"11:	 movl %%edx,%%ecx\n"
+		"shrl $2,%%ecx\n"
+		"jz 12f\n"
+		"movnti %%eax,(%%rdi)\n"
+		"12:\n"
+		: "=D"(dummy1), "=d" (dummy2)
+		: "D" (dest), "a" (qword), "d" (length)
+		: "memory", "rcx");
+}
+
+
+#include "super.h" // Remove when we factor out these and other functions.
+
+/* Translate an offset the beginning of the Nova instance to a PMEM address.
+ *
+ * If this is part of a read-modify-write of the block,
+ * nova_memunlock_block() before calling!
+ */
+static inline void *nova_get_block(struct super_block *sb, u64 block)
+{
+	struct nova_super_block *ps = nova_get_super(sb);
+
+	return block ? ((void *)ps + block) : NULL;
+}
+
+static inline int nova_get_reference(struct super_block *sb, u64 block,
+	void *dram, void **nvmm, size_t size)
+{
+	int rc;
+
+	*nvmm = nova_get_block(sb, block);
+	rc = memcpy_mcsafe(dram, *nvmm, size);
+	return rc;
+}
+
+
+static inline u64
+nova_get_addr_off(struct nova_sb_info *sbi, void *addr)
+{
+	NOVA_ASSERT((addr >= sbi->virt_addr) &&
+			(addr < (sbi->virt_addr + sbi->initsize)));
+	return (u64)(addr - sbi->virt_addr);
+}
+
+static inline u64
+nova_get_block_off(struct super_block *sb, unsigned long blocknr,
+		    unsigned short btype)
+{
+	return (u64)blocknr << PAGE_SHIFT;
+}
+
+
+static inline u64 nova_get_epoch_id(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+
+	return sbi->s_epoch_id;
+}
+
+#include "inode.h"
+#endif /* __NOVA_H */