Message ID | 1520705944-6723-6-git-send-email-jix024@eng.ucsd.edu (mailing list archive) |
---|---|
State | Changes Requested |
[Adding Herbert Xu to CC since he is the maintainer of the crypto
subsystem]

On 10.03.2018 20:17, Andiry Xu wrote:
<snip>

> +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> +{
> +	u8 *ptr = (u8 *) data;
> +	u64 acc = crc; /* accumulator, crc32c value in lower 32b */
> +	u32 csum;
> +
> +	/* x86 instruction crc32 is part of SSE-4.2 */
> +	if (static_cpu_has(X86_FEATURE_XMM4_2)) {
> +		/* This inline assembly implementation should be equivalent
> +		 * to the kernel's crc32c_intel_le_hw() function used by
> +		 * crc32c(), but this performs better on test machines.
> +		 */
> +		while (len > 8) {
> +			asm volatile(/* 64b quad words */
> +				"crc32q (%1), %0"
> +				: "=r" (acc)
> +				: "r" (ptr), "0" (acc)
> +			);
> +			ptr += 8;
> +			len -= 8;
> +		}
> +
> +		while (len > 0) {
> +			asm volatile(/* trailing bytes */
> +				"crc32b (%1), %0"
> +				: "=r" (acc)
> +				: "r" (ptr), "0" (acc)
> +			);
> +			ptr++;
> +			len--;
> +		}
> +
> +		csum = (u32) acc;
> +	} else {
> +		/* The kernel's crc32c() function should also detect and use the
> +		 * crc32 instruction of SSE-4.2. But calling in to this function
> +		 * is about 3x to 5x slower than the inline assembly version on
> +		 * some test machines.

That is really odd. Did you try to characterize why this is the case? Is
it purely the overhead of dispatching to the correct backend function?
That's a rather big performance hit.

> +		 */
> +		csum = crc32c(crc, data, len);
> +	}
> +
> +	return csum;
> +}
> +

<snip>
On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
> [Adding Herbert Xu to CC since he is the maintainer of the crypto
> subsystem]
>
> On 10.03.2018 20:17, Andiry Xu wrote:
> <snip>
>
> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
> > +{
<snip>
> > +	} else {
> > +		/* The kernel's crc32c() function should also detect and use the
> > +		 * crc32 instruction of SSE-4.2. But calling in to this function
> > +		 * is about 3x to 5x slower than the inline assembly version on
> > +		 * some test machines.
>
> That is really odd. Did you try to characterize why this is the case? Is
> it purely the overhead of dispatching to the correct backend function?
> That's a rather big performance hit.
>
> > +		 */
> > +		csum = crc32c(crc, data, len);
> > +	}
> > +
> > +	return csum;
> > +}
> > +

Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests
and that the accelerated version was being called?  Or, perhaps
CRC32C_PCL_BREAKEVEN (defined in arch/x86/crypto/crc32c-intel_glue.c) needs
to be adjusted.  Please don't hack around performance problems like this;
if they exist, they need to be fixed for everyone.

Eric
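[Editor's note: one quick way to answer Eric's first question is to ask the
crypto API which crc32c driver it actually resolved to. The minimal module
below is an editorial sketch, not part of the patch or the thread; the
module boilerplate is mine, while crypto_alloc_shash() and
crypto_tfm_alg_driver_name() are the standard kernel interfaces:]

	#include <linux/module.h>
	#include <crypto/hash.h>

	/* Print the driver backing "crc32c"; expect "crc32c-intel" when
	 * CONFIG_CRYPTO_CRC32C_INTEL is built and selected.
	 */
	static int __init crc32c_which_init(void)
	{
		struct crypto_shash *tfm = crypto_alloc_shash("crc32c", 0, 0);

		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		pr_info("crc32c driver: %s\n",
			crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)));
		crypto_free_shash(tfm);
		return 0;
	}
	module_init(crc32c_which_init);
	MODULE_LICENSE("GPL");

[The same information is visible in /proc/crypto: the crc32c entry with the
highest priority is the one crc32c() ends up calling.]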
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto
>> subsystem]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> <snip>
<snip>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests
> and that the accelerated version was being called?  Or, perhaps
> CRC32C_PCL_BREAKEVEN (defined in arch/x86/crypto/crc32c-intel_glue.c) needs
> to be adjusted.  Please don't hack around performance problems like this;
> if they exist, they need to be fixed for everyone.
>

I think we found the issue when implementing the NOVA-Fortis metadata and
data protections, which use crc32c heavily. Those have been removed from
this patchset, but I will double-check whether the issue still exists.

Thanks,
Andiry

> Eric
On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
<snip>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests
> and that the accelerated version was being called?  Or, perhaps
> CRC32C_PCL_BREAKEVEN (defined in arch/x86/crypto/crc32c-intel_glue.c) needs
> to be adjusted.  Please don't hack around performance problems like this;
> if they exist, they need to be fixed for everyone.
>

I have run the crc32c test on a Xeon X5647 at 2.93GHz with 14GB of DDR3
memory at 1066MHz. You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL
improves the performance significantly; nova_crc32c() is still slightly
faster than crc32c() with that option enabled.

The numbers are below: data size in bytes, latency in ns. Column 3 is
crc32c() with CONFIG_CRYPTO_CRC32C_INTEL enabled, column 4 with it
disabled.

data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
              64              19                  21                   56
             128              28                  29                   99
             256              46                  43                  182
             512              82                 149                  354
            1024             157                 232                  728
            2048             305                 415                 1440
            4096             603                 725                 2869

Thanks,
Andiry
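[Editor's note: the thread does not include Andiry's test harness; below is
a minimal sketch of how such a latency measurement might look in kernel
code. The buffer contents, iteration count, and reporting format are
assumptions of mine, not taken from the thread:]

	#include <linux/kernel.h>
	#include <linux/ktime.h>
	#include <linux/slab.h>
	#include <linux/crc32c.h>

	/* Time crc32c() over the same buffer sizes as the table above and
	 * report the average ns per call.
	 */
	static void crc32c_bench(void)
	{
		static const size_t sizes[] = {
			64, 128, 256, 512, 1024, 2048, 4096
		};
		const u64 iters = 100000;
		u8 *buf = kmalloc(4096, GFP_KERNEL);
		u64 j, start;
		int i;

		if (!buf)
			return;
		memset(buf, 0x5a, 4096);

		for (i = 0; i < ARRAY_SIZE(sizes); i++) {
			u32 csum = ~0;

			start = ktime_get_ns();
			for (j = 0; j < iters; j++)
				csum = crc32c(csum, buf, sizes[i]);

			pr_info("size %zu: %llu ns (csum %08x)\n", sizes[i],
				(ktime_get_ns() - start) / iters, csum);
		}
		kfree(buf);
	}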
On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
<snip>
> I have run the crc32c test on a Xeon X5647 at 2.93GHz with 14GB of DDR3
> memory at 1066MHz. You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL
> improves the performance significantly; nova_crc32c() is still slightly
> faster than crc32c() with that option enabled.
>
> The numbers are below: data size in bytes, latency in ns. Column 3 is
> crc32c() with CONFIG_CRYPTO_CRC32C_INTEL enabled, column 4 with it
> disabled.
>
> data size (bytes)   nova_crc32c()   crc32c() -enabled   crc32c() -disabled
>               64              19                  21                   56
>              128              28                  29                   99
>              256              46                  43                  182
>              512              82                 149                  354
>             1024             157                 232                  728
>             2048             305                 415                 1440
>             4096             603                 725                 2869
>

Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I
suggested may be the case; notice that your measured speeds are about the
same below 512 bytes (CRC32C_PCL_BREAKEVEN), but the crypto API version is
slower at >= 512 bytes.  It would be possible to set the breakeven point
in crc32c_intel_mod_init() depending on the CPU.  Again, if the
performance is not good enough you need to fix it for everyone, not hack
around it.

Thanks,
Eric
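[Editor's note: the dispatch Eric is describing looks roughly like the
following. This is a simplified paraphrase of crc32c_pcl_intel_update() in
arch/x86/crypto/crc32c-intel_glue.c around the time of this thread, not a
verbatim copy, and details may differ in any given tree:]

	static int crc32c_pcl_intel_update(struct shash_desc *desc,
					   const u8 *data, unsigned int len)
	{
		u32 *crcp = shash_desc_ctx(desc);

		/* Large buffers: the PCLMULQDQ-assisted routine, which must
		 * be bracketed by kernel_fpu_begin()/kernel_fpu_end().
		 */
		if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
			kernel_fpu_begin();
			*crcp = crc_pcl(data, len, *crcp);
			kernel_fpu_end();
		} else {
			/* Small buffers: a plain crc32q/crc32b instruction
			 * loop, much like the inline assembly in the patch.
			 */
			*crcp = crc32c_intel_le_hw(*crcp, data, len);
		}
		return 0;
	}

[That split matches the table above: below the 512-byte breakeven both
implementations run essentially the same crc32 instruction loop and the
numbers are nearly identical, while at >= 512 bytes the FPU save/restore
plus the PCL kernel evidently costs more than it recovers on this CPU.]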
On Mon, Mar 19, 2018 at 1:30 PM, Eric Biggers <ebiggers3@gmail.com> wrote:
> On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
<snip>
> Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I
> suggested may be the case; notice that your measured speeds are about the
> same below 512 bytes (CRC32C_PCL_BREAKEVEN), but the crypto API version is
> slower at >= 512 bytes.  It would be possible to set the breakeven point
> in crc32c_intel_mod_init() depending on the CPU.  Again, if the
> performance is not good enough you need to fix it for everyone, not hack
> around it.
>

We verified that with CRC32C_PCL_BREAKEVEN set to 8192, the performance
difference between nova_crc32c() and the kernel's crc32c() is negligible.

Thanks for the comments; I will use the kernel's crc32c() in the next
version.

Thanks,
Andiry
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
new file mode 100644
index 0000000..5eb696c
--- /dev/null
+++ b/fs/nova/nova.h
@@ -0,0 +1,299 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix024@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc., Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __NOVA_H
+#define __NOVA_H
+
+#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/rtc.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+#include <linux/iomap.h>
+#include <linux/crc32c.h>
+#include <asm/tlbflush.h>
+#include <linux/version.h>
+#include <linux/pfn_t.h>
+#include <linux/pagevec.h>
+
+#include "nova_def.h"
+
+#define PAGE_SHIFT_2M	21
+#define PAGE_SHIFT_1G	30
+
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define nova_dbg(s, args...)		pr_debug(s, ## args) */
+#define nova_dbg(s, args ...)		pr_info(s, ## args)
+#define nova_err(sb, s, args ...)	nova_error_mng(sb, s, ## args)
+#define nova_warn(s, args ...)		pr_warn(s, ## args)
+#define nova_info(s, args ...)		pr_info(s, ## args)
+
+extern unsigned int nova_dbgmask;
+#define NOVA_DBGMASK_MMAPHUGE		(0x00000001)
+#define NOVA_DBGMASK_MMAP4K		(0x00000002)
+#define NOVA_DBGMASK_MMAPVERBOSE	(0x00000004)
+#define NOVA_DBGMASK_MMAPVVERBOSE	(0x00000008)
+#define NOVA_DBGMASK_VERBOSE		(0x00000010)
+#define NOVA_DBGMASK_TRANSACTION	(0x00000020)
+
+#define nova_dbg_mmap4k(s, args ...)	\
+	((nova_dbgmask & NOVA_DBGMASK_MMAP4K) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapv(s, args ...)	\
+	((nova_dbgmask & NOVA_DBGMASK_MMAPVERBOSE) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapvv(s, args ...)	\
+	((nova_dbgmask & NOVA_DBGMASK_MMAPVVERBOSE) ? nova_dbg(s, args) : 0)
+
+#define nova_dbg_verbose(s, args ...)	\
+	((nova_dbgmask & NOVA_DBGMASK_VERBOSE) ? nova_dbg(s, ##args) : 0)
+#define nova_dbgv(s, args ...)	nova_dbg_verbose(s, ##args)
+#define nova_dbg_trans(s, args ...)	\
+	((nova_dbgmask & NOVA_DBGMASK_TRANSACTION) ? nova_dbg(s, ##args) : 0)
+
+#define NOVA_ASSERT(x)	do {\
+		if (!(x))\
+			nova_warn("assertion failed %s:%d: %s\n", \
+				__FILE__, __LINE__, #x);\
+	} while (0)
+
+#define nova_set_bit			__test_and_set_bit_le
+#define nova_clear_bit			__test_and_clear_bit_le
+#define nova_find_next_zero_bit		find_next_zero_bit_le
+
+#define clear_opt(o, opt)	(o &= ~NOVA_MOUNT_ ## opt)
+#define set_opt(o, opt)		(o |= NOVA_MOUNT_ ## opt)
+#define test_opt(sb, opt)	(NOVA_SB(sb)->s_mount_opt & NOVA_MOUNT_ ## opt)
+
+#define NOVA_LARGE_INODE_TABLE_SIZE	(0x200000)
+/* NOVA size threshold for using 2M blocks for inode table */
+#define NOVA_LARGE_INODE_TABLE_THREASHOLD	(0x20000000)
+/*
+ * nova inode flags
+ *
+ * NOVA_EOFBLOCKS_FL	There are blocks allocated beyond eof
+ */
+#define NOVA_EOFBLOCKS_FL	0x20000000
+/* Flags that should be inherited by new inodes from their parent. */
+#define NOVA_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+			   FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \
+			   FS_COMPRBLK_FL | FS_NOCOMP_FL | \
+			   FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define NOVA_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define NOVA_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define NOVA_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | NOVA_EOFBLOCKS_FL)
+
+/* IOCTLs */
+#define NOVA_PRINT_TIMING		0xBCD00010
+#define NOVA_CLEAR_STATS		0xBCD00011
+#define NOVA_PRINT_LOG			0xBCD00013
+#define NOVA_PRINT_LOG_BLOCKNODE	0xBCD00014
+#define NOVA_PRINT_LOG_PAGES		0xBCD00015
+#define NOVA_PRINT_FREE_LISTS		0xBCD00018
+
+
+#define READDIR_END			(ULONG_MAX)
+#define ANY_CPU				(65536)
+#define FREE_BATCH			(16)
+
+extern unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX];
+extern unsigned int blk_type_to_size[NOVA_BLOCK_TYPE_MAX];
+
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __le32 nova_mask_flags(umode_t mode, __le32 flags)
+{
+	flags &= cpu_to_le32(NOVA_FL_INHERITED);
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & cpu_to_le32(NOVA_REG_FLMASK);
+	else
+		return flags & cpu_to_le32(NOVA_OTHER_FLMASK);
+}
+
+static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
+{
+	u8 *ptr = (u8 *) data;
+	u64 acc = crc; /* accumulator, crc32c value in lower 32b */
+	u32 csum;
+
+	/* x86 instruction crc32 is part of SSE-4.2 */
+	if (static_cpu_has(X86_FEATURE_XMM4_2)) {
+		/* This inline assembly implementation should be equivalent
+		 * to the kernel's crc32c_intel_le_hw() function used by
+		 * crc32c(), but this performs better on test machines.
+		 */
+		while (len > 8) {
+			asm volatile(/* 64b quad words */
+				"crc32q (%1), %0"
+				: "=r" (acc)
+				: "r" (ptr), "0" (acc)
+			);
+			ptr += 8;
+			len -= 8;
+		}
+
+		while (len > 0) {
+			asm volatile(/* trailing bytes */
+				"crc32b (%1), %0"
+				: "=r" (acc)
+				: "r" (ptr), "0" (acc)
+			);
+			ptr++;
+			len--;
+		}
+
+		csum = (u32) acc;
+	} else {
+		/* The kernel's crc32c() function should also detect and use the
+		 * crc32 instruction of SSE-4.2. But calling in to this function
+		 * is about 3x to 5x slower than the inline assembly version on
+		 * some test machines.
+		 */
+		csum = crc32c(crc, data, len);
+	}
+
+	return csum;
+}
+
+static inline int memcpy_to_pmem_nocache(void *dst, const void *src,
+	unsigned int size)
+{
+	int ret;
+
+	ret = __copy_from_user_inatomic_nocache(dst, src, size);
+
+	return ret;
+}
+
+
+/* assumes the length to be 4-byte aligned */
+static inline void memset_nt(void *dest, uint32_t dword, size_t length)
+{
+	uint64_t dummy1, dummy2;
+	uint64_t qword = ((uint64_t)dword << 32) | dword;
+
+	asm volatile ("movl %%edx,%%ecx\n"
+		"andl $63,%%edx\n"
+		"shrl $6,%%ecx\n"
+		"jz 9f\n"
+		"1:	movnti %%rax,(%%rdi)\n"
+		"2:	movnti %%rax,1*8(%%rdi)\n"
+		"3:	movnti %%rax,2*8(%%rdi)\n"
+		"4:	movnti %%rax,3*8(%%rdi)\n"
+		"5:	movnti %%rax,4*8(%%rdi)\n"
+		"6:	movnti %%rax,5*8(%%rdi)\n"
+		"7:	movnti %%rax,6*8(%%rdi)\n"
+		"8:	movnti %%rax,7*8(%%rdi)\n"
+		"leaq 64(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 1b\n"
+		"9:	movl %%edx,%%ecx\n"
+		"andl $7,%%edx\n"
+		"shrl $3,%%ecx\n"
+		"jz 11f\n"
+		"10:	movnti %%rax,(%%rdi)\n"
+		"leaq 8(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 10b\n"
+		"11:	movl %%edx,%%ecx\n"
+		"shrl $2,%%ecx\n"
+		"jz 12f\n"
+		"movnti %%eax,(%%rdi)\n"
+		"12:\n"
+		: "=D"(dummy1), "=d" (dummy2)
+		: "D" (dest), "a" (qword), "d" (length)
+		: "memory", "rcx");
+}
+
+
+#include "super.h" // Remove when we factor out these and other functions.
+
+/* Translate an offset from the beginning of the Nova instance to a PMEM
+ * address.
+ *
+ * If this is part of a read-modify-write of the block,
+ * nova_memunlock_block() before calling!
+ */
+static inline void *nova_get_block(struct super_block *sb, u64 block)
+{
+	struct nova_super_block *ps = nova_get_super(sb);
+
+	return block ? ((void *)ps + block) : NULL;
+}
+
+static inline int nova_get_reference(struct super_block *sb, u64 block,
+	void *dram, void **nvmm, size_t size)
+{
+	int rc;
+
+	*nvmm = nova_get_block(sb, block);
+	rc = memcpy_mcsafe(dram, *nvmm, size);
+	return rc;
+}
+
+
+static inline u64
+nova_get_addr_off(struct nova_sb_info *sbi, void *addr)
+{
+	NOVA_ASSERT((addr >= sbi->virt_addr) &&
+			(addr < (sbi->virt_addr + sbi->initsize)));
+	return (u64)(addr - sbi->virt_addr);
+}
+
+static inline u64
+nova_get_block_off(struct super_block *sb, unsigned long blocknr,
+	unsigned short btype)
+{
+	return (u64)blocknr << PAGE_SHIFT;
+}
+
+
+static inline u64 nova_get_epoch_id(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+
+	return sbi->s_epoch_id;
+}
+
+#include "inode.h"
+#endif /* __NOVA_H */
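[Editor's note: as a closing illustration of the calling convention the
header above sets up, a typical NOVA-style caller would checksum an
on-media structure while excluding the checksum field itself. The struct
layout, field names, and the ~0 seed below are hypothetical, for
illustration only; they are not taken from this patch:]

	/* Hypothetical on-media structure; not from the patch. */
	struct nova_meta {
		__le32	csum;		/* crc32c over the rest */
		__le32	flags;
		__le64	epoch_id;
	};

	static inline u32 nova_meta_csum(const struct nova_meta *m)
	{
		/* Skip the csum field so the stored value does not feed
		 * into its own computation.
		 */
		return nova_crc32c(~0, (const u8 *)m + sizeof(m->csum),
				   sizeof(*m) - sizeof(m->csum));
	}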