diff mbox series

[RFC,1/6] Implement reexport helper library

Message ID 20220217131531.2890-2-richard@nod.at (mailing list archive)
State New, archived
Headers show
Series nfs-utils: Improving NFS re-exports | expand

Commit Message

Richard Weinberger Feb. 17, 2022, 1:15 p.m. UTC
This internal library contains code that will be used by various
tools within the nfs-utils package to deal better with NFS re-export,
especially cross mounts.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 configure.ac                 |  12 +
 support/Makefile.am          |   4 +
 support/reexport/Makefile.am |   6 +
 support/reexport/reexport.c  | 477 +++++++++++++++++++++++++++++++++++
 support/reexport/reexport.h  |  53 ++++
 5 files changed, 552 insertions(+)
 create mode 100644 support/reexport/Makefile.am
 create mode 100644 support/reexport/reexport.c
 create mode 100644 support/reexport/reexport.h

Comments

J. Bruce Fields March 8, 2022, 9:44 p.m. UTC | #1
On Thu, Feb 17, 2022 at 02:15:26PM +0100, Richard Weinberger wrote:
> +#define REEXPDB_SHM_NAME "/nfs_reexport_db_lock"
> +#define REEXPDB_SHM_SZ 4096
> +#define REEXPDB_INIT_LOCK NFS_STATEDIR "/reexpdb_init.lock"
> +#define REEXPDB_DBFILE NFS_STATEDIR "/reexpdb.sqlite3"

I don't know much about sqlite--why do we need to do our own file
locking?  If we do need to do it ourselves, could we lock the database
file instead of using a separate lock file?

> +static const char initdb_sql[] = "CREATE TABLE IF NOT EXISTS fsidnums (num INTEGER PRIMARY KEY CHECK (num > 0 AND num < 4294967296), path TEXT UNIQUE); CREATE TABLE IF NOT EXISTS subvolumes (path TEXT PRIMARY KEY); CREATE INDEX IF NOT EXISTS idx_ids_path ON fsidnums (path);";

I'd personally find it easier to read if these were defined in the place
where they're used.  (And, honestly, if this is just used once, maybe
the definition is unnecessary.)

What are the two tables used for?  Naively I'd've thought the
"subvolumes" table was redundant.

> +/*
> + * This query is a little tricky. We use SQL to find and claim the smallest free fsid number.

Yes, that is a little tricky.  Is it necessary?  My SQL Is rusty, but
the database should be able to pick a unique value for us, shouldn't it?

> + * To find a free fsid the fsidnums is left joined to itself but with an offset of 1.
> + * Everything after the UNION statement is to handle the corner case where fsidnums
> + * is empty. In this case we want 1 as first fsid number.
> + */
> +static const char new_fsidnum_by_path_sql[] = "INSERT INTO fsidnums VALUES ((SELECT ids1.num + 1 FROM fsidnums AS ids1 LEFT JOIN fsidnums AS ids2 ON ids2.num = ids1.num + 1 WHERE ids2.num IS NULL UNION SELECT 1 WHERE NOT EXISTS (SELECT NULL FROM fsidnums WHERE num = 1) LIMIT 1), ?1) RETURNING num;";
> +static const char fsidnum_by_path_sql[] = "SELECT num FROM fsidnums WHERE path = ?1;";
> +static const char add_crossed_volume_sql[] = "REPLACE INTO subvolumes VALUES(?1);";
> +static const char drop_crossed_volume_sql[] = "DELETE FROM subvolumes WHERE path = ?1;";
> +static const char get_crossed_volumes_sql[] = "SELECT path from subvolumes;";
...
> +/*
> + * reexpdb_init - Initialize reexport database
> + *
> + * Setup shared lock (database is concurrently used by multiple processes),

So, this should all work when rpc.mountd is run with --num_threads > 1?

--b.
Richard Weinberger March 9, 2022, 9:43 a.m. UTC | #2
Bruce,

----- Ursprüngliche Mail -----
> Von: "bfields" <bfields@fieldses.org>
> On Thu, Feb 17, 2022 at 02:15:26PM +0100, Richard Weinberger wrote:
>> +#define REEXPDB_SHM_NAME "/nfs_reexport_db_lock"
>> +#define REEXPDB_SHM_SZ 4096
>> +#define REEXPDB_INIT_LOCK NFS_STATEDIR "/reexpdb_init.lock"
>> +#define REEXPDB_DBFILE NFS_STATEDIR "/reexpdb.sqlite3"
> 
> I don't know much about sqlite--why do we need to do our own file
> locking?  If we do need to do it ourself, could we lock the database
> file instead instead of using a separate lock file?

Concurrent access to the database is synchronized using a shared rwlock (on shared memory).
reexpdb_init.lock is used to make sure that creating and initializing the shared memory/lock
happens once.
 
>> +static const char initdb_sql[] = "CREATE TABLE IF NOT EXISTS fsidnums (num
>> INTEGER PRIMARY KEY CHECK (num > 0 AND num < 4294967296), path TEXT UNIQUE);
>> CREATE TABLE IF NOT EXISTS subvolumes (path TEXT PRIMARY KEY); CREATE INDEX IF
>> NOT EXISTS idx_ids_path ON fsidnums (path);";
> 
> I'd personally find it easier to read if these were defined in the place
> where they're used.  (And, honestly, if this is just used once, maybe
> the definition is unnecessary.)

Ok.
 
> What are the two tables used for?  Naively I'd've thought the
> "subvolumes" table was redundant.

fsidnums is used to store generated and predefined fsid numbers.
It is only used in reexport modes auto-fsidnum and predefined-fsidnum.

subvolumes contains a list of subvolumes which are likely in use by
a client. Upon start all these paths will get touched such that they can
be exported.

>> +/*
>> + * This query is a little tricky. We use SQL to find and claim the smallest
>> free fsid number.
> 
> Yes, that is a little tricky.  Is it necessary?  My SQL Is rusty, but
> the database should be able to pick a unique value for us, shouldn't it?

SQLite can generate a unique value, but we cannot select the range.
It will give a value between 0 and 2^64.
We need an id between 1 and 2^32. 
 
>> + * To find a free fsid the fsidnums is left joined to itself but with an offset
>> of 1.
>> + * Everything after the UNION statement is to handle the corner case where
>> fsidnums
>> + * is empty. In this case we want 1 as first fsid number.
>> + */
>> +static const char new_fsidnum_by_path_sql[] = "INSERT INTO fsidnums VALUES
>> ((SELECT ids1.num + 1 FROM fsidnums AS ids1 LEFT JOIN fsidnums AS ids2 ON
>> ids2.num = ids1.num + 1 WHERE ids2.num IS NULL UNION SELECT 1 WHERE NOT EXISTS
>> (SELECT NULL FROM fsidnums WHERE num = 1) LIMIT 1), ?1) RETURNING num;";
>> +static const char fsidnum_by_path_sql[] = "SELECT num FROM fsidnums WHERE path
>> = ?1;";
>> +static const char add_crossed_volume_sql[] = "REPLACE INTO subvolumes
>> VALUES(?1);";
>> +static const char drop_crossed_volume_sql[] = "DELETE FROM subvolumes WHERE
>> path = ?1;";
>> +static const char get_crossed_volumes_sql[] = "SELECT path from subvolumes;";
> ...
>> +/*
>> + * reexpdb_init - Initialize reexport database
>> + *
>> + * Setup shared lock (database is concurrently used by multiple processes),
> 
> So, this should all work when rpc.mountd is run with --num_threads > 1?

Yes, that's why we need the shared rwlock.

Thanks,
//richard
J. Bruce Fields March 9, 2022, 2:19 p.m. UTC | #3
On Wed, Mar 09, 2022 at 10:43:34AM +0100, Richard Weinberger wrote:
> Bruce,
> 
> ----- Ursprüngliche Mail -----
> > Von: "bfields" <bfields@fieldses.org>
> > On Thu, Feb 17, 2022 at 02:15:26PM +0100, Richard Weinberger wrote:
> >> +#define REEXPDB_SHM_NAME "/nfs_reexport_db_lock"
> >> +#define REEXPDB_SHM_SZ 4096
> >> +#define REEXPDB_INIT_LOCK NFS_STATEDIR "/reexpdb_init.lock"
> >> +#define REEXPDB_DBFILE NFS_STATEDIR "/reexpdb.sqlite3"
> > 
> > I don't know much about sqlite--why do we need to do our own file
> > locking?  If we do need to do it ourself, could we lock the database
> > file instead instead of using a separate lock file?
> 
> Concurrent access to the database is synchronized using a shared rwlock (on shared memory).
> reexpdb_init.lock is used to make sure that creating and initializing the shared memory/lock
> happens once.

Could you point me to sqlite documentation that explains why the user
would need to do their own locking?

I assumed sqlite would do any necessary locking for you.  It seems like
a core function for a database.

> > What are the two tables used for?  Naively I'd've thought the
> > "subvolumes" table was redundant.
> 
> fsidnums is used to store generated and predefined fsid numbers.
> It is only used in reexport modes auto-fsidnum and predefined-fsidnum.
> 
> subvolumes contains a list of subvolumes which a are likely in use by
> a client. Up start all these paths will get touched such that they can
> be exported.

The fsidnums also contains that same list of paths, right?  So I don't
understand why we need both.

Also, if we're depending on touching all the paths on startup, something
is wrong.

What we want to do is touch the path when we get an upcall for the given
fsid.  That way we don't have to assume, for example, that the system
will never expire mounts that haven't been used recently.

> >> +/*
> >> + * This query is a little tricky. We use SQL to find and claim the smallest
> >> free fsid number.
> > 
> > Yes, that is a little tricky.  Is it necessary?  My SQL Is rusty, but
> > the database should be able to pick a unique value for us, shouldn't it?
> 
> SQLite can generate a unique value, but we cannot select the range.
> It will give a value between 0 and 2^64.
> We need an id between 1 and 2^32. 

Surely that CHECK constraint doesn't somehow cause sqlite to generate
non-unique primary keys?  At worst I'd think it would cause INSERTs to
fail if the ordinary primary-key-choosing algorithm chooses something
over 2^32.

--b.
Richard Weinberger March 9, 2022, 3:02 p.m. UTC | #4
Bruce,

----- Ursprüngliche Mail -----
> Von: "bfields" <bfields@fieldses.org>
>> Concurrent access to the database is synchronized using a shared rwlock (on
>> shared memory).
>> reexpdb_init.lock is used to make sure that creating and initializing the shared
>> memory/lock
>> happens once.
> 
> Could you point me to sqlite documentation that explains why the user
> would need to do their own locking?

https://www.sqlite.org/rescode.html#busy
 
> I assumed sqlite would do any necessary locking for you.  It seems like
> a core function for a database.

Well, SQLite does locking but no queuing.
So, as soon as somebody is writing to the database it is locked and all other
reads/writes will fail with either SQLITE_BUSY or SQLITE_LOCKED.
It is up to the user how to react to that.
 
That's why I chose to use a shared rwlock where a task can *wait* upon
conflicting access.

Maybe there is a better way to do it, dunno.

>> > What are the two tables used for?  Naively I'd've thought the
>> > "subvolumes" table was redundant.
>> 
>> fsidnums is used to store generated and predefined fsid numbers.
>> It is only used in reexport modes auto-fsidnum and predefined-fsidnum.
>> 
>> subvolumes contains a list of subvolumes which a are likely in use by
>> a client. Up start all these paths will get touched such that they can
>> be exported.
> 
> The fsidnums also contains that same list of paths, right?  So I don't
> understand why we need both.

In the current design generated fsidnums will stay forever while the paths
in subvolumes can get cleaned.
 
> Also, if we're depending on touching all the paths on startup, something
> is wrong.

I think we talked about that already and agreed that it should work without
touching. So far I haven't had a chance to investigate this.

> What we want to do is touch the path when we get an upcall for the given
> fsid.  That way we don't have to assume, for example, that the system
> will never expire mounts that haven't been used recently.
> 
>> >> +/*
>> >> + * This query is a little tricky. We use SQL to find and claim the smallest
>> >> free fsid number.
>> > 
>> > Yes, that is a little tricky.  Is it necessary?  My SQL Is rusty, but
>> > the database should be able to pick a unique value for us, shouldn't it?
>> 
>> SQLite can generate a unique value, but we cannot select the range.
>> It will give a value between 0 and 2^64.
>> We need an id between 1 and 2^32.
> 
> Surely that CHECK constraint doesn't somehow cause sqlite to generate
> non-unique primary keys?  At worst I'd think it would cause INSERTs to
> fail if the ordinary primary-key-choosing algorithm chooses something
> over 2^32.

The CHECK is just a paranoid check. My SQL INSERT generates ids starting with 1.
Sure, if you run it 2^32 times, it will fail due to the CHECK.

Thanks,
//richard
J. Bruce Fields March 9, 2022, 3:28 p.m. UTC | #5
On Wed, Mar 09, 2022 at 04:02:11PM +0100, Richard Weinberger wrote:
> Bruce,
> 
> ----- Ursprüngliche Mail -----
> > Von: "bfields" <bfields@fieldses.org>
> >> Concurrent access to the database is synchronized using a shared rwlock (on
> >> shared memory).
> >> reexpdb_init.lock is used to make sure that creating and initializing the shared
> >> memory/lock
> >> happens once.
> > 
> > Could you point me to sqlite documentation that explains why the user
> > would need to do their own locking?
> 
> https://www.sqlite.org/rescode.html#busy
>  
> > I assumed sqlite would do any necessary locking for you.  It seems like
> > a core function for a database.
> 
> Well, SQLite does locking but no queuing.
> So, as soon somebody is writing the data base it is locked and all other
> read/writes will fail either with SQLITE_BUSY or SQLITE_LOCKED.
> It is up to the user how to react on that.
>  
> That's why I chose to use a shared rwlock where a task can *wait* upon
> conflicting access.
> 
> Maybe there is a better way do it, dunno.

Oh, got it, thanks for the explanation.

Assuming writes are rare, maybe a dumb retry loop would be adequate.
Sounds like that's what we'd need anyway if we were to share the
database between cooperating re-export servers.  (Would we have a
performance problem in that case, if several reexport servers start at
once and all start trying to populate the shared database?  I don't
know.)

Anyway, it's a judgement call, fair enough.  Might be worth a brief
comment, at least.

> >> > What are the two tables used for?  Naively I'd've thought the
> >> > "subvolumes" table was redundant.
> >> 
> >> fsidnums is used to store generated and predefined fsid numbers.
> >> It is only used in reexport modes auto-fsidnum and predefined-fsidnum.
> >> 
> >> subvolumes contains a list of subvolumes which a are likely in use by
> >> a client. Up start all these paths will get touched such that they can
> >> be exported.
> > 
> > The fsidnums also contains that same list of paths, right?  So I don't
> > understand why we need both.
> 
> In the current design generated fsidnums will stay forever while the paths
> in subvolumes can get cleaned.
>  
> > Also, if we're depending on touching all the paths on startup, something
> > is wrong.
> 
> I think we talked about that already and agreed that it should work without
> touching. So far I didn't had a chance to investigate into this.

OK.  Do you think you could look into that, and strip this down to the
one auto-fsidnum case, and then continue the discussion?  I think that'd
clarify things.

As I say, I wouldn't necessarily be opposed to later adding a reexport=
option back in, but for now I'd first like to see if we can find the
simplest patches that will solve the problem in one good-enough way.

> > What we want to do is touch the path when we get an upcall for the given
> > fsid.  That way we don't have to assume, for example, that the system
> > will never expire mounts that haven't been used recently.
> > 
> >> >> +/*
> >> >> + * This query is a little tricky. We use SQL to find and claim the smallest
> >> >> free fsid number.
> >> > 
> >> > Yes, that is a little tricky.  Is it necessary?  My SQL Is rusty, but
> >> > the database should be able to pick a unique value for us, shouldn't it?
> >> 
> >> SQLite can generate a unique value, but we cannot select the range.
> >> It will give a value between 0 and 2^64.
> >> We need an id between 1 and 2^32.
> > 
> > Surely that CHECK constraint doesn't somehow cause sqlite to generate
> > non-unique primary keys?  At worst I'd think it would cause INSERTs to
> > fail if the ordinary primary-key-choosing algorithm chooses something
> > over 2^32.
> 
> The CHECK is just a paranoid check. My SQL INSERT generates ids starting with 1.
> Sure, if you run it 2^32 times, it will fail due to the CHECK.

OK.

--b.
diff mbox series

Patch

diff --git a/configure.ac b/configure.ac
index 93626d62..86bf8ba9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -274,6 +274,17 @@  AC_ARG_ENABLE(nfsv4server,
 	fi
 	AM_CONDITIONAL(CONFIG_NFSV4SERVER, [test "$enable_nfsv4server" = "yes" ])
 
+AC_ARG_ENABLE(reexport,
+	[AC_HELP_STRING([--enable-reexport],
+			[enable support for re-exporting NFS mounts  @<:@default=no@:>@])],
+	enable_reexport=$enableval,
+	enable_reexport="no")
+	if test "$enable_reexport" = yes; then
+		AC_DEFINE(HAVE_REEXPORT_SUPPORT, 1,
+                          [Define this if you want NFS re-export support compiled in])
+	fi
+	AM_CONDITIONAL(CONFIG_REEXPORT, [test "$enable_reexport" = "yes" ])
+
 dnl Check for TI-RPC library and headers
 AC_LIBTIRPC
 
@@ -730,6 +741,7 @@  AC_CONFIG_FILES([
 	support/nsm/Makefile
 	support/nfsidmap/Makefile
 	support/nfsidmap/libnfsidmap.pc
+	support/reexport/Makefile
 	tools/Makefile
 	tools/locktest/Makefile
 	tools/nlmtest/Makefile
diff --git a/support/Makefile.am b/support/Makefile.am
index c962d4d4..986e9b5f 100644
--- a/support/Makefile.am
+++ b/support/Makefile.am
@@ -10,6 +10,10 @@  if CONFIG_JUNCTION
 OPTDIRS += junction
 endif
 
+if CONFIG_REEXPORT
+OPTDIRS += reexport
+endif
+
 SUBDIRS = export include misc nfs nsm $(OPTDIRS)
 
 MAINTAINERCLEANFILES = Makefile.in
diff --git a/support/reexport/Makefile.am b/support/reexport/Makefile.am
new file mode 100644
index 00000000..9d544a8f
--- /dev/null
+++ b/support/reexport/Makefile.am
@@ -0,0 +1,6 @@ 
+## Process this file with automake to produce Makefile.in
+
+noinst_LIBRARIES = libreexport.a
+libreexport_a_SOURCES = reexport.c
+
+MAINTAINERCLEANFILES = Makefile.in
diff --git a/support/reexport/reexport.c b/support/reexport/reexport.c
new file mode 100644
index 00000000..551ec278
--- /dev/null
+++ b/support/reexport/reexport.c
@@ -0,0 +1,477 @@ 
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sqlite3.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#include "nfslib.h"
+#include "reexport.h"
+#include "xlog.h"
+
+#define REEXPDB_SHM_NAME "/nfs_reexport_db_lock"
+#define REEXPDB_SHM_SZ 4096
+#define REEXPDB_INIT_LOCK NFS_STATEDIR "/reexpdb_init.lock"
+#define REEXPDB_DBFILE NFS_STATEDIR "/reexpdb.sqlite3"
+
+static const char initdb_sql[] = "CREATE TABLE IF NOT EXISTS fsidnums (num INTEGER PRIMARY KEY CHECK (num > 0 AND num < 4294967296), path TEXT UNIQUE); CREATE TABLE IF NOT EXISTS subvolumes (path TEXT PRIMARY KEY); CREATE INDEX IF NOT EXISTS idx_ids_path ON fsidnums (path);";
+/*
+ * This query is a little tricky. We use SQL to find and claim the smallest free fsid number.
+ * To find a free fsid the fsidnums table is left joined to itself but with an offset of 1.
+ * Everything after the UNION statement handles the case where fsid number 1 is not
+ * in use, in particular when fsidnums is empty. In this case we want 1 as fsid number.
+ */
+static const char new_fsidnum_by_path_sql[] = "INSERT INTO fsidnums VALUES ((SELECT ids1.num + 1 FROM fsidnums AS ids1 LEFT JOIN fsidnums AS ids2 ON ids2.num = ids1.num + 1 WHERE ids2.num IS NULL UNION SELECT 1 WHERE NOT EXISTS (SELECT NULL FROM fsidnums WHERE num = 1) LIMIT 1), ?1) RETURNING num;";
+static const char fsidnum_by_path_sql[] = "SELECT num FROM fsidnums WHERE path = ?1;";
+static const char add_crossed_volume_sql[] = "REPLACE INTO subvolumes VALUES(?1);";
+static const char drop_crossed_volume_sql[] = "DELETE FROM subvolumes WHERE path = ?1;";
+static const char get_crossed_volumes_sql[] = "SELECT path from subvolumes;";
+
+static sqlite3 *db;
+static pthread_rwlock_t *reexpdb_rwlock;
+static int init_done;
+
+/*
+ * Take the shared database lock for writing; blocks until it is available.
+ *
+ * The lock call is deliberately kept outside of assert(): when the code is
+ * built with NDEBUG, assert() discards its argument and the lock would
+ * otherwise never be taken.
+ */
+static void reexpdb_wrlock(void)
+{
+	int err = pthread_rwlock_wrlock(reexpdb_rwlock);
+
+	assert(err == 0);
+	(void)err;
+}
+
+/*
+ * Take the shared database lock for reading; blocks until it is available.
+ *
+ * The lock call is deliberately kept outside of assert(): when the code is
+ * built with NDEBUG, assert() discards its argument and the lock would
+ * otherwise never be taken.
+ */
+static void reexpdb_rdlock(void)
+{
+	int err = pthread_rwlock_rdlock(reexpdb_rwlock);
+
+	assert(err == 0);
+	(void)err;
+}
+
+/*
+ * Release the shared database lock taken by reexpdb_rdlock() or
+ * reexpdb_wrlock().
+ *
+ * The unlock call is deliberately kept outside of assert(): when the code is
+ * built with NDEBUG, assert() discards its argument and the lock would
+ * otherwise never be released.
+ */
+static void reexpdb_unlock(void)
+{
+	int err = pthread_rwlock_unlock(reexpdb_rwlock);
+
+	assert(err == 0);
+	(void)err;
+}
+
+/*
+ * init_shm_lock - Map the process-shared rwlock that serializes database access.
+ *
+ * Opens the POSIX shared memory object REEXPDB_SHM_NAME (creating and sizing
+ * it if it does not exist yet), maps it and stores the mapping in
+ * reexpdb_rwlock. The process that created the object also initializes the
+ * rwlock inside it as PTHREAD_PROCESS_SHARED. The whole sequence runs under
+ * an exclusive flock() on REEXPDB_INIT_LOCK so that creation and
+ * initialization happen only once.
+ *
+ * If this process created the object but could not finish setting it up, the
+ * object is unlinked again so that no other process maps an uninitialized
+ * lock.
+ *
+ * NOTE(review): xlog(L_FATAL, ...) presumably terminates the process; the
+ * cleanup paths below only matter if it returns -- confirm.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int init_shm_lock(void)
+{
+	pthread_rwlockattr_t attr;
+	int lockfd, shmfd;
+	int created = 0;
+	int ret = -1;
+	int err;
+
+	assert(sizeof(*reexpdb_rwlock) <= REEXPDB_SHM_SZ);
+
+	lockfd = open(REEXPDB_INIT_LOCK, O_RDWR | O_CREAT, 0600);
+	if (lockfd == -1) {
+		xlog(L_FATAL, "Unable to open %s: %m", REEXPDB_INIT_LOCK);
+		goto out;
+	}
+
+	if (flock(lockfd, LOCK_EX) == -1) {
+		xlog(L_FATAL, "Unable to lock %s: %m", REEXPDB_INIT_LOCK);
+		goto out_close;
+	}
+
+	shmfd = shm_open(REEXPDB_SHM_NAME, O_RDWR, 0600);
+	if (shmfd == -1 && errno == ENOENT) {
+		shmfd = shm_open(REEXPDB_SHM_NAME, O_RDWR | O_CREAT, 0600);
+		if (shmfd == -1) {
+			xlog(L_FATAL, "Unable to create shared memory: %m");
+			goto out_unflock;
+		}
+		created = 1;
+
+		if (ftruncate(shmfd, REEXPDB_SHM_SZ) == -1) {
+			xlog(L_FATAL, "Unable to ftruncate shared memory: %m");
+			close(shmfd);
+			goto out_unlink;
+		}
+	} else if (shmfd == -1) {
+		xlog(L_FATAL, "Unable to open shared memory: %m");
+		goto out_unflock;
+	}
+
+	reexpdb_rwlock = mmap(NULL, REEXPDB_SHM_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0);
+	if (reexpdb_rwlock == MAP_FAILED) {
+		/* Log before close() so that %m still refers to the mmap() error. */
+		xlog(L_FATAL, "Unable to mmap shared memory: %m");
+		close(shmfd);
+		reexpdb_rwlock = NULL;
+		goto out_unlink;
+	}
+	/* The mapping stays valid after the descriptor is closed. */
+	close(shmfd);
+
+	if (created) {
+		/*
+		 * pthread functions return the error number instead of
+		 * setting errno; copy it over so that %m prints it.
+		 */
+		err = pthread_rwlockattr_init(&attr);
+		if (err != 0) {
+			errno = err;
+			xlog(L_FATAL, "Unable to pthread_rwlockattr_init: %m");
+			goto out_unmap;
+		}
+
+		err = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+		if (err != 0) {
+			errno = err;
+			xlog(L_FATAL, "Unable to set PTHREAD_PROCESS_SHARED: %m");
+			pthread_rwlockattr_destroy(&attr);
+			goto out_unmap;
+		}
+
+		err = pthread_rwlock_init(reexpdb_rwlock, &attr);
+		pthread_rwlockattr_destroy(&attr);
+		if (err != 0) {
+			errno = err;
+			xlog(L_FATAL, "Unable to pthread_rwlock_init: %m");
+			goto out_unmap;
+		}
+	}
+
+	ret = 0;
+	goto out_unflock;
+
+out_unmap:
+	munmap((void *)reexpdb_rwlock, REEXPDB_SHM_SZ);
+	reexpdb_rwlock = NULL;
+out_unlink:
+	if (created)
+		shm_unlink(REEXPDB_SHM_NAME);
+out_unflock:
+	flock(lockfd, LOCK_UN);
+out_close:
+	close(lockfd);
+out:
+	return ret;
+}
+
+/*
+ * reexpdb_init - Initialize reexport database
+ *
+ * Set up the shared lock (the database is used concurrently by multiple
+ * processes), open the database handle and create the tables if needed.
+ * It is okay to call this function multiple times per process; once the
+ * initialization has succeeded further calls return immediately.
+ *
+ * On failure everything that was set up by this call is torn down again,
+ * so the function may simply be called again later.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int reexpdb_init(void)
+{
+	char *sqlerr = NULL;
+	int ret;
+
+	if (init_done)
+		return 0;
+
+	if (init_shm_lock() != 0)
+		return -1;
+
+	ret = sqlite3_open_v2(REEXPDB_DBFILE, &db, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_FULLMUTEX, NULL);
+	if (ret != SQLITE_OK) {
+		xlog(L_ERROR, "Unable to open reexport database: %s", sqlite3_errstr(ret));
+		goto out_fail;
+	}
+
+	reexpdb_wrlock();
+	ret = sqlite3_exec(db, initdb_sql, NULL, NULL, &sqlerr);
+	reexpdb_unlock();
+	if (ret != SQLITE_OK) {
+		/* Prefer the detailed message from sqlite3_exec() when there is one. */
+		xlog(L_ERROR, "Unable to init reexport database: %s", sqlerr ? sqlerr : sqlite3_errstr(ret));
+		sqlite3_free(sqlerr);
+		goto out_fail;
+	}
+
+	init_done = 1;
+	return 0;
+
+out_fail:
+	/*
+	 * sqlite3_open_v2() usually hands out a handle even when it fails,
+	 * and that handle has to be released. Closing a NULL handle is a
+	 * harmless no-op.
+	 */
+	sqlite3_close_v2(db);
+	db = NULL;
+	munmap((void *)reexpdb_rwlock, REEXPDB_SHM_SZ);
+	reexpdb_rwlock = NULL;
+	return -1;
+}
+
+/*
+ * reexpdb_destroy - Undo reexpdb_init().
+ *
+ * Closes the database handle and unmaps the shared lock from this process.
+ * The shared memory object itself is kept, because we cannot know which
+ * other processes are still using the database.
+ *
+ * Calling this function without a prior successful reexpdb_init() is a no-op.
+ */
+void reexpdb_destroy(void)
+{
+	if (!init_done)
+		return;
+
+	sqlite3_close_v2(db);
+	db = NULL;
+	munmap((void *)reexpdb_rwlock, REEXPDB_SHM_SZ);
+	reexpdb_rwlock = NULL;
+
+	/*
+	 * Let a later reexpdb_init() set everything up again. Without this
+	 * it would return early and leave its caller with a closed database
+	 * handle and no lock.
+	 */
+	init_done = 0;
+}
+
+/*
+ * Look up the fsid number stored for @path in the fsidnums table.
+ *
+ * The query is executed while holding the shared read lock. @fsidnum is
+ * only written when a row was found.
+ *
+ * Returns 1 if an entry was found, 0 if there is none or an error occurred.
+ */
+static int get_fsidnum_by_path(char *path, uint32_t *fsidnum)
+{
+	sqlite3_stmt *query = NULL;
+	int hit = 0;
+	int rc;
+
+	rc = sqlite3_prepare_v2(db, fsidnum_by_path_sql, sizeof(fsidnum_by_path_sql), &query, NULL);
+	if (rc != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to prepare SQL query: %s", sqlite3_errstr(rc));
+		sqlite3_finalize(query);
+		return 0;
+	}
+
+	rc = sqlite3_bind_text(query, 1, path, -1, NULL);
+	if (rc != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to bind \"%s\" SQL query: %s", __func__, sqlite3_errstr(rc));
+		sqlite3_finalize(query);
+		return 0;
+	}
+
+	reexpdb_rdlock();
+	rc = sqlite3_step(query);
+	switch (rc) {
+	case SQLITE_ROW:
+		*fsidnum = sqlite3_column_int(query, 0);
+		hit = 1;
+		break;
+	case SQLITE_DONE:
+		/* The path is not in the database. */
+		break;
+	default:
+		xlog(L_WARNING, "Error while looking up \"%s\" in database: %s", path, sqlite3_errstr(rc));
+		break;
+	}
+	reexpdb_unlock();
+
+	sqlite3_finalize(query);
+	return hit;
+}
+
+/*
+ * Claim a new fsid number for @path and store the mapping in the fsidnums
+ * table.
+ *
+ * The insert runs while holding the shared write lock. If it fails with
+ * SQLITE_CONSTRAINT the path is looked up again, since another writer may
+ * have inserted it in the meantime.
+ *
+ * Returns 1 if @fsidnum was set (newly created or found on the re-check),
+ * 0 otherwise.
+ */
+static int new_fsidnum_by_path(char *path, uint32_t *fsidnum)
+{
+	sqlite3_stmt *stmt = NULL;
+	int found = 0, check = 0;
+	int ret;
+
+	ret = sqlite3_prepare_v2(db, new_fsidnum_by_path_sql, sizeof(new_fsidnum_by_path_sql), &stmt, NULL);
+	if (ret != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to prepare SQL query: %s", sqlite3_errstr(ret));
+		goto out;
+	}
+
+	ret = sqlite3_bind_text(stmt, 1, path, -1, NULL);
+	if (ret != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to bind \"%s\" SQL query: %s", path, sqlite3_errstr(ret));
+		goto out;
+	}
+
+	reexpdb_wrlock();
+	ret = sqlite3_step(stmt);
+	if (ret == SQLITE_ROW) {
+		/* The statement ends in RETURNING num, so the row carries the new id. */
+		*fsidnum = sqlite3_column_int(stmt, 0);
+		found = 1;
+	} else if (ret == SQLITE_CONSTRAINT) {
+		/* Maybe we lost the race against another writer and the path is now present. */
+		check = 1;
+	} else {
+		xlog(L_WARNING, "Error while inserting \"%s\" into database: %s", path, sqlite3_errstr(ret));
+	}
+	reexpdb_unlock();
+
+out:
+	sqlite3_finalize(stmt);
+
+	/* Re-check only after the write lock is dropped; the lookup takes the read lock. */
+	if (check) {
+		found = get_fsidnum_by_path(path, fsidnum);
+		if (!found)
+			xlog(L_WARNING, "SQLITE_CONSTRAINT error while inserting \"%s\" in database", path);
+	}
+
+	return found;
+}
+
+/*
+ * reexpdb_fsidnum_by_path - Get the fsid number that belongs to @path.
+ *
+ * The database is searched first. If there is no entry and @may_create is
+ * non-zero, a new fsid number is allocated for @path.
+ *
+ * Returns non-zero if @fsidnum was set, 0 otherwise.
+ */
+int reexpdb_fsidnum_by_path(char *path, uint32_t *fsidnum, int may_create)
+{
+	int known = get_fsidnum_by_path(path, fsidnum);
+
+	if (known || !may_create)
+		return known;
+
+	return new_fsidnum_by_path(path, fsidnum);
+}
+
+/*
+ * Resolve the numerical fsid of @ep through the reexport database.
+ * Helper for the REEXP_AUTO_FSIDNUM and REEXP_PREDEFINED_FSIDNUM modes.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int reexpdb_resolve_fsidnum(struct exportent *ep, char *flname, int flline)
+{
+	uint32_t fsidnum;
+
+	/* An export with e_uuid set needs no numerical fsid from the database. */
+	if (ep->e_uuid)
+		return 0;
+
+	if (reexpdb_init() != 0)
+		return -1;
+
+	if (!reexpdb_fsidnum_by_path(ep->e_path, &fsidnum, 0)) {
+		if (ep->e_reexport != REEXP_AUTO_FSIDNUM) {
+			/*
+			 * Predefined mode never creates entries: without a
+			 * database entry the configured fsid is used as is.
+			 */
+			if (ep->e_fsid)
+				return 0;
+
+			xlog(L_ERROR, "%s:%i: Selected 'reexport=' mode requires either a UUID 'fsid=' or a numerical 'fsid=' or a reexport db entry",
+			     flname, flline);
+			return -1;
+		}
+
+		if (!reexpdb_fsidnum_by_path(ep->e_path, &fsidnum, 1)) {
+			xlog(L_ERROR, "%s:%i: Unable to generate fsid for %s",
+			     flname, flline, ep->e_path);
+			return -1;
+		}
+	}
+
+	/* No fsid configured: take over the one from the database. */
+	if (!ep->e_fsid) {
+		ep->e_fsid = fsidnum;
+		return 0;
+	}
+
+	/* A configured fsid must not contradict the database. */
+	if (ep->e_fsid != fsidnum) {
+		xlog(L_ERROR, "%s:%i: Selected 'reexport=' mode requires configured numerical 'fsid=' to agree with reexport db entry",
+		     flname, flline);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * reexpdb_apply_reexport_settings - Validate and complete the fsid of an export.
+ *
+ * @ep:     export entry; ep->e_reexport selects the mode, ep->e_fsid may be
+ *          filled in from the database
+ * @flname: name of the file the entry comes from, used in error messages
+ * @flline: line number within @flname, used in error messages
+ *
+ * REEXP_REMOTE_DEVFSID only checks that e_fsid or e_uuid is set. The two
+ * fsidnum modes consult the reexport database. Any other mode is accepted
+ * unchanged.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+int reexpdb_apply_reexport_settings(struct exportent *ep, char *flname, int flline)
+{
+	switch (ep->e_reexport) {
+	case REEXP_REMOTE_DEVFSID:
+		if (!ep->e_fsid && !ep->e_uuid) {
+			xlog(L_ERROR, "%s:%i: Selected 'reexport=' mode needs either a numerical or UUID 'fsid='\n",
+			     flname, flline);
+			return -1;
+		}
+		return 0;
+	case REEXP_AUTO_FSIDNUM:
+	case REEXP_PREDEFINED_FSIDNUM:
+		return reexpdb_resolve_fsidnum(ep, flname, flline);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * reexpdb_add_subvolume - Record @path in the subvolumes table.
+ *
+ * The statement is a REPLACE, so adding a path that is already present is
+ * not an error. The shared write lock is held for the whole operation.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int reexpdb_add_subvolume(char *path)
+{
+	sqlite3_stmt *stmt = NULL;
+	int ret;
+
+	reexpdb_wrlock();
+	ret = sqlite3_prepare_v2(db, add_crossed_volume_sql, sizeof(add_crossed_volume_sql), &stmt, NULL);
+	if (ret != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to prepare SQL query: %s", sqlite3_errstr(ret));
+		ret = -1;
+		goto out;
+	}
+
+	ret = sqlite3_bind_text(stmt, 1, path, -1, NULL);
+	if (ret != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to bind \"%s\" SQL query: %s", __func__, sqlite3_errstr(ret));
+		ret = -1;
+		goto out;
+	}
+
+	ret = sqlite3_step(stmt);
+	if (ret != SQLITE_DONE) {
+		xlog(L_WARNING, "Error while adding \"%s\" to database: %s", path, sqlite3_errstr(ret));
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+
+out:
+	reexpdb_unlock();
+	/* Finalizing a NULL statement is a harmless no-op. */
+	sqlite3_finalize(stmt);
+	return ret;
+}
+
+/*
+ * reexpdb_drop_subvolume_unlocked - Remove @path from the subvolumes table.
+ *
+ * This function does not take the shared database lock itself.
+ * NOTE(review): presumably the caller has to hold the write lock already,
+ * e.g. when called from a reexpdb_uncover_subvolumes() callback -- confirm.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int reexpdb_drop_subvolume_unlocked(char *path)
+{
+	sqlite3_stmt *del = NULL;
+	int status = -1;
+	int rc;
+
+	rc = sqlite3_prepare_v2(db, drop_crossed_volume_sql, sizeof(drop_crossed_volume_sql), &del, NULL);
+	if (rc != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to prepare SQL query: %s", sqlite3_errstr(rc));
+		goto done;
+	}
+
+	rc = sqlite3_bind_text(del, 1, path, -1, NULL);
+	if (rc != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to bind \"%s\" SQL query: %s", __func__, sqlite3_errstr(rc));
+		goto done;
+	}
+
+	rc = sqlite3_step(del);
+	if (rc == SQLITE_DONE)
+		status = 0;
+	else
+		xlog(L_WARNING, "Error while deleting \"%s\" from database: %s", path, sqlite3_errstr(rc));
+
+done:
+	sqlite3_finalize(del);
+	return status;
+}
+
+
+/*
+ * reexpdb_uncover_subvolumes - Visit every path stored in the subvolumes table.
+ *
+ * @cb: optional callback. If set, it is called with each path while the
+ *      shared write lock is held. If NULL, each path is passed to statfs()
+ *      under the read lock and the result is ignored.
+ *
+ * The string handed to @cb belongs to sqlite; @cb must not keep the pointer
+ * beyond the call.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int reexpdb_uncover_subvolumes(void (*cb)(char *path))
+{
+	sqlite3_stmt *stmt = NULL;
+	struct statfs st;
+	const unsigned char *path;
+	int ret;
+
+	if (cb)
+		reexpdb_wrlock();
+	else
+		reexpdb_rdlock();
+
+	ret = sqlite3_prepare_v2(db, get_crossed_volumes_sql, sizeof(get_crossed_volumes_sql), &stmt, NULL);
+	if (ret != SQLITE_OK) {
+		xlog(L_WARNING, "Unable to prepare SQL query: %s", sqlite3_errstr(ret));
+		ret = -1;
+		goto out;
+	}
+
+	while ((ret = sqlite3_step(stmt)) == SQLITE_ROW) {
+		path = sqlite3_column_text(stmt, 0);
+		/* sqlite3_column_text() yields NULL for a NULL value or on OOM. */
+		if (!path)
+			continue;
+
+		if (cb)
+			cb((char *)path);
+		else
+			statfs((char *)path, &st);
+	}
+
+	if (ret != SQLITE_DONE) {
+		xlog(L_WARNING, "Error while reading all subvolumes: %s", sqlite3_errstr(ret));
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+
+out:
+	/*
+	 * Every path that took the lock above leaves through here, including
+	 * a failed prepare; otherwise the shared lock would stay held and
+	 * block all other users of the database.
+	 */
+	sqlite3_finalize(stmt);
+	reexpdb_unlock();
+	return ret;
+}
diff --git a/support/reexport/reexport.h b/support/reexport/reexport.h
new file mode 100644
index 00000000..46ec8a96
--- /dev/null
+++ b/support/reexport/reexport.h
@@ -0,0 +1,53 @@ 
+#ifndef REEXPORT_H
+#define REEXPORT_H
+
+/* Needed for uint32_t so that this header can be included on its own. */
+#include <stdint.h>
+
+/* Only pointers to struct exportent are used here; the definition lives elsewhere. */
+struct exportent;
+
+/* Reexport modes, stored in the e_reexport member of struct exportent. */
+enum {
+	/* No reexport handling. */
+	REEXP_NONE = 0,
+	/* fsid numbers are taken from the reexport database and created on demand. */
+	REEXP_AUTO_FSIDNUM,
+	/* fsid numbers are taken from the reexport database but never created. */
+	REEXP_PREDEFINED_FSIDNUM,
+	/* The export has to carry a numerical or UUID fsid itself. */
+	REEXP_REMOTE_DEVFSID,
+};
+
+#ifdef HAVE_REEXPORT_SUPPORT
+/* Set up the shared lock and the database; returns 0 on success, -1 on failure. */
+int reexpdb_init(void);
+/* Undo reexpdb_init(). */
+void reexpdb_destroy(void);
+/* Look up (and optionally create) the fsid number of @path; non-zero if @fsidnum was set. */
+int reexpdb_fsidnum_by_path(char *path, uint32_t *fsidnum, int may_create);
+/* Validate/complete the fsid of @ep according to its reexport mode; 0 or -1. */
+int reexpdb_apply_reexport_settings(struct exportent *ep, char *flname, int flline);
+/* Record @path in the subvolumes table; 0 or -1. */
+int reexpdb_add_subvolume(char *path);
+/* Call @cb for (or, if @cb is NULL, statfs()) every recorded subvolume path; 0 or -1. */
+int reexpdb_uncover_subvolumes(void (*cb)(char *path));
+/* Remove @path from the subvolumes table without taking the database lock; 0 or -1. */
+int reexpdb_drop_subvolume_unlocked(char *path);
+#else
+/*
+ * Stubs used when reexport support is compiled out: they do nothing and
+ * report success, and the fsid lookup never finds an entry.
+ */
+static inline int reexpdb_init(void) { return 0; }
+static inline void reexpdb_destroy(void) {}
+static inline int reexpdb_fsidnum_by_path(char *path, uint32_t *fsidnum, int may_create)
+{
+	(void)path;
+	(void)may_create;
+	*fsidnum = 0;
+	return 0;
+}
+static inline int reexpdb_apply_reexport_settings(struct exportent *ep, char *flname, int flline)
+{
+	(void)ep;
+	(void)flname;
+	(void)flline;
+	return 0;
+}
+static inline int reexpdb_add_subvolume(char *path)
+{
+	(void)path;
+	return 0;
+}
+static inline int reexpdb_uncover_subvolumes(void (*cb)(char *path))
+{
+	(void)cb;
+	return 0;
+}
+static inline int reexpdb_drop_subvolume_unlocked(char *path)
+{
+	(void)path;
+	return 0;
+}
+#endif /* HAVE_REEXPORT_SUPPORT */
+
+#endif /* REEXPORT_H */