Message ID | 20190219115136.29952-5-boaz@plexistor.com (mailing list archive) |
---|---|
State | New, archived |
Series | zuf: ZUFS Zero-copy User-mode FileSystem |
On Tue, 2019-02-19 at 13:51 +0200, Boaz harrosh wrote: > NetApp Security WARNING: This is an external email. Do not click links or open > attachments unless you recognize the sender and know the content is safe. > > > > > From: Boaz Harrosh <boazh@netapp.com> > > zuf-core established the communication channels with the ZUS > User Mode Server. > > In this patch we have the core communication mechanics. > Which is the Novelty of this project. > (See previous submitted documentation for more info) > > Users will come later in the patchset > > Signed-off-by: Boaz Harrosh <boazh@netapp.com> > --- > fs/zuf/_extern.h | 22 + > fs/zuf/_pr.h | 4 + > fs/zuf/relay.h | 88 ++++ > fs/zuf/zuf-core.c | 1016 ++++++++++++++++++++++++++++++++++++++++++++- > fs/zuf/zuf-root.c | 7 + > fs/zuf/zuf.h | 46 ++ > fs/zuf/zus_api.h | 185 +++++++++ > 7 files changed, 1367 insertions(+), 1 deletion(-) > create mode 100644 fs/zuf/relay.h > > diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h > index 3bb9f1d9acf6..52bb6b9deafe 100644 > --- a/fs/zuf/_extern.h > +++ b/fs/zuf/_extern.h > @@ -28,10 +28,32 @@ struct dentry *zuf_mount(struct file_system_type *fs_type, > int flags, > const char *dev_name, void *data); > > /* zuf-core.c */ > +int zufc_zts_init(struct zuf_root_info *zri); /* Some private types in core > */ > +void zufc_zts_fini(struct zuf_root_info *zri); > + > long zufc_ioctl(struct file *filp, unsigned int cmd, ulong arg); > int zufc_release(struct inode *inode, struct file *file); > int zufc_mmap(struct file *file, struct vm_area_struct *vma); > > +int __zufc_dispatch_mount(struct zuf_root_info *zri, > + enum e_mount_operation op, > + struct zufs_ioc_mount *zim); > +int zufc_dispatch_mount(struct zuf_root_info *zri, struct zus_fs_info > *zus_zfi, > + enum e_mount_operation operation, > + struct zufs_ioc_mount *zim); > + > +const char *zuf_op_name(enum e_zufs_operation op); > +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo); > +static inline > +int zufc_dispatch(struct zuf_root_info *zri, struct zufs_ioc_hdr *hdr, > + struct page **pages, uint nump) > +{ > + struct zuf_dispatch_op zdo; > + > + zuf_dispatch_init(&zdo, hdr, pages, nump); > + return __zufc_dispatch(zri, &zdo); > +} > + > /* zuf-root.c */ > int zufr_register_fs(struct super_block *sb, struct zufs_ioc_register_fs > *rfs); > > diff --git a/fs/zuf/_pr.h b/fs/zuf/_pr.h > index 30b8cf912c1f..dc9f85453890 100644 > --- a/fs/zuf/_pr.h > +++ b/fs/zuf/_pr.h > @@ -39,5 +39,9 @@ > > /* ~~~ channel prints ~~~ */ > #define zuf_dbg_err(s, args ...) zuf_chan_debug("error", s, ##args) > +#define zuf_dbg_vfs(s, args ...) zuf_chan_debug("vfs ", s, ##args) > +#define zuf_dbg_core(s, args ...) zuf_chan_debug("core ", s, ##args) > +#define zuf_dbg_zus(s, args ...) zuf_chan_debug("zusdg", s, ##args) > +#define zuf_dbg_verbose(s, args ...) zuf_chan_debug("d-oto", s, ##args) > > #endif /* define __ZUF_PR_H__ */ > diff --git a/fs/zuf/relay.h b/fs/zuf/relay.h > new file mode 100644 > index 000000000000..a17d242b313a > --- /dev/null > +++ b/fs/zuf/relay.h > @@ -0,0 +1,88 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Relay scheduler-object Header file. > + * > + * Copyright (c) 2018 NetApp Inc. All rights reserved. > + * > + * ZUFS-License: GPL-2.0. See module.c for LICENSE details. 
> + * > + * Authors: > + * Boaz Harrosh <boazh@netapp.com> > + */ > + > +#ifndef __RELAY_H__ > +#define __RELAY_H__ > + > +/* ~~~~ Relay ~~~~ */ > +struct relay { > + wait_queue_head_t fss_wq; > + bool fss_wakeup; > + bool fss_waiting; > + > + wait_queue_head_t app_wq; > + bool app_wakeup; > + bool app_waiting; > + > + cpumask_t cpus_allowed; > +}; > + > +static inline void relay_init(struct relay *relay) > +{ > + init_waitqueue_head(&relay->fss_wq); > + init_waitqueue_head(&relay->app_wq); > +} > + > +static inline bool relay_is_app_waiting(struct relay *relay) > +{ > + return relay->app_waiting; > +} > + > +static inline void relay_app_wakeup(struct relay *relay) > +{ > + relay->app_waiting = false; > + > + relay->app_wakeup = true; > + wake_up(&relay->app_wq); > +} > + > +static inline int relay_fss_wait(struct relay *relay) > +{ > + int err; > + > + relay->fss_waiting = true; > + relay->fss_wakeup = false; > + err = wait_event_interruptible(relay->fss_wq, relay->fss_wakeup); > + > + return err; Could you just do: "return wait_event_interruptible()" directly, instead of using the err variable? > +} > + > +static inline bool relay_is_fss_waiting_grab(struct relay *relay) > +{ > + if (relay->fss_waiting) { > + relay->fss_waiting = false; > + return true; > + } > + return false; > +} > + > +static inline void relay_fss_wakeup(struct relay *relay) > +{ > + relay->fss_wakeup = true; > + wake_up(&relay->fss_wq); > +} > + > +static inline void relay_fss_wakeup_app_wait(struct relay *relay, > + spinlock_t *spinlock) > +{ > + relay->app_waiting = true; > + > + relay_fss_wakeup(relay); > + > + relay->app_wakeup = false; > + if (spinlock) > + spin_unlock(spinlock); > + > + wait_event(relay->app_wq, relay->app_wakeup); > +} > + > +#endif /* ifndef __RELAY_H__ */ > diff --git a/fs/zuf/zuf-core.c b/fs/zuf/zuf-core.c > index e12cae584f8a..95582c0a4ba5 100644 > --- a/fs/zuf/zuf-core.c > +++ b/fs/zuf/zuf-core.c > @@ -18,14 +18,820 @@ > #include <linux/delay.h> > #include <linux/pfn_t.h> > #include <linux/sched/signal.h> > +#include <linux/uaccess.h> > > #include "zuf.h" > > +struct zufc_thread { > + struct zuf_special_file hdr; > + struct relay relay; > + struct vm_area_struct *vma; > + int no; > + int chan; > + > + /* Kernel side allocated IOCTL buffer */ > + struct vm_area_struct *opt_buff_vma; > + void *opt_buff; > + ulong max_zt_command; > + > + /* Next operation*/ > + struct zuf_dispatch_op *zdo; > +}; > + > +enum { INITIAL_ZT_CHANNELS = 3 }; > + > +struct zuf_threads_pool { > + uint _max_zts; > + uint _max_channels; > + /* array of pcp_arrays */ > + struct zufc_thread *_all_zt[ZUFS_MAX_ZT_CHANNELS]; > +}; > + > +static int _alloc_zts_channel(struct zuf_root_info *zri, int channel) > +{ > + zri->_ztp->_all_zt[channel] = alloc_percpu(struct zufc_thread); > + if (unlikely(!zri->_ztp->_all_zt[channel])) { > + zuf_err("!!! 
alloc_percpu channel=%d failed\n", channel); > + return -ENOMEM; > + } > + return 0; > +} > + > +static inline ulong _zt_pr_no(struct zufc_thread *zt) > +{ > + /* So in hex it will be channel as first nibble and cpu as 3rd and on > */ > + return ((ulong)zt->no << 8) | zt->chan; > +} > + > +int zufc_zts_init(struct zuf_root_info *zri) > +{ > + int c; > + > + zri->_ztp = kcalloc(1, sizeof(struct zuf_threads_pool), GFP_KERNEL); > + if (unlikely(!zri->_ztp)) > + return -ENOMEM; > + > + zri->_ztp->_max_zts = num_online_cpus(); > + zri->_ztp->_max_channels = INITIAL_ZT_CHANNELS; > + > + for (c = 0; c < INITIAL_ZT_CHANNELS; ++c) { > + int err = _alloc_zts_channel(zri, c); > + > + if (unlikely(err)) > + return err; > + } > + > + return 0; > +} > + > +void zufc_zts_fini(struct zuf_root_info *zri) > +{ > + int c; > + > + /* Always safe/must call zufc_zts_fini */ > + if (!zri->_ztp) > + return; > + > + for (c = 0; c < zri->_ztp->_max_channels; ++c) { > + if (zri->_ztp->_all_zt[c]) > + free_percpu(zri->_ztp->_all_zt[c]); > + } > + kfree(zri->_ztp); > + zri->_ztp = NULL; > +} > + > +static struct zufc_thread *_zt_from_cpu(struct zuf_root_info *zri, > + int cpu, uint chan) > +{ > + return per_cpu_ptr(zri->_ztp->_all_zt[chan], cpu); > +} > + > +static int _zt_from_f(struct file *filp, int cpu, uint chan, > + struct zufc_thread **ztp) > +{ > + *ztp = _zt_from_cpu(ZRI(filp->f_inode->i_sb), cpu, chan); > + if (unlikely(!*ztp)) > + return -ERANGE; > + return 0; I'm curious if there is a reason you did it this way instead of making use of the ERR_PTR() macro to return ztp directly? > +} > + > +static int _zu_register_fs(struct file *file, void *parg) > +{ > + struct zufs_ioc_register_fs rfs; > + int err; > + > + err = copy_from_user(&rfs, parg, sizeof(rfs)); > + if (unlikely(err)) { > + zuf_err("=>%d\n", err); > + return err; > + } > + > + err = zufr_register_fs(file->f_inode->i_sb, &rfs); > + if (err) > + zuf_err("=>%d\n", err); > + err = put_user(err, (int *)parg); > + return err; > +} > + > +/* ~~~~ mounting ~~~~*/ > +int __zufc_dispatch_mount(struct zuf_root_info *zri, > + enum e_mount_operation operation, > + struct zufs_ioc_mount *zim) > +{ > + zim->hdr.operation = operation; > + > + for (;;) { > + bool fss_waiting; > + > + spin_lock(&zri->mount.lock); > + > + if (unlikely(!zri->mount.zsf.file)) { > + spin_unlock(&zri->mount.lock); > + zuf_err("Server not up\n"); > + zim->hdr.err = -EIO; > + return zim->hdr.err; > + } > + > + fss_waiting = relay_is_fss_waiting_grab(&zri->mount.relay); > + if (fss_waiting) > + break; > + /* in case of break above spin_unlock is done inside > + * relay_fss_wakeup_app_wait > + */ > + > + spin_unlock(&zri->mount.lock); > + > + /* It is OK to wait if user storms mounts */ > + zuf_dbg_verbose("waiting\n"); > + msleep(100); > + } > + > + zri->mount.zim = zim; > + relay_fss_wakeup_app_wait(&zri->mount.relay, &zri->mount.lock); > + > + return zim->hdr.err; > +} > + > +int zufc_dispatch_mount(struct zuf_root_info *zri, struct zus_fs_info > *zus_zfi, > + enum e_mount_operation operation, > + struct zufs_ioc_mount *zim) > +{ > + zim->hdr.out_len = sizeof(*zim); > + zim->hdr.in_len = sizeof(*zim); > + if (operation == ZUFS_M_MOUNT || operation == ZUFS_M_REMOUNT) > + zim->hdr.in_len += zim->zmi.po.mount_options_len; > + zim->zmi.zus_zfi = zus_zfi; > + zim->zmi.num_cpu = zri->_ztp->_max_zts; > + zim->zmi.num_channels = zri->_ztp->_max_channels; > + > + return __zufc_dispatch_mount(zri, operation, zim); > +} > + > +static int _zu_mount(struct file *file, void *parg) > +{ > + struct 
super_block *sb = file->f_inode->i_sb; > + struct zuf_root_info *zri = ZRI(sb); > + bool waiting_for_reply; > + struct zufs_ioc_mount *zim; > + ulong cp_ret; > + int err; > + > + spin_lock(&zri->mount.lock); > + > + if (unlikely(!file->private_data)) { > + /* First time register this file as the mount-thread owner */ > + zri->mount.zsf.type = zlfs_e_mout_thread; > + zri->mount.zsf.file = file; > + file->private_data = &zri->mount.zsf; > + } else if (unlikely(file->private_data != &zri->mount)) { > + spin_unlock(&zri->mount.lock); > + zuf_err("Say what?? %p != %p\n", > + file->private_data, &zri->mount); > + return -EIO; > + } > + > + zim = zri->mount.zim; > + zri->mount.zim = NULL; > + waiting_for_reply = zim && relay_is_app_waiting(&zri->mount.relay); > + > + spin_unlock(&zri->mount.lock); > + > + if (waiting_for_reply) { > + cp_ret = copy_from_user(zim, parg, zim->hdr.out_len); > + if (unlikely(cp_ret)) { > + zuf_err("copy_from_user => %ld\n", cp_ret); > + zim->hdr.err = -EFAULT; > + } > + > + relay_app_wakeup(&zri->mount.relay); > + } > + > + /* This gets to sleep until a mount comes */ > + err = relay_fss_wait(&zri->mount.relay); > + if (unlikely(err || !zri->mount.zim)) { > + struct zufs_ioc_hdr *hdr = parg; > + > + /* Released by _zu_break INTER or crash */ > + zuf_dbg_zus("_zu_break? %p => %d\n", zri->mount.zim, err); > + put_user(ZUFS_OP_BREAK, &hdr->operation); > + put_user(EIO, &hdr->err); > + return err; > + } > + > + zim = zri->mount.zim; > + cp_ret = copy_to_user(parg, zim, zim->hdr.in_len); > + if (unlikely(cp_ret)) { > + err = -EFAULT; > + zuf_err("copy_to_user =>%ld\n", cp_ret); > + } > + return err; > +} > + > +static void zufc_mounter_release(struct file *file) > +{ > + struct zuf_root_info *zri = ZRI(file->f_inode->i_sb); > + > + zuf_dbg_zus("closed fu=%d au=%d fw=%d aw=%d\n", > + zri->mount.relay.fss_wakeup, zri->mount.relay.app_wakeup, > + zri->mount.relay.fss_waiting, zri->mount.relay.app_waiting); > + > + spin_lock(&zri->mount.lock); > + zri->mount.zsf.file = NULL; > + if (relay_is_app_waiting(&zri->mount.relay)) { > + zuf_err("server emergency exit while IO\n"); > + > + if (zri->mount.zim) > + zri->mount.zim->hdr.err = -EIO; > + spin_unlock(&zri->mount.lock); > + > + relay_app_wakeup(&zri->mount.relay); > + msleep(1000); /* crap */ > + } else { > + if (zri->mount.zim) > + zri->mount.zim->hdr.err = 0; > + spin_unlock(&zri->mount.lock); > + } > +} > + > +/* ~~~~ ZU_IOC_NUMA_MAP ~~~~ */ > +static int _zu_numa_map(struct file *file, void *parg) > +{ > + struct zufs_ioc_numa_map *numa_map; > + int n_nodes = num_online_nodes(); > + int n_cpus = num_online_cpus(); > + uint *nodes_cpu_count; > + uint max_cpu_per_node = 0; > + uint alloc_size; > + int cpu, i, err; > + > + alloc_size = sizeof(*numa_map) + n_cpus; /* char per cpu */ > + > + if ((n_nodes > 255) || (alloc_size > PAGE_SIZE)) { > + zuf_warn("!!!unexpected big machine with %d nodes > alloc_size=0x%x\n", > + n_nodes, alloc_size); > + return -ENOTSUPP; > + } > + > + nodes_cpu_count = kcalloc(n_nodes, sizeof(uint), GFP_KERNEL); > + if (unlikely(!nodes_cpu_count)) > + return -ENOMEM; > + > + numa_map = kzalloc(alloc_size, GFP_KERNEL); > + if (unlikely(!numa_map)) { > + err = -ENOMEM; > + goto out; > + } > + > + numa_map->possible_nodes = num_possible_nodes(); > + numa_map->possible_cpus = num_possible_cpus(); > + > + numa_map->online_nodes = n_nodes; > + numa_map->online_cpus = n_cpus; > + > + for_each_cpu(cpu, cpu_online_mask) { > + uint ctn = cpu_to_node(cpu); > + uint ncc = ++nodes_cpu_count[ctn]; > + > + 
numa_map->cpu_to_node[cpu] = ctn; > + max_cpu_per_node = max(max_cpu_per_node, ncc); > + } > + > + for (i = 1; i < n_nodes; ++i) { > + if (nodes_cpu_count[i] != nodes_cpu_count[0]) { > + zuf_info("@[%d]=%d Unbalanced CPU sockets @[0]=%d\n", > + i, nodes_cpu_count[i], nodes_cpu_count[0]); > + numa_map->nodes_not_symmetrical = true; > + break; > + } > + } > + > + numa_map->max_cpu_per_node = max_cpu_per_node; > + > + zuf_dbg_verbose( > + "possible_nodes=%d possible_cpus=%d online_nodes=%d > online_cpus=%d\n", > + numa_map->possible_nodes, numa_map->possible_cpus, > + n_nodes, n_cpus); > + > + err = copy_to_user(parg, numa_map, alloc_size); > + kfree(numa_map); > +out: > + kfree(nodes_cpu_count); > + return err; > +} > + > +static int _map_pages(struct zufc_thread *zt, struct page **pages, uint nump, > + bool map_readonly) > +{ > + int p, err; > + > + if (!(zt->vma && pages && nump)) > + return 0; > + > + for (p = 0; p < nump; ++p) { > + ulong zt_addr = zt->vma->vm_start + p * PAGE_SIZE; > + ulong pfn = page_to_pfn(pages[p]); > + pfn_t pfnt = phys_to_pfn_t(PFN_PHYS(pfn), PFN_MAP | PFN_DEV); > + vm_fault_t flt; > + > + if (map_readonly) > + flt = vmf_insert_mixed(zt->vma, zt_addr, pfnt); > + else > + flt = vmf_insert_mixed_mkwrite(zt->vma, zt_addr, > pfnt); > + err = zuf_flt_to_err(flt); > + if (unlikely(err)) { > + zuf_err("zuf: remap_pfn_range => %d p=0x%x > start=0x%lx\n", > + err, p, zt->vma->vm_start); > + return err; > + } > + } > + return 0; > +} > + > +static void _unmap_pages(struct zufc_thread *zt, struct page **pages, uint > nump) > +{ > + if (!(zt->vma && zt->zdo && pages && nump)) > + return; > + > + zt->zdo->pages = NULL; > + zt->zdo->nump = 0; > + > + zap_vma_ptes(zt->vma, zt->vma->vm_start, nump * PAGE_SIZE); > +} > + > +static void _fill_buff(ulong *buff, uint size) > +{ > + ulong *buff_end = buff + size; > + ulong val = 0; > + > + for (; buff < buff_end; ++buff, ++val) > + *buff = val; > +} > + > +static int _zu_init(struct file *file, void *parg) > +{ > + struct zufc_thread *zt; > + int cpu = smp_processor_id(); > + struct zufs_ioc_init zi_init; > + int err; > + > + err = copy_from_user(&zi_init, parg, sizeof(zi_init)); > + if (unlikely(err)) { > + zuf_err("=>%d\n", err); > + return err; > + } > + if (unlikely(zi_init.channel_no >= ZUFS_MAX_ZT_CHANNELS)) { > + zuf_err("[%d] channel_no=%d\n", cpu, zi_init.channel_no); > + return -EINVAL; > + } > + > + zuf_dbg_zus("[%d] aff=0x%lx channel=%d\n", > + cpu, zi_init.affinity, zi_init.channel_no); > + > + zi_init.hdr.err = _zt_from_f(file, cpu, zi_init.channel_no, &zt); > + if (unlikely(zi_init.hdr.err)) { > + zuf_err("=>%d\n", err); > + goto out; > + } > + > + if (unlikely(zt->hdr.file)) { > + zi_init.hdr.err = -EINVAL; > + zuf_err("[%d] !!! 
thread already set\n", cpu); > + goto out; > + } > + > + relay_init(&zt->relay); > + zt->hdr.type = zlfs_e_zt; > + zt->hdr.file = file; > + zt->no = cpu; > + zt->chan = zi_init.channel_no; > + > + zt->max_zt_command = zi_init.max_command; > + zt->opt_buff = vmalloc(zi_init.max_command); > + if (unlikely(!zt->opt_buff)) { > + zi_init.hdr.err = -ENOMEM; > + goto out; > + } > + _fill_buff(zt->opt_buff, zi_init.max_command / sizeof(ulong)); > + > + file->private_data = &zt->hdr; > +out: > + err = copy_to_user(parg, &zi_init, sizeof(zi_init)); > + if (err) > + zuf_err("=>%d\n", err); > + return err; > +} > + > +struct zufc_thread *_zt_from_f_private(struct file *file) > +{ > + struct zuf_special_file *zsf = file->private_data; > + > + WARN_ON(zsf->type != zlfs_e_zt); > + return container_of(zsf, struct zufc_thread, hdr); > +} > + > +/* Caller checks that file->private_data != NULL */ > +static void zufc_zt_release(struct file *file) > +{ > + struct zufc_thread *zt = _zt_from_f_private(file); > + > + if (unlikely(zt->hdr.file != file)) > + zuf_err("What happened zt->file(%p) != file(%p)\n", > + zt->hdr.file, file); > + > + zuf_dbg_zus("[%d] closed fu=%d au=%d fw=%d aw=%d\n", > + zt->no, zt->relay.fss_wakeup, zt->relay.app_wakeup, > + zt->relay.fss_waiting, zt->relay.app_waiting); > + > + if (relay_is_app_waiting(&zt->relay)) { > + zuf_err("server emergency exit while IO\n"); > + > + /* NOTE: Do not call _unmap_pages the vma is gone */ > + zt->hdr.file = NULL; > + > + relay_app_wakeup(&zt->relay); > + msleep(1000); /* crap */ > + } > + > + vfree(zt->opt_buff); > + memset(zt, 0, sizeof(*zt)); > +} > + > +static int _copy_outputs(struct zufc_thread *zt, void *arg) > +{ > + struct zufs_ioc_hdr *hdr = zt->zdo->hdr; > + struct zufs_ioc_hdr *user_hdr = zt->opt_buff; > + > + if (zt->opt_buff_vma->vm_start != (ulong)arg) { > + zuf_err("malicious Server\n"); > + return -EINVAL; > + } > + > + /* Update on the user out_len and return-code */ > + hdr->err = user_hdr->err; > + hdr->out_len = user_hdr->out_len; > + > + if (!hdr->out_len) > + return 0; > + > + if ((hdr->err == -EZUFS_RETRY) || (hdr->out_max < hdr->out_len)) { > + if (WARN_ON(!zt->zdo->oh)) { > + zuf_err("Trouble op(%s) out_max=%d out_len=%d\n", > + zuf_op_name(hdr->operation), > + hdr->out_max, hdr->out_len); > + return -EFAULT; > + } > + zuf_dbg_zus("[%s] %d %d => %d\n", > + zuf_op_name(hdr->operation), > + hdr->out_max, hdr->out_len, hdr->err); > + return zt->zdo->oh(zt->zdo, zt->opt_buff, zt->max_zt_command); > + } else { > + void *rply = (void *)hdr + hdr->out_start; > + void *from = zt->opt_buff + hdr->out_start; > + > + memcpy(rply, from, hdr->out_len); > + return 0; > + } > +} > + > +static int _zu_wait(struct file *file, void *parg) > +{ > + struct zufc_thread *zt; > + int err; > + > + zt = _zt_from_f_private(file); > + if (unlikely(!zt)) { > + zuf_err("Unexpected ZT state\n"); > + err = -ERANGE; > + goto err; > + } > + > + if (!zt->hdr.file || file != zt->hdr.file) { > + zuf_err("fatal\n"); > + err = -E2BIG; > + goto err; > + } > + if (unlikely((ulong)parg != zt->opt_buff_vma->vm_start)) { > + zuf_err("fatal 2\n"); > + err = -EINVAL; > + goto err; > + } > + > + if (relay_is_app_waiting(&zt->relay)) { > + if (unlikely(!zt->zdo)) { > + zuf_err("User has gone...\n"); > + err = -E2BIG; > + goto err; > + } else { > + /* overflow_handler might decide to execute the > + *parg here at zus context and return to server > + * If it also has an error to report to zus it > + * will set zdo->hdr->err. > + * EZUS_RETRY_DONE is when that happens. 
> + * In this case pages stay mapped in zt->vma > + */ > + err = _copy_outputs(zt, parg); > + if (err == EZUF_RETRY_DONE) { > + put_user(zt->zdo->hdr->err, (int *)parg); > + return 0; > + } > + > + _unmap_pages(zt, zt->zdo->pages, zt->zdo->nump); > + zt->zdo = NULL; > + if (unlikely(err)) /* _copy_outputs returned an err */ > + goto err; > + } > + relay_app_wakeup(&zt->relay); > + } > + > + err = relay_fss_wait(&zt->relay); > + if (err) > + zuf_dbg_err("[%d] relay error: %d\n", zt->no, err); > + > + if (zt->zdo && zt->zdo->hdr && > + zt->zdo->hdr->operation < ZUFS_OP_BREAK) { > + /* call map here at the zuf thread so we need no locks > + * TODO: Currently only ZUFS_OP_WRITE protects user-buffers > + * we should have a bit set in zt->zdo->hdr set per operation. > + * TODO: Why this does not work? > + */ > + _map_pages(zt, zt->zdo->pages, zt->zdo->nump, 0); > + memcpy(zt->opt_buff, zt->zdo->hdr, zt->zdo->hdr->in_len); > + } else { > + struct zufs_ioc_hdr *hdr = zt->opt_buff; > + > + /* This Means we were released by _zu_break */ > + zuf_dbg_zus("_zu_break? => %d\n", err); > + hdr->operation = ZUFS_OP_BREAK; > + hdr->err = err; > + } > + > + return err; > + > +err: > + put_user(err, (int *)parg); > + return err; > +} > + > +static int _try_grab_zt_channel(struct zuf_root_info *zri, int cpu, > + struct zufc_thread **ztp) > +{ > + struct zufc_thread *zt; > + int c; > + > + for (c = 0; ; ++c) { > + zt = _zt_from_cpu(zri, cpu, c); > + if (unlikely(!zt || !zt->hdr.file)) > + break; > + > + if (relay_is_fss_waiting_grab(&zt->relay)) { > + *ztp = zt; > + return true; > + } > + } > + > + *ztp = _zt_from_cpu(zri, cpu, 0); > + return false; > +} > + > +#define _zuf_get_cpu() get_cpu() > +#define _zuf_put_cpu() put_cpu() > + > +#ifdef CONFIG_ZUF_DEBUG > +static > +int _r_zufs_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo) > +#else > +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo) > +#endif > +{ > + struct task_struct *app = get_current(); > + struct zufs_ioc_hdr *hdr = zdo->hdr; > + int cpu, cpu2; > + struct zufc_thread *zt; > + > + if (unlikely(hdr->out_len && !hdr->out_max)) { > + /* TODO: Complain here and let caller code do this proper */ > + hdr->out_max = hdr->out_len; > + } > + > +channel_busy: > + cpu = _zuf_get_cpu(); > + > + if (!_try_grab_zt_channel(zri, cpu, &zt)) { > + _zuf_put_cpu(); > + > + /* If channel was grabbed then maybe a break_all is in > progress > + * on a different CPU make sure zt->file on this core is > + * updated > + */ > + mb(); > + if (unlikely(!zt->hdr.file)) { > + zuf_err("[%d] !zt->file\n", cpu); > + return -EIO; > + } > + zuf_dbg_err("[%d] can this be\n", cpu); > + /* FIXME: Do something much smarter */ > + msleep(10); > + if (signal_pending(get_current())) { > + zuf_dbg_err("[%d] => EINTR\n", cpu); > + return -EINTR; > + } > + goto channel_busy; > + } > + > + /* lock app to this cpu while waiting */ > + cpumask_copy(&zt->relay.cpus_allowed, &app->cpus_allowed); > + cpumask_copy(&app->cpus_allowed, cpumask_of(smp_processor_id())); > + > + zt->zdo = zdo; > + > + _zuf_put_cpu(); > + > + relay_fss_wakeup_app_wait(&zt->relay, NULL); > + > + /* restore cpu affinity after wakeup */ > + cpumask_copy(&app->cpus_allowed, &zt->relay.cpus_allowed); > + > +cpu2 = smp_processor_id(); > +if (cpu2 != cpu) > + zuf_warn("App switched cpu1=%u cpu2=%u\n", cpu, cpu2); > + > + return zt->hdr.file ? 
hdr->err : -EIO; > +} > + > +const char *zuf_op_name(enum e_zufs_operation op) > +{ > +#define CASE_ENUM_NAME(e) case e: return #e > + switch (op) { > + CASE_ENUM_NAME(ZUFS_OP_BREAK ); > + default: > + return "UNKNOWN"; > + } > +} > + > +#ifdef CONFIG_ZUF_DEBUG > + > +#define MAX_ZT_SEC 5 > +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo) > +{ > + u64 t1, t2; > + int err; > + > + t1 = ktime_get_ns(); > + err = _r_zufs_dispatch(zri, zdo); > + t2 = ktime_get_ns(); > + > + if ((t2 - t1) > MAX_ZT_SEC * NSEC_PER_SEC) > + zuf_err("zufc_dispatch(%s, [0x%x-0x%x]) took %lld sec\n", > + zuf_op_name(zdo->hdr->operation), zdo->hdr->offset, > + zdo->hdr->len, > + (t2 - t1) / NSEC_PER_SEC); > + > + return err; > +} > +#endif /* def CONFIG_ZUF_DEBUG */ > + > +/* ~~~ iomap_exec && exec_buffer allocation ~~~ */ > +struct zu_exec_buff { > + struct zuf_special_file hdr; > + struct vm_area_struct *vma; > + void *opt_buff; > + ulong alloc_size; > +}; > + > +/* Do some common checks and conversions */ > +static inline struct zu_exec_buff *_ebuff_from_file(struct file *file) > +{ > + struct zu_exec_buff *ebuff = file->private_data; > + > + if (WARN_ON_ONCE(ebuff->hdr.type != zlfs_e_dpp_buff)) { > + zuf_err("Must call ZU_IOC_ALLOC_BUFFER first\n"); > + return NULL; > + } > + > + if (WARN_ON_ONCE(ebuff->hdr.file != file)) > + return NULL; > + > + return ebuff; > +} > + > +static int _zu_ebuff_alloc(struct file *file, void *arg) > +{ > + struct zufs_ioc_alloc_buffer ioc_alloc; > + struct zu_exec_buff *ebuff; > + int err; > + > + err = copy_from_user(&ioc_alloc, arg, sizeof(ioc_alloc)); > + if (unlikely(err)) { > + zuf_err("=>%d\n", err); > + return err; > + } > + > + if (ioc_alloc.init_size > ioc_alloc.max_size) > + return -EINVAL; > + > + /* TODO: Easily Support growing */ > + /* TODO: Support global pools, also easy */ > + if (ioc_alloc.pool_no || ioc_alloc.init_size != ioc_alloc.max_size) > + return -ENOTSUPP; > + > + ebuff = kzalloc(sizeof(*ebuff), GFP_KERNEL); > + if (unlikely(!ebuff)) > + return -ENOMEM; > + > + ebuff->hdr.type = zlfs_e_dpp_buff; > + ebuff->hdr.file = file; > + i_size_write(file->f_inode, ioc_alloc.max_size); > + ebuff->alloc_size = ioc_alloc.init_size; > + ebuff->opt_buff = vmalloc(ioc_alloc.init_size); > + if (unlikely(!ebuff->opt_buff)) { > + kfree(ebuff); > + return -ENOMEM; > + } > + _fill_buff(ebuff->opt_buff, ioc_alloc.init_size / sizeof(ulong)); > + > + file->private_data = &ebuff->hdr; > + return 0; > +} > + > +static void zufc_ebuff_release(struct file *file) > +{ > + struct zu_exec_buff *ebuff = _ebuff_from_file(file); > + > + if (unlikely(!ebuff)) > + return; > + > + vfree(ebuff->opt_buff); > + ebuff->hdr.type = 0; > + ebuff->hdr.file = NULL; /* for none-dbg Kernels && use-after-free */ > + kfree(ebuff); > +} > + > +static int _zu_break(struct file *filp, void *parg) > +{ > + struct zuf_root_info *zri = ZRI(filp->f_inode->i_sb); > + int i, c; > + > + zuf_dbg_core("enter\n"); > + mb(); /* TODO how to schedule on all CPU's */ > + > + for (i = 0; i < zri->_ztp->_max_zts; ++i) { > + for (c = 0; c < zri->_ztp->_max_channels; ++c) { > + struct zufc_thread *zt = _zt_from_cpu(zri, i, c); > + > + if (unlikely(!(zt && zt->hdr.file))) > + continue; > + relay_fss_wakeup(&zt->relay); > + } > + } > + > + if (zri->mount.zsf.file) > + relay_fss_wakeup(&zri->mount.relay); > + > + zuf_dbg_core("exit\n"); > + return 0; > +} > + > long zufc_ioctl(struct file *file, unsigned int cmd, ulong arg) > { > + void __user *parg = (void __user *)arg; > + > switch (cmd) { > + case 
ZU_IOC_REGISTER_FS: > + return _zu_register_fs(file, parg); > + case ZU_IOC_MOUNT: > + return _zu_mount(file, parg); > + case ZU_IOC_NUMA_MAP: > + return _zu_numa_map(file, parg); > + case ZU_IOC_INIT_THREAD: > + return _zu_init(file, parg); > + case ZU_IOC_WAIT_OPT: > + return _zu_wait(file, parg); > + case ZU_IOC_ALLOC_BUFFER: > + return _zu_ebuff_alloc(file, parg); > + case ZU_IOC_BREAK_ALL: > + return _zu_break(file, parg); > default: > - zuf_err("%d\n", cmd); > + zuf_err("%d %ld\n", cmd, ZU_IOC_WAIT_OPT); > return -ENOTTY; > } > } > @@ -38,11 +844,215 @@ int zufc_release(struct inode *inode, struct file *file) > return 0; > > switch (zsf->type) { > + case zlfs_e_zt: > + zufc_zt_release(file); > + return 0; > + case zlfs_e_mout_thread: > + zufc_mounter_release(file); > + return 0; > + case zlfs_e_pmem: > + /* NOTHING to clean for pmem file yet */ > + /* zuf_pmem_release(file);*/ > + return 0; > + case zlfs_e_dpp_buff: > + zufc_ebuff_release(file); > + return 0; > default: > return 0; > } > } > > +/* ~~~~ mmap area of app buffers into server ~~~~ */ > + > +static int zuf_zt_fault(struct vm_fault *vmf) > +{ > + zuf_err("should not fault\n"); > + return VM_FAULT_SIGBUS; > +} > + > +static const struct vm_operations_struct zuf_vm_ops = { > + .fault = zuf_zt_fault, > +}; > + > +static int _zufc_zt_mmap(struct file *file, struct vm_area_struct *vma, > + struct zufc_thread *zt) > +{ > + /* Tell Kernel We will only access on a single core */ > + vma->vm_flags |= VM_MIXEDMAP; > + vma->vm_ops = &zuf_vm_ops; > + > + zt->vma = vma; > + > + zuf_dbg_core( > + "[0x%lx] start=0x%lx end=0x%lx flags=0x%lx file- > start=0x%lx\n", > + _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_flags, > + vma->vm_pgoff); > + > + return 0; > +} > + > +/* ~~~~ mmap the Kernel allocated IOCTL buffer per ZT ~~~~ */ > +static int _opt_buff_mmap(struct vm_area_struct *vma, void *opt_buff, > + ulong opt_size) > +{ > + ulong offset; > + > + if (!opt_buff) > + return -ENOMEM; > + > + for (offset = 0; offset < opt_size; offset += PAGE_SIZE) { > + ulong addr = vma->vm_start + offset; > + ulong pfn = vmalloc_to_pfn(opt_buff + offset); > + pfn_t pfnt = phys_to_pfn_t(PFN_PHYS(pfn), PFN_MAP | PFN_DEV); > + int err; > + > + zuf_dbg_verbose("[0x%lx] pfn-0x%lx addr=0x%lx buff=0x%lx\n", > + offset, pfn, addr, (ulong)opt_buff + offset); > + > + err = zuf_flt_to_err(vmf_insert_mixed_mkwrite(vma, addr, > pfnt)); > + if (unlikely(err)) { > + zuf_err("zuf: zuf_insert_mixed_mkwrite => %d > offset=0x%lx addr=0x%lx\n", > + err, offset, addr); > + return err; > + } > + } > + > + return 0; > +} > + > +static int zuf_obuff_fault(struct vm_fault *vmf) > +{ > + struct vm_area_struct *vma = vmf->vma; > + struct zufc_thread *zt = _zt_from_f_private(vma->vm_file); > + long offset = (vmf->pgoff << PAGE_SHIFT) - ZUS_API_MAP_MAX_SIZE; > + int err; > + > + zuf_dbg_core( > + "[0x%lx] start=0x%lx end=0x%lx file-start=0x%lx > offset=0x%lx\n", > + _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_pgoff, > + offset); > + > + /* if Server overruns its buffer crash it dead */ > + if (unlikely((offset < 0) || (zt->max_zt_command < offset))) { > + zuf_err("[0x%lx] start=0x%lx end=0x%lx file-start=0x%lx > offset=0x%lx\n", > + _zt_pr_no(zt), vma->vm_start, > + vma->vm_end, vma->vm_pgoff, offset); > + return VM_FAULT_SIGBUS; > + } > + > + /* We never released a zus-core.c that does not fault the > + * first page first. 
I want to see if this happens > + */ > + if (unlikely(offset)) > + zuf_warn("Suspicious server activity\n"); > + > + /* This faults only once at very first access */ > + err = _opt_buff_mmap(vma, zt->opt_buff, zt->max_zt_command); > + if (unlikely(err)) > + return VM_FAULT_SIGBUS; > + > + return VM_FAULT_NOPAGE; > +} > + > +static const struct vm_operations_struct zuf_obuff_ops = { > + .fault = zuf_obuff_fault, > +}; > + > +static int _zufc_obuff_mmap(struct file *file, struct vm_area_struct *vma, > + struct zufc_thread *zt) > +{ > + vma->vm_flags |= VM_MIXEDMAP; > + vma->vm_ops = &zuf_obuff_ops; > + > + zt->opt_buff_vma = vma; > + > + zuf_dbg_core( > + "[0x%lx] start=0x%lx end=0x%lx flags=0x%lx file- > start=0x%lx\n", > + _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_flags, > + vma->vm_pgoff); > + > + return 0; > +} > + > +/* ~~~ */ > + > +static int zufc_zt_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + struct zufc_thread *zt = _zt_from_f_private(file); > + > + /* We have two areas of mmap in this special file. > + * 0 to ZUS_API_MAP_MAX_SIZE: > + * The first part where app pages are mapped > + * into server per operation. > + * ZUS_API_MAP_MAX_SIZE of size zuf_root_info->max_zt_command > + * Is where we map the per ZT ioctl-buffer, later passed > + * to the zus_ioc_wait IOCTL call > + */ > + if (vma->vm_pgoff == ZUS_API_MAP_MAX_SIZE / PAGE_SIZE) > + return _zufc_obuff_mmap(file, vma, zt); > + > + /* zuf ZT API is very particular about where in its > + * special file we communicate > + */ > + if (unlikely(vma->vm_pgoff)) > + return -EINVAL; > + > + return _zufc_zt_mmap(file, vma, zt); > +} > + > +/* ~~~~ Implementation of the ZU_IOC_ALLOC_BUFFER mmap facility ~~~~ */ > + > +static int zuf_ebuff_fault(struct vm_fault *vmf) > +{ > + struct vm_area_struct *vma = vmf->vma; > + struct zu_exec_buff *ebuff = _ebuff_from_file(vma->vm_file); > + long offset = (vmf->pgoff << PAGE_SHIFT); > + int err; > + > + zuf_dbg_core("start=0x%lx end=0x%lx file-start=0x%lx file- > off=0x%lx\n", > + vma->vm_start, vma->vm_end, vma->vm_pgoff, offset); > + > + /* if Server overruns its buffer crash it dead */ > + if (unlikely((offset < 0) || (ebuff->alloc_size < offset))) { > + zuf_err("start=0x%lx end=0x%lx file-start=0x%lx file- > off=0x%lx\n", > + vma->vm_start, vma->vm_end, vma->vm_pgoff, > + offset); > + return VM_FAULT_SIGBUS; > + } > + > + /* We never released a zus-core.c that does not fault the > + * first page first. 
I want to see if this happens > + */ > + if (unlikely(offset)) > + zuf_warn("Suspicious server activity\n"); > + > + /* This faults only once at very first access */ > + err = _opt_buff_mmap(vma, ebuff->opt_buff, ebuff->alloc_size); > + if (unlikely(err)) > + return VM_FAULT_SIGBUS; > + > + return VM_FAULT_NOPAGE; > +} > + > +static const struct vm_operations_struct zuf_ebuff_ops = { > + .fault = zuf_ebuff_fault, > +}; > + > +static int zufc_ebuff_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + struct zu_exec_buff *ebuff = _ebuff_from_file(vma->vm_file); > + > + vma->vm_flags |= VM_MIXEDMAP; > + vma->vm_ops = &zuf_ebuff_ops; > + > + ebuff->vma = vma; > + > + zuf_dbg_core("start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n", > + vma->vm_start, vma->vm_end, vma->vm_flags, vma- > >vm_pgoff); > + > + return 0; > +} > + > int zufc_mmap(struct file *file, struct vm_area_struct *vma) > { > struct zuf_special_file *zsf = file->private_data; > @@ -53,6 +1063,10 @@ int zufc_mmap(struct file *file, struct vm_area_struct > *vma) > } > > switch (zsf->type) { > + case zlfs_e_zt: > + return zufc_zt_mmap(file, vma); > + case zlfs_e_dpp_buff: > + return zufc_ebuff_mmap(file, vma); > default: > zuf_err("type=%d\n", zsf->type); > return -ENOTTY; > diff --git a/fs/zuf/zuf-root.c b/fs/zuf/zuf-root.c > index 55a839dbc854..37b70ca33d3c 100644 > --- a/fs/zuf/zuf-root.c > +++ b/fs/zuf/zuf-root.c > @@ -227,6 +227,7 @@ static void zufr_put_super(struct super_block *sb) > { > struct zuf_root_info *zri = ZRI(sb); > > + zufc_zts_fini(zri); > _unregister_all_fses(zri); > > zuf_info("zuf_root umount\n"); > @@ -282,10 +283,16 @@ static int zufr_fill_super(struct super_block *sb, void > *data, int silent) > root_i->i_fop = &zufr_file_dir_operations; > root_i->i_op = &zufr_inode_operations; > > + spin_lock_init(&zri->mount.lock); > mutex_init(&zri->sbl_lock); > + relay_init(&zri->mount.relay); > INIT_LIST_HEAD(&zri->fst_list); > INIT_LIST_HEAD(&zri->pmem_list); > > + err = zufc_zts_init(zri); > + if (unlikely(err)) > + return err; /* put will be called we have a root */ > + > return 0; > } > > diff --git a/fs/zuf/zuf.h b/fs/zuf/zuf.h > index f979d8cbe60c..a33f5908155d 100644 > --- a/fs/zuf/zuf.h > +++ b/fs/zuf/zuf.h > @@ -23,9 +23,11 @@ > #include <linux/xattr.h> > #include <linux/exportfs.h> > #include <linux/page_ref.h> > +#include <linux/mm.h> > > #include "zus_api.h" > > +#include "relay.h" > #include "_pr.h" > > enum zlfs_e_special_file { > @@ -44,6 +46,8 @@ struct zuf_special_file { > struct zuf_root_info { > struct __mount_thread_info { > struct zuf_special_file zsf; > + spinlock_t lock; > + struct relay relay; > struct zufs_ioc_mount *zim; > } mount; > > @@ -102,6 +106,48 @@ static inline struct zuf_inode_info *ZUII(struct inode > *inode) > return container_of(inode, struct zuf_inode_info, vfs_inode); > } > > +static inline struct zuf_fs_type *ZUF_FST(struct file_system_type *fs_type) > +{ > + return container_of(fs_type, struct zuf_fs_type, vfs_fst); > +} > + > +static inline struct zuf_fs_type *zuf_fst(struct super_block *sb) > +{ > + return ZUF_FST(sb->s_type); > +} > + > +struct zuf_dispatch_op; > +typedef int (*overflow_handler)(struct zuf_dispatch_op *zdo, void *parg, > + ulong zt_max_bytes); > +struct zuf_dispatch_op { > + struct zufs_ioc_hdr *hdr; > + struct page **pages; > + uint nump; > + overflow_handler oh; > + struct super_block *sb; > + struct inode *inode; > +}; > + > +static inline void > +zuf_dispatch_init(struct zuf_dispatch_op *zdo, struct zufs_ioc_hdr *hdr, > + struct page **pages, 
uint nump) > +{ > + memset(zdo, 0, sizeof(*zdo)); > + zdo->hdr = hdr; > + zdo->pages = pages; zdo->nump = nump; > +} > + > +static inline int zuf_flt_to_err(vm_fault_t flt) > +{ > + if (likely(flt == VM_FAULT_NOPAGE)) > + return 0; > + > + if (flt == VM_FAULT_OOM) > + return -ENOMEM; > + > + return -EACCES; > +} > + > /* Keep this include last thing in file */ > #include "_extern.h" > > diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h > index 34e3e1a9a107..3319a70b5ccc 100644 > --- a/fs/zuf/zus_api.h > +++ b/fs/zuf/zus_api.h > @@ -66,6 +66,47 @@ > > #endif /* ndef __KERNEL__ */ > > +/* first available error code after include/linux/errno.h */ > +#define EZUFS_RETRY 531 > + > +/* The below is private to zuf Kernel only. Is not exposed to VFS nor zus > + * (defined here to allocate the constant) > + */ > +#define EZUF_RETRY_DONE 540 > + > +/** > + * zufs dual port memory > + * This is a special type of offset to either memory or persistent-memory, > + * that is designed to be used in the interface mechanism between userspace > + * and kernel, and can be accessed by both. > + * 3 first bits denote a mem-pool: > + * 0 - pmem pool > + * 1-6 - established shared pool by a call to zufs_ioc_create_mempool (below) > + * 7 - offset into app memory > + */ > +typedef __u64 __bitwise zu_dpp_t; > + > +static inline uint zu_dpp_t_pool(zu_dpp_t t) > +{ > + return t & 0x7; > +} > + > +static inline ulong zu_dpp_t_val(zu_dpp_t t) > +{ > + return t & ~0x7; > +} > + > +static inline zu_dpp_t enc_zu_dpp_t(ulong v, uint pool) > +{ > + return v | pool; > +} > + > +/* ~~~~~ ZUFS API ioctl commands ~~~~~ */ > +enum { > + ZUS_API_MAP_MAX_PAGES = 1024, > + ZUS_API_MAP_MAX_SIZE = ZUS_API_MAP_MAX_PAGES * PAGE_SIZE, > +}; > + > struct zufs_ioc_hdr { > __u32 err; /* IN/OUT must be first */ > __u16 in_len; /* How much to be copied *to* zus */ > @@ -102,4 +143,148 @@ struct zufs_ioc_register_fs { > }; > #define ZU_IOC_REGISTER_FS _IOWR('Z', 10, struct zufs_ioc_register_fs) > > +/* A cookie from user-mode returned by mount */ > +struct zus_sb_info; > + > +/* zus cookie per inode */ > +struct zus_inode_info; > + > +enum ZUFS_M_FLAGS { > + ZUFS_M_PEDANTIC = 0x00000001, > + ZUFS_M_EPHEMERAL = 0x00000002, > + ZUFS_M_SILENT = 0x00000004, > +}; > + > +struct zufs_parse_options { > + __u32 mount_options_len; > + __u32 pedantic; > + __u64 mount_flags; > + char mount_options[0]; > +}; > + > +enum e_mount_operation { > + ZUFS_M_MOUNT = 1, > + ZUFS_M_UMOUNT, > + ZUFS_M_REMOUNT, > + ZUFS_M_DDBG_RD, > + ZUFS_M_DDBG_WR, > +}; > + > +struct zufs_mount_info { > + /* IN */ > + struct zus_fs_info *zus_zfi; > + __u16 num_cpu; > + __u16 num_channels; > + __u32 pmem_kern_id; > + __u64 sb_id; > + > + /* OUT */ > + struct zus_sb_info *zus_sbi; > + /* mount is also iget of root */ > + struct zus_inode_info *zus_ii; > + zu_dpp_t _zi; > + __u64 old_mount_opt; > + __u64 remount_flags; > + > + /* More FS specific info */ > + __u32 s_blocksize_bits; > + __u8 acl_on; > + struct zufs_parse_options po; > +}; > + > +/* mount / umount */ > +struct zufs_ioc_mount { > + struct zufs_ioc_hdr hdr; > + struct zufs_mount_info zmi; > +}; > +#define ZU_IOC_MOUNT _IOWR('Z', 11, struct zufs_ioc_mount) > + > +/* pmem */ > +struct zufs_ioc_numa_map { > + /* Set by zus */ > + struct zufs_ioc_hdr hdr; > + > + __u32 possible_nodes; > + __u32 possible_cpus; > + __u32 online_nodes; > + __u32 online_cpus; > + > + __u32 max_cpu_per_node; > + > + /* This indicates that NOT all nodes have @max_cpu_per_node cpus */ > + bool nodes_not_symmetrical; > + > + /* Variable size must 
keep last > + * size @online_cpus > + */ > + __u8 cpu_to_node[]; > +}; > +#define ZU_IOC_NUMA_MAP _IOWR('Z', 12, struct zufs_ioc_numa_map) > + > +/* ZT init */ > +enum { ZUFS_MAX_ZT_CHANNELS = 64 }; > + > +struct zufs_ioc_init { > + struct zufs_ioc_hdr hdr; > + ulong affinity; /* IN */ > + uint channel_no; > + uint max_command; > +}; > +#define ZU_IOC_INIT_THREAD _IOWR('Z', 14, struct zufs_ioc_init) > + > +/* break_all (Server telling kernel to clean) */ > +struct zufs_ioc_break_all { > + struct zufs_ioc_hdr hdr; > +}; > +#define ZU_IOC_BREAK_ALL _IOWR('Z', 15, struct zufs_ioc_break_all) > + > +/* ~~~ zufs_ioc_wait_operation ~~~ */ > +struct zufs_ioc_wait_operation { > + struct zufs_ioc_hdr hdr; > + /* maximum size is governed by zufs_ioc_init->max_command */ > + char opt_buff[]; > +}; > +#define ZU_IOC_WAIT_OPT _IOWR('Z', 16, struct > zufs_ioc_wait_operation) > + > +/* These are the possible operations sent from Kernel to the Server in the > + * return of the ZU_IOC_WAIT_OPT. > + */ > +enum e_zufs_operation { > + ZUFS_OP_NULL = 0, > + > + ZUFS_OP_BREAK, /* Kernel telling Server to exit */ > + ZUFS_OP_MAX_OPT, > +}; > + > +/* Allocate a special_file that will be a dual-port communication buffer with > + * user mode. > + * Server will access the buffer via the mmap of this file. > + * Kernel will access the file via the valloc() pointer > + * > + * Some IOCTLs below demand use of this kind of buffer for communication > + * TODO: > + * pool_no is if we want to associate this buffer onto the 6 possible > + * mem-pools per zuf_sbi. So anywhere we have a zu_dpp_t it will mean > + * access from this pool. > + * If pool_no is zero then it is private to only this file. In this case > + * sb_id && zus_sbi are ignored / not needed. > + */ > +struct zufs_ioc_alloc_buffer { > + struct zufs_ioc_hdr hdr; > + /* The ID of the super block received in mount */ > + __u64 sb_id; > + /* We verify the sb_id validity against zus_sbi */ > + struct zus_sb_info *zus_sbi; > + /* max size of buffer allowed (size of mmap) */ > + __u32 max_size; > + /* allocate this much on initial call and set into vma */ > + __u32 init_size; > + > + /* TODO: These below are now set to ZERO. Need implementation */ > + __u16 pool_no; > + __u16 flags; > + __u32 reserved; > +}; > +#define ZU_IOC_ALLOC_BUFFER _IOWR('Z', 17, struct zufs_ioc_init) > + > #endif /* _LINUX_ZUFS_API_H */ > -- > 2.20.1 >
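A minimal userspace model of the relay handshake quoted above may help in reading the flow: the dispatching application thread posts an operation, wakes a waiting server (ZT) thread and sleeps until the server reports completion, while the server thread spends its life inside ZU_IOC_WAIT_OPT sleeping until work arrives. The sketch below replaces the kernel wait queues with a pthread mutex and condition variables; all names and the toy payload are illustrative, and the relay_is_fss_waiting_grab()/retry path and the page mapping are omitted.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for struct relay in relay.h; a mutex replaces the spinlock and
 * the wait-queue locking, condition variables replace the wait queues.
 */
struct relay_model {
	pthread_mutex_t lock;
	pthread_cond_t fss_wq;		/* server side sleeps here */
	pthread_cond_t app_wq;		/* application side sleeps here */
	bool fss_wakeup;
	bool app_wakeup, app_waiting;
	int op, result;			/* toy stand-in for the ioc buffer */
};

/* Application side: the gist of relay_fss_wakeup_app_wait() as used by
 * __zufc_dispatch() - post the operation, wake the server, then sleep
 * until the server reports completion.
 */
static int dispatch(struct relay_model *r, int op)
{
	int result;

	pthread_mutex_lock(&r->lock);
	r->op = op;
	r->app_waiting = true;
	r->app_wakeup = false;
	r->fss_wakeup = true;			/* relay_fss_wakeup() */
	pthread_cond_signal(&r->fss_wq);
	while (!r->app_wakeup)			/* wait_event(app_wq, ...) */
		pthread_cond_wait(&r->app_wq, &r->lock);
	result = r->result;
	pthread_mutex_unlock(&r->lock);
	return result;
}

/* Server side: the gist of one _zu_wait() round trip - sleep until an
 * operation is posted, "execute" it, then wake the application.
 */
static void *server(void *arg)
{
	struct relay_model *r = arg;

	pthread_mutex_lock(&r->lock);
	while (!r->fss_wakeup)			/* relay_fss_wait() */
		pthread_cond_wait(&r->fss_wq, &r->lock);
	r->fss_wakeup = false;

	r->result = r->op * 2;			/* the ZUS would do real work */

	if (r->app_waiting) {			/* relay_app_wakeup() */
		r->app_waiting = false;
		r->app_wakeup = true;
		pthread_cond_signal(&r->app_wq);
	}
	pthread_mutex_unlock(&r->lock);
	return NULL;
}

int main(void)
{
	struct relay_model r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.fss_wq = PTHREAD_COND_INITIALIZER,
		.app_wq = PTHREAD_COND_INITIALIZER,
	};
	pthread_t zt;

	pthread_create(&zt, NULL, server, &r);
	printf("dispatch(21) => %d\n", dispatch(&r, 21));
	pthread_join(&zt, NULL);
	return 0;
}

The property the model keeps is that a wakeup posted before the other side starts waiting is not lost, because both sides re-check the flag under the lock - the same role the fss_wakeup/app_wakeup booleans play for the kernel wait queues.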
On 26/02/19 20:34, Schumaker, Anna wrote:
> On Tue, 2019-02-19 at 13:51 +0200, Boaz harrosh wrote:
<>
>> zuf-core established the communication channels with the ZUS
>> User Mode Server.
>>
>> In this patch we have the core communication mechanics.
>> Which is the Novelty of this project.
>> (See previous submitted documentation for more info)
>>
>> Users will come later in the patchset
>>
<>
>> +static inline int relay_fss_wait(struct relay *relay)
>> +{
>> +	int err;
>> +
>> +	relay->fss_waiting = true;
>> +	relay->fss_wakeup = false;
>> +	err = wait_event_interruptible(relay->fss_wq, relay->fss_wakeup);
>> +
>> +	return err;
>
> Could you just do: "return wait_event_interruptible()" directly, instead of
> using the err variable?
>

Totally. There used to be a dbg_print here; the err variable is a leftover
from that time. Will change ...

>> +}
>> +
<>
>> +static struct zufc_thread *_zt_from_cpu(struct zuf_root_info *zri,
>> +					int cpu, uint chan)
>> +{
>> +	return per_cpu_ptr(zri->_ztp->_all_zt[chan], cpu);
>> +}
>> +
>> +static int _zt_from_f(struct file *filp, int cpu, uint chan,
>> +		      struct zufc_thread **ztp)
>> +{
>> +	*ztp = _zt_from_cpu(ZRI(filp->f_inode->i_sb), cpu, chan);
>> +	if (unlikely(!*ztp))
>> +		return -ERANGE;
>> +	return 0;
>
> I'm curious if there is a reason you did it this way instead of making use of
> the ERR_PTR() macro to return ztp directly?
>

For one, now looking at it, I hate the name; it is wrong. I will change that.

It is done like that because it used to be used in many places and I did not
want every place to have its own print and invent its own error code. But now
that it has a single user I might just fold it into its only user. All other
places must use _zt_from_f_private.

Cool, I'll kill it.

>> +}
>> +
<>

Thanks, will fix
Boaz

>> +static int _zu_init(struct file *file, void *parg)
>> +{
>> +	struct zufc_thread *zt;
>> +	int cpu = smp_processor_id();
>> +	struct zufs_ioc_init zi_init;
>> +	int err;
>> +
>> +	err = copy_from_user(&zi_init, parg, sizeof(zi_init));
>> +	if (unlikely(err)) {
>> +		zuf_err("=>%d\n", err);
>> +		return err;
>> +	}
>> +	if (unlikely(zi_init.channel_no >= ZUFS_MAX_ZT_CHANNELS)) {
>> +		zuf_err("[%d] channel_no=%d\n", cpu, zi_init.channel_no);
>> +		return -EINVAL;
>> +	}
>> +
>> +	zuf_dbg_zus("[%d] aff=0x%lx channel=%d\n",
>> +		    cpu, zi_init.affinity, zi_init.channel_no);
>> +
>> +	zi_init.hdr.err = _zt_from_f(file, cpu, zi_init.channel_no, &zt);
>> +	if (unlikely(zi_init.hdr.err)) {
>> +		zuf_err("=>%d\n", err);
>> +		goto out;
>> +	}
>> +
>> +	if (unlikely(zt->hdr.file)) {
>> +		zi_init.hdr.err = -EINVAL;
>> +		zuf_err("[%d] !!! thread already set\n", cpu);
>> +		goto out;
>> +	}
>> +
>> +	relay_init(&zt->relay);
>> +	zt->hdr.type = zlfs_e_zt;
>> +	zt->hdr.file = file;
>> +	zt->no = cpu;
>> +	zt->chan = zi_init.channel_no;
>> +
>> +	zt->max_zt_command = zi_init.max_command;
>> +	zt->opt_buff = vmalloc(zi_init.max_command);
>> +	if (unlikely(!zt->opt_buff)) {
>> +		zi_init.hdr.err = -ENOMEM;
>> +		goto out;
>> +	}
>> +	_fill_buff(zt->opt_buff, zi_init.max_command / sizeof(ulong));
>> +
>> +	file->private_data = &zt->hdr;
>> +out:
>> +	err = copy_to_user(parg, &zi_init, sizeof(zi_init));
>> +	if (err)
>> +		zuf_err("=>%d\n", err);
>> +	return err;
>> +}
>> +
>> +struct zufc_thread *_zt_from_f_private(struct file *file)
>> +{
>> +	struct zuf_special_file *zsf = file->private_data;
>> +
>> +	WARN_ON(zsf->type != zlfs_e_zt);
>> +	return container_of(zsf, struct zufc_thread, hdr);
>> +}
>> +
>> +/* Caller checks that file->private_data != NULL */
>> +static void zufc_zt_release(struct file *file)
>> +{
>> +	struct zufc_thread *zt = _zt_from_f_private(file);
>> +
<>
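Taken together, the two simplifications agreed on above would look roughly like the sketch below. This is only an illustration of the direction of the discussion, not the actual follow-up patch; the _zt_lookup() name is invented here, and the thread in fact leans toward folding the helper into its single caller rather than keeping it.

/* relay.h: return the wait_event_interruptible() result directly */
static inline int relay_fss_wait(struct relay *relay)
{
	relay->fss_waiting = true;
	relay->fss_wakeup = false;
	return wait_event_interruptible(relay->fss_wq, relay->fss_wakeup);
}

/* zuf-core.c: an ERR_PTR()-style variant of the lookup helper.
 * Illustrative only - the single remaining caller (_zu_init) would then
 * test the result with IS_ERR()/PTR_ERR() instead of an out parameter.
 */
static struct zufc_thread *_zt_lookup(struct file *filp, int cpu, uint chan)
{
	struct zufc_thread *zt =
			_zt_from_cpu(ZRI(filp->f_inode->i_sb), cpu, chan);

	return zt ? zt : ERR_PTR(-ERANGE);
}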
+ * In this case pages stay mapped in zt->vma + */ + err = _copy_outputs(zt, parg); + if (err == EZUF_RETRY_DONE) { + put_user(zt->zdo->hdr->err, (int *)parg); + return 0; + } + + _unmap_pages(zt, zt->zdo->pages, zt->zdo->nump); + zt->zdo = NULL; + if (unlikely(err)) /* _copy_outputs returned an err */ + goto err; + } + relay_app_wakeup(&zt->relay); + } + + err = relay_fss_wait(&zt->relay); + if (err) + zuf_dbg_err("[%d] relay error: %d\n", zt->no, err); + + if (zt->zdo && zt->zdo->hdr && + zt->zdo->hdr->operation < ZUFS_OP_BREAK) { + /* call map here at the zuf thread so we need no locks + * TODO: Currently only ZUFS_OP_WRITE protects user-buffers + * we should have a bit set in zt->zdo->hdr set per operation. + * TODO: Why this does not work? + */ + _map_pages(zt, zt->zdo->pages, zt->zdo->nump, 0); + memcpy(zt->opt_buff, zt->zdo->hdr, zt->zdo->hdr->in_len); + } else { + struct zufs_ioc_hdr *hdr = zt->opt_buff; + + /* This Means we were released by _zu_break */ + zuf_dbg_zus("_zu_break? => %d\n", err); + hdr->operation = ZUFS_OP_BREAK; + hdr->err = err; + } + + return err; + +err: + put_user(err, (int *)parg); + return err; +} + +static int _try_grab_zt_channel(struct zuf_root_info *zri, int cpu, + struct zufc_thread **ztp) +{ + struct zufc_thread *zt; + int c; + + for (c = 0; ; ++c) { + zt = _zt_from_cpu(zri, cpu, c); + if (unlikely(!zt || !zt->hdr.file)) + break; + + if (relay_is_fss_waiting_grab(&zt->relay)) { + *ztp = zt; + return true; + } + } + + *ztp = _zt_from_cpu(zri, cpu, 0); + return false; +} + +#define _zuf_get_cpu() get_cpu() +#define _zuf_put_cpu() put_cpu() + +#ifdef CONFIG_ZUF_DEBUG +static +int _r_zufs_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo) +#else +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo) +#endif +{ + struct task_struct *app = get_current(); + struct zufs_ioc_hdr *hdr = zdo->hdr; + int cpu, cpu2; + struct zufc_thread *zt; + + if (unlikely(hdr->out_len && !hdr->out_max)) { + /* TODO: Complain here and let caller code do this proper */ + hdr->out_max = hdr->out_len; + } + +channel_busy: + cpu = _zuf_get_cpu(); + + if (!_try_grab_zt_channel(zri, cpu, &zt)) { + _zuf_put_cpu(); + + /* If channel was grabbed then maybe a break_all is in progress + * on a different CPU make sure zt->file on this core is + * updated + */ + mb(); + if (unlikely(!zt->hdr.file)) { + zuf_err("[%d] !zt->file\n", cpu); + return -EIO; + } + zuf_dbg_err("[%d] can this be\n", cpu); + /* FIXME: Do something much smarter */ + msleep(10); + if (signal_pending(get_current())) { + zuf_dbg_err("[%d] => EINTR\n", cpu); + return -EINTR; + } + goto channel_busy; + } + + /* lock app to this cpu while waiting */ + cpumask_copy(&zt->relay.cpus_allowed, &app->cpus_allowed); + cpumask_copy(&app->cpus_allowed, cpumask_of(smp_processor_id())); + + zt->zdo = zdo; + + _zuf_put_cpu(); + + relay_fss_wakeup_app_wait(&zt->relay, NULL); + + /* restore cpu affinity after wakeup */ + cpumask_copy(&app->cpus_allowed, &zt->relay.cpus_allowed); + +cpu2 = smp_processor_id(); +if (cpu2 != cpu) + zuf_warn("App switched cpu1=%u cpu2=%u\n", cpu, cpu2); + + return zt->hdr.file ? 
hdr->err : -EIO; +} + +const char *zuf_op_name(enum e_zufs_operation op) +{ +#define CASE_ENUM_NAME(e) case e: return #e + switch (op) { + CASE_ENUM_NAME(ZUFS_OP_BREAK ); + default: + return "UNKNOWN"; + } +} + +#ifdef CONFIG_ZUF_DEBUG + +#define MAX_ZT_SEC 5 +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo) +{ + u64 t1, t2; + int err; + + t1 = ktime_get_ns(); + err = _r_zufs_dispatch(zri, zdo); + t2 = ktime_get_ns(); + + if ((t2 - t1) > MAX_ZT_SEC * NSEC_PER_SEC) + zuf_err("zufc_dispatch(%s, [0x%x-0x%x]) took %lld sec\n", + zuf_op_name(zdo->hdr->operation), zdo->hdr->offset, + zdo->hdr->len, + (t2 - t1) / NSEC_PER_SEC); + + return err; +} +#endif /* def CONFIG_ZUF_DEBUG */ + +/* ~~~ iomap_exec && exec_buffer allocation ~~~ */ +struct zu_exec_buff { + struct zuf_special_file hdr; + struct vm_area_struct *vma; + void *opt_buff; + ulong alloc_size; +}; + +/* Do some common checks and conversions */ +static inline struct zu_exec_buff *_ebuff_from_file(struct file *file) +{ + struct zu_exec_buff *ebuff = file->private_data; + + if (WARN_ON_ONCE(ebuff->hdr.type != zlfs_e_dpp_buff)) { + zuf_err("Must call ZU_IOC_ALLOC_BUFFER first\n"); + return NULL; + } + + if (WARN_ON_ONCE(ebuff->hdr.file != file)) + return NULL; + + return ebuff; +} + +static int _zu_ebuff_alloc(struct file *file, void *arg) +{ + struct zufs_ioc_alloc_buffer ioc_alloc; + struct zu_exec_buff *ebuff; + int err; + + err = copy_from_user(&ioc_alloc, arg, sizeof(ioc_alloc)); + if (unlikely(err)) { + zuf_err("=>%d\n", err); + return err; + } + + if (ioc_alloc.init_size > ioc_alloc.max_size) + return -EINVAL; + + /* TODO: Easily Support growing */ + /* TODO: Support global pools, also easy */ + if (ioc_alloc.pool_no || ioc_alloc.init_size != ioc_alloc.max_size) + return -ENOTSUPP; + + ebuff = kzalloc(sizeof(*ebuff), GFP_KERNEL); + if (unlikely(!ebuff)) + return -ENOMEM; + + ebuff->hdr.type = zlfs_e_dpp_buff; + ebuff->hdr.file = file; + i_size_write(file->f_inode, ioc_alloc.max_size); + ebuff->alloc_size = ioc_alloc.init_size; + ebuff->opt_buff = vmalloc(ioc_alloc.init_size); + if (unlikely(!ebuff->opt_buff)) { + kfree(ebuff); + return -ENOMEM; + } + _fill_buff(ebuff->opt_buff, ioc_alloc.init_size / sizeof(ulong)); + + file->private_data = &ebuff->hdr; + return 0; +} + +static void zufc_ebuff_release(struct file *file) +{ + struct zu_exec_buff *ebuff = _ebuff_from_file(file); + + if (unlikely(!ebuff)) + return; + + vfree(ebuff->opt_buff); + ebuff->hdr.type = 0; + ebuff->hdr.file = NULL; /* for none-dbg Kernels && use-after-free */ + kfree(ebuff); +} + +static int _zu_break(struct file *filp, void *parg) +{ + struct zuf_root_info *zri = ZRI(filp->f_inode->i_sb); + int i, c; + + zuf_dbg_core("enter\n"); + mb(); /* TODO how to schedule on all CPU's */ + + for (i = 0; i < zri->_ztp->_max_zts; ++i) { + for (c = 0; c < zri->_ztp->_max_channels; ++c) { + struct zufc_thread *zt = _zt_from_cpu(zri, i, c); + + if (unlikely(!(zt && zt->hdr.file))) + continue; + relay_fss_wakeup(&zt->relay); + } + } + + if (zri->mount.zsf.file) + relay_fss_wakeup(&zri->mount.relay); + + zuf_dbg_core("exit\n"); + return 0; +} + long zufc_ioctl(struct file *file, unsigned int cmd, ulong arg) { + void __user *parg = (void __user *)arg; + switch (cmd) { + case ZU_IOC_REGISTER_FS: + return _zu_register_fs(file, parg); + case ZU_IOC_MOUNT: + return _zu_mount(file, parg); + case ZU_IOC_NUMA_MAP: + return _zu_numa_map(file, parg); + case ZU_IOC_INIT_THREAD: + return _zu_init(file, parg); + case ZU_IOC_WAIT_OPT: + return _zu_wait(file, parg); + 
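
ZU_IOC_WAIT_OPT is the heart of the protocol, so it may help to sketch the user-space side of the ping-pong that _zu_init/_zu_wait above expect. Everything here is illustrative; struct zt_ctx, zus_handle_operation() and the surrounding thread setup are placeholders for the zus server, which is not part of this kernel series:

    /* Illustration only.  One such thread runs pinned to each online CPU;
     * ctx->fd is an open ZT special file obtained from the zuf-root mount.
     */
    #include <stddef.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include "zus_api.h"

    struct zt_ctx {			/* placeholder context */
    	int fd;
    	unsigned int channel;
    	unsigned int max_command;
    };

    void zus_handle_operation(struct zt_ctx *ctx, struct zufs_ioc_hdr *op);

    static void *zt_thread(void *arg)
    {
    	struct zt_ctx *ctx = arg;
    	struct zufs_ioc_init init = {
    		.channel_no = ctx->channel,
    		.max_command = ctx->max_command,
    	};
    	struct zufs_ioc_hdr *opt;

    	if (ioctl(ctx->fd, ZU_IOC_INIT_THREAD, &init) || init.hdr.err)
    		return NULL;

    	/* window 2: the kernel-vmalloc'ed per-ZT ioctl buffer
    	 * (window 1, at file offset 0, is where app pages are mapped
    	 * per operation and is mmapped the same way)
    	 */
    	opt = mmap(NULL, ctx->max_command, PROT_READ | PROT_WRITE,
    		   MAP_SHARED, ctx->fd, ZUS_API_MAP_MAX_SIZE);
    	if (opt == MAP_FAILED)
    		return NULL;

    	for (;;) {
    		/* _zu_wait() insists the argument is exactly the start
    		 * of this mapping; it consumes the reply already written
    		 * into *opt and then sleeps until the next operation
    		 */
    		if (ioctl(ctx->fd, ZU_IOC_WAIT_OPT, opt))
    			break;
    		if (opt->operation == ZUFS_OP_BREAK)
    			break;
    		zus_handle_operation(ctx, opt);	/* placeholder */
    	}
    	return NULL;
    }

The same mapped buffer carries both the incoming operation and the outgoing reply, which is what lets the round-trip stay copy-free on the server side.
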
case ZU_IOC_ALLOC_BUFFER: + return _zu_ebuff_alloc(file, parg); + case ZU_IOC_BREAK_ALL: + return _zu_break(file, parg); default: - zuf_err("%d\n", cmd); + zuf_err("%d %ld\n", cmd, ZU_IOC_WAIT_OPT); return -ENOTTY; } } @@ -38,11 +844,215 @@ int zufc_release(struct inode *inode, struct file *file) return 0; switch (zsf->type) { + case zlfs_e_zt: + zufc_zt_release(file); + return 0; + case zlfs_e_mout_thread: + zufc_mounter_release(file); + return 0; + case zlfs_e_pmem: + /* NOTHING to clean for pmem file yet */ + /* zuf_pmem_release(file);*/ + return 0; + case zlfs_e_dpp_buff: + zufc_ebuff_release(file); + return 0; default: return 0; } } +/* ~~~~ mmap area of app buffers into server ~~~~ */ + +static int zuf_zt_fault(struct vm_fault *vmf) +{ + zuf_err("should not fault\n"); + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct zuf_vm_ops = { + .fault = zuf_zt_fault, +}; + +static int _zufc_zt_mmap(struct file *file, struct vm_area_struct *vma, + struct zufc_thread *zt) +{ + /* Tell Kernel We will only access on a single core */ + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &zuf_vm_ops; + + zt->vma = vma; + + zuf_dbg_core( + "[0x%lx] start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n", + _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_flags, + vma->vm_pgoff); + + return 0; +} + +/* ~~~~ mmap the Kernel allocated IOCTL buffer per ZT ~~~~ */ +static int _opt_buff_mmap(struct vm_area_struct *vma, void *opt_buff, + ulong opt_size) +{ + ulong offset; + + if (!opt_buff) + return -ENOMEM; + + for (offset = 0; offset < opt_size; offset += PAGE_SIZE) { + ulong addr = vma->vm_start + offset; + ulong pfn = vmalloc_to_pfn(opt_buff + offset); + pfn_t pfnt = phys_to_pfn_t(PFN_PHYS(pfn), PFN_MAP | PFN_DEV); + int err; + + zuf_dbg_verbose("[0x%lx] pfn-0x%lx addr=0x%lx buff=0x%lx\n", + offset, pfn, addr, (ulong)opt_buff + offset); + + err = zuf_flt_to_err(vmf_insert_mixed_mkwrite(vma, addr, pfnt)); + if (unlikely(err)) { + zuf_err("zuf: zuf_insert_mixed_mkwrite => %d offset=0x%lx addr=0x%lx\n", + err, offset, addr); + return err; + } + } + + return 0; +} + +static int zuf_obuff_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct zufc_thread *zt = _zt_from_f_private(vma->vm_file); + long offset = (vmf->pgoff << PAGE_SHIFT) - ZUS_API_MAP_MAX_SIZE; + int err; + + zuf_dbg_core( + "[0x%lx] start=0x%lx end=0x%lx file-start=0x%lx offset=0x%lx\n", + _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_pgoff, + offset); + + /* if Server overruns its buffer crash it dead */ + if (unlikely((offset < 0) || (zt->max_zt_command < offset))) { + zuf_err("[0x%lx] start=0x%lx end=0x%lx file-start=0x%lx offset=0x%lx\n", + _zt_pr_no(zt), vma->vm_start, + vma->vm_end, vma->vm_pgoff, offset); + return VM_FAULT_SIGBUS; + } + + /* We never released a zus-core.c that does not fault the + * first page first. 
I want to see if this happens + */ + if (unlikely(offset)) + zuf_warn("Suspicious server activity\n"); + + /* This faults only once at very first access */ + err = _opt_buff_mmap(vma, zt->opt_buff, zt->max_zt_command); + if (unlikely(err)) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static const struct vm_operations_struct zuf_obuff_ops = { + .fault = zuf_obuff_fault, +}; + +static int _zufc_obuff_mmap(struct file *file, struct vm_area_struct *vma, + struct zufc_thread *zt) +{ + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &zuf_obuff_ops; + + zt->opt_buff_vma = vma; + + zuf_dbg_core( + "[0x%lx] start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n", + _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_flags, + vma->vm_pgoff); + + return 0; +} + +/* ~~~ */ + +static int zufc_zt_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct zufc_thread *zt = _zt_from_f_private(file); + + /* We have two areas of mmap in this special file. + * 0 to ZUS_API_MAP_MAX_SIZE: + * The first part where app pages are mapped + * into server per operation. + * ZUS_API_MAP_MAX_SIZE of size zuf_root_info->max_zt_command + * Is where we map the per ZT ioctl-buffer, later passed + * to the zus_ioc_wait IOCTL call + */ + if (vma->vm_pgoff == ZUS_API_MAP_MAX_SIZE / PAGE_SIZE) + return _zufc_obuff_mmap(file, vma, zt); + + /* zuf ZT API is very particular about where in its + * special file we communicate + */ + if (unlikely(vma->vm_pgoff)) + return -EINVAL; + + return _zufc_zt_mmap(file, vma, zt); +} + +/* ~~~~ Implementation of the ZU_IOC_ALLOC_BUFFER mmap facility ~~~~ */ + +static int zuf_ebuff_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct zu_exec_buff *ebuff = _ebuff_from_file(vma->vm_file); + long offset = (vmf->pgoff << PAGE_SHIFT); + int err; + + zuf_dbg_core("start=0x%lx end=0x%lx file-start=0x%lx file-off=0x%lx\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, offset); + + /* if Server overruns its buffer crash it dead */ + if (unlikely((offset < 0) || (ebuff->alloc_size < offset))) { + zuf_err("start=0x%lx end=0x%lx file-start=0x%lx file-off=0x%lx\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + offset); + return VM_FAULT_SIGBUS; + } + + /* We never released a zus-core.c that does not fault the + * first page first. 
I want to see if this happens + */ + if (unlikely(offset)) + zuf_warn("Suspicious server activity\n"); + + /* This faults only once at very first access */ + err = _opt_buff_mmap(vma, ebuff->opt_buff, ebuff->alloc_size); + if (unlikely(err)) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static const struct vm_operations_struct zuf_ebuff_ops = { + .fault = zuf_ebuff_fault, +}; + +static int zufc_ebuff_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct zu_exec_buff *ebuff = _ebuff_from_file(vma->vm_file); + + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &zuf_ebuff_ops; + + ebuff->vma = vma; + + zuf_dbg_core("start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n", + vma->vm_start, vma->vm_end, vma->vm_flags, vma->vm_pgoff); + + return 0; +} + int zufc_mmap(struct file *file, struct vm_area_struct *vma) { struct zuf_special_file *zsf = file->private_data; @@ -53,6 +1063,10 @@ int zufc_mmap(struct file *file, struct vm_area_struct *vma) } switch (zsf->type) { + case zlfs_e_zt: + return zufc_zt_mmap(file, vma); + case zlfs_e_dpp_buff: + return zufc_ebuff_mmap(file, vma); default: zuf_err("type=%d\n", zsf->type); return -ENOTTY; diff --git a/fs/zuf/zuf-root.c b/fs/zuf/zuf-root.c index 55a839dbc854..37b70ca33d3c 100644 --- a/fs/zuf/zuf-root.c +++ b/fs/zuf/zuf-root.c @@ -227,6 +227,7 @@ static void zufr_put_super(struct super_block *sb) { struct zuf_root_info *zri = ZRI(sb); + zufc_zts_fini(zri); _unregister_all_fses(zri); zuf_info("zuf_root umount\n"); @@ -282,10 +283,16 @@ static int zufr_fill_super(struct super_block *sb, void *data, int silent) root_i->i_fop = &zufr_file_dir_operations; root_i->i_op = &zufr_inode_operations; + spin_lock_init(&zri->mount.lock); mutex_init(&zri->sbl_lock); + relay_init(&zri->mount.relay); INIT_LIST_HEAD(&zri->fst_list); INIT_LIST_HEAD(&zri->pmem_list); + err = zufc_zts_init(zri); + if (unlikely(err)) + return err; /* put will be called we have a root */ + return 0; } diff --git a/fs/zuf/zuf.h b/fs/zuf/zuf.h index f979d8cbe60c..a33f5908155d 100644 --- a/fs/zuf/zuf.h +++ b/fs/zuf/zuf.h @@ -23,9 +23,11 @@ #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/page_ref.h> +#include <linux/mm.h> #include "zus_api.h" +#include "relay.h" #include "_pr.h" enum zlfs_e_special_file { @@ -44,6 +46,8 @@ struct zuf_special_file { struct zuf_root_info { struct __mount_thread_info { struct zuf_special_file zsf; + spinlock_t lock; + struct relay relay; struct zufs_ioc_mount *zim; } mount; @@ -102,6 +106,48 @@ static inline struct zuf_inode_info *ZUII(struct inode *inode) return container_of(inode, struct zuf_inode_info, vfs_inode); } +static inline struct zuf_fs_type *ZUF_FST(struct file_system_type *fs_type) +{ + return container_of(fs_type, struct zuf_fs_type, vfs_fst); +} + +static inline struct zuf_fs_type *zuf_fst(struct super_block *sb) +{ + return ZUF_FST(sb->s_type); +} + +struct zuf_dispatch_op; +typedef int (*overflow_handler)(struct zuf_dispatch_op *zdo, void *parg, + ulong zt_max_bytes); +struct zuf_dispatch_op { + struct zufs_ioc_hdr *hdr; + struct page **pages; + uint nump; + overflow_handler oh; + struct super_block *sb; + struct inode *inode; +}; + +static inline void +zuf_dispatch_init(struct zuf_dispatch_op *zdo, struct zufs_ioc_hdr *hdr, + struct page **pages, uint nump) +{ + memset(zdo, 0, sizeof(*zdo)); + zdo->hdr = hdr; + zdo->pages = pages; zdo->nump = nump; +} + +static inline int zuf_flt_to_err(vm_fault_t flt) +{ + if (likely(flt == VM_FAULT_NOPAGE)) + return 0; + + if (flt == VM_FAULT_OOM) + return -ENOMEM; + + 
return -EACCES; +} + /* Keep this include last thing in file */ #include "_extern.h" diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h index 34e3e1a9a107..3319a70b5ccc 100644 --- a/fs/zuf/zus_api.h +++ b/fs/zuf/zus_api.h @@ -66,6 +66,47 @@ #endif /* ndef __KERNEL__ */ +/* first available error code after include/linux/errno.h */ +#define EZUFS_RETRY 531 + +/* The below is private to zuf Kernel only. Is not exposed to VFS nor zus + * (defined here to allocate the constant) + */ +#define EZUF_RETRY_DONE 540 + +/** + * zufs dual port memory + * This is a special type of offset to either memory or persistent-memory, + * that is designed to be used in the interface mechanism between userspace + * and kernel, and can be accessed by both. + * 3 first bits denote a mem-pool: + * 0 - pmem pool + * 1-6 - established shared pool by a call to zufs_ioc_create_mempool (below) + * 7 - offset into app memory + */ +typedef __u64 __bitwise zu_dpp_t; + +static inline uint zu_dpp_t_pool(zu_dpp_t t) +{ + return t & 0x7; +} + +static inline ulong zu_dpp_t_val(zu_dpp_t t) +{ + return t & ~0x7; +} + +static inline zu_dpp_t enc_zu_dpp_t(ulong v, uint pool) +{ + return v | pool; +} + +/* ~~~~~ ZUFS API ioctl commands ~~~~~ */ +enum { + ZUS_API_MAP_MAX_PAGES = 1024, + ZUS_API_MAP_MAX_SIZE = ZUS_API_MAP_MAX_PAGES * PAGE_SIZE, +}; + struct zufs_ioc_hdr { __u32 err; /* IN/OUT must be first */ __u16 in_len; /* How much to be copied *to* zus */ @@ -102,4 +143,148 @@ struct zufs_ioc_register_fs { }; #define ZU_IOC_REGISTER_FS _IOWR('Z', 10, struct zufs_ioc_register_fs) +/* A cookie from user-mode returned by mount */ +struct zus_sb_info; + +/* zus cookie per inode */ +struct zus_inode_info; + +enum ZUFS_M_FLAGS { + ZUFS_M_PEDANTIC = 0x00000001, + ZUFS_M_EPHEMERAL = 0x00000002, + ZUFS_M_SILENT = 0x00000004, +}; + +struct zufs_parse_options { + __u32 mount_options_len; + __u32 pedantic; + __u64 mount_flags; + char mount_options[0]; +}; + +enum e_mount_operation { + ZUFS_M_MOUNT = 1, + ZUFS_M_UMOUNT, + ZUFS_M_REMOUNT, + ZUFS_M_DDBG_RD, + ZUFS_M_DDBG_WR, +}; + +struct zufs_mount_info { + /* IN */ + struct zus_fs_info *zus_zfi; + __u16 num_cpu; + __u16 num_channels; + __u32 pmem_kern_id; + __u64 sb_id; + + /* OUT */ + struct zus_sb_info *zus_sbi; + /* mount is also iget of root */ + struct zus_inode_info *zus_ii; + zu_dpp_t _zi; + __u64 old_mount_opt; + __u64 remount_flags; + + /* More FS specific info */ + __u32 s_blocksize_bits; + __u8 acl_on; + struct zufs_parse_options po; +}; + +/* mount / umount */ +struct zufs_ioc_mount { + struct zufs_ioc_hdr hdr; + struct zufs_mount_info zmi; +}; +#define ZU_IOC_MOUNT _IOWR('Z', 11, struct zufs_ioc_mount) + +/* pmem */ +struct zufs_ioc_numa_map { + /* Set by zus */ + struct zufs_ioc_hdr hdr; + + __u32 possible_nodes; + __u32 possible_cpus; + __u32 online_nodes; + __u32 online_cpus; + + __u32 max_cpu_per_node; + + /* This indicates that NOT all nodes have @max_cpu_per_node cpus */ + bool nodes_not_symmetrical; + + /* Variable size must keep last + * size @online_cpus + */ + __u8 cpu_to_node[]; +}; +#define ZU_IOC_NUMA_MAP _IOWR('Z', 12, struct zufs_ioc_numa_map) + +/* ZT init */ +enum { ZUFS_MAX_ZT_CHANNELS = 64 }; + +struct zufs_ioc_init { + struct zufs_ioc_hdr hdr; + ulong affinity; /* IN */ + uint channel_no; + uint max_command; +}; +#define ZU_IOC_INIT_THREAD _IOWR('Z', 14, struct zufs_ioc_init) + +/* break_all (Server telling kernel to clean) */ +struct zufs_ioc_break_all { + struct zufs_ioc_hdr hdr; +}; +#define ZU_IOC_BREAK_ALL _IOWR('Z', 15, struct zufs_ioc_break_all) + +/* ~~~ 
zufs_ioc_wait_operation ~~~ */ +struct zufs_ioc_wait_operation { + struct zufs_ioc_hdr hdr; + /* maximum size is governed by zufs_ioc_init->max_command */ + char opt_buff[]; +}; +#define ZU_IOC_WAIT_OPT _IOWR('Z', 16, struct zufs_ioc_wait_operation) + +/* These are the possible operations sent from Kernel to the Server in the + * return of the ZU_IOC_WAIT_OPT. + */ +enum e_zufs_operation { + ZUFS_OP_NULL = 0, + + ZUFS_OP_BREAK, /* Kernel telling Server to exit */ + ZUFS_OP_MAX_OPT, +}; + +/* Allocate a special_file that will be a dual-port communication buffer with + * user mode. + * Server will access the buffer via the mmap of this file. + * Kernel will access the file via the valloc() pointer + * + * Some IOCTLs below demand use of this kind of buffer for communication + * TODO: + * pool_no is if we want to associate this buffer onto the 6 possible + * mem-pools per zuf_sbi. So anywhere we have a zu_dpp_t it will mean + * access from this pool. + * If pool_no is zero then it is private to only this file. In this case + * sb_id && zus_sbi are ignored / not needed. + */ +struct zufs_ioc_alloc_buffer { + struct zufs_ioc_hdr hdr; + /* The ID of the super block received in mount */ + __u64 sb_id; + /* We verify the sb_id validity against zus_sbi */ + struct zus_sb_info *zus_sbi; + /* max size of buffer allowed (size of mmap) */ + __u32 max_size; + /* allocate this much on initial call and set into vma */ + __u32 init_size; + + /* TODO: These below are now set to ZERO. Need implementation */ + __u16 pool_no; + __u16 flags; + __u32 reserved; +}; +#define ZU_IOC_ALLOC_BUFFER _IOWR('Z', 17, struct zufs_ioc_init) + #endif /* _LINUX_ZUFS_API_H */
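
Since the first real users of the dispatcher only show up later in the series, a closing sketch of how a VFS entry point is expected to drive the machinery above may help review. ZUFS_OP_STATFS and struct zufs_ioc_statfs are invented here purely for illustration (only ZUFS_OP_BREAK exists at this point in the series), and the exact in_len/out_len conventions per operation are set by the later patches:

    /* Illustration only -- not part of this patch. */
    #include "zuf.h"

    struct zufs_ioc_statfs {		/* hypothetical op payload */
    	struct zufs_ioc_hdr hdr;
    	/* OUT */
    	__u64 f_blocks, f_bfree, f_files, f_ffree;
    };

    static int example_statfs(struct zuf_root_info *zri, struct super_block *sb)
    {
    	struct zufs_ioc_statfs ioc = {};
    	struct zuf_dispatch_op zdo;
    	int err;

    	ioc.hdr.operation = ZUFS_OP_STATFS;	/* hypothetical opcode */
    	ioc.hdr.in_len = sizeof(ioc);	/* copied into the ZT opt_buff */
    	ioc.hdr.out_len = sizeof(ioc);	/* copied back by _copy_outputs */
    	ioc.hdr.out_max = sizeof(ioc);

    	zuf_dispatch_init(&zdo, &ioc.hdr, NULL, 0);
    	zdo.sb = sb;

    	err = __zufc_dispatch(zri, &zdo);
    	/* __zufc_dispatch() already folds the server's hdr.err into its
    	 * return value, so one check covers transport and server errors
    	 */
    	if (unlikely(err))
    		return err;

    	/* ... consume ioc.f_blocks etc. ... */
    	return 0;
    }

Operations whose reply can exceed out_max would additionally set zdo.oh to an overflow_handler, which is the hook _copy_outputs() calls when the server reports more output than fits, or returns EZUFS_RETRY.
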