diff mbox series

[V2,7/9] vhost: do not use RCU to synchronize MMU notifier with worker

Message ID 20190731084655.7024-8-jasowang@redhat.com (mailing list archive)
State New, archived
Headers show
Series Fixes for metadata acceleration | expand

Commit Message

Jason Wang July 31, 2019, 8:46 a.m. UTC
We used to use RCU to synchronize the MMU notifier with the worker. This
leads to calling synchronize_rcu() in invalidate_range_start(). But on a
busy system, many factors may slow down synchronize_rcu(), which makes
it unsuitable to be called from an MMU notifier.

One solution is SRCU, but its overhead is obvious because of its
expensive full memory barrier. Another choice is seqlock, but it doesn't
provide a way for readers and writers to synchronize with each other.
The last choice is the vq mutex, but then we would need to deal with the
worst case where the MMU notifier is blocked waiting for a swap-in to
finish.

So this patch switches to a counter that tracks whether the map is in
use. The counter is incremented when the vq starts or finishes using the
map. This means that when it is even, we are sure there are no readers
and the MMU notifier is synchronized; when it is odd, there is a reader
and we need to wait for the counter to become even again before we are
synchronized. To avoid a full memory barrier, store_release +
load_acquire on the counter is used.

Considering that the read critical section is pretty small, the
synchronization should be done very quickly.

Note the patch leads to about a 3% PPS drop.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
 drivers/vhost/vhost.h |   7 +-
 2 files changed, 94 insertions(+), 58 deletions(-)

Comments

Jason Wang July 31, 2019, 8:50 a.m. UTC | #1
On 2019/7/31 4:46 PM, Jason Wang wrote:
> We used to use RCU to synchronize MMU notifier with worker. This leads
> calling synchronize_rcu() in invalidate_range_start(). But on a busy
> system, there would be many factors that may slow down the
> synchronize_rcu() which makes it unsuitable to be called in MMU
> notifier.
>
> A solution is SRCU but its overhead is obvious with the expensive full
> memory barrier. Another choice is to use seqlock, but it doesn't
> provide a synchronization method between readers and writers. The last
> choice is to use vq mutex, but it need to deal with the worst case
> that MMU notifier must be blocked and wait for the finish of swap in.
>
> So this patch switches use a counter to track whether or not the map
> was used. The counter was increased when vq try to start or finish
> uses the map. This means, when it was even, we're sure there's no
> readers and MMU notifier is synchronized. When it was odd, it means
> there's a reader we need to wait it to be even again then we are
> synchronized. To avoid full memory barrier, store_release +
> load_acquire on the counter is used.


For reviewers: I tried hard to avoid e.g. smp_mb(); please double check 
whether or not this trick works.

Thanks


>
> Consider the read critical section is pretty small the synchronization
> should be done very fast.
>
> Note the patch lead about 3% PPS dropping.
>
> Reported-by: Michael S. Tsirkin <mst@redhat.com>
> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>   drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
>   drivers/vhost/vhost.h |   7 +-
>   2 files changed, 94 insertions(+), 58 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index cfc11f9ed9c9..db2c81cb1e90 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>   
>   	spin_lock(&vq->mmu_lock);
>   	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		map[i] = rcu_dereference_protected(vq->maps[i],
> -				  lockdep_is_held(&vq->mmu_lock));
> +		map[i] = vq->maps[i];
>   		if (map[i]) {
>   			vhost_set_map_dirty(vq, map[i], i);
> -			rcu_assign_pointer(vq->maps[i], NULL);
> +			vq->maps[i] = NULL;
>   		}
>   	}
>   	spin_unlock(&vq->mmu_lock);
>   
> -	/* No need for synchronize_rcu() or kfree_rcu() since we are
> -	 * serialized with memory accessors (e.g vq mutex held).
> +	/* No need for synchronization since we are serialized with
> +	 * memory accessors (e.g vq mutex held).
>   	 */
>   
>   	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>   	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>   }
>   
> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> +{
> +	int ref = READ_ONCE(vq->ref);
> +
> +	smp_store_release(&vq->ref, ref + 1);
> +	/* Make sure ref counter is visible before accessing the map */
> +	smp_load_acquire(&vq->ref);
> +}
> +
> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> +{
> +	int ref = READ_ONCE(vq->ref);
> +
> +	/* Make sure vq access is done before increasing ref counter */
> +	smp_store_release(&vq->ref, ref + 1);
> +}
> +
> +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
> +{
> +	int ref;
> +
> +	/* Make sure map change was done before checking ref counter */
> +	smp_mb();
> +
> +	ref = READ_ONCE(vq->ref);
> +	if (ref & 0x1) {
> +		/* When ref change, we are sure no reader can see
> +		 * previous map */
> +		while (READ_ONCE(vq->ref) == ref) {
> +			set_current_state(TASK_RUNNING);
> +			schedule();
> +		}
> +	}
> +	/* Make sure ref counter was checked before any other
> +	 * operations that was dene on map. */
> +	smp_mb();
> +}
> +
>   static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>   				      int index,
>   				      unsigned long start,
> @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>   	spin_lock(&vq->mmu_lock);
>   	++vq->invalidate_count;
>   
> -	map = rcu_dereference_protected(vq->maps[index],
> -					lockdep_is_held(&vq->mmu_lock));
> +	map = vq->maps[index];
>   	if (map) {
>   		vhost_set_map_dirty(vq, map, index);
> -		rcu_assign_pointer(vq->maps[index], NULL);
> +		vq->maps[index] = NULL;
>   	}
>   	spin_unlock(&vq->mmu_lock);
>   
>   	if (map) {
> -		synchronize_rcu();
> +		vhost_vq_sync_access(vq);
>   		vhost_map_unprefetch(map);
>   	}
>   }
> @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev)
>   	for (i = 0; i < dev->nvqs; ++i) {
>   		vq = dev->vqs[i];
>   		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> -			RCU_INIT_POINTER(vq->maps[j], NULL);
> +			vq->maps[j] = NULL;
>   	}
>   }
>   #endif
> @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>   		vq->indirect = NULL;
>   		vq->heads = NULL;
>   		vq->dev = dev;
> +		vq->ref = 0;
>   		mutex_init(&vq->mutex);
>   		spin_lock_init(&vq->mmu_lock);
>   		vhost_vq_reset(dev, vq);
> @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>   	map->npages = npages;
>   	map->pages = pages;
>   
> -	rcu_assign_pointer(vq->maps[index], map);
> +	vq->maps[index] = map;
>   	/* No need for a synchronize_rcu(). This function should be
>   	 * called by dev->worker so we are serialized with all
>   	 * readers.
> @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>   	struct vring_used *used;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>   		if (likely(map)) {
>   			used = map->addr;
>   			*((__virtio16 *)&used->ring[vq->num]) =
>   				cpu_to_vhost16(vq, vq->avail_idx);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>   	size_t size;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>   		if (likely(map)) {
>   			used = map->addr;
>   			size = count * sizeof(*head);
>   			memcpy(used->ring + idx, head, size);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>   	struct vring_used *used;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>   		if (likely(map)) {
>   			used = map->addr;
>   			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>   	struct vring_used *used;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>   		if (likely(map)) {
>   			used = map->addr;
>   			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>   	struct vring_avail *avail;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>   		if (likely(map)) {
>   			avail = map->addr;
>   			*idx = avail->idx;
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>   	struct vring_avail *avail;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>   		if (likely(map)) {
>   			avail = map->addr;
>   			*head = avail->ring[idx & (vq->num - 1)];
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>   	struct vring_avail *avail;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>   		if (likely(map)) {
>   			avail = map->addr;
>   			*flags = avail->flags;
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>   	struct vring_avail *avail;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		vhost_vq_access_map_begin(vq);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>   		if (likely(map)) {
>   			avail = map->addr;
>   			*event = (__virtio16)avail->ring[vq->num];
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>   	struct vring_used *used;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>   		if (likely(map)) {
>   			used = map->addr;
>   			*idx = used->idx;
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>   	struct vring_desc *d;
>   
>   	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>   
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
> +		map = vq->maps[VHOST_ADDR_DESC];
>   		if (likely(map)) {
>   			d = map->addr;
>   			*desc = *(d + idx);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>   			return 0;
>   		}
>   
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>   	}
>   #endif
>   
> @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
>   #if VHOST_ARCH_CAN_ACCEL_UACCESS
>   static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
>   {
> -	struct vhost_map __rcu *map;
> +	struct vhost_map *map;
>   	int i;
>   
>   	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		rcu_read_lock();
> -		map = rcu_dereference(vq->maps[i]);
> -		rcu_read_unlock();
> +		map = vq->maps[i];
>   		if (unlikely(!map))
>   			vhost_map_prefetch(vq, i);
>   	}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index a9a2a93857d2..f9e9558a529d 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -115,16 +115,17 @@ struct vhost_virtqueue {
>   #if VHOST_ARCH_CAN_ACCEL_UACCESS
>   	/* Read by memory accessors, modified by meta data
>   	 * prefetching, MMU notifier and vring ioctl().
> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
> -	 * and readers).
> +	 * Synchonrized through mmu_lock (writers) and ref counters,
> +	 * see vhost_vq_access_map_begin()/vhost_vq_access_map_end().
>   	 */
> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
> +	struct vhost_map *maps[VHOST_NUM_ADDRS];
>   	/* Read by MMU notifier, modified by vring ioctl(),
>   	 * synchronized through MMU notifier
>   	 * registering/unregistering.
>   	 */
>   	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>   #endif
> +	int ref;
>   	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>   
>   	struct file *kick;
Jason Gunthorpe July 31, 2019, 12:39 p.m. UTC | #2
On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> We used to use RCU to synchronize MMU notifier with worker. This leads
> calling synchronize_rcu() in invalidate_range_start(). But on a busy
> system, there would be many factors that may slow down the
> synchronize_rcu() which makes it unsuitable to be called in MMU
> notifier.
> 
> A solution is SRCU but its overhead is obvious with the expensive full
> memory barrier. Another choice is to use seqlock, but it doesn't
> provide a synchronization method between readers and writers. The last
> choice is to use vq mutex, but it need to deal with the worst case
> that MMU notifier must be blocked and wait for the finish of swap in.
> 
> So this patch switches use a counter to track whether or not the map
> was used. The counter was increased when vq try to start or finish
> uses the map. This means, when it was even, we're sure there's no
> readers and MMU notifier is synchronized. When it was odd, it means
> there's a reader we need to wait it to be even again then we are
> synchronized. 

You just described a seqlock.

We've been talking about providing this as some core service from mmu
notifiers because nearly every use of this API needs it.

IMHO this gets the whole thing backwards, the common pattern is to
protect the 'shadow pte' data with a seqlock (usually open coded),
such that the mmu notifier side has the write side of that lock and
the read side is consumed by the thread accessing or updating the SPTE.
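
Something like this open coded pattern (purely illustrative, the names
are made up and not taken from any particular driver):

struct shadow {
        spinlock_t lock;        /* serializes notifier writers */
        seqcount_t seq;         /* write side owned by the notifier */
        bool valid;
        u64 spte;               /* the shadow pte value */
};

/* accessor side: retry, or fall back to a slow path, if an
 * invalidation raced with us */
static bool spte_read(struct shadow *s, u64 *val)
{
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&s->seq);
                if (!READ_ONCE(s->valid))
                        return false;   /* take the slow path */
                *val = READ_ONCE(s->spte);
        } while (read_seqcount_retry(&s->seq, seq));

        return true;
}

/* invalidate_range_start() side: write side of the seqcount,
 * serialized against other notifier callbacks by s->lock */
static void spte_invalidate(struct shadow *s)
{
        spin_lock(&s->lock);
        write_seqcount_begin(&s->seq);
        WRITE_ONCE(s->valid, false);
        s->spte = 0;
        write_seqcount_end(&s->seq);
        spin_unlock(&s->lock);
}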


> Reported-by: Michael S. Tsirkin <mst@redhat.com>
> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
>  drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
>  drivers/vhost/vhost.h |   7 +-
>  2 files changed, 94 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index cfc11f9ed9c9..db2c81cb1e90 100644
> +++ b/drivers/vhost/vhost.c
> @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>  
>  	spin_lock(&vq->mmu_lock);
>  	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		map[i] = rcu_dereference_protected(vq->maps[i],
> -				  lockdep_is_held(&vq->mmu_lock));
> +		map[i] = vq->maps[i];
>  		if (map[i]) {
>  			vhost_set_map_dirty(vq, map[i], i);
> -			rcu_assign_pointer(vq->maps[i], NULL);
> +			vq->maps[i] = NULL;
>  		}
>  	}
>  	spin_unlock(&vq->mmu_lock);
>  
> -	/* No need for synchronize_rcu() or kfree_rcu() since we are
> -	 * serialized with memory accessors (e.g vq mutex held).
> +	/* No need for synchronization since we are serialized with
> +	 * memory accessors (e.g vq mutex held).
>  	 */
>  
>  	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>  	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>  }
>  
> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> +{
> +	int ref = READ_ONCE(vq->ref);

Is a lock supposed to be held for this, or is it single threaded?

> +
> +	smp_store_release(&vq->ref, ref + 1);
> +	/* Make sure ref counter is visible before accessing the map */
> +	smp_load_acquire(&vq->ref);

release/acquire semantics are intended to protect blocks of related
data, so reading something with acquire and throwing away the result
is nonsense.

> +}
> +
> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> +{
> +	int ref = READ_ONCE(vq->ref);

If the write to vq->ref is not locked this algorithm won't work, if it
is locked the READ_ONCE is not needed.

> +	/* Make sure vq access is done before increasing ref counter */
> +	smp_store_release(&vq->ref, ref + 1);
> +}
> +
> +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
> +{
> +	int ref;
> +
> +	/* Make sure map change was done before checking ref counter */
> +	smp_mb();

This is probably smp_rmb after reading ref, and if you are setting ref
with smp_store_release then this should be smp_load_acquire() without
an explicit mb.

> +	ref = READ_ONCE(vq->ref);
> +	if (ref & 0x1) {
> +		/* When ref change, we are sure no reader can see
> +		 * previous map */
> +		while (READ_ONCE(vq->ref) == ref) {
> +			set_current_state(TASK_RUNNING);
> +			schedule();
> +		}
> +	}

This is basically read_seqcount_begin() with a schedule instead of
cpu_relax()


> +	/* Make sure ref counter was checked before any other
> +	 * operations that was dene on map. */
> +	smp_mb();

should be in a smp_load_acquire()

> +}
> +
>  static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>  				      int index,
>  				      unsigned long start,
> @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>  	spin_lock(&vq->mmu_lock);
>  	++vq->invalidate_count;
>  
> -	map = rcu_dereference_protected(vq->maps[index],
> -					lockdep_is_held(&vq->mmu_lock));
> +	map = vq->maps[index];
>  	if (map) {
>  		vhost_set_map_dirty(vq, map, index);
> -		rcu_assign_pointer(vq->maps[index], NULL);
> +		vq->maps[index] = NULL;
>  	}
>  	spin_unlock(&vq->mmu_lock);
>  
>  	if (map) {
> -		synchronize_rcu();
> +		vhost_vq_sync_access(vq);

What prevents racing with vhost_vq_access_map_end here?

>  		vhost_map_unprefetch(map);
>  	}
>  }

Overall I don't like it. 

We are trying to get rid of these boutique mmu notifier patterns in
drivers. 

Jason
Jason Wang July 31, 2019, 1:28 p.m. UTC | #3
On 2019/7/31 8:39 PM, Jason Gunthorpe wrote:
> On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
>> We used to use RCU to synchronize MMU notifier with worker. This leads
>> calling synchronize_rcu() in invalidate_range_start(). But on a busy
>> system, there would be many factors that may slow down the
>> synchronize_rcu() which makes it unsuitable to be called in MMU
>> notifier.
>>
>> A solution is SRCU but its overhead is obvious with the expensive full
>> memory barrier. Another choice is to use seqlock, but it doesn't
>> provide a synchronization method between readers and writers. The last
>> choice is to use vq mutex, but it need to deal with the worst case
>> that MMU notifier must be blocked and wait for the finish of swap in.
>>
>> So this patch switches use a counter to track whether or not the map
>> was used. The counter was increased when vq try to start or finish
>> uses the map. This means, when it was even, we're sure there's no
>> readers and MMU notifier is synchronized. When it was odd, it means
>> there's a reader we need to wait it to be even again then we are
>> synchronized.
> You just described a seqlock.


Kind of, see my explanation below.


>
> We've been talking about providing this as some core service from mmu
> notifiers because nearly every use of this API needs it.


That would be very helpful.


>
> IMHO this gets the whole thing backwards, the common pattern is to
> protect the 'shadow pte' data with a seqlock (usually open coded),
> such that the mmu notififer side has the write side of that lock and
> the read side is consumed by the thread accessing or updating the SPTE.


Yes, I've considered something like that. But the problem is, the MMU 
notifier (writer) needs to wait for the vhost worker to finish the read 
before it can do things like setting dirty pages and unmapping pages. It 
looks to me like seqlock doesn't provide anything like this. Or are you 
suggesting taking the writer seq lock in the vhost worker and busy 
waiting for the seqcount to become even in the MMU notifier (something 
similar to what this patch does)? I didn't do this because e.g.:


write_seqcount_begin()

map = vq->map[X]

write or read through map->addr directly

write_seqcount_end()


There's no rmb() in write_seqcount_begin(), so the map could be read 
before write_seqcount_begin(), but it looks to me now that this does no 
harm at all, so maybe we can try this way.
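
Roughly, the idea would look like this (just an untested sketch to make 
the question concrete, assuming a seqcount_t 'seq' is added to struct 
vhost_virtqueue; the barrier placement is exactly what I'm unsure about):

/* vhost worker (single threaded per vq) takes the write side */
static inline void vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
{
        write_seqcount_begin(&vq->seq);
}

static inline void vhost_vq_access_map_end(struct vhost_virtqueue *vq)
{
        write_seqcount_end(&vq->seq);
}

/* MMU notifier waits until no worker is inside the critical section */
static inline void vhost_vq_sync_access(struct vhost_virtqueue *vq)
{
        unsigned int seq;

        smp_mb();
        seq = raw_read_seqcount(&vq->seq);
        if (seq & 1) {
                /* odd: a worker is in the middle of an access, wait
                 * until the counter moves on */
                while (raw_read_seqcount(&vq->seq) == seq)
                        schedule();
        }
        smp_mb();
}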


>
>
>> Reported-by: Michael S. Tsirkin <mst@redhat.com>
>> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>   drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
>>   drivers/vhost/vhost.h |   7 +-
>>   2 files changed, 94 insertions(+), 58 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index cfc11f9ed9c9..db2c81cb1e90 100644
>> +++ b/drivers/vhost/vhost.c
>> @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>>   
>>   	spin_lock(&vq->mmu_lock);
>>   	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>> -		map[i] = rcu_dereference_protected(vq->maps[i],
>> -				  lockdep_is_held(&vq->mmu_lock));
>> +		map[i] = vq->maps[i];
>>   		if (map[i]) {
>>   			vhost_set_map_dirty(vq, map[i], i);
>> -			rcu_assign_pointer(vq->maps[i], NULL);
>> +			vq->maps[i] = NULL;
>>   		}
>>   	}
>>   	spin_unlock(&vq->mmu_lock);
>>   
>> -	/* No need for synchronize_rcu() or kfree_rcu() since we are
>> -	 * serialized with memory accessors (e.g vq mutex held).
>> +	/* No need for synchronization since we are serialized with
>> +	 * memory accessors (e.g vq mutex held).
>>   	 */
>>   
>>   	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>> @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>>   	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>>   }
>>   
>> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
>> +{
>> +	int ref = READ_ONCE(vq->ref);
> Is a lock/single threaded supposed to be held for this?


Yes, only the vhost worker kthread can run this code.


>
>> +
>> +	smp_store_release(&vq->ref, ref + 1);
>> +	/* Make sure ref counter is visible before accessing the map */
>> +	smp_load_acquire(&vq->ref);
> release/acquire semantics are intended to protect blocks of related
> data, so reading something with acquire and throwing away the result
> is nonsense.


Actually I wanted smp_mb() here, so I admit it's a trick that may not 
even work. But now I think I can just use write_seqcount_begin() here.


>
>> +}
>> +
>> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
>> +{
>> +	int ref = READ_ONCE(vq->ref);
> If the write to vq->ref is not locked this algorithm won't work, if it
> is locked the READ_ONCE is not needed.


Yes.


>
>> +	/* Make sure vq access is done before increasing ref counter */
>> +	smp_store_release(&vq->ref, ref + 1);
>> +}
>> +
>> +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
>> +{
>> +	int ref;
>> +
>> +	/* Make sure map change was done before checking ref counter */
>> +	smp_mb();
> This is probably smp_rmb after reading ref, and if you are setting ref
> with smp_store_release then this should be smp_load_acquire() without
> an explicit mb.


We had something like:

spin_lock();

vq->maps[index] = NULL;

spin_unlock();

vhost_vq_sync_access(vq);

We need to make sure the read of ref is done after setting 
vq->maps[index] to NULL. It looks to me that neither smp_load_acquire() 
nor smp_store_release() can help in this case.


>
>> +	ref = READ_ONCE(vq->ref);
>> +	if (ref & 0x1) {
>> +		/* When ref change, we are sure no reader can see
>> +		 * previous map */
>> +		while (READ_ONCE(vq->ref) == ref) {
>> +			set_current_state(TASK_RUNNING);
>> +			schedule();
>> +		}
>> +	}
> This is basically read_seqcount_begin()' with a schedule instead of
> cpu_relax


Yes it is.


>
>
>> +	/* Make sure ref counter was checked before any other
>> +	 * operations that was dene on map. */
>> +	smp_mb();
> should be in a smp_load_acquire()


Right, if we use smp_load_acquire() to load the counter.


>
>> +}
>> +
>>   static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>>   				      int index,
>>   				      unsigned long start,
>> @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>>   	spin_lock(&vq->mmu_lock);
>>   	++vq->invalidate_count;
>>   
>> -	map = rcu_dereference_protected(vq->maps[index],
>> -					lockdep_is_held(&vq->mmu_lock));
>> +	map = vq->maps[index];
>>   	if (map) {
>>   		vhost_set_map_dirty(vq, map, index);
>> -		rcu_assign_pointer(vq->maps[index], NULL);
>> +		vq->maps[index] = NULL;
>>   	}
>>   	spin_unlock(&vq->mmu_lock);
>>   
>>   	if (map) {
>> -		synchronize_rcu();
>> +		vhost_vq_sync_access(vq);
> What prevents racing with vhost_vq_access_map_end here?


vhost_vq_access_map_end() uses smp_store_release() for the counter. Is 
this not sufficient?


>
>>   		vhost_map_unprefetch(map);
>>   	}
>>   }
> Overall I don't like it.
>
> We are trying to get rid of these botique mmu notifier patterns in
> drivers.


I agree. So do you think we can take the write lock in the vhost worker 
and then wait for the counter to become even in the MMU notifier? It 
looks much cleaner than this patch.

Thanks


>
> Jason
Michael S. Tsirkin July 31, 2019, 6:29 p.m. UTC | #4
On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> We used to use RCU to synchronize MMU notifier with worker. This leads
> calling synchronize_rcu() in invalidate_range_start(). But on a busy
> system, there would be many factors that may slow down the
> synchronize_rcu() which makes it unsuitable to be called in MMU
> notifier.
> 
> A solution is SRCU but its overhead is obvious with the expensive full
> memory barrier. Another choice is to use seqlock, but it doesn't
> provide a synchronization method between readers and writers. The last
> choice is to use vq mutex, but it need to deal with the worst case
> that MMU notifier must be blocked and wait for the finish of swap in.
> 
> So this patch switches use a counter to track whether or not the map
> was used. The counter was increased when vq try to start or finish
> uses the map. This means, when it was even, we're sure there's no
> readers and MMU notifier is synchronized. When it was odd, it means
> there's a reader we need to wait it to be even again then we are
> synchronized. To avoid full memory barrier, store_release +
> load_acquire on the counter is used.

Unfortunately this needs a lot of review and testing, so this can't make
rc2, and I don't think this is the kind of patch I can merge after rc3.
Subtle memory barrier tricks like this can introduce new bugs while they
are fixing old ones.





> 
> Consider the read critical section is pretty small the synchronization
> should be done very fast.
> 
> Note the patch lead about 3% PPS dropping.

Sorry what do you mean by this last sentence? This degrades performance
compared to what?

> 
> Reported-by: Michael S. Tsirkin <mst@redhat.com>
> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
>  drivers/vhost/vhost.h |   7 +-
>  2 files changed, 94 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index cfc11f9ed9c9..db2c81cb1e90 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>  
>  	spin_lock(&vq->mmu_lock);
>  	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		map[i] = rcu_dereference_protected(vq->maps[i],
> -				  lockdep_is_held(&vq->mmu_lock));
> +		map[i] = vq->maps[i];
>  		if (map[i]) {
>  			vhost_set_map_dirty(vq, map[i], i);
> -			rcu_assign_pointer(vq->maps[i], NULL);
> +			vq->maps[i] = NULL;
>  		}
>  	}
>  	spin_unlock(&vq->mmu_lock);
>  
> -	/* No need for synchronize_rcu() or kfree_rcu() since we are
> -	 * serialized with memory accessors (e.g vq mutex held).
> +	/* No need for synchronization since we are serialized with
> +	 * memory accessors (e.g vq mutex held).
>  	 */
>  
>  	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>  	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>  }
>  
> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> +{
> +	int ref = READ_ONCE(vq->ref);
> +
> +	smp_store_release(&vq->ref, ref + 1);
> +	/* Make sure ref counter is visible before accessing the map */
> +	smp_load_acquire(&vq->ref);

The map access is after this sequence, correct?

Just going by the rules in Documentation/memory-barriers.txt,
I think that this pair will not order following accesses with ref store.

Documentation/memory-barriers.txt says:


+     In addition, a RELEASE+ACQUIRE
+     pair is -not- guaranteed to act as a full memory barrier.



The guarantee that is made is this: after an ACQUIRE on a given
variable, all memory accesses preceding any prior RELEASE on that same
variable are guaranteed to be visible.


And if we also had the reverse rule we'd end up with a full barrier,
wouldn't we?

Cc Paul in case I missed something here. And if I'm right,
maybe we should call this out, adding

	"The opposite is not true: a prior RELEASE is not
	 guaranteed to be visible before memory accesses following
	 the subsequent ACQUIRE".
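
To make this concrete, here is an illustrative store-buffering style
sketch of the concern (simplified, not the exact vhost code; in the real
patch the notifier's store happens under mmu_lock):

CPU0 (vhost worker)                     CPU1 (MMU notifier)
-------------------                     -------------------
smp_store_release(&vq->ref, 1);         WRITE_ONCE(vq->maps[i], NULL);
smp_load_acquire(&vq->ref);             smp_mb();
r0 = READ_ONCE(vq->maps[i]);            r1 = READ_ONCE(vq->ref);

The outcome we must forbid is r0 == old map && r1 == 0, i.e. the
notifier sees an even counter and frees the map while the worker still
dereferences it. Forbidding that needs StoreLoad ordering on CPU0 (a
full barrier), and the RELEASE+ACQUIRE pair above is not documented to
provide it.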



> +}
> +
> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> +{
> +	int ref = READ_ONCE(vq->ref);
> +
> +	/* Make sure vq access is done before increasing ref counter */
> +	smp_store_release(&vq->ref, ref + 1);
> +}
> +
> +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
> +{
> +	int ref;
> +
> +	/* Make sure map change was done before checking ref counter */
> +	smp_mb();
> +
> +	ref = READ_ONCE(vq->ref);
> +	if (ref & 0x1) {

Please document the even/odd trick here too, not just in the commit log.

> +		/* When ref change,

changes

> we are sure no reader can see
> +		 * previous map */
> +		while (READ_ONCE(vq->ref) == ref) {


What is the line below in aid of?

> +			set_current_state(TASK_RUNNING);
> +			schedule();

                        if (need_resched())
                                schedule();

?

> +		}

On a preemptible kernel, there's a risk here that a task gets
preempted with an odd ref.
So I suspect we'll have to disable preemption while we
keep ref odd.
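
E.g. something along these lines (an untested sketch, just to show where
preemption would be disabled; which barriers to use is a separate
question):

static inline void vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
{
        /* ref stays odd only while we cannot be scheduled out */
        preempt_disable();
        WRITE_ONCE(vq->ref, vq->ref + 1);
        smp_mb();
}

static inline void vhost_vq_access_map_end(struct vhost_virtqueue *vq)
{
        smp_mb();       /* order map accesses before making ref even */
        WRITE_ONCE(vq->ref, vq->ref + 1);
        preempt_enable();
}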


> +	}
> +	/* Make sure ref counter was checked before any other
> +	 * operations that was dene on map. */

was dene -> were done?

> +	smp_mb();
> +}
> +
>  static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>  				      int index,
>  				      unsigned long start,
> @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>  	spin_lock(&vq->mmu_lock);
>  	++vq->invalidate_count;
>  
> -	map = rcu_dereference_protected(vq->maps[index],
> -					lockdep_is_held(&vq->mmu_lock));
> +	map = vq->maps[index];
>  	if (map) {
>  		vhost_set_map_dirty(vq, map, index);
> -		rcu_assign_pointer(vq->maps[index], NULL);
> +		vq->maps[index] = NULL;
>  	}
>  	spin_unlock(&vq->mmu_lock);
>  
>  	if (map) {
> -		synchronize_rcu();
> +		vhost_vq_sync_access(vq);
>  		vhost_map_unprefetch(map);
>  	}
>  }
> @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev)
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		vq = dev->vqs[i];
>  		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> -			RCU_INIT_POINTER(vq->maps[j], NULL);
> +			vq->maps[j] = NULL;
>  	}
>  }
>  #endif
> @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->indirect = NULL;
>  		vq->heads = NULL;
>  		vq->dev = dev;
> +		vq->ref = 0;
>  		mutex_init(&vq->mutex);
>  		spin_lock_init(&vq->mmu_lock);
>  		vhost_vq_reset(dev, vq);
> @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>  	map->npages = npages;
>  	map->pages = pages;
>  
> -	rcu_assign_pointer(vq->maps[index], map);
> +	vq->maps[index] = map;
>  	/* No need for a synchronize_rcu(). This function should be
>  	 * called by dev->worker so we are serialized with all
>  	 * readers.
> @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>  	struct vring_used *used;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>  		if (likely(map)) {
>  			used = map->addr;
>  			*((__virtio16 *)&used->ring[vq->num]) =
>  				cpu_to_vhost16(vq, vq->avail_idx);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  	size_t size;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>  		if (likely(map)) {
>  			used = map->addr;
>  			size = count * sizeof(*head);
>  			memcpy(used->ring + idx, head, size);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  	struct vring_used *used;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>  		if (likely(map)) {
>  			used = map->addr;
>  			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  	struct vring_used *used;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>  		if (likely(map)) {
>  			used = map->addr;
>  			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>  	struct vring_avail *avail;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>  		if (likely(map)) {
>  			avail = map->addr;
>  			*idx = avail->idx;
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  	struct vring_avail *avail;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>  		if (likely(map)) {
>  			avail = map->addr;
>  			*head = avail->ring[idx & (vq->num - 1)];
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>  	struct vring_avail *avail;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>  		if (likely(map)) {
>  			avail = map->addr;
>  			*flags = avail->flags;
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>  	struct vring_avail *avail;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> +		vhost_vq_access_map_begin(vq);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
>  		if (likely(map)) {
>  			avail = map->addr;
>  			*event = (__virtio16)avail->ring[vq->num];
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>  	struct vring_used *used;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> +		map = vq->maps[VHOST_ADDR_USED];
>  		if (likely(map)) {
>  			used = map->addr;
>  			*idx = used->idx;
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>  	struct vring_desc *d;
>  
>  	if (!vq->iotlb) {
> -		rcu_read_lock();
> +		vhost_vq_access_map_begin(vq);
>  
> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
> +		map = vq->maps[VHOST_ADDR_DESC];
>  		if (likely(map)) {
>  			d = map->addr;
>  			*desc = *(d + idx);
> -			rcu_read_unlock();
> +			vhost_vq_access_map_end(vq);
>  			return 0;
>  		}
>  
> -		rcu_read_unlock();
> +		vhost_vq_access_map_end(vq);
>  	}
>  #endif
>  
> @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
>  #if VHOST_ARCH_CAN_ACCEL_UACCESS
>  static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
>  {
> -	struct vhost_map __rcu *map;
> +	struct vhost_map *map;
>  	int i;
>  
>  	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> -		rcu_read_lock();
> -		map = rcu_dereference(vq->maps[i]);
> -		rcu_read_unlock();
> +		map = vq->maps[i];
>  		if (unlikely(!map))
>  			vhost_map_prefetch(vq, i);
>  	}
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index a9a2a93857d2..f9e9558a529d 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -115,16 +115,17 @@ struct vhost_virtqueue {
>  #if VHOST_ARCH_CAN_ACCEL_UACCESS
>  	/* Read by memory accessors, modified by meta data
>  	 * prefetching, MMU notifier and vring ioctl().
> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
> -	 * and readers).
> +	 * Synchonrized through mmu_lock (writers) and ref counters,
> +	 * see vhost_vq_access_map_begin()/vhost_vq_access_map_end().
>  	 */
> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
> +	struct vhost_map *maps[VHOST_NUM_ADDRS];
>  	/* Read by MMU notifier, modified by vring ioctl(),
>  	 * synchronized through MMU notifier
>  	 * registering/unregistering.
>  	 */
>  	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>  #endif
> +	int ref;

Is it important that this is signed? If not I'd make it unsigned here:
even though the kernel is compiled so that signed overflow wraps (2s
complement), it seems cleaner not to depend on that.

>  	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>  
>  	struct file *kick;
> -- 
> 2.18.1
Jason Gunthorpe July 31, 2019, 7:30 p.m. UTC | #5
On Wed, Jul 31, 2019 at 09:28:20PM +0800, Jason Wang wrote:
> 
> On 2019/7/31 8:39 PM, Jason Gunthorpe wrote:
> > On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> > > We used to use RCU to synchronize MMU notifier with worker. This leads
> > > calling synchronize_rcu() in invalidate_range_start(). But on a busy
> > > system, there would be many factors that may slow down the
> > > synchronize_rcu() which makes it unsuitable to be called in MMU
> > > notifier.
> > > 
> > > A solution is SRCU but its overhead is obvious with the expensive full
> > > memory barrier. Another choice is to use seqlock, but it doesn't
> > > provide a synchronization method between readers and writers. The last
> > > choice is to use vq mutex, but it need to deal with the worst case
> > > that MMU notifier must be blocked and wait for the finish of swap in.
> > > 
> > > So this patch switches use a counter to track whether or not the map
> > > was used. The counter was increased when vq try to start or finish
> > > uses the map. This means, when it was even, we're sure there's no
> > > readers and MMU notifier is synchronized. When it was odd, it means
> > > there's a reader we need to wait it to be even again then we are
> > > synchronized.
> > You just described a seqlock.
> 
> 
> Kind of, see my explanation below.
> 
> 
> > 
> > We've been talking about providing this as some core service from mmu
> > notifiers because nearly every use of this API needs it.
> 
> 
> That would be very helpful.
> 
> 
> > 
> > IMHO this gets the whole thing backwards, the common pattern is to
> > protect the 'shadow pte' data with a seqlock (usually open coded),
> > such that the mmu notififer side has the write side of that lock and
> > the read side is consumed by the thread accessing or updating the SPTE.
> 
> 
> Yes, I've considered something like that. But the problem is, mmu notifier
> (writer) need to wait for the vhost worker to finish the read before it can
> do things like setting dirty pages and unmapping page.  It looks to me
> seqlock doesn't provide things like this.  

The seqlock is usually used to prevent a 2nd thread from accessing the
VA while it is being changed by the mm, i.e. you use something seqlocky
instead of the ugly mmu_notifier_unregister/register cycle.

You are supposed to use something simple like a spinlock or mutex
inside invalidate_range_start to serialize teardown of the SPTEs with
their accessors.
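
For example (an illustrative fragment only, loosely reusing the names
from this patch):

/* accessor, e.g. in the vhost worker */
spin_lock(&vq->mmu_lock);
map = vq->maps[index];
if (map) {
        used = map->addr;
        used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
}
spin_unlock(&vq->mmu_lock);

/* invalidate_range_start() */
spin_lock(&vq->mmu_lock);
map = vq->maps[index];
vq->maps[index] = NULL;
spin_unlock(&vq->mmu_lock);
/* once the lock is dropped no accessor can still be using 'map' */
if (map)
        vhost_map_unprefetch(map);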

> write_seqcount_begin()
> 
> map = vq->map[X]
> 
> write or read through map->addr directly
> 
> write_seqcount_end()
> 
> 
> There's no rmb() in write_seqcount_begin(), so map could be read before
> write_seqcount_begin(), but it looks to me now that this doesn't harm at
> all, maybe we can try this way.

That is because it is a write side lock, not a read lock. IIRC
seqlocks have weaker barriers because the write side needs to be
serialized in some other way.

The requirement I see is you need invalidate_range_start to block
until another thread exits its critical section (ie stops accessing
the SPTEs). 

That is a spinlock/mutex.

You just can't invent a faster spinlock by open coding something with
barriers, it doesn't work.

Jason
Jason Wang Aug. 1, 2019, 5:02 a.m. UTC | #6
On 2019/8/1 3:30 AM, Jason Gunthorpe wrote:
> On Wed, Jul 31, 2019 at 09:28:20PM +0800, Jason Wang wrote:
>> On 2019/7/31 8:39 PM, Jason Gunthorpe wrote:
>>> On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
>>>> We used to use RCU to synchronize MMU notifier with worker. This leads
>>>> calling synchronize_rcu() in invalidate_range_start(). But on a busy
>>>> system, there would be many factors that may slow down the
>>>> synchronize_rcu() which makes it unsuitable to be called in MMU
>>>> notifier.
>>>>
>>>> A solution is SRCU but its overhead is obvious with the expensive full
>>>> memory barrier. Another choice is to use seqlock, but it doesn't
>>>> provide a synchronization method between readers and writers. The last
>>>> choice is to use vq mutex, but it need to deal with the worst case
>>>> that MMU notifier must be blocked and wait for the finish of swap in.
>>>>
>>>> So this patch switches use a counter to track whether or not the map
>>>> was used. The counter was increased when vq try to start or finish
>>>> uses the map. This means, when it was even, we're sure there's no
>>>> readers and MMU notifier is synchronized. When it was odd, it means
>>>> there's a reader we need to wait it to be even again then we are
>>>> synchronized.
>>> You just described a seqlock.
>>
>> Kind of, see my explanation below.
>>
>>
>>> We've been talking about providing this as some core service from mmu
>>> notifiers because nearly every use of this API needs it.
>>
>> That would be very helpful.
>>
>>
>>> IMHO this gets the whole thing backwards, the common pattern is to
>>> protect the 'shadow pte' data with a seqlock (usually open coded),
>>> such that the mmu notififer side has the write side of that lock and
>>> the read side is consumed by the thread accessing or updating the SPTE.
>>
>> Yes, I've considered something like that. But the problem is, mmu notifier
>> (writer) need to wait for the vhost worker to finish the read before it can
>> do things like setting dirty pages and unmapping page.  It looks to me
>> seqlock doesn't provide things like this.
> The seqlock is usually used to prevent a 2nd thread from accessing the
> VA while it is being changed by the mm. ie you use something seqlocky
> instead of the ugly mmu_notifier_unregister/register cycle.


Yes, so we have two mappings:

[1] vring address to VA
[2] VA to PA

And have several readers and writers

1) set_vring_num_addr(): writer of both [1] and [2]
2) MMU notifier: reader of [1] writer of [2]
3) GUP: reader of [1] writer of [2]
4) memory accessors: reader of [1] and [2]

Fortunately, 1), 3) and 4) are already synchronized through vq->mutex. 
We only need to deal with synchronization between 2) and each of the rest:

Sync between 1) and 2): For mapping [1], I do 
mmu_notifier_unregister/register. This helps to avoid holding any lock 
while doing the overlap check. Anyway, we only care about one to three 
pages, but the whole guest memory could be several TBs. For mapping [2], 
both 1) and 2) are writers, so a spinlock (mmu_lock) is used to synchronize.

Sync between 2) and 3): For mapping [1], both are readers, so no 
synchronization is needed. For mapping [2], both 2) and 3) are writers, 
so they synchronize through the spinlock (mmu_lock).

Sync between 2) and 4): For mapping [1], both are readers, so no 
synchronization is needed. For mapping [2], they synchronize through RCU 
(or something similar to seqlock).

Your suggestion is about the synchronization of [1], which may make sense, 
but it could be done on top as an optimization. What this patch tries to 
do is to not use RCU for [2]. Of course, the simplest way is to use the vq 
mutex in 2), but that means:
- we must hold the vq lock to check range overlap
- since the critical section grows, the worst case is waiting for guest 
memory to be swapped in, which could be even slower than 
synchronize_rcu().


>
> You are supposed to use something simple like a spinlock or mutex
> inside the invalidate_range_start to serialized tear down of the SPTEs
> with their accessors.


Technically yes, but we probably can't afford that in the vhost fast path: 
the atomics eliminate almost all of the performance improvement brought by 
this patch on a machine without SMAP.


>
>> write_seqcount_begin()
>>
>> map = vq->map[X]
>>
>> write or read through map->addr directly
>>
>> write_seqcount_end()
>>
>>
>> There's no rmb() in write_seqcount_begin(), so map could be read before
>> write_seqcount_begin(), but it looks to me now that this doesn't harm at
>> all, maybe we can try this way.
> That is because it is a write side lock, not a read lock. IIRC
> seqlocks have weaker barriers because the write side needs to be
> serialized in some other way.


Yes. Having thought hard about the code, it looks to me like 
write_seqcount_begin()/end() is sufficient here:

- The notifier will only assign NULL to the map, so it does no harm to 
read the map before the seq; we will just fall back to the normal 
copy_from/to_user() slow path earlier.
- If we write through map->addr, it should be done before increasing the 
seqcount because of the smp_wmb() in write_seqcount_end().
- If we read through map->addr, which also involves a store to a pointer, 
we have a good data dependency, so the smp_wmb() also works here.


>
> The requirement I see is you need invalidate_range_start to block
> until another thread exits its critical section (ie stops accessing
> the SPTEs).


Yes.


>
> That is a spinlock/mutex.


Or semantics similar to RCU.


>
> You just can't invent a faster spinlock by open coding something with
> barriers, it doesn't work.
>
> Jason


If write_seqlock() works here, we can simply wait for the seqcount to 
advance in the MMU notifier. The original idea was to use RCU, which 
solves this perfectly, but as pointed out it could be slow.

Thanks
Jason Wang Aug. 1, 2019, 8:06 a.m. UTC | #7
On 2019/8/1 2:29 AM, Michael S. Tsirkin wrote:
> On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
>> We used to use RCU to synchronize MMU notifier with worker. This leads
>> calling synchronize_rcu() in invalidate_range_start(). But on a busy
>> system, there would be many factors that may slow down the
>> synchronize_rcu() which makes it unsuitable to be called in MMU
>> notifier.
>>
>> A solution is SRCU but its overhead is obvious with the expensive full
>> memory barrier. Another choice is to use seqlock, but it doesn't
>> provide a synchronization method between readers and writers. The last
>> choice is to use vq mutex, but it need to deal with the worst case
>> that MMU notifier must be blocked and wait for the finish of swap in.
>>
>> So this patch switches use a counter to track whether or not the map
>> was used. The counter was increased when vq try to start or finish
>> uses the map. This means, when it was even, we're sure there's no
>> readers and MMU notifier is synchronized. When it was odd, it means
>> there's a reader we need to wait it to be even again then we are
>> synchronized. To avoid full memory barrier, store_release +
>> load_acquire on the counter is used.
>
> Unfortunately this needs a lot of review and testing, so this can't make
> rc2, and I don't think this is the kind of patch I can merge after rc3.
> Subtle memory barrier tricks like this can introduce new bugs while they
> are fixing old ones.

I admit the patch is tricky. Some questions:

- Must we address the case of e.g. swap-in? If not, a simple
  vhost_work_flush() instead of synchronize_rcu() may work.
- Having thought about it some more, I think we can use seqlock: it
  looks to me the smp_wmb() in write_seqcount_begin() is sufficient, we
  don't care if vq->map is read before the smp_wmb(), and for the rest
  we have a good data dependency, so the smp_wmb() in
  write_seqcount_end() is sufficient.

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index db2c81cb1e90..6d9501303258 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -363,39 +363,29 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
 
 static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
 {
-	int ref = READ_ONCE(vq->ref);
-
-	smp_store_release(&vq->ref, ref + 1);
-	/* Make sure ref counter is visible before accessing the map */
-	smp_load_acquire(&vq->ref);
+	write_seqcount_begin(&vq->seq);
 }
 
 static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
 {
-	int ref = READ_ONCE(vq->ref);
-
-	/* Make sure vq access is done before increasing ref counter */
-	smp_store_release(&vq->ref, ref + 1);
+	write_seqcount_end(&vq->seq);
 }
 
 static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
 {
-	int ref;
+	unsigned int ret;
 
 	/* Make sure map change was done before checking ref counter */
 	smp_mb();
-
-	ref = READ_ONCE(vq->ref);
-	if (ref & 0x1) {
-		/* When ref change, we are sure no reader can see
+	ret = raw_read_seqcount(&vq->seq);
+	if (ret & 0x1) {
+		/* When seq changes, we are sure no reader can see
 		 * previous map */
-		while (READ_ONCE(vq->ref) == ref) {
-			set_current_state(TASK_RUNNING);
+		while (raw_read_seqcount(&vq->seq) == ret)
 			schedule();
-		}
 	}
-	/* Make sure ref counter was checked before any other
-	 * operations that was dene on map. */
+	/* Make sure seq was checked before any other operations that
+	 * was dene on map. */
 	smp_mb();
 }
 
@@ -691,7 +681,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->indirect = NULL;
 		vq->heads = NULL;
 		vq->dev = dev;
-		vq->ref = 0;
+		seqcount_init(&vq->seq);
 		mutex_init(&vq->mutex);
 		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3d10da0ae511..1a705e181a84 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -125,7 +125,7 @@ struct vhost_virtqueue {
 	 */
 	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
 #endif
-	int ref;
+	seqcount_t seq;
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 
 	struct file *kick;
Jason Gunthorpe Aug. 1, 2019, 2:15 p.m. UTC | #8
On Thu, Aug 01, 2019 at 01:02:18PM +0800, Jason Wang wrote:
> 
> On 2019/8/1 3:30 AM, Jason Gunthorpe wrote:
> > On Wed, Jul 31, 2019 at 09:28:20PM +0800, Jason Wang wrote:
> > On 2019/7/31 8:39 PM, Jason Gunthorpe wrote:
> > > > On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> > > > > We used to use RCU to synchronize MMU notifier with worker. This leads
> > > > > calling synchronize_rcu() in invalidate_range_start(). But on a busy
> > > > > system, there would be many factors that may slow down the
> > > > > synchronize_rcu() which makes it unsuitable to be called in MMU
> > > > > notifier.
> > > > > 
> > > > > A solution is SRCU but its overhead is obvious with the expensive full
> > > > > memory barrier. Another choice is to use seqlock, but it doesn't
> > > > > provide a synchronization method between readers and writers. The last
> > > > > choice is to use vq mutex, but it need to deal with the worst case
> > > > > that MMU notifier must be blocked and wait for the finish of swap in.
> > > > > 
> > > > > So this patch switches use a counter to track whether or not the map
> > > > > was used. The counter was increased when vq try to start or finish
> > > > > uses the map. This means, when it was even, we're sure there's no
> > > > > readers and MMU notifier is synchronized. When it was odd, it means
> > > > > there's a reader we need to wait it to be even again then we are
> > > > > synchronized.
> > > > You just described a seqlock.
> > > 
> > > Kind of, see my explanation below.
> > > 
> > > 
> > > > We've been talking about providing this as some core service from mmu
> > > > notifiers because nearly every use of this API needs it.
> > > 
> > > That would be very helpful.
> > > 
> > > 
> > > > IMHO this gets the whole thing backwards, the common pattern is to
> > > > protect the 'shadow pte' data with a seqlock (usually open coded),
> > > > such that the mmu notififer side has the write side of that lock and
> > > > the read side is consumed by the thread accessing or updating the SPTE.
> > > 
> > > Yes, I've considered something like that. But the problem is, mmu notifier
> > > (writer) need to wait for the vhost worker to finish the read before it can
> > > do things like setting dirty pages and unmapping page.  It looks to me
> > > seqlock doesn't provide things like this.
> > The seqlock is usually used to prevent a 2nd thread from accessing the
> > VA while it is being changed by the mm. ie you use something seqlocky
> > instead of the ugly mmu_notifier_unregister/register cycle.
> 
> 
> Yes, so we have two mappings:
> 
> [1] vring address to VA
> [2] VA to PA
> 
> And have several readers and writers
> 
> 1) set_vring_num_addr(): writer of both [1] and [2]
> 2) MMU notifier: reader of [1] writer of [2]
> 3) GUP: reader of [1] writer of [2]
> 4) memory accessors: reader of [1] and [2]
> 
> Fortunately, 1) 3) and 4) have already synchronized through vq->mutex. We
> only need to deal with synchronization between 2) and each of the reset:
> Sync between 1) and 2): For mapping [1], I do
> mmu_notifier_unregister/register. This help to avoid holding any lock to do
> overlap check.

I suspect you could have done this with an RCU technique instead of
register/unregister.

> Sync between 2) and 4): For mapping [1], both are readers, no need any
> synchronization. For mapping [2], synchronize through RCU (or something
> simliar to seqlock).

You can't really use a seqlock: seqlocks are collision-retry locks,
and the semantic here is that invalidate_range_start *MUST* not
continue until the thread doing #4 above is guaranteed to no longer be
touching the memory.
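
For illustration, the plain seqcount read side is just a retry loop -- a
minimal sketch (not code from any posted patch), assuming the vq->seq
seqcount_t from your proposal:

	struct vring_avail *avail;
	unsigned int seq;
	__virtio16 event;

	do {
		seq = read_seqcount_begin(&vq->seq);
		/* NULL check omitted for brevity */
		avail = vq->maps[VHOST_ADDR_AVAIL]->addr;
		event = (__virtio16)avail->ring[vq->num];
	} while (read_seqcount_retry(&vq->seq, seq));

The retry only makes the reader re-run when it collides with a writer;
nothing here lets invalidate_range_start() wait until the reader has
stopped touching the old map.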

This must be a proper barrier, like a spinlock, mutex, or
synchronize_rcu.

And, again, you can't re-invent a spinlock with open coding and get
something better.

Jason
Jason Wang Aug. 2, 2019, 9:40 a.m. UTC | #9
On 2019/8/1 下午10:15, Jason Gunthorpe wrote:
> On Thu, Aug 01, 2019 at 01:02:18PM +0800, Jason Wang wrote:
>> On 2019/8/1 上午3:30, Jason Gunthorpe wrote:
>>> On Wed, Jul 31, 2019 at 09:28:20PM +0800, Jason Wang wrote:
>>>> On 2019/7/31 下午8:39, Jason Gunthorpe wrote:
>>>>> On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
>>>>>> We used to use RCU to synchronize MMU notifier with worker. This leads
>>>>>> calling synchronize_rcu() in invalidate_range_start(). But on a busy
>>>>>> system, there would be many factors that may slow down the
>>>>>> synchronize_rcu() which makes it unsuitable to be called in MMU
>>>>>> notifier.
>>>>>>
>>>>>> A solution is SRCU but its overhead is obvious with the expensive full
>>>>>> memory barrier. Another choice is to use seqlock, but it doesn't
>>>>>> provide a synchronization method between readers and writers. The last
>>>>>> choice is to use vq mutex, but it need to deal with the worst case
>>>>>> that MMU notifier must be blocked and wait for the finish of swap in.
>>>>>>
>>>>>> So this patch switches use a counter to track whether or not the map
>>>>>> was used. The counter was increased when vq try to start or finish
>>>>>> uses the map. This means, when it was even, we're sure there's no
>>>>>> readers and MMU notifier is synchronized. When it was odd, it means
>>>>>> there's a reader we need to wait it to be even again then we are
>>>>>> synchronized.
>>>>> You just described a seqlock.
>>>> Kind of, see my explanation below.
>>>>
>>>>
>>>>> We've been talking about providing this as some core service from mmu
>>>>> notifiers because nearly every use of this API needs it.
>>>> That would be very helpful.
>>>>
>>>>
>>>>> IMHO this gets the whole thing backwards, the common pattern is to
>>>>> protect the 'shadow pte' data with a seqlock (usually open coded),
>>>>> such that the mmu notififer side has the write side of that lock and
>>>>> the read side is consumed by the thread accessing or updating the SPTE.
>>>> Yes, I've considered something like that. But the problem is, mmu notifier
>>>> (writer) need to wait for the vhost worker to finish the read before it can
>>>> do things like setting dirty pages and unmapping page.  It looks to me
>>>> seqlock doesn't provide things like this.
>>> The seqlock is usually used to prevent a 2nd thread from accessing the
>>> VA while it is being changed by the mm. ie you use something seqlocky
>>> instead of the ugly mmu_notifier_unregister/register cycle.
>>
>> Yes, so we have two mappings:
>>
>> [1] vring address to VA
>> [2] VA to PA
>>
>> And have several readers and writers
>>
>> 1) set_vring_num_addr(): writer of both [1] and [2]
>> 2) MMU notifier: reader of [1] writer of [2]
>> 3) GUP: reader of [1] writer of [2]
>> 4) memory accessors: reader of [1] and [2]
>>
>> Fortunately, 1) 3) and 4) have already synchronized through vq->mutex. We
>> only need to deal with synchronization between 2) and each of the reset:
>> Sync between 1) and 2): For mapping [1], I do
>> mmu_notifier_unregister/register. This help to avoid holding any lock to do
>> overlap check.
> I suspect you could have done this with a RCU technique instead of
> register/unregister.


Probably. But the issue to be addressed by this patch is the 
synchronization between MMU notifier and vhost worker.


>
>> Sync between 2) and 4): For mapping [1], both are readers, no need any
>> synchronization. For mapping [2], synchronize through RCU (or something
>> simliar to seqlock).
> You can't really use a seqlock, seqlocks are collision-retry locks,
> and the semantic here is that invalidate_range_start *MUST* not
> continue until thread doing #4 above is guarenteed no longer touching
> the memory.


Yes, that's the tricky part. For hardware like the CPU, kicking through an 
IPI is sufficient for synchronization. But for a vhost kthread, we need a 
low-overhead synchronization method.


>
> This must be a proper barrier, like a spinlock, mutex, or
> synchronize_rcu.


I started with synchronize_rcu() but both you and Michael raised some 
concerns. Then I tried spinlock and mutex:

1) spinlock: adds lots of overhead on the datapath, which leads to zero 
performance improvement (see the sketch below).

2) SRCU: a full memory barrier is required by srcu_read_lock(), which 
still leaves little performance improvement.

3) mutex: a possible issue is the need to wait for the page to be swapped 
in (is this unacceptable?); another issue is that we need to hold the vq 
lock during the range overlap check.

4) using vhost_flush_work() instead of synchronize_rcu(): still needs to 
wait for swap, but can do the overlap checking without the lock.
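
The spinlock variant in 1) was roughly the following (a sketch from
memory, reusing the patch's vq->mmu_lock; not the exact code that was
benchmarked):

static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
{
	/* Taken on every metadata access in the datapath, which is
	 * where the overhead comes from. */
	spin_lock(&vq->mmu_lock);
}

static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
{
	spin_unlock(&vq->mmu_lock);
}

/* vhost_vq_sync_access() then becomes unnecessary: the MMU notifier
 * already clears vq->maps[] under vq->mmu_lock, which now also excludes
 * any in-flight reader. */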


>
> And, again, you can't re-invent a spinlock with open coding and get
> something better.


So the question is whether waiting for swap is considered unsuitable for 
MMU notifiers. If not, it would simplify the code. If it is, we still 
need to figure out a possible solution.

Btw, I've come up with another idea: disable preemption when the vhost 
thread needs to access the memory. Then register a preempt notifier, and 
if the vhost thread is preempted, we're sure no one will access the 
memory and we can do the cleanup.
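
Something like this is what I have in mind (just a sketch; the notifier
part is only described in the comment):

static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
{
	/* The cached map may only be dereferenced with preemption off. */
	preempt_disable();
}

static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
{
	preempt_enable();
}

/* The MMU notifier side would then wait until the vhost thread has been
 * scheduled out (e.g. signalled from a preempt notifier's sched_out hook)
 * before freeing the old map, instead of calling synchronize_rcu(). */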

Thanks


>
> Jason
Jason Gunthorpe Aug. 2, 2019, 12:46 p.m. UTC | #10
On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > This must be a proper barrier, like a spinlock, mutex, or
> > synchronize_rcu.
> 
> 
> I start with synchronize_rcu() but both you and Michael raise some
> concern.

I've also idly wondered if calling synchronize_rcu() under the various
mm locks is a deadlock situation.

> Then I try spinlock and mutex:
> 
> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> improvement.

I think the topic here is correctness not performance improvement

> 2) SRCU: full memory barrier requires on srcu_read_lock(), which still leads
> little performance improvement
 
> 3) mutex: a possible issue is need to wait for the page to be swapped in (is
> this unacceptable ?), another issue is that we need hold vq lock during
> range overlap check.

I have a feeling that mmu notifiers cannot safely become dependent on
progress of swap without causing deadlock. You probably should avoid
this.

> > And, again, you can't re-invent a spinlock with open coding and get
> > something better.
> 
> So the question is if waiting for swap is considered to be unsuitable for
> MMU notifiers. If not, it would simplify codes. If not, we still need to
> figure out a possible solution.
> 
> Btw, I come up another idea, that is to disable preemption when vhost thread
> need to access the memory. Then register preempt notifier and if vhost
> thread is preempted, we're sure no one will access the memory and can do the
> cleanup.

I think you should use the spinlock so at least the code is obviously
functionally correct and worry about designing some properly justified
performance change after.

Jason
Michael S. Tsirkin Aug. 2, 2019, 2:03 p.m. UTC | #11
On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> Btw, I come up another idea, that is to disable preemption when vhost thread
> need to access the memory. Then register preempt notifier and if vhost
> thread is preempted, we're sure no one will access the memory and can do the
> cleanup.

Great, more notifiers :(

Maybe can live with
1- disable preemption while using the cached pointer
2- teach vhost to recover from memory access failures,
   by switching to regular from/to user path

So if you want to try that, fine since it's a step in
the right direction.

But I think fundamentally it's not what we want to do long term.

It's always been a fundamental problem with this patch series that only
metadata is accessed through a direct pointer.

The difference in ways you handle metadata and data is what is
now coming and messing everything up.

So if continuing the direct map approach,
what is needed is a cache of mapped VM memory, then on a cache miss
we'd queue work along the lines of 1-2 above.

That's one direction to take. Another one is to give up on that and
write our own version of uaccess macros.  Add a "high security" flag to
the vhost module and if not active use these for userspace memory
access.
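
Roughly this sort of thing (just a sketch; vhost_map_lookup() and the
prefetch work are made-up names):

static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
				      __virtio16 *idx)
{
	/* hypothetical cache lookup */
	struct vhost_map *map = vhost_map_lookup(vq, VHOST_ADDR_AVAIL);

	if (likely(map)) {
		/* fast path: VM memory already mapped into the kernel */
		struct vring_avail *avail = map->addr;

		*idx = avail->idx;
		return 0;
	}

	/* cache miss: queue work to (re)build the mapping, as in 1-2
	 * above, and fall back to the userspace access path for now */
	vhost_work_queue(vq->dev, &vq->map_prefetch_work);
	return vhost_get_avail(vq, *idx, &vq->avail->idx);
}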
Michael S. Tsirkin Aug. 2, 2019, 2:27 p.m. UTC | #12
On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > This must be a proper barrier, like a spinlock, mutex, or
> > > synchronize_rcu.
> > 
> > 
> > I start with synchronize_rcu() but both you and Michael raise some
> > concern.
> 
> I've also idly wondered if calling synchronize_rcu() under the various
> mm locks is a deadlock situation.
> 
> > Then I try spinlock and mutex:
> > 
> > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > improvement.
> 
> I think the topic here is correctness not performance improvement

The topic is whether we should revert
commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")

or keep it in. The only reason to keep it is performance.

Now as long as all this code is disabled anyway, we can experiment a
bit.

I personally feel we would be best served by having two code paths:

- Access to VM memory directly mapped into kernel
- Access to userspace


Having it all cleanly split will allow a bunch of optimizations; for
example, for years now we have planned to be able to process an incoming
short packet directly on the softirq path, or an outgoing one directly
within eventfd.
Jason Gunthorpe Aug. 2, 2019, 5:24 p.m. UTC | #13
On Fri, Aug 02, 2019 at 10:27:21AM -0400, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > synchronize_rcu.
> > > 
> > > 
> > > I start with synchronize_rcu() but both you and Michael raise some
> > > concern.
> > 
> > I've also idly wondered if calling synchronize_rcu() under the various
> > mm locks is a deadlock situation.
> > 
> > > Then I try spinlock and mutex:
> > > 
> > > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > > improvement.
> > 
> > I think the topic here is correctness not performance improvement
> 
> The topic is whether we should revert
> commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
> 
> or keep it in. The only reason to keep it is performance.

Yikes, I'm not sure you can ever win against copy_from_user using
mmu_notifiers?  The synchronization requirements are likely always
more expensive unless large and scattered copies are being done..

The rcu is about the only simple approach that could be less
expensive, and that gets back to the question of whether you can block
an invalidate_range_start in synchronize_rcu or not.

So, frankly, I'd revert it until someone could prove the rcu solution is
OK..

BTW, how do you get copy_from_user to work outside a syscall?

Also, why can't this just permanently GUP the pages? In fact, where
does it put_page them anyhow? Worrying that 7f466 adds a get_user page
but does not add a put_page??

Jason
Michael S. Tsirkin Aug. 3, 2019, 9:36 p.m. UTC | #14
On Fri, Aug 02, 2019 at 02:24:18PM -0300, Jason Gunthorpe wrote:
> On Fri, Aug 02, 2019 at 10:27:21AM -0400, Michael S. Tsirkin wrote:
> > On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> > > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > > synchronize_rcu.
> > > > 
> > > > 
> > > > I start with synchronize_rcu() but both you and Michael raise some
> > > > concern.
> > > 
> > > I've also idly wondered if calling synchronize_rcu() under the various
> > > mm locks is a deadlock situation.
> > > 
> > > > Then I try spinlock and mutex:
> > > > 
> > > > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > > > improvement.
> > > 
> > > I think the topic here is correctness not performance improvement
> > 
> > The topic is whether we should revert
> > commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
> > 
> > or keep it in. The only reason to keep it is performance.
> 
> Yikes, I'm not sure you can ever win against copy_from_user using
> mmu_notifiers?

Ever since copy_from_user started playing with flags (for SMAP) and
added speculation barriers there's a chance we can win by accessing
memory through the kernel address.


Another reason would be to access it from e.g. softirq
context. copy_from_user will only work if the
correct mm is active.


> The synchronization requirements are likely always
> more expensive unless large and scattered copies are being done..
> 
> The rcu is about the only simple approach that could be less
> expensive, and that gets back to the question if you can block an
> invalidate_start_range in synchronize_rcu or not..
> 
> So, frankly, I'd revert it until someone could prove the rcu solution is
> OK..

I have it all disabled at compile time, so reverting isn't urgent
anymore. I'll wait a couple more days to decide what's cleanest.

> BTW, how do you get copy_from_user to work outside a syscall?

By switching to the correct mm.
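
I.e. roughly what the vhost worker thread already does today (a sketch):

	struct vring_desc desc;

	/* Worker kthread: adopt the owner's mm so that uaccess works
	 * outside of a syscall issued by the owner task. */
	use_mm(dev->mm);

	if (copy_from_user(&desc, vq->desc + idx, sizeof(desc)))
		vq_err(vq, "Failed to get descriptor\n");

	unuse_mm(dev->mm);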

> 
> Also, why can't this just permanently GUP the pages? In fact, where
> does it put_page them anyhow? Worrying that 7f466 adds a get_user page
> but does not add a put_page??
> 
> Jason
Michael S. Tsirkin Aug. 3, 2019, 9:54 p.m. UTC | #15
On Thu, Aug 01, 2019 at 04:06:13AM -0400, Jason Wang wrote:
> On 2019/8/1 上午2:29, Michael S. Tsirkin wrote:
> > On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> >> We used to use RCU to synchronize MMU notifier with worker. This leads
> >> calling synchronize_rcu() in invalidate_range_start(). But on a busy
> >> system, there would be many factors that may slow down the
> >> synchronize_rcu() which makes it unsuitable to be called in MMU
> >> notifier.
> >>
> >> A solution is SRCU but its overhead is obvious with the expensive full
> >> memory barrier. Another choice is to use seqlock, but it doesn't
> >> provide a synchronization method between readers and writers. The last
> >> choice is to use vq mutex, but it need to deal with the worst case
> >> that MMU notifier must be blocked and wait for the finish of swap in.
> >>
> >> So this patch switches use a counter to track whether or not the map
> >> was used. The counter was increased when vq try to start or finish
> >> uses the map. This means, when it was even, we're sure there's no
> >> readers and MMU notifier is synchronized. When it was odd, it means
> >> there's a reader we need to wait it to be even again then we are
> >> synchronized. To avoid full memory barrier, store_release +
> >> load_acquire on the counter is used.
> >
> > Unfortunately this needs a lot of review and testing, so this can't make
> > rc2, and I don't think this is the kind of patch I can merge after rc3.
> > Subtle memory barrier tricks like this can introduce new bugs while they
> > are fixing old ones.
> 
> I admit the patch is tricky. Some questions:
> 
> - Do we must address the case of e.g swap in? If not, a simple
>   vhost_work_flush() instead of synchronize_rcu() may work.
> - Having some hard thought, I think we can use seqlock, it looks
>   to me smp_wmb() is in write_segcount_begin() is sufficient, we don't
>   care vq->map read before smp_wmb(), and for the other we all have
>   good data devendency so smp_wmb() in the write_seqbegin_end() is
>   sufficient.

If we need an mb in the begin() we can switch to
dependent_ptr_mb. if you need me to fix it up
and repost, let me know.

Why isn't it a problem if the map is
accessed outside the lock?



> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index db2c81cb1e90..6d9501303258 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -363,39 +363,29 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>  
>  static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
>  {
> -	int ref = READ_ONCE(vq->ref);
> -
> -	smp_store_release(&vq->ref, ref + 1);
> -	/* Make sure ref counter is visible before accessing the map */
> -	smp_load_acquire(&vq->ref);
> +	write_seqcount_begin(&vq->seq);
>  }
>  
>  static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
>  {
> -	int ref = READ_ONCE(vq->ref);
> -
> -	/* Make sure vq access is done before increasing ref counter */
> -	smp_store_release(&vq->ref, ref + 1);
> +	write_seqcount_end(&vq->seq);
>  }
>  
>  static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
>  {
> -	int ref;
> +	unsigned int ret;
>  
>  	/* Make sure map change was done before checking ref counter */
>  	smp_mb();
> -
> -	ref = READ_ONCE(vq->ref);
> -	if (ref & 0x1) {
> -		/* When ref change, we are sure no reader can see
> +	ret = raw_read_seqcount(&vq->seq);
> +	if (ret & 0x1) {
> +		/* When seq changes, we are sure no reader can see
>  		 * previous map */
> -		while (READ_ONCE(vq->ref) == ref) {
> -			set_current_state(TASK_RUNNING);
> +		while (raw_read_seqcount(&vq->seq) == ret)
>  			schedule();


So why do we set the state here? And shouldn't we
check need_resched()?


> -		}
>  	}
> -	/* Make sure ref counter was checked before any other
> -	 * operations that was dene on map. */
> +	/* Make sure seq was checked before any other operations that
> +	 * was dene on map. */
>  	smp_mb();
>  }
>  
> @@ -691,7 +681,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->indirect = NULL;
>  		vq->heads = NULL;
>  		vq->dev = dev;
> -		vq->ref = 0;
> +		seqcount_init(&vq->seq);
>  		mutex_init(&vq->mutex);
>  		spin_lock_init(&vq->mmu_lock);
>  		vhost_vq_reset(dev, vq);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 3d10da0ae511..1a705e181a84 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -125,7 +125,7 @@ struct vhost_virtqueue {
>  	 */
>  	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>  #endif
> -	int ref;
> +	seqcount_t seq;
>  	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>  
>  	struct file *kick;
> -- 
> 2.18.1
> 
> >
> >
> >
> >
> >
> >>
> >> Consider the read critical section is pretty small the synchronization
> >> should be done very fast.
> >>
> >> Note the patch lead about 3% PPS dropping.
> >
> > Sorry what do you mean by this last sentence? This degrades performance
> > compared to what?
> 
> Compare to without this patch.

OK is the feature still a performance win? or should we drop it for now?

> >
> >>
> >> Reported-by: Michael S. Tsirkin <mst@redhat.com>
> >> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> >> ---
> >>  drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
> >>  drivers/vhost/vhost.h |   7 +-
> >>  2 files changed, 94 insertions(+), 58 deletions(-)
> >>
> >> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> >> index cfc11f9ed9c9..db2c81cb1e90 100644
> >> --- a/drivers/vhost/vhost.c
> >> +++ b/drivers/vhost/vhost.c
> >> @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> >>  
> >>  	spin_lock(&vq->mmu_lock);
> >>  	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> >> -		map[i] = rcu_dereference_protected(vq->maps[i],
> >> -				  lockdep_is_held(&vq->mmu_lock));
> >> +		map[i] = vq->maps[i];
> >>  		if (map[i]) {
> >>  			vhost_set_map_dirty(vq, map[i], i);
> >> -			rcu_assign_pointer(vq->maps[i], NULL);
> >> +			vq->maps[i] = NULL;
> >>  		}
> >>  	}
> >>  	spin_unlock(&vq->mmu_lock);
> >>  
> >> -	/* No need for synchronize_rcu() or kfree_rcu() since we are
> >> -	 * serialized with memory accessors (e.g vq mutex held).
> >> +	/* No need for synchronization since we are serialized with
> >> +	 * memory accessors (e.g vq mutex held).
> >>  	 */
> >>  
> >>  	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> >> @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> >>  	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> >>  }
> >>  
> >> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> >> +{
> >> +	int ref = READ_ONCE(vq->ref);
> >> +
> >> +	smp_store_release(&vq->ref, ref + 1);
> >> +	/* Make sure ref counter is visible before accessing the map */
> >> +	smp_load_acquire(&vq->ref);
> >
> > The map access is after this sequence, correct?
> 
> Yes.
> 
> >
> > Just going by the rules in Documentation/memory-barriers.txt,
> > I think that this pair will not order following accesses with ref store.
> >
> > Documentation/memory-barriers.txt says:
> >
> >
> > +     In addition, a RELEASE+ACQUIRE
> > +     pair is -not- guaranteed to act as a full memory barrier.
> >
> >
> >
> > The guarantee that is made is this:
> > 	after
> >      an ACQUIRE on a given variable, all memory accesses preceding any prior
> >      RELEASE on that same variable are guaranteed to be visible.
> 
> Yes, but it's not clear about the order of ACQUIRE the same location
> of previous RELEASE. And it only has a example like:
> 
> "
> 	*A = a;
> 	RELEASE M
> 	ACQUIRE N
> 	*B = b;
> 
> could occur as:
> 
> 	ACQUIRE N, STORE *B, STORE *A, RELEASE M
> "
> 
> But it doesn't explain what happen when
> 
> *A = a
> RELEASE M
> ACQUIRE M
> *B = b;
> 
> And tools/memory-model/Documentation said
> 
> "
> First, when a lock-acquire reads from a lock-release, the LKMM
> requires that every instruction po-before the lock-release must
> execute before any instruction po-after the lock-acquire.
> "
> 
> Is this a hint that I was correct?

I don't think it's correct since by this logic
memory barriers can be nops on x86.

> >
> >
> > And if we also had the reverse rule we'd end up with a full barrier,
> > won't we?
> >
> > Cc Paul in case I missed something here. And if I'm right,
> > maybe we should call this out, adding
> >
> > 	"The opposite is not true: a prior RELEASE is not
> > 	 guaranteed to be visible before memory accesses following
> > 	 the subsequent ACQUIRE".
> 
> That kinds of violates the RELEASE?
> 
> "
>      This also acts as a one-way permeable barrier.  It guarantees that all
>      memory operations before the RELEASE operation will appear to happen
>      before the RELEASE operation with respect to the other components of the
> "


yes but we are talking about RELEASE itself versus stuff
that comes after it.

> >
> >
> >
> >> +}
> >> +
> >> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> >> +{
> >> +	int ref = READ_ONCE(vq->ref);
> >> +
> >> +	/* Make sure vq access is done before increasing ref counter */
> >> +	smp_store_release(&vq->ref, ref + 1);
> >> +}
> >> +
> >> +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
> >> +{
> >> +	int ref;
> >> +
> >> +	/* Make sure map change was done before checking ref counter */
> >> +	smp_mb();
> >> +
> >> +	ref = READ_ONCE(vq->ref);
> >> +	if (ref & 0x1) {
> >
> > Please document the even/odd trick here too, not just in the commit log.
> >
> 
> Ok.
> 
> >> +		/* When ref change,
> >
> > changes
> >
> >> we are sure no reader can see
> >> +		 * previous map */
> >> +		while (READ_ONCE(vq->ref) == ref) {
> >
> >
> > what is the below line in aid of?
> >
> >> +			set_current_state(TASK_RUNNING);

any answers here?

> >> +			schedule();
> >
> >                         if (need_resched())
> >                                 schedule();
> >
> > ?
> 
> Yes, better.
> 
> >
> >> +		}
> >
> > On an interruptible kernel, there's a risk here is that
> > a task got preempted with an odd ref.
> > So I suspect we'll have to disable preemption when we
> > make ref odd.
> 
> I'm not sure I get, if the odd is not the original value we read,
> we're sure it won't read the new map here I believe.

But we will spin for a very long time in this case.

> >
> >
> >> +	}
> >> +	/* Make sure ref counter was checked before any other
> >> +	 * operations that was dene on map. */
> >
> > was dene -> were done?
> >
> 
> Yes.
> 
> >> +	smp_mb();
> >> +}
> >> +
> >>  static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> >>  				      int index,
> >>  				      unsigned long start,
> >> @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> >>  	spin_lock(&vq->mmu_lock);
> >>  	++vq->invalidate_count;
> >>  
> >> -	map = rcu_dereference_protected(vq->maps[index],
> >> -					lockdep_is_held(&vq->mmu_lock));
> >> +	map = vq->maps[index];
> >>  	if (map) {
> >>  		vhost_set_map_dirty(vq, map, index);
> >> -		rcu_assign_pointer(vq->maps[index], NULL);
> >> +		vq->maps[index] = NULL;
> >>  	}
> >>  	spin_unlock(&vq->mmu_lock);
> >>  
> >>  	if (map) {
> >> -		synchronize_rcu();
> >> +		vhost_vq_sync_access(vq);
> >>  		vhost_map_unprefetch(map);
> >>  	}
> >>  }
> >> @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev)
> >>  	for (i = 0; i < dev->nvqs; ++i) {
> >>  		vq = dev->vqs[i];
> >>  		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> >> -			RCU_INIT_POINTER(vq->maps[j], NULL);
> >> +			vq->maps[j] = NULL;
> >>  	}
> >>  }
> >>  #endif
> >> @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev,
> >>  		vq->indirect = NULL;
> >>  		vq->heads = NULL;
> >>  		vq->dev = dev;
> >> +		vq->ref = 0;
> >>  		mutex_init(&vq->mutex);
> >>  		spin_lock_init(&vq->mmu_lock);
> >>  		vhost_vq_reset(dev, vq);
> >> @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> >>  	map->npages = npages;
> >>  	map->pages = pages;
> >>  
> >> -	rcu_assign_pointer(vq->maps[index], map);
> >> +	vq->maps[index] = map;
> >>  	/* No need for a synchronize_rcu(). This function should be
> >>  	 * called by dev->worker so we are serialized with all
> >>  	 * readers.
> >> @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
> >>  	struct vring_used *used;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> >> +		map = vq->maps[VHOST_ADDR_USED];
> >>  		if (likely(map)) {
> >>  			used = map->addr;
> >>  			*((__virtio16 *)&used->ring[vq->num]) =
> >>  				cpu_to_vhost16(vq, vq->avail_idx);
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
> >>  	size_t size;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> >> +		map = vq->maps[VHOST_ADDR_USED];
> >>  		if (likely(map)) {
> >>  			used = map->addr;
> >>  			size = count * sizeof(*head);
> >>  			memcpy(used->ring + idx, head, size);
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
> >>  	struct vring_used *used;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> >> +		map = vq->maps[VHOST_ADDR_USED];
> >>  		if (likely(map)) {
> >>  			used = map->addr;
> >>  			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
> >>  	struct vring_used *used;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> >> +		map = vq->maps[VHOST_ADDR_USED];
> >>  		if (likely(map)) {
> >>  			used = map->addr;
> >>  			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
> >>  	struct vring_avail *avail;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> >> +		map = vq->maps[VHOST_ADDR_AVAIL];
> >>  		if (likely(map)) {
> >>  			avail = map->addr;
> >>  			*idx = avail->idx;
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
> >>  	struct vring_avail *avail;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> >> +		map = vq->maps[VHOST_ADDR_AVAIL];
> >>  		if (likely(map)) {
> >>  			avail = map->addr;
> >>  			*head = avail->ring[idx & (vq->num - 1)];
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
> >>  	struct vring_avail *avail;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> >> +		map = vq->maps[VHOST_ADDR_AVAIL];
> >>  		if (likely(map)) {
> >>  			avail = map->addr;
> >>  			*flags = avail->flags;
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
> >>  	struct vring_avail *avail;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
> >> +		vhost_vq_access_map_begin(vq);
> >> +		map = vq->maps[VHOST_ADDR_AVAIL];
> >>  		if (likely(map)) {
> >>  			avail = map->addr;
> >>  			*event = (__virtio16)avail->ring[vq->num];
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
> >>  	struct vring_used *used;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
> >> +		map = vq->maps[VHOST_ADDR_USED];
> >>  		if (likely(map)) {
> >>  			used = map->addr;
> >>  			*idx = used->idx;
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
> >>  	struct vring_desc *d;
> >>  
> >>  	if (!vq->iotlb) {
> >> -		rcu_read_lock();
> >> +		vhost_vq_access_map_begin(vq);
> >>  
> >> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
> >> +		map = vq->maps[VHOST_ADDR_DESC];
> >>  		if (likely(map)) {
> >>  			d = map->addr;
> >>  			*desc = *(d + idx);
> >> -			rcu_read_unlock();
> >> +			vhost_vq_access_map_end(vq);
> >>  			return 0;
> >>  		}
> >>  
> >> -		rcu_read_unlock();
> >> +		vhost_vq_access_map_end(vq);
> >>  	}
> >>  #endif
> >>  
> >> @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
> >>  #if VHOST_ARCH_CAN_ACCEL_UACCESS
> >>  static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
> >>  {
> >> -	struct vhost_map __rcu *map;
> >> +	struct vhost_map *map;
> >>  	int i;
> >>  
> >>  	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> >> -		rcu_read_lock();
> >> -		map = rcu_dereference(vq->maps[i]);
> >> -		rcu_read_unlock();
> >> +		map = vq->maps[i];
> >>  		if (unlikely(!map))
> >>  			vhost_map_prefetch(vq, i);
> >>  	}
> >> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> >> index a9a2a93857d2..f9e9558a529d 100644
> >> --- a/drivers/vhost/vhost.h
> >> +++ b/drivers/vhost/vhost.h
> >> @@ -115,16 +115,17 @@ struct vhost_virtqueue {
> >>  #if VHOST_ARCH_CAN_ACCEL_UACCESS
> >>  	/* Read by memory accessors, modified by meta data
> >>  	 * prefetching, MMU notifier and vring ioctl().
> >> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
> >> -	 * and readers).
> >> +	 * Synchonrized through mmu_lock (writers) and ref counters,
> >> +	 * see vhost_vq_access_map_begin()/vhost_vq_access_map_end().
> >>  	 */
> >> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
> >> +	struct vhost_map *maps[VHOST_NUM_ADDRS];
> >>  	/* Read by MMU notifier, modified by vring ioctl(),
> >>  	 * synchronized through MMU notifier
> >>  	 * registering/unregistering.
> >>  	 */
> >>  	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
> >>  #endif
> >> +	int ref;
> >
> > Is it important that this is signed? If not I'd do unsigned here:
> > even though kernel does compile with 2s complement sign overflow,
> > it seems cleaner not to depend on that.
> 
> Not a must, let me fix.
> 
> Thanks
> 
> >
> >>  	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
> >>  
> >>  	struct file *kick;
> >> -- 
> >> 2.18.1
Jason Gunthorpe Aug. 4, 2019, 12:14 a.m. UTC | #16
On Sat, Aug 03, 2019 at 05:36:13PM -0400, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2019 at 02:24:18PM -0300, Jason Gunthorpe wrote:
> > On Fri, Aug 02, 2019 at 10:27:21AM -0400, Michael S. Tsirkin wrote:
> > > On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> > > > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > > > synchronize_rcu.
> > > > > 
> > > > > 
> > > > > I start with synchronize_rcu() but both you and Michael raise some
> > > > > concern.
> > > > 
> > > > I've also idly wondered if calling synchronize_rcu() under the various
> > > > mm locks is a deadlock situation.
> > > > 
> > > > > Then I try spinlock and mutex:
> > > > > 
> > > > > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > > > > improvement.
> > > > 
> > > > I think the topic here is correctness not performance improvement
> > > 
> > > The topic is whether we should revert
> > > commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
> > > 
> > > or keep it in. The only reason to keep it is performance.
> > 
> > Yikes, I'm not sure you can ever win against copy_from_user using
> > mmu_notifiers?
> 
> Ever since copy_from_user started playing with flags (for SMAP) and
> added speculation barriers there's a chance we can win by accessing
> memory through the kernel address.

You think copy_to_user will be more expensive than the minimum two
atomics required to synchronize with another thread?

> > Also, why can't this just permanently GUP the pages? In fact, where
> > does it put_page them anyhow? Worrying that 7f466 adds a get_user page
> > but does not add a put_page??

You didn't answer this.. Why not just use GUP?

Jason
Michael S. Tsirkin Aug. 4, 2019, 8:07 a.m. UTC | #17
On Sat, Aug 03, 2019 at 09:14:00PM -0300, Jason Gunthorpe wrote:
> On Sat, Aug 03, 2019 at 05:36:13PM -0400, Michael S. Tsirkin wrote:
> > On Fri, Aug 02, 2019 at 02:24:18PM -0300, Jason Gunthorpe wrote:
> > > On Fri, Aug 02, 2019 at 10:27:21AM -0400, Michael S. Tsirkin wrote:
> > > > On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> > > > > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > > > > synchronize_rcu.
> > > > > > 
> > > > > > 
> > > > > > I start with synchronize_rcu() but both you and Michael raise some
> > > > > > concern.
> > > > > 
> > > > > I've also idly wondered if calling synchronize_rcu() under the various
> > > > > mm locks is a deadlock situation.
> > > > > 
> > > > > > Then I try spinlock and mutex:
> > > > > > 
> > > > > > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > > > > > improvement.
> > > > > 
> > > > > I think the topic here is correctness not performance improvement
> > > > 
> > > > The topic is whether we should revert
> > > > commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
> > > > 
> > > > or keep it in. The only reason to keep it is performance.
> > > 
> > > Yikes, I'm not sure you can ever win against copy_from_user using
> > > mmu_notifiers?
> > 
> > Ever since copy_from_user started playing with flags (for SMAP) and
> > added speculation barriers there's a chance we can win by accessing
> > memory through the kernel address.
> 
> You think copy_to_user will be more expensive than the minimum two
> atomics required to synchronize with another thread?

I frankly don't know. With SMAP you flip flags twice, and with spectre
you flush the pipeline. Is that cheaper or more expensive than an atomic
operation? Testing is the only way to tell.

> > > Also, why can't this just permanently GUP the pages? In fact, where
> > > does it put_page them anyhow? Worrying that 7f466 adds a get_user page
> > > but does not add a put_page??
> 
> You didn't answer this.. Why not just use GUP?
> 
> Jason

Sorry, I misunderstood the question. Permanent GUP breaks lots of
functionality we need, such as THP and NUMA balancing.

release_pages() is used instead of put_page().
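
I.e. the teardown is a batched release along these lines (a sketch, not
the exact call site):

	/* Drop the whole pinned array in one go instead of a
	 * put_page() loop. */
	release_pages(map->pages, map->npages);
	kfree(map->pages);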
Jason Wang Aug. 5, 2019, 4:20 a.m. UTC | #18
On 2019/8/2 下午8:46, Jason Gunthorpe wrote:
> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>> This must be a proper barrier, like a spinlock, mutex, or
>>> synchronize_rcu.
>>
>> I start with synchronize_rcu() but both you and Michael raise some
>> concern.
> I've also idly wondered if calling synchronize_rcu() under the various
> mm locks is a deadlock situation.


Maybe. That's why I suggest using vhost_work_flush(), which is much more 
lightweight and can achieve the same function. It can guarantee that all 
previous work has been processed after vhost_work_flush() returns.
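
I.e. from invalidate_range_start(), instead of synchronize_rcu(), do
something along these lines (a sketch reusing the existing flush
machinery in vhost.c):

	struct vhost_flush_struct flush;

	/* Queue a nop work and wait for it: the worker runs works in
	 * order, so every work queued before this point (and hence every
	 * access it did to the old map) has finished when it completes. */
	init_completion(&flush.wait_event);
	vhost_work_init(&flush.work, vhost_flush_work);
	vhost_work_queue(vq->dev, &flush.work);
	wait_for_completion(&flush.wait_event);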


>
>> Then I try spinlock and mutex:
>>
>> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
>> improvement.
> I think the topic here is correctness not performance improvement


But the whole series is to speed up vhost.


>
>> 2) SRCU: full memory barrier requires on srcu_read_lock(), which still leads
>> little performance improvement
>   
>> 3) mutex: a possible issue is need to wait for the page to be swapped in (is
>> this unacceptable ?), another issue is that we need hold vq lock during
>> range overlap check.
> I have a feeling that mmu notififers cannot safely become dependent on
> progress of swap without causing deadlock. You probably should avoid
> this.


Yes, so that's why I tried to synchronize the critical region by myself.


>>> And, again, you can't re-invent a spinlock with open coding and get
>>> something better.
>> So the question is if waiting for swap is considered to be unsuitable for
>> MMU notifiers. If not, it would simplify codes. If not, we still need to
>> figure out a possible solution.
>>
>> Btw, I come up another idea, that is to disable preemption when vhost thread
>> need to access the memory. Then register preempt notifier and if vhost
>> thread is preempted, we're sure no one will access the memory and can do the
>> cleanup.
> I think you should use the spinlock so at least the code is obviously
> functionally correct and worry about designing some properly justified
> performance change after.
>
> Jason


A spinlock is correct but makes the whole series meaningless considering 
it won't bring any performance improvement.

Thanks
Jason Wang Aug. 5, 2019, 4:33 a.m. UTC | #19
On 2019/8/2 下午10:03, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>> Btw, I come up another idea, that is to disable preemption when vhost thread
>> need to access the memory. Then register preempt notifier and if vhost
>> thread is preempted, we're sure no one will access the memory and can do the
>> cleanup.
> Great, more notifiers :(
>
> Maybe can live with
> 1- disable preemption while using the cached pointer
> 2- teach vhost to recover from memory access failures,
>     by switching to regular from/to user path


I don't get this, I believe we want to recover from regular from/to user 
path, isn't it?


>
> So if you want to try that, fine since it's a step in
> the right direction.
>
> But I think fundamentally it's not what we want to do long term.


Yes.


>
> It's always been a fundamental problem with this patch series that only
> metadata is accessed through a direct pointer.
>
> The difference in ways you handle metadata and data is what is
> now coming and messing everything up.


I did propose something like this in the past: 
https://www.spinics.net/lists/linux-virtualization/msg36824.html. But it 
looks like you had some concerns about its locality.

But the problem is still there: GUP can fault pages in, so we still need 
to synchronize it with MMU notifiers. The solution might be something 
like moving GUP to a dedicated kind of vhost work.


>
> So if continuing the direct map approach,
> what is needed is a cache of mapped VM memory, then on a cache miss
> we'd queue work along the lines of 1-2 above.
>
> That's one direction to take. Another one is to give up on that and
> write our own version of uaccess macros.  Add a "high security" flag to
> the vhost module and if not active use these for userspace memory
> access.


Or using SET_BACKEND_FEATURES? But do you mean permanent GUP as I did in 
original RFC https://lkml.org/lkml/2018/12/13/218?

Thanks

>
>
Jason Wang Aug. 5, 2019, 4:36 a.m. UTC | #20
On 2019/8/2 下午10:27, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>> synchronize_rcu.
>>>
>>> I start with synchronize_rcu() but both you and Michael raise some
>>> concern.
>> I've also idly wondered if calling synchronize_rcu() under the various
>> mm locks is a deadlock situation.
>>
>>> Then I try spinlock and mutex:
>>>
>>> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
>>> improvement.
>> I think the topic here is correctness not performance improvement
> The topic is whether we should revert
> commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
>
> or keep it in. The only reason to keep it is performance.


Maybe it's time to introduce the config option?


>
> Now as long as all this code is disabled anyway, we can experiment a
> bit.
>
> I personally feel we would be best served by having two code paths:
>
> - Access to VM memory directly mapped into kernel
> - Access to userspace
>
>
> Having it all cleanly split will allow a bunch of optimizations, for
> example for years now we planned to be able to process an incoming short
> packet directly on softirq path, or an outgoing on directly within
> eventfd.


It's not hard considering we already have our own accessors. But the 
question is (as asked in another thread): do you want permanent GUP, or 
to still use MMU notifiers?

Thanks
Jason Wang Aug. 5, 2019, 4:39 a.m. UTC | #21
On 2019/8/4 下午4:07, Michael S. Tsirkin wrote:
> On Sat, Aug 03, 2019 at 09:14:00PM -0300, Jason Gunthorpe wrote:
>> On Sat, Aug 03, 2019 at 05:36:13PM -0400, Michael S. Tsirkin wrote:
>>> On Fri, Aug 02, 2019 at 02:24:18PM -0300, Jason Gunthorpe wrote:
>>>> On Fri, Aug 02, 2019 at 10:27:21AM -0400, Michael S. Tsirkin wrote:
>>>>> On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
>>>>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>>>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>>>>>> synchronize_rcu.
>>>>>>>
>>>>>>> I start with synchronize_rcu() but both you and Michael raise some
>>>>>>> concern.
>>>>>> I've also idly wondered if calling synchronize_rcu() under the various
>>>>>> mm locks is a deadlock situation.
>>>>>>
>>>>>>> Then I try spinlock and mutex:
>>>>>>>
>>>>>>> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
>>>>>>> improvement.
>>>>>> I think the topic here is correctness not performance improvement
>>>>> The topic is whether we should revert
>>>>> commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
>>>>>
>>>>> or keep it in. The only reason to keep it is performance.
>>>> Yikes, I'm not sure you can ever win against copy_from_user using
>>>> mmu_notifiers?
>>> Ever since copy_from_user started playing with flags (for SMAP) and
>>> added speculation barriers there's a chance we can win by accessing
>>> memory through the kernel address.
>> You think copy_to_user will be more expensive than the minimum two
>> atomics required to synchronize with another thread?
> I frankly don't know. With SMAP you flip flags twice, and with spectre
> you flush the pipeline. Is that cheaper or more expensive than an atomic
> operation? Testing is the only way to tell.


Let me test; I have only tested on a non-SMAP machine. Switching to a 
spinlock kills all the performance improvement.

Thanks


>
>>>> Also, why can't this just permanently GUP the pages? In fact, where
>>>> does it put_page them anyhow? Worrying that 7f466 adds a get_user page
>>>> but does not add a put_page??
>> You didn't answer this.. Why not just use GUP?
>>
>> Jason
> Sorry I misunderstood the question. Permanent GUP breaks lots of
> functionality we need such as THP and numa balancing.
>
> release_pages is used instead of put_page.
>
>
>
>
Jason Wang Aug. 5, 2019, 4:41 a.m. UTC | #22
On 2019/8/5 下午12:36, Jason Wang wrote:
>
> On 2019/8/2 下午10:27, Michael S. Tsirkin wrote:
>> On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>>> synchronize_rcu.
>>>>
>>>> I start with synchronize_rcu() but both you and Michael raise some
>>>> concern.
>>> I've also idly wondered if calling synchronize_rcu() under the various
>>> mm locks is a deadlock situation.
>>>
>>>> Then I try spinlock and mutex:
>>>>
>>>> 1) spinlock: add lots of overhead on datapath, this leads 0 
>>>> performance
>>>> improvement.
>>> I think the topic here is correctness not performance improvement
>> The topic is whether we should revert
>> commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual 
>> address")
>>
>> or keep it in. The only reason to keep it is performance.
>
>
> Maybe it's time to introduce the config option?


Or does it make sense if I post a V3 with:

- introduce a config option and disable the optimization by default

- switch from synchronize_rcu() to vhost_flush_work(), but keep the rest 
the same

This would give us some breathing room to decide which way to go for the 
next release.

Thanks


>
>
>>
>> Now as long as all this code is disabled anyway, we can experiment a
>> bit.
>>
>> I personally feel we would be best served by having two code paths:
>>
>> - Access to VM memory directly mapped into kernel
>> - Access to userspace
>>
>>
>> Having it all cleanly split will allow a bunch of optimizations, for
>> example for years now we planned to be able to process an incoming short
>> packet directly on softirq path, or an outgoing on directly within
>> eventfd.
>
>
> It's not hard consider we've already had our own accssors. But the 
> question is (as asked in another thread), do you want permanent GUP or 
> still use MMU notifiers.
>
> Thanks
>
Michael S. Tsirkin Aug. 5, 2019, 6:28 a.m. UTC | #23
On Mon, Aug 05, 2019 at 12:33:45PM +0800, Jason Wang wrote:
> 
> On 2019/8/2 下午10:03, Michael S. Tsirkin wrote:
> > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > Btw, I come up another idea, that is to disable preemption when vhost thread
> > > need to access the memory. Then register preempt notifier and if vhost
> > > thread is preempted, we're sure no one will access the memory and can do the
> > > cleanup.
> > Great, more notifiers :(
> > 
> > Maybe can live with
> > 1- disable preemption while using the cached pointer
> > 2- teach vhost to recover from memory access failures,
> >     by switching to regular from/to user path
> 
> 
> I don't get this, I believe we want to recover from regular from/to user
> path, isn't it?

That (disable copy to/from user completely) would be a nice to have
since it would reduce the attack surface of the driver, but e.g. your
code already doesn't do that.



> 
> > 
> > So if you want to try that, fine since it's a step in
> > the right direction.
> > 
> > But I think fundamentally it's not what we want to do long term.
> 
> 
> Yes.
> 
> 
> > 
> > It's always been a fundamental problem with this patch series that only
> > metadata is accessed through a direct pointer.
> > 
> > The difference in ways you handle metadata and data is what is
> > now coming and messing everything up.
> 
> 
> I do propose soemthing like this in the past:
> https://www.spinics.net/lists/linux-virtualization/msg36824.html. But looks
> like you have some concern about its locality.

Right, and it doesn't go away. You'll need to come up
with a test that messes it up and triggers a worst-case
scenario, so we can measure how bad that worst case is.

> But the problem still there, GUP can do page fault, so still need to
> synchronize it with MMU notifiers.

I think the idea was, if GUP would need a pagefault, don't
do a GUP and do to/from user instead. Hopefully that
will fault the page in and the next access will go through.
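
Very roughly, for a single access (in reality you'd cache the mapping;
the function name is made up, and __get_user_pages_fast() never sleeps
or faults):

static int vhost_read_avail_idx(struct vhost_virtqueue *vq,
				unsigned long uaddr, __virtio16 *val)
{
	struct page *page;

	if (__get_user_pages_fast(uaddr, 1, 0, &page) == 1) {
		/* resident: read through the direct mapping
		 * (assumes no highmem) */
		*val = *(__virtio16 *)(page_address(page) +
				       (uaddr & ~PAGE_MASK));
		put_page(page);
		return 0;
	}

	/* would need a fault: use uaccess instead, which also faults
	 * the page in so the next access can take the fast path */
	return get_user(*val, (__virtio16 __user *)uaddr) ? -EFAULT : 0;
}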

> The solution might be something like
> moving GUP to a dedicated kind of vhost work.

Right, generally GUP.

> 
> > 
> > So if continuing the direct map approach,
> > what is needed is a cache of mapped VM memory, then on a cache miss
> > we'd queue work along the lines of 1-2 above.
> > 
> > That's one direction to take. Another one is to give up on that and
> > write our own version of uaccess macros.  Add a "high security" flag to
> > the vhost module and if not active use these for userspace memory
> > access.
> 
> 
> Or using SET_BACKEND_FEATURES?

No, I don't think it's considered best practice to allow unprivileged
userspace control over whether the kernel enables security features.

> But do you mean permanent GUP as I did in
> original RFC https://lkml.org/lkml/2018/12/13/218?
> 
> Thanks

Permanent GUP breaks THP and NUMA.

> > 
> >
Michael S. Tsirkin Aug. 5, 2019, 6:30 a.m. UTC | #24
On Mon, Aug 05, 2019 at 12:36:40PM +0800, Jason Wang wrote:
> 
> On 2019/8/2 下午10:27, Michael S. Tsirkin wrote:
> > On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> > > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > > synchronize_rcu.
> > > > 
> > > > I start with synchronize_rcu() but both you and Michael raise some
> > > > concern.
> > > I've also idly wondered if calling synchronize_rcu() under the various
> > > mm locks is a deadlock situation.
> > > 
> > > > Then I try spinlock and mutex:
> > > > 
> > > > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > > > improvement.
> > > I think the topic here is correctness not performance improvement
> > The topic is whether we should revert
> > commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
> > 
> > or keep it in. The only reason to keep it is performance.
> 
> 
> Maybe it's time to introduce the config option?

Depending on CONFIG_BROKEN? I'm not sure it's a good idea.

> 
> > 
> > Now as long as all this code is disabled anyway, we can experiment a
> > bit.
> > 
> > I personally feel we would be best served by having two code paths:
> > 
> > - Access to VM memory directly mapped into kernel
> > - Access to userspace
> > 
> > 
> > Having it all cleanly split will allow a bunch of optimizations, for
> > example for years now we planned to be able to process an incoming short
> > packet directly on softirq path, or an outgoing on directly within
> > eventfd.
> 
> 
> It's not hard consider we've already had our own accssors. But the question
> is (as asked in another thread), do you want permanent GUP or still use MMU
> notifiers.
> 
> Thanks

We want THP and NUMA to work. Both are important for performance.
Michael S. Tsirkin Aug. 5, 2019, 6:40 a.m. UTC | #25
On Mon, Aug 05, 2019 at 12:41:45PM +0800, Jason Wang wrote:
> 
> On 2019/8/5 下午12:36, Jason Wang wrote:
> > 
> > On 2019/8/2 下午10:27, Michael S. Tsirkin wrote:
> > > On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
> > > > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > > > synchronize_rcu.
> > > > > 
> > > > > I start with synchronize_rcu() but both you and Michael raise some
> > > > > concern.
> > > > I've also idly wondered if calling synchronize_rcu() under the various
> > > > mm locks is a deadlock situation.
> > > > 
> > > > > Then I try spinlock and mutex:
> > > > > 
> > > > > 1) spinlock: add lots of overhead on datapath, this leads 0
> > > > > performance
> > > > > improvement.
> > > > I think the topic here is correctness not performance improvement
> > > The topic is whether we should revert
> > > commit 7f466032dc9 ("vhost: access vq metadata through kernel
> > > virtual address")
> > > 
> > > or keep it in. The only reason to keep it is performance.
> > 
> > 
> > Maybe it's time to introduce the config option?
> 
> 
> Or does it make sense if I post a V3 with:
> 
> - introduce config option and disable the optimization by default
> 
> - switch from synchronize_rcu() to vhost_flush_work(), but the rest are the
> same
> 
> This can give us some breath to decide which way should go for next release?
> 
> Thanks

As is, with preempt enabled?  Nope, I don't think blocking an invalidator
on swap IO is OK, so I don't believe this stuff is going into this
release at this point.

So it's more a question of whether it's better to revert and apply a clean
patch on top, or just keep the code around but disabled with an ifdef as is.
I'm open to both options, and would like your opinion on this.

> 
> > 
> > 
> > > 
> > > Now as long as all this code is disabled anyway, we can experiment a
> > > bit.
> > > 
> > > I personally feel we would be best served by having two code paths:
> > > 
> > > - Access to VM memory directly mapped into kernel
> > > - Access to userspace
> > > 
> > > 
> > > Having it all cleanly split will allow a bunch of optimizations, for
> > > example for years now we planned to be able to process an incoming short
> > > packet directly on softirq path, or an outgoing on directly within
> > > eventfd.
> > 
> > 
>> It's not hard considering we've already had our own accessors. But the
>> question is (as asked in another thread), do you want permanent GUP or
> > still use MMU notifiers.
> > 
> > Thanks
> > 
> > _______________________________________________
> > Virtualization mailing list
> > Virtualization@lists.linux-foundation.org
> > https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Jason Wang Aug. 5, 2019, 8:18 a.m. UTC | #26
On 2019/8/4 上午5:54, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2019 at 04:06:13AM -0400, Jason Wang wrote:
>> On 2019/8/1 上午2:29, Michael S. Tsirkin wrote:
>>> On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
>>>> We used to use RCU to synchronize MMU notifier with worker. This leads
>>>> calling synchronize_rcu() in invalidate_range_start(). But on a busy
>>>> system, there would be many factors that may slow down the
>>>> synchronize_rcu() which makes it unsuitable to be called in MMU
>>>> notifier.
>>>>
>>>> A solution is SRCU but its overhead is obvious with the expensive full
>>>> memory barrier. Another choice is to use seqlock, but it doesn't
>>>> provide a synchronization method between readers and writers. The last
>>>> choice is to use vq mutex, but it need to deal with the worst case
>>>> that MMU notifier must be blocked and wait for the finish of swap in.
>>>>
>>>> So this patch switches use a counter to track whether or not the map
>>>> was used. The counter was increased when vq try to start or finish
>>>> uses the map. This means, when it was even, we're sure there's no
>>>> readers and MMU notifier is synchronized. When it was odd, it means
>>>> there's a reader we need to wait it to be even again then we are
>>>> synchronized. To avoid full memory barrier, store_release +
>>>> load_acquire on the counter is used.
>>> Unfortunately this needs a lot of review and testing, so this can't make
>>> rc2, and I don't think this is the kind of patch I can merge after rc3.
>>> Subtle memory barrier tricks like this can introduce new bugs while they
>>> are fixing old ones.
>> I admit the patch is tricky. Some questions:
>>
>> - Do we must address the case of e.g swap in? If not, a simple
>>    vhost_work_flush() instead of synchronize_rcu() may work.
>> - Having some hard thought, I think we can use seqlock, it looks
>>    to me smp_wmb() is in write_segcount_begin() is sufficient, we don't
>>    care vq->map read before smp_wmb(), and for the other we all have
>>    good data devendency so smp_wmb() in the write_seqbegin_end() is
>>    sufficient.
> If we need an mb in the begin() we can switch to
> dependent_ptr_mb. if you need me to fix it up
> and repost, let me know.


Yes, but please let me figure out whether mb is really necessary here.


>
> Why isn't it a problem if the map is
> accessed outside the lock?


Correct me if I'm wrong. E.g. for vhost_put_avail_event():

	vhost_vq_access_map_begin(vq);

	map = vq->maps[VHOST_ADDR_USED];
	if (likely(map)) {
		used = map->addr;
		*((__virtio16 *)&used->ring[vq->num]) =
			cpu_to_vhost16(vq, vq->avail_idx);
		vhost_vq_access_map_end(vq);
		return 0;
	}

	vhost_vq_access_map_end(vq);


We don't care whether map is accessed before vhost_vq_access_map_begin()
since the MMU notifier can only change map from non-NULL to NULL. If we read
it too early, we will only get NULL and won't use the map at all. And the
smp_wmb() in vhost_vq_access_map_begin() can make sure the real access
to map->addr is done after we increase the counter.

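A minimal annotated sketch of this argument, assuming the seqcount variant
from the diff below; the function name is illustrative and the comments only
restate the reasoning in this mail, they are not part of any posted patch:

static inline int vhost_put_avail_event_sketch(struct vhost_virtqueue *vq)
{
	struct vhost_map *map;
	struct vring_used *used;

	/* Reading vq->maps[] before begin() would be harmless: the MMU
	 * notifier only changes a slot from non-NULL to NULL, so a too-early
	 * read can only return NULL and the map is not used at all.
	 */
	vhost_vq_access_map_begin(vq);	/* write_seqcount_begin(): seq++, then smp_wmb() */

	map = vq->maps[VHOST_ADDR_USED];
	if (likely(map)) {
		used = map->addr;
		/* Claim under discussion: the smp_wmb() in begin() orders the
		 * seq increment before this store into the ring, and the map
		 * pointer dereference is ordered by the data dependency, so
		 * an invalidator that observes an even seq knows nobody can
		 * still be using the old map.
		 */
		*((__virtio16 *)&used->ring[vq->num]) =
			cpu_to_vhost16(vq, vq->avail_idx);
		vhost_vq_access_map_end(vq);	/* write_seqcount_end(): smp_wmb(), then seq++ */
		return 0;
	}

	vhost_vq_access_map_end(vq);
	return -EFAULT;	/* hypothetical; the real code falls through to the uaccess path */
}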

>
>
>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index db2c81cb1e90..6d9501303258 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -363,39 +363,29 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>>   
>>   static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
>>   {
>> -	int ref = READ_ONCE(vq->ref);
>> -
>> -	smp_store_release(&vq->ref, ref + 1);
>> -	/* Make sure ref counter is visible before accessing the map */
>> -	smp_load_acquire(&vq->ref);
>> +	write_seqcount_begin(&vq->seq);
>>   }
>>   
>>   static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
>>   {
>> -	int ref = READ_ONCE(vq->ref);
>> -
>> -	/* Make sure vq access is done before increasing ref counter */
>> -	smp_store_release(&vq->ref, ref + 1);
>> +	write_seqcount_end(&vq->seq);
>>   }
>>   
>>   static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
>>   {
>> -	int ref;
>> +	unsigned int ret;
>>   
>>   	/* Make sure map change was done before checking ref counter */
>>   	smp_mb();
>> -
>> -	ref = READ_ONCE(vq->ref);
>> -	if (ref & 0x1) {
>> -		/* When ref change, we are sure no reader can see
>> +	ret = raw_read_seqcount(&vq->seq);
>> +	if (ret & 0x1) {
>> +		/* When seq changes, we are sure no reader can see
>>   		 * previous map */
>> -		while (READ_ONCE(vq->ref) == ref) {
>> -			set_current_state(TASK_RUNNING);
>> +		while (raw_read_seqcount(&vq->seq) == ret)
>>   			schedule();
>
> So why do we set state here?


No need, it's just an artifact of the previous patch.


> And should not we
> check need_sched?


We need to use need_resched().

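Put together with the two fixes above (drop set_current_state(), only
reschedule when need_resched() says so), the wait side of the seqcount
variant would look roughly like this; a sketch of the discussion, not a
posted patch:

static inline void vhost_vq_sync_access(struct vhost_virtqueue *vq)
{
	unsigned int ret;

	/* Make sure the map change is visible before sampling the seqcount */
	smp_mb();
	ret = raw_read_seqcount(&vq->seq);
	if (ret & 0x1) {
		/* Odd: a reader is inside its critical section; wait until it
		 * bumps the count back to an even value.
		 */
		while (raw_read_seqcount(&vq->seq) == ret) {
			if (need_resched())
				schedule();
		}
	}
	/* Make sure the seqcount was checked before any other operation that
	 * is done on the map.
	 */
	smp_mb();
}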

>
>
>> -		}
>>   	}
>> -	/* Make sure ref counter was checked before any other
>> -	 * operations that was dene on map. */
>> +	/* Make sure seq was checked before any other operations that
>> +	 * was dene on map. */
>>   	smp_mb();
>>   }
>>   
>> @@ -691,7 +681,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>>   		vq->indirect = NULL;
>>   		vq->heads = NULL;
>>   		vq->dev = dev;
>> -		vq->ref = 0;
>> +		seqcount_init(&vq->seq);
>>   		mutex_init(&vq->mutex);
>>   		spin_lock_init(&vq->mmu_lock);
>>   		vhost_vq_reset(dev, vq);
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 3d10da0ae511..1a705e181a84 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -125,7 +125,7 @@ struct vhost_virtqueue {
>>   	 */
>>   	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>>   #endif
>> -	int ref;
>> +	seqcount_t seq;
>>   	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>>   
>>   	struct file *kick;
>> -- 
>> 2.18.1
>>
>>>
>>>
>>>
>>>
>>>> Consider the read critical section is pretty small the synchronization
>>>> should be done very fast.
>>>>
>>>> Note the patch lead about 3% PPS dropping.
>>> Sorry what do you mean by this last sentence? This degrades performance
>>> compared to what?
>> Compare to without this patch.
> OK is the feature still a performance win? or should we drop it for now?


Still a win, just a drop from 23% improvement to 20% improvement.


>>>> Reported-by: Michael S. Tsirkin <mst@redhat.com>
>>>> Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>> ---
>>>>   drivers/vhost/vhost.c | 145 ++++++++++++++++++++++++++----------------
>>>>   drivers/vhost/vhost.h |   7 +-
>>>>   2 files changed, 94 insertions(+), 58 deletions(-)
>>>>
>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>> index cfc11f9ed9c9..db2c81cb1e90 100644
>>>> --- a/drivers/vhost/vhost.c
>>>> +++ b/drivers/vhost/vhost.c
>>>> @@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
>>>>   
>>>>   	spin_lock(&vq->mmu_lock);
>>>>   	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>>>> -		map[i] = rcu_dereference_protected(vq->maps[i],
>>>> -				  lockdep_is_held(&vq->mmu_lock));
>>>> +		map[i] = vq->maps[i];
>>>>   		if (map[i]) {
>>>>   			vhost_set_map_dirty(vq, map[i], i);
>>>> -			rcu_assign_pointer(vq->maps[i], NULL);
>>>> +			vq->maps[i] = NULL;
>>>>   		}
>>>>   	}
>>>>   	spin_unlock(&vq->mmu_lock);
>>>>   
>>>> -	/* No need for synchronize_rcu() or kfree_rcu() since we are
>>>> -	 * serialized with memory accessors (e.g vq mutex held).
>>>> +	/* No need for synchronization since we are serialized with
>>>> +	 * memory accessors (e.g vq mutex held).
>>>>   	 */
>>>>   
>>>>   	for (i = 0; i < VHOST_NUM_ADDRS; i++)
>>>> @@ -362,6 +361,44 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
>>>>   	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
>>>>   }
>>>>   
>>>> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
>>>> +{
>>>> +	int ref = READ_ONCE(vq->ref);
>>>> +
>>>> +	smp_store_release(&vq->ref, ref + 1);
>>>> +	/* Make sure ref counter is visible before accessing the map */
>>>> +	smp_load_acquire(&vq->ref);
>>> The map access is after this sequence, correct?
>> Yes.
>>
>>> Just going by the rules in Documentation/memory-barriers.txt,
>>> I think that this pair will not order following accesses with ref store.
>>>
>>> Documentation/memory-barriers.txt says:
>>>
>>>
>>> +     In addition, a RELEASE+ACQUIRE
>>> +     pair is -not- guaranteed to act as a full memory barrier.
>>>
>>>
>>>
>>> The guarantee that is made is this:
>>> 	after
>>>       an ACQUIRE on a given variable, all memory accesses preceding any prior
>>>       RELEASE on that same variable are guaranteed to be visible.
>> Yes, but it's not clear about the order of ACQUIRE the same location
>> of previous RELEASE. And it only has a example like:
>>
>> "
>> 	*A = a;
>> 	RELEASE M
>> 	ACQUIRE N
>> 	*B = b;
>>
>> could occur as:
>>
>> 	ACQUIRE N, STORE *B, STORE *A, RELEASE M
>> "
>>
>> But it doesn't explain what happen when
>>
>> *A = a
>> RELEASE M
>> ACQUIRE M
>> *B = b;
>>
>> And tools/memory-model/Documentation said
>>
>> "
>> First, when a lock-acquire reads from a lock-release, the LKMM
>> requires that every instruction po-before the lock-release must
>> execute before any instruction po-after the lock-acquire.
>> "
>>
>> Is this a hint that I was correct?
> I don't think it's correct since by this logic
> memory barriers can be nops on x86.


It's not a nop; instead, it turns into a write and then a read of the
same memory location.


>
>>>
>>> And if we also had the reverse rule we'd end up with a full barrier,
>>> won't we?
>>>
>>> Cc Paul in case I missed something here. And if I'm right,
>>> maybe we should call this out, adding
>>>
>>> 	"The opposite is not true: a prior RELEASE is not
>>> 	 guaranteed to be visible before memory accesses following
>>> 	 the subsequent ACQUIRE".
>> That kinds of violates the RELEASE?
>>
>> "
>>       This also acts as a one-way permeable barrier.  It guarantees that all
>>       memory operations before the RELEASE operation will appear to happen
>>       before the RELEASE operation with respect to the other components of the
>> "
>
> yes but we are talking about RELEASE itself versus stuff
> that comes after it.


Unless RELEASE and ACQUIRE on the same address can be reordered (which at
least doesn't happen on x86), the following ACQUIRE can make sure stuff
after the ACQUIRE is done after the RELEASE.
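Since tools/memory-model has already been brought up, the disputed pattern
can be written down as a litmus test. This is a reduced sketch (variable
names and the exists clause are illustrative, not from any posted patch);
whether the listed outcome is forbidden is exactly the open question here:

C vhost-ref-release-acquire

(*
 * P0 models the vhost worker: it makes the counter odd with a release
 * store, immediately acquire-loads it back, then reads the map pointer.
 * P1 models the MMU notifier: it clears the map, issues the full barrier
 * from the sync path, then checks the counter.  The exists clause is the
 * bad outcome: P0 still observes the old map (map==1) while P1 observes
 * an even counter (ref==0) and would conclude no reader is active.
 *)

{
	map=1;
}

P0(int *map, int *ref)
{
	int r0;
	int r1;

	smp_store_release(ref, 1);
	r0 = smp_load_acquire(ref);
	r1 = READ_ONCE(*map);
}

P1(int *map, int *ref)
{
	int r1;

	WRITE_ONCE(*map, 0);
	smp_mb();
	r1 = READ_ONCE(*ref);
}

exists (0:r1=1 /\ 1:r1=0)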


>
>>>
>>>
>>>> +}
>>>> +
>>>> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
>>>> +{
>>>> +	int ref = READ_ONCE(vq->ref);
>>>> +
>>>> +	/* Make sure vq access is done before increasing ref counter */
>>>> +	smp_store_release(&vq->ref, ref + 1);
>>>> +}
>>>> +
>>>> +static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
>>>> +{
>>>> +	int ref;
>>>> +
>>>> +	/* Make sure map change was done before checking ref counter */
>>>> +	smp_mb();
>>>> +
>>>> +	ref = READ_ONCE(vq->ref);
>>>> +	if (ref & 0x1) {
>>> Please document the even/odd trick here too, not just in the commit log.
>>>
>> Ok.
>>
>>>> +		/* When ref change,
>>> changes
>>>
>>>> we are sure no reader can see
>>>> +		 * previous map */
>>>> +		while (READ_ONCE(vq->ref) == ref) {
>>>
>>> what is the below line in aid of?
>>>
>>>> +			set_current_state(TASK_RUNNING);
> any answers here?


It's unnecessary.


>
>>>> +			schedule();
>>>                          if (need_resched())
>>>                                  schedule();
>>>
>>> ?
>> Yes, better.
>>
>>>> +		}
>>> On an interruptible kernel, there's a risk here is that
>>> a task got preempted with an odd ref.
>>> So I suspect we'll have to disable preemption when we
>>> make ref odd.
>> I'm not sure I get, if the odd is not the original value we read,
>> we're sure it won't read the new map here I believe.
> But we will spin for a very long time in this case.


Yes, but do we disable preemption in the MMU notifier callback? If not, it
should be the same as the MMU notifier being preempted for a long time.

We can disable preemption here, but it needs an extra cacheline.

Thanks


>
>>>
>>>> +	}
>>>> +	/* Make sure ref counter was checked before any other
>>>> +	 * operations that was dene on map. */
>>> was dene -> were done?
>>>
>> Yes.
>>
>>>> +	smp_mb();
>>>> +}
>>>> +
>>>>   static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>>>>   				      int index,
>>>>   				      unsigned long start,
>>>> @@ -376,16 +413,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
>>>>   	spin_lock(&vq->mmu_lock);
>>>>   	++vq->invalidate_count;
>>>>   
>>>> -	map = rcu_dereference_protected(vq->maps[index],
>>>> -					lockdep_is_held(&vq->mmu_lock));
>>>> +	map = vq->maps[index];
>>>>   	if (map) {
>>>>   		vhost_set_map_dirty(vq, map, index);
>>>> -		rcu_assign_pointer(vq->maps[index], NULL);
>>>> +		vq->maps[index] = NULL;
>>>>   	}
>>>>   	spin_unlock(&vq->mmu_lock);
>>>>   
>>>>   	if (map) {
>>>> -		synchronize_rcu();
>>>> +		vhost_vq_sync_access(vq);
>>>>   		vhost_map_unprefetch(map);
>>>>   	}
>>>>   }
>>>> @@ -457,7 +493,7 @@ static void vhost_init_maps(struct vhost_dev *dev)
>>>>   	for (i = 0; i < dev->nvqs; ++i) {
>>>>   		vq = dev->vqs[i];
>>>>   		for (j = 0; j < VHOST_NUM_ADDRS; j++)
>>>> -			RCU_INIT_POINTER(vq->maps[j], NULL);
>>>> +			vq->maps[j] = NULL;
>>>>   	}
>>>>   }
>>>>   #endif
>>>> @@ -655,6 +691,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>>>>   		vq->indirect = NULL;
>>>>   		vq->heads = NULL;
>>>>   		vq->dev = dev;
>>>> +		vq->ref = 0;
>>>>   		mutex_init(&vq->mutex);
>>>>   		spin_lock_init(&vq->mmu_lock);
>>>>   		vhost_vq_reset(dev, vq);
>>>> @@ -921,7 +958,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
>>>>   	map->npages = npages;
>>>>   	map->pages = pages;
>>>>   
>>>> -	rcu_assign_pointer(vq->maps[index], map);
>>>> +	vq->maps[index] = map;
>>>>   	/* No need for a synchronize_rcu(). This function should be
>>>>   	 * called by dev->worker so we are serialized with all
>>>>   	 * readers.
>>>> @@ -1216,18 +1253,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>>>>   	struct vring_used *used;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>>>> +		map = vq->maps[VHOST_ADDR_USED];
>>>>   		if (likely(map)) {
>>>>   			used = map->addr;
>>>>   			*((__virtio16 *)&used->ring[vq->num]) =
>>>>   				cpu_to_vhost16(vq, vq->avail_idx);
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1245,18 +1282,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>>>>   	size_t size;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>>>> +		map = vq->maps[VHOST_ADDR_USED];
>>>>   		if (likely(map)) {
>>>>   			used = map->addr;
>>>>   			size = count * sizeof(*head);
>>>>   			memcpy(used->ring + idx, head, size);
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1272,17 +1309,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>>>>   	struct vring_used *used;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>>>> +		map = vq->maps[VHOST_ADDR_USED];
>>>>   		if (likely(map)) {
>>>>   			used = map->addr;
>>>>   			used->flags = cpu_to_vhost16(vq, vq->used_flags);
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1298,17 +1335,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>>>>   	struct vring_used *used;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>>>> +		map = vq->maps[VHOST_ADDR_USED];
>>>>   		if (likely(map)) {
>>>>   			used = map->addr;
>>>>   			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1362,17 +1399,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>>>>   	struct vring_avail *avail;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>>>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>>>>   		if (likely(map)) {
>>>>   			avail = map->addr;
>>>>   			*idx = avail->idx;
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1387,17 +1424,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>>>>   	struct vring_avail *avail;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>>>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>>>>   		if (likely(map)) {
>>>>   			avail = map->addr;
>>>>   			*head = avail->ring[idx & (vq->num - 1)];
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1413,17 +1450,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>>>>   	struct vring_avail *avail;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>>>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>>>>   		if (likely(map)) {
>>>>   			avail = map->addr;
>>>>   			*flags = avail->flags;
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1438,15 +1475,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>>>>   	struct vring_avail *avail;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
>>>> +		vhost_vq_access_map_begin(vq);
>>>> +		map = vq->maps[VHOST_ADDR_AVAIL];
>>>>   		if (likely(map)) {
>>>>   			avail = map->addr;
>>>>   			*event = (__virtio16)avail->ring[vq->num];
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1461,17 +1498,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>>>>   	struct vring_used *used;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
>>>> +		map = vq->maps[VHOST_ADDR_USED];
>>>>   		if (likely(map)) {
>>>>   			used = map->addr;
>>>>   			*idx = used->idx;
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1486,17 +1523,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>>>>   	struct vring_desc *d;
>>>>   
>>>>   	if (!vq->iotlb) {
>>>> -		rcu_read_lock();
>>>> +		vhost_vq_access_map_begin(vq);
>>>>   
>>>> -		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
>>>> +		map = vq->maps[VHOST_ADDR_DESC];
>>>>   		if (likely(map)) {
>>>>   			d = map->addr;
>>>>   			*desc = *(d + idx);
>>>> -			rcu_read_unlock();
>>>> +			vhost_vq_access_map_end(vq);
>>>>   			return 0;
>>>>   		}
>>>>   
>>>> -		rcu_read_unlock();
>>>> +		vhost_vq_access_map_end(vq);
>>>>   	}
>>>>   #endif
>>>>   
>>>> @@ -1843,13 +1880,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
>>>>   #if VHOST_ARCH_CAN_ACCEL_UACCESS
>>>>   static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
>>>>   {
>>>> -	struct vhost_map __rcu *map;
>>>> +	struct vhost_map *map;
>>>>   	int i;
>>>>   
>>>>   	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
>>>> -		rcu_read_lock();
>>>> -		map = rcu_dereference(vq->maps[i]);
>>>> -		rcu_read_unlock();
>>>> +		map = vq->maps[i];
>>>>   		if (unlikely(!map))
>>>>   			vhost_map_prefetch(vq, i);
>>>>   	}
>>>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>>>> index a9a2a93857d2..f9e9558a529d 100644
>>>> --- a/drivers/vhost/vhost.h
>>>> +++ b/drivers/vhost/vhost.h
>>>> @@ -115,16 +115,17 @@ struct vhost_virtqueue {
>>>>   #if VHOST_ARCH_CAN_ACCEL_UACCESS
>>>>   	/* Read by memory accessors, modified by meta data
>>>>   	 * prefetching, MMU notifier and vring ioctl().
>>>> -	 * Synchonrized through mmu_lock (writers) and RCU (writers
>>>> -	 * and readers).
>>>> +	 * Synchonrized through mmu_lock (writers) and ref counters,
>>>> +	 * see vhost_vq_access_map_begin()/vhost_vq_access_map_end().
>>>>   	 */
>>>> -	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
>>>> +	struct vhost_map *maps[VHOST_NUM_ADDRS];
>>>>   	/* Read by MMU notifier, modified by vring ioctl(),
>>>>   	 * synchronized through MMU notifier
>>>>   	 * registering/unregistering.
>>>>   	 */
>>>>   	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
>>>>   #endif
>>>> +	int ref;
>>> Is it important that this is signed? If not I'd do unsigned here:
>>> even though kernel does compile with 2s complement sign overflow,
>>> it seems cleaner not to depend on that.
>> Not a must, let me fix.
>>
>> Thanks
>>
>>>>   	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
>>>>   
>>>>   	struct file *kick;
>>>> -- 
>>>> 2.18.1
Jason Wang Aug. 5, 2019, 8:21 a.m. UTC | #27
On 2019/8/5 下午2:28, Michael S. Tsirkin wrote:
> On Mon, Aug 05, 2019 at 12:33:45PM +0800, Jason Wang wrote:
>> On 2019/8/2 下午10:03, Michael S. Tsirkin wrote:
>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>> Btw, I come up another idea, that is to disable preemption when vhost thread
>>>> need to access the memory. Then register preempt notifier and if vhost
>>>> thread is preempted, we're sure no one will access the memory and can do the
>>>> cleanup.
>>> Great, more notifiers :(
>>>
>>> Maybe can live with
>>> 1- disable preemption while using the cached pointer
>>> 2- teach vhost to recover from memory access failures,
>>>      by switching to regular from/to user path
>>
>> I don't get this, I believe we want to recover from regular from/to user
>> path, isn't it?
> That (disable copy to/from user completely) would be a nice to have
> since it would reduce the attack surface of the driver, but e.g. your
> code already doesn't do that.
>

Yes, since it requires a lot of changes.

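A minimal sketch of points 1 and 2 above; the helper name and the -EAGAIN
convention are illustrative, not from any posted patch:

static inline int vhost_try_get_used_idx(struct vhost_virtqueue *vq,
					 __virtio16 *idx)
{
	struct vhost_map *map;
	struct vring_used *used;
	int ret = -EAGAIN;

	/* 1) Only touch the cached pointer with preemption disabled, so the
	 *    invalidation side can tell when the worker is off the CPU.
	 */
	preempt_disable();
	map = vq->maps[VHOST_ADDR_USED];
	if (likely(map)) {
		used = map->addr;
		*idx = used->idx;
		ret = 0;
	}
	preempt_enable();

	/* 2) On a miss the caller falls back to the regular uaccess path
	 *    (vhost_get_used()/copy_from_user()), which can fault the page in
	 *    so that a later fast-path access can succeed.
	 */
	return ret;
}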

>
>>> So if you want to try that, fine since it's a step in
>>> the right direction.
>>>
>>> But I think fundamentally it's not what we want to do long term.
>>
>> Yes.
>>
>>
>>> It's always been a fundamental problem with this patch series that only
>>> metadata is accessed through a direct pointer.
>>>
>>> The difference in ways you handle metadata and data is what is
>>> now coming and messing everything up.
>>
>> I did propose something like this in the past:
>> https://www.spinics.net/lists/linux-virtualization/msg36824.html. But it looks
>> like you have some concern about its locality.
> Right and it doesn't go away. You'll need to come up
> with a test that messes it up and triggers a worst-case
> scenario, so we can measure how bad is that worst-case.




>
>> But the problem still there, GUP can do page fault, so still need to
>> synchronize it with MMU notifiers.
> I think the idea was, if GUP would need a pagefault, don't
> do a GUP and do to/from user instead.


But this still needs to be synchronized with MMU notifiers (or use a
dedicated work for GUP).


>   Hopefully that
> will fault the page in and the next access will go through.
>
>> The solution might be something like
>> moving GUP to a dedicated kind of vhost work.
> Right, generally GUP.
>
>>> So if continuing the direct map approach,
>>> what is needed is a cache of mapped VM memory, then on a cache miss
>>> we'd queue work along the lines of 1-2 above.
>>>
>>> That's one direction to take. Another one is to give up on that and
>>> write our own version of uaccess macros.  Add a "high security" flag to
>>> the vhost module and if not active use these for userspace memory
>>> access.
>>
>> Or using SET_BACKEND_FEATURES?
> No, I don't think it's considered best practice to allow unpriveledged
> userspace control over whether kernel enables security features.


I get this.


>
>> But do you mean permanent GUP as I did in
>> original RFC https://lkml.org/lkml/2018/12/13/218?
>>
>> Thanks
> Permanent GUP breaks THP and NUMA.


Yes.

Thanks


>
>>>
Jason Wang Aug. 5, 2019, 8:22 a.m. UTC | #28
On 2019/8/5 下午2:30, Michael S. Tsirkin wrote:
> On Mon, Aug 05, 2019 at 12:36:40PM +0800, Jason Wang wrote:
>> On 2019/8/2 下午10:27, Michael S. Tsirkin wrote:
>>> On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
>>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>>>> synchronize_rcu.
>>>>> I start with synchronize_rcu() but both you and Michael raise some
>>>>> concern.
>>>> I've also idly wondered if calling synchronize_rcu() under the various
>>>> mm locks is a deadlock situation.
>>>>
>>>>> Then I try spinlock and mutex:
>>>>>
>>>>> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
>>>>> improvement.
>>>> I think the topic here is correctness not performance improvement
>>> The topic is whether we should revert
>>> commit 7f466032dc9 ("vhost: access vq metadata through kernel virtual address")
>>>
>>> or keep it in. The only reason to keep it is performance.
>>
>> Maybe it's time to introduce the config option?
> Depending on CONFIG_BROKEN? I'm not sure it's a good idea.


Ok.


>>> Now as long as all this code is disabled anyway, we can experiment a
>>> bit.
>>>
>>> I personally feel we would be best served by having two code paths:
>>>
>>> - Access to VM memory directly mapped into kernel
>>> - Access to userspace
>>>
>>>
>>> Having it all cleanly split will allow a bunch of optimizations, for
>>> example for years now we planned to be able to process an incoming short
>>> packet directly on softirq path, or an outgoing on directly within
>>> eventfd.
>>
>> It's not hard considering we've already had our own accessors. But the question
>> is (as asked in another thread), do you want permanent GUP or still use MMU
>> notifiers.
>>
>> Thanks
> We want THP and NUMA to work. Both are important for performance.
>

Yes.

Thanks
Jason Wang Aug. 5, 2019, 8:24 a.m. UTC | #29
On 2019/8/5 下午2:40, Michael S. Tsirkin wrote:
> On Mon, Aug 05, 2019 at 12:41:45PM +0800, Jason Wang wrote:
>> On 2019/8/5 下午12:36, Jason Wang wrote:
>>> On 2019/8/2 下午10:27, Michael S. Tsirkin wrote:
>>>> On Fri, Aug 02, 2019 at 09:46:13AM -0300, Jason Gunthorpe wrote:
>>>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>>>>> synchronize_rcu.
>>>>>> I start with synchronize_rcu() but both you and Michael raise some
>>>>>> concern.
>>>>> I've also idly wondered if calling synchronize_rcu() under the various
>>>>> mm locks is a deadlock situation.
>>>>>
>>>>>> Then I try spinlock and mutex:
>>>>>>
>>>>>> 1) spinlock: add lots of overhead on datapath, this leads 0
>>>>>> performance
>>>>>> improvement.
>>>>> I think the topic here is correctness not performance improvement
>>>> The topic is whether we should revert
>>>> commit 7f466032dc9 ("vhost: access vq metadata through kernel
>>>> virtual address")
>>>>
>>>> or keep it in. The only reason to keep it is performance.
>>>
>>> Maybe it's time to introduce the config option?
>>
>> Or does it make sense if I post a V3 with:
>>
>> - introduce config option and disable the optimization by default
>>
>> - switch from synchronize_rcu() to vhost_flush_work(), but the rest are the
>> same
>>
>> This can give us some breath to decide which way should go for next release?
>>
>> Thanks
> As is, with preempt enabled?  Nope I don't think blocking an invalidator
> on swap IO is ok, so I don't believe this stuff is going into this
> release at this point.
>
> So it's more a question of whether it's better to revert and apply a clean
> patch on top, or just keep the code around but disabled with an ifdef as is.
> I'm open to both options, and would like your opinion on this.


Then I prefer to leave the current code (with VHOST_ARCH_CAN_ACCEL set to 0)
as is. This can also save the effort of rebasing packed virtqueues.

Thanks


>
>>>
>>>> Now as long as all this code is disabled anyway, we can experiment a
>>>> bit.
>>>>
>>>> I personally feel we would be best served by having two code paths:
>>>>
>>>> - Access to VM memory directly mapped into kernel
>>>> - Access to userspace
>>>>
>>>>
>>>> Having it all cleanly split will allow a bunch of optimizations, for
>>>> example for years now we planned to be able to process an incoming short
>>>> packet directly on softirq path, or an outgoing on directly within
>>>> eventfd.
>>>
>>> It's not hard considering we've already had our own accessors. But the
>>> question is (as asked in another thread), do you want permanent GUP or
>>> still use MMU notifiers.
>>>
>>> Thanks
>>>
>>> _______________________________________________
>>> Virtualization mailing list
>>> Virtualization@lists.linux-foundation.org
>>> https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Jason Gunthorpe Aug. 6, 2019, 11:53 a.m. UTC | #30
On Sun, Aug 04, 2019 at 04:07:17AM -0400, Michael S. Tsirkin wrote:
> > > > Also, why can't this just permanently GUP the pages? In fact, where
> > > > does it put_page them anyhow? Worrying that 7f466 adds a get_user page
> > > > but does not add a put_page??
> > 
> > You didn't answer this.. Why not just use GUP?
> > 
> > Jason
> 
> Sorry I misunderstood the question. Permanent GUP breaks lots of
> functionality we need such as THP and numa balancing.

Really? It doesn't look like that many pages are involved..

Jason
Jason Gunthorpe Aug. 6, 2019, 12:04 p.m. UTC | #31
On Mon, Aug 05, 2019 at 12:20:45PM +0800, Jason Wang wrote:
> 
> On 2019/8/2 下午8:46, Jason Gunthorpe wrote:
> > On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
> > > > This must be a proper barrier, like a spinlock, mutex, or
> > > > synchronize_rcu.
> > > 
> > > I start with synchronize_rcu() but both you and Michael raise some
> > > concern.
> > I've also idly wondered if calling synchronize_rcu() under the various
> > mm locks is a deadlock situation.
> 
> 
> Maybe, that's why I suggest to use vhost_work_flush(), which is much more
> lightweight and can achieve the same function. It can guarantee all previous
> work has been processed after vhost_work_flush() returns.

If things are already running in a work, then yes, you can piggyback
on the existing spinlocks inside the workqueue and be OK.

However, if that work is doing any copy_from_user, then the flush
becomes dependent on swap and it won't work again...

> > > 1) spinlock: add lots of overhead on datapath, this leads 0 performance
> > > improvement.
> > I think the topic here is correctness not performance improvement
> 
> But the whole series is to speed up vhost.

So? Starting with a whole bunch of crazy, possibly broken, locking and
claiming a performance win is not reasonable.

> Spinlock is correct but make the whole series meaningless consider it won't
> bring any performance improvement.

You can't invent a faster spinlock by opencoding some wild
scheme. There is nothing special about the usage here, it needs a
blocking lock, plain and simple.

Jason
Michael S. Tsirkin Aug. 6, 2019, 1:36 p.m. UTC | #32
On Tue, Aug 06, 2019 at 08:53:17AM -0300, Jason Gunthorpe wrote:
> On Sun, Aug 04, 2019 at 04:07:17AM -0400, Michael S. Tsirkin wrote:
> > > > > Also, why can't this just permanently GUP the pages? In fact, where
> > > > > does it put_page them anyhow? Worrying that 7f466 adds a get_user page
> > > > > but does not add a put_page??
> > > 
> > > You didn't answer this.. Why not just use GUP?
> > > 
> > > Jason
> > 
> > Sorry I misunderstood the question. Permanent GUP breaks lots of
> > functionality we need such as THP and numa balancing.
> 
> Really? It doesn't look like that many pages are involved..
> 
> Jason

Yea. But they just might happen to be heavily accessed ones....
Jason Gunthorpe Aug. 6, 2019, 1:40 p.m. UTC | #33
On Tue, Aug 06, 2019 at 09:36:58AM -0400, Michael S. Tsirkin wrote:
> On Tue, Aug 06, 2019 at 08:53:17AM -0300, Jason Gunthorpe wrote:
> > On Sun, Aug 04, 2019 at 04:07:17AM -0400, Michael S. Tsirkin wrote:
> > > > > > Also, why can't this just permanently GUP the pages? In fact, where
> > > > > > does it put_page them anyhow? Worrying that 7f466 adds a get_user page
> > > > > > but does not add a put_page??
> > > > 
> > > > You didn't answer this.. Why not just use GUP?
> > > > 
> > > > Jason
> > > 
> > > Sorry I misunderstood the question. Permanent GUP breaks lots of
> > > functionality we need such as THP and numa balancing.
> > 
> > Really? It doesn't look like that many pages are involved..
> > 
> > Jason
> 
> Yea. But they just might happen to be heavily accessed ones....

Maybe you can solve the numa balance problem some other way and use
normal GUP..

Jason
Jason Wang Aug. 7, 2019, 6:49 a.m. UTC | #34
On 2019/8/6 下午8:04, Jason Gunthorpe wrote:
> On Mon, Aug 05, 2019 at 12:20:45PM +0800, Jason Wang wrote:
>> On 2019/8/2 下午8:46, Jason Gunthorpe wrote:
>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>>> synchronize_rcu.
>>>> I start with synchronize_rcu() but both you and Michael raise some
>>>> concern.
>>> I've also idly wondered if calling synchronize_rcu() under the various
>>> mm locks is a deadlock situation.
>>
>> Maybe, that's why I suggest to use vhost_work_flush(), which is much more
>> lightweight and can achieve the same function. It can guarantee all previous
>> work has been processed after vhost_work_flush() returns.
> If things are already running in a work, then yes, you can piggyback
> on the existing spinlocks inside the workqueue and be Ok
>
> However, if that work is doing any copy_from_user, then the flush
> becomes dependent on swap and it won't work again...


Yes, it does copy_from_user(), so we can't do this.

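For readers skimming the thread, the rejected flush-based idea would have
turned the tail of vhost_invalidate_vq_start() into roughly the following
(using vhost_poll_flush() as a stand-in for the vhost_work_flush() mentioned
above); the comment is the problem just conceded:

	if (map) {
		/* All readers of vq->maps[] run from the vhost worker, so
		 * flushing the worker guarantees previously queued work has
		 * finished with the old map.  But that work may be blocked in
		 * copy_from_user() waiting for swap-in, so the MMU notifier
		 * would again end up waiting on swap I/O, which is what this
		 * series tries to avoid.
		 */
		vhost_poll_flush(&vq->poll);
		vhost_map_unprefetch(map);
	}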

>
>>>> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
>>>> improvement.
>>> I think the topic here is correctness not performance improvement
>>
>> But the whole series is to speed up vhost.
> So? Starting with a whole bunch of crazy, possibly broken, locking and
> claiming a performance win is not reasonable.


Yes, I admit this patch is tricky, so I'm not going to push it. I will post
a V3.


>
>> Spinlock is correct but make the whole series meaningless consider it won't
>> bring any performance improvement.
> You can't invent a faster spinlock by opencoding some wild
> scheme. There is nothing special about the usage here, it needs a
> blocking lock, plain and simple.
>
> Jason


Will post V3. Let's see if you are happy with that version.

Thanks
diff mbox series

Patch

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index cfc11f9ed9c9..db2c81cb1e90 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -324,17 +324,16 @@  static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
 
 	spin_lock(&vq->mmu_lock);
 	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		map[i] = rcu_dereference_protected(vq->maps[i],
-				  lockdep_is_held(&vq->mmu_lock));
+		map[i] = vq->maps[i];
 		if (map[i]) {
 			vhost_set_map_dirty(vq, map[i], i);
-			rcu_assign_pointer(vq->maps[i], NULL);
+			vq->maps[i] = NULL;
 		}
 	}
 	spin_unlock(&vq->mmu_lock);
 
-	/* No need for synchronize_rcu() or kfree_rcu() since we are
-	 * serialized with memory accessors (e.g vq mutex held).
+	/* No need for synchronization since we are serialized with
+	 * memory accessors (e.g vq mutex held).
 	 */
 
 	for (i = 0; i < VHOST_NUM_ADDRS; i++)
@@ -362,6 +361,44 @@  static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
 	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
 }
 
+static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
+{
+	int ref = READ_ONCE(vq->ref);
+
+	smp_store_release(&vq->ref, ref + 1);
+	/* Make sure ref counter is visible before accessing the map */
+	smp_load_acquire(&vq->ref);
+}
+
+static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
+{
+	int ref = READ_ONCE(vq->ref);
+
+	/* Make sure vq access is done before increasing ref counter */
+	smp_store_release(&vq->ref, ref + 1);
+}
+
+static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
+{
+	int ref;
+
+	/* Make sure map change was done before checking ref counter */
+	smp_mb();
+
+	ref = READ_ONCE(vq->ref);
+	if (ref & 0x1) {
+		/* When ref change, we are sure no reader can see
+		 * previous map */
+		while (READ_ONCE(vq->ref) == ref) {
+			set_current_state(TASK_RUNNING);
+			schedule();
+		}
+	}
+	/* Make sure ref counter was checked before any other
+	 * operations that was dene on map. */
+	smp_mb();
+}
+
 static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 				      int index,
 				      unsigned long start,
@@ -376,16 +413,15 @@  static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 	spin_lock(&vq->mmu_lock);
 	++vq->invalidate_count;
 
-	map = rcu_dereference_protected(vq->maps[index],
-					lockdep_is_held(&vq->mmu_lock));
+	map = vq->maps[index];
 	if (map) {
 		vhost_set_map_dirty(vq, map, index);
-		rcu_assign_pointer(vq->maps[index], NULL);
+		vq->maps[index] = NULL;
 	}
 	spin_unlock(&vq->mmu_lock);
 
 	if (map) {
-		synchronize_rcu();
+		vhost_vq_sync_access(vq);
 		vhost_map_unprefetch(map);
 	}
 }
@@ -457,7 +493,7 @@  static void vhost_init_maps(struct vhost_dev *dev)
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
 		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			RCU_INIT_POINTER(vq->maps[j], NULL);
+			vq->maps[j] = NULL;
 	}
 }
 #endif
@@ -655,6 +691,7 @@  void vhost_dev_init(struct vhost_dev *dev,
 		vq->indirect = NULL;
 		vq->heads = NULL;
 		vq->dev = dev;
+		vq->ref = 0;
 		mutex_init(&vq->mutex);
 		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
@@ -921,7 +958,7 @@  static int vhost_map_prefetch(struct vhost_virtqueue *vq,
 	map->npages = npages;
 	map->pages = pages;
 
-	rcu_assign_pointer(vq->maps[index], map);
+	vq->maps[index] = map;
 	/* No need for a synchronize_rcu(). This function should be
 	 * called by dev->worker so we are serialized with all
 	 * readers.
@@ -1216,18 +1253,18 @@  static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			*((__virtio16 *)&used->ring[vq->num]) =
 				cpu_to_vhost16(vq, vq->avail_idx);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1245,18 +1282,18 @@  static inline int vhost_put_used(struct vhost_virtqueue *vq,
 	size_t size;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			size = count * sizeof(*head);
 			memcpy(used->ring + idx, head, size);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1272,17 +1309,17 @@  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			used->flags = cpu_to_vhost16(vq, vq->used_flags);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1298,17 +1335,17 @@  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1362,17 +1399,17 @@  static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*idx = avail->idx;
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1387,17 +1424,17 @@  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*head = avail->ring[idx & (vq->num - 1)];
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1413,17 +1450,17 @@  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*flags = avail->flags;
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1438,15 +1475,15 @@  static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		vhost_vq_access_map_begin(vq);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*event = (__virtio16)avail->ring[vq->num];
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1461,17 +1498,17 @@  static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			*idx = used->idx;
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1486,17 +1523,17 @@  static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 	struct vring_desc *d;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
+		map = vq->maps[VHOST_ADDR_DESC];
 		if (likely(map)) {
 			d = map->addr;
 			*desc = *(d + idx);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1843,13 +1880,11 @@  static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
 static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
 {
-	struct vhost_map __rcu *map;
+	struct vhost_map *map;
 	int i;
 
 	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[i]);
-		rcu_read_unlock();
+		map = vq->maps[i];
 		if (unlikely(!map))
 			vhost_map_prefetch(vq, i);
 	}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a9a2a93857d2..f9e9558a529d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -115,16 +115,17 @@  struct vhost_virtqueue {
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
 	/* Read by memory accessors, modified by meta data
 	 * prefetching, MMU notifier and vring ioctl().
-	 * Synchonrized through mmu_lock (writers) and RCU (writers
-	 * and readers).
+	 * Synchonrized through mmu_lock (writers) and ref counters,
+	 * see vhost_vq_access_map_begin()/vhost_vq_access_map_end().
 	 */
-	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
+	struct vhost_map *maps[VHOST_NUM_ADDRS];
 	/* Read by MMU notifier, modified by vring ioctl(),
 	 * synchronized through MMU notifier
 	 * registering/unregistering.
 	 */
 	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
 #endif
+	int ref;
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 
 	struct file *kick;