Message ID: 1506744354-20979-4-git-send-email-wei.w.wang@intel.com (mailing list archive)
State: New, archived
Looks good to me. minor comments below. On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: > @@ -141,13 +146,128 @@ static void set_page_pfns(struct virtio_balloon *vb, > page_to_balloon_pfn(page) + i); > } > > + > +static void kick_and_wait(struct virtqueue *vq, wait_queue_head_t wq_head) > +{ > + unsigned int len; > + > + virtqueue_kick(vq); > + wait_event(wq_head, virtqueue_get_buf(vq, &len)); > +} > + > +static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size) > +{ > + struct scatterlist sg; > + unsigned int len; > + > + sg_init_one(&sg, addr, size); > + > + /* Detach all the used buffers from the vq */ > + while (virtqueue_get_buf(vq, &len)) > + ; > + > + return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL); > +} > + > +static int send_balloon_page_sg(struct virtio_balloon *vb, > + struct virtqueue *vq, > + void *addr, > + uint32_t size, > + bool batch) > +{ > + int err; > + > + err = add_one_sg(vq, addr, size); > + > + /* If batchng is requested, we batch till the vq is full */ typo > + if (!batch || !vq->num_free) > + kick_and_wait(vq, vb->acked); > + > + return err; > +} If add_one_sg fails, kick_and_wait will hang forever. The reason this might work in because 1. with 1 sg there are no memory allocations 2. if adding fails on vq full, then something is in queue and will wake up kick_and_wait. So in short this is expected to never fail. How about a BUG_ON here then? And make it void, and add a comment with above explanation. > + > +/* > + * Send balloon pages in sgs to host. The balloon pages are recorded in the > + * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE. > + * The page xbitmap is searched for continuous "1" bits, which correspond > + * to continuous pages, to chunk into sgs. > + * > + * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that > + * need to be searched. > + */ > +static void tell_host_sgs(struct virtio_balloon *vb, > + struct virtqueue *vq, > + unsigned long page_xb_start, > + unsigned long page_xb_end) > +{ > + unsigned long sg_pfn_start, sg_pfn_end; > + void *sg_addr; > + uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE); > + int err = 0; > + > + sg_pfn_start = page_xb_start; > + while (sg_pfn_start < page_xb_end) { > + sg_pfn_start = xb_find_next_set_bit(&vb->page_xb, sg_pfn_start, > + page_xb_end); > + if (sg_pfn_start == page_xb_end + 1) > + break; > + sg_pfn_end = xb_find_next_zero_bit(&vb->page_xb, > + sg_pfn_start + 1, > + page_xb_end); > + sg_addr = (void *)pfn_to_kaddr(sg_pfn_start); > + sg_len = (sg_pfn_end - sg_pfn_start) << PAGE_SHIFT; > + while (sg_len > sg_max_len) { > + err = send_balloon_page_sg(vb, vq, sg_addr, sg_max_len, > + true); > + if (unlikely(err < 0)) > + goto err_out; > + sg_addr += sg_max_len; > + sg_len -= sg_max_len; > + } > + err = send_balloon_page_sg(vb, vq, sg_addr, sg_len, true); > + if (unlikely(err < 0)) > + goto err_out; > + sg_pfn_start = sg_pfn_end + 1; > + } > + > + /* > + * The last few sgs may not reach the batch size, but need a kick to > + * notify the device to handle them. > + */ > + if (vq->num_free != virtqueue_get_vring_size(vq)) > + kick_and_wait(vq, vb->acked); > + > + xb_clear_bit_range(&vb->page_xb, page_xb_start, page_xb_end); > + return; > + > +err_out: > + dev_warn(&vb->vdev->dev, "%s failure: %d\n", __func__, err); so fundamentally just make send_balloon_page_sg void then. 
> +} > + > +static inline void xb_set_page(struct virtio_balloon *vb, > + struct page *page, > + unsigned long *pfn_min, > + unsigned long *pfn_max) > +{ > + unsigned long pfn = page_to_pfn(page); > + > + *pfn_min = min(pfn, *pfn_min); > + *pfn_max = max(pfn, *pfn_max); > + xb_preload(GFP_KERNEL); > + xb_set_bit(&vb->page_xb, pfn); > + xb_preload_end(); > +} > + > static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > { > struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > unsigned num_allocated_pages; > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > > /* We can only do one array worth at a time. */ > - num = min(num, ARRAY_SIZE(vb->pfns)); > + if (!use_sg) > + num = min(num, ARRAY_SIZE(vb->pfns)); > > mutex_lock(&vb->balloon_lock); > for (vb->num_pfns = 0; vb->num_pfns < num; > @@ -162,7 +282,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > msleep(200); > break; > } > - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + > + if (use_sg) > + xb_set_page(vb, page, &pfn_min, &pfn_max); > + else > + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + > vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; > if (!virtio_has_feature(vb->vdev, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) > @@ -171,8 +296,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) > > num_allocated_pages = vb->num_pfns; > /* Did we get any? */ > - if (vb->num_pfns != 0) > - tell_host(vb, vb->inflate_vq); > + if (vb->num_pfns) { > + if (use_sg) > + tell_host_sgs(vb, vb->inflate_vq, pfn_min, pfn_max); > + else > + tell_host(vb, vb->inflate_vq); > + } > mutex_unlock(&vb->balloon_lock); > > return num_allocated_pages; > @@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > struct page *page; > struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > LIST_HEAD(pages); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > > - /* We can only do one array worth at a time. */ > - num = min(num, ARRAY_SIZE(vb->pfns)); > + /* Traditionally, we can only do one array worth at a time. */ > + if (!use_sg) > + num = min(num, ARRAY_SIZE(vb->pfns)); > > mutex_lock(&vb->balloon_lock); > /* We can't release more pages than taken */ > @@ -210,7 +342,11 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > page = balloon_page_dequeue(vb_dev_info); > if (!page) > break; > - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + if (use_sg) > + xb_set_page(vb, page, &pfn_min, &pfn_max); > + else > + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); > + > list_add(&page->lru, &pages); > vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; > } > @@ -221,8 +357,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); > * is true, we *have* to do it in this order > */ > - if (vb->num_pfns != 0) > - tell_host(vb, vb->deflate_vq); > + if (vb->num_pfns) { > + if (use_sg) > + tell_host_sgs(vb, vb->deflate_vq, pfn_min, pfn_max); > + else > + tell_host(vb, vb->deflate_vq); > + } > release_pages_balloon(vb, &pages); > mutex_unlock(&vb->balloon_lock); > return num_freed_pages; > @@ -441,6 +581,7 @@ static int init_vqs(struct virtio_balloon *vb) > } > > #ifdef CONFIG_BALLOON_COMPACTION > + > /* > * virtballoon_migratepage - perform the balloon page migration on behalf of > * a compation thread. 
(called under page lock) > @@ -464,6 +605,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, > { > struct virtio_balloon *vb = container_of(vb_dev_info, > struct virtio_balloon, vb_dev_info); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > unsigned long flags; > > /* > @@ -485,16 +627,24 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, > vb_dev_info->isolated_pages--; > __count_vm_event(BALLOON_MIGRATE); > spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); > - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > - set_page_pfns(vb, vb->pfns, newpage); > - tell_host(vb, vb->inflate_vq); > - > + if (use_sg) { > + send_balloon_page_sg(vb, vb->inflate_vq, page_address(newpage), > + PAGE_SIZE, false); > + } else { > + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > + set_page_pfns(vb, vb->pfns, newpage); > + tell_host(vb, vb->inflate_vq); > + } > /* balloon's page migration 2nd step -- deflate "page" */ > balloon_page_delete(page); > - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > - set_page_pfns(vb, vb->pfns, page); > - tell_host(vb, vb->deflate_vq); > - > + if (use_sg) { > + send_balloon_page_sg(vb, vb->deflate_vq, page_address(page), > + PAGE_SIZE, false); > + } else { > + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; > + set_page_pfns(vb, vb->pfns, page); > + tell_host(vb, vb->deflate_vq); > + } > mutex_unlock(&vb->balloon_lock); > > put_page(page); /* balloon reference */ > @@ -553,6 +703,9 @@ static int virtballoon_probe(struct virtio_device *vdev) > if (err) > goto out_free_vb; > > + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG)) > + xb_init(&vb->page_xb); > + > vb->nb.notifier_call = virtballoon_oom_notify; > vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; > err = register_oom_notifier(&vb->nb); > @@ -669,6 +822,7 @@ static unsigned int features[] = { > VIRTIO_BALLOON_F_MUST_TELL_HOST, > VIRTIO_BALLOON_F_STATS_VQ, > VIRTIO_BALLOON_F_DEFLATE_ON_OOM, > + VIRTIO_BALLOON_F_SG, > }; > > static struct virtio_driver virtio_balloon_driver = { > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h > index 343d7dd..37780a7 100644 > --- a/include/uapi/linux/virtio_balloon.h > +++ b/include/uapi/linux/virtio_balloon.h > @@ -34,6 +34,7 @@ > #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ > #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ > #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ > +#define VIRTIO_BALLOON_F_SG 3 /* Use sg instead of PFN lists */ > > /* Size of a PFN in the balloon interface. */ > #define VIRTIO_BALLOON_PFN_SHIFT 12 > -- > 2.7.4
On Monday, October 2, 2017 12:30 PM, Michael S. Tsirkin wrote: > On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: > > +static int send_balloon_page_sg(struct virtio_balloon *vb, > > + struct virtqueue *vq, > > + void *addr, > > + uint32_t size, > > + bool batch) > > +{ > > + int err; > > + > > + err = add_one_sg(vq, addr, size); > > + > > + /* If batchng is requested, we batch till the vq is full */ > > typo > > > + if (!batch || !vq->num_free) > > + kick_and_wait(vq, vb->acked); > > + > > + return err; > > +} > > If add_one_sg fails, kick_and_wait will hang forever. > > The reason this might work in because > 1. with 1 sg there are no memory allocations 2. if adding fails on vq full, then > something > is in queue and will wake up kick_and_wait. > > So in short this is expected to never fail. > How about a BUG_ON here then? > And make it void, and add a comment with above explanation. > Yes, agree that this wouldn't fail - the worker thread performing the ballooning operations has been put into sleep when the vq is full, so I think there shouldn't be anyone else to put more sgs onto the vq then. Btw, not sure if we need to mention memory allocation in the comment, I found virtqueue_add() doesn't return any error when allocation (for indirect desc-s) fails - it simply avoids the use of indirect desc. What do you think of the following? err = add_one_sg(vq, addr, size); /* * This is expected to never fail: there is always at least 1 entry available on the vq, * because when the vq is full the worker thread that adds the sg will be put into * sleep until at least 1 entry is available to use. */ BUG_ON(err); Best, Wei
On Mon, Oct 02, 2017 at 12:39:30PM +0000, Wang, Wei W wrote: > On Monday, October 2, 2017 12:30 PM, Michael S. Tsirkin wrote: > > On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: > > > +static int send_balloon_page_sg(struct virtio_balloon *vb, > > > + struct virtqueue *vq, > > > + void *addr, > > > + uint32_t size, > > > + bool batch) > > > +{ > > > + int err; > > > + > > > + err = add_one_sg(vq, addr, size); > > > + > > > + /* If batchng is requested, we batch till the vq is full */ > > > > typo > > > > > + if (!batch || !vq->num_free) > > > + kick_and_wait(vq, vb->acked); > > > + > > > + return err; > > > +} > > > > If add_one_sg fails, kick_and_wait will hang forever. > > > > The reason this might work in because > > 1. with 1 sg there are no memory allocations 2. if adding fails on vq full, then > > something > > is in queue and will wake up kick_and_wait. > > > > So in short this is expected to never fail. > > How about a BUG_ON here then? > > And make it void, and add a comment with above explanation. > > > > > Yes, agree that this wouldn't fail - the worker thread performing the ballooning operations has been put into sleep when the vq is full, so I think there shouldn't be anyone else to put more sgs onto the vq then. > Btw, not sure if we need to mention memory allocation in the comment, I found virtqueue_add() doesn't return any error when allocation (for indirect desc-s) fails - it simply avoids the use of indirect desc. > > What do you think of the following? > > err = add_one_sg(vq, addr, size); > /* > * This is expected to never fail: there is always at least 1 entry available on the vq, > * because when the vq is full the worker thread that adds the sg will be put into > * sleep until at least 1 entry is available to use. > */ > BUG_ON(err); > > Best, > Wei > > > > Sounds good.
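Folding both points together gives roughly the following. This is only a sketch assembled from the code and comment quoted above; the exact comment wording and placement are the patch author's call.

static void send_balloon_page_sg(struct virtio_balloon *vb,
				 struct virtqueue *vq,
				 void *addr,
				 uint32_t size,
				 bool batch)
{
	int err;

	err = add_one_sg(vq, addr, size);

	/*
	 * This is expected to never fail: there is always at least 1 entry
	 * available on the vq, because when the vq is full the worker thread
	 * that adds the sg is put to sleep until at least 1 entry is freed.
	 */
	BUG_ON(err);

	/* If batching is requested, we batch till the vq is full */
	if (!batch || !vq->num_free)
		kick_and_wait(vq, vb->acked);
}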
On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: > +static inline void xb_set_page(struct virtio_balloon *vb, > + struct page *page, > + unsigned long *pfn_min, > + unsigned long *pfn_max) > +{ > + unsigned long pfn = page_to_pfn(page); > + > + *pfn_min = min(pfn, *pfn_min); > + *pfn_max = max(pfn, *pfn_max); > + xb_preload(GFP_KERNEL); > + xb_set_bit(&vb->page_xb, pfn); > + xb_preload_end(); > +} > + So, this will allocate memory ... > @@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > struct page *page; > struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > LIST_HEAD(pages); > + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > > - /* We can only do one array worth at a time. */ > - num = min(num, ARRAY_SIZE(vb->pfns)); > + /* Traditionally, we can only do one array worth at a time. */ > + if (!use_sg) > + num = min(num, ARRAY_SIZE(vb->pfns)); > > mutex_lock(&vb->balloon_lock); > /* We can't release more pages than taken */ And is sometimes called on OOM. I suspect we need to 1. keep around some memory for leak on oom 2. for non oom allocate outside locks
On 10/09/2017 11:20 PM, Michael S. Tsirkin wrote: > On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: >> +static inline void xb_set_page(struct virtio_balloon *vb, >> + struct page *page, >> + unsigned long *pfn_min, >> + unsigned long *pfn_max) >> +{ >> + unsigned long pfn = page_to_pfn(page); >> + >> + *pfn_min = min(pfn, *pfn_min); >> + *pfn_max = max(pfn, *pfn_max); >> + xb_preload(GFP_KERNEL); >> + xb_set_bit(&vb->page_xb, pfn); >> + xb_preload_end(); >> +} >> + > So, this will allocate memory > > ... > >> @@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) >> struct page *page; >> struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; >> LIST_HEAD(pages); >> + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); >> + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; >> >> - /* We can only do one array worth at a time. */ >> - num = min(num, ARRAY_SIZE(vb->pfns)); >> + /* Traditionally, we can only do one array worth at a time. */ >> + if (!use_sg) >> + num = min(num, ARRAY_SIZE(vb->pfns)); >> >> mutex_lock(&vb->balloon_lock); >> /* We can't release more pages than taken */ > And is sometimes called on OOM. > > > I suspect we need to > > 1. keep around some memory for leak on oom > > 2. for non oom allocate outside locks > > I think maybe we can optimize the existing balloon logic, which could remove the big balloon lock: It would not be necessary to have the inflating and deflating run at the same time. For example, 1st request to inflate 7G RAM, when 1GB has been given to the host (so 6G left), the 2nd request to deflate 5G is received. Instead of waiting for the 1st request to inflate 6G and then continuing with the 2nd request to deflate 5G, we can do a diff (6G to inflate - 5G to deflate) immediately, and got 1G to inflate. In this way, all that driver will do is to simply inflate another 1G. Same for the OOM case: when OOM asks for 1G, while inflating 5G is in progress, then the driver can deduct 1G from the amount that needs to inflate, and as a result, it will inflate 4G. In this case, we will never have the inflating and deflating task run at the same time, so I think it is possible to remove the lock, and therefore, we will not have that deadlock issue. What would you guys think? Best, Wei
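As a purely illustrative sketch of that idea (none of these names or fields exist in the driver today), inflate and deflate requests could feed a single signed target that one worker consumes, so the two directions never run concurrently:

struct balloon_target {
	spinlock_t lock;
	s64 pages_to_move;	/* > 0: pages left to inflate, < 0: to deflate */
};

static void balloon_adjust_target(struct balloon_target *t, s64 delta)
{
	spin_lock(&t->lock);
	/* e.g. 6G still to inflate, then a 5G deflate request => 1G to inflate */
	t->pages_to_move += delta;
	spin_unlock(&t->lock);
	/* a single worker later inflates or deflates |pages_to_move| pages */
}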
Wei Wang wrote: > On 10/09/2017 11:20 PM, Michael S. Tsirkin wrote: > > On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: > >> +static inline void xb_set_page(struct virtio_balloon *vb, > >> + struct page *page, > >> + unsigned long *pfn_min, > >> + unsigned long *pfn_max) > >> +{ > >> + unsigned long pfn = page_to_pfn(page); > >> + > >> + *pfn_min = min(pfn, *pfn_min); > >> + *pfn_max = max(pfn, *pfn_max); > >> + xb_preload(GFP_KERNEL); > >> + xb_set_bit(&vb->page_xb, pfn); > >> + xb_preload_end(); > >> +} > >> + > > So, this will allocate memory > > > > ... > > > >> @@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) > >> struct page *page; > >> struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; > >> LIST_HEAD(pages); > >> + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); > >> + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; > >> > >> - /* We can only do one array worth at a time. */ > >> - num = min(num, ARRAY_SIZE(vb->pfns)); > >> + /* Traditionally, we can only do one array worth at a time. */ > >> + if (!use_sg) > >> + num = min(num, ARRAY_SIZE(vb->pfns)); > >> > >> mutex_lock(&vb->balloon_lock); > >> /* We can't release more pages than taken */ > > And is sometimes called on OOM. > > > > > > I suspect we need to > > > > 1. keep around some memory for leak on oom > > > > 2. for non oom allocate outside locks > > > > > > I think maybe we can optimize the existing balloon logic, which could > remove the big balloon lock: > > It would not be necessary to have the inflating and deflating run at the > same time. > For example, 1st request to inflate 7G RAM, when 1GB has been given to > the host (so 6G left), the > 2nd request to deflate 5G is received. Instead of waiting for the 1st > request to inflate 6G and then > continuing with the 2nd request to deflate 5G, we can do a diff (6G to > inflate - 5G to deflate) immediately, > and got 1G to inflate. In this way, all that driver will do is to simply > inflate another 1G. > > Same for the OOM case: when OOM asks for 1G, while inflating 5G is in > progress, then the driver can > deduct 1G from the amount that needs to inflate, and as a result, it > will inflate 4G. > > In this case, we will never have the inflating and deflating task run at > the same time, so I think it is > possible to remove the lock, and therefore, we will not have that > deadlock issue. > > What would you guys think? What is balloon_lock at virtballoon_migratepage() for? e22504296d4f64fb "virtio_balloon: introduce migration primitives to balloon pages" f68b992bbb474641 "virtio_balloon: fix race by fill and leak" And even if we could remove balloon_lock, you still cannot use __GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use "whether it is safe to wait" flag from "[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" .
On 10/10/2017 07:08 PM, Tetsuo Handa wrote: > Wei Wang wrote: >> On 10/09/2017 11:20 PM, Michael S. Tsirkin wrote: >>> On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote: >>>> +static inline void xb_set_page(struct virtio_balloon *vb, >>>> + struct page *page, >>>> + unsigned long *pfn_min, >>>> + unsigned long *pfn_max) >>>> +{ >>>> + unsigned long pfn = page_to_pfn(page); >>>> + >>>> + *pfn_min = min(pfn, *pfn_min); >>>> + *pfn_max = max(pfn, *pfn_max); >>>> + xb_preload(GFP_KERNEL); >>>> + xb_set_bit(&vb->page_xb, pfn); >>>> + xb_preload_end(); >>>> +} >>>> + >>> So, this will allocate memory >>> >>> ... >>> >>>> @@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) >>>> struct page *page; >>>> struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; >>>> LIST_HEAD(pages); >>>> + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); >>>> + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; >>>> >>>> - /* We can only do one array worth at a time. */ >>>> - num = min(num, ARRAY_SIZE(vb->pfns)); >>>> + /* Traditionally, we can only do one array worth at a time. */ >>>> + if (!use_sg) >>>> + num = min(num, ARRAY_SIZE(vb->pfns)); >>>> >>>> mutex_lock(&vb->balloon_lock); >>>> /* We can't release more pages than taken */ >>> And is sometimes called on OOM. >>> >>> >>> I suspect we need to >>> >>> 1. keep around some memory for leak on oom >>> >>> 2. for non oom allocate outside locks >>> >>> >> I think maybe we can optimize the existing balloon logic, which could >> remove the big balloon lock: >> >> It would not be necessary to have the inflating and deflating run at the >> same time. >> For example, 1st request to inflate 7G RAM, when 1GB has been given to >> the host (so 6G left), the >> 2nd request to deflate 5G is received. Instead of waiting for the 1st >> request to inflate 6G and then >> continuing with the 2nd request to deflate 5G, we can do a diff (6G to >> inflate - 5G to deflate) immediately, >> and got 1G to inflate. In this way, all that driver will do is to simply >> inflate another 1G. >> >> Same for the OOM case: when OOM asks for 1G, while inflating 5G is in >> progress, then the driver can >> deduct 1G from the amount that needs to inflate, and as a result, it >> will inflate 4G. >> >> In this case, we will never have the inflating and deflating task run at >> the same time, so I think it is >> possible to remove the lock, and therefore, we will not have that >> deadlock issue. >> >> What would you guys think? > What is balloon_lock at virtballoon_migratepage() for? > > e22504296d4f64fb "virtio_balloon: introduce migration primitives to balloon pages" > f68b992bbb474641 "virtio_balloon: fix race by fill and leak" I think that's the part we need to improve for the existing implementation when going with the above direction. As also stated in the commit log, the lock was proposed to synchronize accesses to elements of struct virtio_balloon and its queue operation. To be more precise, fill_balloon/leak_balloon/migrationpage share vb->pfns[] and vb->num_pfns, which can actually be changed to use local variables of their own each. For example, for migratepage: + __virtio32 pfn; ... - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns, newpage); - tell_host(vb, vb->inflate_vq); + set_page_pfns(vb, &pfn, newpage); + tell_host(vb, vb->inflate_vq, &pfn, VIRTIO_BALLOON_PAGES_PER_PAGE); For the queue access, it could be a small lock for each queue access, which I think won't cause the issue. 
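Expanded slightly, the fragment above amounts to something like the following. The four-argument tell_host() is an assumption (today's tell_host() takes only vb and vq), and the per-call array generalizes the single pfn in the fragment:

static void migrate_tell_host(struct virtio_balloon *vb, struct virtqueue *vq,
			      struct page *page)
{
	/* per-call storage instead of the shared vb->pfns / vb->num_pfns */
	__virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];

	set_page_pfns(vb, pfns, page);
	/* hypothetical variant of tell_host() taking the array and count */
	tell_host(vb, vq, pfns, VIRTIO_BALLOON_PAGES_PER_PAGE);
}

virtballoon_migratepage() would then call this once with the inflate_vq and newpage and once with the deflate_vq and the old page, which is what would make a small per-queue lock sufficient.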
> And even if we could remove balloon_lock, you still cannot use > __GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use > "whether it is safe to wait" flag from > "[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" . Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at xb_set_page()? Best, Wei
Wei Wang wrote: > > And even if we could remove balloon_lock, you still cannot use > > __GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use > > "whether it is safe to wait" flag from > > "[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" . > > Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at > xb_set_page()? Because of dependency shown below. leak_balloon() xb_set_page() xb_preload(GFP_KERNEL) kmalloc(GFP_KERNEL) __alloc_pages_may_oom() Takes oom_lock out_of_memory() blocking_notifier_call_chain() leak_balloon() xb_set_page() xb_preload(GFP_KERNEL) kmalloc(GFP_KERNEL) __alloc_pages_may_oom() Fails to take oom_lock and loop forever By the way, is xb_set_page() safe? Sleeping in the kernel with preemption disabled is a bug, isn't it? __radix_tree_preload() returns 0 with preemption disabled upon success. xb_preload() disables preemption if __radix_tree_preload() fails. Then, kmalloc() is called with preemption disabled, isn't it? But xb_set_page() calls xb_preload(GFP_KERNEL) which might sleep with preemption disabled.
On 10/10/2017 09:09 PM, Tetsuo Handa wrote: > Wei Wang wrote: >>> And even if we could remove balloon_lock, you still cannot use >>> __GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use >>> "whether it is safe to wait" flag from >>> "[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" . >> Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at >> xb_set_page()? > Because of dependency shown below. > > leak_balloon() > xb_set_page() > xb_preload(GFP_KERNEL) > kmalloc(GFP_KERNEL) > __alloc_pages_may_oom() > Takes oom_lock > out_of_memory() > blocking_notifier_call_chain() > leak_balloon() > xb_set_page() > xb_preload(GFP_KERNEL) > kmalloc(GFP_KERNEL) > __alloc_pages_may_oom() > Fails to take oom_lock and loop forever __alloc_pages_may_oom() uses mutex_trylock(&oom_lock). I think the second __alloc_pages_may_oom() will not continue since the first one is in progress. > > By the way, is xb_set_page() safe? > Sleeping in the kernel with preemption disabled is a bug, isn't it? > __radix_tree_preload() returns 0 with preemption disabled upon success. > xb_preload() disables preemption if __radix_tree_preload() fails. > Then, kmalloc() is called with preemption disabled, isn't it? > But xb_set_page() calls xb_preload(GFP_KERNEL) which might sleep with > preemption disabled. Yes, I think that should not be expected, thanks. I plan to change it like this: bool xb_preload(gfp_t gfp) { if (!this_cpu_read(ida_bitmap)) { struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp); if (!bitmap) return false; bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap); kfree(bitmap); } if (__radix_tree_preload(gfp, XB_PRELOAD_SIZE) < 0) return false; return true; } Best, Wei
On 10/11/2017 10:26 AM, Tetsuo Handa wrote: > Wei Wang wrote: >> On 10/10/2017 09:09 PM, Tetsuo Handa wrote: >>> Wei Wang wrote: >>>>> And even if we could remove balloon_lock, you still cannot use >>>>> __GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use >>>>> "whether it is safe to wait" flag from >>>>> "[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" . >>>> Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at >>>> xb_set_page()? >>> Because of dependency shown below. >>> >>> leak_balloon() >>> xb_set_page() >>> xb_preload(GFP_KERNEL) >>> kmalloc(GFP_KERNEL) >>> __alloc_pages_may_oom() >>> Takes oom_lock >>> out_of_memory() >>> blocking_notifier_call_chain() >>> leak_balloon() >>> xb_set_page() >>> xb_preload(GFP_KERNEL) >>> kmalloc(GFP_KERNEL) >>> __alloc_pages_may_oom() >>> Fails to take oom_lock and loop forever >> __alloc_pages_may_oom() uses mutex_trylock(&oom_lock). > Yes. But this mutex_trylock(&oom_lock) is semantically mutex_lock(&oom_lock) > because __alloc_pages_slowpath() will continue looping until > mutex_trylock(&oom_lock) succeeds (or somebody releases memory). > >> I think the second __alloc_pages_may_oom() will not continue since the >> first one is in progress. > The second __alloc_pages_may_oom() will be called repeatedly because > __alloc_pages_slowpath() will continue looping (unless somebody releases > memory). > OK, I see, thanks. So, the point is that the OOM code path should not have memory allocation, and the old leak_balloon (without the F_SG feature) don't need xb_preload(). I think one solution would be to let the OOM uses the old leak_balloon() code path, and we can add one more parameter to leak_balloon to control that: leak_balloon(struct virtio_balloon *vb, size_t num, bool oom) >>> By the way, is xb_set_page() safe? >>> Sleeping in the kernel with preemption disabled is a bug, isn't it? >>> __radix_tree_preload() returns 0 with preemption disabled upon success. >>> xb_preload() disables preemption if __radix_tree_preload() fails. >>> Then, kmalloc() is called with preemption disabled, isn't it? >>> But xb_set_page() calls xb_preload(GFP_KERNEL) which might sleep with >>> preemption disabled. >> Yes, I think that should not be expected, thanks. >> >> I plan to change it like this: >> >> bool xb_preload(gfp_t gfp) >> { >> if (!this_cpu_read(ida_bitmap)) { >> struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp); >> >> if (!bitmap) >> return false; >> bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap); >> kfree(bitmap); >> } > Excuse me, but you are allocating per-CPU memory when running CPU might > change at this line? What happens if running CPU has changed at this line? > Will it work even with new CPU's ida_bitmap == NULL ? > Yes, it will be detected in xb_set_bit(): when ida_bitmap = NULL on the new CPU, xb_set_bit() will return -EAGAIN to the caller, and the caller should restart from xb_preload(). Best, Wei
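A minimal sketch of the restart-on--EAGAIN pattern described in that last reply (it assumes xb_set_bit() reports -EAGAIN when the new CPU's ida_bitmap is still NULL, and it ignores xb_preload() failure for brevity):

static void xb_set_pfn(struct virtio_balloon *vb, unsigned long pfn)
{
	int ret;

	do {
		/* preallocate per-CPU scratch for the xbitmap layer */
		xb_preload(GFP_KERNEL);
		ret = xb_set_bit(&vb->page_xb, pfn);
		xb_preload_end();
		/* -EAGAIN: we migrated to a CPU with no ida_bitmap; retry */
	} while (ret == -EAGAIN);
}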
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index f0b3a0b..6952e19 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -32,6 +32,8 @@ #include <linux/mm.h> #include <linux/mount.h> #include <linux/magic.h> +#include <linux/xbitmap.h> +#include <asm/page.h> /* * Balloon device works in 4K page units. So each page is pointed to by @@ -79,6 +81,9 @@ struct virtio_balloon { /* Synchronize access/update to this struct virtio_balloon elements */ struct mutex balloon_lock; + /* The xbitmap used to record balloon pages */ + struct xb page_xb; + /* The array of pfns we tell the Host about. */ unsigned int num_pfns; __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; @@ -141,13 +146,128 @@ static void set_page_pfns(struct virtio_balloon *vb, page_to_balloon_pfn(page) + i); } + +static void kick_and_wait(struct virtqueue *vq, wait_queue_head_t wq_head) +{ + unsigned int len; + + virtqueue_kick(vq); + wait_event(wq_head, virtqueue_get_buf(vq, &len)); +} + +static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size) +{ + struct scatterlist sg; + unsigned int len; + + sg_init_one(&sg, addr, size); + + /* Detach all the used buffers from the vq */ + while (virtqueue_get_buf(vq, &len)) + ; + + return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL); +} + +static int send_balloon_page_sg(struct virtio_balloon *vb, + struct virtqueue *vq, + void *addr, + uint32_t size, + bool batch) +{ + int err; + + err = add_one_sg(vq, addr, size); + + /* If batchng is requested, we batch till the vq is full */ + if (!batch || !vq->num_free) + kick_and_wait(vq, vb->acked); + + return err; +} + +/* + * Send balloon pages in sgs to host. The balloon pages are recorded in the + * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE. + * The page xbitmap is searched for continuous "1" bits, which correspond + * to continuous pages, to chunk into sgs. + * + * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that + * need to be searched. + */ +static void tell_host_sgs(struct virtio_balloon *vb, + struct virtqueue *vq, + unsigned long page_xb_start, + unsigned long page_xb_end) +{ + unsigned long sg_pfn_start, sg_pfn_end; + void *sg_addr; + uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE); + int err = 0; + + sg_pfn_start = page_xb_start; + while (sg_pfn_start < page_xb_end) { + sg_pfn_start = xb_find_next_set_bit(&vb->page_xb, sg_pfn_start, + page_xb_end); + if (sg_pfn_start == page_xb_end + 1) + break; + sg_pfn_end = xb_find_next_zero_bit(&vb->page_xb, + sg_pfn_start + 1, + page_xb_end); + sg_addr = (void *)pfn_to_kaddr(sg_pfn_start); + sg_len = (sg_pfn_end - sg_pfn_start) << PAGE_SHIFT; + while (sg_len > sg_max_len) { + err = send_balloon_page_sg(vb, vq, sg_addr, sg_max_len, + true); + if (unlikely(err < 0)) + goto err_out; + sg_addr += sg_max_len; + sg_len -= sg_max_len; + } + err = send_balloon_page_sg(vb, vq, sg_addr, sg_len, true); + if (unlikely(err < 0)) + goto err_out; + sg_pfn_start = sg_pfn_end + 1; + } + + /* + * The last few sgs may not reach the batch size, but need a kick to + * notify the device to handle them. 
+ */ + if (vq->num_free != virtqueue_get_vring_size(vq)) + kick_and_wait(vq, vb->acked); + + xb_clear_bit_range(&vb->page_xb, page_xb_start, page_xb_end); + return; + +err_out: + dev_warn(&vb->vdev->dev, "%s failure: %d\n", __func__, err); +} + +static inline void xb_set_page(struct virtio_balloon *vb, + struct page *page, + unsigned long *pfn_min, + unsigned long *pfn_max) +{ + unsigned long pfn = page_to_pfn(page); + + *pfn_min = min(pfn, *pfn_min); + *pfn_max = max(pfn, *pfn_max); + xb_preload(GFP_KERNEL); + xb_set_bit(&vb->page_xb, pfn); + xb_preload_end(); +} + static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) { struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; unsigned num_allocated_pages; + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; /* We can only do one array worth at a time. */ - num = min(num, ARRAY_SIZE(vb->pfns)); + if (!use_sg) + num = min(num, ARRAY_SIZE(vb->pfns)); mutex_lock(&vb->balloon_lock); for (vb->num_pfns = 0; vb->num_pfns < num; @@ -162,7 +282,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) msleep(200); break; } - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + + if (use_sg) + xb_set_page(vb, page, &pfn_min, &pfn_max); + else + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) @@ -171,8 +296,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) num_allocated_pages = vb->num_pfns; /* Did we get any? */ - if (vb->num_pfns != 0) - tell_host(vb, vb->inflate_vq); + if (vb->num_pfns) { + if (use_sg) + tell_host_sgs(vb, vb->inflate_vq, pfn_min, pfn_max); + else + tell_host(vb, vb->inflate_vq); + } mutex_unlock(&vb->balloon_lock); return num_allocated_pages; @@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) struct page *page; struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; LIST_HEAD(pages); + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); + unsigned long pfn_max = 0, pfn_min = ULONG_MAX; - /* We can only do one array worth at a time. */ - num = min(num, ARRAY_SIZE(vb->pfns)); + /* Traditionally, we can only do one array worth at a time. */ + if (!use_sg) + num = min(num, ARRAY_SIZE(vb->pfns)); mutex_lock(&vb->balloon_lock); /* We can't release more pages than taken */ @@ -210,7 +342,11 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) page = balloon_page_dequeue(vb_dev_info); if (!page) break; - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + if (use_sg) + xb_set_page(vb, page, &pfn_min, &pfn_max); + else + set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + list_add(&page->lru, &pages); vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; } @@ -221,8 +357,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); * is true, we *have* to do it in this order */ - if (vb->num_pfns != 0) - tell_host(vb, vb->deflate_vq); + if (vb->num_pfns) { + if (use_sg) + tell_host_sgs(vb, vb->deflate_vq, pfn_min, pfn_max); + else + tell_host(vb, vb->deflate_vq); + } release_pages_balloon(vb, &pages); mutex_unlock(&vb->balloon_lock); return num_freed_pages; @@ -441,6 +581,7 @@ static int init_vqs(struct virtio_balloon *vb) } #ifdef CONFIG_BALLOON_COMPACTION + /* * virtballoon_migratepage - perform the balloon page migration on behalf of * a compation thread. 
(called under page lock) @@ -464,6 +605,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, { struct virtio_balloon *vb = container_of(vb_dev_info, struct virtio_balloon, vb_dev_info); + bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG); unsigned long flags; /* @@ -485,16 +627,24 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, vb_dev_info->isolated_pages--; __count_vm_event(BALLOON_MIGRATE); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns, newpage); - tell_host(vb, vb->inflate_vq); - + if (use_sg) { + send_balloon_page_sg(vb, vb->inflate_vq, page_address(newpage), + PAGE_SIZE, false); + } else { + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, newpage); + tell_host(vb, vb->inflate_vq); + } /* balloon's page migration 2nd step -- deflate "page" */ balloon_page_delete(page); - vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns, page); - tell_host(vb, vb->deflate_vq); - + if (use_sg) { + send_balloon_page_sg(vb, vb->deflate_vq, page_address(page), + PAGE_SIZE, false); + } else { + vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; + set_page_pfns(vb, vb->pfns, page); + tell_host(vb, vb->deflate_vq); + } mutex_unlock(&vb->balloon_lock); put_page(page); /* balloon reference */ @@ -553,6 +703,9 @@ static int virtballoon_probe(struct virtio_device *vdev) if (err) goto out_free_vb; + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG)) + xb_init(&vb->page_xb); + vb->nb.notifier_call = virtballoon_oom_notify; vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; err = register_oom_notifier(&vb->nb); @@ -669,6 +822,7 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, + VIRTIO_BALLOON_F_SG, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 343d7dd..37780a7 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -34,6 +34,7 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_SG 3 /* Use sg instead of PFN lists */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12