Message ID | 73ad59dc-cc88-96ee-2867-9fb628823782@kamp.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Thank you for the response! I'll run off and test that. =) /* * Michael R. Hines * Senior Engineer, DigitalOcean. */ On 10/18/2016 05:47 AM, Peter Lieven wrote: > Am 12.10.2016 um 23:18 schrieb Michael R. Hines: >> Peter, >> >> Greetings from DigitalOcean. We're experiencing the same symptoms >> without this patch. >> We have, collectively, many gigabytes of un-planned-for RSS being >> used per-hypervisor >> that we would like to get rid of =). >> >> Without explicitly trying this patch (will do that ASAP), we >> immediately noticed that the >> 192MB mentioned immediately melts away (Yay) when we disabled the >> coroutine thread pool explicitly, >> with another ~100MB in additional stack usage that would likely also >> go away if we >> applied the entirety of your patch. >> >> Is there any chance you have revisited this or have a timeline for it? > > Hi Michael, > > the current master already includes some of the patches of this > original series. There are still some changes left, but > what works for me is the current master + > > diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c > index 5816702..3eaef68 100644 > --- a/util/qemu-coroutine.c > +++ b/util/qemu-coroutine.c > @@ -25,8 +25,6 @@ enum { > }; > > /** Free list to speed up creation */ > -static QSLIST_HEAD(, Coroutine) release_pool = > QSLIST_HEAD_INITIALIZER(pool); > -static unsigned int release_pool_size; > static __thread QSLIST_HEAD(, Coroutine) alloc_pool = > QSLIST_HEAD_INITIALIZER(pool); > static __thread unsigned int alloc_pool_size; > static __thread Notifier coroutine_pool_cleanup_notifier; > @@ -49,20 +47,10 @@ Coroutine *qemu_coroutine_create(CoroutineEntry > *entry) > if (CONFIG_COROUTINE_POOL) { > co = QSLIST_FIRST(&alloc_pool); > if (!co) { > - if (release_pool_size > POOL_BATCH_SIZE) { > - /* Slow path; a good place to register the > destructor, too. */ > - if (!coroutine_pool_cleanup_notifier.notify) { > - coroutine_pool_cleanup_notifier.notify = > coroutine_pool_cleanup; > - qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); > - } > - > - /* This is not exact; there could be a little skew > between > - * release_pool_size and the actual size of > release_pool. But > - * it is just a heuristic, it does not need to be > perfect. > - */ > - alloc_pool_size = atomic_xchg(&release_pool_size, 0); > - QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); > - co = QSLIST_FIRST(&alloc_pool); > + /* Slow path; a good place to register the destructor, > too. */ > + if (!coroutine_pool_cleanup_notifier.notify) { > + coroutine_pool_cleanup_notifier.notify = > coroutine_pool_cleanup; > + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); > } > } > if (co) { > @@ -85,11 +73,6 @@ static void coroutine_delete(Coroutine *co) > co->caller = NULL; > > if (CONFIG_COROUTINE_POOL) { > - if (release_pool_size < POOL_BATCH_SIZE * 2) { > - QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); > - atomic_inc(&release_pool_size); > - return; > - } > if (alloc_pool_size < POOL_BATCH_SIZE) { > QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); > alloc_pool_size++; > > + invoking qemu with the following environemnet variable set: > > MALLOC_MMAP_THRESHOLD_=32768 qemu-system-x86_64 .... > > The last one makes glibc automatically using mmap when the malloced > memory exceeds 32kByte. > > Hope this helps, > Peter >
On 10/18/2016 05:47 AM, Peter Lieven wrote: > Am 12.10.2016 um 23:18 schrieb Michael R. Hines: >> Peter, >> >> Greetings from DigitalOcean. We're experiencing the same symptoms >> without this patch. >> We have, collectively, many gigabytes of un-planned-for RSS being >> used per-hypervisor >> that we would like to get rid of =). >> >> Without explicitly trying this patch (will do that ASAP), we >> immediately noticed that the >> 192MB mentioned immediately melts away (Yay) when we disabled the >> coroutine thread pool explicitly, >> with another ~100MB in additional stack usage that would likely also >> go away if we >> applied the entirety of your patch. >> >> Is there any chance you have revisited this or have a timeline for it? > > Hi Michael, > > the current master already includes some of the patches of this > original series. There are still some changes left, but > what works for me is the current master + > > diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c > index 5816702..3eaef68 100644 > --- a/util/qemu-coroutine.c > +++ b/util/qemu-coroutine.c > @@ -25,8 +25,6 @@ enum { > }; > > /** Free list to speed up creation */ > -static QSLIST_HEAD(, Coroutine) release_pool = > QSLIST_HEAD_INITIALIZER(pool); > -static unsigned int release_pool_size; > static __thread QSLIST_HEAD(, Coroutine) alloc_pool = > QSLIST_HEAD_INITIALIZER(pool); > static __thread unsigned int alloc_pool_size; > static __thread Notifier coroutine_pool_cleanup_notifier; > @@ -49,20 +47,10 @@ Coroutine *qemu_coroutine_create(CoroutineEntry > *entry) > if (CONFIG_COROUTINE_POOL) { > co = QSLIST_FIRST(&alloc_pool); > if (!co) { > - if (release_pool_size > POOL_BATCH_SIZE) { > - /* Slow path; a good place to register the > destructor, too. */ > - if (!coroutine_pool_cleanup_notifier.notify) { > - coroutine_pool_cleanup_notifier.notify = > coroutine_pool_cleanup; > - qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); > - } > - > - /* This is not exact; there could be a little skew > between > - * release_pool_size and the actual size of > release_pool. But > - * it is just a heuristic, it does not need to be > perfect. > - */ > - alloc_pool_size = atomic_xchg(&release_pool_size, 0); > - QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); > - co = QSLIST_FIRST(&alloc_pool); > + /* Slow path; a good place to register the destructor, > too. */ > + if (!coroutine_pool_cleanup_notifier.notify) { > + coroutine_pool_cleanup_notifier.notify = > coroutine_pool_cleanup; > + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); > } > } > if (co) { > @@ -85,11 +73,6 @@ static void coroutine_delete(Coroutine *co) > co->caller = NULL; > > if (CONFIG_COROUTINE_POOL) { > - if (release_pool_size < POOL_BATCH_SIZE * 2) { > - QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); > - atomic_inc(&release_pool_size); > - return; > - } > if (alloc_pool_size < POOL_BATCH_SIZE) { > QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); > alloc_pool_size++; > > + invoking qemu with the following environemnet variable set: > > MALLOC_MMAP_THRESHOLD_=32768 qemu-system-x86_64 .... > > The last one makes glibc automatically using mmap when the malloced > memory exceeds 32kByte. > Peter, I tested the above patch (and the environment variable --- it doesn't quite come close to as lean of an RSS tally as the original patchset -------- there's still about 70-80 MB of remaining RSS. Any chance you could trim the remaining fat before merging this? =) /* * Michael R. Hines * Senior Engineer, DigitalOcean. */
On 10/31/2016 05:00 PM, Michael R. Hines wrote: > On 10/18/2016 05:47 AM, Peter Lieven wrote: >> Am 12.10.2016 um 23:18 schrieb Michael R. Hines: >>> Peter, >>> >>> Greetings from DigitalOcean. We're experiencing the same symptoms >>> without this patch. >>> We have, collectively, many gigabytes of un-planned-for RSS being >>> used per-hypervisor >>> that we would like to get rid of =). >>> >>> Without explicitly trying this patch (will do that ASAP), we >>> immediately noticed that the >>> 192MB mentioned immediately melts away (Yay) when we disabled the >>> coroutine thread pool explicitly, >>> with another ~100MB in additional stack usage that would likely also >>> go away if we >>> applied the entirety of your patch. >>> >>> Is there any chance you have revisited this or have a timeline for it? >> >> Hi Michael, >> >> the current master already includes some of the patches of this >> original series. There are still some changes left, but >> what works for me is the current master + >> >> diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c >> index 5816702..3eaef68 100644 >> --- a/util/qemu-coroutine.c >> +++ b/util/qemu-coroutine.c >> @@ -25,8 +25,6 @@ enum { >> }; >> >> /** Free list to speed up creation */ >> -static QSLIST_HEAD(, Coroutine) release_pool = >> QSLIST_HEAD_INITIALIZER(pool); >> -static unsigned int release_pool_size; >> static __thread QSLIST_HEAD(, Coroutine) alloc_pool = >> QSLIST_HEAD_INITIALIZER(pool); >> static __thread unsigned int alloc_pool_size; >> static __thread Notifier coroutine_pool_cleanup_notifier; >> @@ -49,20 +47,10 @@ Coroutine *qemu_coroutine_create(CoroutineEntry >> *entry) >> if (CONFIG_COROUTINE_POOL) { >> co = QSLIST_FIRST(&alloc_pool); >> if (!co) { >> - if (release_pool_size > POOL_BATCH_SIZE) { >> - /* Slow path; a good place to register the >> destructor, too. */ >> - if (!coroutine_pool_cleanup_notifier.notify) { >> - coroutine_pool_cleanup_notifier.notify = >> coroutine_pool_cleanup; >> - qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); >> - } >> - >> - /* This is not exact; there could be a little skew >> between >> - * release_pool_size and the actual size of >> release_pool. But >> - * it is just a heuristic, it does not need to be >> perfect. >> - */ >> - alloc_pool_size = atomic_xchg(&release_pool_size, 0); >> - QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); >> - co = QSLIST_FIRST(&alloc_pool); >> + /* Slow path; a good place to register the destructor, >> too. */ >> + if (!coroutine_pool_cleanup_notifier.notify) { >> + coroutine_pool_cleanup_notifier.notify = >> coroutine_pool_cleanup; >> + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); >> } >> } >> if (co) { >> @@ -85,11 +73,6 @@ static void coroutine_delete(Coroutine *co) >> co->caller = NULL; >> >> if (CONFIG_COROUTINE_POOL) { >> - if (release_pool_size < POOL_BATCH_SIZE * 2) { >> - QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); >> - atomic_inc(&release_pool_size); >> - return; >> - } >> if (alloc_pool_size < POOL_BATCH_SIZE) { >> QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); >> alloc_pool_size++; >> >> + invoking qemu with the following environemnet variable set: >> >> MALLOC_MMAP_THRESHOLD_=32768 qemu-system-x86_64 .... >> >> The last one makes glibc automatically using mmap when the malloced >> memory exceeds 32kByte. >> > > Peter, > > I tested the above patch (and the environment variable --- it doesn't > quite come close to as lean of > an RSS tally as the original patchset -------- there's still about > 70-80 MB of remaining RSS. > > Any chance you could trim the remaining fat before merging this? =) > > False alarm! I didn't set the MMAP threshold low enough. Now the results are on-par with the other patchset. Thank you!
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 5816702..3eaef68 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -25,8 +25,6 @@ enum { }; /** Free list to speed up creation */ -static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool); -static unsigned int release_pool_size; static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool); static __thread unsigned int alloc_pool_size; static __thread Notifier coroutine_pool_cleanup_notifier; @@ -49,20 +47,10 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry) if (CONFIG_COROUTINE_POOL) { co = QSLIST_FIRST(&alloc_pool); if (!co) { - if (release_pool_size > POOL_BATCH_SIZE) { - /* Slow path; a good place to register the destructor, too. */ - if (!coroutine_pool_cleanup_notifier.notify) { - coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup; - qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); - } - - /* This is not exact; there could be a little skew between - * release_pool_size and the actual size of release_pool. But - * it is just a heuristic, it does not need to be perfect. - */ - alloc_pool_size = atomic_xchg(&release_pool_size, 0); - QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); - co = QSLIST_FIRST(&alloc_pool); + /* Slow path; a good place to register the destructor, too. */ + if (!coroutine_pool_cleanup_notifier.notify) { + coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup; + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); } } if (co) { @@ -85,11 +73,6 @@ static void coroutine_delete(Coroutine *co) co->caller = NULL; if (CONFIG_COROUTINE_POOL) { - if (release_pool_size < POOL_BATCH_SIZE * 2) { - QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); - atomic_inc(&release_pool_size); - return; - } if (alloc_pool_size < POOL_BATCH_SIZE) { QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); alloc_pool_size++;