Message ID | 1449578684-16320-1-git-send-email-derek.j.morton@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Dec 08, 2015 at 12:44:44PM +0000, Derek Morton wrote: > gem_flink_race and prime_self_import have subtests which read the > number of open gem objects from debugfs to determine if objects have > leaked during the test. However the test can fail sporadically if > the number of gem objects changes due to other process activity. > This patch introduces a change to check the number of gem objects > several times to filter out any fluctuations. Why exactly does this happen? IGT tests should be run on bare metal, with everything else killed/subdued/shutup. If there's still things going on that create objects, we need to stop them from doing that. If this only applies to Android, or some special Android deamon them imo check for that at runtime and igt_skip("your setup is invalid, deamon %s running\n"); is the correct fix. After all just because you sampled for a bit doesn't mean that it wont still change right when you start running the test for real, so this is still fragile. Also would be good to extract get_stable_obj_count to a proper igt library function, if it indeed needs to be this tricky. And then add the explanation for why we need this in the gtkdoc. Thanks, Daniel > > Signed-off-by: Derek Morton <derek.j.morton@intel.com> > --- > tests/gem_flink_race.c | 34 +++++++++++++++++++++++++++++----- > tests/prime_self_import.c | 43 +++++++++++++++++++++++++++++++++---------- > 2 files changed, 62 insertions(+), 15 deletions(-) > > diff --git a/tests/gem_flink_race.c b/tests/gem_flink_race.c > index b17ef85..0552e12 100644 > --- a/tests/gem_flink_race.c > +++ b/tests/gem_flink_race.c > @@ -44,7 +44,7 @@ IGT_TEST_DESCRIPTION("Check for flink/open vs. gem close races."); > * in the flink name and corresponding reference getting leaked. > */ > > -/* We want lockless and I'm to lazy to dig out an atomic libarary. On x86 this > +/* We want lockless and I'm to lazy to dig out an atomic library. On x86 this > * works, too. */ > volatile int pls_die = 0; > int fd; > @@ -65,6 +65,32 @@ static int get_object_count(void) > return ret; > } > > +static int get_stable_obj_count(int driver) > +{ > + /* The test relies on the system being in the same state before and > + after the test so any difference in the object count is a result of > + leaks during the test. gem_quiescent_gpu() mostly achieves this but > + occasionally obj_count can still change. The loop ensures obj_count > + has remained stable over several checks */ > + int obj_count, prev_obj_count; > + int loop_count = 0; > + gem_quiescent_gpu(driver); > + prev_obj_count = get_object_count(); > + while (loop_count < 4) { > + usleep(200000); > + gem_quiescent_gpu(driver); > + obj_count = get_object_count(); > + if (obj_count == prev_obj_count) { > + loop_count++; > + } else { > + igt_debug("loop_count=%d, obj_count=%d, prev_obj_count=%d\n", loop_count, obj_count, prev_obj_count); > + loop_count = 0; > + prev_obj_count = obj_count; > + } > + > + } > + return obj_count; > +} > > static void *thread_fn_flink_name(void *p) > { > @@ -164,8 +190,7 @@ static void test_flink_close(void) > * up the counts */ > fake = drm_open_driver(DRIVER_INTEL); > > - gem_quiescent_gpu(fake); > - obj_count = get_object_count(); > + obj_count = get_stable_obj_count(fake); > > num_threads = sysconf(_SC_NPROCESSORS_ONLN); > > @@ -190,8 +215,7 @@ static void test_flink_close(void) > > close(fd); > > - gem_quiescent_gpu(fake); > - obj_count = get_object_count() - obj_count; > + obj_count = get_stable_obj_count(fake) - obj_count; > > igt_info("leaked %i objects\n", obj_count); > > diff --git a/tests/prime_self_import.c b/tests/prime_self_import.c > index 91fe231..977c7b2 100644 > --- a/tests/prime_self_import.c > +++ b/tests/prime_self_import.c > @@ -47,7 +47,7 @@ > #include "drm.h" > > IGT_TEST_DESCRIPTION("Check whether prime import/export works on the same" > - " device."); > + " device... but with different fds."); > > #define BO_SIZE (16*1024) > > @@ -157,7 +157,7 @@ static void test_with_one_bo_two_files(void) > dma_buf_fd2 = prime_handle_to_fd(fd2, handle_open); > handle_import = prime_fd_to_handle(fd2, dma_buf_fd2); > > - /* dma-buf selfimporting an flink bo should give the same handle */ > + /* dma-buf self importing an flink bo should give the same handle */ > igt_assert_eq_u32(handle_import, handle_open); > > close(fd1); > @@ -226,6 +226,33 @@ static int get_object_count(void) > return ret; > } > > +static int get_stable_obj_count(int driver) > +{ > + /* The test relies on the system being in the same state before and > + after the test so any difference in the object count is a result of > + leaks during the test. gem_quiescent_gpu() mostly achieves this but > + occasionally obj_count can still change. The loop ensures obj_count > + has remained stable over several checks */ > + int obj_count, prev_obj_count; > + int loop_count = 0; > + gem_quiescent_gpu(driver); > + prev_obj_count = get_object_count(); > + while (loop_count < 4) { > + usleep(200000); > + gem_quiescent_gpu(driver); > + obj_count = get_object_count(); > + if (obj_count == prev_obj_count) { > + loop_count++; > + } else { > + igt_debug("loop_count=%d, obj_count=%d, prev_obj_count=%d\n", loop_count, obj_count, prev_obj_count); > + loop_count = 0; > + prev_obj_count = obj_count; > + } > + > + } > + return obj_count; > +} > + > static void *thread_fn_reimport_vs_close(void *p) > { > struct drm_gem_close close_bo; > @@ -258,8 +285,7 @@ static void test_reimport_close_race(void) > * up the counts */ > fake = drm_open_driver(DRIVER_INTEL); > > - gem_quiescent_gpu(fake); > - obj_count = get_object_count(); > + obj_count = get_stable_obj_count(fake); > > num_threads = sysconf(_SC_NPROCESSORS_ONLN); > > @@ -290,8 +316,7 @@ static void test_reimport_close_race(void) > close(fds[0]); > close(fds[1]); > > - gem_quiescent_gpu(fake); > - obj_count = get_object_count() - obj_count; > + obj_count = get_stable_obj_count(fake) - obj_count; > > igt_info("leaked %i objects\n", obj_count); > > @@ -350,8 +375,7 @@ static void test_export_close_race(void) > * up the counts */ > fake = drm_open_driver(DRIVER_INTEL); > > - gem_quiescent_gpu(fake); > - obj_count = get_object_count(); > + obj_count = get_stable_obj_count(fake); > > fd = drm_open_driver(DRIVER_INTEL); > > @@ -373,8 +397,7 @@ static void test_export_close_race(void) > > close(fd); > > - gem_quiescent_gpu(fake); > - obj_count = get_object_count() - obj_count; > + obj_count = get_stable_obj_count(fake) - obj_count; > > igt_info("leaked %i objects\n", obj_count); > > -- > 1.9.1 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > >-----Original Message----- >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter >Sent: Thursday, December 10, 2015 10:13 AM >To: Morton, Derek J >Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas >Subject: Re: [Intel-gfx] [PATCH i-g-t] gem_flink_race/prime_self_import: Improve test reliability > >On Tue, Dec 08, 2015 at 12:44:44PM +0000, Derek Morton wrote: >> gem_flink_race and prime_self_import have subtests which read the >> number of open gem objects from debugfs to determine if objects have >> leaked during the test. However the test can fail sporadically if the >> number of gem objects changes due to other process activity. >> This patch introduces a change to check the number of gem objects >> several times to filter out any fluctuations. > >Why exactly does this happen? IGT tests should be run on bare metal, with everything else killed/subdued/shutup. If there's still things going on that create objects, we need to stop them from doing that. > >If this only applies to Android, or some special Android deamon them imo check for that at runtime and igt_skip("your setup is invalid, deamon %s running\n"); is the correct fix. After all just because you sampled for a bit doesn't mean that it wont still change right when you start running the test for real, so this is still fragile. Before running tests on android we do stop everything possible. I suspect the culprit is coreu getting automatically restarted after it is stopped. I had additional debug while developing this patch and what I saw was the system being mostly quiescent but with some very low level background activity. 1 extra object being created and then deleted occasionally. Depending on whether it occurred at the start or end of the test it was resulting in a reported leak of either 1 or -1 objects. The patch fixes that issue by taking several samples and requiring them to be the same, therefore filtering out the low level background noise. It would not help if something in the background allocated an object and kept it allocated, but I have not seen that happen. I only saw once the object count increasing for 2 consecutive reads hence the count to 4 to give a margin. The test was failing about 10%. With this patch I got 100% pass across 300 runs of each of the tests. If you are concerned about the behaviour when running the test with a load of background activity I could add code to limit to the reset of the count and fail the test in that instance. That would give a benefit of distinguishing a test fail due to excessive background activity from a detected leak. I would not want to just have the test skip as that introduces a hole in our test coverage. > >Also would be good to extract get_stable_obj_count to a proper igt library function, if it indeed needs to be this tricky. And then add the explanation for why we need this in the gtkdoc. I can move the code to an igt library. Which library would you suggest? Igt_debugfs ? //Derek > >Thanks, Daniel >> >> Signed-off-by: Derek Morton <derek.j.morton@intel.com> >> --- >> tests/gem_flink_race.c | 34 +++++++++++++++++++++++++++++----- >> tests/prime_self_import.c | 43 >> +++++++++++++++++++++++++++++++++---------- >> 2 files changed, 62 insertions(+), 15 deletions(-) >> >> diff --git a/tests/gem_flink_race.c b/tests/gem_flink_race.c index >> b17ef85..0552e12 100644 >> --- a/tests/gem_flink_race.c >> +++ b/tests/gem_flink_race.c >> @@ -44,7 +44,7 @@ IGT_TEST_DESCRIPTION("Check for flink/open vs. gem close races."); >> * in the flink name and corresponding reference getting leaked. >> */ >> >> -/* We want lockless and I'm to lazy to dig out an atomic libarary. On >> x86 this >> +/* We want lockless and I'm to lazy to dig out an atomic library. On >> +x86 this >> * works, too. */ >> volatile int pls_die = 0; >> int fd; >> @@ -65,6 +65,32 @@ static int get_object_count(void) >> return ret; >> } >> >> +static int get_stable_obj_count(int driver) { >> + /* The test relies on the system being in the same state before and >> + after the test so any difference in the object count is a result of >> + leaks during the test. gem_quiescent_gpu() mostly achieves this but >> + occasionally obj_count can still change. The loop ensures obj_count >> + has remained stable over several checks */ >> + int obj_count, prev_obj_count; >> + int loop_count = 0; >> + gem_quiescent_gpu(driver); >> + prev_obj_count = get_object_count(); >> + while (loop_count < 4) { >> + usleep(200000); >> + gem_quiescent_gpu(driver); >> + obj_count = get_object_count(); >> + if (obj_count == prev_obj_count) { >> + loop_count++; >> + } else { >> + igt_debug("loop_count=%d, obj_count=%d, prev_obj_count=%d\n", loop_count, obj_count, prev_obj_count); >> + loop_count = 0; >> + prev_obj_count = obj_count; >> + } >> + >> + } >> + return obj_count; >> +} >> >> static void *thread_fn_flink_name(void *p) { @@ -164,8 +190,7 @@ >> static void test_flink_close(void) >> * up the counts */ >> fake = drm_open_driver(DRIVER_INTEL); >> >> - gem_quiescent_gpu(fake); >> - obj_count = get_object_count(); >> + obj_count = get_stable_obj_count(fake); >> >> num_threads = sysconf(_SC_NPROCESSORS_ONLN); >> >> @@ -190,8 +215,7 @@ static void test_flink_close(void) >> >> close(fd); >> >> - gem_quiescent_gpu(fake); >> - obj_count = get_object_count() - obj_count; >> + obj_count = get_stable_obj_count(fake) - obj_count; >> >> igt_info("leaked %i objects\n", obj_count); >> >> diff --git a/tests/prime_self_import.c b/tests/prime_self_import.c >> index 91fe231..977c7b2 100644 >> --- a/tests/prime_self_import.c >> +++ b/tests/prime_self_import.c >> @@ -47,7 +47,7 @@ >> #include "drm.h" >> >> IGT_TEST_DESCRIPTION("Check whether prime import/export works on the same" >> - " device."); >> + " device... but with different fds."); >> >> #define BO_SIZE (16*1024) >> >> @@ -157,7 +157,7 @@ static void test_with_one_bo_two_files(void) >> dma_buf_fd2 = prime_handle_to_fd(fd2, handle_open); >> handle_import = prime_fd_to_handle(fd2, dma_buf_fd2); >> >> - /* dma-buf selfimporting an flink bo should give the same handle */ >> + /* dma-buf self importing an flink bo should give the same handle */ >> igt_assert_eq_u32(handle_import, handle_open); >> >> close(fd1); >> @@ -226,6 +226,33 @@ static int get_object_count(void) >> return ret; >> } >> >> +static int get_stable_obj_count(int driver) { >> + /* The test relies on the system being in the same state before and >> + after the test so any difference in the object count is a result of >> + leaks during the test. gem_quiescent_gpu() mostly achieves this but >> + occasionally obj_count can still change. The loop ensures obj_count >> + has remained stable over several checks */ >> + int obj_count, prev_obj_count; >> + int loop_count = 0; >> + gem_quiescent_gpu(driver); >> + prev_obj_count = get_object_count(); >> + while (loop_count < 4) { >> + usleep(200000); >> + gem_quiescent_gpu(driver); >> + obj_count = get_object_count(); >> + if (obj_count == prev_obj_count) { >> + loop_count++; >> + } else { >> + igt_debug("loop_count=%d, obj_count=%d, prev_obj_count=%d\n", loop_count, obj_count, prev_obj_count); >> + loop_count = 0; >> + prev_obj_count = obj_count; >> + } >> + >> + } >> + return obj_count; >> +} >> + >> static void *thread_fn_reimport_vs_close(void *p) { >> struct drm_gem_close close_bo; >> @@ -258,8 +285,7 @@ static void test_reimport_close_race(void) >> * up the counts */ >> fake = drm_open_driver(DRIVER_INTEL); >> >> - gem_quiescent_gpu(fake); >> - obj_count = get_object_count(); >> + obj_count = get_stable_obj_count(fake); >> >> num_threads = sysconf(_SC_NPROCESSORS_ONLN); >> >> @@ -290,8 +316,7 @@ static void test_reimport_close_race(void) >> close(fds[0]); >> close(fds[1]); >> >> - gem_quiescent_gpu(fake); >> - obj_count = get_object_count() - obj_count; >> + obj_count = get_stable_obj_count(fake) - obj_count; >> >> igt_info("leaked %i objects\n", obj_count); >> >> @@ -350,8 +375,7 @@ static void test_export_close_race(void) >> * up the counts */ >> fake = drm_open_driver(DRIVER_INTEL); >> >> - gem_quiescent_gpu(fake); >> - obj_count = get_object_count(); >> + obj_count = get_stable_obj_count(fake); >> >> fd = drm_open_driver(DRIVER_INTEL); >> >> @@ -373,8 +397,7 @@ static void test_export_close_race(void) >> >> close(fd); >> >> - gem_quiescent_gpu(fake); >> - obj_count = get_object_count() - obj_count; >> + obj_count = get_stable_obj_count(fake) - obj_count; >> >> igt_info("leaked %i objects\n", obj_count); >> >> -- >> 1.9.1 >> >> _______________________________________________ >> Intel-gfx mailing list >> Intel-gfx@lists.freedesktop.org >> http://lists.freedesktop.org/mailman/listinfo/intel-gfx > >-- >Daniel Vetter >Software Engineer, Intel Corporation >http://blog.ffwll.ch >
On Thu, Dec 10, 2015 at 11:51:29AM +0000, Morton, Derek J wrote: > > > > > >-----Original Message----- > >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter > >Sent: Thursday, December 10, 2015 10:13 AM > >To: Morton, Derek J > >Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas > >Subject: Re: [Intel-gfx] [PATCH i-g-t] gem_flink_race/prime_self_import: Improve test reliability > > > >On Tue, Dec 08, 2015 at 12:44:44PM +0000, Derek Morton wrote: > >> gem_flink_race and prime_self_import have subtests which read the > >> number of open gem objects from debugfs to determine if objects have > >> leaked during the test. However the test can fail sporadically if the > >> number of gem objects changes due to other process activity. > >> This patch introduces a change to check the number of gem objects > >> several times to filter out any fluctuations. > > > >Why exactly does this happen? IGT tests should be run on bare metal, > >with everything else killed/subdued/shutup. If there's still things > >going on that create objects, we need to stop them from doing that. > > > >If this only applies to Android, or some special Android deamon them > >imo check for that at runtime and igt_skip("your setup is invalid, > >deamon %s running\n"); is the correct fix. After all just because you > >sampled for a bit doesn't mean that it wont still change right when you > >start running the test for real, so this is still fragile. > > Before running tests on android we do stop everything possible. I > suspect the culprit is coreu getting automatically restarted after it is > stopped. I had additional debug while developing this patch and what I > saw was the system being mostly quiescent but with some very low level > background activity. 1 extra object being created and then deleted > occasionally. Depending on whether it occurred at the start or end of > the test it was resulting in a reported leak of either 1 or -1 objects. > The patch fixes that issue by taking several samples and requiring them > to be the same, therefore filtering out the low level background noise. > It would not help if something in the background allocated an object and > kept it allocated, but I have not seen that happen. I only saw once the > object count increasing for 2 consecutive reads hence the count to 4 to > give a margin. The test was failing about 10%. With this patch I got > 100% pass across 300 runs of each of the tests. Hm, piglit checks that there's no other drm clients running. Have you tried re-running that check to zero in on the culprit? > If you are concerned about the behaviour when running the test with a > load of background activity I could add code to limit to the reset of > the count and fail the test in that instance. That would give a benefit > of distinguishing a test fail due to excessive background activity from > a detected leak. I'm also concerned for the overhead this causes everyone else. If this really is some Android trouble then I think it'd be good to only compile this on Android. But would still be much better if you can get to a reliably clean test environment. > I would not want to just have the test skip as that introduces a hole in > our test coverage. > > >Also would be good to extract get_stable_obj_count to a proper igt > >library function, if it indeed needs to be this tricky. And then add > >the explanation for why we need this in the gtkdoc. > > I can move the code to an igt library. Which library would you suggest? Igt_debugfs ? Hm yeah, it's a bit the dumping ground for all things debugfs access ;-) -Daniel
> > >-----Original Message----- >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter >Sent: Thursday, December 10, 2015 12:53 PM >To: Morton, Derek J >Cc: Daniel Vetter; intel-gfx@lists.freedesktop.org; Wood, Thomas >Subject: Re: [Intel-gfx] [PATCH i-g-t] gem_flink_race/prime_self_import: Improve test reliability > >On Thu, Dec 10, 2015 at 11:51:29AM +0000, Morton, Derek J wrote: >> > >> > >> >-----Original Message----- >> >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of >> >Daniel Vetter >> >Sent: Thursday, December 10, 2015 10:13 AM >> >To: Morton, Derek J >> >Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas >> >Subject: Re: [Intel-gfx] [PATCH i-g-t] >> >gem_flink_race/prime_self_import: Improve test reliability >> > >> >On Tue, Dec 08, 2015 at 12:44:44PM +0000, Derek Morton wrote: >> >> gem_flink_race and prime_self_import have subtests which read the >> >> number of open gem objects from debugfs to determine if objects >> >> have leaked during the test. However the test can fail sporadically >> >> if the number of gem objects changes due to other process activity. >> >> This patch introduces a change to check the number of gem objects >> >> several times to filter out any fluctuations. >> > >> >Why exactly does this happen? IGT tests should be run on bare metal, >> >with everything else killed/subdued/shutup. If there's still things >> >going on that create objects, we need to stop them from doing that. >> > >> >If this only applies to Android, or some special Android deamon them >> >imo check for that at runtime and igt_skip("your setup is invalid, >> >deamon %s running\n"); is the correct fix. After all just because you >> >sampled for a bit doesn't mean that it wont still change right when >> >you start running the test for real, so this is still fragile. >> >> Before running tests on android we do stop everything possible. I >> suspect the culprit is coreu getting automatically restarted after it >> is stopped. I had additional debug while developing this patch and >> what I saw was the system being mostly quiescent but with some very >> low level background activity. 1 extra object being created and then >> deleted occasionally. Depending on whether it occurred at the start or >> end of the test it was resulting in a reported leak of either 1 or -1 objects. >> The patch fixes that issue by taking several samples and requiring >> them to be the same, therefore filtering out the low level background noise. >> It would not help if something in the background allocated an object >> and kept it allocated, but I have not seen that happen. I only saw >> once the object count increasing for 2 consecutive reads hence the >> count to 4 to give a margin. The test was failing about 10%. With this >> patch I got 100% pass across 300 runs of each of the tests. > >Hm, piglit checks that there's no other drm clients running. Have you tried re-running that check to zero in on the culprit? We don't use piglet to run IGT tests on Android. I have had a look at what piglet does and added the same check to our scripts. (It reads a list of clients from /sys/kernel/debug/dri/0/clients) For CHV it shows a process called 'y', though that seems to be some issue on CHV that all driver clients are called 'y'. I checked on BXT which properly shows the process names and it looks like it is the binder process (which is handling some inter process communication). I don't think this is something we can stop. > >> If you are concerned about the behaviour when running the test with a >> load of background activity I could add code to limit to the reset of >> the count and fail the test in that instance. That would give a >> benefit of distinguishing a test fail due to excessive background >> activity from a detected leak. > >I'm also concerned for the overhead this causes everyone else. If this really is some Android trouble then I think it'd be good to only compile this on Android. But would still be much better if you can get to a reliably clean test environment. I will make the loop part android specific. //Derek > >> I would not want to just have the test skip as that introduces a hole >> in our test coverage. >> >> >Also would be good to extract get_stable_obj_count to a proper igt >> >library function, if it indeed needs to be this tricky. And then add >> >the explanation for why we need this in the gtkdoc. >> >> I can move the code to an igt library. Which library would you suggest? Igt_debugfs ? > >Hm yeah, it's a bit the dumping ground for all things debugfs access ;-) -Daniel >-- >Daniel Vetter >Software Engineer, Intel Corporation >http://blog.ffwll.ch >
On Fri, Dec 11, 2015 at 10:33:46AM +0000, Morton, Derek J wrote: > > > > > >-----Original Message----- > >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter > >Sent: Thursday, December 10, 2015 12:53 PM > >To: Morton, Derek J > >Cc: Daniel Vetter; intel-gfx@lists.freedesktop.org; Wood, Thomas > >Subject: Re: [Intel-gfx] [PATCH i-g-t] gem_flink_race/prime_self_import: Improve test reliability > > > >On Thu, Dec 10, 2015 at 11:51:29AM +0000, Morton, Derek J wrote: > >> > > >> > > >> >-----Original Message----- > >> >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of > >> >Daniel Vetter > >> >Sent: Thursday, December 10, 2015 10:13 AM > >> >To: Morton, Derek J > >> >Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas > >> >Subject: Re: [Intel-gfx] [PATCH i-g-t] > >> >gem_flink_race/prime_self_import: Improve test reliability > >> > > >> >On Tue, Dec 08, 2015 at 12:44:44PM +0000, Derek Morton wrote: > >> >> gem_flink_race and prime_self_import have subtests which read the > >> >> number of open gem objects from debugfs to determine if objects > >> >> have leaked during the test. However the test can fail sporadically > >> >> if the number of gem objects changes due to other process activity. > >> >> This patch introduces a change to check the number of gem objects > >> >> several times to filter out any fluctuations. > >> > > >> >Why exactly does this happen? IGT tests should be run on bare metal, > >> >with everything else killed/subdued/shutup. If there's still things > >> >going on that create objects, we need to stop them from doing that. > >> > > >> >If this only applies to Android, or some special Android deamon them > >> >imo check for that at runtime and igt_skip("your setup is invalid, > >> >deamon %s running\n"); is the correct fix. After all just because you > >> >sampled for a bit doesn't mean that it wont still change right when > >> >you start running the test for real, so this is still fragile. > >> > >> Before running tests on android we do stop everything possible. I > >> suspect the culprit is coreu getting automatically restarted after it > >> is stopped. I had additional debug while developing this patch and > >> what I saw was the system being mostly quiescent but with some very > >> low level background activity. 1 extra object being created and then > >> deleted occasionally. Depending on whether it occurred at the start or > >> end of the test it was resulting in a reported leak of either 1 or -1 objects. > >> The patch fixes that issue by taking several samples and requiring > >> them to be the same, therefore filtering out the low level background noise. > >> It would not help if something in the background allocated an object > >> and kept it allocated, but I have not seen that happen. I only saw > >> once the object count increasing for 2 consecutive reads hence the > >> count to 4 to give a margin. The test was failing about 10%. With this > >> patch I got 100% pass across 300 runs of each of the tests. > > > >Hm, piglit checks that there's no other drm clients running. Have you tried re-running that check to zero in on the culprit? > > We don't use piglet to run IGT tests on Android. I have had a look at what piglet does and added the same check to our scripts. (It reads a list of clients from /sys/kernel/debug/dri/0/clients) > For CHV it shows a process called 'y', though that seems to be some issue on CHV that all driver clients are called 'y'. I checked on BXT which properly shows the process names and it looks like it is the binder process (which is handling some inter process communication). I don't think this is something we can stop. Nah, you definitely can't stop binder, won't have an android left after that ;-) But it is strange that binder owns these buffers. Binder is just IPC, but like unix domain sockets you can also throw around file descriptors. So something on your system is moving open drm fd devices still around. I don't have an idea what kind of audit/debug tooling binder offers, but there should be a way to figure out who really owns that file descriptor. If you're lucky lsof (if android has that, otherwise walk /proc/*/fd/* symlinks manually) should help. Cheers, Daniel > >> If you are concerned about the behaviour when running the test with a > >> load of background activity I could add code to limit to the reset of > >> the count and fail the test in that instance. That would give a > >> benefit of distinguishing a test fail due to excessive background > >> activity from a detected leak. > > > >I'm also concerned for the overhead this causes everyone else. If this really is some Android trouble then I think it'd be good to only compile this on Android. But would still be much better if you can get to a reliably clean test environment. > > I will make the loop part android specific. > > > //Derek > > > > >> I would not want to just have the test skip as that introduces a hole > >> in our test coverage. > >> > >> >Also would be good to extract get_stable_obj_count to a proper igt > >> >library function, if it indeed needs to be this tricky. And then add > >> >the explanation for why we need this in the gtkdoc. > >> > >> I can move the code to an igt library. Which library would you suggest? Igt_debugfs ? > > > >Hm yeah, it's a bit the dumping ground for all things debugfs access ;-) -Daniel > >-- > >Daniel Vetter > >Software Engineer, Intel Corporation > >http://blog.ffwll.ch > >
> > >-----Original Message----- >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of Daniel Vetter >Sent: Friday, December 11, 2015 5:06 PM >To: Morton, Derek J >Cc: Daniel Vetter; intel-gfx@lists.freedesktop.org; Wood, Thomas >Subject: Re: [Intel-gfx] [PATCH i-g-t] gem_flink_race/prime_self_import: Improve test reliability > >On Fri, Dec 11, 2015 at 10:33:46AM +0000, Morton, Derek J wrote: >> > >> > >> >-----Original Message----- >> >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of >> >Daniel Vetter >> >Sent: Thursday, December 10, 2015 12:53 PM >> >To: Morton, Derek J >> >Cc: Daniel Vetter; intel-gfx@lists.freedesktop.org; Wood, Thomas >> >Subject: Re: [Intel-gfx] [PATCH i-g-t] >> >gem_flink_race/prime_self_import: Improve test reliability >> > >> >On Thu, Dec 10, 2015 at 11:51:29AM +0000, Morton, Derek J wrote: >> >> > >> >> > >> >> >-----Original Message----- >> >> >From: Daniel Vetter [mailto:daniel.vetter@ffwll.ch] On Behalf Of >> >> >Daniel Vetter >> >> >Sent: Thursday, December 10, 2015 10:13 AM >> >> >To: Morton, Derek J >> >> >Cc: intel-gfx@lists.freedesktop.org; Wood, Thomas >> >> >Subject: Re: [Intel-gfx] [PATCH i-g-t] >> >> >gem_flink_race/prime_self_import: Improve test reliability >> >> > >> >> >On Tue, Dec 08, 2015 at 12:44:44PM +0000, Derek Morton wrote: >> >> >> gem_flink_race and prime_self_import have subtests which read >> >> >> the number of open gem objects from debugfs to determine if >> >> >> objects have leaked during the test. However the test can fail >> >> >> sporadically if the number of gem objects changes due to other process activity. >> >> >> This patch introduces a change to check the number of gem >> >> >> objects several times to filter out any fluctuations. >> >> > >> >> >Why exactly does this happen? IGT tests should be run on bare >> >> >metal, with everything else killed/subdued/shutup. If there's >> >> >still things going on that create objects, we need to stop them from doing that. >> >> > >> >> >If this only applies to Android, or some special Android deamon >> >> >them imo check for that at runtime and igt_skip("your setup is >> >> >invalid, deamon %s running\n"); is the correct fix. After all just >> >> >because you sampled for a bit doesn't mean that it wont still >> >> >change right when you start running the test for real, so this is still fragile. >> >> >> >> Before running tests on android we do stop everything possible. I >> >> suspect the culprit is coreu getting automatically restarted after >> >> it is stopped. I had additional debug while developing this patch >> >> and what I saw was the system being mostly quiescent but with some >> >> very low level background activity. 1 extra object being created >> >> and then deleted occasionally. Depending on whether it occurred at >> >> the start or end of the test it was resulting in a reported leak of either 1 or -1 objects. >> >> The patch fixes that issue by taking several samples and requiring >> >> them to be the same, therefore filtering out the low level background noise. >> >> It would not help if something in the background allocated an >> >> object and kept it allocated, but I have not seen that happen. I >> >> only saw once the object count increasing for 2 consecutive reads >> >> hence the count to 4 to give a margin. The test was failing about >> >> 10%. With this patch I got 100% pass across 300 runs of each of the tests. >> > >> >Hm, piglit checks that there's no other drm clients running. Have you tried re-running that check to zero in on the culprit? >> >> We don't use piglet to run IGT tests on Android. I have had a look at >> what piglet does and added the same check to our scripts. (It reads a list of clients from /sys/kernel/debug/dri/0/clients) For CHV it shows a process called 'y', though that seems to be some issue on CHV that all driver clients are called 'y'. I checked on BXT which properly shows the process names and it looks like it is the binder process (which is handling some inter process communication). I don't think this is something we can stop. > >Nah, you definitely can't stop binder, won't have an android left after that ;-) > >But it is strange that binder owns these buffers. Binder is just IPC, but like unix domain sockets you can also throw around file descriptors. So something on your system is moving open drm fd devices still around. I don't have an idea what kind of audit/debug tooling binder offers, but there should be a way to figure out who really owns that file descriptor. >If you're lucky lsof (if android has that, otherwise walk /proc/*/fd/* symlinks manually) should help. > >Cheers, Daniel I am still suspicious that it is coreu that is the real culprit. I ran the tests Friday and did some reading of i915_gem_objects in parallel to get the full content of what it reported. I once caught coreu being listed as owning an active gem object during the test. Coreu gets stopped explicitly before tests are run but I suspect the init demon is detecting that it is stopped and is restarting it automatically. Something is reporting in logcat when coreu is stopped anyway. I will address your comments on V2 and submit another patch. //Derek > >> >> If you are concerned about the behaviour when running the test with >> >> a load of background activity I could add code to limit to the >> >> reset of the count and fail the test in that instance. That would >> >> give a benefit of distinguishing a test fail due to excessive >> >> background activity from a detected leak. >> > >> >I'm also concerned for the overhead this causes everyone else. If this really is some Android trouble then I think it'd be good to only compile this on Android. But would still be much better if you can get to a reliably clean test environment. >> >> I will make the loop part android specific. >> >> >> //Derek >> >> > >> >> I would not want to just have the test skip as that introduces a >> >> hole in our test coverage. >> >> >> >> >Also would be good to extract get_stable_obj_count to a proper igt >> >> >library function, if it indeed needs to be this tricky. And then >> >> >add the explanation for why we need this in the gtkdoc. >> >> >> >> I can move the code to an igt library. Which library would you suggest? Igt_debugfs ? >> > >> >Hm yeah, it's a bit the dumping ground for all things debugfs access >> >;-) -Daniel >> >-- >> >Daniel Vetter >> >Software Engineer, Intel Corporation >> >http://blog.ffwll.ch >> > > >-- >Daniel Vetter >Software Engineer, Intel Corporation >http://blog.ffwll.ch >
diff --git a/tests/gem_flink_race.c b/tests/gem_flink_race.c index b17ef85..0552e12 100644 --- a/tests/gem_flink_race.c +++ b/tests/gem_flink_race.c @@ -44,7 +44,7 @@ IGT_TEST_DESCRIPTION("Check for flink/open vs. gem close races."); * in the flink name and corresponding reference getting leaked. */ -/* We want lockless and I'm to lazy to dig out an atomic libarary. On x86 this +/* We want lockless and I'm to lazy to dig out an atomic library. On x86 this * works, too. */ volatile int pls_die = 0; int fd; @@ -65,6 +65,32 @@ static int get_object_count(void) return ret; } +static int get_stable_obj_count(int driver) +{ + /* The test relies on the system being in the same state before and + after the test so any difference in the object count is a result of + leaks during the test. gem_quiescent_gpu() mostly achieves this but + occasionally obj_count can still change. The loop ensures obj_count + has remained stable over several checks */ + int obj_count, prev_obj_count; + int loop_count = 0; + gem_quiescent_gpu(driver); + prev_obj_count = get_object_count(); + while (loop_count < 4) { + usleep(200000); + gem_quiescent_gpu(driver); + obj_count = get_object_count(); + if (obj_count == prev_obj_count) { + loop_count++; + } else { + igt_debug("loop_count=%d, obj_count=%d, prev_obj_count=%d\n", loop_count, obj_count, prev_obj_count); + loop_count = 0; + prev_obj_count = obj_count; + } + + } + return obj_count; +} static void *thread_fn_flink_name(void *p) { @@ -164,8 +190,7 @@ static void test_flink_close(void) * up the counts */ fake = drm_open_driver(DRIVER_INTEL); - gem_quiescent_gpu(fake); - obj_count = get_object_count(); + obj_count = get_stable_obj_count(fake); num_threads = sysconf(_SC_NPROCESSORS_ONLN); @@ -190,8 +215,7 @@ static void test_flink_close(void) close(fd); - gem_quiescent_gpu(fake); - obj_count = get_object_count() - obj_count; + obj_count = get_stable_obj_count(fake) - obj_count; igt_info("leaked %i objects\n", obj_count); diff --git a/tests/prime_self_import.c b/tests/prime_self_import.c index 91fe231..977c7b2 100644 --- a/tests/prime_self_import.c +++ b/tests/prime_self_import.c @@ -47,7 +47,7 @@ #include "drm.h" IGT_TEST_DESCRIPTION("Check whether prime import/export works on the same" - " device."); + " device... but with different fds."); #define BO_SIZE (16*1024) @@ -157,7 +157,7 @@ static void test_with_one_bo_two_files(void) dma_buf_fd2 = prime_handle_to_fd(fd2, handle_open); handle_import = prime_fd_to_handle(fd2, dma_buf_fd2); - /* dma-buf selfimporting an flink bo should give the same handle */ + /* dma-buf self importing an flink bo should give the same handle */ igt_assert_eq_u32(handle_import, handle_open); close(fd1); @@ -226,6 +226,33 @@ static int get_object_count(void) return ret; } +static int get_stable_obj_count(int driver) +{ + /* The test relies on the system being in the same state before and + after the test so any difference in the object count is a result of + leaks during the test. gem_quiescent_gpu() mostly achieves this but + occasionally obj_count can still change. The loop ensures obj_count + has remained stable over several checks */ + int obj_count, prev_obj_count; + int loop_count = 0; + gem_quiescent_gpu(driver); + prev_obj_count = get_object_count(); + while (loop_count < 4) { + usleep(200000); + gem_quiescent_gpu(driver); + obj_count = get_object_count(); + if (obj_count == prev_obj_count) { + loop_count++; + } else { + igt_debug("loop_count=%d, obj_count=%d, prev_obj_count=%d\n", loop_count, obj_count, prev_obj_count); + loop_count = 0; + prev_obj_count = obj_count; + } + + } + return obj_count; +} + static void *thread_fn_reimport_vs_close(void *p) { struct drm_gem_close close_bo; @@ -258,8 +285,7 @@ static void test_reimport_close_race(void) * up the counts */ fake = drm_open_driver(DRIVER_INTEL); - gem_quiescent_gpu(fake); - obj_count = get_object_count(); + obj_count = get_stable_obj_count(fake); num_threads = sysconf(_SC_NPROCESSORS_ONLN); @@ -290,8 +316,7 @@ static void test_reimport_close_race(void) close(fds[0]); close(fds[1]); - gem_quiescent_gpu(fake); - obj_count = get_object_count() - obj_count; + obj_count = get_stable_obj_count(fake) - obj_count; igt_info("leaked %i objects\n", obj_count); @@ -350,8 +375,7 @@ static void test_export_close_race(void) * up the counts */ fake = drm_open_driver(DRIVER_INTEL); - gem_quiescent_gpu(fake); - obj_count = get_object_count(); + obj_count = get_stable_obj_count(fake); fd = drm_open_driver(DRIVER_INTEL); @@ -373,8 +397,7 @@ static void test_export_close_race(void) close(fd); - gem_quiescent_gpu(fake); - obj_count = get_object_count() - obj_count; + obj_count = get_stable_obj_count(fake) - obj_count; igt_info("leaked %i objects\n", obj_count);
gem_flink_race and prime_self_import have subtests which read the number of open gem objects from debugfs to determine if objects have leaked during the test. However the test can fail sporadically if the number of gem objects changes due to other process activity. This patch introduces a change to check the number of gem objects several times to filter out any fluctuations. Signed-off-by: Derek Morton <derek.j.morton@intel.com> --- tests/gem_flink_race.c | 34 +++++++++++++++++++++++++++++----- tests/prime_self_import.c | 43 +++++++++++++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 15 deletions(-)