Message ID | 1311211809-5085-1-git-send-email-lmr@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 07/20/2011 10:30 PM, Lucas Meneghel Rodrigues wrote: > From: pradeep<you@example.com> Ok Pradeep, I checked out the new version of the test and made corrections to it (see changelog). Now, what I don't quite like about this test is: * There are no PASS/FAIL criteria, that is, the test never fails. This is not good. * The method of determining the drift looks strange to me. The drift monitor produces a line of drifts every second, then your code checks only the last line of it. Not sure if this is correct. * Also, when trying out the test here I found problems. Did you actually run the test until the end? I kindly ask you to test with a reduced time length (say, 15 or 30 minutes). I've adapted the test so it can use fractions of an hour instead of full hours. So please, go through the new patch: http://patchwork.test.kernel.org/patch/3570/mbox/ And give me a failure criterion and justify the drift calculation being done the way you are doing it (or fix it). Thanks, Lucas > This patch introduces a soft lockup/drift test with stress. > > 1) Boot up a VM. > 2) Build stress on host and guest. > 3) run heartbeat monitor with the given options on server and host. > 3) Run for a relatively long time length, ex: 12, 18 or 24 hours. > 4) Output the test result and observe drift. > > Changes from v2: > * Fixed up commands being used on guest, lack of proper output > redirection was confusing aexpect > * Proper clean up previous instances of the monitor programs > lying around, as well as log files > * Resort to another method of determining host IP if the same > has no fully qualified hostname (stand alone laptops, for > example) > * Only use a single session on guest to execute all the commands. > previous version was opening unneeded connections. 
> * Fix stress execution in guest and host, now the stress instances > effectively start > * Actively open guest and host firewall rules so heartbeat monitor > communication can happen > > Signed-off-by: Lucas Meneghel Rodrigues<lmr@redhat.com> > Signed-off-by: Pradeep Kumar Surisetty<psuriset@linux.vnet.ibm.com> > --- > client/tests/kvm/deps/heartbeat_slu.py | 205 ++++++++++++++++++++++++++++++++ > client/tests/kvm/tests_base.cfg.sample | 18 +++ > client/virt/tests/softlockup.py | 147 +++++++++++++++++++++++ > 3 files changed, 370 insertions(+), 0 deletions(-) > create mode 100755 client/tests/kvm/deps/heartbeat_slu.py > create mode 100644 client/virt/tests/softlockup.py > > diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py > new file mode 100755 > index 0000000..697bbbf > --- /dev/null > +++ b/client/tests/kvm/deps/heartbeat_slu.py > @@ -0,0 +1,205 @@ > +#!/usr/bin/env python > + > +""" > +Heartbeat server/client to detect soft lockups > +""" > + > +import socket, os, sys, time, getopt > + > +def daemonize(output_file): > + try: > + pid = os.fork() > + except OSError, e: > + raise Exception, "error %d: %s" % (e.strerror, e.errno) > + > + if pid: > + os._exit(0) > + > + os.umask(0) > + os.setsid() > + sys.stdout.flush() > + sys.stderr.flush() > + > + if file: > + output_handle = file(output_file, 'a+', 0) > + # autoflush stdout/stderr > + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) > + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) > + else: > + output_handle = file('/dev/null', 'a+') > + > + stdin_handle = open('/dev/null', 'r') > + os.dup2(output_handle.fileno(), sys.stdout.fileno()) > + os.dup2(output_handle.fileno(), sys.stderr.fileno()) > + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) > + > +def recv_all(sock): > + total_data = [] > + while True: > + data = sock.recv(1024) > + if not data: > + break > + total_data.append(data) > + return ''.join(total_data) > + > +def run_server(host, port, daemon, 
file, queue_size, threshold, drift): > + if daemon: > + daemonize(output_file=file) > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.bind((host, port)) > + sock.listen(queue_size) > + timeout_interval = threshold * 2 > + prev_check_timestamp = float(time.time()) > + while 1: > + c_sock, c_addr = sock.accept() > + heartbeat = recv_all(c_sock) > + local_timestamp = float(time.time()) > + drift = check_heartbeat(heartbeat, local_timestamp, threshold, check_drift) > + # NOTE: this doesn't work if the only client is the one that timed > + # out, but anything more complete would require another thread and > + # a lock for client_prev_timestamp. > + if local_timestamp - prev_check_timestamp> threshold * 2.0: > + check_for_timeouts(threshold, check_drift) > + prev_check_timestamp = local_timestamp > + if verbose: > + if check_drift: > + print "%.2f: %s (%s)" % (local_timestamp, heartbeat, drift) > + else: > + print "%.2f: %s" % (local_timestamp, heartbeat) > + > +def run_client(host, port, daemon, file, interval): > + if daemon: > + daemonize(output_file=file) > + seq = 1 > + while 1: > + try: > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.connect((host, port)) > + heartbeat = get_heartbeat(seq) > + sock.sendall(heartbeat) > + sock.close() > + if verbose: > + print heartbeat > + except socket.error, (value, message): > + print "%.2f: ERROR, %d - %s" % (float(time.time()), value, message) > + > + seq += 1 > + time.sleep(interval) > + > +def get_heartbeat(seq=1): > + return "%s %06d %.2f" % (hostname, seq, float(time.time())) > + > +def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift): > + hostname, seq, timestamp = heartbeat.rsplit() > + timestamp = float(timestamp) > + if client_prev_timestamp.has_key(hostname): > + delta = local_timestamp - client_prev_timestamp[hostname] > + if delta> threshold: > + print "%.2f: ALERT, SLU detected on host %s, delta %ds" \ > + % (float(time.time()), hostname, delta) > + > + 
client_prev_timestamp[hostname] = local_timestamp > + > + if check_drift: > + if not client_clock_offset.has_key(hostname): > + client_clock_offset[hostname] = timestamp - local_timestamp > + client_prev_drift[hostname] = 0 > + drift = timestamp - local_timestamp - client_clock_offset[hostname] > + drift_delta = drift - client_prev_drift[hostname] > + client_prev_drift[hostname] = drift > + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) > + > +def check_for_timeouts(threshold, check_drift): > + local_timestamp = float(time.time()) > + hostname_list = list(client_prev_timestamp) > + for hostname in hostname_list: > + timestamp = client_prev_timestamp[hostname] > + delta = local_timestamp - timestamp > + if delta> threshold * 2: > + print "%.2f: ALERT, SLU detected on host %s, no heartbeat for %ds" \ > + % (local_timestamp, hostname, delta) > + del client_prev_timestamp[hostname] > + if check_drift: > + del client_clock_offset[hostname] > + del client_prev_drift[hostname] > + > +def usage(): > + print """ > +Usage: > + > + heartbeat_slu.py --server --address<bind_address> --port<bind_port> > + [--file<output_file>] [--no-daemon] [--verbose] > + [--threshold<heartbeat threshold>] > + > + heartbeat_slu.py --client --address<server_address> -p<server_port> > + [--file output_file] [--no-daemon] [--verbose] > + [--interval<heartbeat interval in seconds>] > +""" > + > +# host information and global data > +hostname = socket.gethostname() > +client_prev_timestamp = {} > +client_clock_offset = {} > +client_prev_drift = {} > + > +# default param values > +host_port = 9001 > +host_address = '' > +interval = 1 # seconds between heartbeats > +threshold = 10 # seconds late till alert > +is_server = False > +is_daemon = True > +file_server = "/tmp/heartbeat_server.out" > +file_client = "/tmp/heartbeat_client.out" > +file_selected = None > +queue_size = 5 > +verbose = False > +check_drift = False > + > +# process cmdline opts > +try: > + opts, args = 
getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ > + "server", "client", "no-daemon", "address=", "port=", > + "file=", "server", "interval=", "threshold=", "verbose", > + "check-drift", "help"]) > +except getopt.GetoptError, e: > + print "error: %s" % str(e) > + usage() > + exit(1) > + > +for param, value in opts: > + if param in ["-p", "--port"]: > + host_port = int(value) > + elif param in ["-a", "--address"]: > + host_address = value > + elif param in ["-s", "--server"]: > + is_server = True > + elif param in ["-c", "--client"]: > + is_server = False > + elif param in ["--no-daemon"]: > + is_daemon = False > + elif param in ["-f", "--file"]: > + file_selected = value > + elif param in ["-i", "--interval"]: > + interval = int(value) > + elif param in ["-t", "--threshold"]: > + threshold = int(value) > + elif param in ["-d", "--check-drift"]: > + check_drift = True > + elif param in ["-v", "--verbose"]: > + verbose = True > + elif param in ["-h", "--help"]: > + usage() > + exit(0) > + else: > + print "error: unrecognized option: %s" % value > + usage() > + exit(1) > + > +# run until we're terminated > +if is_server: > + file_server = file_selected or file_server > + run_server(host_address, host_port, is_daemon, file_server, queue_size, threshold, check_drift) > +else: > + file_client = file_selected or file_client > + run_client(host_address, host_port, is_daemon, file_client, interval) > diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample > index 65880d8..e9e41f9 100644 > --- a/client/tests/kvm/tests_base.cfg.sample > +++ b/client/tests/kvm/tests_base.cfg.sample > @@ -420,6 +420,24 @@ variants: > type = smbios_table > start_vm = no > > + - softlockup: install setup unattended_install.cdrom > + only Linux > + type = softlockup > + softlockup_files = stress-1.0.4.tar.gz > + stress_setup_cmd = "cd %s&& tar xvf stress-1.0.4.tar.gz&& cd stress-1.0.4&& ./configure&& make&& cd src" > + server_setup_cmd = "%s/heartbeat_slu.py 
--server --threshold %s --file %s --port %s --verbose --check-drift" > + client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1" > + stress_cmd = "cd %s&& cd stress-1.0.4&& cd src&& nohup ./stress -c %s> /dev/null 2>&1&" > + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk '{print$2}' | xargs kill -9> /dev/null 2>&1" > + kill_stress_cmd = "pkill -f stress> /dev/null 2>&1" > + drift_cmd = "tail -1 %s | awk '{print $7}'" > + monitor_log_file_server = /tmp/heartbeat_server.log > + monitor_log_file_client = /tmp/heartbeat_client.log > + monitor_port = 13330 > + stress_threshold = 10 > + # time_to_run (hours) = 12, 18, 24, 48 hours > + test_length = 0.10 > + > - stress_boot: install setup image_copy unattended_install.cdrom > type = stress_boot > max_vms = 5 > diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py > new file mode 100644 > index 0000000..d946965 > --- /dev/null > +++ b/client/virt/tests/softlockup.py > @@ -0,0 +1,147 @@ > +import logging, os, socket, time > +from autotest_lib.client.bin import utils > + > + > +def run_softlockup(test, params, env): > + """ > + soft lockup/drift test with stress. > + > + 1) Boot up a VM. > + 2) Build stress on host and guest. > + 3) run heartbeat with the given options on server and host. > + 3) Run for a relatively long time length. ex: 12, 18 or 24 hours. > + 4) Output the test result and observe drift. > + > + @param test: KVM test object. > + @param params: Dictionary with the test parameters. > + @param env: Dictionary with test environment. 
> + """ > + stress_setup_cmd = params.get("stress_setup_cmd") > + stress_cmd = params.get("stress_cmd") > + server_setup_cmd = params.get("server_setup_cmd") > + drift_cmd = params.get("drift_cmd") > + kill_stress_cmd = params.get("kill_stress_cmd") > + kill_monitor_cmd = params.get("kill_monitor_cmd") > + > + threshold = int(params.get("stress_threshold")) > + monitor_log_file_server = params.get("monitor_log_file_server") > + monitor_log_file_client = params.get("monitor_log_file_client") > + test_length = int(3600 * float(params.get("test_length"))) > + monitor_port = int(params.get("monitor_port")) > + > + vm = env.get_vm(params["main_vm"]) > + login_timeout = int(params.get("login_timeout", 360)) > + stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress") > + monitor_dir = os.path.join(test.bindir, 'deps') > + > + > + def _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd): > + logging.info("Kill stress and monitor on guest") > + try: > + session.cmd(kill_stress_cmd) > + except: > + pass > + try: > + session.cmd(kill_monitor_cmd) > + except: > + pass > + > + > + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): > + logging.info("Kill stress and monitor on host") > + utils.run(kill_stress_cmd, ignore_status=True) > + utils.run(kill_monitor_cmd, ignore_status=True) > + > + > + def host(): > + logging.info("Setup monitor server on host") > + # Kill previous instances of the host load programs, if any > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + # Cleanup previous log instances > + if os.path.isfile(monitor_log_file_server): > + os.remove(monitor_log_file_server) > + # Opening firewall ports on host > + utils.run("iptables -F", ignore_status=True) > + > + # Run heartbeat on host > + utils.run(server_setup_cmd % (monitor_dir, threshold, > + monitor_log_file_server, monitor_port)) > + > + logging.info("Build stress on host") > + # Uncompress and build stress on host > + utils.run(stress_setup_cmd % stress_dir) > + > 
+ logging.info("Run stress on host") > + # stress_threads = 2 * n_cpus > + threads_host = 2 * utils.count_cpus() > + # Run stress test on host > + utils.run(stress_cmd % (stress_dir, threads_host)) > + > + > + def guest(): > + try: > + host_ip = socket.gethostbyname(socket.gethostname()) > + except socket.error: > + try: > + # Hackish, but works well on stand alone (laptop) setups > + # with access to the internet. If this fails, well, then > + # not much else can be done... > + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) > + s.connect(("redhat.com", 80)) > + host_ip = s.getsockname()[0] > + except socket.error, (value, e): > + raise error.TestError("Could not determine host IP: %d %s" % > + (value, e)) > + > + # Now, starting the guest > + vm.verify_alive() > + session = vm.wait_for_login(timeout=login_timeout) > + > + # Kill previous instances of the load programs, if any > + _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd) > + # Clean up previous log instances > + session.cmd("rm -f %s" % monitor_log_file_client) > + > + # Opening firewall ports on guest > + try: > + session.cmd("iptables -F") > + except: > + pass > + > + # Get required files and copy them from host to guest > + monitor_path = os.path.join(test.bindir, 'deps', 'heartbeat_slu.py') > + stress_path = os.path.join(os.environ['AUTODIR'], "tests", "stress", > + "stress-1.0.4.tar.gz") > + vm.copy_files_to(monitor_path, "/tmp") > + vm.copy_files_to(stress_path, "/tmp") > + > + logging.info("Setup monitor client on guest") > + # Start heartbeat on guest > + session.cmd(params.get("client_setup_cmd") % > + ("/tmp", monitor_log_file_client, host_ip, monitor_port)) > + > + logging.info("Build stress on guest") > + # Uncompress and build stress on guest > + session.cmd(stress_setup_cmd % "/tmp", timeout=200) > + > + logging.info("Run stress on guest") > + # stress_threads = 2 * n_vcpus > + threads_guest = 2 * int(params.get("smp", 1)) > + # Run stress test on guest > + 
session.cmd(stress_cmd % ("/tmp", threads_guest)) > + > + # Wait and report > + logging.debug("Wait for %d s", test_length) > + time.sleep(test_length) > + > + # Kill instances of the load programs on both guest and host > + _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd) > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + > + # Collect drift > + drift = utils.system_output(drift_cmd % monitor_log_file_server) > + logging.info("Drift noticed: %s", drift) > + > + > + host() > + guest() -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 20 Jul 2011 22:30:09 -0300 Lucas Meneghel Rodrigues <lmr@redhat.com> wrote: > From: pradeep <you@example.com> > > This patch introduces a soft lockup/drift test with stress. > > 1) Boot up a VM. > 2) Build stress on host and guest. > 3) run heartbeat monitor with the given options on server and > host. 3) Run for a relatively long time length, ex: 12, 18 or 24 > hours. 4) Output the test result and observe drift. Thanks for making changes. How about taking average of last 10 drift values? > > Changes from v2: > * Fixed up commands being used on guest, lack of proper output > redirection was confusing aexpect > * Proper clean up previous instances of the monitor programs > lying around, as well as log files > * Resort to another method of determining host IP if the same > has no fully qualified hostname (stand alone laptops, for > example) > * Only use a single session on guest to execute all the commands. > previous version was opening unneeded connections. > * Fix stress execution in guest and host, now the stress instances > effectively start > * Actively open guest and host firewall rules so heartbeat monitor > communication can happen > > Signed-off-by: Lucas Meneghel Rodrigues <lmr@redhat.com> > Signed-off-by: Pradeep Kumar Surisetty <psuriset@linux.vnet.ibm.com> > --- > client/tests/kvm/deps/heartbeat_slu.py | 205 > ++++++++++++++++++++++++++++++++ > client/tests/kvm/tests_base.cfg.sample | 18 +++ > client/virt/tests/softlockup.py | 147 +++++++++++++++++++++++ > 3 files changed, 370 insertions(+), 0 deletions(-) create mode 100755 > client/tests/kvm/deps/heartbeat_slu.py create mode 100644 > client/virt/tests/softlockup.py > > diff --git a/client/tests/kvm/deps/heartbeat_slu.py > b/client/tests/kvm/deps/heartbeat_slu.py new file mode 100755 > index 0000000..697bbbf > --- /dev/null > +++ b/client/tests/kvm/deps/heartbeat_slu.py > @@ -0,0 +1,205 @@ > +#!/usr/bin/env python > + > +""" > +Heartbeat server/client to detect soft lockups > +""" > + > +import 
socket, os, sys, time, getopt > + > +def daemonize(output_file): > + try: > + pid = os.fork() > + except OSError, e: > + raise Exception, "error %d: %s" % (e.strerror, e.errno) > + > + if pid: > + os._exit(0) > + > + os.umask(0) > + os.setsid() > + sys.stdout.flush() > + sys.stderr.flush() > + > + if file: > + output_handle = file(output_file, 'a+', 0) > + # autoflush stdout/stderr > + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) > + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) > + else: > + output_handle = file('/dev/null', 'a+') > + > + stdin_handle = open('/dev/null', 'r') > + os.dup2(output_handle.fileno(), sys.stdout.fileno()) > + os.dup2(output_handle.fileno(), sys.stderr.fileno()) > + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) > + > +def recv_all(sock): > + total_data = [] > + while True: > + data = sock.recv(1024) > + if not data: > + break > + total_data.append(data) > + return ''.join(total_data) > + > +def run_server(host, port, daemon, file, queue_size, threshold, > drift): > + if daemon: > + daemonize(output_file=file) > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.bind((host, port)) > + sock.listen(queue_size) > + timeout_interval = threshold * 2 > + prev_check_timestamp = float(time.time()) > + while 1: > + c_sock, c_addr = sock.accept() > + heartbeat = recv_all(c_sock) > + local_timestamp = float(time.time()) > + drift = check_heartbeat(heartbeat, local_timestamp, > threshold, check_drift) > + # NOTE: this doesn't work if the only client is the one that > timed > + # out, but anything more complete would require another > thread and > + # a lock for client_prev_timestamp. 
> + if local_timestamp - prev_check_timestamp > threshold * 2.0: > + check_for_timeouts(threshold, check_drift) > + prev_check_timestamp = local_timestamp > + if verbose: > + if check_drift: > + print "%.2f: %s (%s)" % (local_timestamp, heartbeat, > drift) > + else: > + print "%.2f: %s" % (local_timestamp, heartbeat) > + > +def run_client(host, port, daemon, file, interval): > + if daemon: > + daemonize(output_file=file) > + seq = 1 > + while 1: > + try: > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.connect((host, port)) > + heartbeat = get_heartbeat(seq) > + sock.sendall(heartbeat) > + sock.close() > + if verbose: > + print heartbeat > + except socket.error, (value, message): > + print "%.2f: ERROR, %d - %s" % (float(time.time()), > value, message) + > + seq += 1 > + time.sleep(interval) > + > +def get_heartbeat(seq=1): > + return "%s %06d %.2f" % (hostname, seq, float(time.time())) > + > +def check_heartbeat(heartbeat, local_timestamp, threshold, > check_drift): > + hostname, seq, timestamp = heartbeat.rsplit() > + timestamp = float(timestamp) > + if client_prev_timestamp.has_key(hostname): > + delta = local_timestamp - client_prev_timestamp[hostname] > + if delta > threshold: > + print "%.2f: ALERT, SLU detected on host %s, delta %ds" \ > + % (float(time.time()), hostname, delta) > + > + client_prev_timestamp[hostname] = local_timestamp > + > + if check_drift: > + if not client_clock_offset.has_key(hostname): > + client_clock_offset[hostname] = timestamp - > local_timestamp > + client_prev_drift[hostname] = 0 > + drift = timestamp - local_timestamp - > client_clock_offset[hostname] > + drift_delta = drift - client_prev_drift[hostname] > + client_prev_drift[hostname] = drift > + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) > + > +def check_for_timeouts(threshold, check_drift): > + local_timestamp = float(time.time()) > + hostname_list = list(client_prev_timestamp) > + for hostname in hostname_list: > + timestamp = 
client_prev_timestamp[hostname] > + delta = local_timestamp - timestamp > + if delta > threshold * 2: > + print "%.2f: ALERT, SLU detected on host %s, no > heartbeat for %ds" \ > + % (local_timestamp, hostname, delta) > + del client_prev_timestamp[hostname] > + if check_drift: > + del client_clock_offset[hostname] > + del client_prev_drift[hostname] > + > +def usage(): > + print """ > +Usage: > + > + heartbeat_slu.py --server --address <bind_address> --port > <bind_port> > + [--file <output_file>] [--no-daemon] [--verbose] > + [--threshold <heartbeat threshold>] > + > + heartbeat_slu.py --client --address <server_address> -p > <server_port> > + [--file output_file] [--no-daemon] [--verbose] > + [--interval <heartbeat interval in seconds>] > +""" > + > +# host information and global data > +hostname = socket.gethostname() > +client_prev_timestamp = {} > +client_clock_offset = {} > +client_prev_drift = {} > + > +# default param values > +host_port = 9001 > +host_address = '' > +interval = 1 # seconds between heartbeats > +threshold = 10 # seconds late till alert > +is_server = False > +is_daemon = True > +file_server = "/tmp/heartbeat_server.out" > +file_client = "/tmp/heartbeat_client.out" > +file_selected = None > +queue_size = 5 > +verbose = False > +check_drift = False > + > +# process cmdline opts > +try: > + opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ > + "server", "client", "no-daemon", "address=", > "port=", > + "file=", "server", "interval=", "threshold=", > "verbose", > + "check-drift", "help"]) > +except getopt.GetoptError, e: > + print "error: %s" % str(e) > + usage() > + exit(1) > + > +for param, value in opts: > + if param in ["-p", "--port"]: > + host_port = int(value) > + elif param in ["-a", "--address"]: > + host_address = value > + elif param in ["-s", "--server"]: > + is_server = True > + elif param in ["-c", "--client"]: > + is_server = False > + elif param in ["--no-daemon"]: > + is_daemon = False > + elif param in ["-f", 
"--file"]: > + file_selected = value > + elif param in ["-i", "--interval"]: > + interval = int(value) > + elif param in ["-t", "--threshold"]: > + threshold = int(value) > + elif param in ["-d", "--check-drift"]: > + check_drift = True > + elif param in ["-v", "--verbose"]: > + verbose = True > + elif param in ["-h", "--help"]: > + usage() > + exit(0) > + else: > + print "error: unrecognized option: %s" % value > + usage() > + exit(1) > + > +# run until we're terminated > +if is_server: > + file_server = file_selected or file_server > + run_server(host_address, host_port, is_daemon, file_server, > queue_size, threshold, check_drift) +else: > + file_client = file_selected or file_client > + run_client(host_address, host_port, is_daemon, file_client, > interval) diff --git a/client/tests/kvm/tests_base.cfg.sample > b/client/tests/kvm/tests_base.cfg.sample index 65880d8..e9e41f9 100644 > --- a/client/tests/kvm/tests_base.cfg.sample > +++ b/client/tests/kvm/tests_base.cfg.sample > @@ -420,6 +420,24 @@ variants: > type = smbios_table > start_vm = no > > + - softlockup: install setup unattended_install.cdrom > + only Linux > + type = softlockup > + softlockup_files = stress-1.0.4.tar.gz > + stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && > cd stress-1.0.4 && ./configure && make && cd src" > + server_setup_cmd = "%s/heartbeat_slu.py --server --threshold > %s --file %s --port %s --verbose --check-drift" > + client_setup_cmd = "%s/heartbeat_slu.py --client --address > %s --file %s --port %s --interval 1" > + stress_cmd = "cd %s && cd stress-1.0.4 && cd src && > nohup ./stress -c %s > /dev/null 2>&1&" > + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk > '{print$2}' | xargs kill -9 > /dev/null 2>&1" > + kill_stress_cmd = "pkill -f stress > /dev/null 2>&1" > + drift_cmd = "tail -1 %s | awk '{print $7}'" > + monitor_log_file_server = /tmp/heartbeat_server.log > + monitor_log_file_client = /tmp/heartbeat_client.log > + monitor_port = 13330 > + 
stress_threshold = 10 > + # time_to_run (hours) = 12, 18, 24, 48 hours > + test_length = 0.10 > + > - stress_boot: install setup image_copy unattended_install.cdrom > type = stress_boot > max_vms = 5 > diff --git a/client/virt/tests/softlockup.py > b/client/virt/tests/softlockup.py new file mode 100644 > index 0000000..d946965 > --- /dev/null > +++ b/client/virt/tests/softlockup.py > @@ -0,0 +1,147 @@ > +import logging, os, socket, time > +from autotest_lib.client.bin import utils > + > + > +def run_softlockup(test, params, env): > + """ > + soft lockup/drift test with stress. > + > + 1) Boot up a VM. > + 2) Build stress on host and guest. > + 3) run heartbeat with the given options on server and host. > + 3) Run for a relatively long time length. ex: 12, 18 or 24 hours. > + 4) Output the test result and observe drift. > + > + @param test: KVM test object. > + @param params: Dictionary with the test parameters. > + @param env: Dictionary with test environment. > + """ > + stress_setup_cmd = params.get("stress_setup_cmd") > + stress_cmd = params.get("stress_cmd") > + server_setup_cmd = params.get("server_setup_cmd") > + drift_cmd = params.get("drift_cmd") > + kill_stress_cmd = params.get("kill_stress_cmd") > + kill_monitor_cmd = params.get("kill_monitor_cmd") > + > + threshold = int(params.get("stress_threshold")) > + monitor_log_file_server = params.get("monitor_log_file_server") > + monitor_log_file_client = params.get("monitor_log_file_client") > + test_length = int(3600 * float(params.get("test_length"))) > + monitor_port = int(params.get("monitor_port")) > + > + vm = env.get_vm(params["main_vm"]) > + login_timeout = int(params.get("login_timeout", 360)) > + stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress") > + monitor_dir = os.path.join(test.bindir, 'deps') > + > + > + def _kill_guest_programs(session, kill_stress_cmd, > kill_monitor_cmd): > + logging.info("Kill stress and monitor on guest") > + try: > + session.cmd(kill_stress_cmd) > + except: > 
+ pass > + try: > + session.cmd(kill_monitor_cmd) > + except: > + pass > + > + > + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): > + logging.info("Kill stress and monitor on host") > + utils.run(kill_stress_cmd, ignore_status=True) > + utils.run(kill_monitor_cmd, ignore_status=True) > + > + > + def host(): > + logging.info("Setup monitor server on host") > + # Kill previous instances of the host load programs, if any > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + # Cleanup previous log instances > + if os.path.isfile(monitor_log_file_server): > + os.remove(monitor_log_file_server) > + # Opening firewall ports on host > + utils.run("iptables -F", ignore_status=True) > + > + # Run heartbeat on host > + utils.run(server_setup_cmd % (monitor_dir, threshold, > + monitor_log_file_server, > monitor_port)) + > + logging.info("Build stress on host") > + # Uncompress and build stress on host > + utils.run(stress_setup_cmd % stress_dir) > + > + logging.info("Run stress on host") > + # stress_threads = 2 * n_cpus > + threads_host = 2 * utils.count_cpus() > + # Run stress test on host > + utils.run(stress_cmd % (stress_dir, threads_host)) > + > + > + def guest(): > + try: > + host_ip = socket.gethostbyname(socket.gethostname()) > + except socket.error: > + try: > + # Hackish, but works well on stand alone (laptop) > setups > + # with access to the internet. If this fails, well, > then > + # not much else can be done... 
> + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) > + s.connect(("redhat.com", 80)) > + host_ip = s.getsockname()[0] > + except socket.error, (value, e): > + raise error.TestError("Could not determine host IP: > %d %s" % > + (value, e)) > + > + # Now, starting the guest > + vm.verify_alive() > + session = vm.wait_for_login(timeout=login_timeout) > + > + # Kill previous instances of the load programs, if any > + _kill_guest_programs(session, kill_stress_cmd, > kill_monitor_cmd) > + # Clean up previous log instances > + session.cmd("rm -f %s" % monitor_log_file_client) > + > + # Opening firewall ports on guest > + try: > + session.cmd("iptables -F") > + except: > + pass > + > + # Get required files and copy them from host to guest > + monitor_path = os.path.join(test.bindir, 'deps', > 'heartbeat_slu.py') > + stress_path = os.path.join(os.environ['AUTODIR'], "tests", > "stress", > + "stress-1.0.4.tar.gz") > + vm.copy_files_to(monitor_path, "/tmp") > + vm.copy_files_to(stress_path, "/tmp") > + > + logging.info("Setup monitor client on guest") > + # Start heartbeat on guest > + session.cmd(params.get("client_setup_cmd") % > + ("/tmp", monitor_log_file_client, host_ip, > monitor_port)) + in tests_base.cfg client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1" in softlockup.py session.cmd(params.get("client_setup_cmd") % ("/tmp", monitor_log_file_client, host_ip, monitor_port)) address, file picking up options interchangeably. 
> + logging.info("Build stress on guest") > + # Uncompress and build stress on guest > + session.cmd(stress_setup_cmd % "/tmp", timeout=200) > + > + logging.info("Run stress on guest") > + # stress_threads = 2 * n_vcpus > + threads_guest = 2 * int(params.get("smp", 1)) > + # Run stress test on guest > + session.cmd(stress_cmd % ("/tmp", threads_guest)) > + > + # Wait and report > + logging.debug("Wait for %d s", test_length) > + time.sleep(test_length) > + > + # Kill instances of the load programs on both guest and host > + _kill_guest_programs(session, kill_stress_cmd, > kill_monitor_cmd) > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + > + # Collect drift > + drift = utils.system_output(drift_cmd % > monitor_log_file_server) > + logging.info("Drift noticed: %s", drift) > + > + > + host() > + guest() -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 12 Aug 2011 12:37:15 +0530 pradeep <psuriset@linux.vnet.ibm.com> wrote: > On Wed, 20 Jul 2011 22:30:09 -0300 > Lucas Meneghel Rodrigues <lmr@redhat.com> wrote: > > > From: pradeep <you@example.com> > > > > This patch introduces a soft lockup/drift test with stress. > > > > 1) Boot up a VM. > > 2) Build stress on host and guest. > > 3) run heartbeat monitor with the given options on server and > > host. 3) Run for a relatively long time length, ex: 12, 18 or 24 > > hours. 4) Output the test result and observe drift. > > Thanks for making changes. > How about taking average of last 10 drift values? I observed the values below for my softlockup test. The drift values are more or less similar (+0.01, +0.02). There wouldn't be much difference between the last value and the average of the last 10 either. For stress & performance kinds of tests, why do we need a PASS/FAIL? We only care about the drift value here. 1313148260.65: localhost.localdomain 000417 1313148259.45 (drift +0.01 (-0.00)) 1313148261.65: localhost.localdomain 000418 1313148260.46 (drift +0.02 (+0.01)) 1313148262.65: localhost.localdomain 000419 1313148261.46 (drift +0.02 (-0.00)) 1313148263.66: localhost.localdomain 000420 1313148262.46 (drift +0.02 (-0.00)) 1313148264.66: localhost.localdomain 000421 1313148263.46 (drift +0.01 (-0.00)) 1313148265.76: localhost.localdomain 000422 1313148264.56 (drift +0.01 (-0.00)) 1313148266.76: localhost.localdomain 000423 1313148265.56 (drift +0.01 (-0.00)) 1313148267.76: localhost.localdomain 000424 1313148266.57 (drift +0.02 (+0.01)) 1313148268.76: localhost.localdomain 000425 1313148267.57 (drift +0.02 (-0.00)) 1313148269.77: localhost.localdomain 000426 1313148268.57 (drift +0.02 (-0.00)) 1313148270.87: localhost.localdomain 000427 1313148269.67 (drift +0.01 (-0.00)) 1313148271.87: localhost.localdomain 000428 1313148270.68 (drift +0.02 (+0.01)) 1313148272.87: localhost.localdomain 000429 1313148271.68 (drift +0.02 (-0.00)) 1313148273.88: localhost.localdomain 000430 1313148272.68 
(drift +0.02 (-0.00)) 1313148274.88: localhost.localdomain 000431 1313148273.68 (drift +0.01 (-0.00)) 1313148275.97: localhost.localdomain 000432 1313148274.78 (drift +0.02 (+0.01)) 1313148276.97: localhost.localdomain 000433 1313148275.78 (drift +0.02 (-0.00)) 1313148277.98: localhost.localdomain 000434 1313148276.78 (drift +0.02 (-0.00)) 1313148278.98: localhost.localdomain 000435 1313148277.78 (drift +0.01 (-0.00)) 1313148279.98: localhost.localdomain 000436 1313148278.78 (drift +0.01 (-0.00)) 1313148281.08: localhost.localdomain 000437 1313148279.89 (drift +0.02 (+0.01)) 1313148282.09: localhost.localdomain 000438 1313148280.89 (drift +0.02 (-0.00)) 1313148283.09: localhost.localdomain 000439 1313148281.89 (drift +0.01 (-0.00)) > > > > > Changes from v2: > > * Fixed up commands being used on guest, lack of proper output > > redirection was confusing aexpect > > * Proper clean up previous instances of the monitor programs > > lying around, as well as log files > > * Resort to another method of determining host IP if the same > > has no fully qualified hostname (stand alone laptops, for > > example) > > * Only use a single session on guest to execute all the commands. > > previous version was opening unneeded connections. 
> > * Fix stress execution in guest and host, now the stress instances > > effectively start > > * Actively open guest and host firewall rules so heartbeat monitor > > communication can happen > > > > Signed-off-by: Lucas Meneghel Rodrigues <lmr@redhat.com> > > Signed-off-by: Pradeep Kumar Surisetty <psuriset@linux.vnet.ibm.com> > > --- > > client/tests/kvm/deps/heartbeat_slu.py | 205 > > ++++++++++++++++++++++++++++++++ > > client/tests/kvm/tests_base.cfg.sample | 18 +++ > > client/virt/tests/softlockup.py | 147 > > +++++++++++++++++++++++ 3 files changed, 370 insertions(+), 0 > > deletions(-) create mode 100755 > > client/tests/kvm/deps/heartbeat_slu.py create mode 100644 > > client/virt/tests/softlockup.py > > > > diff --git a/client/tests/kvm/deps/heartbeat_slu.py > > b/client/tests/kvm/deps/heartbeat_slu.py new file mode 100755 > > index 0000000..697bbbf > > --- /dev/null > > +++ b/client/tests/kvm/deps/heartbeat_slu.py > > @@ -0,0 +1,205 @@ > > +#!/usr/bin/env python > > + > > +""" > > +Heartbeat server/client to detect soft lockups > > +""" > > + > > +import socket, os, sys, time, getopt > > + > > +def daemonize(output_file): > > + try: > > + pid = os.fork() > > + except OSError, e: > > + raise Exception, "error %d: %s" % (e.strerror, e.errno) > > + > > + if pid: > > + os._exit(0) > > + > > + os.umask(0) > > + os.setsid() > > + sys.stdout.flush() > > + sys.stderr.flush() > > + > > + if file: > > + output_handle = file(output_file, 'a+', 0) > > + # autoflush stdout/stderr > > + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) > > + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) > > + else: > > + output_handle = file('/dev/null', 'a+') > > + > > + stdin_handle = open('/dev/null', 'r') > > + os.dup2(output_handle.fileno(), sys.stdout.fileno()) > > + os.dup2(output_handle.fileno(), sys.stderr.fileno()) > > + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) > > + > > +def recv_all(sock): > > + total_data = [] > > + while True: > > + data = 
sock.recv(1024) > > + if not data: > > + break > > + total_data.append(data) > > + return ''.join(total_data) > > + > > +def run_server(host, port, daemon, file, queue_size, threshold, > > drift): > > + if daemon: > > + daemonize(output_file=file) > > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > > + sock.bind((host, port)) > > + sock.listen(queue_size) > > + timeout_interval = threshold * 2 > > + prev_check_timestamp = float(time.time()) > > + while 1: > > + c_sock, c_addr = sock.accept() > > + heartbeat = recv_all(c_sock) > > + local_timestamp = float(time.time()) > > + drift = check_heartbeat(heartbeat, local_timestamp, > > threshold, check_drift) > > + # NOTE: this doesn't work if the only client is the one > > that timed > > + # out, but anything more complete would require another > > thread and > > + # a lock for client_prev_timestamp. > > + if local_timestamp - prev_check_timestamp > threshold * > > 2.0: > > + check_for_timeouts(threshold, check_drift) > > + prev_check_timestamp = local_timestamp > > + if verbose: > > + if check_drift: > > + print "%.2f: %s (%s)" % (local_timestamp, > > heartbeat, drift) > > + else: > > + print "%.2f: %s" % (local_timestamp, heartbeat) > > + > > +def run_client(host, port, daemon, file, interval): > > + if daemon: > > + daemonize(output_file=file) > > + seq = 1 > > + while 1: > > + try: > > + sock = socket.socket(socket.AF_INET, > > socket.SOCK_STREAM) > > + sock.connect((host, port)) > > + heartbeat = get_heartbeat(seq) > > + sock.sendall(heartbeat) > > + sock.close() > > + if verbose: > > + print heartbeat > > + except socket.error, (value, message): > > + print "%.2f: ERROR, %d - %s" % (float(time.time()), > > value, message) + > > + seq += 1 > > + time.sleep(interval) > > + > > +def get_heartbeat(seq=1): > > + return "%s %06d %.2f" % (hostname, seq, float(time.time())) > > + > > +def check_heartbeat(heartbeat, local_timestamp, threshold, > > check_drift): > > + hostname, seq, timestamp = 
heartbeat.rsplit() > > + timestamp = float(timestamp) > > + if client_prev_timestamp.has_key(hostname): > > + delta = local_timestamp - client_prev_timestamp[hostname] > > + if delta > threshold: > > + print "%.2f: ALERT, SLU detected on host %s, delta > > %ds" \ > > + % (float(time.time()), hostname, delta) > > + > > + client_prev_timestamp[hostname] = local_timestamp > > + > > + if check_drift: > > + if not client_clock_offset.has_key(hostname): > > + client_clock_offset[hostname] = timestamp - > > local_timestamp > > + client_prev_drift[hostname] = 0 > > + drift = timestamp - local_timestamp - > > client_clock_offset[hostname] > > + drift_delta = drift - client_prev_drift[hostname] > > + client_prev_drift[hostname] = drift > > + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) > > + > > +def check_for_timeouts(threshold, check_drift): > > + local_timestamp = float(time.time()) > > + hostname_list = list(client_prev_timestamp) > > + for hostname in hostname_list: > > + timestamp = client_prev_timestamp[hostname] > > + delta = local_timestamp - timestamp > > + if delta > threshold * 2: > > + print "%.2f: ALERT, SLU detected on host %s, no > > heartbeat for %ds" \ > > + % (local_timestamp, hostname, delta) > > + del client_prev_timestamp[hostname] > > + if check_drift: > > + del client_clock_offset[hostname] > > + del client_prev_drift[hostname] > > + > > +def usage(): > > + print """ > > +Usage: > > + > > + heartbeat_slu.py --server --address <bind_address> --port > > <bind_port> > > + [--file <output_file>] [--no-daemon] > > [--verbose] > > + [--threshold <heartbeat threshold>] > > + > > + heartbeat_slu.py --client --address <server_address> -p > > <server_port> > > + [--file output_file] [--no-daemon] [--verbose] > > + [--interval <heartbeat interval in seconds>] > > +""" > > + > > +# host information and global data > > +hostname = socket.gethostname() > > +client_prev_timestamp = {} > > +client_clock_offset = {} > > +client_prev_drift = {} > > + > > +# 
default param values > > +host_port = 9001 > > +host_address = '' > > +interval = 1 # seconds between heartbeats > > +threshold = 10 # seconds late till alert > > +is_server = False > > +is_daemon = True > > +file_server = "/tmp/heartbeat_server.out" > > +file_client = "/tmp/heartbeat_client.out" > > +file_selected = None > > +queue_size = 5 > > +verbose = False > > +check_drift = False > > + > > +# process cmdline opts > > +try: > > + opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ > > + "server", "client", "no-daemon", "address=", > > "port=", > > + "file=", "server", "interval=", "threshold=", > > "verbose", > > + "check-drift", "help"]) > > +except getopt.GetoptError, e: > > + print "error: %s" % str(e) > > + usage() > > + exit(1) > > + > > +for param, value in opts: > > + if param in ["-p", "--port"]: > > + host_port = int(value) > > + elif param in ["-a", "--address"]: > > + host_address = value > > + elif param in ["-s", "--server"]: > > + is_server = True > > + elif param in ["-c", "--client"]: > > + is_server = False > > + elif param in ["--no-daemon"]: > > + is_daemon = False > > + elif param in ["-f", "--file"]: > > + file_selected = value > > + elif param in ["-i", "--interval"]: > > + interval = int(value) > > + elif param in ["-t", "--threshold"]: > > + threshold = int(value) > > + elif param in ["-d", "--check-drift"]: > > + check_drift = True > > + elif param in ["-v", "--verbose"]: > > + verbose = True > > + elif param in ["-h", "--help"]: > > + usage() > > + exit(0) > > + else: > > + print "error: unrecognized option: %s" % value > > + usage() > > + exit(1) > > + > > +# run until we're terminated > > +if is_server: > > + file_server = file_selected or file_server > > + run_server(host_address, host_port, is_daemon, file_server, > > queue_size, threshold, check_drift) +else: > > + file_client = file_selected or file_client > > + run_client(host_address, host_port, is_daemon, file_client, > > interval) diff --git 
a/client/tests/kvm/tests_base.cfg.sample > > b/client/tests/kvm/tests_base.cfg.sample index 65880d8..e9e41f9 > > 100644 --- a/client/tests/kvm/tests_base.cfg.sample > > +++ b/client/tests/kvm/tests_base.cfg.sample > > @@ -420,6 +420,24 @@ variants: > > type = smbios_table > > start_vm = no > > > > + - softlockup: install setup unattended_install.cdrom > > + only Linux > > + type = softlockup > > + softlockup_files = stress-1.0.4.tar.gz > > + stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && > > cd stress-1.0.4 && ./configure && make && cd src" > > + server_setup_cmd = "%s/heartbeat_slu.py --server > > --threshold %s --file %s --port %s --verbose --check-drift" > > + client_setup_cmd = "%s/heartbeat_slu.py --client --address > > %s --file %s --port %s --interval 1" > > + stress_cmd = "cd %s && cd stress-1.0.4 && cd src && > > nohup ./stress -c %s > /dev/null 2>&1&" > > + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | > > awk '{print$2}' | xargs kill -9 > /dev/null 2>&1" > > + kill_stress_cmd = "pkill -f stress > /dev/null 2>&1" > > + drift_cmd = "tail -1 %s | awk '{print $7}'" > > + monitor_log_file_server = /tmp/heartbeat_server.log > > + monitor_log_file_client = /tmp/heartbeat_client.log > > + monitor_port = 13330 > > + stress_threshold = 10 > > + # time_to_run (hours) = 12, 18, 24, 48 hours > > + test_length = 0.10 > > + > > - stress_boot: install setup image_copy > > unattended_install.cdrom type = stress_boot > > max_vms = 5 > > diff --git a/client/virt/tests/softlockup.py > > b/client/virt/tests/softlockup.py new file mode 100644 > > index 0000000..d946965 > > --- /dev/null > > +++ b/client/virt/tests/softlockup.py > > @@ -0,0 +1,147 @@ > > +import logging, os, socket, time > > +from autotest_lib.client.bin import utils > > + > > + > > +def run_softlockup(test, params, env): > > + """ > > + soft lockup/drift test with stress. > > + > > + 1) Boot up a VM. > > + 2) Build stress on host and guest. 
> > + 3) run heartbeat with the given options on server and host. > > + 3) Run for a relatively long time length. ex: 12, 18 or 24 > > hours. > > + 4) Output the test result and observe drift. > > + > > + @param test: KVM test object. > > + @param params: Dictionary with the test parameters. > > + @param env: Dictionary with test environment. > > + """ > > + stress_setup_cmd = params.get("stress_setup_cmd") > > + stress_cmd = params.get("stress_cmd") > > + server_setup_cmd = params.get("server_setup_cmd") > > + drift_cmd = params.get("drift_cmd") > > + kill_stress_cmd = params.get("kill_stress_cmd") > > + kill_monitor_cmd = params.get("kill_monitor_cmd") > > + > > + threshold = int(params.get("stress_threshold")) > > + monitor_log_file_server = params.get("monitor_log_file_server") > > + monitor_log_file_client = params.get("monitor_log_file_client") > > + test_length = int(3600 * float(params.get("test_length"))) > > + monitor_port = int(params.get("monitor_port")) > > + > > + vm = env.get_vm(params["main_vm"]) > > + login_timeout = int(params.get("login_timeout", 360)) > > + stress_dir = os.path.join(os.environ['AUTODIR'], > > "tests/stress") > > + monitor_dir = os.path.join(test.bindir, 'deps') > > + > > + > > + def _kill_guest_programs(session, kill_stress_cmd, > > kill_monitor_cmd): > > + logging.info("Kill stress and monitor on guest") > > + try: > > + session.cmd(kill_stress_cmd) > > + except: > > + pass > > + try: > > + session.cmd(kill_monitor_cmd) > > + except: > > + pass > > + > > + > > + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): > > + logging.info("Kill stress and monitor on host") > > + utils.run(kill_stress_cmd, ignore_status=True) > > + utils.run(kill_monitor_cmd, ignore_status=True) > > + > > + > > + def host(): > > + logging.info("Setup monitor server on host") > > + # Kill previous instances of the host load programs, if any > > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > > + # Cleanup previous log instances > > + 
if os.path.isfile(monitor_log_file_server): > > + os.remove(monitor_log_file_server) > > + # Opening firewall ports on host > > + utils.run("iptables -F", ignore_status=True) > > + > > + # Run heartbeat on host > > + utils.run(server_setup_cmd % (monitor_dir, threshold, > > + monitor_log_file_server, > > monitor_port)) + > > + logging.info("Build stress on host") > > + # Uncompress and build stress on host > > + utils.run(stress_setup_cmd % stress_dir) > > + > > + logging.info("Run stress on host") > > + # stress_threads = 2 * n_cpus > > + threads_host = 2 * utils.count_cpus() > > + # Run stress test on host > > + utils.run(stress_cmd % (stress_dir, threads_host)) > > + > > + > > + def guest(): > > + try: > > + host_ip = socket.gethostbyname(socket.gethostname()) > > + except socket.error: > > + try: > > + # Hackish, but works well on stand alone (laptop) > > setups > > + # with access to the internet. If this fails, well, > > then > > + # not much else can be done... > > + s = socket.socket(socket.AF_INET, > > socket.SOCK_DGRAM) > > + s.connect(("redhat.com", 80)) > > + host_ip = s.getsockname()[0] > > + except socket.error, (value, e): > > + raise error.TestError("Could not determine host IP: > > %d %s" % > > + (value, e)) > > + > > + # Now, starting the guest > > + vm.verify_alive() > > + session = vm.wait_for_login(timeout=login_timeout) > > + > > + # Kill previous instances of the load programs, if any > > + _kill_guest_programs(session, kill_stress_cmd, > > kill_monitor_cmd) > > + # Clean up previous log instances > > + session.cmd("rm -f %s" % monitor_log_file_client) > > + > > + # Opening firewall ports on guest > > + try: > > + session.cmd("iptables -F") > > + except: > > + pass > > + > > + # Get required files and copy them from host to guest > > + monitor_path = os.path.join(test.bindir, 'deps', > > 'heartbeat_slu.py') > > + stress_path = os.path.join(os.environ['AUTODIR'], "tests", > > "stress", > > + "stress-1.0.4.tar.gz") > > + 
vm.copy_files_to(monitor_path, "/tmp") > > + vm.copy_files_to(stress_path, "/tmp") > > + > > + logging.info("Setup monitor client on guest") > > + # Start heartbeat on guest > > + session.cmd(params.get("client_setup_cmd") % > > + ("/tmp", monitor_log_file_client, host_ip, > > monitor_port)) + > > in tests_base.cfg > > client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file > %s --port %s --interval 1" > > in softlockup.py > > session.cmd(params.get("client_setup_cmd") % ("/tmp", > monitor_log_file_client, host_ip, > monitor_port)) > > address, file picking up options interchangeably. > > > > + logging.info("Build stress on guest") > > + # Uncompress and build stress on guest > > + session.cmd(stress_setup_cmd % "/tmp", timeout=200) > > + > > + logging.info("Run stress on guest") > > + # stress_threads = 2 * n_vcpus > > + threads_guest = 2 * int(params.get("smp", 1)) > > + # Run stress test on guest > > + session.cmd(stress_cmd % ("/tmp", threads_guest)) > > + > > + # Wait and report > > + logging.debug("Wait for %d s", test_length) > > + time.sleep(test_length) > > + > > + # Kill instances of the load programs on both guest and > > host > > + _kill_guest_programs(session, kill_stress_cmd, > > kill_monitor_cmd) > > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > > + > > + # Collect drift > > + drift = utils.system_output(drift_cmd % > > monitor_log_file_server) > > + logging.info("Drift noticed: %s", drift) > > + > > + > > + host() > > + guest() > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jul 20, 2011 at 10:30 PM, Lucas Meneghel Rodrigues <lmr@redhat.com> wrote: > From: pradeep <you@example.com> > > This patch introduces a soft lockup/drift test with stress. > > 1) Boot up a VM. > 2) Build stress on host and guest. > 3) run heartbeat monitor with the given options on server and host. > 3) Run for a relatively long time length, ex: 12, 18 or 24 hours. > 4) Output the test result and observe drift. > > Changes from v2: > * Fixed up commands being used on guest, lack of proper output > redirection was confusing aexpect > * Proper clean up previous instances of the monitor programs > lying around, as well as log files > * Resort to another method of determining host IP if the same > has no fully qualified hostname (stand alone laptops, for > example) > * Only use a single session on guest to execute all the commands. > previous version was opening unneeded connections. > * Fix stress execution in guest and host, now the stress instances > effectively start > * Actively open guest and host firewall rules so heartbeat monitor > communication can happen Ok Pradeep, if we are going to consider this a sort of stress test, then I won't require a PASS/FAIL criterion, albeit grumpily. Applied: http://autotest.kernel.org/changeset/5528 Cheers, Lucas -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py new file mode 100755 index 0000000..697bbbf --- /dev/null +++ b/client/tests/kvm/deps/heartbeat_slu.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python + +""" +Heartbeat server/client to detect soft lockups +""" + +import socket, os, sys, time, getopt + +def daemonize(output_file): + try: + pid = os.fork() + except OSError, e: + raise Exception, "error %d: %s" % (e.strerror, e.errno) + + if pid: + os._exit(0) + + os.umask(0) + os.setsid() + sys.stdout.flush() + sys.stderr.flush() + + if file: + output_handle = file(output_file, 'a+', 0) + # autoflush stdout/stderr + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) + else: + output_handle = file('/dev/null', 'a+') + + stdin_handle = open('/dev/null', 'r') + os.dup2(output_handle.fileno(), sys.stdout.fileno()) + os.dup2(output_handle.fileno(), sys.stderr.fileno()) + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) + +def recv_all(sock): + total_data = [] + while True: + data = sock.recv(1024) + if not data: + break + total_data.append(data) + return ''.join(total_data) + +def run_server(host, port, daemon, file, queue_size, threshold, drift): + if daemon: + daemonize(output_file=file) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind((host, port)) + sock.listen(queue_size) + timeout_interval = threshold * 2 + prev_check_timestamp = float(time.time()) + while 1: + c_sock, c_addr = sock.accept() + heartbeat = recv_all(c_sock) + local_timestamp = float(time.time()) + drift = check_heartbeat(heartbeat, local_timestamp, threshold, check_drift) + # NOTE: this doesn't work if the only client is the one that timed + # out, but anything more complete would require another thread and + # a lock for client_prev_timestamp. 
+ if local_timestamp - prev_check_timestamp > threshold * 2.0: + check_for_timeouts(threshold, check_drift) + prev_check_timestamp = local_timestamp + if verbose: + if check_drift: + print "%.2f: %s (%s)" % (local_timestamp, heartbeat, drift) + else: + print "%.2f: %s" % (local_timestamp, heartbeat) + +def run_client(host, port, daemon, file, interval): + if daemon: + daemonize(output_file=file) + seq = 1 + while 1: + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((host, port)) + heartbeat = get_heartbeat(seq) + sock.sendall(heartbeat) + sock.close() + if verbose: + print heartbeat + except socket.error, (value, message): + print "%.2f: ERROR, %d - %s" % (float(time.time()), value, message) + + seq += 1 + time.sleep(interval) + +def get_heartbeat(seq=1): + return "%s %06d %.2f" % (hostname, seq, float(time.time())) + +def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift): + hostname, seq, timestamp = heartbeat.rsplit() + timestamp = float(timestamp) + if client_prev_timestamp.has_key(hostname): + delta = local_timestamp - client_prev_timestamp[hostname] + if delta > threshold: + print "%.2f: ALERT, SLU detected on host %s, delta %ds" \ + % (float(time.time()), hostname, delta) + + client_prev_timestamp[hostname] = local_timestamp + + if check_drift: + if not client_clock_offset.has_key(hostname): + client_clock_offset[hostname] = timestamp - local_timestamp + client_prev_drift[hostname] = 0 + drift = timestamp - local_timestamp - client_clock_offset[hostname] + drift_delta = drift - client_prev_drift[hostname] + client_prev_drift[hostname] = drift + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) + +def check_for_timeouts(threshold, check_drift): + local_timestamp = float(time.time()) + hostname_list = list(client_prev_timestamp) + for hostname in hostname_list: + timestamp = client_prev_timestamp[hostname] + delta = local_timestamp - timestamp + if delta > threshold * 2: + print "%.2f: ALERT, SLU detected on 
host %s, no heartbeat for %ds" \ + % (local_timestamp, hostname, delta) + del client_prev_timestamp[hostname] + if check_drift: + del client_clock_offset[hostname] + del client_prev_drift[hostname] + +def usage(): + print """ +Usage: + + heartbeat_slu.py --server --address <bind_address> --port <bind_port> + [--file <output_file>] [--no-daemon] [--verbose] + [--threshold <heartbeat threshold>] + + heartbeat_slu.py --client --address <server_address> -p <server_port> + [--file output_file] [--no-daemon] [--verbose] + [--interval <heartbeat interval in seconds>] +""" + +# host information and global data +hostname = socket.gethostname() +client_prev_timestamp = {} +client_clock_offset = {} +client_prev_drift = {} + +# default param values +host_port = 9001 +host_address = '' +interval = 1 # seconds between heartbeats +threshold = 10 # seconds late till alert +is_server = False +is_daemon = True +file_server = "/tmp/heartbeat_server.out" +file_client = "/tmp/heartbeat_client.out" +file_selected = None +queue_size = 5 +verbose = False +check_drift = False + +# process cmdline opts +try: + opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ + "server", "client", "no-daemon", "address=", "port=", + "file=", "server", "interval=", "threshold=", "verbose", + "check-drift", "help"]) +except getopt.GetoptError, e: + print "error: %s" % str(e) + usage() + exit(1) + +for param, value in opts: + if param in ["-p", "--port"]: + host_port = int(value) + elif param in ["-a", "--address"]: + host_address = value + elif param in ["-s", "--server"]: + is_server = True + elif param in ["-c", "--client"]: + is_server = False + elif param in ["--no-daemon"]: + is_daemon = False + elif param in ["-f", "--file"]: + file_selected = value + elif param in ["-i", "--interval"]: + interval = int(value) + elif param in ["-t", "--threshold"]: + threshold = int(value) + elif param in ["-d", "--check-drift"]: + check_drift = True + elif param in ["-v", "--verbose"]: + verbose = True + 
elif param in ["-h", "--help"]: + usage() + exit(0) + else: + print "error: unrecognized option: %s" % value + usage() + exit(1) + +# run until we're terminated +if is_server: + file_server = file_selected or file_server + run_server(host_address, host_port, is_daemon, file_server, queue_size, threshold, check_drift) +else: + file_client = file_selected or file_client + run_client(host_address, host_port, is_daemon, file_client, interval) diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample index 65880d8..e9e41f9 100644 --- a/client/tests/kvm/tests_base.cfg.sample +++ b/client/tests/kvm/tests_base.cfg.sample @@ -420,6 +420,24 @@ variants: type = smbios_table start_vm = no + - softlockup: install setup unattended_install.cdrom + only Linux + type = softlockup + softlockup_files = stress-1.0.4.tar.gz + stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && cd stress-1.0.4 && ./configure && make && cd src" + server_setup_cmd = "%s/heartbeat_slu.py --server --threshold %s --file %s --port %s --verbose --check-drift" + client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1" + stress_cmd = "cd %s && cd stress-1.0.4 && cd src && nohup ./stress -c %s > /dev/null 2>&1&" + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk '{print$2}' | xargs kill -9 > /dev/null 2>&1" + kill_stress_cmd = "pkill -f stress > /dev/null 2>&1" + drift_cmd = "tail -1 %s | awk '{print $7}'" + monitor_log_file_server = /tmp/heartbeat_server.log + monitor_log_file_client = /tmp/heartbeat_client.log + monitor_port = 13330 + stress_threshold = 10 + # time_to_run (hours) = 12, 18, 24, 48 hours + test_length = 0.10 + - stress_boot: install setup image_copy unattended_install.cdrom type = stress_boot max_vms = 5 diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py new file mode 100644 index 0000000..d946965 --- /dev/null +++ b/client/virt/tests/softlockup.py @@ -0,0 +1,147 @@ +import 
logging, os, socket, time +from autotest_lib.client.bin import utils + + +def run_softlockup(test, params, env): + """ + soft lockup/drift test with stress. + + 1) Boot up a VM. + 2) Build stress on host and guest. + 3) run heartbeat with the given options on server and host. + 3) Run for a relatively long time length. ex: 12, 18 or 24 hours. + 4) Output the test result and observe drift. + + @param test: KVM test object. + @param params: Dictionary with the test parameters. + @param env: Dictionary with test environment. + """ + stress_setup_cmd = params.get("stress_setup_cmd") + stress_cmd = params.get("stress_cmd") + server_setup_cmd = params.get("server_setup_cmd") + drift_cmd = params.get("drift_cmd") + kill_stress_cmd = params.get("kill_stress_cmd") + kill_monitor_cmd = params.get("kill_monitor_cmd") + + threshold = int(params.get("stress_threshold")) + monitor_log_file_server = params.get("monitor_log_file_server") + monitor_log_file_client = params.get("monitor_log_file_client") + test_length = int(3600 * float(params.get("test_length"))) + monitor_port = int(params.get("monitor_port")) + + vm = env.get_vm(params["main_vm"]) + login_timeout = int(params.get("login_timeout", 360)) + stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress") + monitor_dir = os.path.join(test.bindir, 'deps') + + + def _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd): + logging.info("Kill stress and monitor on guest") + try: + session.cmd(kill_stress_cmd) + except: + pass + try: + session.cmd(kill_monitor_cmd) + except: + pass + + + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): + logging.info("Kill stress and monitor on host") + utils.run(kill_stress_cmd, ignore_status=True) + utils.run(kill_monitor_cmd, ignore_status=True) + + + def host(): + logging.info("Setup monitor server on host") + # Kill previous instances of the host load programs, if any + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) + # Cleanup previous log instances + 
if os.path.isfile(monitor_log_file_server): + os.remove(monitor_log_file_server) + # Opening firewall ports on host + utils.run("iptables -F", ignore_status=True) + + # Run heartbeat on host + utils.run(server_setup_cmd % (monitor_dir, threshold, + monitor_log_file_server, monitor_port)) + + logging.info("Build stress on host") + # Uncompress and build stress on host + utils.run(stress_setup_cmd % stress_dir) + + logging.info("Run stress on host") + # stress_threads = 2 * n_cpus + threads_host = 2 * utils.count_cpus() + # Run stress test on host + utils.run(stress_cmd % (stress_dir, threads_host)) + + + def guest(): + try: + host_ip = socket.gethostbyname(socket.gethostname()) + except socket.error: + try: + # Hackish, but works well on stand alone (laptop) setups + # with access to the internet. If this fails, well, then + # not much else can be done... + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("redhat.com", 80)) + host_ip = s.getsockname()[0] + except socket.error, (value, e): + raise error.TestError("Could not determine host IP: %d %s" % + (value, e)) + + # Now, starting the guest + vm.verify_alive() + session = vm.wait_for_login(timeout=login_timeout) + + # Kill previous instances of the load programs, if any + _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd) + # Clean up previous log instances + session.cmd("rm -f %s" % monitor_log_file_client) + + # Opening firewall ports on guest + try: + session.cmd("iptables -F") + except: + pass + + # Get required files and copy them from host to guest + monitor_path = os.path.join(test.bindir, 'deps', 'heartbeat_slu.py') + stress_path = os.path.join(os.environ['AUTODIR'], "tests", "stress", + "stress-1.0.4.tar.gz") + vm.copy_files_to(monitor_path, "/tmp") + vm.copy_files_to(stress_path, "/tmp") + + logging.info("Setup monitor client on guest") + # Start heartbeat on guest + session.cmd(params.get("client_setup_cmd") % + ("/tmp", monitor_log_file_client, host_ip, monitor_port)) + 
+ logging.info("Build stress on guest") + # Uncompress and build stress on guest + session.cmd(stress_setup_cmd % "/tmp", timeout=200) + + logging.info("Run stress on guest") + # stress_threads = 2 * n_vcpus + threads_guest = 2 * int(params.get("smp", 1)) + # Run stress test on guest + session.cmd(stress_cmd % ("/tmp", threads_guest)) + + # Wait and report + logging.debug("Wait for %d s", test_length) + time.sleep(test_length) + + # Kill instances of the load programs on both guest and host + _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd) + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) + + # Collect drift + drift = utils.system_output(drift_cmd % monitor_log_file_server) + logging.info("Drift noticed: %s", drift) + + + host() + guest()