diff --git a/scripts/wk/hw/diags.py b/scripts/wk/hw/diags.py index eec9c2b6..f56b51fb 100644 --- a/scripts/wk/hw/diags.py +++ b/scripts/wk/hw/diags.py @@ -108,7 +108,7 @@ class State(): self.tests = OrderedDict({ 'CPU & Cooling': { 'Enabled': False, - 'Function': cpu_mprime_test, + 'Function': cpu_stress_tests, 'Objects': [], }, 'Disk Attributes': { @@ -302,7 +302,7 @@ class State(): if not details['Selected']: continue if 'CPU' in name: - # Create two Test objects which will both be used by cpu_mprime_test + # Create two Test objects which will both be used by cpu_stress_tests # NOTE: Prime95 should be added first test_mprime_obj = hw_obj.Test(dev=self.cpu, label='Prime95') test_cooling_obj = hw_obj.Test(dev=self.cpu, label='Cooling') @@ -556,9 +556,12 @@ def calc_io_dd_values(dev_size): } -def check_cooling_results(test_obj, sensors): +def check_cooling_results(test_obj, sensors, run_sysbench=False): """Check cooling results and update test_obj.""" max_temp = sensors.cpu_max_temp() + temp_labels = ['Idle', 'Max', 'Cooldown'] + if run_sysbench: + temp_labels.append('Sysbench') # Check temps if not max_temp: @@ -571,8 +574,7 @@ def check_cooling_results(test_obj, sensors): test_obj.set_status('Passed') # Add temps to report - for line in sensors.generate_report( - 'Idle', 'Max', 'Cooldown', only_cpu=True): + for line in sensors.generate_report(*temp_labels, only_cpu=True): test_obj.report.append(f' {line}') @@ -702,12 +704,13 @@ def check_self_test_results(test_obj, aborted=False): test_obj.set_status('Failed') -def cpu_mprime_test(state, test_objects): +def cpu_stress_tests(state, test_objects): # pylint: disable=too-many-statements - """CPU & cooling check using Prime95.""" + """CPU & cooling check using Prime95 and Sysbench.""" LOG.info('CPU Test (Prime95)') aborted = False prime_log = pathlib.Path(f'{state.log_dir}/prime.log') + run_sysbench = False sensors_out = pathlib.Path(f'{state.log_dir}/sensors.out') test_mprime_obj, test_cooling_obj = test_objects @@ -777,9 +780,41 @@ def cpu_mprime_test(state, test_objects): test_mprime_obj.report.append(std.color_string('Prime95', 'BLUE')) check_mprime_results(test_obj=test_mprime_obj, working_dir=state.log_dir) + # Run Sysbench test if necessary + run_sysbench = ( + not aborted and sensors.cpu_max_temp() >= cfg.hw.CPU_FAILURE_TEMP + ) + if run_sysbench: + LOG.info('CPU Test (Sysbench)') + std.clear_screen() + std.print_info('Starting alternate stress test') + print('') + proc_sysbench, filehandle_sysbench = start_sysbench( + sensors, + sensors_out, + log_path=prime_log.with_name('sysbench.log'), + pane='Prime95', + ) + try: + print_countdown(proc=proc_sysbench, seconds=cfg.hw.CPU_TEST_MINUTES*60) + except AttributeError: + # Assuming the sysbench process wasn't found and proc was set to None + LOG.error('Failed to find sysbench process', exc_info=True) + except KeyboardInterrupt: + aborted = True + stop_sysbench(proc_sysbench, filehandle_sysbench) + + # Update progress + # NOTE: CPU critical temp check isn't really necessary + # Hard to imagine it wasn't hit during Prime95 but was in sysbench + if sensors.cpu_reached_critical_temp() or aborted: + test_cooling_obj.set_status('Aborted') + test_mprime_obj.set_status('Aborted') + state.update_progress_pane() + # Check Cooling results test_cooling_obj.report.append(std.color_string('Temps', 'BLUE')) - check_cooling_results(test_obj=test_cooling_obj, sensors=sensors) + check_cooling_results(test_cooling_obj, sensors, run_sysbench) # Cleanup state.update_progress_pane() @@ -1300,7 +1335,8 @@ def print_countdown(proc, seconds): except subprocess.TimeoutExpired: # proc still going, continue pass - if proc.poll() is not None: + if ((hasattr(proc, 'poll') and proc.poll() is not None) + or (hasattr(proc, 'is_running') and not proc.is_running())): # proc exited, stop countdown break @@ -1460,6 +1496,35 @@ def start_mprime(working_dir, log_path): return proc_mprime +def start_sysbench(sensors, sensors_out, log_path, pane): + """Start sysbench, returns tuple with Popen object and file handle.""" + set_apple_fan_speed('max') + sysbench_cmd = [ + 'sysbench', + f'--threads={exe.psutil.cpu_count()}', + '--cpu-max-prime=1000000000', + 'cpu', + 'run', + ] + + # Restart background monitor for Sysbench + sensors.stop_background_monitor() + sensors.start_background_monitor( + sensors_out, + alt_max='Sysbench', + thermal_action=('killall', 'sysbench', '-INT'), + ) + + # Update bottom pane + tmux.respawn_pane(pane, watch_file=log_path) + + # Start sysbench + filehandle_sysbench = open(log_path, 'a') + proc_sysbench = exe.popen_program(sysbench_cmd, stdout=filehandle_sysbench) + + # Done + return (proc_sysbench, filehandle_sysbench) + def stop_mprime(proc_mprime): """Stop mprime gracefully, then forcefully as needed.""" proc_mprime.terminate() @@ -1470,6 +1535,18 @@ def stop_mprime(proc_mprime): set_apple_fan_speed('auto') +def stop_sysbench(proc_sysbench, filehandle_sysbench): + """Stop sysbench.""" + proc_sysbench.terminate() + try: + proc_sysbench.wait(timeout=5) + except subprocess.TimeoutExpired: + proc_sysbench.kill() + filehandle_sysbench.flush() + filehandle_sysbench.close() + set_apple_fan_speed('auto') + + def sync_clock(): """Sync clock under macOS using sntp.""" cmd = ['sudo', 'sntp', '-Ss', 'us.pool.ntp.org'] diff --git a/scripts/wk/hw/sensors.py b/scripts/wk/hw/sensors.py index 573c9c3b..e5b27bc1 100644 --- a/scripts/wk/hw/sensors.py +++ b/scripts/wk/hw/sensors.py @@ -10,6 +10,7 @@ from subprocess import CalledProcessError from wk.cfg.hw import CPU_CRITICAL_TEMP, SMC_IDS, TEMP_COLORS from wk.exe import run_program, start_thread +from wk.io import non_clobber_path from wk.std import PLATFORM, color_string, sleep @@ -115,20 +116,27 @@ class Sensors(): return report def monitor_to_file( - self, out_path, + self, out_path, alt_max=None, exit_on_thermal_limit=True, temp_labels=None, thermal_action=None): + # pylint: disable=too-many-arguments """Write report to path every second until stopped. thermal_action is a cmd to run if ThermalLimitReachedError is caught. """ stop_path = pathlib.Path(out_path).resolve().with_suffix('.stop') + if stop_path.exists(): + # Rename existing file to allow thread to start as expected + # Yes this is excessive but safe + stop_path.rename(non_clobber_path(stop_path)) if not temp_labels: - temp_labels = ('Current', 'Max') + temp_labels = ['Current', 'Max'] + if alt_max: + temp_labels.append(alt_max) # Start loop while True: try: - self.update_sensor_data(exit_on_thermal_limit) + self.update_sensor_data(alt_max, exit_on_thermal_limit) except ThermalLimitReachedError: if thermal_action: run_program(thermal_action, check=False) @@ -169,8 +177,9 @@ class Sensors(): source_data[temp_label] = 0 def start_background_monitor( - self, out_path, + self, out_path, alt_max=None, exit_on_thermal_limit=True, temp_labels=None, thermal_action=None): + # pylint: disable=too-many-arguments """Start background thread to save report to file. thermal_action is a cmd to run if ThermalLimitReachedError is caught. @@ -181,7 +190,9 @@ class Sensors(): self.out_path = pathlib.Path(out_path) self.background_thread = start_thread( self.monitor_to_file, - args=(out_path, exit_on_thermal_limit, temp_labels, thermal_action), + args=( + out_path, alt_max, exit_on_thermal_limit, temp_labels, thermal_action, + ), ) def stop_background_monitor(self): @@ -193,14 +204,14 @@ class Sensors(): self.background_thread = None self.out_path = None - def update_sensor_data(self, exit_on_thermal_limit=True): + def update_sensor_data(self, alt_max=None, exit_on_thermal_limit=True): """Update sensor data via OS-specific means.""" if PLATFORM == 'Darwin': - self.update_sensor_data_macos(exit_on_thermal_limit) + self.update_sensor_data_macos(alt_max, exit_on_thermal_limit) elif PLATFORM == 'Linux': - self.update_sensor_data_linux(exit_on_thermal_limit) + self.update_sensor_data_linux(alt_max, exit_on_thermal_limit) - def update_sensor_data_linux(self, exit_on_thermal_limit=True): + def update_sensor_data_linux(self, alt_max, exit_on_thermal_limit=True): """Update sensor data via lm_sensors.""" lm_sensor_data = get_sensor_data_lm() for section, adapters in self.data.items(): @@ -212,6 +223,8 @@ class Sensors(): source_data['Current'] = temp source_data['Max'] = max(temp, source_data['Max']) source_data['Temps'].append(temp) + if alt_max: + source_data[alt_max] = max(temp, source_data.get(alt_max, 0)) except KeyError: # Dumb workaround for Dell sensors with changing source names pass @@ -221,7 +234,7 @@ class Sensors(): if source_data['Current'] >= CPU_CRITICAL_TEMP: raise ThermalLimitReachedError('CPU temps reached limit') - def update_sensor_data_macos(self, exit_on_thermal_limit=True): + def update_sensor_data_macos(self, alt_max, exit_on_thermal_limit=True): """Update sensor data via SMC.""" for section, adapters in self.data.items(): for sources in adapters.values(): @@ -239,6 +252,8 @@ class Sensors(): source_data['Current'] = temp source_data['Max'] = max(temp, source_data['Max']) source_data['Temps'].append(temp) + if alt_max: + source_data[alt_max] = max(temp, source_data.get(alt_max, 0)) # Raise exception if thermal limit reached if exit_on_thermal_limit and section == 'CPUTemps': diff --git a/setup/linux/packages/base b/setup/linux/packages/base index c9084bd2..8fff04f3 100644 --- a/setup/linux/packages/base +++ b/setup/linux/packages/base @@ -64,6 +64,7 @@ rxvt-unicode-terminfo smartmontools-svn speedtest-cli sudo +sysbench sysfsutils syslinux systemd-sysvcompat diff --git a/setup/macos/install-deps b/setup/macos/install-deps index 492d693c..a17499d9 100755 --- a/setup/macos/install-deps +++ b/setup/macos/install-deps @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # set -o errexit @@ -103,3 +103,11 @@ git clone https://github.com/yuyichao/gnuplot-py gnuplot-py cd gnuplot-py git checkout 2c2218dc67 python3 setup.py install + +# Sysbench +git clone https://github.com/akopytov/sysbench sysbench +cd sysbench +./autogen.sh LDFLAGS=-L/usr/local/opt/openssl/lib --without-mysql +./configure LDFLAGS=-L/usr/local/opt/openssl/lib --without-mysql +make MACOSX_DEPLOYMENT_TARGET="${OS_VERSION:0:5}" -j +sudo mv -nv sysbench/src/sysbench /usr/local/bin/ diff --git a/setup/macos/live-macos-startup b/setup/macos/live-macos-startup index 7b5d3e7b..db939bdf 100755 --- a/setup/macos/live-macos-startup +++ b/setup/macos/live-macos-startup @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # ## Init macOS env diff --git a/setup/macos/update-base-image b/setup/macos/update-base-image index abfc00ed..3d985af0 100755 --- a/setup/macos/update-base-image +++ b/setup/macos/update-base-image @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # ## Update BaseImage for use as WK