From 8e0fdf641b8e9a14005b93abff52816ddaa70315 Mon Sep 17 00:00:00 2001 From: 2Shirt <2xShirt@gmail.com> Date: Thu, 17 Jun 2021 03:16:34 -0600 Subject: [PATCH] Retest temps with sysbench if Prime95 runs too hot If the CPU reaches the failing temps during Prime95 then sysbench will be run to emulate a less artificial workload. Both the overall and sysbench max temps are recorded and shown in the results. Added new option to track an alternate max temp value in wk.hw.sensors. This was needed to show two different max temps recorded during CPU testing. Sysbench was added to the Linux package list and is compiled for macOS. Without manually compiling the package it brings in way too many dependencies to support SQL DB testing (which we don't need). --- scripts/wk/hw/diags.py | 95 ++++++++++++++++++++++++++++++---- scripts/wk/hw/sensors.py | 35 +++++++++---- setup/linux/packages/base | 1 + setup/macos/install-deps | 10 +++- setup/macos/live-macos-startup | 2 +- setup/macos/update-base-image | 2 +- 6 files changed, 123 insertions(+), 22 deletions(-) diff --git a/scripts/wk/hw/diags.py b/scripts/wk/hw/diags.py index eec9c2b6..f56b51fb 100644 --- a/scripts/wk/hw/diags.py +++ b/scripts/wk/hw/diags.py @@ -108,7 +108,7 @@ class State(): self.tests = OrderedDict({ 'CPU & Cooling': { 'Enabled': False, - 'Function': cpu_mprime_test, + 'Function': cpu_stress_tests, 'Objects': [], }, 'Disk Attributes': { @@ -302,7 +302,7 @@ class State(): if not details['Selected']: continue if 'CPU' in name: - # Create two Test objects which will both be used by cpu_mprime_test + # Create two Test objects which will both be used by cpu_stress_tests # NOTE: Prime95 should be added first test_mprime_obj = hw_obj.Test(dev=self.cpu, label='Prime95') test_cooling_obj = hw_obj.Test(dev=self.cpu, label='Cooling') @@ -556,9 +556,12 @@ def calc_io_dd_values(dev_size): } -def check_cooling_results(test_obj, sensors): +def check_cooling_results(test_obj, sensors, run_sysbench=False): """Check cooling 
results and update test_obj.""" max_temp = sensors.cpu_max_temp() + temp_labels = ['Idle', 'Max', 'Cooldown'] + if run_sysbench: + temp_labels.append('Sysbench') # Check temps if not max_temp: @@ -571,8 +574,7 @@ def check_cooling_results(test_obj, sensors): test_obj.set_status('Passed') # Add temps to report - for line in sensors.generate_report( - 'Idle', 'Max', 'Cooldown', only_cpu=True): + for line in sensors.generate_report(*temp_labels, only_cpu=True): test_obj.report.append(f' {line}') @@ -702,12 +704,13 @@ def check_self_test_results(test_obj, aborted=False): test_obj.set_status('Failed') -def cpu_mprime_test(state, test_objects): +def cpu_stress_tests(state, test_objects): # pylint: disable=too-many-statements - """CPU & cooling check using Prime95.""" + """CPU & cooling check using Prime95 and Sysbench.""" LOG.info('CPU Test (Prime95)') aborted = False prime_log = pathlib.Path(f'{state.log_dir}/prime.log') + run_sysbench = False sensors_out = pathlib.Path(f'{state.log_dir}/sensors.out') test_mprime_obj, test_cooling_obj = test_objects @@ -777,9 +780,41 @@ def cpu_mprime_test(state, test_objects): test_mprime_obj.report.append(std.color_string('Prime95', 'BLUE')) check_mprime_results(test_obj=test_mprime_obj, working_dir=state.log_dir) + # Run Sysbench test if necessary + run_sysbench = ( + not aborted and sensors.cpu_max_temp() >= cfg.hw.CPU_FAILURE_TEMP + ) + if run_sysbench: + LOG.info('CPU Test (Sysbench)') + std.clear_screen() + std.print_info('Starting alternate stress test') + print('') + proc_sysbench, filehandle_sysbench = start_sysbench( + sensors, + sensors_out, + log_path=prime_log.with_name('sysbench.log'), + pane='Prime95', + ) + try: + print_countdown(proc=proc_sysbench, seconds=cfg.hw.CPU_TEST_MINUTES*60) + except AttributeError: + # Assuming the sysbench process wasn't found and proc was set to None + LOG.error('Failed to find sysbench process', exc_info=True) + except KeyboardInterrupt: + aborted = True + stop_sysbench(proc_sysbench, 
filehandle_sysbench) + + # Update progress + # NOTE: CPU critical temp check isn't really necessary + # Hard to imagine it wasn't hit during Prime95 but was in sysbench + if sensors.cpu_reached_critical_temp() or aborted: + test_cooling_obj.set_status('Aborted') + test_mprime_obj.set_status('Aborted') + state.update_progress_pane() + # Check Cooling results test_cooling_obj.report.append(std.color_string('Temps', 'BLUE')) - check_cooling_results(test_obj=test_cooling_obj, sensors=sensors) + check_cooling_results(test_cooling_obj, sensors, run_sysbench) # Cleanup state.update_progress_pane() @@ -1300,7 +1335,8 @@ def print_countdown(proc, seconds): except subprocess.TimeoutExpired: # proc still going, continue pass - if proc.poll() is not None: + if ((hasattr(proc, 'poll') and proc.poll() is not None) + or (hasattr(proc, 'is_running') and not proc.is_running())): # proc exited, stop countdown break @@ -1460,6 +1496,35 @@ def start_mprime(working_dir, log_path): return proc_mprime +def start_sysbench(sensors, sensors_out, log_path, pane): + """Start sysbench, returns tuple with Popen object and file handle.""" + set_apple_fan_speed('max') + sysbench_cmd = [ + 'sysbench', + f'--threads={exe.psutil.cpu_count()}', + '--cpu-max-prime=1000000000', + 'cpu', + 'run', + ] + + # Restart background monitor for Sysbench + sensors.stop_background_monitor() + sensors.start_background_monitor( + sensors_out, + alt_max='Sysbench', + thermal_action=('killall', 'sysbench', '-INT'), + ) + + # Update bottom pane + tmux.respawn_pane(pane, watch_file=log_path) + + # Start sysbench + filehandle_sysbench = open(log_path, 'a') + proc_sysbench = exe.popen_program(sysbench_cmd, stdout=filehandle_sysbench) + + # Done + return (proc_sysbench, filehandle_sysbench) + def stop_mprime(proc_mprime): """Stop mprime gracefully, then forcefully as needed.""" proc_mprime.terminate() @@ -1470,6 +1535,18 @@ def stop_mprime(proc_mprime): set_apple_fan_speed('auto') +def stop_sysbench(proc_sysbench, 
filehandle_sysbench): + """Stop sysbench.""" + proc_sysbench.terminate() + try: + proc_sysbench.wait(timeout=5) + except subprocess.TimeoutExpired: + proc_sysbench.kill() + filehandle_sysbench.flush() + filehandle_sysbench.close() + set_apple_fan_speed('auto') + + def sync_clock(): """Sync clock under macOS using sntp.""" cmd = ['sudo', 'sntp', '-Ss', 'us.pool.ntp.org'] diff --git a/scripts/wk/hw/sensors.py b/scripts/wk/hw/sensors.py index 573c9c3b..e5b27bc1 100644 --- a/scripts/wk/hw/sensors.py +++ b/scripts/wk/hw/sensors.py @@ -10,6 +10,7 @@ from subprocess import CalledProcessError from wk.cfg.hw import CPU_CRITICAL_TEMP, SMC_IDS, TEMP_COLORS from wk.exe import run_program, start_thread +from wk.io import non_clobber_path from wk.std import PLATFORM, color_string, sleep @@ -115,20 +116,27 @@ class Sensors(): return report def monitor_to_file( - self, out_path, + self, out_path, alt_max=None, exit_on_thermal_limit=True, temp_labels=None, thermal_action=None): + # pylint: disable=too-many-arguments """Write report to path every second until stopped. thermal_action is a cmd to run if ThermalLimitReachedError is caught. 
""" stop_path = pathlib.Path(out_path).resolve().with_suffix('.stop') + if stop_path.exists(): + # Rename existing file to allow thread to start as expected + # Yes this is excessive but safe + stop_path.rename(non_clobber_path(stop_path)) if not temp_labels: - temp_labels = ('Current', 'Max') + temp_labels = ['Current', 'Max'] + if alt_max: + temp_labels.append(alt_max) # Start loop while True: try: - self.update_sensor_data(exit_on_thermal_limit) + self.update_sensor_data(alt_max, exit_on_thermal_limit) except ThermalLimitReachedError: if thermal_action: run_program(thermal_action, check=False) @@ -169,8 +177,9 @@ class Sensors(): source_data[temp_label] = 0 def start_background_monitor( - self, out_path, + self, out_path, alt_max=None, exit_on_thermal_limit=True, temp_labels=None, thermal_action=None): + # pylint: disable=too-many-arguments """Start background thread to save report to file. thermal_action is a cmd to run if ThermalLimitReachedError is caught. @@ -181,7 +190,9 @@ class Sensors(): self.out_path = pathlib.Path(out_path) self.background_thread = start_thread( self.monitor_to_file, - args=(out_path, exit_on_thermal_limit, temp_labels, thermal_action), + args=( + out_path, alt_max, exit_on_thermal_limit, temp_labels, thermal_action, + ), ) def stop_background_monitor(self): @@ -193,14 +204,14 @@ class Sensors(): self.background_thread = None self.out_path = None - def update_sensor_data(self, exit_on_thermal_limit=True): + def update_sensor_data(self, alt_max=None, exit_on_thermal_limit=True): """Update sensor data via OS-specific means.""" if PLATFORM == 'Darwin': - self.update_sensor_data_macos(exit_on_thermal_limit) + self.update_sensor_data_macos(alt_max, exit_on_thermal_limit) elif PLATFORM == 'Linux': - self.update_sensor_data_linux(exit_on_thermal_limit) + self.update_sensor_data_linux(alt_max, exit_on_thermal_limit) - def update_sensor_data_linux(self, exit_on_thermal_limit=True): + def update_sensor_data_linux(self, alt_max, 
exit_on_thermal_limit=True): """Update sensor data via lm_sensors.""" lm_sensor_data = get_sensor_data_lm() for section, adapters in self.data.items(): @@ -212,6 +223,8 @@ class Sensors(): source_data['Current'] = temp source_data['Max'] = max(temp, source_data['Max']) source_data['Temps'].append(temp) + if alt_max: + source_data[alt_max] = max(temp, source_data.get(alt_max, 0)) except KeyError: # Dumb workaround for Dell sensors with changing source names pass @@ -221,7 +234,7 @@ class Sensors(): if source_data['Current'] >= CPU_CRITICAL_TEMP: raise ThermalLimitReachedError('CPU temps reached limit') - def update_sensor_data_macos(self, exit_on_thermal_limit=True): + def update_sensor_data_macos(self, alt_max, exit_on_thermal_limit=True): """Update sensor data via SMC.""" for section, adapters in self.data.items(): for sources in adapters.values(): @@ -239,6 +252,8 @@ class Sensors(): source_data['Current'] = temp source_data['Max'] = max(temp, source_data['Max']) source_data['Temps'].append(temp) + if alt_max: + source_data[alt_max] = max(temp, source_data.get(alt_max, 0)) # Raise exception if thermal limit reached if exit_on_thermal_limit and section == 'CPUTemps': diff --git a/setup/linux/packages/base b/setup/linux/packages/base index c9084bd2..8fff04f3 100644 --- a/setup/linux/packages/base +++ b/setup/linux/packages/base @@ -64,6 +64,7 @@ rxvt-unicode-terminfo smartmontools-svn speedtest-cli sudo +sysbench sysfsutils syslinux systemd-sysvcompat diff --git a/setup/macos/install-deps b/setup/macos/install-deps index 492d693c..a17499d9 100755 --- a/setup/macos/install-deps +++ b/setup/macos/install-deps @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # set -o errexit @@ -103,3 +103,11 @@ git clone https://github.com/yuyichao/gnuplot-py gnuplot-py cd gnuplot-py git checkout 2c2218dc67 python3 setup.py install + +# Sysbench +git clone https://github.com/akopytov/sysbench sysbench +cd sysbench +./autogen.sh LDFLAGS=-L/usr/local/opt/openssl/lib --without-mysql 
+./configure LDFLAGS=-L/usr/local/opt/openssl/lib --without-mysql +make MACOSX_DEPLOYMENT_TARGET="${OS_VERSION:0:5}" -j +sudo mv -nv sysbench/src/sysbench /usr/local/bin/ diff --git a/setup/macos/live-macos-startup b/setup/macos/live-macos-startup index 7b5d3e7b..db939bdf 100755 --- a/setup/macos/live-macos-startup +++ b/setup/macos/live-macos-startup @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # ## Init macOS env diff --git a/setup/macos/update-base-image b/setup/macos/update-base-image index abfc00ed..3d985af0 100755 --- a/setup/macos/update-base-image +++ b/setup/macos/update-base-image @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # ## Update BaseImage for use as WK