Retest temps with sysbench if Prime95 runs too hot

If the CPU reaches the failing temps during Prime95 then sysbench will be
run to emulate a less artificial workload.  Both the overall and sysbench
max temps are recorded and shown in the results.

Added new option to track an alternate max temp value in wk.hw.sensors.
This was needed to show two different max temps recorded during CPU testing.

Sysbench was added to the Linux package list and is compiled for macOS.
Without manually compiling the package it brings in way too many dependencies
to support SQL DB testing (which we don't need).
This commit is contained in:
2Shirt 2021-06-17 03:16:34 -06:00
parent bfea4b9910
commit 8e0fdf641b
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C
6 changed files with 123 additions and 22 deletions

View file

@ -108,7 +108,7 @@ class State():
self.tests = OrderedDict({
'CPU & Cooling': {
'Enabled': False,
'Function': cpu_mprime_test,
'Function': cpu_stress_tests,
'Objects': [],
},
'Disk Attributes': {
@ -302,7 +302,7 @@ class State():
if not details['Selected']:
continue
if 'CPU' in name:
# Create two Test objects which will both be used by cpu_mprime_test
# Create two Test objects which will both be used by cpu_stress_tests
# NOTE: Prime95 should be added first
test_mprime_obj = hw_obj.Test(dev=self.cpu, label='Prime95')
test_cooling_obj = hw_obj.Test(dev=self.cpu, label='Cooling')
@ -556,9 +556,12 @@ def calc_io_dd_values(dev_size):
}
def check_cooling_results(test_obj, sensors):
def check_cooling_results(test_obj, sensors, run_sysbench=False):
"""Check cooling results and update test_obj."""
max_temp = sensors.cpu_max_temp()
temp_labels = ['Idle', 'Max', 'Cooldown']
if run_sysbench:
temp_labels.append('Sysbench')
# Check temps
if not max_temp:
@ -571,8 +574,7 @@ def check_cooling_results(test_obj, sensors):
test_obj.set_status('Passed')
# Add temps to report
for line in sensors.generate_report(
'Idle', 'Max', 'Cooldown', only_cpu=True):
for line in sensors.generate_report(*temp_labels, only_cpu=True):
test_obj.report.append(f' {line}')
@ -702,12 +704,13 @@ def check_self_test_results(test_obj, aborted=False):
test_obj.set_status('Failed')
def cpu_mprime_test(state, test_objects):
def cpu_stress_tests(state, test_objects):
# pylint: disable=too-many-statements
"""CPU & cooling check using Prime95."""
"""CPU & cooling check using Prime95 and Sysbench."""
LOG.info('CPU Test (Prime95)')
aborted = False
prime_log = pathlib.Path(f'{state.log_dir}/prime.log')
run_sysbench = False
sensors_out = pathlib.Path(f'{state.log_dir}/sensors.out')
test_mprime_obj, test_cooling_obj = test_objects
@ -777,9 +780,41 @@ def cpu_mprime_test(state, test_objects):
test_mprime_obj.report.append(std.color_string('Prime95', 'BLUE'))
check_mprime_results(test_obj=test_mprime_obj, working_dir=state.log_dir)
# Run Sysbench test if necessary
run_sysbench = (
not aborted and sensors.cpu_max_temp() >= cfg.hw.CPU_FAILURE_TEMP
)
if run_sysbench:
LOG.info('CPU Test (Sysbench)')
std.clear_screen()
std.print_info('Starting alternate stress test')
print('')
proc_sysbench, filehandle_sysbench = start_sysbench(
sensors,
sensors_out,
log_path=prime_log.with_name('sysbench.log'),
pane='Prime95',
)
try:
print_countdown(proc=proc_sysbench, seconds=cfg.hw.CPU_TEST_MINUTES*60)
except AttributeError:
# Assuming the sysbench process wasn't found and proc was set to None
LOG.error('Failed to find sysbench process', exc_info=True)
except KeyboardInterrupt:
aborted = True
stop_sysbench(proc_sysbench, filehandle_sysbench)
# Update progress
# NOTE: CPU critical temp check isn't really necessary
# Hard to imagine it wasn't hit during Prime95 but was in sysbench
if sensors.cpu_reached_critical_temp() or aborted:
test_cooling_obj.set_status('Aborted')
test_mprime_obj.set_status('Aborted')
state.update_progress_pane()
# Check Cooling results
test_cooling_obj.report.append(std.color_string('Temps', 'BLUE'))
check_cooling_results(test_obj=test_cooling_obj, sensors=sensors)
check_cooling_results(test_cooling_obj, sensors, run_sysbench)
# Cleanup
state.update_progress_pane()
@ -1300,7 +1335,8 @@ def print_countdown(proc, seconds):
except subprocess.TimeoutExpired:
# proc still going, continue
pass
if proc.poll() is not None:
if ((hasattr(proc, 'poll') and proc.poll() is not None)
or (hasattr(proc, 'is_running') and not proc.is_running())):
# proc exited, stop countdown
break
@ -1460,6 +1496,35 @@ def start_mprime(working_dir, log_path):
return proc_mprime
def start_sysbench(sensors, sensors_out, log_path, pane):
    """Start sysbench, returns tuple with Popen object and file handle.

    The background sensor monitor is restarted so that a separate
    'Sysbench' max temp is tracked and so sysbench is interrupted if the
    thermal limit is reached. The tmux pane named by `pane` is respawned
    to watch `log_path`, which also receives sysbench's stdout.

    The returned file handle is left open on purpose; the caller is
    expected to pass both returned values to stop_sysbench() when done.
    """
    set_apple_fan_speed('max')
    sysbench_cmd = [
        'sysbench',
        f'--threads={exe.psutil.cpu_count()}',
        '--cpu-max-prime=1000000000',
        'cpu',
        'run',
    ]

    # Restart background monitor for Sysbench
    sensors.stop_background_monitor()
    sensors.start_background_monitor(
        sensors_out,
        alt_max='Sysbench',
        thermal_action=('killall', 'sysbench', '-INT'),
    )

    # Update bottom pane
    tmux.respawn_pane(pane, watch_file=log_path)

    # Start sysbench
    # NOTE: A with-block can't be used here since the handle must outlive
    #       this function, so close it manually if the process fails to start.
    filehandle_sysbench = open(log_path, 'a')
    try:
        proc_sysbench = exe.popen_program(
            sysbench_cmd, stdout=filehandle_sysbench)
    except Exception:
        filehandle_sysbench.close()
        raise

    # Done
    return (proc_sysbench, filehandle_sysbench)
def stop_mprime(proc_mprime):
"""Stop mprime gracefully, then forcefully as needed."""
proc_mprime.terminate()
@ -1470,6 +1535,18 @@ def stop_mprime(proc_mprime):
set_apple_fan_speed('auto')
def stop_sysbench(proc_sysbench, filehandle_sysbench):
    """Stop sysbench gracefully, then forcefully as needed.

    proc_sysbench may be None (the process failed to start or wasn't
    found); in that case only the cleanup steps are performed.

    The log file handle is always closed and the fan speed is always
    returned to 'auto', even if stopping the process raises.
    """
    try:
        if proc_sysbench is not None:
            proc_sysbench.terminate()
            try:
                proc_sysbench.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # Didn't exit in time, force it
                proc_sysbench.kill()
    finally:
        try:
            filehandle_sysbench.flush()
            filehandle_sysbench.close()
        finally:
            set_apple_fan_speed('auto')
def sync_clock():
"""Sync clock under macOS using sntp."""
cmd = ['sudo', 'sntp', '-Ss', 'us.pool.ntp.org']

View file

@ -10,6 +10,7 @@ from subprocess import CalledProcessError
from wk.cfg.hw import CPU_CRITICAL_TEMP, SMC_IDS, TEMP_COLORS
from wk.exe import run_program, start_thread
from wk.io import non_clobber_path
from wk.std import PLATFORM, color_string, sleep
@ -115,20 +116,27 @@ class Sensors():
return report
def monitor_to_file(
self, out_path,
self, out_path, alt_max=None,
exit_on_thermal_limit=True, temp_labels=None, thermal_action=None):
# pylint: disable=too-many-arguments
"""Write report to path every second until stopped.
thermal_action is a cmd to run if ThermalLimitReachedError is caught.
"""
stop_path = pathlib.Path(out_path).resolve().with_suffix('.stop')
if stop_path.exists():
# Rename existing file to allow thread to start as expected
# Yes this is excessive but safe
stop_path.rename(non_clobber_path(stop_path))
if not temp_labels:
temp_labels = ('Current', 'Max')
temp_labels = ['Current', 'Max']
if alt_max:
temp_labels.append(alt_max)
# Start loop
while True:
try:
self.update_sensor_data(exit_on_thermal_limit)
self.update_sensor_data(alt_max, exit_on_thermal_limit)
except ThermalLimitReachedError:
if thermal_action:
run_program(thermal_action, check=False)
@ -169,8 +177,9 @@ class Sensors():
source_data[temp_label] = 0
def start_background_monitor(
self, out_path,
self, out_path, alt_max=None,
exit_on_thermal_limit=True, temp_labels=None, thermal_action=None):
# pylint: disable=too-many-arguments
"""Start background thread to save report to file.
thermal_action is a cmd to run if ThermalLimitReachedError is caught.
@ -181,7 +190,9 @@ class Sensors():
self.out_path = pathlib.Path(out_path)
self.background_thread = start_thread(
self.monitor_to_file,
args=(out_path, exit_on_thermal_limit, temp_labels, thermal_action),
args=(
out_path, alt_max, exit_on_thermal_limit, temp_labels, thermal_action,
),
)
def stop_background_monitor(self):
@ -193,14 +204,14 @@ class Sensors():
self.background_thread = None
self.out_path = None
def update_sensor_data(self, exit_on_thermal_limit=True):
def update_sensor_data(self, alt_max=None, exit_on_thermal_limit=True):
"""Update sensor data via OS-specific means."""
if PLATFORM == 'Darwin':
self.update_sensor_data_macos(exit_on_thermal_limit)
self.update_sensor_data_macos(alt_max, exit_on_thermal_limit)
elif PLATFORM == 'Linux':
self.update_sensor_data_linux(exit_on_thermal_limit)
self.update_sensor_data_linux(alt_max, exit_on_thermal_limit)
def update_sensor_data_linux(self, exit_on_thermal_limit=True):
def update_sensor_data_linux(self, alt_max, exit_on_thermal_limit=True):
"""Update sensor data via lm_sensors."""
lm_sensor_data = get_sensor_data_lm()
for section, adapters in self.data.items():
@ -212,6 +223,8 @@ class Sensors():
source_data['Current'] = temp
source_data['Max'] = max(temp, source_data['Max'])
source_data['Temps'].append(temp)
if alt_max:
source_data[alt_max] = max(temp, source_data.get(alt_max, 0))
except KeyError:
# Dumb workaround for Dell sensors with changing source names
pass
@ -221,7 +234,7 @@ class Sensors():
if source_data['Current'] >= CPU_CRITICAL_TEMP:
raise ThermalLimitReachedError('CPU temps reached limit')
def update_sensor_data_macos(self, exit_on_thermal_limit=True):
def update_sensor_data_macos(self, alt_max, exit_on_thermal_limit=True):
"""Update sensor data via SMC."""
for section, adapters in self.data.items():
for sources in adapters.values():
@ -239,6 +252,8 @@ class Sensors():
source_data['Current'] = temp
source_data['Max'] = max(temp, source_data['Max'])
source_data['Temps'].append(temp)
if alt_max:
source_data[alt_max] = max(temp, source_data.get(alt_max, 0))
# Raise exception if thermal limit reached
if exit_on_thermal_limit and section == 'CPUTemps':

View file

@ -64,6 +64,7 @@ rxvt-unicode-terminfo
smartmontools-svn
speedtest-cli
sudo
sysbench
sysfsutils
syslinux
systemd-sysvcompat

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
set -o errexit
@ -103,3 +103,11 @@ git clone https://github.com/yuyichao/gnuplot-py gnuplot-py
cd gnuplot-py
git checkout 2c2218dc67
python3 setup.py install
# Sysbench
# NOTE: Built from source without MySQL support to avoid pulling in the
#       SQL DB testing dependencies (which aren't needed).
git clone https://github.com/akopytov/sysbench sysbench
cd sysbench
./autogen.sh LDFLAGS=-L/usr/local/opt/openssl/lib --without-mysql
./configure LDFLAGS=-L/usr/local/opt/openssl/lib --without-mysql
make MACOSX_DEPLOYMENT_TARGET="${OS_VERSION:0:5}" -j
# The current dir is already the sysbench repo, so the binary is at src/sysbench
sudo mv -nv src/sysbench /usr/local/bin/

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
## Init macOS env

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
#
## Update BaseImage for use as WK