Handle critical temps correctly in mprime sections

* Moved ThermalLimitReachedError catches to wk.hw.sensors
  * Before they would never be caught and would never stop the script
* Added cpu_reached_critical_temp() to wk.hw.sensors
  * This allows us to check if it happened without exceptions
* Added thermal_action to wk.hw.sensors
  * This is run when ThermalLimitReachedError(s) are caught
* Stop print_countdown if mprime is terminated
  * This is required since it may be killed in the background
This commit is contained in:
2Shirt 2019-12-02 20:11:02 -07:00
parent c520b5a865
commit 4990537082
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C
2 changed files with 81 additions and 30 deletions

View file

@ -68,15 +68,15 @@ MENU_TOGGLES = (
'Skip USB Benchmarks', 'Skip USB Benchmarks',
) )
STATUS_COLORS = { STATUS_COLORS = {
'Passed': 'GREEN',
'Aborted': 'YELLOW', 'Aborted': 'YELLOW',
'Denied': 'RED',
'ERROR': 'RED',
'FAIL': 'RED',
'N/A': 'YELLOW', 'N/A': 'YELLOW',
'PASS': 'GREEN',
'TimedOut': 'RED',
'Unknown': 'YELLOW', 'Unknown': 'YELLOW',
'Working': 'YELLOW', 'Working': 'YELLOW',
'Denied': 'RED',
'ERROR': 'RED',
'Failed': 'RED',
'TimedOut': 'RED',
} }
WK_LABEL_REGEX = re.compile( WK_LABEL_REGEX = re.compile(
fr'{cfg.main.KIT_NAME_SHORT}_(LINUX|UFD)', fr'{cfg.main.KIT_NAME_SHORT}_(LINUX|UFD)',
@ -417,6 +417,7 @@ def check_mprime_results(test_obj, working_dir):
def cpu_mprime_test(state, test_objects): def cpu_mprime_test(state, test_objects):
"""CPU & cooling check using Prime95.""" """CPU & cooling check using Prime95."""
LOG.info('CPU Test (Prime95)') LOG.info('CPU Test (Prime95)')
aborted = False
prime_log = pathlib.Path(f'{state.log_dir}/prime.log') prime_log = pathlib.Path(f'{state.log_dir}/prime.log')
sensors_out = pathlib.Path(f'{state.log_dir}/sensors.out') sensors_out = pathlib.Path(f'{state.log_dir}/sensors.out')
test_mprime_obj, test_cooling_obj = test_objects test_mprime_obj, test_cooling_obj = test_objects
@ -432,7 +433,10 @@ def cpu_mprime_test(state, test_objects):
# Start sensors monitor # Start sensors monitor
sensors = hw_sensors.Sensors() sensors = hw_sensors.Sensors()
sensors.start_background_monitor(sensors_out) sensors.start_background_monitor(
sensors_out,
thermal_action=('killall', 'mprime'),
)
# Create monitor and worker panes # Create monitor and worker panes
state.update_progress_pane() state.update_progress_pane()
@ -450,28 +454,27 @@ def cpu_mprime_test(state, test_objects):
# Stress CPU # Stress CPU
std.print_info('Starting stress test') std.print_info('Starting stress test')
std.print_warning('If running too hot, press CTRL+c to abort the test')
set_apple_fan_speed('max') set_apple_fan_speed('max')
proc_mprime = start_mprime_thread(state.log_dir, prime_log) proc_mprime = start_mprime(state.log_dir, prime_log)
# Show countdown # Show countdown
print('')
try: try:
print_countdown(seconds=cfg.hw.CPU_TEST_MINUTES*60) print_countdown(proc=proc_mprime, seconds=cfg.hw.CPU_TEST_MINUTES*60)
except KeyboardInterrupt: except KeyboardInterrupt:
test_cooling_obj.set_status('Aborted') aborted = True
test_mprime_obj.set_status('Aborted')
except hw_sensors.ThermalLimitReachedError:
test_mprime_obj.set_status('Aborted')
# Stop Prime95 # Stop Prime95
proc_mprime.terminate() stop_mprime(proc_mprime)
try:
proc_mprime.wait(timeout=5) # Update progress if necessary
except subprocess.TimeoutExpired: if sensors.cpu_reached_critical_temp() or aborted:
proc_mprime.kill() test_cooling_obj.set_status('Aborted')
set_apple_fan_speed('auto') test_mprime_obj.set_status('Aborted')
state.update_progress_pane()
# Get cooldown temp # Get cooldown temp
std.clear_screen()
std.print_standard('Letting CPU cooldown...') std.print_standard('Letting CPU cooldown...')
std.sleep(5) std.sleep(5)
std.print_standard('Saving cooldown temps...') std.print_standard('Saving cooldown temps...')
@ -705,8 +708,8 @@ def network_test():
std.pause('Press Enter to return to main menu...') std.pause('Press Enter to return to main menu...')
def print_countdown(seconds): def print_countdown(proc, seconds):
"""Print countdown to screen.""" """Print countdown to screen while proc is alive."""
for i in range(seconds): for i in range(seconds):
sec_left = (seconds - i) % 60 sec_left = (seconds - i) % 60
min_left = int((seconds - i) / 60) min_left = int((seconds - i) / 60)
@ -718,7 +721,17 @@ def print_countdown(seconds):
out_str += ' remaining' out_str += ' remaining'
print(f'{out_str:<42}', end='', flush=True) print(f'{out_str:<42}', end='', flush=True)
std.sleep(1) try:
proc.wait(1)
except KeyboardInterrupt:
# Stop countdown
break
except subprocess.TimeoutExpired:
# proc still going, continue
pass
if proc.poll() is not None:
# proc exited, stop countdown
break
# Done # Done
print('') print('')
@ -835,8 +848,9 @@ def show_results(state):
std.print_standard(' ') std.print_standard(' ')
def start_mprime_thread(working_dir, log_path): def start_mprime(working_dir, log_path):
"""Start mprime and save filtered output to log, returns Popen object.""" """Start mprime and save filtered output to log, returns Popen object."""
set_apple_fan_speed('max')
proc_mprime = subprocess.Popen( proc_mprime = subprocess.Popen(
['mprime', '-t'], ['mprime', '-t'],
cwd=working_dir, cwd=working_dir,
@ -859,5 +873,15 @@ def start_mprime_thread(working_dir, log_path):
return proc_mprime return proc_mprime
def stop_mprime(proc):
    """Stop mprime gracefully, then forcefully as needed.

    proc is the Popen object for the running mprime process. It is asked
    to terminate and given five seconds to exit; if it has not exited by
    then it is killed. The Apple fan speed is set back to 'auto' afterwards.
    """
    # NOTE: Use the proc argument here; proc_mprime is not defined in
    #       this function and referencing it would raise a NameError.
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Did not exit in time, force it
        proc.kill()
    set_apple_fan_speed('auto')
if __name__ == '__main__': if __name__ == '__main__':
print("This file is not meant to be called directly.") print("This file is not meant to be called directly.")

View file

@ -64,6 +64,22 @@ class Sensors():
# Done # Done
return max_temp return max_temp
def cpu_reached_critical_temp(self):
    """Report whether any CPU source's 'Max' hit CPU_CRITICAL_TEMP.

    Only sections of self.data whose name starts with 'CPU' are checked.
    A source without a 'Max' entry is treated as -1. Returns bool.
    """
    # Limit to CPU temps
    cpu_adapter_groups = (
        adapters
        for section, adapters in self.data.items()
        if section.startswith('CPU')
    )

    # True as soon as one source is at/over the threshold, else False
    return any(
        source_data.get('Max', -1) >= CPU_CRITICAL_TEMP
        for adapters in cpu_adapter_groups
        for sources in adapters.values()
        for source_data in sources.values()
    )
def generate_report(self, *temp_labels, colored=True, only_cpu=False): def generate_report(self, *temp_labels, colored=True, only_cpu=False):
"""Generate report based on given temp_labels, returns list.""" """Generate report based on given temp_labels, returns list."""
report = [] report = []
@ -72,7 +88,7 @@ class Sensors():
if only_cpu and not section.startswith('CPU'): if only_cpu and not section.startswith('CPU'):
continue continue
# Ugly section # Ugly section
for adapter, sources in sorted(adapters.items()): for adapter, sources in sorted(adapters.items()):
report.append(fix_sensor_name(adapter)) report.append(fix_sensor_name(adapter))
for source, source_data in sorted(sources.items()): for source, source_data in sorted(sources.items()):
@ -99,15 +115,22 @@ class Sensors():
# Done # Done
return report return report
def monitor_to_file(self, out_path, temp_labels=None): def monitor_to_file(self, out_path, temp_labels=None, thermal_action=None):
"""Write report to path every second until stopped.""" """Write report to path every second until stopped.
thermal_action is a cmd to run if ThermalLimitReachedError is caught.
"""
stop_path = pathlib.Path(out_path).resolve().with_suffix('.stop') stop_path = pathlib.Path(out_path).resolve().with_suffix('.stop')
if not temp_labels: if not temp_labels:
temp_labels = ('Current', 'Max') temp_labels = ('Current', 'Max')
# Start loop # Start loop
while True: while True:
self.update_sensor_data() try:
self.update_sensor_data()
except ThermalLimitReachedError:
if thermal_action:
run_program(thermal_action, check=False)
report = self.generate_report(*temp_labels) report = self.generate_report(*temp_labels)
with open(out_path, 'w') as _f: with open(out_path, 'w') as _f:
_f.write('\n'.join(report)) _f.write('\n'.join(report))
@ -136,15 +159,19 @@ class Sensors():
temps = source_data['Temps'] temps = source_data['Temps']
source_data[temp_label] = sum(temps) / len(temps) source_data[temp_label] = sum(temps) / len(temps)
def start_background_monitor(self, out_path, temp_labels=None): def start_background_monitor(
"""Start background thread to save report to file.""" self, out_path, temp_labels=None, thermal_action=None):
"""Start background thread to save report to file.
thermal_action is a cmd to run if ThermalLimitReachedError is caught.
"""
if self.background_thread: if self.background_thread:
raise RuntimeError('Background thread already running') raise RuntimeError('Background thread already running')
self.out_path = pathlib.Path(out_path) self.out_path = pathlib.Path(out_path)
self.background_thread = start_thread( self.background_thread = start_thread(
self.monitor_to_file, self.monitor_to_file,
args=(out_path, temp_labels), args=(out_path, temp_labels, thermal_action),
) )
def stop_background_monitor(self): def stop_background_monitor(self):