From 499053708231a5cbe11d3399c4aa66869284cb61 Mon Sep 17 00:00:00 2001 From: 2Shirt <2xShirt@gmail.com> Date: Mon, 2 Dec 2019 20:11:02 -0700 Subject: [PATCH] Handle critical temps correctly in mprime sections * Moved ThermalLimitReachedError catches to wk.hw.sensors * Before they would never be caught and would never stop the script * Added cpu_reached_critical_temp() to wk.hw.sensors * This allows us to check if it happened without exceptions * Added thermal_action to wk.hw.sensors * This is run when ThermalLimitReachedError(s) are caught * Stop print_countdown if mprime is terminated * This is required since it may be killed in the background --- scripts/wk/hw/diags.py | 70 +++++++++++++++++++++++++++------------- scripts/wk/hw/sensors.py | 41 +++++++++++++++++++---- 2 files changed, 81 insertions(+), 30 deletions(-) diff --git a/scripts/wk/hw/diags.py b/scripts/wk/hw/diags.py index 41d784f7..602d3d16 100644 --- a/scripts/wk/hw/diags.py +++ b/scripts/wk/hw/diags.py @@ -68,15 +68,15 @@ MENU_TOGGLES = ( 'Skip USB Benchmarks', ) STATUS_COLORS = { + 'Passed': 'GREEN', 'Aborted': 'YELLOW', - 'Denied': 'RED', - 'ERROR': 'RED', - 'FAIL': 'RED', 'N/A': 'YELLOW', - 'PASS': 'GREEN', - 'TimedOut': 'RED', 'Unknown': 'YELLOW', 'Working': 'YELLOW', + 'Denied': 'RED', + 'ERROR': 'RED', + 'Failed': 'RED', + 'TimedOut': 'RED', } WK_LABEL_REGEX = re.compile( fr'{cfg.main.KIT_NAME_SHORT}_(LINUX|UFD)', @@ -417,6 +417,7 @@ def check_mprime_results(test_obj, working_dir): def cpu_mprime_test(state, test_objects): """CPU & cooling check using Prime95.""" LOG.info('CPU Test (Prime95)') + aborted = False prime_log = pathlib.Path(f'{state.log_dir}/prime.log') sensors_out = pathlib.Path(f'{state.log_dir}/sensors.out') test_mprime_obj, test_cooling_obj = test_objects @@ -432,7 +433,10 @@ def cpu_mprime_test(state, test_objects): # Start sensors monitor sensors = hw_sensors.Sensors() - sensors.start_background_monitor(sensors_out) + sensors.start_background_monitor( + sensors_out, + thermal_action=('killall', 'mprime'), + ) # Create monitor and worker panes state.update_progress_pane() @@ -450,28 +454,27 @@ def cpu_mprime_test(state, test_objects): # Stress CPU std.print_info('Starting stress test') - std.print_warning('If running too hot, press CTRL+c to abort the test') set_apple_fan_speed('max') - proc_mprime = start_mprime_thread(state.log_dir, prime_log) + proc_mprime = start_mprime(state.log_dir, prime_log) # Show countdown + print('') try: - print_countdown(seconds=cfg.hw.CPU_TEST_MINUTES*60) + print_countdown(proc=proc_mprime, seconds=cfg.hw.CPU_TEST_MINUTES*60) except KeyboardInterrupt: - test_cooling_obj.set_status('Aborted') - test_mprime_obj.set_status('Aborted') - except hw_sensors.ThermalLimitReachedError: - test_mprime_obj.set_status('Aborted') + aborted = True # Stop Prime95 - proc_mprime.terminate() - try: - proc_mprime.wait(timeout=5) - except subprocess.TimeoutExpired: - proc_mprime.kill() - set_apple_fan_speed('auto') + stop_mprime(proc_mprime) + + # Update progress if necessary + if sensors.cpu_reached_critical_temp() or aborted: + test_cooling_obj.set_status('Aborted') + test_mprime_obj.set_status('Aborted') + state.update_progress_pane() # Get cooldown temp + std.clear_screen() std.print_standard('Letting CPU cooldown...') std.sleep(5) std.print_standard('Saving cooldown temps...') @@ -705,8 +708,8 @@ def network_test(): std.pause('Press Enter to return to main menu...') -def print_countdown(seconds): - """Print countdown to screen.""" +def print_countdown(proc, seconds): + """Print countdown to screen while proc is alive.""" for i in range(seconds): sec_left = (seconds - i) % 60 min_left = int((seconds - i) / 60) @@ -718,7 +721,17 @@ def print_countdown(seconds): out_str += ' remaining' print(f'{out_str:<42}', end='', flush=True) - std.sleep(1) + try: + proc.wait(1) + except KeyboardInterrupt: + # Stop countdown + break + except subprocess.TimeoutExpired: + # proc still going, continue + pass + if proc.poll() is not None: + # proc exited, stop countdown + break # Done print('') @@ -835,8 +848,9 @@ def show_results(state): std.print_standard(' ') -def start_mprime_thread(working_dir, log_path): +def start_mprime(working_dir, log_path): """Start mprime and save filtered output to log, returns Popen object.""" + set_apple_fan_speed('max') proc_mprime = subprocess.Popen( ['mprime', '-t'], cwd=working_dir, @@ -859,5 +873,15 @@ def start_mprime_thread(working_dir, log_path): return proc_mprime +def stop_mprime(proc): + """Stop mprime gracefully, then forcefully as needed.""" + proc_mprime.terminate() + try: + proc_mprime.wait(timeout=5) + except subprocess.TimeoutExpired: + proc_mprime.kill() + set_apple_fan_speed('auto') + + if __name__ == '__main__': print("This file is not meant to be called directly.") diff --git a/scripts/wk/hw/sensors.py b/scripts/wk/hw/sensors.py index 763f14fa..dcf304be 100644 --- a/scripts/wk/hw/sensors.py +++ b/scripts/wk/hw/sensors.py @@ -64,6 +64,22 @@ class Sensors(): # Done return max_temp + def cpu_reached_critical_temp(self): + """Check if CPU reached CPU_CRITICAL_TEMP, returns bool.""" + for section, adapters in self.data.items(): + if not section.startswith('CPU'): + # Limit to CPU temps + continue + + # Ugly section + for sources in adapters.values(): + for source_data in sources.values(): + if source_data.get('Max', -1) >= CPU_CRITICAL_TEMP: + return True + + # Didn't return above so temps are within the threshold + return False + def generate_report(self, *temp_labels, colored=True, only_cpu=False): """Generate report based on given temp_labels, returns list.""" report = [] @@ -72,7 +88,7 @@ class Sensors(): if only_cpu and not section.startswith('CPU'): continue - # Ugly section + # Ugly section for adapter, sources in sorted(adapters.items()): report.append(fix_sensor_name(adapter)) for source, source_data in sorted(sources.items()): @@ -99,15 +115,22 @@ class Sensors(): # Done return report - def monitor_to_file(self, out_path, temp_labels=None): - """Write report to path every second until stopped.""" + def monitor_to_file(self, out_path, temp_labels=None, thermal_action=None): + """Write report to path every second until stopped. + + thermal_action is a cmd to run if ThermalLimitReachedError is caught. + """ stop_path = pathlib.Path(out_path).resolve().with_suffix('.stop') if not temp_labels: temp_labels = ('Current', 'Max') # Start loop while True: - self.update_sensor_data() + try: + self.update_sensor_data() + except ThermalLimitReachedError: + if thermal_action: + run_program(thermal_action, check=False) report = self.generate_report(*temp_labels) with open(out_path, 'w') as _f: _f.write('\n'.join(report)) @@ -136,15 +159,19 @@ class Sensors(): temps = source_data['Temps'] source_data[temp_label] = sum(temps) / len(temps) - def start_background_monitor(self, out_path, temp_labels=None): - """Start background thread to save report to file.""" + def start_background_monitor( + self, out_path, temp_labels=None, thermal_action=None): + """Start background thread to save report to file. + + thermal_action is a cmd to run if ThermalLimitReachedError is caught. + """ if self.background_thread: raise RuntimeError('Background thread already running') self.out_path = pathlib.Path(out_path) self.background_thread = start_thread( self.monitor_to_file, - args=(out_path, temp_labels), + args=(out_path, temp_labels, thermal_action), ) def stop_background_monitor(self):