diff --git a/scripts/wk/clone/ddrescue.py b/scripts/wk/clone/ddrescue.py
index a09b276f..f56854cf 100644
--- a/scripts/wk/clone/ddrescue.py
+++ b/scripts/wk/clone/ddrescue.py
@@ -29,8 +29,8 @@ from wk.cfg.ddrescue import (
   )
 from wk.hw import disk as hw_disk
 from wk.hw.smart import (
-  CriticalHardwareError,
-  safety_checks,
+  check_attributes,
+  smart_status_ok,
   update_smart_details,
   )
 
@@ -951,13 +951,25 @@ class State():
 
   def safety_check_destination(self):
     """Run safety checks for destination and abort if necessary."""
-    try:
-      safety_checks(self.destination)
-    except CriticalHardwareError as err:
+    errors_detected = False
+
+    # Check for critical errors
+    if not smart_status_ok(self.destination):
       std.print_error(
         f'Critical error(s) detected for: {self.destination.path}',
         )
-      raise std.GenericAbort() from err
+      # Critical errors must abort, as the removed try/except did
+      errors_detected = True
+
+    # Check for minor errors (warning only, does not abort)
+    if not check_attributes(self.destination, only_blocking=False):
+      std.print_warning(
+        f'Attribute error(s) detected for: {self.destination.path}',
+        )
+
+    # Done
+    if errors_detected:
+      raise std.GenericAbort()
 
   def safety_check_size(self):
     """Run size safety check and abort if necessary."""
@@ -1494,11 +1506,13 @@ def check_destination_health(destination):
     # Return empty string
     return result
 
-  # Run safety checks
-  try:
-    safety_checks(destination)
-  except CriticalHardwareError:
-    result = 'Critical hardware error detected on destination'
+  # Check for critical errors
+  if not smart_status_ok(destination):
+    result = f'Critical error(s) detected for: {destination.path}'
+
+  # Check for minor errors (a critical error takes precedence)
+  elif not check_attributes(destination, only_blocking=False):
+    result = f'Attribute error(s) detected for: {destination.path}'
 
   # Done
   return result
diff --git a/scripts/wk/hw/diags.py b/scripts/wk/hw/diags.py
index 6da145c8..1307190a 100644
--- a/scripts/wk/hw/diags.py
+++ b/scripts/wk/hw/diags.py
@@ -110,61 +110,12 @@ class State():
         tmux.kill_pane(_id)
       self.panes.pop(key)
 
-  def disk_safety_checks(self, prep=False) -> None:
-    # pylint: disable=too-many-branches,too-many-statements
-    """Run disk safety checks."""
-    for disk in self.disks:
-      disable_tests = False
-
-      # Skip already disabled devices
-      if all(test.disabled for test in disk.tests):
-        continue
-
-      try:
-        hw_smart.safety_checks(disk)
-      except hw_smart.CriticalHardwareError:
-        disable_tests = True
-        disk.add_note('Critical hardware error detected.', 'RED')
-        if 'Disk Attributes' in disk.tests:
-          disk.tests['Disk Attributes'].failed = True
-          disk.tests['Disk Attributes'].set_status('Failed')
-        if not prep:
-          # Mid-diag failure detected
-          LOG.warning('Critical hardware error detected during diagnostics')
-          disk.add_note(
-            'Critical hardware error detected during diagnostics',
-            'YELLOW',
-            )
-      else:
-        if (
-            'Disk Attributes' in disk.tests
-            and not disk.tests['Disk Attributes'].failed
-            and not hw_smart.check_attributes(disk, only_blocking=False)
-          ):
-          # No blocking errors encountered, but found minor attribute failures
-          if not prep:
-            # Mid-diag failure detected
-            LOG.warning('Attribute(s) failure detected during diagnostics')
-            disk.add_note(
-              'Attribute(s) failure detected during diagnostics',
-              'YELLOW',
-              )
-          disk.tests['Disk Attributes'].failed = True
-          disk.tests['Disk Attributes'].set_status('Failed')
-
-      # Check Surface Scan
-      if (
-          'Disk Surface Scan' in disk.tests
-          and disk.tests['Disk Surface Scan'].failed
-          and 'Disk I/O Benchmark' in disk.tests
-        ):
-        # Disable I/O Benchmark test
-        disk.tests['Disk I/O Benchmark'].set_status('Skipped')
-        disk.tests['Disk I/O Benchmark'].disabled = True
-
-      # Disable tests if necessary
-      if disable_tests:
-        disk.disable_disk_tests()
+  def disk_safety_checks(self) -> None:
+    """Check for mid-run SMART failures and failed test(s)."""
+    for dev in self.disks:
+      disk_smart_status_check(dev, mid_run=True)
+      if any(test.failed for test in dev.tests):
+        dev.disable_disk_tests()
 
   def fix_tmux_layout(self, forced=True) -> None:
     """Fix tmux layout based on cfg.hw.TMUX_LAYOUT."""
@@ -251,9 +202,6 @@ class State():
         test_group.test_objects.append(test_obj)
       self.test_groups.append(test_group)
 
-    # Run safety checks
-    self.disk_safety_checks(prep=True)
-
   def init_tmux(self) -> None:
     """Initialize tmux layout."""
     tmux.kill_all_panes()
@@ -552,18 +500,12 @@ def disk_attribute_check(state, test_objects) -> None:
   """Disk attribute check."""
   LOG.info('Disk Attribute Check')
   for test in test_objects:
+    disk_smart_status_check(test.dev, mid_run=False)
     if not test.dev.attributes:
       # No NVMe/SMART data
       test.set_status('N/A')
       continue
 
-    if hw_smart.check_attributes(test.dev):
-      test.passed = True
-      test.set_status('Passed')
-    else:
-      test.failed = True
-      test.set_status('Failed')
-
   # Done
   state.update_progress_pane()
 
@@ -685,6 +627,51 @@ def disk_self_test(state, test_objects) -> None:
     raise std.GenericAbort('Aborted')
 
 
+def disk_smart_status_check(dev, mid_run=True) -> None:
+  """Check SMART status and update the Disk Attributes test result.
+
+  A failed hw_smart.smart_status_ok() check adds a RED note and calls
+  dev.disable_disk_tests(); a failed non-blocking attribute check only
+  adds a YELLOW note. Either failure marks the Disk Attributes test as
+  failed. If mid_run is True ' during diagnostics' is appended to the
+  logged message.
+  """
+  msg = None
+  color = None
+  disable_tests = False
+
+  # Check SMART status and attributes
+  if not hw_smart.smart_status_ok(dev):
+    msg = 'Critical SMART error detected'
+    color = 'RED'
+    disable_tests = True
+  elif not hw_smart.check_attributes(dev, only_blocking=False):
+    # Non-blocking errors
+    msg = 'SMART attribute failure(s) detected'
+    color = 'YELLOW'
+
+  # Log errors if detected
+  if msg:
+    msg = f'{msg}{" during diagnostics" if mid_run else ""}.'
+    LOG.warning(msg)
+    dev.add_note(msg, color)
+
+  # Set Disk Attributes test result
+  for test in dev.tests:
+    if test.name == 'Disk Attributes':
+      # Keep failed a real bool (msg is a str or None)
+      test.failed = test.failed or bool(msg)
+      test.passed = not test.failed
+      if test.failed:
+        test.set_status('Failed')
+      elif 'N/A' not in test.status:
+        test.set_status('Passed')
+
+  # Disable further testing if needed
+  if disable_tests:
+    dev.disable_disk_tests()
+
+
 def disk_surface_scan(state, test_objects) -> None:
   """Read-only disk surface scan using badblocks."""
   LOG.info('Disk Surface Scan (badblocks)')
@@ -889,10 +876,10 @@ def run_diags(state, menu, quick_mode=False) -> None:
         state.abort_testing()
         state.update_progress_pane()
         break
-
-      # Run safety checks
-      if group.name.startswith('Disk'):
-        state.disk_safety_checks()
+      else:
+        # Run safety checks after disk tests
+        if group.name.startswith('Disk'):
+          state.disk_safety_checks()
 
     # Handle aborts
     if aborted:
diff --git a/scripts/wk/hw/smart.py b/scripts/wk/hw/smart.py
index 0b78af18..f4292b06 100644
--- a/scripts/wk/hw/smart.py
+++ b/scripts/wk/hw/smart.py
@@ -24,11 +24,6 @@ from wk.std import bytes_to_string, color_string, sleep
 LOG = logging.getLogger(__name__)
 
 
-# Exception Classes
-class CriticalHardwareError(RuntimeError):
-  """Exception used for critical hardware failures."""
-
-
 # Functions
 def abort_self_test(dev) -> None:
   """Abort currently running non-captive self-test."""
@@ -335,8 +330,8 @@ def run_smart_self_test(test_obj, log_path) -> bool:
   build_self_test_report(test_obj)
 
 
-def safety_checks(dev) -> None:
-  """Run safety checks and raise an exception if necessary."""
+def smart_status_ok(dev) -> bool:
+  """Check SMART attributes and overall assessment, returns bool."""
   blocking_event_encountered = False
   update_smart_details(dev)
 
@@ -371,9 +366,8 @@ def safety_checks(dev) -> None:
     dev.add_note(msg, 'RED')
     LOG.error('%s %s', dev.path, msg)
 
-  # Raise blocking exception if necessary
-  if blocking_event_encountered:
-    raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
+  # Done
+  return not blocking_event_encountered
 
 
 def self_test_in_progress(dev) -> bool: