Refactor disk safety checks

This commit is contained in:
2Shirt 2022-04-18 09:21:29 -07:00
parent 95cd3b645d
commit 2585ed584c
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C
3 changed files with 75 additions and 90 deletions

View file

@ -29,8 +29,8 @@ from wk.cfg.ddrescue import (
) )
from wk.hw import disk as hw_disk from wk.hw import disk as hw_disk
from wk.hw.smart import ( from wk.hw.smart import (
CriticalHardwareError, check_attributes,
safety_checks, smart_status_ok,
update_smart_details, update_smart_details,
) )
@ -951,13 +951,23 @@ class State():
def safety_check_destination(self): def safety_check_destination(self):
"""Run safety checks for destination and abort if necessary.""" """Run safety checks for destination and abort if necessary."""
try: errors_detected = False
safety_checks(self.destination)
except CriticalHardwareError as err: # Check for critical errors
if not smart_status_ok(self.destination):
std.print_error( std.print_error(
f'Critical error(s) detected for: {self.destination.path}', f'Critical error(s) detected for: {self.destination.path}',
) )
raise std.GenericAbort() from err
# Check for minor errors
if not check_attributes(self.destination, only_blocking=False):
std.print_warning(
f'Attribute error(s) detected for: {self.destination.path}',
)
# Done
if errors_detected:
raise std.GenericAbort()
def safety_check_size(self): def safety_check_size(self):
"""Run size safety check and abort if necessary.""" """Run size safety check and abort if necessary."""
@ -1494,11 +1504,13 @@ def check_destination_health(destination):
# Return empty string # Return empty string
return result return result
# Run safety checks # Check for critical errors
try: if not smart_status_ok(destination):
safety_checks(destination) result = 'Critical error(s) detected for: {destination.path}'
except CriticalHardwareError:
result = 'Critical hardware error detected on destination' # Check for minor errors
if not check_attributes(destination, only_blocking=False):
result = f'Attribute error(s) detected for: {destination.path}'
# Done # Done
return result return result

View file

@ -110,61 +110,12 @@ class State():
tmux.kill_pane(_id) tmux.kill_pane(_id)
self.panes.pop(key) self.panes.pop(key)
def disk_safety_checks(self, prep=False) -> None: def disk_safety_checks(self) -> None:
# pylint: disable=too-many-branches,too-many-statements """Check for mid-run SMART failures and failed test(s)."""
"""Run disk safety checks.""" for dev in self.disks:
for disk in self.disks: disk_smart_status_check(dev, mid_run=True)
disable_tests = False if any(test.failed for test in dev.tests):
dev.disable_disk_tests()
# Skip already disabled devices
if all(test.disabled for test in disk.tests):
continue
try:
hw_smart.safety_checks(disk)
except hw_smart.CriticalHardwareError:
disable_tests = True
disk.add_note('Critical hardware error detected.', 'RED')
if 'Disk Attributes' in disk.tests:
disk.tests['Disk Attributes'].failed = True
disk.tests['Disk Attributes'].set_status('Failed')
if not prep:
# Mid-diag failure detected
LOG.warning('Critical hardware error detected during diagnostics')
disk.add_note(
'Critical hardware error detected during diagnostics',
'YELLOW',
)
else:
if (
'Disk Attributes' in disk.tests
and not disk.tests['Disk Attributes'].failed
and not hw_smart.check_attributes(disk, only_blocking=False)
):
# No blocking errors encountered, but found minor attribute failures
if not prep:
# Mid-diag failure detected
LOG.warning('Attribute(s) failure detected during diagnostics')
disk.add_note(
'Attribute(s) failure detected during diagnostics',
'YELLOW',
)
disk.tests['Disk Attributes'].failed = True
disk.tests['Disk Attributes'].set_status('Failed')
# Check Surface Scan
if (
'Disk Surface Scan' in disk.tests
and disk.tests['Disk Surface Scan'].failed
and 'Disk I/O Benchmark' in disk.tests
):
# Disable I/O Benchmark test
disk.tests['Disk I/O Benchmark'].set_status('Skipped')
disk.tests['Disk I/O Benchmark'].disabled = True
# Disable tests if necessary
if disable_tests:
disk.disable_disk_tests()
def fix_tmux_layout(self, forced=True) -> None: def fix_tmux_layout(self, forced=True) -> None:
"""Fix tmux layout based on cfg.hw.TMUX_LAYOUT.""" """Fix tmux layout based on cfg.hw.TMUX_LAYOUT."""
@ -251,9 +202,6 @@ class State():
test_group.test_objects.append(test_obj) test_group.test_objects.append(test_obj)
self.test_groups.append(test_group) self.test_groups.append(test_group)
# Run safety checks
self.disk_safety_checks(prep=True)
def init_tmux(self) -> None: def init_tmux(self) -> None:
"""Initialize tmux layout.""" """Initialize tmux layout."""
tmux.kill_all_panes() tmux.kill_all_panes()
@ -552,18 +500,12 @@ def disk_attribute_check(state, test_objects) -> None:
"""Disk attribute check.""" """Disk attribute check."""
LOG.info('Disk Attribute Check') LOG.info('Disk Attribute Check')
for test in test_objects: for test in test_objects:
disk_smart_status_check(test.dev, mid_run=False)
if not test.dev.attributes: if not test.dev.attributes:
# No NVMe/SMART data # No NVMe/SMART data
test.set_status('N/A') test.set_status('N/A')
continue continue
if hw_smart.check_attributes(test.dev):
test.passed = True
test.set_status('Passed')
else:
test.failed = True
test.set_status('Failed')
# Done # Done
state.update_progress_pane() state.update_progress_pane()
@ -685,6 +627,43 @@ def disk_self_test(state, test_objects) -> None:
raise std.GenericAbort('Aborted') raise std.GenericAbort('Aborted')
def disk_smart_status_check(dev, mid_run=True) -> None:
    """Check SMART status and attributes, then update the disk's test state.

    A failed hw_smart.smart_status_ok() check is treated as critical: a RED
    note is added and further disk tests are disabled.  Otherwise, a failed
    hw_smart.check_attributes(dev, only_blocking=False) check adds a YELLOW
    note.  In both cases the message is logged as a warning and any test
    named 'Disk Attributes' is marked as failed.

    dev: disk object; this function reads dev.tests and calls
      dev.add_note() and dev.disable_disk_tests().
    mid_run: when True, ' during diagnostics' is appended to the message.
    """
    msg = None
    color = None
    disable_tests = False

    # Check SMART status and attributes
    if not hw_smart.smart_status_ok(dev):
        msg = 'Critical SMART error detected'
        color = 'RED'
        disable_tests = True
    elif not hw_smart.check_attributes(dev, only_blocking=False):
        # Non-blocking errors
        msg = 'SMART attribute failure(s) detected'
        color = 'YELLOW'

    # Log errors if detected
    if msg:
        msg = f'{msg}{" during diagnostics" if mid_run else ""}.'
        LOG.warning(msg)
        dev.add_note(msg, color)

    # Set Disk Attributes test result
    for test in dev.tests:
        if test.name == 'Disk Attributes':
            # Keep the flag a real bool; a previous failure is never cleared
            # and a message string (or None) is never stored in its place.
            test.failed = bool(test.failed or msg)
            test.passed = not test.failed
            if test.failed:
                test.set_status('Failed')
            elif 'N/A' not in test.status:
                # Leave an existing N/A status untouched
                test.set_status('Passed')

    # Disable further testing if needed
    if disable_tests:
        dev.disable_disk_tests()
def disk_surface_scan(state, test_objects) -> None: def disk_surface_scan(state, test_objects) -> None:
"""Read-only disk surface scan using badblocks.""" """Read-only disk surface scan using badblocks."""
LOG.info('Disk Surface Scan (badblocks)') LOG.info('Disk Surface Scan (badblocks)')
@ -889,10 +868,10 @@ def run_diags(state, menu, quick_mode=False) -> None:
state.abort_testing() state.abort_testing()
state.update_progress_pane() state.update_progress_pane()
break break
else:
# Run safety checks # Run safety checks after disk tests
if group.name.startswith('Disk'): if group.name.startswith('Disk'):
state.disk_safety_checks() state.disk_safety_checks()
# Handle aborts # Handle aborts
if aborted: if aborted:

View file

@ -24,11 +24,6 @@ from wk.std import bytes_to_string, color_string, sleep
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
# Exception Classes
class CriticalHardwareError(RuntimeError):
"""Exception used for critical hardware failures."""
# Functions # Functions
def abort_self_test(dev) -> None: def abort_self_test(dev) -> None:
"""Abort currently running non-captive self-test.""" """Abort currently running non-captive self-test."""
@ -335,8 +330,8 @@ def run_smart_self_test(test_obj, log_path) -> bool:
build_self_test_report(test_obj) build_self_test_report(test_obj)
def safety_checks(dev) -> None: def smart_status_ok(dev) -> bool:
"""Run safety checks and raise an exception if necessary.""" """Check SMART attributes and overall assessment, returns bool."""
blocking_event_encountered = False blocking_event_encountered = False
update_smart_details(dev) update_smart_details(dev)
@ -371,9 +366,8 @@ def safety_checks(dev) -> None:
dev.add_note(msg, 'RED') dev.add_note(msg, 'RED')
LOG.error('%s %s', dev.path, msg) LOG.error('%s %s', dev.path, msg)
# Raise blocking exception if necessary # Done
if blocking_event_encountered: return not blocking_event_encountered
raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
def self_test_in_progress(dev) -> bool: def self_test_in_progress(dev) -> bool: