Refactor disk safety checks

This commit is contained in:
2Shirt 2022-04-18 09:21:29 -07:00
parent 95cd3b645d
commit 2585ed584c
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C
3 changed files with 75 additions and 90 deletions

View file

@ -29,8 +29,8 @@ from wk.cfg.ddrescue import (
)
from wk.hw import disk as hw_disk
from wk.hw.smart import (
CriticalHardwareError,
safety_checks,
check_attributes,
smart_status_ok,
update_smart_details,
)
@ -951,13 +951,23 @@ class State():
def safety_check_destination(self):
    """Run safety checks for destination and abort if necessary.

    A critical result from smart_status_ok() is printed as an error and
    aborts; a failing check_attributes() (non-blocking attributes included)
    is only printed as a warning.

    Raises:
        std.GenericAbort: if a critical error was detected on the
            destination.
    """
    errors_detected = False

    # Check for critical errors
    if not smart_status_ok(self.destination):
        std.print_error(
            f'Critical error(s) detected for: {self.destination.path}',
        )
        # Record the failure; otherwise the abort below can never trigger
        errors_detected = True

    # Check for minor errors (warn only, matching the pre-refactor behavior
    # where only critical hardware errors aborted)
    if not check_attributes(self.destination, only_blocking=False):
        std.print_warning(
            f'Attribute error(s) detected for: {self.destination.path}',
        )

    # Done
    if errors_detected:
        raise std.GenericAbort()
def safety_check_size(self):
"""Run size safety check and abort if necessary."""
@ -1494,11 +1504,13 @@ def check_destination_health(destination):
# Return empty string
return result
# Check for critical errors
if not smart_status_ok(destination):
    # f-string prefix is required so the destination path is interpolated
    result = f'Critical error(s) detected for: {destination.path}'
# Check for minor errors; only reached when nothing critical was found so
# the more severe message is never overwritten
elif not check_attributes(destination, only_blocking=False):
    result = f'Attribute error(s) detected for: {destination.path}'
# Done
return result

View file

@ -110,61 +110,12 @@ class State():
tmux.kill_pane(_id)
self.panes.pop(key)
def disk_safety_checks(self) -> None:
    """Check for mid-run SMART failures and failed test(s).

    For every device in self.disks:
      1. disk_smart_status_check() is run with mid_run=True.
      2. If any test attached to the device is marked as failed,
         dev.disable_disk_tests() is called.
    """
    for dev in self.disks:
        disk_smart_status_check(dev, mid_run=True)
        if any(test.failed for test in dev.tests):
            dev.disable_disk_tests()
def fix_tmux_layout(self, forced=True) -> None:
"""Fix tmux layout based on cfg.hw.TMUX_LAYOUT."""
@ -251,9 +202,6 @@ class State():
test_group.test_objects.append(test_obj)
self.test_groups.append(test_group)
# Run safety checks
self.disk_safety_checks(prep=True)
def init_tmux(self) -> None:
"""Initialize tmux layout."""
tmux.kill_all_panes()
@ -552,18 +500,12 @@ def disk_attribute_check(state, test_objects) -> None:
"""Disk attribute check."""
LOG.info('Disk Attribute Check')
for test in test_objects:
disk_smart_status_check(test.dev, mid_run=False)
if not test.dev.attributes:
# No NVMe/SMART data
test.set_status('N/A')
continue
if hw_smart.check_attributes(test.dev):
test.passed = True
test.set_status('Passed')
else:
test.failed = True
test.set_status('Failed')
# Done
state.update_progress_pane()
@ -685,6 +627,43 @@ def disk_self_test(state, test_objects) -> None:
raise std.GenericAbort('Aborted')
def disk_smart_status_check(dev, mid_run=True) -> None:
    """Check SMART status and update the 'Disk Attributes' test(s) for dev.

    Args:
        dev: Disk object; this function uses dev.tests, dev.add_note() and
            dev.disable_disk_tests().
        mid_run: If True, ' during diagnostics' is appended to the message
            that is logged and added to dev as a note.

    A failing hw_smart.smart_status_ok() is treated as critical (RED note
    and dev.disable_disk_tests() is called). Otherwise a failing
    hw_smart.check_attributes(only_blocking=False) is reported as a
    YELLOW note.
    """
    msg = None
    color = None
    disable_tests = False

    # Check SMART status and attributes
    if not hw_smart.smart_status_ok(dev):
        msg = 'Critical SMART error detected'
        color = 'RED'
        disable_tests = True
    elif not hw_smart.check_attributes(dev, only_blocking=False):
        # Non-blocking errors
        msg = 'SMART attribute failure(s) detected'
        color = 'YELLOW'

    # Log errors if detected
    if msg:
        msg = f'{msg}{" during diagnostics" if mid_run else ""}.'
        LOG.warning(msg)
        dev.add_note(msg, color)

    # Set Disk Attributes test result
    for test in dev.tests:
        if test.name == 'Disk Attributes':
            # Keep an earlier failure sticky, but store a real bool rather
            # than the message string (or None) in the flag
            test.failed = bool(test.failed or msg)
            test.passed = not test.failed
            if test.failed:
                test.set_status('Failed')
            elif 'N/A' not in test.status:
                # Leave an existing N/A status untouched
                test.set_status('Passed')

    # Disable further testing if needed
    if disable_tests:
        dev.disable_disk_tests()
def disk_surface_scan(state, test_objects) -> None:
"""Read-only disk surface scan using badblocks."""
LOG.info('Disk Surface Scan (badblocks)')
@ -889,10 +868,10 @@ def run_diags(state, menu, quick_mode=False) -> None:
state.abort_testing()
state.update_progress_pane()
break
# Run safety checks
if group.name.startswith('Disk'):
state.disk_safety_checks()
else:
# Run safety checks after disk tests
if group.name.startswith('Disk'):
state.disk_safety_checks()
# Handle aborts
if aborted:

View file

@ -24,11 +24,6 @@ from wk.std import bytes_to_string, color_string, sleep
LOG = logging.getLogger(__name__)
# Exception Classes
# Raised by safety_checks() when a blocking event was encountered; the
# callers visible in this change catch it to abort or to disable further
# disk tests.
# NOTE(review): smart_status_ok() returns a bool instead of raising this
# exception -- confirm there are no remaining users before removing it.
class CriticalHardwareError(RuntimeError):
"""Exception used for critical hardware failures."""
# Functions
def abort_self_test(dev) -> None:
"""Abort currently running non-captive self-test."""
@ -335,8 +330,8 @@ def run_smart_self_test(test_obj, log_path) -> bool:
build_self_test_report(test_obj)
def safety_checks(dev) -> None:
"""Run safety checks and raise an exception if necessary."""
def smart_status_ok(dev) -> bool:
"""Check SMART attributes and overall assessment, returns bool."""
blocking_event_encountered = False
update_smart_details(dev)
@ -371,9 +366,8 @@ def safety_checks(dev) -> None:
dev.add_note(msg, 'RED')
LOG.error('%s %s', dev.path, msg)
# Raise blocking exception if necessary
if blocking_event_encountered:
raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
# Done
return not blocking_event_encountered
def self_test_in_progress(dev) -> bool: