Refactor disk safety checks
This commit is contained in:
parent
95cd3b645d
commit
2585ed584c
3 changed files with 75 additions and 90 deletions
|
|
@ -29,8 +29,8 @@ from wk.cfg.ddrescue import (
|
|||
)
|
||||
from wk.hw import disk as hw_disk
|
||||
from wk.hw.smart import (
|
||||
CriticalHardwareError,
|
||||
safety_checks,
|
||||
check_attributes,
|
||||
smart_status_ok,
|
||||
update_smart_details,
|
||||
)
|
||||
|
||||
|
|
@ -951,13 +951,23 @@ class State():
|
|||
|
||||
def safety_check_destination(self):
    """Run safety checks for destination and abort if necessary.

    A failed SMART status check on the destination is reported as an error
    and aborts; attribute problems are reported as a warning only.

    Raises:
        std.GenericAbort: if smart_status_ok() reports a problem for the
            destination.
    """
    errors_detected = False

    # Check for critical errors
    if not smart_status_ok(self.destination):
        # Record the failure so the abort below is actually reached; without
        # this the flag stays False and critical errors are only printed.
        errors_detected = True
        std.print_error(
            f'Critical error(s) detected for: {self.destination.path}',
        )

    # Check for minor errors (warn only, these do not block)
    if not check_attributes(self.destination, only_blocking=False):
        std.print_warning(
            f'Attribute error(s) detected for: {self.destination.path}',
        )

    # Done
    if errors_detected:
        raise std.GenericAbort()
|
||||
|
||||
def safety_check_size(self):
|
||||
"""Run size safety check and abort if necessary."""
|
||||
|
|
@ -1494,11 +1504,13 @@ def check_destination_health(destination):
|
|||
# Return empty string
|
||||
return result
|
||||
|
||||
# Run safety checks
|
||||
try:
|
||||
safety_checks(destination)
|
||||
except CriticalHardwareError:
|
||||
result = 'Critical hardware error detected on destination'
|
||||
# Check for critical errors
|
||||
if not smart_status_ok(destination):
|
||||
result = 'Critical error(s) detected for: {destination.path}'
|
||||
|
||||
# Check for minor errors
|
||||
if not check_attributes(destination, only_blocking=False):
|
||||
result = f'Attribute error(s) detected for: {destination.path}'
|
||||
|
||||
# Done
|
||||
return result
|
||||
|
|
|
|||
|
|
@ -110,61 +110,12 @@ class State():
|
|||
tmux.kill_pane(_id)
|
||||
self.panes.pop(key)
|
||||
|
||||
def disk_safety_checks(self, prep=False) -> None:
|
||||
# pylint: disable=too-many-branches,too-many-statements
|
||||
"""Run disk safety checks."""
|
||||
for disk in self.disks:
|
||||
disable_tests = False
|
||||
|
||||
# Skip already disabled devices
|
||||
if all(test.disabled for test in disk.tests):
|
||||
continue
|
||||
|
||||
try:
|
||||
hw_smart.safety_checks(disk)
|
||||
except hw_smart.CriticalHardwareError:
|
||||
disable_tests = True
|
||||
disk.add_note('Critical hardware error detected.', 'RED')
|
||||
if 'Disk Attributes' in disk.tests:
|
||||
disk.tests['Disk Attributes'].failed = True
|
||||
disk.tests['Disk Attributes'].set_status('Failed')
|
||||
if not prep:
|
||||
# Mid-diag failure detected
|
||||
LOG.warning('Critical hardware error detected during diagnostics')
|
||||
disk.add_note(
|
||||
'Critical hardware error detected during diagnostics',
|
||||
'YELLOW',
|
||||
)
|
||||
else:
|
||||
if (
|
||||
'Disk Attributes' in disk.tests
|
||||
and not disk.tests['Disk Attributes'].failed
|
||||
and not hw_smart.check_attributes(disk, only_blocking=False)
|
||||
):
|
||||
# No blocking errors encountered, but found minor attribute failures
|
||||
if not prep:
|
||||
# Mid-diag failure detected
|
||||
LOG.warning('Attribute(s) failure detected during diagnostics')
|
||||
disk.add_note(
|
||||
'Attribute(s) failure detected during diagnostics',
|
||||
'YELLOW',
|
||||
)
|
||||
disk.tests['Disk Attributes'].failed = True
|
||||
disk.tests['Disk Attributes'].set_status('Failed')
|
||||
|
||||
# Check Surface Scan
|
||||
if (
|
||||
'Disk Surface Scan' in disk.tests
|
||||
and disk.tests['Disk Surface Scan'].failed
|
||||
and 'Disk I/O Benchmark' in disk.tests
|
||||
):
|
||||
# Disable I/O Benchmark test
|
||||
disk.tests['Disk I/O Benchmark'].set_status('Skipped')
|
||||
disk.tests['Disk I/O Benchmark'].disabled = True
|
||||
|
||||
# Disable tests if necessary
|
||||
if disable_tests:
|
||||
disk.disable_disk_tests()
|
||||
def disk_safety_checks(self) -> None:
    """Re-check SMART state mid-run and disable tests on failed disks.

    Every disk in self.disks gets a mid-run SMART status check; if any of
    its tests is then marked as failed, the disk's tests are disabled.
    """
    for disk in self.disks:
        disk_smart_status_check(disk, mid_run=True)

        # Stop at the first failed test; one failure is enough to disable.
        for test in disk.tests:
            if test.failed:
                disk.disable_disk_tests()
                break
|
||||
|
||||
def fix_tmux_layout(self, forced=True) -> None:
|
||||
"""Fix tmux layout based on cfg.hw.TMUX_LAYOUT."""
|
||||
|
|
@ -251,9 +202,6 @@ class State():
|
|||
test_group.test_objects.append(test_obj)
|
||||
self.test_groups.append(test_group)
|
||||
|
||||
# Run safety checks
|
||||
self.disk_safety_checks(prep=True)
|
||||
|
||||
def init_tmux(self) -> None:
|
||||
"""Initialize tmux layout."""
|
||||
tmux.kill_all_panes()
|
||||
|
|
@ -552,18 +500,12 @@ def disk_attribute_check(state, test_objects) -> None:
|
|||
"""Disk attribute check."""
|
||||
LOG.info('Disk Attribute Check')
|
||||
for test in test_objects:
|
||||
disk_smart_status_check(test.dev, mid_run=False)
|
||||
if not test.dev.attributes:
|
||||
# No NVMe/SMART data
|
||||
test.set_status('N/A')
|
||||
continue
|
||||
|
||||
if hw_smart.check_attributes(test.dev):
|
||||
test.passed = True
|
||||
test.set_status('Passed')
|
||||
else:
|
||||
test.failed = True
|
||||
test.set_status('Failed')
|
||||
|
||||
# Done
|
||||
state.update_progress_pane()
|
||||
|
||||
|
|
@ -685,6 +627,43 @@ def disk_self_test(state, test_objects) -> None:
|
|||
raise std.GenericAbort('Aborted')
|
||||
|
||||
|
||||
def disk_smart_status_check(dev, mid_run=True) -> None:
    """Check SMART status and attributes and record the outcome on dev.

    A failed hw_smart.smart_status_ok() check is treated as critical: a RED
    note is added and the disk's tests are disabled. Otherwise a failed
    hw_smart.check_attributes(..., only_blocking=False) check adds a YELLOW
    note. In both cases the message is logged as a warning and the
    'Disk Attributes' test (if present in dev.tests) is marked failed.

    Args:
        dev: Disk object; this function uses its tests, add_note() and
            disable_disk_tests().
        mid_run: When true, " during diagnostics" is appended to the
            logged/noted message.
    """
    msg = None
    color = None
    disable_tests = False

    # Check SMART status and attributes
    if not hw_smart.smart_status_ok(dev):
        msg = 'Critical SMART error detected'
        color = 'RED'
        disable_tests = True
    elif not hw_smart.check_attributes(dev, only_blocking=False):
        # Non-blocking errors
        msg = 'SMART attribute failure(s) detected'
        color = 'YELLOW'

    # Log errors if detected
    if msg:
        msg = f'{msg}{" during diagnostics" if mid_run else ""}.'
        LOG.warning(msg)
        dev.add_note(msg, color)

    # Set Disk Attributes test result
    for test in dev.tests:
        if test.name == 'Disk Attributes':
            # Coerce to bool so the flag never ends up holding the message
            # string (or None) instead of True/False.
            test.failed = bool(test.failed or msg)
            test.passed = not test.failed
            if test.failed:
                test.set_status('Failed')
            elif 'N/A' not in test.status:
                # Leave an existing N/A status untouched
                test.set_status('Passed')

    # Disable further testing if needed
    if disable_tests:
        dev.disable_disk_tests()
|
||||
|
||||
|
||||
def disk_surface_scan(state, test_objects) -> None:
|
||||
"""Read-only disk surface scan using badblocks."""
|
||||
LOG.info('Disk Surface Scan (badblocks)')
|
||||
|
|
@ -889,10 +868,10 @@ def run_diags(state, menu, quick_mode=False) -> None:
|
|||
state.abort_testing()
|
||||
state.update_progress_pane()
|
||||
break
|
||||
|
||||
# Run safety checks
|
||||
if group.name.startswith('Disk'):
|
||||
state.disk_safety_checks()
|
||||
else:
|
||||
# Run safety checks after disk tests
|
||||
if group.name.startswith('Disk'):
|
||||
state.disk_safety_checks()
|
||||
|
||||
# Handle aborts
|
||||
if aborted:
|
||||
|
|
|
|||
|
|
@ -24,11 +24,6 @@ from wk.std import bytes_to_string, color_string, sleep
|
|||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Exception Classes
|
||||
class CriticalHardwareError(RuntimeError):
|
||||
"""Exception used for critical hardware failures."""
|
||||
|
||||
|
||||
# Functions
|
||||
def abort_self_test(dev) -> None:
|
||||
"""Abort currently running non-captive self-test."""
|
||||
|
|
@ -335,8 +330,8 @@ def run_smart_self_test(test_obj, log_path) -> bool:
|
|||
build_self_test_report(test_obj)
|
||||
|
||||
|
||||
def safety_checks(dev) -> None:
|
||||
"""Run safety checks and raise an exception if necessary."""
|
||||
def smart_status_ok(dev) -> bool:
|
||||
"""Check SMART attributes and overall assessment, returns bool."""
|
||||
blocking_event_encountered = False
|
||||
update_smart_details(dev)
|
||||
|
||||
|
|
@ -371,9 +366,8 @@ def safety_checks(dev) -> None:
|
|||
dev.add_note(msg, 'RED')
|
||||
LOG.error('%s %s', dev.path, msg)
|
||||
|
||||
# Raise blocking exception if necessary
|
||||
if blocking_event_encountered:
|
||||
raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
|
||||
# Done
|
||||
return not blocking_event_encountered
|
||||
|
||||
|
||||
def self_test_in_progress(dev) -> bool:
|
||||
|
|
|
|||
Loading…
Reference in a new issue