Refactor SMART self-test sections
This commit is contained in:
parent
41b4ffd9fb
commit
af8b2b7dd3
3 changed files with 171 additions and 139 deletions
|
|
@ -30,7 +30,6 @@ from wk.cfg.ddrescue import (
|
||||||
from wk.hw import disk as hw_disk
|
from wk.hw import disk as hw_disk
|
||||||
from wk.hw.smart import (
|
from wk.hw.smart import (
|
||||||
CriticalHardwareError,
|
CriticalHardwareError,
|
||||||
SMARTNotSupportedError,
|
|
||||||
SMARTSelfTestInProgressError,
|
SMARTSelfTestInProgressError,
|
||||||
safety_checks,
|
safety_checks,
|
||||||
update_smart_details,
|
update_smart_details,
|
||||||
|
|
@ -1503,8 +1502,6 @@ def check_destination_health(destination):
|
||||||
result = 'Critical hardware error detected on destination'
|
result = 'Critical hardware error detected on destination'
|
||||||
except SMARTSelfTestInProgressError:
|
except SMARTSelfTestInProgressError:
|
||||||
result = 'SMART self-test in progress on destination'
|
result = 'SMART self-test in progress on destination'
|
||||||
except SMARTNotSupportedError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Done
|
# Done
|
||||||
return result
|
return result
|
||||||
|
|
|
||||||
|
|
@ -701,10 +701,7 @@ def disk_self_test(state, test_objects) -> None:
|
||||||
for test in test_objects:
|
for test in test_objects:
|
||||||
hw_smart.abort_self_test(test.dev)
|
hw_smart.abort_self_test(test.dev)
|
||||||
std.sleep(0.5)
|
std.sleep(0.5)
|
||||||
|
hw_smart.build_self_test_report(test, aborted=True)
|
||||||
# Save report(s)
|
|
||||||
for test in test_objects:
|
|
||||||
hw_smart.check_self_test_results(test, aborted=aborted)
|
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
state.update_progress_pane()
|
state.update_progress_pane()
|
||||||
|
|
|
||||||
|
|
@ -28,9 +28,6 @@ LOG = logging.getLogger(__name__)
|
||||||
class CriticalHardwareError(RuntimeError):
|
class CriticalHardwareError(RuntimeError):
|
||||||
"""Exception used for critical hardware failures."""
|
"""Exception used for critical hardware failures."""
|
||||||
|
|
||||||
class SMARTNotSupportedError(TypeError):
|
|
||||||
"""Exception used for disks lacking SMART support."""
|
|
||||||
|
|
||||||
class SMARTSelfTestInProgressError(RuntimeError):
|
class SMARTSelfTestInProgressError(RuntimeError):
|
||||||
"""Exception used when a SMART self-test is in progress."""
|
"""Exception used when a SMART self-test is in progress."""
|
||||||
|
|
||||||
|
|
@ -42,6 +39,39 @@ def abort_self_test(dev) -> None:
|
||||||
run_program(cmd, check=False)
|
run_program(cmd, check=False)
|
||||||
|
|
||||||
|
|
||||||
|
def build_self_test_report(test_obj, aborted=False) -> None:
|
||||||
|
"""Check self-test results and build report (saved to test_obj).
|
||||||
|
|
||||||
|
NOTE: Not updating SMART data to preserve the result for the report.
|
||||||
|
|
||||||
|
For instance if the test was aborted the report should include the
|
||||||
|
last known progress instead of just "was aborted by host."
|
||||||
|
"""
|
||||||
|
report = [color_string('Self-Test', 'BLUE')]
|
||||||
|
test_details = get_smart_self_test_details(test_obj.dev)
|
||||||
|
test_result = test_details.get('status', {}).get('string', 'Unknown')
|
||||||
|
|
||||||
|
# Build report
|
||||||
|
if test_obj.disabled or test_obj.status == 'Denied':
|
||||||
|
report.append(color_string(f' {test_obj.status}', 'RED'))
|
||||||
|
elif test_obj.status == 'N/A' or not test_obj.dev.attributes:
|
||||||
|
report.append(color_string(f' {test_obj.status}', 'YELLOW'))
|
||||||
|
elif test_obj.status == 'TestInProgress':
|
||||||
|
report.append(color_string(' Failed to stop previous test', 'RED'))
|
||||||
|
test_obj.set_status('Failed')
|
||||||
|
else:
|
||||||
|
# Other cases include self-test result string
|
||||||
|
report.append(f' {test_result.capitalize()}')
|
||||||
|
if aborted and not (test_obj.passed or test_obj.failed):
|
||||||
|
report.append(color_string(' Aborted', 'YELLOW'))
|
||||||
|
test_obj.set_status('Aborted')
|
||||||
|
elif test_obj.status == 'TimedOut':
|
||||||
|
report.append(color_string(' TimedOut', 'YELLOW'))
|
||||||
|
|
||||||
|
# Done
|
||||||
|
test_obj.report.extend(report)
|
||||||
|
|
||||||
|
|
||||||
def check_attributes(dev, only_blocking=False) -> bool:
|
def check_attributes(dev, only_blocking=False) -> bool:
|
||||||
"""Check if any known attributes are failing, returns bool."""
|
"""Check if any known attributes are failing, returns bool."""
|
||||||
attributes_ok = True
|
attributes_ok = True
|
||||||
|
|
@ -77,32 +107,6 @@ def check_attributes(dev, only_blocking=False) -> bool:
|
||||||
return attributes_ok
|
return attributes_ok
|
||||||
|
|
||||||
|
|
||||||
def check_self_test_results(test_obj, aborted=False) -> None:
|
|
||||||
"""Check SMART self-test results."""
|
|
||||||
test_obj.report.append(color_string('Self-Test', 'BLUE'))
|
|
||||||
if test_obj.disabled or test_obj.status == 'Denied':
|
|
||||||
test_obj.report.append(color_string(f' {test_obj.status}', 'RED'))
|
|
||||||
elif test_obj.status == 'N/A' or not test_obj.dev.attributes:
|
|
||||||
test_obj.report.append(color_string(f' {test_obj.status}', 'YELLOW'))
|
|
||||||
else:
|
|
||||||
# Not updating SMART data here to preserve the test status for the report
|
|
||||||
# For instance if the test was aborted the report should inlcude the last
|
|
||||||
# known progress instead of just "was aborted by host"
|
|
||||||
test_details = get_smart_self_test_details(test_obj.dev)
|
|
||||||
test_result = test_details.get('status', {}).get('string', 'Unknown')
|
|
||||||
test_obj.report.append(f' {test_result.capitalize()}')
|
|
||||||
if aborted and not (test_obj.passed or test_obj.failed):
|
|
||||||
test_obj.report.append(color_string(' Aborted', 'YELLOW'))
|
|
||||||
test_obj.set_status('Aborted')
|
|
||||||
elif test_obj.status == 'TimedOut':
|
|
||||||
test_obj.report.append(color_string(' TimedOut', 'YELLOW'))
|
|
||||||
test_obj.set_status('TimedOut')
|
|
||||||
else:
|
|
||||||
test_obj.failed = not test_obj.passed
|
|
||||||
if test_obj.failed:
|
|
||||||
test_obj.set_status('Failed')
|
|
||||||
|
|
||||||
|
|
||||||
def enable_smart(dev) -> None:
|
def enable_smart(dev) -> None:
|
||||||
"""Try enabling SMART for this disk."""
|
"""Try enabling SMART for this disk."""
|
||||||
cmd = [
|
cmd = [
|
||||||
|
|
@ -202,6 +206,138 @@ def get_smart_self_test_details(dev) -> dict[Any, Any]:
|
||||||
return details
|
return details
|
||||||
|
|
||||||
|
|
||||||
|
def monitor_smart_self_test(test_obj, header_str, log_path) -> bool:
|
||||||
|
"""Monitor SMART self-test status and update test_obj, returns bool."""
|
||||||
|
started = False
|
||||||
|
finished = False
|
||||||
|
status_str = 'Starting self-test...'
|
||||||
|
test_details = get_smart_self_test_details(test_obj.dev)
|
||||||
|
test_minutes = 15
|
||||||
|
|
||||||
|
# Get real test length
|
||||||
|
test_minutes = test_details.get('polling_minutes', {}).get('short', 5)
|
||||||
|
test_minutes = int(test_minutes) + 10
|
||||||
|
|
||||||
|
# Monitor progress (in five second intervals)
|
||||||
|
for _i in range(int(test_minutes*60/5)):
|
||||||
|
sleep(5)
|
||||||
|
|
||||||
|
# Update log
|
||||||
|
## NOTE: This is run at least once with the default "Starting..." status
|
||||||
|
with open(log_path, 'w', encoding='utf-8') as _f:
|
||||||
|
_f.write(f'{header_str}\nSMART self-test status:\n {status_str}')
|
||||||
|
|
||||||
|
# Update status
|
||||||
|
update_smart_details(test_obj.dev)
|
||||||
|
test_details = get_smart_self_test_details(test_obj.dev)
|
||||||
|
|
||||||
|
# Check if test started
|
||||||
|
started = started or 'remaining_percent' in test_details.get('status', {})
|
||||||
|
if not started:
|
||||||
|
if _i * 5 >= SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS:
|
||||||
|
# Test didn't start within limit, stop waiting
|
||||||
|
abort_self_test(test_obj.dev)
|
||||||
|
test_obj.failed = True
|
||||||
|
test_obj.set_status('TimedOut')
|
||||||
|
break
|
||||||
|
# Still within starting limit, continue to next loop
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check test progress
|
||||||
|
status_str = test_details.get('status', {}).get('string', 'Unknown')
|
||||||
|
status_str = status_str.capitalize()
|
||||||
|
|
||||||
|
# Check if finished
|
||||||
|
if 'remaining_percent' not in test_details.get('status', {}):
|
||||||
|
finished = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# Done
|
||||||
|
return finished
|
||||||
|
|
||||||
|
|
||||||
|
def run_self_test(test_obj, log_path) -> None:
|
||||||
|
"""Run disk self-test and update test results.
|
||||||
|
|
||||||
|
NOTE: This function is here to reserve a place for future
|
||||||
|
NVMe self-tests announced in NVMe spec v1.3.
|
||||||
|
"""
|
||||||
|
run_smart_self_test(test_obj, log_path)
|
||||||
|
|
||||||
|
|
||||||
|
def run_smart_self_test(test_obj, log_path) -> bool:
|
||||||
|
"""Run SMART self-test and check if it passed, returns bool.
|
||||||
|
|
||||||
|
NOTE: An exception will be raised if the disk lacks SMART support.
|
||||||
|
"""
|
||||||
|
finished = False
|
||||||
|
test_details = get_smart_self_test_details(test_obj.dev)
|
||||||
|
size_str = bytes_to_string(test_obj.dev.size, use_binary=False)
|
||||||
|
header_str = color_string(
|
||||||
|
['[', test_obj.dev.path.name, ' ', size_str, ']'],
|
||||||
|
[None, 'BLUE', None, 'CYAN', None],
|
||||||
|
sep='',
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if disk supports self-tests
|
||||||
|
if not test_details:
|
||||||
|
# Mark test as passed since it doesn't apply
|
||||||
|
test_obj.passed = True
|
||||||
|
test_obj.set_status('N/A')
|
||||||
|
build_self_test_report(test_obj)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Update status
|
||||||
|
with open(log_path, 'w', encoding='utf-8') as _f:
|
||||||
|
_f.write(f'{header_str}\nInitializing...')
|
||||||
|
|
||||||
|
# Check for, and stop, self-test if currently in-progress
|
||||||
|
if self_test_in_progress(test_obj.dev):
|
||||||
|
abort_self_test(test_obj.dev)
|
||||||
|
for _ in range(6):
|
||||||
|
# Wait up to a minute for current test to exit
|
||||||
|
sleep(10)
|
||||||
|
update_smart_details(test_obj.dev)
|
||||||
|
if not self_test_in_progress(test_obj.dev):
|
||||||
|
break
|
||||||
|
|
||||||
|
# Recheck if self-test is in-progress, bail if so
|
||||||
|
if self_test_in_progress(test_obj.dev):
|
||||||
|
test_obj.failed = True
|
||||||
|
test_obj.set_status('TestInProgress')
|
||||||
|
build_self_test_report(test_obj)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Start test
|
||||||
|
cmd = [
|
||||||
|
'sudo',
|
||||||
|
'smartctl',
|
||||||
|
'--tolerance=normal',
|
||||||
|
'--test=short',
|
||||||
|
test_obj.dev.path,
|
||||||
|
]
|
||||||
|
run_program(cmd, check=False)
|
||||||
|
|
||||||
|
# Monitor progress
|
||||||
|
finished = monitor_smart_self_test(test_obj, header_str, log_path)
|
||||||
|
|
||||||
|
# Check result
|
||||||
|
if finished:
|
||||||
|
test_obj.passed = test_details.get('status', {}).get('passed', False)
|
||||||
|
test_obj.failed = test_obj.failed or not test_obj.passed
|
||||||
|
|
||||||
|
# Set status
|
||||||
|
if test_obj.failed and test_obj.status != 'TimedOut':
|
||||||
|
test_obj.set_status('Failed')
|
||||||
|
elif test_obj.passed:
|
||||||
|
test_obj.set_status('Passed')
|
||||||
|
else:
|
||||||
|
test_obj.set_status('Unknown')
|
||||||
|
|
||||||
|
# Build report
|
||||||
|
build_self_test_report(test_obj)
|
||||||
|
|
||||||
|
|
||||||
def safety_checks(dev) -> None:
|
def safety_checks(dev) -> None:
|
||||||
"""Run safety checks and raise an exception if necessary."""
|
"""Run safety checks and raise an exception if necessary."""
|
||||||
blocking_event_encountered = False
|
blocking_event_encountered = False
|
||||||
|
|
@ -243,114 +379,16 @@ def safety_checks(dev) -> None:
|
||||||
raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
|
raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
|
||||||
|
|
||||||
# SMART self-test status
|
# SMART self-test status
|
||||||
test_details = get_smart_self_test_details(dev)
|
if self_test_in_progress(dev):
|
||||||
if 'remaining_percent' in test_details.get('status', ''):
|
|
||||||
msg = f'SMART self-test in progress for: {dev.path}'
|
msg = f'SMART self-test in progress for: {dev.path}'
|
||||||
LOG.error(msg)
|
LOG.error(msg)
|
||||||
raise SMARTSelfTestInProgressError(msg)
|
raise SMARTSelfTestInProgressError(msg)
|
||||||
|
|
||||||
|
|
||||||
def run_self_test(test_obj, log_path) -> None:
|
def self_test_in_progress(dev) -> bool:
|
||||||
"""Run disk self-test and check if it passed, returns bool."""
|
"""Check if SMART self-test is in progress, returns bool."""
|
||||||
result = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
test_obj.passed = run_smart_self_test(test_obj.dev, log_path)
|
|
||||||
except TimeoutError:
|
|
||||||
test_obj.failed = True
|
|
||||||
result = 'TimedOut'
|
|
||||||
except SMARTNotSupportedError:
|
|
||||||
# Pass test since it doesn't apply
|
|
||||||
test_obj.passed = True
|
|
||||||
result = 'N/A'
|
|
||||||
|
|
||||||
# Set status
|
|
||||||
if result:
|
|
||||||
test_obj.set_status(result)
|
|
||||||
else:
|
|
||||||
if test_obj.failed:
|
|
||||||
test_obj.set_status('Failed')
|
|
||||||
elif test_obj.passed:
|
|
||||||
test_obj.set_status('Passed')
|
|
||||||
else:
|
|
||||||
test_obj.set_status('Unknown')
|
|
||||||
|
|
||||||
|
|
||||||
def run_smart_self_test(dev, log_path) -> bool:
|
|
||||||
"""Run SMART self-test and check if it passed, returns bool.
|
|
||||||
|
|
||||||
NOTE: An exception will be raised if the disk lacks SMART support.
|
|
||||||
"""
|
|
||||||
finished = False
|
|
||||||
result = None
|
|
||||||
started = False
|
|
||||||
status_str = 'Starting self-test...'
|
|
||||||
test_details = get_smart_self_test_details(dev)
|
test_details = get_smart_self_test_details(dev)
|
||||||
test_minutes = 15
|
return 'remaining_percent' in test_details.get('status', '')
|
||||||
size_str = bytes_to_string(dev.size, use_binary=False)
|
|
||||||
header_str = color_string(
|
|
||||||
['[', dev.path.name, ' ', size_str, ']'],
|
|
||||||
[None, 'BLUE', None, 'CYAN', None],
|
|
||||||
sep='',
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if disk supports self-tests
|
|
||||||
if not test_details:
|
|
||||||
raise SMARTNotSupportedError(
|
|
||||||
f'SMART self-test not supported for {dev.path}')
|
|
||||||
|
|
||||||
# Get real test length
|
|
||||||
test_minutes = test_details.get('polling_minutes', {}).get('short', 5)
|
|
||||||
test_minutes = int(test_minutes) + 10
|
|
||||||
|
|
||||||
# Start test
|
|
||||||
with open(log_path, 'w', encoding='utf-8') as _f:
|
|
||||||
_f.write(f'{header_str}\nInitializing...')
|
|
||||||
cmd = [
|
|
||||||
'sudo',
|
|
||||||
'smartctl',
|
|
||||||
'--tolerance=normal',
|
|
||||||
'--test=short',
|
|
||||||
dev.path,
|
|
||||||
]
|
|
||||||
run_program(cmd, check=False)
|
|
||||||
|
|
||||||
# Monitor progress (in five second intervals)
|
|
||||||
for _i in range(int(test_minutes*60/5)):
|
|
||||||
sleep(5)
|
|
||||||
|
|
||||||
# Update status
|
|
||||||
update_smart_details(dev)
|
|
||||||
test_details = get_smart_self_test_details(dev)
|
|
||||||
|
|
||||||
# Check test progress
|
|
||||||
if started:
|
|
||||||
status_str = test_details.get('status', {}).get('string', 'Unknown')
|
|
||||||
status_str = status_str.capitalize()
|
|
||||||
|
|
||||||
# Update log
|
|
||||||
with open(log_path, 'w', encoding='utf-8') as _f:
|
|
||||||
_f.write(f'{header_str}\nSMART self-test status:\n {status_str}')
|
|
||||||
|
|
||||||
# Check if finished
|
|
||||||
if 'remaining_percent' not in test_details.get('status', {}):
|
|
||||||
finished = True
|
|
||||||
break
|
|
||||||
|
|
||||||
elif 'remaining_percent' in test_details.get('status', {}):
|
|
||||||
started = True
|
|
||||||
elif _i * 5 >= SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS:
|
|
||||||
# Test didn't start within limit, stop waiting
|
|
||||||
break
|
|
||||||
|
|
||||||
# Check result
|
|
||||||
if finished:
|
|
||||||
result = test_details.get('status', {}).get('passed', False)
|
|
||||||
elif started:
|
|
||||||
raise TimeoutError(f'SMART self-test timed out for {dev.path}')
|
|
||||||
|
|
||||||
# Done
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def update_smart_details(dev) -> None:
|
def update_smart_details(dev) -> None:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue