This new design uses copy.deepcopy() to avoid erroneous thresholds being applied to drives during diagnostics. This also reduces the number of lookups to one per Disk.
442 lines
13 KiB
Python
442 lines
13 KiB
Python
"""WizardKit: SMART test functions"""
|
|
# vim: sts=2 sw=2 ts=2
|
|
|
|
import copy
|
|
import logging
|
|
import re
|
|
|
|
from typing import Any
|
|
|
|
from wk.cfg.hw import (
|
|
ATTRIBUTE_COLORS,
|
|
KEY_NVME,
|
|
KEY_SMART,
|
|
KNOWN_DISK_ATTRIBUTES,
|
|
KNOWN_DISK_MODELS,
|
|
NVME_WARNING_KEYS,
|
|
REGEX_POWER_ON_TIME,
|
|
SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS,
|
|
)
|
|
from wk.exe import get_json_from_command, run_program
|
|
from wk.std import bytes_to_string, color_string, sleep
|
|
|
|
|
|
# STATIC VARIABLES
# Module-level logger, named after this module via __name__
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
# Functions
|
|
def abort_self_test(dev) -> None:
  """Request that the currently running non-captive self-test be aborted.

  Runs `sudo smartctl --abort` against dev.path via run_program() with
  check=False.
  """
  run_program(['sudo', 'smartctl', '--abort', dev.path], check=False)
|
|
|
|
|
|
def build_self_test_report(test_obj, aborted=False) -> None:
  """Build the self-test report lines and append them to test_obj.report.

  NOTE: SMART data is not refreshed here so the last known result is
        preserved for the report.

        For instance if the test was aborted the report should include the
        last known progress instead of just "was aborted by host."

  Side effects: test_obj.set_status() is called with 'Failed' when the
  status is 'TestInProgress', and with 'Aborted' when aborted is set and
  the test has neither passed nor failed.
  """
  lines = [color_string('Self-Test', 'BLUE')]
  status_info = get_smart_self_test_details(test_obj.dev).get('status', {})
  result_str = status_info.get('string', 'Unknown')

  # Pick the report body based on the current test status
  if test_obj.disabled or test_obj.status == 'Denied':
    lines.append(color_string(f' {test_obj.status}', 'RED'))
  elif test_obj.status == 'N/A' or not test_obj.dev.attributes:
    lines.append(color_string(f' {test_obj.status}', 'YELLOW'))
  elif test_obj.status == 'TestInProgress':
    lines.append(color_string(' Failed to stop previous test', 'RED'))
    test_obj.set_status('Failed')
  else:
    # Every remaining case shows the drive's own self-test result string
    lines.append(f' {result_str.capitalize()}')
    if aborted and not test_obj.passed and not test_obj.failed:
      lines.append(color_string(' Aborted', 'YELLOW'))
      test_obj.set_status('Aborted')
    elif test_obj.status == 'TimedOut':
      lines.append(color_string(' TimedOut', 'YELLOW'))

  # Done
  test_obj.report.extend(lines)
|
|
|
|
|
|
def check_attributes(dev, only_blocking=False) -> bool:
  """Check dev.attributes against dev.known_attributes, returns bool.

  Returns False if any known attribute is failing, otherwise True.
  Attributes that are unknown, or that have no (truthy) 'Error' threshold,
  are ignored. When only_blocking is set, attributes not flagged 'Blocking'
  are ignored as well.
  """
  known = dev.known_attributes
  all_ok = True

  for attr_id, reading in dev.attributes.items():
    # Unknown attributes are never evaluated
    if attr_id not in known:
      continue
    spec = known[attr_id]

    # Non-blocking attributes are skipped when only blocking ones matter
    if only_blocking and not spec.get('Blocking', False):
      continue

    # No error threshold means the attribute is informational only
    error_at = spec.get('Error', None)
    if not error_at:
      continue

    # A missing (or falsy) maximum means there is no upper bound
    ceiling = spec.get('Maximum', None) or float('inf')

    raw = reading['raw']
    if spec.get('PercentageLife', False):
      # Percentage-life values fail from zero up to the error threshold
      failed = 0 <= raw <= error_at
    else:
      # Other values fail from the error threshold up to the maximum
      failed = error_at <= raw < ceiling
    if failed:
      all_ok = False

  # Done
  return all_ok
|
|
|
|
|
|
def enable_smart(dev) -> None:
  """Attempt to switch SMART on for this disk using smartctl.

  The --device option is "sat,auto" when dev.use_sat is truthy, otherwise
  "auto". The command is run via run_program() with check=False.
  """
  device_type = 'sat,auto' if dev.use_sat else 'auto'
  run_program(
    [
      'sudo',
      'smartctl',
      f'--device={device_type}',
      '--tolerance=permissive',
      '--smart=on',
      dev.path,
      ],
    check=False,
    )
|
|
|
|
|
|
def generate_attribute_report(dev) -> list[str]:
  """Build a colored report line for each known attribute, returns list.

  Attributes are visited in sorted order and those missing from
  dev.known_attributes are left out. Each line is built by color_string()
  from a label, the attribute's 'raw_str' and an optional note.
  """
  lines = []
  known = dev.known_attributes

  for attr_id, reading in sorted(dev.attributes.items()):
    # Only attributes from the known list are reported
    if attr_id not in known:
      continue
    spec = known[attr_id]

    # Defaults: healthy color and whatever note the known list provides
    color = 'GREEN'
    note = spec.get('Note', '')

    # ID / Name
    label = f'{attr_id:>3}'
    if isinstance(attr_id, int):
      # Integer IDs are assumed to be SMART, so add the hex ID and name
      hex_id = str(hex(attr_id))[2:].upper()
      label += f' / {hex_id:0>2}: {reading["name"]}'
    label = f' {label.replace("_", " "):38}'

    # Value color
    if spec.get('PercentageLife', False):
      # PercentageLife values
      if 0 <= reading['raw'] <= spec['Error']:
        color, note = 'RED', '(failed, % life remaining)'
      elif reading['raw'] < 0 or reading['raw'] > 100:
        color, note = 'PURPLE', '(invalid?)'
    else:
      # Every threshold that is met overrides the color (and possibly the
      # note) chosen by the ones before it
      for threshold_name, threshold_color in ATTRIBUTE_COLORS:
        limit = spec.get(threshold_name, None)
        if not (limit and reading['raw'] >= limit):
          continue
        color = threshold_color
        if threshold_name == 'Error':
          note = '(failed)'
        elif threshold_name == 'Maximum':
          note = '(invalid?)'

    # 199/C7 warning
    if str(attr_id) == '199' and reading['raw'] > 0:
      note = '(bad cable?)'

    # Build colored string and append to report
    lines.append(
      color_string(
        [label, reading['raw_str'], note],
        [None, color, 'YELLOW'],
        ),
      )

  # Done
  return lines
|
|
|
|
|
|
def get_known_disk_attributes(model) -> dict[Any, Any]:
  """Get known disk attributes based on the device model, returns dict.

  Starts from a deep copy of KNOWN_DISK_ATTRIBUTES and then, for every
  regex in KNOWN_DISK_MODELS that matches model (via re.search), merges
  that entry's per-attribute thresholds on top. Attributes already present
  are updated key by key; new attributes are added.

  The returned dict shares no objects with the module-level tables, so
  changes made to it do not leak back into the shared configuration.
  """
  known_attributes = copy.deepcopy(KNOWN_DISK_ATTRIBUTES)

  # Apply model-specific data
  for regex, data in KNOWN_DISK_MODELS.items():
    if not re.search(regex, model):
      continue
    for attr, thresholds in data.items():
      # Copy before merging so nested values are never shared with
      # KNOWN_DISK_MODELS, for both existing and new attributes
      overrides = copy.deepcopy(thresholds)
      if attr in known_attributes:
        known_attributes[attr].update(overrides)
      else:
        known_attributes[attr] = overrides

  # Done
  return known_attributes
|
|
|
|
|
|
def get_smart_self_test_details(dev) -> dict[Any, Any]:
  """Fetch the nested self-test section of dev.raw_smartctl, returns dict.

  Returns dev.raw_smartctl['ata_smart_data']['self_test'], or an empty
  dict if that lookup raises KeyError or TypeError.
  """
  try:
    return dev.raw_smartctl['ata_smart_data']['self_test']
  except (KeyError, TypeError):
    # Assuming disk lacks SMART support, ignore and return empty dict.
    return {}
|
|
|
|
|
|
def monitor_smart_self_test(test_obj, header_str, log_path) -> bool:
  """Poll the SMART self-test until it ends, returns bool.

  Every five seconds the log file at log_path is rewritten with header_str
  and the latest status string, then the SMART details are refreshed.

  Returns True once a test that was seen running no longer reports
  'remaining_percent'. Returns False if the polling window runs out, or if
  the test did not start within SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS;
  in the latter case the test is aborted, test_obj.failed is set and the
  status becomes 'TimedOut'.
  """
  dev = test_obj.dev
  status_line = 'Starting self-test...'
  has_started = False

  # Polling window: the reported short test length (5 if not reported)
  # plus ten minutes
  polling = get_smart_self_test_details(dev).get('polling_minutes', {})
  minutes_allowed = int(polling.get('short', 5)) + 10

  # Monitor progress (in five second intervals)
  for poll_num in range(int(minutes_allowed*60/5)):
    sleep(5)

    # Update log
    ## NOTE: This is run at least once with the default "Starting..." status
    with open(log_path, 'w', encoding='utf-8') as log_file:
      log_file.write(
        f'{header_str}\nSMART self-test status:\n {status_line}')

    # Update status
    update_smart_details(dev)
    status_info = get_smart_self_test_details(dev).get('status', {})
    in_progress = 'remaining_percent' in status_info

    # Check if test started
    has_started = has_started or in_progress
    if not has_started:
      if poll_num * 5 < SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS:
        # Still within starting limit, continue to next loop
        continue
      # Test didn't start within limit, stop waiting
      abort_self_test(dev)
      test_obj.failed = True
      test_obj.set_status('TimedOut')
      return False

    # Check test progress
    status_line = status_info.get('string', 'Unknown').capitalize()

    # Check if finished
    if not in_progress:
      return True

  # Polling window ran out before the test finished
  return False
|
|
|
|
|
|
def run_self_test(test_obj, log_path) -> None:
  """Run disk self-test and update test results.

  Currently this only delegates to run_smart_self_test() with the same
  arguments.

  NOTE: This function is here to reserve a place for future
        NVMe self-tests announced in NVMe spec v1.3.
  """
  run_smart_self_test(test_obj, log_path)
|
|
|
|
|
|
def run_smart_self_test(test_obj, log_path) -> None:
  """Run a short SMART self-test and record the outcome on test_obj.

  Results are stored on test_obj (passed, failed, status and report);
  nothing is returned.

  Flow:
    1. If the disk reports no self-test details the test is marked as
       passed with status 'N/A'.
    2. A self-test that is already running is aborted (waiting up to a
       minute); if it is still running the status is set to
       'TestInProgress' and the function bails.
    3. A short test is started and monitored, with progress written to
       log_path.
    4. The status is set to 'Failed', 'Passed' or 'Unknown' (a 'TimedOut'
       status set while monitoring is left in place) and the report is
       built.
  """
  test_details = get_smart_self_test_details(test_obj.dev)
  size_str = bytes_to_string(test_obj.dev.size, use_binary=False)
  header_str = color_string(
    ['[', test_obj.dev.path.name, ' ', size_str, ']'],
    [None, 'BLUE', None, 'CYAN', None],
    sep='',
    )

  # Check if disk supports self-tests
  if not test_details:
    # Mark test as passed since it doesn't apply
    test_obj.passed = True
    test_obj.set_status('N/A')
    build_self_test_report(test_obj)
    return

  # Update status
  with open(log_path, 'w', encoding='utf-8') as _f:
    _f.write(f'{header_str}\nInitializing...')

  # Check for, and stop, self-test if currently in-progress
  if self_test_in_progress(test_obj.dev):
    abort_self_test(test_obj.dev)
    for _ in range(6):
      # Wait up to a minute for current test to exit
      sleep(10)
      update_smart_details(test_obj.dev)
      if not self_test_in_progress(test_obj.dev):
        break

  # Recheck if self-test is in-progress, bail if so
  if self_test_in_progress(test_obj.dev):
    test_obj.failed = True
    test_obj.set_status('TestInProgress')
    build_self_test_report(test_obj)
    return

  # Start test
  cmd = [
    'sudo',
    'smartctl',
    '--tolerance=normal',
    '--test=short',
    test_obj.dev.path,
    ]
  run_program(cmd, check=False)

  # Monitor progress
  finished = monitor_smart_self_test(test_obj, header_str, log_path)

  # Check result
  if finished:
    # The details fetched at the top predate this test: monitoring replaces
    # dev.raw_smartctl on every poll, so re-read them to judge the test
    # that just finished instead of the previous one.
    test_details = get_smart_self_test_details(test_obj.dev)
    test_obj.passed = test_details.get('status', {}).get('passed', False)
    test_obj.failed = test_obj.failed or not test_obj.passed

  # Set status
  if test_obj.failed and test_obj.status != 'TimedOut':
    test_obj.set_status('Failed')
  elif test_obj.passed:
    test_obj.set_status('Passed')
  else:
    test_obj.set_status('Unknown')

  # Build report
  build_self_test_report(test_obj)
|
|
|
|
|
|
def smart_status_ok(dev) -> bool:
  """Refresh SMART data and check for blocking problems, returns bool.

  Returns False if a blocking attribute is failing, the NVMe status reports
  read-only media, or the SMART overall assessment did not pass. NVMe
  warning keys only add a note and a log entry; they do not block.
  """
  blocked = False
  update_smart_details(dev)

  # Attributes
  if not check_attributes(dev, only_blocking=True):
    blocked = True
    LOG.error('%s: Blocked for failing attribute(s)', dev.path)

  # NVMe status
  nvme_flags = dev.raw_smartctl.get('smart_status', {}).get('nvme', {})
  if nvme_flags.get('media_read_only', False):
    blocked = True
    note = 'Media has been placed in read-only mode'
    dev.add_note(note, 'RED')
    LOG.error('%s %s', dev.path, note)
  for warning_key in NVME_WARNING_KEYS:
    if not nvme_flags.get(warning_key, False):
      continue
    note = warning_key.replace('_', ' ')
    dev.add_note(note, 'YELLOW')
    LOG.warning('%s %s', dev.path, note)

  # SMART overall assessment
  try:
    overall_ok = dev.raw_smartctl['smart_status']['passed']
  except (KeyError, TypeError):
    # Assuming disk doesn't support SMART overall assessment
    overall_ok = True
  if not overall_ok:
    blocked = True
    note = 'SMART overall self-assessment: Failed'
    dev.add_note(note, 'RED')
    LOG.error('%s %s', dev.path, note)

  # Done
  return not blocked
|
|
|
|
|
|
def self_test_in_progress(dev) -> bool:
  """Report whether a SMART self-test is currently running, returns bool.

  A test counts as running when 'remaining_percent' is found in the
  'status' entry of the self-test details.
  """
  status = get_smart_self_test_details(dev).get('status', '')
  return 'remaining_percent' in status
|
|
|
|
|
|
def update_smart_details(dev) -> None:
  """Update SMART details via smartctl.

  Replaces dev.raw_smartctl with the JSON output of `smartctl --all --json`
  and merges the parsed attributes into dev.attributes. NVMe attributes
  are keyed by name and SMART attributes by integer ID; each entry holds
  'name', 'raw' and 'raw_str'.

  If the device is no longer present a note is added and nothing else is
  changed. A note is also added when no attributes could be parsed.
  Attributes that cannot be parsed are logged and skipped.
  """
  updated_attributes = {}

  # Bail if device was disconnected
  if not dev.present:
    dev.add_note('Device disconnected', 'RED')
    return

  # Get SMART data
  cmd = [
    'sudo',
    'smartctl',
    f'--device={"sat,auto" if dev.use_sat else "auto"}',
    '--tolerance=verypermissive',
    '--all',
    '--json',
    dev.path,
    ]
  dev.raw_smartctl = get_json_from_command(cmd, check=False)

  # Check for attributes
  if KEY_NVME in dev.raw_smartctl:
    for name, value in dev.raw_smartctl[KEY_NVME].items():
      try:
        updated_attributes[name] = {
          'name': name,
          'raw': int(value),
          'raw_str': str(value),
          }
      except (TypeError, ValueError):
        # Ignoring invalid attribute
        LOG.error('Invalid NVMe attribute: %s %s', name, value)
  elif KEY_SMART in dev.raw_smartctl:
    for attribute in dev.raw_smartctl[KEY_SMART].get('table', []):
      # Parse the ID and raw value together so one malformed entry is
      # logged and skipped (as in the NVMe branch) instead of aborting
      # the whole update
      try:
        _id = int(attribute['id'])
        raw = int(attribute.get('raw', {}).get('value', -1))
      except (AttributeError, KeyError, TypeError, ValueError):
        # Ignoring invalid attribute
        LOG.error('Invalid SMART attribute: %s', attribute)
        continue
      name = str(attribute.get('name', 'Unknown')).replace('_', ' ').title()
      raw_str = attribute.get('raw', {}).get('string', 'Unknown')

      # Fix power-on time
      match = REGEX_POWER_ON_TIME.match(raw_str)
      if _id == 9 and match:
        raw = int(match.group(1))

      # Add to dict
      updated_attributes[_id] = {
        'name': name, 'raw': raw, 'raw_str': raw_str}

  # Add note if necessary
  if not updated_attributes:
    dev.add_note('No NVMe or SMART data available', 'YELLOW')

  # Done
  dev.attributes.update(updated_attributes)
|
|
|
|
|
|
# This module has no command-line behavior; just print a notice if it is
# executed as a script.
if __name__ == '__main__':
  print("This file is not meant to be called directly.")
|