Added Disk().safety_checks()
* Raises an exception for blocking events * Removed "Ignore" column from ATTRIBUTES * Listed attributes should either be warnings or errors * Only 'Critical' attributes should block further tests
This commit is contained in:
parent
d25b341eb3
commit
07fdbcdd7c
2 changed files with 87 additions and 15 deletions
|
|
@ -5,22 +5,22 @@
|
|||
|
||||
# Thresholds applied to the raw values of disk health attributes.
#
# Each entry maps an attribute (NVMe field name or SMART attribute ID) to:
#   Hex       SMART only: the attribute ID as two uppercase hex digits.
#   Critical  True if a failing value should block further tests.
#   Warning   Raw value at which the attribute is presumably reported as a
#             warning (consumer not visible here -- TODO confirm); None
#             means no warning level.
#   Error     Raw value at which the attribute counts as failed; None means
#             no error level.
#   Maximum   Exclusive upper bound: raw values at or above it are not
#             treated as failures; None means unbounded.
#
# NOTE: Every key is listed exactly once. A repeated key in a dict literal
#       silently replaces the earlier entry, so duplicates are dead data.
ATTRIBUTES = {
    # NVMe
    'critical_warning': {'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, },
    'media_errors': {'Critical': False, 'Warning': None, 'Error': 1, 'Maximum': None, },
    # 17532 h = 2 years and 26298 h = 3 years (at 8766 h per year)
    'power_on_hours': {'Critical': False, 'Warning': 17532, 'Error': 26298, 'Maximum': None, },
    'unsafe_shutdowns': {'Critical': False, 'Warning': 1, 'Error': None, 'Maximum': None, },
    # SMART
    5: {'Hex': '05', 'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, },
    9: {'Hex': '09', 'Critical': False, 'Warning': 17532, 'Error': 26298, 'Maximum': None, },
    # Decimal 10 is 0A in hexadecimal (was incorrectly listed as '10')
    10: {'Hex': '0A', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, },
    184: {'Hex': 'B8', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, },
    187: {'Hex': 'BB', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, },
    188: {'Hex': 'BC', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, },
    196: {'Hex': 'C4', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, },
    197: {'Hex': 'C5', 'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, },
    198: {'Hex': 'C6', 'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, },
    199: {'Hex': 'C7', 'Critical': False, 'Warning': None, 'Error': 1, 'Maximum': None, },
    201: {'Hex': 'C9', 'Critical': False, 'Warning': None, 'Error': 1, 'Maximum': 10000, },
}
|
||||
ATTRIBUTE_COLORS = (
|
||||
# NOTE: Ordered by ascending importance
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from wk.cfg.hw import ATTRIBUTES, ATTRIBUTE_COLORS
|
|||
from wk.exe import get_json_from_command, run_program
|
||||
from wk.std import bytes_to_string, color_string, string_to_bytes
|
||||
|
||||
|
||||
# STATIC VARIABLES
|
||||
# Key names for disk health data. Presumably these select the NVMe health log
# and the ATA SMART attribute table in smartctl's JSON report; their usage is
# outside this view (TODO confirm).
KEY_NVME = 'nvme_smart_health_information_log'
|
||||
KEY_SMART = 'ata_smart_attributes'
|
||||
|
|
@ -31,6 +32,12 @@ REGEX_POWER_ON_TIME = re.compile(
|
|||
r'^(\d+)([Hh].*|\s+\(\d+\s+\d+\s+\d+\).*)'
|
||||
)
|
||||
|
||||
|
||||
# Exception Classes
|
||||
class CriticalHardwareError(RuntimeError):
    """Signal a critical hardware failure."""
|
||||
|
||||
|
||||
# Classes
|
||||
class CpuRam():
|
||||
"""Object for tracking CPU & RAM specific data."""
|
||||
|
|
@ -273,6 +280,71 @@ class Disk():
|
|||
|
||||
return aligned
|
||||
|
||||
def safety_checks(self):
    """Refresh SMART details and raise if any blocking problem is found.

    Three checks are made, in order:
      1. Known attributes marked 'Critical' in ATTRIBUTES whose raw value
         falls within [Error, Maximum).
      2. The SMART overall self-assessment.
      3. Whether a SMART self-test is currently in progress.

    Every problem is logged; problems from checks 2 and 3 are also added
    to the disk's notes. All checks run before anything is raised.

    Raises:
        CriticalHardwareError: at least one blocking event was found.
    """
    self.update_smart_details()
    blocked = False

    def _flag(text):
        """Record a blocking problem as a red note and in the log."""
        self.add_note(text, 'RED')
        LOG.error('%s %s', self.path, text)

    # Attributes
    for name, details in self.attributes.items():
        limits = ATTRIBUTES.get(name)
        if limits is None:
            # Unknown attribute
            continue
        if not limits.get('Critical', False):
            # Non-critical attributes never block
            continue
        floor = limits.get('Error', None)
        if not floor:
            # Informational only (no error threshold)
            continue
        # A missing maximum means there is no upper bound
        ceiling = limits.get('Maximum', None) or float('inf')
        if floor <= details['raw'] < ceiling:
            blocked = True
            LOG.error('%s %s', self.path, f'Failed attribute: {name}')

    # NVMe status
    # TODO: See https://github.com/2Shirt/WizardKit/issues/130

    # SMART overall assessment
    try:
        assessment_ok = self.smartctl['smart_status']['passed']
    except (KeyError, TypeError):
        # Assuming disk doesn't support SMART overall assessment
        assessment_ok = True
    if not assessment_ok:
        blocked = True
        _flag('SMART overall self-assessment: Failed')

    # SMART self-test status
    try:
        self_test = self.smartctl['ata_smart_data']['self_test']['status']
    except (KeyError, TypeError):
        # Assuming disk doesn't support SMART self-tests
        self_test = ''
    if 'remaining_percent' in self_test:
        blocked = True
        _flag('SMART self-test in progress')

    # Raise exception if necessary
    if blocked:
        raise CriticalHardwareError(f'Critical error(s) for: {self.path}')
|
||||
|
||||
def update_smart_details(self):
|
||||
"""Update SMART details via smartctl."""
|
||||
self.attributes = {}
|
||||
|
|
|
|||
Loading…
Reference in a new issue