Added Disk().safety_checks()

* Raises an exception for blocking events
* Removed "Ignore" column from ATTRIBUTES
  * Listed attributes should either be warnings or errors
  * Only 'Critical' attributes should block further tests
This commit is contained in:
2Shirt 2019-10-31 18:28:34 -06:00
parent d25b341eb3
commit 07fdbcdd7c
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C
2 changed files with 87 additions and 15 deletions

View file

@ -5,22 +5,22 @@
# Threshold table for disk health attributes.
#
# Keys are NVMe health-log field names (str) or SMART attribute IDs (int).
# Each entry holds:
#   'Hex'      (SMART only) the attribute ID as two uppercase hex digits.
#   'Critical' True if a failing value should block further tests.
#   'Warning'  raw value at which a warning applies, or None for no warning
#              level (NOTE(review): consumer not visible here -- confirm).
#   'Error'    raw value at which the attribute counts as failed, or None.
#   'Maximum'  raw values at or above this are not treated as failures
#              (presumably to discard implausible readings -- confirm);
#              None means no upper bound.
#
# power_on_hours / attribute 9: 17532 h is about 2 years, 26298 h about 3.
ATTRIBUTES = {
    # NVMe
    'critical_warning': {'Critical': True,  'Warning': None,  'Error': 1,     'Maximum': None},
    'media_errors':     {'Critical': False, 'Warning': None,  'Error': 1,     'Maximum': None},
    'power_on_hours':   {'Critical': False, 'Warning': 17532, 'Error': 26298, 'Maximum': None},
    'unsafe_shutdowns': {'Critical': False, 'Warning': 1,     'Error': None,  'Maximum': None},
    # SMART
    5:   {'Hex': '05', 'Critical': True,  'Warning': None,  'Error': 1,     'Maximum': None},
    9:   {'Hex': '09', 'Critical': False, 'Warning': 17532, 'Error': 26298, 'Maximum': None},
    # Attribute 10 is 0x0A in hex (the previous value '10' was the decimal ID)
    10:  {'Hex': '0A', 'Critical': False, 'Warning': 1,     'Error': 10,    'Maximum': 10000},
    184: {'Hex': 'B8', 'Critical': False, 'Warning': 1,     'Error': 10,    'Maximum': 10000},
    187: {'Hex': 'BB', 'Critical': False, 'Warning': 1,     'Error': 10,    'Maximum': 10000},
    188: {'Hex': 'BC', 'Critical': False, 'Warning': 1,     'Error': 10,    'Maximum': 10000},
    196: {'Hex': 'C4', 'Critical': False, 'Warning': 1,     'Error': 10,    'Maximum': 10000},
    197: {'Hex': 'C5', 'Critical': True,  'Warning': None,  'Error': 1,     'Maximum': None},
    198: {'Hex': 'C6', 'Critical': True,  'Warning': None,  'Error': 1,     'Maximum': None},
    199: {'Hex': 'C7', 'Critical': False, 'Warning': None,  'Error': 1,     'Maximum': None},
    201: {'Hex': 'C9', 'Critical': False, 'Warning': None,  'Error': 1,     'Maximum': 10000},
}
ATTRIBUTE_COLORS = (
# NOTE: Ordered by ascending importance

View file

@ -13,6 +13,7 @@ from wk.cfg.hw import ATTRIBUTES, ATTRIBUTE_COLORS
from wk.exe import get_json_from_command, run_program
from wk.std import bytes_to_string, color_string, string_to_bytes
# STATIC VARIABLES
# NOTE(review): these look like the top-level keys of smartctl's JSON output
# holding the NVMe health log and the ATA SMART attribute table respectively;
# their use is not visible in this section -- confirm against the callers.
KEY_NVME = 'nvme_smart_health_information_log'
KEY_SMART = 'ata_smart_attributes'
@ -31,6 +32,12 @@ REGEX_POWER_ON_TIME = re.compile(
r'^(\d+)([Hh].*|\s+\(\d+\s+\d+\s+\d+\).*)'
)
# Exception Classes
class CriticalHardwareError(RuntimeError):
    """Signal a critical hardware failure.

    Subclasses RuntimeError, so existing RuntimeError handlers also catch it.
    """
# Classes
class CpuRam():
"""Object for tracking CPU & RAM specific data."""
@ -273,6 +280,71 @@ class Disk():
return aligned
def safety_checks(self):
    """Run safety checks and raise an exception if any check fails.

    SMART details are refreshed first via update_smart_details(), then
    three checks run in order:

    1. Every known attribute flagged 'Critical' in ATTRIBUTES with an
       'Error' threshold: fails when Error <= raw value < Maximum (no
       upper bound if 'Maximum' is unset). Failures are logged only.
    2. SMART overall self-assessment: fails when smartctl reports it as
       not passed. Treated as passed if the field is missing.
    3. SMART self-test: fails when the status reports a remaining
       percentage, i.e. a self-test is still in progress.

    Failures of checks 2 and 3 are both logged and added as 'RED' notes.

    Raises:
        CriticalHardwareError: if at least one check failed.
    """
    found_blocker = False
    self.update_smart_details()

    # Attributes
    for name, details in self.attributes.items():
        if name not in ATTRIBUTES:
            # Unknown attribute, nothing to compare against
            continue

        limits = ATTRIBUTES[name]
        lower = limits.get('Error', None)

        # Only critical attributes that define an error threshold can block
        if not (limits.get('Critical', False) and lower):
            continue

        # A missing maximum means there is no upper bound
        upper = limits.get('Maximum', None) or float('inf')
        if lower <= details['raw'] < upper:
            found_blocker = True
            LOG.error('%s %s', self.path, f'Failed attribute: {name}')

    # NVMe status
    # TODO: See https://github.com/2Shirt/WizardKit/issues/130

    # SMART overall assessment
    try:
        overall_ok = self.smartctl['smart_status']['passed']
    except (KeyError, TypeError):
        # Assuming disk doesn't support SMART overall assessment
        overall_ok = True
    if not overall_ok:
        found_blocker = True
        note = 'SMART overall self-assessment: Failed'
        self.add_note(note, 'RED')
        LOG.error('%s %s', self.path, note)

    # SMART self-test status
    try:
        self_test = self.smartctl['ata_smart_data']['self_test']['status']
    except (KeyError, TypeError):
        # Assuming disk doesn't support SMART self-tests
        self_test = ''
    if 'remaining_percent' in self_test:
        found_blocker = True
        note = 'SMART self-test in progress'
        self.add_note(note, 'RED')
        LOG.error('%s %s', self.path, note)

    # Raise exception if necessary
    if found_blocker:
        raise CriticalHardwareError(f'Critical error(s) for: {self.path}')
def update_smart_details(self):
"""Update SMART details via smartctl."""
self.attributes = {}