From 07fdbcdd7c74d01689120a0367c1167862d63a17 Mon Sep 17 00:00:00 2001 From: 2Shirt <2xShirt@gmail.com> Date: Thu, 31 Oct 2019 18:28:34 -0600 Subject: [PATCH] Added Disk().safety_checks() * Raises an exception for blocking events * Removed "Ignore" column from ATTRIBUTES * Listed attributes should either be warnings or errors * Only 'Critical' attributes should block further tests --- scripts/wk/cfg/hw.py | 30 +++++++++--------- scripts/wk/hw/obj.py | 72 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/scripts/wk/cfg/hw.py b/scripts/wk/cfg/hw.py index 972f6ce0..06554c9e 100644 --- a/scripts/wk/cfg/hw.py +++ b/scripts/wk/cfg/hw.py @@ -5,22 +5,22 @@ ATTRIBUTES = { # NVMe - 'critical_warning': {'Critical': True, 'Ignore': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, - 'media_errors': {'Critical': False, 'Ignore': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, - 'power_on_hours': {'Critical': False, 'Ignore': True, 'Warning': 17532, 'Error': 26298, 'Maximum': None, }, - 'unsafe_shutdowns': {'Critical': False, 'Ignore': True, 'Warning': 1, 'Error': None, 'Maximum': None, }, + 'critical_warning': {'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, }, + 'media_errors': {'Critical': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, + 'power_on_hours': {'Critical': False, 'Warning': 17532, 'Error': 26298, 'Maximum': None, }, + 'unsafe_shutdowns': {'Critical': False, 'Warning': 1, 'Error': None, 'Maximum': None, }, # SMART - 5: {'Hex': '05', 'Critical': True, 'Ignore': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, - 9: {'Hex': '09', 'Critical': False, 'Ignore': True, 'Warning': 17532, 'Error': 26298, 'Maximum': None, }, - 10: {'Hex': '10', 'Critical': False, 'Ignore': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, - 184: {'Hex': 'B8', 'Critical': False, 'Ignore': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, - 187: {'Hex': 'BB', 'Critical': False, 
'Ignore': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, - 188: {'Hex': 'BC', 'Critical': False, 'Ignore': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, - 196: {'Hex': 'C4', 'Critical': False, 'Ignore': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, - 197: {'Hex': 'C5', 'Critical': True, 'Ignore': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, - 198: {'Hex': 'C6', 'Critical': True, 'Ignore': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, - 199: {'Hex': 'C7', 'Critical': False, 'Ignore': True, 'Warning': None, 'Error': 1, 'Maximum': None, }, - 201: {'Hex': 'C9', 'Critical': False, 'Ignore': False, 'Warning': None, 'Error': 1, 'Maximum': 10000, }, + 5: {'Hex': '05', 'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, }, + 9: {'Hex': '09', 'Critical': False, 'Warning': 17532, 'Error': 26298, 'Maximum': None, }, + 10: {'Hex': '10', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, + 184: {'Hex': 'B8', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, + 187: {'Hex': 'BB', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, + 188: {'Hex': 'BC', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, + 196: {'Hex': 'C4', 'Critical': False, 'Warning': 1, 'Error': 10, 'Maximum': 10000, }, + 197: {'Hex': 'C5', 'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, }, + 198: {'Hex': 'C6', 'Critical': True, 'Warning': None, 'Error': 1, 'Maximum': None, }, + 199: {'Hex': 'C7', 'Critical': False, 'Warning': None, 'Error': 1, 'Maximum': None, }, + 201: {'Hex': 'C9', 'Critical': False, 'Warning': None, 'Error': 1, 'Maximum': 10000, }, } ATTRIBUTE_COLORS = ( # NOTE: Ordered by ascending importance diff --git a/scripts/wk/hw/obj.py b/scripts/wk/hw/obj.py index 70fe4657..bed9ec0a 100644 --- a/scripts/wk/hw/obj.py +++ b/scripts/wk/hw/obj.py @@ -13,6 +13,7 @@ from wk.cfg.hw import ATTRIBUTES, ATTRIBUTE_COLORS from wk.exe import get_json_from_command, run_program 
from wk.std import bytes_to_string, color_string, string_to_bytes + # STATIC VARIABLES KEY_NVME = 'nvme_smart_health_information_log' KEY_SMART = 'ata_smart_attributes' @@ -31,6 +32,12 @@ REGEX_POWER_ON_TIME = re.compile( r'^(\d+)([Hh].*|\s+\(\d+\s+\d+\s+\d+\).*)' ) + +# Exception Classes +class CriticalHardwareError(RuntimeError): + """Exception used for critical hardware failures.""" + + # Classes class CpuRam(): """Object for tracking CPU & RAM specific data.""" @@ -273,6 +280,71 @@ class Disk(): return aligned + def safety_checks(self): + """Run safety checks and raise an exception if necessary.""" + blocking_event_encountered = False + self.update_smart_details() + + # Attributes + for attr, value in self.attributes.items(): + # Skip unknown attributes + if attr not in ATTRIBUTES: + continue + + # Get thresholds + critical = ATTRIBUTES[attr].get('Critical', False) + err_thresh = ATTRIBUTES[attr].get('Error', None) + max_thresh = ATTRIBUTES[attr].get('Maximum', None) + if not max_thresh: + max_thresh = float('inf') + + # Skip non-critical attributes + if not critical: + continue + + # Skip informational attributes + if not err_thresh: + continue + + # Check attribute + if err_thresh <= value['raw'] < max_thresh: + blocking_event_encountered = True + msg = f'Failed attribute: {attr}' + LOG.error('%s %s', self.path, msg) + + # NVMe status + # TODO: See https://github.com/2Shirt/WizardKit/issues/130 + + # SMART overall assessment + smart_passed = True + try: + smart_passed = self.smartctl['smart_status']['passed'] + except (KeyError, TypeError): + # Assuming disk doesn't support SMART overall assessment + pass + if not smart_passed: + blocking_event_encountered = True + msg = 'SMART overall self-assessment: Failed' + self.add_note(msg, 'RED') + LOG.error('%s %s', self.path, msg) + + # SMART self-test status + test_status = '' + try: + test_status = self.smartctl['ata_smart_data']['self_test']['status'] + except (KeyError, TypeError): + # Assuming disk doesn't 
support SMART self-tests + pass + if 'remaining_percent' in test_status: + blocking_event_encountered = True + msg = 'SMART self-test in progress' + self.add_note(msg, 'RED') + LOG.error('%s %s', self.path, msg) + + # Raise exception if necessary + if blocking_event_encountered: + raise CriticalHardwareError(f'Critical error(s) for: {self.path}') + def update_smart_details(self): """Update SMART details via smartctl.""" self.attributes = {}