WizardKit/.bin/Scripts/functions/hw_diags.py

# Wizard Kit: Functions - HW Diagnostics

import libtmux
import json

from functions.common import *

# STATIC VARIABLES
## tmux
# These handles are resolved once, when the module is imported, so importing
# this file contacts the tmux server immediately.
TMUX = libtmux.Server()
# NOTE(review): assumes a session named 'hw-diags' already exists; presumably
# find_where() returns None when it does not, in which case the next line
# would fail at import time -- confirm against libtmux.
SESSION = TMUX.find_where({'session_name': 'hw-diags'})
WINDOW = SESSION.windows[0] # Should be a safe assumption
PANE = WINDOW.panes[0]      # Should be a safe assumption
## other
# Thresholds applied to raw attribute values by show_disk_details():
#   'Warning' / 'Error': a raw value at or above this number is printed as a
#                        warning / error.
#   'Ignore':            when True, reaching the 'Error' threshold is still
#                        printed as an error but does not mark the device 'NS'.
# 'NVMe' keys are field names looked up in the `nvme smart-log` JSON output;
# 'SMART' keys are attribute IDs looked up in smartctl's attribute table.
ATTRIBUTES = {
    'NVMe': {
        'critical_warning': {'Error': 1},
        'media_errors': {'Error': 1},
        'power_on_hours': {'Warning': 12000, 'Error': 18000, 'Ignore': True},
        'unsafe_shutdowns': {'Warning': 1},
        },
    'SMART': {
        5: {'Error': 1},
        9: {'Warning': 12000, 'Error': 18000, 'Ignore': True},
        10: {'Warning': 1},
        184: {'Error': 1},
        187: {'Warning': 1},
        188: {'Warning': 1},
        197: {'Error': 1},
        198: {'Error': 1},
        201: {'Warning': 1},
        },
    }
# Mutable run-time state shared by the functions below:
#   run_tests() sets each test's 'Enabled' flag and NVMe/SMART's 'Quick' flag,
#   scan_disks() adds a 'Devices' dict to 'NVMe/SMART' and 'badblocks',
#   update_progress() adds a top-level 'Progress Out' path on first use.
TESTS = {
    'Prime95': {
        'Enabled': False,
        'Status':  'Pending',
        },
    'NVMe/SMART': {
        'Enabled': False,
        'Quick':   False,
        },
    'badblocks': {
        'Enabled': False,
        },
    }

def get_smart_details(dev):
    """Return smartctl's JSON report for /dev/<dev> as a dict.

    dev is the bare device name (e.g. 'sda'). smartctl is run through sudo
    with check=False. If its output can't be decoded or parsed an empty dict
    is returned so that callers can treat the data as missing.
    """
    argv = 'sudo smartctl --all --json /dev/{}'.format(dev).split()
    proc = run_program(argv, check=False)
    try:
        details = json.loads(proc.stdout.decode())
    except Exception:
        # Let other sections deal with the missing data
        details = {}
    return details

def get_status_color(s):
    """Return the COLORS entry used to display the status string s.

    'NS' / 'Unknown' map to RED, 'Working' / 'Skipped' to YELLOW and 'CS' to
    GREEN; any other status maps to CLEAR.
    """
    palette = (
        (('NS', 'Unknown'), 'RED'),
        (('Working', 'Skipped'), 'YELLOW'),
        (('CS',), 'GREEN'),
        )
    color_name = 'CLEAR'
    for statuses, candidate in palette:
        if s in statuses:
            color_name = candidate
            break
    return COLORS[color_name]

def menu_diags():
    """Show the hardware diagnostics menu until the user picks Quit.

    Numeric selections run the matching set of tests via run_tests(); letter
    selections launch a helper program (audio/network test, screen savers)
    or leave the menu.
    """
    test_sets = [
        {'Name': 'All tests',
            'Tests': ['Prime95', 'NVMe/SMART', 'badblocks']},
        {'Name': 'Prime95',
            'Tests': ['Prime95']},
        {'Name': 'NVMe/SMART & badblocks',
            'Tests': ['NVMe/SMART', 'badblocks']},
        {'Name': 'NVMe/SMART',
            'Tests': ['NVMe/SMART']},
        {'Name': 'badblocks',
            'Tests': ['badblocks']},
        {'Name': 'Quick drive test',
            'Tests': ['Quick', 'NVMe/SMART']},
        ]
    extra_actions = [
        {'Letter': 'A', 'Name': 'Audio test'},
        {'Letter': 'N', 'Name': 'Network test'},
        {'Letter': 'M', 'Name': 'Screen Saver - Matrix', 'CRLF': True},
        {'Letter': 'P', 'Name': 'Screen Saver - Pipes'},
        {'Letter': 'Q', 'Name': 'Quit', 'CRLF': True},
        ]
    # Letter -> (command to run, whether to pause for a second afterwards)
    launchers = {
        'A': (['hw-diags-audio'], True),
        'N': (['hw-diags-network'], True),
        'M': (['cmatrix', '-abs'], False),
        'P': ('pipes -t 0 -t 1 -t 2 -t 3 -p 5 -R -r 4000'.split(), False),
        }

    # Show menu
    while True:
        choice = menu_select(
            title = 'Hardware Diagnostics: Menu',
            main_entries = test_sets,
            action_entries = extra_actions,
            spacer = '──────────────────────────')
        if choice.isnumeric():
            # Menu numbering starts at 1
            run_tests(test_sets[int(choice)-1]['Tests'])
            continue
        if choice == 'Q':
            break
        if choice in launchers:
            command, pause_after = launchers[choice]
            run_program(list(command), check=False, pipe=False)
            if pause_after:
                sleep(1)

def run_badblocks():
    """Placeholder for the badblocks test; it currently does nothing."""
    return None

def run_mprime():
    """Run the Prime95 (mprime) torture test in a dedicated tmux window.

    A new window is opened in SESSION and split into three panes: sensor
    readings (hw-sensors under watch), the mprime run itself, and a tail of
    the progress file. The fans are forced to max for the duration of the
    test, which lasts MPRIME_LIMIT minutes.

    NOTE(review): MPRIME_LIMIT is not defined in this file; presumably it is
    provided by the star import from functions.common -- confirm.
    """
    # Set Window layout
    window = SESSION.new_window()
    pane_sensors = window.panes[0]
    # This pane was previously bound to an unused name while the code below
    # sent 'mprime -t' to an undefined `pane_mprime` (NameError).
    pane_mprime = window.split_window(attach=False)
    pane_mprime.set_height(10)
    pane_progress = window.split_window(attach=False, vertical=False)
    pane_progress.set_width(15)
    pane_progress.clear()
    pane_sensors.send_keys('watch -c -n1 -t hw-sensors')
    pane_progress.send_keys('tail -f "{}"'.format(TESTS['Progress Out']))

    try:
        # Start test
        run_program(['apple-fans', 'max'])
        pane_mprime.send_keys('mprime -t')
        sleep(MPRIME_LIMIT*60)
    finally:
        # Done (also reached if the wait is interrupted, so the fans are not
        # left at max and the test window is not left behind)
        try:
            run_program(['apple-fans', 'auto'])
        finally:
            window.kill_window()

def run_smart():
    """Lay out the panes for the NVMe/SMART test and wait for it to finish.

    The main WINDOW gets a worker pane and a pane tailing the progress file.
    No test is started here yet: the function only waits 120 seconds and then
    closes every pane except the current one.
    """
    # Set Window layout
    pane_worker = WINDOW.split_window(attach=False)
    pane_worker.set_height(10)
    pane_progress = WINDOW.split_window(attach=False, vertical=False)
    pane_progress.set_width(15)
    pane_progress.clear()
    pane_progress.send_keys('tail -f "{}"'.format(TESTS['Progress Out']))

    # Start test
    sleep(120)

    # Done
    # The command must be a flat argv list like every other run_program()
    # call in this file; it used to be wrapped in an extra list.
    run_program('tmux kill-pane -a'.split(), check=False)

def run_tests(tests):
    """Enable the tests named in `tests` and run them in a fixed order.

    tests is a list of names as used in menu_diags(): 'Prime95',
    'NVMe/SMART', 'badblocks' and the 'Quick' modifier for NVMe/SMART.
    Disks are scanned first if any disk test is enabled, then the progress
    file is refreshed before the tests run.
    """
    # Enable selected tests
    for name in ('Prime95', 'NVMe/SMART', 'badblocks'):
        TESTS[name]['Enabled'] = name in tests
    TESTS['NVMe/SMART']['Quick'] = 'Quick' in tests

    # Initialize
    needs_disks = (
        TESTS['NVMe/SMART']['Enabled'] or TESTS['badblocks']['Enabled'])
    if needs_disks:
        scan_disks()
    update_progress()

    # Run
    runners = (
        ('Prime95', run_mprime),
        ('NVMe/SMART', run_smart),
        ('badblocks', run_badblocks),
        )
    for name, runner in runners:
        if TESTS[name]['Enabled']:
            runner()

def scan_disks():
    """Collect health data for every non-hotplug disk and store it in TESTS.

    For each eligible disk the lsblk entry, the smartctl report and (for NVMe
    disks) the nvme-cli smart-log are gathered, and a 'Quick Health OK' flag
    is derived from them. If the health of a disk can't be confirmed and the
    badblocks test is enabled, the details are shown and the user is asked
    whether to run badblocks anyway ('OVERRIDE').

    The resulting dict (device name -> data) is stored as 'Devices' for both
    the NVMe/SMART and badblocks tests; both entries refer to the same dict.
    """
    clear_screen()

    # Get eligible disk list
    result = run_program(['lsblk', '-J', '-O'])
    json_data = json.loads(result.stdout.decode())
    devs = {}
    for d in json_data.get('blockdevices', []):
        # Older lsblk releases emit every JSON value as a string ('0'/'1')
        # while newer ones emit real booleans; accept both forms.
        if d['type'] == 'disk' and d['hotplug'] in ('0', False):
            devs[d['name']] = {'lsblk': d, 'Status': 'Pending'}

    # Sections a smartctl report must contain to count as SMART support
    wanted_smart_keys = {
        'ata_smart_attributes',
        'ata_smart_data',
        'smart_status',
        }

    for dev, data in devs.items():
        # Get SMART attributes
        data['smartctl'] = get_smart_details(dev)

        # Get NVMe attributes
        if data['lsblk'].get('tran') == 'nvme':
            cmd = 'sudo nvme smart-log /dev/{} -o json'.format(dev).split()
            result = run_program(cmd, check=False)
            try:
                data['nvme-cli'] = json.loads(result.stdout.decode())
            except Exception:
                # Let other sections deal with the missing data
                data['nvme-cli'] = {}
            data['NVMe Disk'] = True

        # Set "Quick Health OK" value
        ## NOTE: If False then require override for badblocks test
        if data.get('NVMe Disk', False):
            # A missing value counts as a warning (default of 1)
            crit_warn = data['nvme-cli'].get('critical_warning', 1)
            data['Quick Health OK'] = crit_warn == 0
        elif wanted_smart_keys.issubset(data['smartctl'].keys()):
            data['SMART Pass'] = data['smartctl'].get('smart_status', {}).get(
                'passed', False)
            data['Quick Health OK'] = data['SMART Pass']
            data['SMART Support'] = True
        else:
            data['Quick Health OK'] = False
            data['SMART Support'] = False

        # Ask for manual overrides if necessary
        if not data['Quick Health OK'] and TESTS['badblocks']['Enabled']:
            show_disk_details(data)
            print_warning("WARNING: Health can't be confirmed for: {}".format(
                '/dev/{}'.format(dev)))
            if ask('Run badblocks for this device anyway?'):
                data['OVERRIDE'] = True

    TESTS['NVMe/SMART']['Devices'] = devs
    TESTS['badblocks']['Devices'] = devs

def _print_attribute_value(dev, raw_num, raw_str, threshold):
    """Print raw_str colored by threshold and flag dev on hard errors.

    threshold is one entry of ATTRIBUTES. A value at or above 'Error' is
    printed as an error and, unless 'Ignore' is set, records the status 'NS'
    under dev['NVMe/SMART']. A value at or above 'Warning' is printed as a
    warning; anything else is printed as a success.
    """
    error_at = threshold.get('Error', False)
    warning_at = threshold.get('Warning', False)
    if error_at and raw_num >= error_at:
        print_error(raw_str, timestamp=False)
        if not threshold.get('Ignore', False):
            # scan_disks() doesn't create a 'NVMe/SMART' entry on the device
            # dict, so create it on demand instead of raising KeyError.
            dev.setdefault('NVMe/SMART', {})['Status'] = 'NS'
    elif warning_at and raw_num >= warning_at:
        print_warning(raw_str, timestamp=False)
    else:
        print_success(raw_str, timestamp=False)

def show_disk_details(dev):
    """Print the description, warnings and key attributes of one disk.

    dev is a device data dict as built by scan_disks() ('lsblk', 'smartctl',
    'Quick Health OK' and, depending on the disk, 'NVMe Disk' / 'nvme-cli'
    or 'SMART Support' / 'SMART Pass'). Attributes listed in ATTRIBUTES are
    printed with a color reflecting their thresholds; a non-ignored error
    sets dev['NVMe/SMART']['Status'] to 'NS'.
    """
    # Device description
    print_info('Device: /dev/{}'.format(dev['lsblk']['name']))
    for key in ['model', 'size', 'serial']:
        print_standard('  {:8}{}'.format(key, dev['lsblk'].get(key, 'Unknown')))
    if dev['lsblk'].get('tran', 'Unknown') == 'nvme':
        print_standard('  {:8}{}'.format('type', 'NVMe'))
    else:
        print_standard('  {:8}{}'.format(
            'type',
            dev['lsblk'].get('tran', 'Unknown').upper()))

    # Warnings
    if dev.get('NVMe Disk', False):
        if dev['Quick Health OK']:
            print_warning('WARNING: NVMe support is still experimental')
        else:
            print_error('ERROR: NVMe disk is reporting critical warnings')
    elif not dev['SMART Support']:
        print_error('ERROR: Unable to retrieve SMART data')
    elif not dev['SMART Pass']:
        print_error('ERROR: SMART overall-health assessment result: FAILED')

    # Attributes
    print_info('Attributes:')
    if dev.get('NVMe Disk', False):
        for attrib, threshold in sorted(ATTRIBUTES['NVMe'].items()):
            if attrib not in dev['nvme-cli']:
                continue
            print_standard(
                '  {:37}'.format(attrib.replace('_', ' ').title()),
                end='', flush=True)
            raw_num = dev['nvme-cli'][attrib]
            _print_attribute_value(dev, raw_num, str(raw_num), threshold)
    else:
        # SMART attributes, indexed by attribute ID
        s_table = dev['smartctl'].get('ata_smart_attributes', {}).get(
            'table', {})
        s_table = {a.get('id', 'Unknown'): a for a in s_table}
        for attrib, threshold in sorted(ATTRIBUTES['SMART'].items()):
            if attrib not in s_table:
                continue
            print_standard(
                '  {:>3}  {:32}'.format(attrib, s_table[attrib]['name']),
                end='', flush=True)
            raw_str = s_table[attrib]['raw']['string']
            # Only the leading digits of the raw string are compared
            raw_num = re.sub(r'^(\d+).*$', r'\1', raw_str)
            try:
                raw_num = float(raw_num)
            except ValueError:
                # No leading number to compare; print the value uncolored
                print_standard(raw_str, timestamp=False)
                continue
            _print_attribute_value(dev, raw_num, raw_str, threshold)

    # Quick Health OK
    print_standard('Quick health assessment: ', end='', flush=True)
    if dev['Quick Health OK']:
        print_success('Passed.\n', timestamp=False)
    else:
        print_error('Failed.\n', timestamp=False)

def update_progress():
    """Rewrite the progress file with the current status of enabled tests.

    The file path is stored in TESTS['Progress Out'] (created on first use
    under global_vars['LogDir']). The file holds a header followed by one
    section per enabled test; disk tests list one status row per device.
    """
    if 'Progress Out' not in TESTS:
        TESTS['Progress Out'] = '{}/progress.out'.format(global_vars['LogDir'])

    def device_rows(test_name):
        """Return one status row per device of test_name, sorted by name."""
        rows = []
        for name, info in sorted(TESTS[test_name]['Devices'].items()):
            rows.append('{dev}{s_color}{status:>{pad}}{CLEAR}'.format(
                dev = name,
                pad = 15-len(name),
                s_color = get_status_color(info['Status']),
                status = info['Status'],
                **COLORS))
        return rows

    report = [
        '{BLUE}HW  Diagnostics{CLEAR}'.format(**COLORS),
        '───────────────',
        ]

    prime95 = TESTS['Prime95']
    if prime95['Enabled']:
        report.append('')
        report.append('{BLUE}Prime95{s_color}{status:>8}{CLEAR}'.format(
            s_color = get_status_color(prime95['Status']),
            status = prime95['Status'],
            **COLORS))

    smart = TESTS['NVMe/SMART']
    if smart['Enabled']:
        report.append('')
        report.append('{BLUE}NVMe / SMART{CLEAR}'.format(**COLORS))
        if smart['Quick']:
            report.append('{YELLOW} (Quick Check){CLEAR}'.format(**COLORS))
        report.extend(device_rows('NVMe/SMART'))

    if TESTS['badblocks']['Enabled']:
        report.append('')
        report.append('{BLUE}badblocks{CLEAR}'.format(**COLORS))
        report.extend(device_rows('badblocks'))

    # Every line gets a line-ending when written
    with open(TESTS['Progress Out'], 'w') as f:
        f.write(''.join(line + '\n' for line in report))

# Guard against running this file as a script: it only prints a notice.
if __name__ == '__main__':
    print("This file is not meant to be called directly.")