Updated HW-Diags and sensor sections

This commit is contained in:
2Shirt 2019-06-04 20:53:34 -06:00
parent ec5591453e
commit 70823d2cd8
Signed by: 2Shirt
GPG key ID: 152FAC923B0E132C
4 changed files with 226 additions and 168 deletions

View file

@ -149,11 +149,14 @@ def save_debug_reports(state, global_vars):
f.write('{}\n'.format(line))
def upload_logdir(global_vars):
def upload_logdir(global_vars, reason='Crash'):
"""Upload compressed LogDir to CRASH_SERVER."""
source = global_vars['LogDir']
source = source[source.rfind('/')+1:]
dest = '{}.txz'.format(source)
dest = 'HW-Diags_{reason}_{Date-Time}.txz'.format(
reason=reason,
**global_vars,
)
data = None
# Compress LogDir
@ -166,7 +169,7 @@ def upload_logdir(global_vars):
data = f.read()
# Upload data
url = '{}/Crash_{}.txz'.format(CRASH_SERVER['Url'], source)
url = '{}/{}'.format(CRASH_SERVER['Url'], dest)
r = requests.put(
url,
data=data,

View file

@ -36,6 +36,7 @@ class CpuObj():
self.tests = OrderedDict()
self.get_details()
self.name = self.lscpu.get('Model name', 'Unknown CPU')
self.description = self.name
def get_details(self):
"""Get CPU details from lscpu."""
@ -57,6 +58,13 @@ class CpuObj():
report.append('{BLUE}Device{CLEAR}'.format(**COLORS))
report.append(' {}'.format(self.name))
# Include RAM details
ram_details = get_ram_details()
ram_total = human_readable_size(ram_details.pop('Total', 0)).strip()
ram_dimms = ['{}x {}'.format(v, k) for k, v in sorted(ram_details.items())]
report.append('{BLUE}RAM{CLEAR}'.format(**COLORS))
report.append(' {} ({})'.format(ram_total, ', '.join(ram_dimms)))
# Tests
for test in self.tests.values():
report.extend(test.report)
@ -220,11 +228,12 @@ class DiskObj():
# Done
return test_running
def disable_test(self, name, status):
def disable_test(self, name, status, test_failed=False):
"""Disable test by name and update status."""
if name in self.tests:
self.tests[name].update_status(status)
self.tests[name].disabled = True
self.tests[name].failed = test_failed
def generate_attribute_report(
self, description=False, timestamp=False):
@ -487,7 +496,7 @@ class DiskObj():
for t in ['badblocks', 'I/O Benchmark']:
self.disable_test(t, 'Denied')
elif not disk_ok:
self.disable_test('NVMe / SMART', 'NS')
self.disable_test('NVMe / SMART', 'NS', test_failed=True)
for t in ['badblocks', 'I/O Benchmark']:
self.disable_test(t, 'Denied')
@ -495,6 +504,7 @@ class DiskObj():
class State():
"""Object to track device objects and overall state."""
def __init__(self):
self.args = None
self.cpu = None
self.disks = []
self.panes = {}
@ -522,6 +532,83 @@ class State():
},
})
def build_outer_panes(self):
  """Clear the screen and create the Top, Started, and Progress panes.

  The pane references returned by tmux_split_window() are stored in
  self.panes under the keys 'Top', 'Started', and 'Progress'.
  """
  clear_screen()

  # Top
  top_pane = tmux_split_window(
    behind=True,
    lines=2,
    vertical=True,
    text=TOP_PANE_TEXT,
  )
  self.panes['Top'] = top_pane

  # Started (split from the Top pane, shows the start time)
  started_text = '{BLUE}Started{CLEAR}\n{s}'.format(
    s=time.strftime("%Y-%m-%d %H:%M %Z"),
    **COLORS)
  self.panes['Started'] = tmux_split_window(
    lines=SIDE_PANE_WIDTH,
    target_pane=self.panes['Top'],
    text=started_text,
  )

  # Progress (watches the progress file)
  self.panes['Progress'] = tmux_split_window(
    lines=SIDE_PANE_WIDTH,
    watch=self.progress_out,
  )
def fix_tmux_panes(self):
  """Resize tmux panes whose size no longer matches self.tmux_layout.

  Does nothing when no panes are tracked, or when every pane marked with
  'Check' already has the 'x'/'y' size listed in the layout.
  """
  # Bail if there are no panes to inspect
  if not self.panes:
    return

  # Compare each checked pane against the layout
  resize_needed = False
  for name, layout in self.tmux_layout.items():
    if not layout.get('Check'):
      # Size of this pane is not enforced
      continue
    if name != 'Current' and name not in self.panes:
      # Skip missing panes
      continue
    # 'Current' is queried with pane_id=None
    pane_id = None if name == 'Current' else self.panes[name]
    width, height = tmux_get_pane_size(pane_id=pane_id)
    if layout.get('x', False) and layout['x'] != width:
      resize_needed = True
    if layout.get('y', False) and layout['y'] != height:
      resize_needed = True

  # Bail if every checked pane already matches
  if not resize_needed:
    return

  # Apply the layout to every pane that exists
  for name, layout in self.tmux_layout.items():
    if name != 'Current' and name not in self.panes:
      # Skip missing panes
      continue
    pane_id = None if name == 'Current' else self.panes[name]
    tmux_resize_pane(pane_id=pane_id, **layout)
def fix_tmux_panes_loop(self):
  """Call fix_tmux_panes() once per second, forever.

  This never returns; elsewhere in this file it is launched via
  start_thread().
  """
  while True:
    try:
      self.fix_tmux_panes()
    except RuntimeError:
      # Assuming the layout definition changed mid-run; ignore and retry
      pass
    # Sleep outside the try block so a RuntimeError is also followed by a
    # pause; otherwise a repeating error would retry in a tight busy loop.
    sleep(1)
def init(self):
"""Remove test objects, set log, and add devices."""
self.disks = []
@ -529,14 +616,18 @@ class State():
v['Objects'] = []
# Update LogDir
if not self.quick_mode:
if self.quick_mode:
global_vars['LogDir'] = '{}/Logs/{}'.format(
global_vars['Env']['HOME'],
time.strftime('%Y-%m-%d_%H%M_%z'))
else:
global_vars['LogDir'] = '{}/Logs/{}_{}'.format(
global_vars['Env']['HOME'],
get_ticket_number(),
time.strftime('%Y-%m-%d_%H%M_%z'))
os.makedirs(global_vars['LogDir'], exist_ok=True)
global_vars['LogFile'] = '{}/Hardware Diagnostics.log'.format(
global_vars['LogDir'])
os.makedirs(global_vars['LogDir'], exist_ok=True)
global_vars['LogFile'] = '{}/Hardware Diagnostics.log'.format(
global_vars['LogDir'])
self.progress_out = '{}/progress.out'.format(global_vars['LogDir'])
# Add CPU
@ -565,7 +656,13 @@ class State():
# Start tmux thread
self.tmux_layout = TMUX_LAYOUT.copy()
start_thread(fix_tmux_panes_loop, args=[self])
start_thread(self.fix_tmux_panes_loop)
def set_top_pane_text(self, text):
  """Update the Top pane to show TOP_PANE_TEXT followed by text."""
  top_pane = self.panes['Top']
  combined_text = '{}\n{}'.format(TOP_PANE_TEXT, text)
  tmux_update_pane(top_pane, text=combined_text)
class TestObj():
@ -600,28 +697,6 @@ class TestObj():
# Functions
def build_outer_panes(state):
  """Clear the screen and create the Top, Started, and Progress panes.

  The pane references returned by tmux_split_window() are stored in
  state.panes under the keys 'Top', 'Started', and 'Progress'.
  """
  clear_screen()

  # Top
  top_pane = tmux_split_window(
    behind=True,
    lines=2,
    vertical=True,
    text=TOP_PANE_TEXT,
  )
  state.panes['Top'] = top_pane

  # Started (split from the Top pane, shows the start time)
  started_text = '{BLUE}Started{CLEAR}\n{s}'.format(
    s=time.strftime("%Y-%m-%d %H:%M %Z"),
    **COLORS)
  state.panes['Started'] = tmux_split_window(
    lines=SIDE_PANE_WIDTH,
    target_pane=state.panes['Top'],
    text=started_text,
  )

  # Progress (watches the progress file)
  state.panes['Progress'] = tmux_split_window(
    lines=SIDE_PANE_WIDTH,
    watch=state.progress_out,
  )
def build_status_string(label, status, info_label=False):
"""Build status string with appropriate colors."""
status_color = COLORS['CLEAR']
@ -638,64 +713,6 @@ def build_status_string(label, status, info_label=False):
**COLORS)
def fix_tmux_panes_loop(state):
  """Call fix_tmux_panes(state) once per second, forever.

  This never returns; elsewhere in this file it is launched via
  start_thread().
  """
  while True:
    try:
      fix_tmux_panes(state)
    except RuntimeError:
      # Assuming the layout definition changed mid-run; ignore and retry
      pass
    # Sleep outside the try block so a RuntimeError is also followed by a
    # pause; otherwise a repeating error would retry in a tight busy loop.
    sleep(1)
def fix_tmux_panes(state):
  """Resize tmux panes whose size no longer matches state.tmux_layout.

  Does nothing when no panes are tracked, or when every pane marked with
  'Check' already has the 'x'/'y' size listed in the layout.
  """
  # Bail if there are no panes to inspect
  if not state.panes:
    return

  # Compare each checked pane against the layout
  resize_needed = False
  for name, layout in state.tmux_layout.items():
    if not layout.get('Check'):
      # Size of this pane is not enforced
      continue
    if name != 'Current' and name not in state.panes:
      # Skip missing panes
      continue
    # 'Current' is queried with pane_id=None
    pane_id = None if name == 'Current' else state.panes[name]
    width, height = tmux_get_pane_size(pane_id=pane_id)
    if layout.get('x', False) and layout['x'] != width:
      resize_needed = True
    if layout.get('y', False) and layout['y'] != height:
      resize_needed = True

  # Bail if every checked pane already matches
  if not resize_needed:
    return

  # Apply the layout to every pane that exists
  for name, layout in state.tmux_layout.items():
    if name != 'Current' and name not in state.panes:
      # Skip missing panes
      continue
    pane_id = None if name == 'Current' else state.panes[name]
    tmux_resize_pane(pane_id=pane_id, **layout)
def generate_horizontal_graph(rates, oneline=False):
"""Generate horizontal graph from rates, returns list."""
graph = ['', '', '', '']
@ -755,6 +772,44 @@ def get_graph_step(rate, scale=16):
return step
def get_ram_details():
  """Get RAM details via dmidecode, returns dict.

  The dict maps 'Total' to the summed size of all populated devices and
  each '<size> <manufacturer>' string to the number of devices with that
  description.
  """
  dmidecode_cmd = ['sudo', 'dmidecode', '--type', 'memory']
  details = {'Total': 0}
  dimm_size = 0
  dimm_vendor = 'UNKNOWN'

  # Get DMI data
  proc = run_program(dmidecode_cmd, encoding='utf-8', errors='ignore')

  # Parse data
  for raw_line in proc.stdout.splitlines():
    entry = raw_line.strip()
    if entry == 'Memory Device':
      # Start of a new device section, reset the per-device values
      dimm_size = 0
      dimm_vendor = 'UNKNOWN'
      continue
    if entry.startswith('Size:'):
      dimm_size = convert_to_bytes(entry.replace('Size: ', ''))
      continue
    if not entry.startswith('Manufacturer:'):
      continue

    # Manufacturer line: record the device if the slot is populated
    dimm_vendor = entry.replace('Manufacturer: ', '')
    if dimm_size > 0:
      label = '{} {}'.format(
        human_readable_size(dimm_size).strip(),
        dimm_vendor,
      )
      details['Total'] += dimm_size
      details[label] = details.get(label, 0) + 1

  # Done
  return details
def get_read_rate(s):
"""Get read rate in bytes/s from dd progress output."""
real_rate = None
@ -767,6 +822,7 @@ def get_read_rate(s):
def menu_diags(state, args):
"""Main menu to select and run HW tests."""
args = [a.lower() for a in args]
state.args = args
checkmark = '*'
if 'DISPLAY' in global_vars['Env']:
checkmark = ''
@ -908,10 +964,7 @@ def run_badblocks_test(state, test):
update_progress_pane(state)
# Update tmux layout
tmux_update_pane(
state.panes['Top'],
text='{}\n{}'.format(
TOP_PANE_TEXT, dev.description))
state.set_top_pane_text(dev.description)
# Create monitor pane
test.badblocks_out = '{}/badblocks_{}.out'.format(
@ -994,10 +1047,11 @@ def run_hw_tests(state):
"""Run enabled hardware tests."""
print_standard('Scanning devices...')
state.init()
tests_enabled = False
# Build Panes
update_progress_pane(state)
build_outer_panes(state)
state.build_outer_panes()
# Show selected tests and create TestObj()s
print_info('Selected Tests:')
@ -1009,6 +1063,8 @@ def run_hw_tests(state):
COLORS['CLEAR'],
QUICK_LABEL if state.quick_mode and 'NVMe' in k else ''))
if v['Enabled']:
tests_enabled = True
# Create TestObj and track under both CpuObj/DiskObj and State
if k in TESTS_CPU:
test_obj = TestObj(
@ -1022,6 +1078,11 @@ def run_hw_tests(state):
v['Objects'].append(test_obj)
print_standard('')
# Bail if no tests selected
if not tests_enabled:
tmux_kill_pane(*state.panes.values())
return
# Run disk safety checks (if necessary)
_disk_tests_enabled = False
for k in TESTS_DISK:
@ -1064,7 +1125,7 @@ def run_hw_tests(state):
# Rebuild panes
update_progress_pane(state)
build_outer_panes(state)
state.build_outer_panes()
# Mark unfinished tests as aborted
for k, v in state.tests.items():
@ -1076,8 +1137,22 @@ def run_hw_tests(state):
# Update side pane
update_progress_pane(state)
# Done
# Show results
show_results(state)
# Upload for review
if ENABLED_UPLOAD_DATA and ask('Upload results for review?'):
try_and_print(
message='Saving debug reports...',
function=save_debug_reports,
state=state, global_vars=global_vars)
try_and_print(
message='Uploading Data...',
function=upload_logdir,
global_vars=global_vars,
reason='Review')
# Done
sleep(1)
if state.quick_mode:
pause('Press Enter to exit... ')
@ -1104,10 +1179,7 @@ def run_io_benchmark(state, test):
update_progress_pane(state)
# Update tmux layout
tmux_update_pane(
state.panes['Top'],
text='{}\n{}'.format(
TOP_PANE_TEXT, dev.description))
state.set_top_pane_text(dev.description)
state.tmux_layout['Current'] = {'y': 15, 'Check': True}
# Create monitor pane
@ -1266,9 +1338,7 @@ def run_mprime_test(state, test):
test.thermal_abort = False
# Update tmux layout
tmux_update_pane(
state.panes['Top'],
text='{}\n{}'.format(TOP_PANE_TEXT, dev.name))
state.set_top_pane_text(dev.name)
# Start live sensor monitor
test.sensors_out = '{}/sensors.out'.format(global_vars['TmpDir'])
@ -1431,7 +1501,7 @@ def run_mprime_test(state, test):
# Add temps to report
test.report.append('{BLUE}Temps{CLEAR}'.format(**COLORS))
for line in generate_sensor_report(
test.sensor_data, 'Idle', 'Max', 'Cooldown', core_only=True):
test.sensor_data, 'Idle', 'Max', 'Cooldown', cpu_only=True):
test.report.append(' {}'.format(line))
# Add abort message(s)
@ -1481,10 +1551,7 @@ def run_nvme_smart_tests(state, test, update_mode=False):
update_progress_pane(state)
# Update tmux layout
tmux_update_pane(
state.panes['Top'],
text='{}\n{}'.format(
TOP_PANE_TEXT, dev.description))
state.set_top_pane_text(dev.description)
# SMART short self-test
if dev.smart_attributes and not (state.quick_mode or update_mode):
@ -1629,9 +1696,7 @@ def show_report(report, log_report=False):
def show_results(state):
"""Show results for all tests."""
clear_screen()
tmux_update_pane(
state.panes['Top'],
text='{}\nResults'.format(TOP_PANE_TEXT))
state.set_top_pane_text('Results')
# CPU tests
_enabled = False
@ -1661,17 +1726,6 @@ def show_results(state):
# Update progress
update_progress_pane(state)
# Ask for review
if ENABLED_UPLOAD_DATA and ask('Upload results for review?'):
try_and_print(
message='Saving debug reports...',
function=save_debug_reports,
state=state, global_vars=global_vars)
try_and_print(
message='Uploading Data...',
function=upload_logdir,
global_vars=global_vars)
def update_main_options(state, selection, main_options):
"""Update menu and state based on selection."""

View file

@ -1,4 +1,6 @@
# Wizard Kit: Functions - Sensors
'''Wizard Kit: Functions - Sensors'''
# pylint: disable=no-name-in-module,wildcard-import
# vim: sts=2 sw=2 ts=2
import json
import re
@ -9,7 +11,7 @@ from settings.sensors import *
# Error Classes
class ThermalLimitReachedError(Exception):
pass
'''Thermal limit reached error.'''
def clear_temps(sensor_data):
@ -20,28 +22,30 @@ def clear_temps(sensor_data):
_data['Temps'] = []
def fix_sensor_str(_s):
  """Cleanup string and return str.

  Reformats 'name-bus-addr' style prefixes as 'name (bus addr)',
  title-cases the text, restores the casing of known abbreviations,
  inserts a space between text and the number that follows it, and
  relabels AMD K<n>temp / Tctl / Tdie names as CPU sensors.
  """
  # NOTE: flags must be passed by keyword. The fourth positional argument
  # of re.sub() is count, so passing re.IGNORECASE (== 2) positionally
  # capped each call at two substitutions and never enabled the flag.
  _s = re.sub(
    r'^(\w+)-(\w+)-(\w+)', r'\1 (\2 \3)', _s, flags=re.IGNORECASE)
  _s = _s.title()
  _s = _s.replace('Coretemp', 'CPUTemp')
  _s = _s.replace('Acpi', 'ACPI')
  _s = _s.replace('ACPItz', 'ACPI TZ')
  _s = _s.replace('Isa ', 'ISA ')
  _s = _s.replace('Pci ', 'PCI ')
  _s = _s.replace('Id ', 'ID ')
  # Separate every run of non-digits from the number that follows it
  _s = re.sub(r'(\D+)(\d+)', r'\1 \2', _s, flags=re.IGNORECASE)
  # The previous step turned 'K10Temp' into 'K 10Temp'; rejoin and relabel
  _s = re.sub(
    r'^K (\d+)Temp', r'AMD K\1 Temps', _s, flags=re.IGNORECASE)
  _s = re.sub(r'T(ctl|die)', r'CPU (T\1)', _s, flags=re.IGNORECASE)
  return _s
def generate_sensor_report(
sensor_data, *temp_labels,
colors=True, core_only=False):
colors=True, cpu_only=False):
"""Generate report based on temp_labels, returns list if str."""
report = []
for _section, _adapters in sorted(sensor_data.items()):
# CoreTemps then Other temps
if core_only and 'Core' not in _section:
# CPU temps then Other temps
if cpu_only and 'CPU' not in _section:
continue
for _adapter, _sources in sorted(_adapters.items()):
# Adapter
@ -56,7 +60,7 @@ def generate_sensor_report(
': ' if _label != 'Current' else '',
get_temp_str(_data.get(_label, '???'), colors=colors))
report.append(_line)
if not core_only:
if not cpu_only:
report.append(' ')
# Handle empty reports (i.e. no sensors detected)
@ -91,17 +95,17 @@ def get_colored_temp_str(temp):
else:
color = COLORS['CLEAR']
return '{color}{prefix}{temp:2.0f}°C{CLEAR}'.format(
color = color,
prefix = '-' if temp < 0 else '',
temp = temp,
color=color,
prefix='-' if temp < 0 else '',
temp=temp,
**COLORS)
def get_raw_sensor_data():
"""Read sensor data and return dict."""
data = {}
json_data = {}
cmd = ['sensors', '-j']
# Get raw data
try:
result = run_program(cmd)
@ -122,8 +126,8 @@ def get_raw_sensor_data():
try:
json_data = json.loads('\n'.join(raw_data))
except json.JSONDecodeError:
# Still broken, just set to empty dict
json_data = {}
# Still broken, just return the empty dict
pass
# Done
return json_data
@ -132,10 +136,10 @@ def get_raw_sensor_data():
def get_sensor_data():
"""Parse raw sensor data and return new dict."""
json_data = get_raw_sensor_data()
sensor_data = {'CoreTemps': {}, 'Other': {}}
sensor_data = {'CPUTemps': {}, 'Other': {}}
for _adapter, _sources in json_data.items():
if 'coretemp' in _adapter:
_section = 'CoreTemps'
if is_cpu_adapter(_adapter):
_section = 'CPUTemps'
else:
_section = 'Other'
sensor_data[_section][_adapter] = {}
@ -157,8 +161,8 @@ def get_sensor_data():
}
# Remove empty sections
for k, v in sensor_data.items():
v = {k2: v2 for k2, v2 in v.items() if v2}
for _k, _v in sensor_data.items():
_v = {_k2: _v2 for _k2, _v2 in _v.items() if _v2}
# Done
return sensor_data
@ -178,14 +182,20 @@ def get_temp_str(temp, colors=True):
temp)
def is_cpu_adapter(adapter):
  """Return True if adapter contains 'coretemp' or 'k<digits>temp'.

  The match is case-insensitive and may occur anywhere in the name.
  """
  return re.search(
    r'(core|k\d+)temp', adapter, flags=re.IGNORECASE) is not None
def monitor_sensors(monitor_pane, monitor_file):
"""Continually update sensor data and report to screen."""
sensor_data = get_sensor_data()
while True:
update_sensor_data(sensor_data)
with open(monitor_file, 'w') as f:
with open(monitor_file, 'w') as _f:
report = generate_sensor_report(sensor_data, 'Current', 'Max')
f.write('\n'.join(report))
_f.write('\n'.join(report))
sleep(1)
if monitor_pane and not tmux_poll_pane(monitor_pane):
break
@ -196,7 +206,7 @@ def save_average_temp(sensor_data, temp_label, seconds=10):
clear_temps(sensor_data)
# Get temps
for i in range(seconds):
for _i in range(seconds): # pylint: disable=unused-variable
update_sensor_data(sensor_data)
sleep(1)
@ -219,24 +229,15 @@ def update_sensor_data(sensor_data, thermal_limit=None):
_data['Current'] = _temp
_data['Max'] = max(_temp, _data['Max'])
_data['Temps'].append(_temp)
except Exception:
except Exception: # pylint: disable=broad-except
# Dumb workaround for Dell sensors with changing source names
pass
# Check if thermal limit reached
if thermal_limit and _section == 'CoreTemps':
if thermal_limit and _section == 'CPUTemps':
if max(_data['Current'], _data['Max']) >= thermal_limit:
raise ThermalLimitReachedError('CoreTemps reached limit')
def join_columns(column1, column2, width=55):
  """Join two strings, left-justifying column1 to width before column2.

  Characters removed by REGEX_COLORS (presumably color escape sequences;
  verify against its definition) are not counted toward the width, so
  the padding is extended by however many characters the regex strips.
  """
  # Use the width argument; it was previously ignored in favor of a
  # hard-coded 55 (the default, so existing callers are unaffected).
  stripped_len = len(REGEX_COLORS.sub('', column1))
  return '{:<{}}{}'.format(
    column1,
    width + len(column1) - stripped_len,
    column2)
raise ThermalLimitReachedError('CPU temps reached limit')
if __name__ == '__main__':
print("This file is not meant to be called directly.")
# vim: sts=2 sw=2 ts=2

View file

@ -49,7 +49,7 @@ if __name__ == '__main__':
global_vars=global_vars)
# Done
sleep(10)
sleep(1)
pause('Press Enter to exit...')
exit_script(1)