Added I/O Benchmark sections

2019-12-06 19:10:36 -07:00 · 2019-12-06 19:10:36 -07:00 · a0b07cbfde
commit a0b07cbfde
parent c7585d17f0
3 changed files with 298 additions and 22 deletions
--- a/scripts/wk/cfg/hw.py
+++ b/scripts/wk/cfg/hw.py
@ -15,7 +15,7 @@ ATTRIBUTE_COLORS = (
  ('Maximum', 'PURPLE'),
  )
 # NOTE: Force 4K read block size for disks >= 3TB
-BADBLOCKS_LARGE_DISK = 3*1024**4
+BADBLOCKS_LARGE_DISK = 3 * 1024**4
 CPU_CRITICAL_TEMP = 99
 CPU_FAILURE_TEMP = 90
 CPU_TEST_MINUTES = 7
@ -115,6 +115,13 @@ TEMP_COLORS = {
  90:             'RED',
  100:            'ORANGE_RED',
  }
+# THRESHOLDS: Rates used to determine HDD/SSD pass/fail
+THRESH_HDD_MIN =       50 * 1024**2
+THRESH_HDD_AVG_HIGH =  75 * 1024**2
+THRESH_HDD_AVG_LOW =   65 * 1024**2
+THRESH_SSD_MIN =       90 * 1024**2
+THRESH_SSD_AVG_HIGH = 135 * 1024**2
+THRESH_SSD_AVG_LOW =  100 * 1024**2
 TMUX_SIDE_WIDTH = 20
 TMUX_LAYOUT = OrderedDict({
  'Top':            {'height':  2,                'Check': True},
--- a/scripts/wk/graph.py
+++ b/scripts/wk/graph.py
@ -9,11 +9,7 @@ from wk.std import color_string

 # STATIC VARIABLES
 LOG = logging.getLogger(__name__)
-ALT_TEST_SIZE_FACTOR = 0.01
-BLOCK_SIZE = 512 * 1024
-CHUNK_SIZE = 32 * 1024**2
 GRAPH_HORIZONTAL = ('▁', '▂', '▃', '▄', '▅', '▆', '▇', '█')
-GRAPH_WIDTH = 40
 GRAPH_VERTICAL = (
    '▏',    '▎',    '▍',    '▌',
    '▋',    '▊',    '▉',    '█',
@ -24,7 +20,6 @@ GRAPH_VERTICAL = (
    '███▏', '███▎', '███▍', '███▌',
    '███▋', '███▊', '███▉', '████',
  )
-MINIMUM_TEST_SIZE = 10 * 1024**3
 # SCALE_STEPS: These scales allow showing differences between HDDs and SSDs
 #              on the same graph.
 SCALE_STEPS = {
@ -39,13 +34,13 @@ THRESH_GREAT =        750 * 1024**2


 # Functions
-def generate_horizontal_graph(rate_list, oneline=False):
+def generate_horizontal_graph(rate_list, graph_width=40, oneline=False):
  """Generate horizontal graph from rate_list, returns list."""
  graph = ['', '', '', '']
  scale = 8 if oneline else 32

  # Build graph
-  for rate in merge_rates(rate_list):
+  for rate in merge_rates(rate_list, graph_width=graph_width):
    step = get_graph_step(rate, scale=scale)

    # Set color
@ -101,7 +96,7 @@ def get_graph_step(rate, scale=16):
  return step


-def merge_rates(rates, graph_width=GRAPH_WIDTH):
+def merge_rates(rates, graph_width=40):
  """Merge rates to have entries equal to the width, returns list."""
  merged_rates = []
  offset = 0
@ -116,5 +111,41 @@ def merge_rates(rates, graph_width=GRAPH_WIDTH):
  return merged_rates


+def vertical_graph_line(percent, rate, scale=32):
+  """Build colored graph string using thresholds, returns str."""
+  color_bar = None
+  color_rate = None
+  step = get_graph_step(rate, scale=scale)
+
+  # Set colors
+  if rate < THRESH_FAIL:
+    color_bar = 'RED'
+    color_rate = 'YELLOW'
+  elif rate < THRESH_WARN:
+    color_bar = 'YELLOW'
+    color_rate = 'YELLOW'
+  elif rate > THRESH_GREAT:
+    color_bar = 'GREEN'
+    color_rate = 'GREEN'
+
+  # Build string
+  line = color_string(
+    strings=(
+      f'{percent:5.1f}%',
+      f'{GRAPH_VERTICAL[step]:<4}',
+      f'{rate/(1000**2):6.1f} MB/s',
+      ),
+    colors=(
+      None,
+      color_bar,
+      color_rate,
+      ),
+    sep='  ',
+    )
+
+  # Done
+  return line
+
+
 if __name__ == '__main__':
  print("This file is not meant to be called directly.")
--- a/scripts/wk/hw/diags.py
+++ b/scripts/wk/hw/diags.py
@ -15,7 +15,7 @@ import time
 from collections import OrderedDict
 from docopt import docopt

-from wk import cfg, exe, log, net, std, tmux
+from wk import cfg, exe, graph, log, net, std, tmux
 from wk.hw import obj as hw_obj
 from wk.hw import sensors as hw_sensors

@ -32,11 +32,17 @@ Options:
  -h --help           Show this page
  -q --quick          Skip menu and perform a quick check
 '''
+LOG = logging.getLogger(__name__)
 BADBLOCKS_REGEX = re.compile(
  r'^Pass completed, (\d+) bad blocks found. .(\d+)/(\d+)/(\d+) errors',
  re.IGNORECASE,
  )
-LOG = logging.getLogger(__name__)
+IO_GRAPH_WIDTH = 40
+IO_ALT_TEST_SIZE_FACTOR = 0.01
+IO_BLOCK_SIZE = 512 * 1024
+IO_CHUNK_SIZE = 32 * 1024**2
+IO_MINIMUM_TEST_SIZE = 10 * 1024**3
+IO_RATE_REGEX = re.compile(r'(?P<bytes>\d+) bytes.* (?P<seconds>\S+) s,')
 MENU_ACTIONS = (
  'Audio Test',
  'Keyboard Test',
@ -72,6 +78,7 @@ STATUS_COLORS = {
  'Passed': 'GREEN',
  'Aborted': 'YELLOW',
  'N/A': 'YELLOW',
+  'Skipped': 'YELLOW',
  'Unknown': 'YELLOW',
  'Working': 'YELLOW',
  'Denied': 'RED',
@ -85,6 +92,11 @@ WK_LABEL_REGEX = re.compile(
  )


+# Error Classes
+class DeviceTooSmallError(RuntimeError):
+  """Raised when a device is too small to test."""
+
+
 # Classes
 class State():
  """Object for tracking hardware diagnostic data."""
@ -333,6 +345,58 @@ def build_menu(cli_mode=False, quick_mode=False):
  return menu


+def calc_io_dd_values(dev_size):
+  """Calculate I/O benchmark dd values, returns dict.
+
+  Calculations:
+  The minimum dev size is IO_GRAPH_WIDTH * IO_CHUNK_SIZE
+    (e.g. 1.25 GB for a width of 40 and a chunk size of 32MB)
+
+  read_total is the area to be read in bytes
+    If the dev is < IO_MINIMUM_TEST_SIZE then it's the whole dev
+    Else it's the larger of IO_MINIMUM_TEST_SIZE or the alt test size
+    (determined by dev * IO_ALT_TEST_SIZE_FACTOR)
+
+  read_chunks is the number of groups of IO_CHUNK_SIZE in read_total
+    This number is reduced to a multiple of IO_GRAPH_WIDTH in order
+    to allow for the data to be condensed cleanly
+
+  read_blocks is the chunk size in number of blocks
+    (e.g. 64 if block size is 512KB and chunk size is 32MB)
+
+  skip_total is the number of IO_BLOCK_SIZE groups not tested
+  skip_blocks is the number of blocks to skip per IO_CHUNK_SIZE
+  skip_extra_rate is how often to add an additional skip block
+    This is needed to ensure an even testing across the dev
+    This is calculated by using the fractional amount left off
+    of the skip_blocks variable
+  """
+  read_total = min(IO_MINIMUM_TEST_SIZE, dev_size)
+  read_total = max(read_total, dev_size*IO_ALT_TEST_SIZE_FACTOR)
+  read_chunks = int(read_total // IO_CHUNK_SIZE)
+  read_chunks -= read_chunks % IO_GRAPH_WIDTH
+  if read_chunks < IO_GRAPH_WIDTH:
+    raise DeviceTooSmallError
+  read_blocks = int(IO_CHUNK_SIZE / IO_BLOCK_SIZE)
+  read_total = read_chunks * IO_CHUNK_SIZE
+  skip_total = int((dev_size - read_total) // IO_BLOCK_SIZE)
+  skip_blocks = int((skip_total / read_chunks) // 1)
+  skip_extra_rate = 0
+  try:
+    skip_extra_rate = 1 + int(1 / ((skip_total / read_chunks) % 1))
+  except ZeroDivisionError:
+    # skip_extra_rate == 0 is fine
+    pass
+
+  # Done
+  return {
+    'Read Chunks': read_chunks,
+    'Read Blocks': read_blocks,
+    'Skip Blocks': skip_blocks,
+    'Skip Extra': skip_extra_rate,
+    }
+
+
 def check_cooling_results(test_obj, sensors):
  """Check cooling results and update test_obj."""
  max_temp = sensors.cpu_max_temp()
@ -353,6 +417,51 @@ def check_cooling_results(test_obj, sensors):
    test_obj.report.append(f'  {line}')


+def check_io_benchmark_results(test_obj, rate_list, graph_width):
+  """Check I/O benchmark results and update test_obj."""
+  avg_read = sum(rate_list) / len(rate_list)
+  min_read = min(rate_list)
+  max_read = max(rate_list)
+  if test_obj.dev.details['ssd']:
+    thresh_min = cfg.hw.THRESH_SSD_MIN
+    thresh_avg_high = cfg.hw.THRESH_SSD_AVG_HIGH
+    thresh_avg_low = cfg.hw.THRESH_SSD_AVG_LOW
+  else:
+    thresh_min = cfg.hw.THRESH_HDD_MIN
+    thresh_avg_high = cfg.hw.THRESH_HDD_AVG_HIGH
+    thresh_avg_low = cfg.hw.THRESH_HDD_AVG_LOW
+
+  # Add horizontal graph to report
+  for line in graph.generate_horizontal_graph(rate_list, graph_width):
+    if not std.strip_colors(line).strip():
+      # Skip empty lines
+      continue
+    test_obj.report.append(line)
+
+  # Add read rates to report
+  test_obj.report.append(
+    f'Read speeds    avg: {avg_read/(1000**2):3.1f}'
+    f' min: {min_read/(1000**2):3.1f}'
+    f' max: {max_read/(1000**2):3.1f}'
+    )
+
+  # Compare against thresholds
+  if min_read <= thresh_min and avg_read <= thresh_avg_high:
+    test_obj.failed = True
+  elif avg_read <= thresh_avg_low:
+    test_obj.failed = True
+  else:
+    test_obj.passed = True
+
+  # Set status
+  if test_obj.failed:
+    test_obj.set_status('Failed')
+  elif test_obj.passed:
+    test_obj.set_status('Passed')
+  else:
+    test_obj.set_status('Unknown')
+
+
 def check_mprime_results(test_obj, working_dir):
  """Check mprime log files and update test_obj."""
  passing_lines = {}
@ -512,16 +621,135 @@ def disk_attribute_check(state, test_objects):
  state.update_progress_pane()


-def disk_io_benchmark(state, test_objects):
+def disk_io_benchmark(state, test_objects, skip_usb=True):
+  # pylint: disable=too-many-statements
  """Disk I/O benchmark using dd."""
  LOG.info('Disk I/O Benchmark (dd)')
-  #TODO: io
-  LOG.debug('%s, %s', state, test_objects)
-  std.print_warning('TODO: io')
-  std.pause()
+  aborted = False
+
+  def _run_io_benchmark(test_obj, log_path):
+    """Run I/O benchmark and handle exceptions."""
+    offset = 0
+    read_rates = []
+    test_obj.report.append(std.color_string('I/O Benchmark', 'BLUE'))
+
+    # Get dd values or bail
+    try:
+      dd_values = calc_io_dd_values(test_obj.dev.details['size'])
+    except DeviceTooSmallError:
+      test_obj.set_status('N/A')
+      test_obj.report.append(
+        std.color_string('Disk too small to test', 'YELLOW'),
+        )
+      return
+
+    # Run dd read tests
+    for _i in range(dd_values['Read Chunks']):
+      _i += 1
+
+      # Build cmd
+      skip = dd_values['Skip Blocks']
+      if dd_values['Skip Extra'] and _i % dd_values['Skip Extra'] == 0:
+        skip += 1
+      cmd = [
+        'sudo', 'dd',
+        f'bs={IO_BLOCK_SIZE}',
+        f'skip={offset+skip}',
+        f'count={dd_values["Read Blocks"]}',
+        f'if={test_obj.dev.path}',
+        'of=/dev/null',
+        ]
+      if platform.system() == 'Linux':
+        cmd.append('iflag=direct')
+
+      # Run and get read rate
+      try:
+        proc = exe.run_program(
+          cmd,
+          pipe=False,
+          stdout=subprocess.PIPE,
+          stderr=subprocess.STDOUT,
+          )
+      except PermissionError:
+        # Since we're using sudo we can't kill dd
+        # Assuming this happened during a CTRL+c
+        raise KeyboardInterrupt
+      match = IO_RATE_REGEX.search(proc.stdout)
+      if match:
+        read_rates.append(
+          int(match.group('bytes')) / float(match.group('seconds')),
+          )
+        match.group(1)
+
+      # Show progress
+      with open(log_path, 'a') as _f:
+        if _i % 5 == 0:
+          percent = (_i / dd_values['Read Chunks']) * 100
+          _f.write(f'  {graph.vertical_graph_line(percent, read_rates[-1])}\n')
+
+      # Update offset
+      offset += dd_values['Read Blocks'] + skip
+
+    # Check results
+    check_io_benchmark_results(test_obj, read_rates, IO_GRAPH_WIDTH)
+
+  # Run benchmarks
+  state.update_top_pane(
+    f'Disk I/O Benchmark{"s" if len(test_objects) > 1 else ""}',
+    )
+  state.panes['I/O Benchmark'] = tmux.split_window(
+    percent=75,
+    vertical=True,
+    text=' ',
+    )
+  for test in test_objects:
+    if test.disabled:
+      # Skip
+      continue
+
+    # Skip USB devices if requested
+    if skip_usb and test.dev.details['bus'] == 'USB':
+      test.set_status('Skipped')
+      continue
+
+    # Start benchmark
+    if not aborted:
+      std.clear_screen()
+      std.print_report(test.dev.generate_report())
+      test.set_status('Working')
+      test_log = f'{state.log_dir}/{test.dev.path.name}_benchmark.out'
+      tmux.respawn_pane(
+        state.panes['I/O Benchmark'],
+        watch_cmd='tail',
+        watch_file=test_log,
+        )
+      state.update_progress_pane()
+      try:
+        _run_io_benchmark(test, test_log)
+      except KeyboardInterrupt:
+        aborted = True
+      except (subprocess.CalledProcessError, TypeError, ValueError) as err:
+        # Something went wrong
+        test.set_status('ERROR')
+        print(' ')
+        print(err)
+        std.pause('lolwut?')
+
+    # Mark test(s) aborted if necessary
+    if aborted:
+      test.set_status('Aborted')
+      test.report.append(std.color_string('  Aborted', 'YELLOW'))
+
+    # Update progress after each test
+    state.update_progress_pane()
+
+  # Cleanup
+  state.update_progress_pane()
+  tmux.kill_pane(state.panes.pop('I/O Benchmark', None))


 def disk_self_test(state, test_objects):
+  # pylint: disable=too-many-statements
  """Disk self-test if available."""
  LOG.info('Disk Self-Test(s)')
  aborted = False
@ -558,10 +786,13 @@ def disk_self_test(state, test_objects):
    )
  std.print_info(f'Starting self-test{"s" if len(test_objects) > 1 else ""}')
  for test in reversed(test_objects):
-    test.set_status('Working')
-    test_log = f'{state.log_dir}/{test.dev.path.name}_selftest.log'
+    if test.disabled:
+      # Skip
+      continue

    # Start thread
+    test.set_status('Working')
+    test_log = f'{state.log_dir}/{test.dev.path.name}_selftest.log'
    threads.append(exe.start_thread(_run_self_test, args=(test, test_log)))

    # Show progress
@ -601,6 +832,7 @@ def disk_self_test(state, test_objects):


 def disk_surface_scan(state, test_objects):
+  # pylint: disable=too-many-statements
  """Read-only disk surface scan using badblocks."""
  LOG.info('Disk Surface Scan (badblocks)')
  threads = []
@ -668,9 +900,12 @@ def disk_surface_scan(state, test_objects):
    f'Starting disk surface scan{"s" if len(test_objects) > 1 else ""}',
    )
  for test in reversed(test_objects):
-    test_log = f'{state.log_dir}/{test.dev.path.name}_badblocks.log'
+    if test.disabled:
+      # Skip
+      continue

    # Start thread
+    test_log = f'{state.log_dir}/{test.dev.path.name}_badblocks.log'
    threads.append(exe.start_thread(_run_surface_scan, args=(test, test_log)))

    # Show progress
@ -923,16 +1158,19 @@ def run_diags(state, menu, quick_mode=False):
    return

  # Run tests
-  for details in state.tests.values():
+  for name, details in state.tests.items():
    if not details['Enabled']:
      # Skip disabled tests
      continue

    # Run test(s)
    function = details['Function']
+    args = [details['Objects']]
+    if name == 'Disk I/O Benchmark':
+      args.append(menu.toggles['Skip USB Benchmarks']['Selected'])
+    std.clear_screen()
    try:
-      std.clear_screen()
-      function(state, details['Objects'])
+      function(state, *args)
    except std.GenericAbort:
      aborted = True
      # Restart tmux