Refactor SMART self-test sections

2022-04-08 18:47:08 -06:00 · 2022-04-08 18:47:08 -06:00 · af8b2b7dd3
commit af8b2b7dd3
parent 41b4ffd9fb
3 changed files with 171 additions and 139 deletions
--- a/scripts/wk/clone/ddrescue.py
+++ b/scripts/wk/clone/ddrescue.py
@ -30,7 +30,6 @@ from wk.cfg.ddrescue import (
 from wk.hw import disk as hw_disk
 from wk.hw.smart import (
  CriticalHardwareError,
  SMARTNotSupportedError,
  SMARTSelfTestInProgressError,
  safety_checks,
  update_smart_details,
@ -1503,8 +1502,6 @@ def check_destination_health(destination):
    result = 'Critical hardware error detected on destination'
  except SMARTSelfTestInProgressError:
    result = 'SMART self-test in progress on destination'
  except SMARTNotSupportedError:
    pass
  # Done
  return result
--- a/scripts/wk/hw/diags.py
+++ b/scripts/wk/hw/diags.py
@ -701,10 +701,7 @@ def disk_self_test(state, test_objects) -> None:
    for test in test_objects:
      hw_smart.abort_self_test(test.dev)
      std.sleep(0.5)
-
+      hw_smart.build_self_test_report(test, aborted=True)
  # Save report(s)
  for test in test_objects:
    hw_smart.check_self_test_results(test, aborted=aborted)
  # Cleanup
  state.update_progress_pane()
--- a/scripts/wk/hw/smart.py
+++ b/scripts/wk/hw/smart.py
@ -28,9 +28,6 @@ LOG = logging.getLogger(__name__)
 class CriticalHardwareError(RuntimeError):
  """Exception used for critical hardware failures.
  Raised with a message naming the affected device path so callers can
  report which disk triggered the failure.
  """
 class SMARTNotSupportedError(TypeError):
  """Exception used for disks lacking SMART support.
  NOTE: This derives from TypeError, not RuntimeError like the other
        exceptions defined alongside it.
  """
 class SMARTSelfTestInProgressError(RuntimeError):
  """Exception used when a SMART self-test is in progress.
  Raised with a message naming the device path of the disk that is
  still running a self-test.
  """
@ -42,6 +39,39 @@ def abort_self_test(dev) -> None:
  run_program(cmd, check=False)
 def build_self_test_report(test_obj, aborted=False) -> None:
  """Assemble the self-test report section and append it to test_obj.report.
  NOTE: The SMART data is deliberately not refreshed here so the report
  keeps the last recorded result. E.g. an aborted test still shows the
  last known progress rather than only "was aborted by host."
  The status of test_obj is changed to 'Failed' if a previous test could
  not be stopped, or to 'Aborted' if aborted is set and the test has
  neither passed nor failed.
  """
  lines = [color_string('Self-Test', 'BLUE')]
  self_test_status = get_smart_self_test_details(test_obj.dev).get('status', {})
  result_str = self_test_status.get('string', 'Unknown')
  # Tests that were never run only show their current status
  status_only_color = None
  if test_obj.disabled or test_obj.status == 'Denied':
    status_only_color = 'RED'
  elif test_obj.status == 'N/A' or not test_obj.dev.attributes:
    status_only_color = 'YELLOW'
  # Build report
  if status_only_color:
    lines.append(color_string(f'  {test_obj.status}', status_only_color))
  elif test_obj.status == 'TestInProgress':
    lines.append(color_string('  Failed to stop previous test', 'RED'))
    test_obj.set_status('Failed')
  else:
    # Remaining cases lead with the self-test result string
    lines.append(f'  {result_str.capitalize()}')
    if aborted and not test_obj.passed and not test_obj.failed:
      lines.append(color_string('  Aborted', 'YELLOW'))
      test_obj.set_status('Aborted')
    elif test_obj.status == 'TimedOut':
      lines.append(color_string('  TimedOut', 'YELLOW'))
  # Done
  test_obj.report.extend(lines)
 def check_attributes(dev, only_blocking=False) -> bool:
  """Check if any known attributes are failing, returns bool."""
  attributes_ok = True
@ -77,32 +107,6 @@ def check_attributes(dev, only_blocking=False) -> bool:
  return attributes_ok
 def check_self_test_results(test_obj, aborted=False) -> None:
  """Append the SMART self-test section to the report and set the status."""
  add_line = test_obj.report.append
  add_line(color_string('Self-Test', 'BLUE'))
  # Tests that were never run only show their current status
  if test_obj.disabled or test_obj.status == 'Denied':
    add_line(color_string(f'  {test_obj.status}', 'RED'))
    return
  if test_obj.status == 'N/A' or not test_obj.dev.attributes:
    add_line(color_string(f'  {test_obj.status}', 'YELLOW'))
    return
  # The SMART data is not refreshed here to preserve the test status for
  # the report. E.g. if the test was aborted the report should include the
  # last known progress instead of just "was aborted by host"
  status_info = get_smart_self_test_details(test_obj.dev).get('status', {})
  result_str = status_info.get('string', 'Unknown')
  add_line(f'  {result_str.capitalize()}')
  if aborted and not test_obj.passed and not test_obj.failed:
    add_line(color_string('  Aborted', 'YELLOW'))
    test_obj.set_status('Aborted')
    return
  if test_obj.status == 'TimedOut':
    add_line(color_string('  TimedOut', 'YELLOW'))
    test_obj.set_status('TimedOut')
    return
  # Anything that did not pass is treated as a failure
  test_obj.failed = not test_obj.passed
  if test_obj.failed:
    test_obj.set_status('Failed')
 def enable_smart(dev) -> None:
  """Try enabling SMART for this disk."""
  cmd = [
@ -202,6 +206,138 @@ def get_smart_self_test_details(dev) -> dict[Any, Any]:
  return details
 def monitor_smart_self_test(test_obj, header_str, log_path) -> bool:
  """Monitor SMART self-test status and update test_obj, returns bool.
  The file at log_path is rewritten every five seconds with header_str
  and the most recently read status string.
  Returns True only if the test was seen running and then stopped
  reporting remaining progress within the polling window.
  test_obj is marked as failed with a status of 'TimedOut' if:
    * The test does not start within
      SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS (the test is also aborted).
    * The polling window ends before the test finishes.
  """
  started = False
  finished = False
  status_str = 'Starting self-test...'
  # Polling window: reported minutes for a short test (default 5) plus 10
  test_details = get_smart_self_test_details(test_obj.dev)
  test_minutes = test_details.get('polling_minutes', {}).get('short', 5)
  test_minutes = int(test_minutes) + 10
  # Monitor progress (in five second intervals)
  for _i in range(int(test_minutes*60/5)):
    sleep(5)
    # Update log
    ## NOTE: This is run at least once with the default "Starting..." status
    with open(log_path, 'w', encoding='utf-8') as _f:
      _f.write(f'{header_str}\nSMART self-test status:\n  {status_str}')
    # Update status
    update_smart_details(test_obj.dev)
    status = get_smart_self_test_details(test_obj.dev).get('status', {})
    in_progress = 'remaining_percent' in status
    # Check if test started
    started = started or in_progress
    if not started:
      if _i * 5 >= SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS:
        # Test didn't start within limit, stop waiting
        abort_self_test(test_obj.dev)
        test_obj.failed = True
        test_obj.set_status('TimedOut')
        break
      # Still within starting limit, continue to next loop
      continue
    # Check test progress
    status_str = status.get('string', 'Unknown').capitalize()
    # Check if finished
    if not in_progress:
      finished = True
      break
  else:
    # Polling window ended without a break, i.e. without a final result.
    # Record this as a timeout instead of leaving test_obj untouched.
    test_obj.failed = True
    test_obj.set_status('TimedOut')
  # Done
  return finished
 def run_self_test(test_obj, log_path) -> None:
  """Run disk self-test and update test results.
  Currently this only forwards test_obj and log_path to
  run_smart_self_test(); nothing is returned.
  NOTE: This function is here to reserve a place for future
        NVMe self-tests announced in NVMe spec v1.3.
  """
  run_smart_self_test(test_obj, log_path)
 def run_smart_self_test(test_obj, log_path) -> None:
  """Run SMART short self-test and record the outcome on test_obj.
  Nothing is returned; test_obj.passed, test_obj.failed, the status, and
  the report are updated instead. Progress is written to log_path.
  NOTE: No exception is raised for disks without self-test details.
        Those are marked as passed with a status of 'N/A' instead.
  """
  test_details = get_smart_self_test_details(test_obj.dev)
  size_str = bytes_to_string(test_obj.dev.size, use_binary=False)
  header_str = color_string(
    ['[', test_obj.dev.path.name, ' ', size_str, ']'],
    [None, 'BLUE', None, 'CYAN', None],
    sep='',
    )
  # Check if disk supports self-tests
  if not test_details:
    # Mark test as passed since it doesn't apply
    test_obj.passed = True
    test_obj.set_status('N/A')
    build_self_test_report(test_obj)
    return
  # Update status
  with open(log_path, 'w', encoding='utf-8') as _f:
    _f.write(f'{header_str}\nInitializing...')
  # Check for, and stop, self-test if currently in-progress
  if self_test_in_progress(test_obj.dev):
    abort_self_test(test_obj.dev)
    for _ in range(6):
      # Wait up to a minute for current test to exit
      sleep(10)
      update_smart_details(test_obj.dev)
      if not self_test_in_progress(test_obj.dev):
        break
  # Recheck if self-test is in-progress, bail if so
  if self_test_in_progress(test_obj.dev):
    test_obj.failed = True
    test_obj.set_status('TestInProgress')
    build_self_test_report(test_obj)
    return
  # Start test
  cmd = [
    'sudo',
    'smartctl',
    '--tolerance=normal',
    '--test=short',
    test_obj.dev.path,
    ]
  run_program(cmd, check=False)
  # Monitor progress
  finished = monitor_smart_self_test(test_obj, header_str, log_path)
  # Check result
  if finished:
    # Re-read the details: the copy fetched above predates the test, so it
    # can't hold the result of the run that just finished
    test_details = get_smart_self_test_details(test_obj.dev)
    test_obj.passed = test_details.get('status', {}).get('passed', False)
    test_obj.failed = test_obj.failed or not test_obj.passed
  # Set status
  ## NOTE: A 'TimedOut' status set while monitoring is kept as-is instead
  ##       of being replaced by 'Unknown'
  timed_out = not finished and test_obj.status == 'TimedOut'
  if not timed_out:
    if test_obj.failed:
      test_obj.set_status('Failed')
    elif test_obj.passed:
      test_obj.set_status('Passed')
    else:
      test_obj.set_status('Unknown')
  # Build report
  build_self_test_report(test_obj)
 def safety_checks(dev) -> None:
  """Run safety checks and raise an exception if necessary."""
  blocking_event_encountered = False
@ -243,114 +379,16 @@ def safety_checks(dev) -> None:
    raise CriticalHardwareError(f'Critical error(s) for: {dev.path}')
  # SMART self-test status
-  test_details = get_smart_self_test_details(dev)
+  if self_test_in_progress(dev):
  if 'remaining_percent' in test_details.get('status', ''):
    msg = f'SMART self-test in progress for: {dev.path}'
    LOG.error(msg)
    raise SMARTSelfTestInProgressError(msg)
-def run_self_test(test_obj, log_path) -> None:
+def self_test_in_progress(dev) -> bool:
-  """Run disk self-test and check if it passed, returns bool."""
+  """Check if SMART self-test is in progress, returns bool."""
  result = None
  try:
    test_obj.passed = run_smart_self_test(test_obj.dev, log_path)
  except TimeoutError:
    test_obj.failed = True
    result = 'TimedOut'
  except SMARTNotSupportedError:
    # Pass test since it doesn't apply
    test_obj.passed = True
    result = 'N/A'
  # Set status
  if result:
    test_obj.set_status(result)
  else:
    if test_obj.failed:
      test_obj.set_status('Failed')
    elif test_obj.passed:
      test_obj.set_status('Passed')
    else:
      test_obj.set_status('Unknown')
 def run_smart_self_test(dev, log_path) -> bool:
  """Run SMART self-test and check if it passed, returns bool.
  NOTE: An exception will be raised if the disk lacks SMART support.
  """
  finished = False
  result = None
  started = False
  status_str = 'Starting self-test...'
  test_details = get_smart_self_test_details(dev)
-  test_minutes = 15
+  return 'remaining_percent' in test_details.get('status', '')
  size_str = bytes_to_string(dev.size, use_binary=False)
  header_str = color_string(
    ['[', dev.path.name, ' ', size_str, ']'],
    [None, 'BLUE', None, 'CYAN', None],
    sep='',
    )
  # Check if disk supports self-tests
  if not test_details:
    raise SMARTNotSupportedError(
      f'SMART self-test not supported for {dev.path}')
  # Get real test length
  test_minutes = test_details.get('polling_minutes', {}).get('short', 5)
  test_minutes = int(test_minutes) + 10
  # Start test
  with open(log_path, 'w', encoding='utf-8') as _f:
    _f.write(f'{header_str}\nInitializing...')
  cmd = [
    'sudo',
    'smartctl',
    '--tolerance=normal',
    '--test=short',
    dev.path,
    ]
  run_program(cmd, check=False)
  # Monitor progress (in five second intervals)
  for _i in range(int(test_minutes*60/5)):
    sleep(5)
    # Update status
    update_smart_details(dev)
    test_details = get_smart_self_test_details(dev)
    # Check test progress
    if started:
      status_str = test_details.get('status', {}).get('string', 'Unknown')
      status_str = status_str.capitalize()
      # Update log
      with open(log_path, 'w', encoding='utf-8') as _f:
        _f.write(f'{header_str}\nSMART self-test status:\n  {status_str}')
      # Check if finished
      if 'remaining_percent' not in test_details.get('status', {}):
        finished = True
        break
    elif 'remaining_percent' in test_details.get('status', {}):
      started = True
    elif _i * 5 >= SMART_SELF_TEST_START_TIMEOUT_IN_SECONDS:
      # Test didn't start within limit, stop waiting
      break
  # Check result
  if finished:
    result = test_details.get('status', {}).get('passed', False)
  elif started:
    raise TimeoutError(f'SMART self-test timed out for {dev.path}')
  # Done
  return result
 def update_smart_details(dev) -> None: