# WizardKit/scripts/msword-search

#!/bin/python3
#
## WizardKit: MS Word content search tool

import os
import re
import sys

import wk

# STATIC VARIABLES
# Root of the search: the working directory at the time the script starts.
SCANDIR = os.getcwd()
# Help text printed when the script is run without any search terms.
USAGE = '''Usage: {script} <search-terms>...
  e.g.  {script} "Book Title" "Keyword" "etc"

  This script will search all doc/docx files below the current directory for
  the search-terms provided (case-insensitive).'''.format(script=__file__)
# Matches names ending in ".doc" or ".docx", ignoring case.
REGEX_DOC_FILES = re.compile(r'\.docx?$', flags=re.IGNORECASE)


def scan_for_docs(path):
  """Recursively yield directory entries for Word documents below path.

  Sub-directories are descended into without following symlinks, so a
  symlinked directory is never traversed.

  Args:
    path: Directory to scan.

  Yields:
    os.DirEntry objects for files whose name matches REGEX_DOC_FILES.
  """
  for entry in os.scandir(path):
    if entry.is_dir(follow_symlinks=False):
      yield from scan_for_docs(entry.path)
    # The name check runs first because it needs no filesystem access.
    # is_file must actually be called: the bare bound method is always
    # truthy, which let any non-directory entry with a matching name through.
    elif REGEX_DOC_FILES.search(entry.name) and entry.is_file():
      yield entry


def scan_file(file_path, search):
  """Search the extracted text of a single Word document.

  The content is dumped with external tools: `unzip -p` for .docx files and
  `antiword` for anything else (assumed to be .doc).

  Args:
    file_path: Path of the document to search.
    search: Regular expression to look for; matched case-insensitively.

  Returns:
    file_path if the pattern was found, otherwise None. Any error raised
    while extracting or searching is treated as "no match".
  """
  match = None
  try:
    # Work from the file_path argument rather than from any name that
    # happens to exist in the caller's scope.
    if file_path.lower().endswith('.docx'):
      cmd = ['unzip', '-p', file_path]
    else:
      # Assuming .doc
      cmd = ['antiword', file_path]
    result = wk.exe.run_program(cmd)
    out = result.stdout
    # NOTE(review): stdout may already be text depending on how run_program
    # is configured -- confirm. Only decode when raw bytes are returned, and
    # drop undecodable bytes: `unzip -p` without a member list dumps every
    # archive member, including binary ones such as embedded images.
    if isinstance(out, bytes):
      out = out.decode(errors='ignore')
    match = re.search(search, out, re.IGNORECASE)
  except Exception:
    # Ignore errors since files may be corrupted
    pass

  return file_path if match else None


if __name__ == '__main__':
  # Entry point: search every doc/docx file below SCANDIR for the terms given
  # on the command line and print the paths of the files that match.
  try:
    # Prep
    wk.ui.cli.clear_screen()

    if len(sys.argv) == 1:
      # Print usage
      wk.ui.cli.print_standard(USAGE)
    else:
      # Each term is used as a regular expression; whitespace inside a term
      # is relaxed to `\s*` so spacing differences still match. The
      # replacement is a callable because `\s` inside a replacement
      # *template* is rejected as a bad escape on Python 3.7+.
      terms = [re.sub(r'\s+', lambda _: r'\s*', t) for t in sys.argv[1:]]
      # A single alternation lets each file be searched in one pass.
      search = '({})'.format('|'.join(terms))

      matches = []
      for entry in scan_for_docs(SCANDIR):
        found = scan_file(entry.path, search)
        # scan_file returns None for non-matching files
        if found:
          matches.append(found)

      if matches:
        wk.ui.cli.print_success('Found {} {}:'.format(
          len(matches),
          'Matches' if len(matches) > 1 else 'Match'))
        for match in matches:
          wk.ui.cli.print_standard(match)
      else:
        wk.ui.cli.print_error('No matches found.')

    # Done
    wk.ui.cli.print_standard('\nDone.')
  except SystemExit:
    raise
  except: # noqa: E722
    # Top-level boundary: hand every other failure to the shared handler.
    wk.ui.cli.major_exception()

# vim: sts=2 sw=2 ts=2